def calculate_weights(rdd_in: RDD) -> RDD:
    # Collect the (key, count) pairs, record the grand total in the module-level
    # list, and return an RDD of (key, count, percentage-of-total) triples.
    rdd_in = rdd_in.collect()
    total: int = compute_sum_of_values(rdd_in)
    total_counts_for_the_four.append(total)
    return SC.parallelize(rdd_in).map(
        lambda x: (x[0], x[1], round(100 * x[1] / total)))
from collections import Counter
from typing import Collection

from pyspark import RDD


def assert_rdd_equal(expected: Collection, result: RDD, check_order: bool = True):
    """
    Compare two RDDs, or one RDD with a Collection.

    :param expected: A Collection to compare. For convenience, doesn't need to be an RDD.
    :param result: The RDD to compare.
    :param check_order: Compare the order of values.
    """
    if isinstance(expected, RDD):
        expected = expected.collect()
    else:
        expected = [_ for _ in expected]
    result = result.collect()

    # length comparison
    msg = f'RDD length {len(result)} does not match expected {len(expected)}'
    assert len(expected) == len(result), msg

    # value comparison
    if check_order is True:
        assert expected == result
    else:
        assert Counter(expected) == Counter(result)
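# A minimal usage sketch for assert_rdd_equal (illustrative only): the local
# SparkContext, the demo function name, and the sample data below are
# assumptions, not part of the original snippet.
from pyspark import SparkContext


def _demo_assert_rdd_equal() -> None:
    sc = SparkContext(master="local[1]", appName="assert_rdd_equal_demo")
    result_rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
    # expected can be a plain Python list; ordering is ignored with check_order=False
    assert_rdd_equal([("b", 2), ("a", 1), ("c", 3)], result_rdd, check_order=False)
    # with check_order=True (the default) the exact ordering must also match
    assert_rdd_equal([("a", 1), ("b", 2), ("c", 3)], result_rdd)
    sc.stop()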
def run(log: logging.Log4j,
        config: Dict[str, str],
        n_cols: int,
        vrn_rdd_tfm: RDD,
        results_rdd: RDD,
        prices_rdd_tfm: RDD) -> None:
    """Runner of the Load phase.

    Loads the transformed RDDs back into the respective worksheets in GSheets:
    - vrn_rdd_tfm - "VRNCleaned" worksheet
    - results_rdd - "Results" worksheet
    - prices_rdd_tfm - "Prices" worksheet

    Args:
        log: Log4j object
        config: Key-value mappings of config values
        n_cols: Number of columns in the original VRN worksheet
        vrn_rdd_tfm: Transformed VRN RDD
        results_rdd: Results RDD
        prices_rdd_tfm: Transformed car prices RDD
    """
    # config values used
    spreadsheet_id = config[constants.CONFIG_GSHEET_SPREADSHEET_ID_DEV]
    ws_title_vrn_cleaned = config[constants.CONFIG_GSHEET_WS_VRN_CLEANED]
    ws_title_results = config[constants.CONFIG_GSHEET_WS_RESULTS]
    ws_title_prices = config[constants.CONFIG_GSHEET_WS_PRICES]

    # load VRN RDD and save to "VRNCleaned" worksheet
    vrn_data_tfm_flattened = vrn_rdd_tfm.collect()
    # split this flat list into chunks, where each chunk is one row of n_cols elements
    vrn_data_tfm = list(genhelpers._chunks(vrn_data_tfm_flattened, n_cols))
    vrn_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_vrn_cleaned, vrn_data_tfm, False)
    _log_load_resp(log, ws_title_vrn_cleaned, vrn_resp)

    # load results RDD and save to "Results" worksheet
    results_data_flattened = results_rdd.collect()
    # split this flat list into chunks, where each chunk is one row of n_cols elements
    results_data = list(genhelpers._chunks(results_data_flattened, n_cols))
    results_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_results, results_data, False)
    _log_load_resp(log, ws_title_results, results_resp)

    # load prices RDD and save to "Prices" worksheet
    prices_data_tfm = prices_rdd_tfm.collect()
    prices_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_prices, prices_data_tfm, True)
    _log_load_resp(log, ws_title_prices, prices_resp)

    return None
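# For illustration only: genhelpers._chunks is assumed to behave like the
# generator below, yielding successive rows of n elements from a flat list.
# This is a hypothetical equivalent, not the project's actual implementation.
from typing import Iterator, List, Sequence


def _chunks_sketch(flat: Sequence, n: int) -> Iterator[List]:
    for i in range(0, len(flat), n):
        yield list(flat[i:i + n])


# e.g. list(_chunks_sketch(["a", 1, "b", 2], 2)) == [["a", 1], ["b", 2]]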
def writeFile(rdd: RDD):
    num = rdd.count()
    if num > 0:
        # merge the RDD's (key, value) pairs into the totals loaded by
        # open_result() and write the result back to `path` as JSON
        result_dic = open_result()
        print(result_dic)
        rdd_c = rdd.collect()
        # re-parse the collected list via its repr (keeps the original behaviour)
        lst = ast.literal_eval(str(rdd_c))
        for item in lst:
            key = item[0]
            value = item[1]
            clean_key = str(key).replace("'", '')
            if clean_key in result_dic:
                result_dic[clean_key] = result_dic[clean_key] + value
            else:
                result_dic[clean_key] = value
        result = open(path, 'w', encoding='utf-8')
        result.write(
            json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
        result.close()
def writeFile(rdd: RDD, f_type):
    num = rdd.count()
    global save1
    global save2
    if num > 0:
        result_dic = open_result(f_type)
        rdd_c = rdd.collect()
        lst = ast.literal_eval(str(rdd_c))
        for item in lst:
            key = item[0]
            value = item[1]
            clean_key = str(key).replace("'", '')
            if clean_key in result_dic:
                result_dic[clean_key] = result_dic[clean_key] + value
            else:
                result_dic[clean_key] = value
        if f_type == 0:
            save1 = result_dic
        if f_type == 1:
            save2 = result_dic
        result = open(path[f_type], 'w', encoding='utf-8')
        result.write(json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
        result.close()
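# A hedged sketch of how writeFile might be invoked, assuming it is meant as a
# Spark Streaming foreachRDD callback (the per-RDD count/collect pattern suggests
# this). The socket source, host/port, batch interval, and word-count pipeline
# below are assumptions for illustration, not part of the original code.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


def _demo_streaming_write() -> None:
    sc = SparkContext(master="local[2]", appName="writeFile_demo")
    ssc = StreamingContext(sc, batchDuration=5)
    lines = ssc.socketTextStream("localhost", 9999)
    counts = (lines.flatMap(lambda line: line.split())
                   .map(lambda word: (word, 1))
                   .reduceByKey(lambda a, b: a + b))
    # f_type=0 writes the merged totals to path[0] and caches them in save1
    counts.foreachRDD(lambda rdd: writeFile(rdd, 0))
    ssc.start()
    ssc.awaitTermination()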
def m_o(ctx: SparkContext, data: pr.RDD) -> None:
    assert isinstance(ctx, SparkContext)
    assert 1 == len(data.collect())