def expand_csv(csv_file_path, output_csv_file_path, other_fields=None):
    """ Expand a CSV file: write one output row per normalized address,
    keeping the columns listed in other_fields """
    other_fields = other_fields or []
    csv_file_reader = CsvHandler(csv_file_path).read_csv()
    csv_handler = CsvHandler(output_csv_file_path)
    output_fields = other_fields + ["normalized_address"]
    csv_output_file_writer = csv_handler.write_csv(output_fields)
    for row in csv_file_reader:
        expanded_rows = CsvAddressExpander.expand_row(
            row, other_fields=other_fields)
        csv_output_file_writer.writerows(expanded_rows)
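# NOTE: expand_row is not shown in this section. The sketch below is an
# assumption of how it could behave, inferred from how expand_csv and merge()
# use it: take one input row, expand its "address" with libpostal's
# expand_address (the same call merge() makes directly), and return one output
# row per normalized address, copying over the other_fields columns.
from postal.expand import expand_address  # assumed dependency, as in merge()


def expand_row(row, other_fields=None):
    """ Hypothetical sketch of CsvAddressExpander.expand_row """
    other_fields = other_fields or []
    expanded_rows = []
    for normalized_address in expand_address(row["address"]):
        # Copy the requested columns unchanged, add the normalized address
        output_row = {field: row[field] for field in other_fields}
        output_row["normalized_address"] = normalized_address
        expanded_rows.append(output_row)
    return expanded_rows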
def test_write_csv():
    output_csv_file_path = f"{CURRENT_DIRECTORY}/resources/csv_handler/write_test.csv"

    # Write a single row and close the file
    csv_handler = CsvHandler(output_csv_file_path)
    writer = csv_handler.write_csv(["country", "address"])
    writer.writerow({
        "country": "Wales",
        "address": "61 Wellfield Road Roath Cardiff"
    })
    csv_handler.close()

    # Read the file back and verify its contents
    csv_handler = CsvHandler(output_csv_file_path)
    reader = csv_handler.read_csv()
    row = next(reader)
    assert row["country"] == "Wales"
    assert row["address"] == "61 Wellfield Road Roath Cardiff"
    csv_handler.close()

    # Clean up the temporary file
    os.remove(output_csv_file_path)
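# NOTE: CsvHandler itself is not shown in this section. Below is a minimal
# sketch of the behaviour the test above assumes: write_csv returns a
# csv.DictWriter with the header already written, read_csv returns a
# csv.DictReader, and close() closes the underlying file. The ";" delimiter is
# an assumption taken from the pd.read_csv(..., sep=";") call in merge().
import csv


class CsvHandler:
    """ Hypothetical sketch of the CSV reader/writer wrapper """

    def __init__(self, csv_file_path):
        self.csv_file_path = csv_file_path
        self.csv_file = None

    def read_csv(self):
        self.csv_file = open(self.csv_file_path, newline="")
        return csv.DictReader(self.csv_file, delimiter=";")

    def write_csv(self, fieldnames):
        self.csv_file = open(self.csv_file_path, "w", newline="")
        writer = csv.DictWriter(self.csv_file, fieldnames=fieldnames,
                                delimiter=";")
        writer.writeheader()
        return writer

    def close(self):
        if self.csv_file:
            self.csv_file.close()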
def merge(self, output_path):
    """ Merge the two CSV files by normalized address and write the
    result to output_path """
    logging.info(
        f"Starting merge {self.csv_file1_path} + {self.csv_file2_path} "
        f"to {output_path}")
    start_time = time.time()

    # Expand CSV file 1, i.e. duplicate rows by adding a new column called
    # "normalized_address": each new row keeps the same values in the first
    # columns but carries a different value in "normalized_address"
    expanded_csv_file1_path = "expanded_file1.csv"
    CsvAddressExpander.expand_csv(self.csv_file1_path,
                                  expanded_csv_file1_path,
                                  ["id_store", "variable1"])
    file1_df = pd.read_csv(expanded_csv_file1_path, sep=";")
    # Set an index to increase performance of filters
    file1_df.set_index("normalized_address", drop=True, inplace=True)

    # Prepare the output CSV writer
    output_handler = CsvHandler(output_path)
    csv_output_file_writer = output_handler.write_csv(
        ['id', 'var1', 'var2', 'ratio'])
    csv_output_file_row_count = 0

    # Read each row of CSV file 2, expand its address, loop through the
    # normalized addresses and, as soon as there is a match, write the
    # merged result to the output CSV file
    csv_file2_reader = CsvHandler(self.csv_file2_path).read_csv()
    for row2 in csv_file2_reader:
        normalized_addresses = expand_address(row2["address"])
        for normalized_address in normalized_addresses:
            # Exact match using the index (better performance)
            file1_normalized_address_selection = \
                file1_df[file1_df.index == normalized_address]
            file1_normalized_address_selection_match_found = \
                file1_normalized_address_selection.shape[0] > 0
            if file1_normalized_address_selection_match_found:
                row1 = file1_normalized_address_selection.iloc[0]
                output_row = {
                    "id": row1["id_store"],
                    "var1": row1["variable1"],
                    "var2": row2["variable2"],
                    "ratio": (float(row1["variable1"]) /
                              float(row2["variable2"])
                              if float(row2["variable2"]) != 0.0 else None)
                }
                csv_output_file_writer.writerow(output_row)
                csv_output_file_row_count += 1
                break
        else:
            # Exact matching has not delivered any results
            # TODO: implement fuzzy matching
            logging.error(f"Error: {row2['address']} couldn't be found "
                          f"in the expanded address set.")

    os.remove(expanded_csv_file1_path)

    elapsed_time = time.time() - start_time
    csv1_address_count = file1_df[["id_store"]].drop_duplicates().shape[0]
    merged_address_percentage = \
        100.0 * csv_output_file_row_count / csv1_address_count
    logging.info(f"Merge of addresses: {csv_output_file_row_count} "
                 f"of {csv1_address_count} "
                 f"({merged_address_percentage} %) "
                 f"written to {output_path} in {elapsed_time} seconds")
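# NOTE: the class that owns merge() and its constructor are not shown in this
# section. "CsvMerger" below is a hypothetical name used only to illustrate how
# merge() would be driven, assuming the constructor stores the two input paths
# as self.csv_file1_path and self.csv_file2_path.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.INFO)
    merger = CsvMerger("file1.csv", "file2.csv")  # hypothetical constructor
    merger.merge("merged.csv")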