def expand_csv(csv_file_path, output_csv_file_path, other_fields=None):
    """
    Expand a CSV file: for each input row, write one output row per
    normalized address, keeping the columns listed in other_fields.
    """
    other_fields = other_fields or []
    csv_file_reader = CsvHandler(csv_file_path).read_csv()
    csv_handler = CsvHandler(output_csv_file_path)
    output_fields = other_fields + ["normalized_address"]
    csv_output_file_writer = csv_handler.write_csv(output_fields)
    for row in csv_file_reader:
        expanded_rows = CsvAddressExpander.expand_row(
            row, other_fields=other_fields)
        csv_output_file_writer.writerows(expanded_rows)
    # Close the handler so the expanded rows are flushed to disk
    csv_handler.close()
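
CsvAddressExpander.expand_row is not shown in this excerpt. A minimal sketch of what it might do, assuming the normalized addresses come from libpostal's expand_address (the merge example below calls the same function) and that each input row carries an "address" column, is:

# Sketch under assumptions: the libpostal import and the "address" column
# name are not taken from the original code.
from postal.expand import expand_address


def expand_row(row, other_fields=None):
    other_fields = other_fields or []
    expanded_rows = []
    for normalized_address in expand_address(row["address"]):
        # Copy the requested columns and attach one normalized address
        expanded_row = {field: row[field] for field in other_fields}
        expanded_row["normalized_address"] = normalized_address
        expanded_rows.append(expanded_row)
    return expanded_rows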
Example #2
def test_write_csv():
    output_csv_file_path = f"{CURRENT_DIRECTORY}/resources/csv_handler/write_test.csv"

    csv_handler = CsvHandler(output_csv_file_path)
    writer = csv_handler.write_csv(["country", "address"])
    writer.writerow({
        "country": "Wales",
        "address": "61 Wellfield Road Roath Cardiff"
    })
    csv_handler.close()

    csv_handler = CsvHandler(output_csv_file_path)
    reader = csv_handler.read_csv()
    row = next(reader)
    assert row["country"] == "Wales"
    assert row["address"] == "61 Wellfield Road Roath Cardiff"
    csv_handler.close()

    os.remove(output_csv_file_path)
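
CsvHandler itself is not included in the excerpt. A minimal sketch of an implementation that would satisfy both examples, assuming the standard csv module and a semicolon delimiter (the merge example below reads the expanded file with sep=";"), could look like this:

# Sketch under assumptions: delimiter, newline handling and attribute names
# are guesses; only read_csv, write_csv and close mirror the usage above.
import csv


class CsvHandler:

    def __init__(self, csv_file_path):
        self.csv_file_path = csv_file_path
        self.csv_file = None

    def read_csv(self):
        # Return an iterator of dict rows keyed by the header fields
        self.csv_file = open(self.csv_file_path, newline="")
        return csv.DictReader(self.csv_file, delimiter=";")

    def write_csv(self, fieldnames):
        # Write the header right away and hand back the writer for the rows
        self.csv_file = open(self.csv_file_path, "w", newline="")
        writer = csv.DictWriter(self.csv_file, fieldnames=fieldnames,
                                delimiter=";")
        writer.writeheader()
        return writer

    def close(self):
        if self.csv_file is not None:
            self.csv_file.close()
            self.csv_file = None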
Example #3
    def merge(self, output_path):
        """
        Merge the two CSV files on their normalized addresses and write
        the result to output_path.
        """

        logging.info(
            f"Starting merge {self.csv_file1_path} + {self.csv_file2_path} "
            f"to {output_path}")

        start_time = time.time()

        # Expand CSV file 1, i.e. duplicate its rows by adding a new column
        # called "normalized_address": each new row keeps the same values in
        # the original columns but carries a different value in
        # "normalized_address"
        expanded_csv_file1_path = "expanded_file1.csv"
        CsvAddressExpander.expand_csv(self.csv_file1_path,
                                      expanded_csv_file1_path,
                                      ["id_store", "variable1"])
        file1_df = pd.read_csv(expanded_csv_file1_path, sep=";")
        # Set an index to improve the performance of the exact-match filters
        file1_df.set_index("normalized_address", drop=True, inplace=True)

        # Prepare the output CSV writer
        output_handler = CsvHandler(output_path)
        csv_output_file_writer = output_handler.write_csv(
            ['id', 'var1', 'var2', 'ratio'])
        csv_output_file_row_count = 0
        # Read each row of CSV file 2 and expand its address; loop through
        # the normalized addresses and, as soon as there is a match, write
        # the combined result to the output CSV file
        csv_file2_reader = CsvHandler(self.csv_file2_path).read_csv()
        for row2 in csv_file2_reader:
            normalized_addresses = expand_address(row2["address"])
            for normalized_address in normalized_addresses:
                # Exact match using the index (better performance)
                file1_selection = file1_df[
                    file1_df.index == normalized_address]
                if not file1_selection.empty:
                    row1 = file1_selection.iloc[0]
                    output_row = {
                        "id": row1["id_store"],
                        "var1": row1["variable1"],
                        "var2": row2["variable2"],
                        "ratio": (
                            float(row1["variable1"]) / float(row2["variable2"])
                            if float(row2["variable2"]) != 0.0 else None),
                    }
                    csv_output_file_writer.writerow(output_row)
                    csv_output_file_row_count += 1
                    break
            else:
                # The else clause of the for loop runs only when no
                # normalized address matched (the loop ended without a break)
                # TODO: implement fuzzy matching
                logging.error(f"{row2['address']} couldn't be found "
                              f"in the expanded address set.")
        # Close the output writer and remove the temporary expanded file
        output_handler.close()
        os.remove(expanded_csv_file1_path)

        elapsed_time = time.time() - start_time
        csv1_address_count = file1_df[["id_store"]].drop_duplicates().shape[0]
        merged_address_percentage = (
            100.0 * csv_output_file_row_count / csv1_address_count)
        logging.info(f"Merged {csv_output_file_row_count} "
                     f"of {csv1_address_count} addresses "
                     f"({merged_address_percentage:.2f} %), "
                     f"written to {output_path} "
                     f"in {elapsed_time:.2f} seconds")