def _test_join_pair(
    self,
    read_table_: Callable,
    schema: Dict[str, str],
    left: Path,
    right: Path,
    on: List[str],
    how: str,
):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        tmpfile = workdir / "tmpfile.csv"

        table_join(left, right, on, tmpfile, how=how)
        test_result = export_csv(read_table_(tmpfile), schema=schema)

        pandas_how = how.replace("outer", "left")
        pandas_result = export_csv(
            read_table_(left).merge(read_table_(right), on=on, how=pandas_how), schema=schema
        )

        # Converting to a CSV in memory sometimes produces out-of-order values
        test_result_lines = sorted(test_result.split("\n"))
        pandas_result_lines = sorted(pandas_result.split("\n"))

        # Compare line counts first, since zip() would silently truncate the longer result
        self.assertEqual(len(test_result_lines), len(pandas_result_lines))
        for line1, line2 in zip(test_result_lines, pandas_result_lines):
            self.assertEqual(line1, line2)
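# Hypothetical usage sketch for the helper above; read_table, TEST_SCHEMA, test_data_folder
# and the CSV fixture names are illustrative, not part of the original suite. Each join mode
# is checked against the equivalent pandas merge (our "outer" maps to pandas "left").
def test_join_pairs(self):
    left = self.test_data_folder / "left.csv"
    right = self.test_data_folder / "right.csv"
    for how in ("inner", "outer"):
        self._test_join_pair(read_table, TEST_SCHEMA, left, right, ["key"], how)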
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Tables which should be included in the combined output. Defaults to
            a known list of tables when not given.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)
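# Hypothetical usage sketch for merge_output_tables; the paths and table names below are
# illustrative, and index.csv must already exist under the tables folder.
merge_output_tables(
    Path("output/tables"),
    Path("output/main.csv"),
    drop_empty_columns=True,
    use_table_names=["index", "epidemiology", "demographics"],
)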
def _test_join_pair(
    self,
    read_table_: Callable,
    schema: Dict[str, str],
    left: Path,
    right: Path,
    on: List[str],
    how_mem: str,
    how_pandas: str,
):
    with temporary_directory() as workdir:
        output_file_1 = workdir / "output.1.csv"
        output_file_2 = workdir / "output.2.csv"

        # Join using our memory efficient method
        table_join(left, right, on, output_file_1, how=how_mem)

        # Join using the pandas method
        pandas_result = read_table_(left).merge(read_table_(right), on=on, how=how_pandas)
        export_csv(pandas_result, output_file_2, schema=schema)

        _compare_tables_equal(self, output_file_1, output_file_2)
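# _compare_tables_equal is used above but not defined in this snippet; a minimal sketch of
# what such a helper might look like, assuming both CSV files share the same column order
# and fit in memory. This is an illustrative reconstruction, not the original helper.
def _compare_tables_equal(test_case, table_1: Path, table_2: Path) -> None:
    with open(table_1) as fd1, open(table_2) as fd2:
        lines_1 = sorted(fd1.read().strip().split("\n"))
        lines_2 = sorted(fd2.read().strip().split("\n"))

    # Compare lines irrespective of order, since writing CSV output sometimes produces
    # out-of-order rows (see the in-memory variant of this test above)
    test_case.assertEqual(len(lines_1), len(lines_2))
    for line_1, line_2 in zip(lines_1, lines_2):
        test_case.assertEqual(line_1, line_2)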
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting flat table in CSV format.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = [date.date().isoformat() for date in date_range("2020-01-01", max_date)]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")
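# Hypothetical usage sketch; the paths are illustrative. Like merge_output_tables above,
# this expects index.csv to be present under the input folder.
if __name__ == "__main__":
    make_main_table(Path("output/tables"), Path("output/main.csv"))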