def publish_global_tables(
    tables_folder: Path,
    output_folder: Path,
    use_table_names: List[str],
    column_adapter: Dict[str, str],
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names
    to the requested schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Names of the tables to publish.
        column_adapter: Map of <old column name, new column name> used to convert the columns.
    """
    table_paths = _get_tables_in_folder(tables_folder, use_table_names)

    # Whether it's "key" or "location_key" depends on the schema
    location_key = "location_key" if "location_key" in column_adapter.values() else "key"

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, column_adapter)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name, [location_key])

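# `table_rename` is used above but not defined in this snippet. Below is a minimal
# sketch of what such a helper could look like, not the pipeline's actual
# implementation: it assumes the adapter maps old column names to new ones and that
# columns mapped to None (or absent from the adapter) should be dropped. The
# `_sketch` suffix marks it as hypothetical.
import csv
from pathlib import Path
from typing import Dict, Optional


def table_rename_sketch(
    input_path: Path, output_path: Path, column_adapter: Dict[str, Optional[str]]
) -> None:
    with open(input_path, newline="") as fd_in, open(output_path, "w", newline="") as fd_out:
        reader = csv.reader(fd_in)
        header = next(reader)
        # Indices of the columns we keep, paired with their new names
        keep = [
            (idx, column_adapter[name])
            for idx, name in enumerate(header)
            if column_adapter.get(name) is not None
        ]
        writer = csv.writer(fd_out)
        writer.writerow([name for _, name in keep])
        for record in reader:
            writer.writerow([record[idx] for idx, _ in keep])
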
def publish_global_tables(
    tables_folder: Path, output_folder: Path, use_table_names: List[str] = None
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names
    to the latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Names of the tables to publish; defaults to `V2_TABLE_LIST`.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name, ["location_key"])

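# `_get_tables_in_folder` is not shown in this snippet. A plausible minimal version,
# assuming it simply resolves the requested table names against the folder's CSV
# files and skips any that are missing (hypothetical, for illustration only):
from pathlib import Path
from typing import List


def _get_tables_in_folder_sketch(tables_folder: Path, use_table_names: List[str]) -> List[Path]:
    table_paths = [tables_folder / f"{name}.csv" for name in use_table_names]
    # Silently skip tables which are not present in the input folder
    return [path for path in table_paths if path.exists()]
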
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Tables which should be included in the combined output.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)

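# `_make_location_key_and_date_table` is not defined here, but `make_main_table`
# below builds the same kind of seed table by hand: every <location key x date>
# pair. A pandas-based sketch under that assumption, further assuming the index
# file carries a `location_key` column in this schema; the real helper likely
# streams to disk rather than holding the full product in memory. Requires
# pandas >= 1.2 for `how="cross"`.
import datetime
from pathlib import Path
from pandas import DataFrame, date_range, read_csv


def _make_location_key_and_date_table_sketch(index_path: Path, output_path: Path) -> None:
    keys = read_csv(index_path, usecols=["location_key"])
    max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
    dates = DataFrame(
        [date.date().isoformat() for date in date_range("2020-01-01", max_date)],
        columns=["date"],
    )
    # Cross join: every location key paired with every date
    keys.merge(dates, how="cross").to_csv(output_path, index=False)
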
def test_table_sort(self): test_csv = """col1,col2,col3 a,1,foo d,4,bar c,3,foo b,2,bar """ with TemporaryDirectory() as workdir: workdir = Path(workdir) input_file = workdir / "in.csv" with open(input_file, "w") as fd: for line in test_csv.split("\n"): if not line.isspace(): fd.write(f"{line.strip()}\n") # Sort using the default (first) column output_file_1 = workdir / "out.csv" table_sort(input_file, output_file_1) output_file_2 = workdir / "pandas.csv" read_table(input_file).sort_values(["col1"]).to_csv(output_file_2, index=False) for line1, line2 in zip(read_lines(output_file_1), read_lines(output_file_2)): self.assertEqual(line1.strip(), line2.strip()) # Sort by each column in order for sort_column in ("col1", "col2", "col3"): output_file_1 = workdir / "out.csv" table_sort(input_file, output_file_1, [sort_column]) output_file_2 = workdir / "pandas.csv" read_table(input_file).sort_values([sort_column ]).to_csv(output_file_2, index=False) for line1, line2 in zip(read_lines(output_file_1), read_lines(output_file_2)): self.assertEqual(line1.strip(), line2.strip())
def test_table_sort(self):
    test_csv = _make_test_csv_file(
        """
        col1,col2,col3
        a,1,foo
        d,4,bar
        c,3,foo
        b,2,bar
        """
    )

    with temporary_directory() as workdir:

        # Sort using the default (first) column
        output_file_1 = workdir / "out.csv"
        table_sort(test_csv, output_file_1)

        test_csv.seek(0)
        output_file_2 = workdir / "pandas.csv"
        read_table(test_csv, file_type="csv").sort_values(["col1"]).to_csv(
            output_file_2, index=False
        )

        _compare_tables_equal(self, output_file_1, output_file_2)

        # Sort by each column in order
        for sort_column in ("col1", "col2", "col3"):
            output_file_1 = workdir / f"1.{sort_column}.csv"
            table_sort(test_csv, output_file_1, [sort_column])

            test_csv.seek(0)
            output_file_2 = workdir / f"2.{sort_column}.csv"
            read_table(test_csv, file_type="csv").sort_values([sort_column]).to_csv(
                output_file_2, index=False
            )

            _compare_tables_equal(self, output_file_1, output_file_2)

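# The helpers used by the test above are not shown. Plausible minimal versions,
# assuming `_make_test_csv_file` returns a seekable file-like object containing
# the dedented CSV and `_compare_tables_equal` asserts two files match line by
# line (hypothetical sketches, not the test suite's actual helpers):
from io import StringIO
from textwrap import dedent


def _make_test_csv_file_sketch(csv_text: str) -> StringIO:
    lines = [line.strip() for line in dedent(csv_text).split("\n")]
    return StringIO("\n".join(line for line in lines if line) + "\n")


def _compare_tables_equal_sketch(test_case, path_1, path_2) -> None:
    with open(path_1) as fd_1, open(path_2) as fd_2:
        lines_1 = [line.strip() for line in fd_1]
        lines_2 = [line.strip() for line in fd_2]
    # Comparing the full lists also catches tables of different lengths
    test_case.assertEqual(lines_1, lines_2)
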
def publish_global_tables(tables_folder: Path, output_folder: Path) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names
    to the latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    table_paths = list(tables_folder.glob("*.csv"))

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name, ["location_key"])

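# `table_sort` is central to every function in this snippet but not defined here.
# A minimal in-memory sketch, assuming it sorts a CSV by the given columns and
# defaults to the first column when none are given (consistent with the tests
# above); the real pipeline presumably streams to handle tables larger than
# memory, so this is illustrative only.
from pathlib import Path
from typing import List, Optional
from pandas import read_csv


def table_sort_sketch(
    input_path: Path, output_path: Path, sort_columns: Optional[List[str]] = None
) -> None:
    # Read everything as strings so values round-trip through the CSV unchanged
    table = read_csv(input_path, dtype=str, keep_default_na=False)
    sort_columns = sort_columns or [table.columns[0]]
    table.sort_values(sort_columns).to_csv(output_path, index=False)
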
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = [date.date().isoformat() for date in date_range("2020-01-01", max_date)]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")

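# `table_cross_product` is used above to pair every key with every date. A sketch
# of that idea, assuming each input CSV has a single header row and the output is
# the cartesian product of their records (hypothetical; the real helper may
# differ). The right table (here, the dates) is held in memory, which is fine
# because it is small relative to the key table.
import csv
from pathlib import Path


def table_cross_product_sketch(left_path: Path, right_path: Path, output_path: Path) -> None:
    with open(right_path, newline="") as fd:
        right_rows = list(csv.reader(fd))
    right_header, right_records = right_rows[0], right_rows[1:]

    with open(left_path, newline="") as fd_in, open(output_path, "w", newline="") as fd_out:
        reader = csv.reader(fd_in)
        writer = csv.writer(fd_out)
        writer.writerow(next(reader) + right_header)
        # Stream the left table one record at a time to keep memory usage flat
        for left_record in reader:
            for right_record in right_records:
                writer.writerow(left_record + right_record)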