def test_table_rename(self):
    test_csv = """col1,col2,col3
    a,1,foo
    b,2,bar
    c,3,foo
    d,4,bar
    """
    expected = """cola,colb
    a,1
    b,2
    c,3
    d,4
    """
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        input_file = workdir / "in.csv"
        with open(input_file, "w") as fd:
            for line in test_csv.split("\n"):
                if not line.isspace():
                    fd.write(f"{line.strip()}\n")

        output_file = workdir / "out.csv"
        table_rename(input_file, output_file, {"col1": "cola", "col2": "colb", "col3": None})

        for line1, line2 in zip(expected.split("\n"), read_lines(output_file)):
            self.assertEqual(line1.strip(), line2.strip())
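# The dictionary passed to `table_rename` above encodes the rename contract this test relies
# on: keys mapped to a new name are renamed and keys mapped to None are dropped (columns not
# mentioned are assumed to pass through unchanged). The sketch below illustrates that assumed
# behavior with the csv module; it is not the project's actual `table_rename` implementation.
import csv
from pathlib import Path
from typing import Dict, Optional


def _rename_csv_columns_sketch(
    input_file: Path, output_file: Path, adapter: Dict[str, Optional[str]]
) -> None:
    with open(input_file, newline="") as fd_in, open(output_file, "w", newline="") as fd_out:
        reader = csv.DictReader(fd_in)
        # Columns mapped to None are dropped; unmapped columns keep their original name
        out_columns = [adapter.get(col, col) for col in reader.fieldnames if adapter.get(col, col)]
        writer = csv.DictWriter(fd_out, fieldnames=out_columns)
        writer.writeheader()
        for record in reader:
            writer.writerow(
                {adapter.get(col, col): value for col, value in record.items()
                 if adapter.get(col, col) in out_columns}
            )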
def publish_global_tables(
    tables_folder: Path,
    output_folder: Path,
    use_table_names: List[str],
    column_adapter: Dict[str, str],
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names
    to the requested schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Names of the tables to publish from `tables_folder`.
        column_adapter: Map of input column names to their names in the requested schema.
    """
    table_paths = _get_tables_in_folder(tables_folder, use_table_names)

    # Whether it's "key" or "location_key" depends on the schema
    location_key = "location_key" if "location_key" in column_adapter.values() else "key"

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, column_adapter)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name, [location_key])
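# A hypothetical invocation of the schema-aware `publish_global_tables` defined above. The
# folder paths, table names, and adapter contents are illustrative assumptions rather than
# values taken from the project configuration; note how mapping "key" to "location_key" also
# determines which column the output tables are sorted by.
from pathlib import Path

publish_global_tables(
    tables_folder=Path("intermediate"),
    output_folder=Path("public/v3"),
    use_table_names=["epidemiology", "hospitalizations"],
    column_adapter={"key": "location_key", "date": "date", "new_confirmed": "new_confirmed"},
)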
def test_table_file_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"

        # Verify that all tables were imported
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, table_name=table_name)
                self._check_table_not_empty(conn, table_name)

                # Dirty hack used to compare appropriate column names. Ideally this would be
                # handled by the SQL module, which should convert the table and column names to
                # whatever they were prior to sanitizing them.
                temp_file_path_1 = workdir / f"{table_name}.1.csv"
                column_adapter = {
                    col: _safe_column_name(col).replace("[", "").replace("]", "")
                    for col in get_table_columns(table_path)
                }
                table_rename(table_path, temp_file_path_1, column_adapter)

                temp_file_path_2 = workdir / f"{table_name}.2.csv"
                table_export_csv(conn, table_name, temp_file_path_2)
                _compare_tables_equal(self, temp_file_path_1, temp_file_path_2)
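# `_compare_tables_equal` is a shared helper defined elsewhere in the test suite. A minimal
# sketch of what such a helper could look like is shown below, under the assumption that it
# asserts the two CSV files match header-for-header and row-for-row; the real helper may
# normalize values or ordering differently.
import csv
from itertools import zip_longest
from pathlib import Path


def _compare_tables_equal_sketch(test_case, left: Path, right: Path) -> None:
    with open(left, newline="") as fd_left, open(right, newline="") as fd_right:
        rows_left, rows_right = csv.reader(fd_left), csv.reader(fd_right)
        # zip_longest surfaces length mismatches as a failing comparison against None
        for row_left, row_right in zip_longest(rows_left, rows_right):
            test_case.assertEqual(row_left, row_right)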
def publish_global_tables(
    tables_folder: Path, output_folder: Path, use_table_names: List[str] = None
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names to
    the latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Names of the tables to publish; defaults to `V2_TABLE_LIST` when not given.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name, ["location_key"])
def publish_subset_latest(
    tables_folder: Path, output_folder: Path, key: str = "location_key", **tqdm_kwargs
) -> List[Path]:
    """
    This method outputs the latest record by date per location key for each of the input tables.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [table for table in tables_folder.glob("*.csv") if table.stem != agg_table_name]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter), desc="Creating latest subsets", **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv") if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
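# `_grouped_subset_latest` and `_latest_date_by_group` are helpers defined elsewhere. The
# sketch below shows one plausible way to derive the latest record per location key from a
# CSV table, which is the behavior the docstring describes; it is an illustrative assumption,
# not the project's actual implementation.
import csv
from pathlib import Path
from typing import Dict


def _latest_record_per_key_sketch(table_path: Path, key: str = "location_key") -> Dict[str, dict]:
    latest: Dict[str, dict] = {}
    with open(table_path, newline="") as fd:
        for record in csv.DictReader(fd):
            current = latest.get(record[key])
            # Keep the record with the greatest date for each key; ISO dates compare lexically
            if current is None or record.get("date", "") >= current.get("date", ""):
                latest[record[key]] = record
    return latest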
def test_table_rename(self):
    test_csv = _make_test_csv_file(
        """
        col1,col2,col3
        a,1,foo
        b,2,bar
        c,3,foo
        d,4,bar
        """
    )
    expected = _make_test_csv_file(
        """
        cola,colb
        a,1
        b,2
        c,3
        d,4
        """
    )
    with temporary_file() as output_file:
        table_rename(test_csv, output_file, {"col1": "cola", "col2": "colb", "col3": None})
        _compare_tables_equal(self, output_file, expected)
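# `_make_test_csv_file` and `temporary_file` are shared test helpers defined elsewhere in the
# suite. A minimal sketch of the CSV helper is shown below, assuming it writes the dedented,
# blank-line-free string to a temporary file and returns its path; the real helper may handle
# cleanup and formatting differently.
from pathlib import Path
from tempfile import mkstemp
from textwrap import dedent


def _make_test_csv_file_sketch(csv_contents: str) -> Path:
    _, file_name = mkstemp(suffix=".csv")
    file_path = Path(file_name)
    lines = [line.strip() for line in dedent(csv_contents).split("\n") if line.strip()]
    file_path.write_text("\n".join(lines) + "\n")
    return file_path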
def publish_global_tables(tables_folder: Path, output_folder: Path) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names to
    the latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    table_paths = list(tables_folder.glob("*.csv"))

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name, ["location_key"])