def test_table_records_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        schema = {_safe_column_name(col): dtype for col, dtype in get_schema().items()}

        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, schema=schema)

                # Export the records to a list
                records_output_1 = list(table_select_all(conn, table_name))

                # Import the list of records
                table_name_2 = table_name + "_new"
                table_import_from_records(conn, table_name_2, records_output_1, schema=schema)

                # Re-export the records as a list
                records_output_2 = list(table_select_all(conn, table_name_2))

                for record1, record2 in zip(records_output_1, records_output_2):
                    self.assertDictEqual(record1, record2)
def test_table_file_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"

        # Verify that all tables were imported
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, table_name=table_name)
                self._check_table_not_empty(conn, table_name)

                # Dirty hack used to compare appropriate column names. Ideally this would be
                # handled by the SQL module, which should convert the table and column names to
                # whatever they were prior to sanitizing them.
                temp_file_path_1 = workdir / f"{table_name}.1.csv"
                column_adapter = {
                    col: _safe_column_name(col).replace("[", "").replace("]", "")
                    for col in get_table_columns(table_path)
                }
                table_rename(table_path, temp_file_path_1, column_adapter)

                temp_file_path_2 = workdir / f"{table_name}.2.csv"
                table_export_csv(conn, table_name, temp_file_path_2)
                _compare_tables_equal(self, temp_file_path_1, temp_file_path_2)
def test_import_tables_into_sqlite(self):
    with temporary_directory() as workdir:
        intermediate = workdir / "intermediate"
        intermediate.mkdir(parents=True, exist_ok=True)

        # Copy all test tables into the temporary directory
        publish_global_tables(SRC / "test" / "data", intermediate, use_table_names=V3_TABLE_LIST)

        # Create the SQLite file and open it
        sqlite_output = workdir / "database.sqlite"
        table_paths = list(intermediate.glob("*.csv"))
        import_tables_into_sqlite(table_paths, sqlite_output)
        with create_sqlite_database(sqlite_output) as conn:

            # Verify that each table contains all the data
            for table in table_paths:
                temp_path = workdir / f"{table.stem}.csv"
                table_export_csv(conn, _safe_table_name(table.stem), temp_path)
                _compare_tables_equal(self, table, temp_path)
def merge_output_tables_sqlite(
    tables_folder: Path,
    output_path: Path,
    sqlite_file: Path = None,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Folder containing the CSV files to join into a single table.
        output_path: Output path for the resulting CSV file.
        sqlite_file: Path for the SQLite database to use for importing data, defaults to a
            temporary database on disk.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Names of the tables to merge, defaults to `V2_TABLE_LIST`.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use two temporary tables as I/O for intermediate operations
        temp_table_input, temp_table_output = "tmp_table_name_1", "tmp_table_name_2"

        # Start with all combinations of <location key x date>
        keys_and_dates_table_path = workdir / f"{temp_table_input}.csv"
        _logger.log_info("Creating keys and dates table")
        index_table = [table for table in table_paths if table.stem == "index"][0]
        _make_location_key_and_date_table(index_table, keys_and_dates_table_path)

        # Create an SQLite database
        _logger.log_info("Importing all tables into SQLite")
        database_file = sqlite_file or workdir / "database.sqlite"
        import_tables_into_sqlite([keys_and_dates_table_path] + table_paths, database_file)

        with create_sqlite_database(database_file) as conn:
            _logger.log_info("Merging all tables into a flat output")
            for table in table_paths:
                _logger.log_info(f"Merging {table.stem}")

                # Read the table's header to determine how to merge it
                table_name = _safe_table_name(table.stem)
                table_columns = get_table_columns(table)
                join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

                # Join with the current intermediate table
                sql_table_join(
                    conn,
                    left=temp_table_input,
                    right=table_name,
                    on=join_on,
                    how="left outer",
                    into_table=temp_table_output,
                )

                # Flip-flop the I/O tables to avoid a copy
                temp_table_input, temp_table_output = temp_table_output, temp_table_input

            sort_values = ("location_key", "date")
            _logger.log_info("Exporting output as CSV")
            sql_export_csv(conn, temp_table_input, output_path=output_path, sort_by=sort_values)

            # Remove the intermediate tables from the SQLite database
            sql_table_drop(conn, temp_table_input)
            sql_table_drop(conn, temp_table_output)
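
# Usage sketch (not part of the library): a minimal, hypothetical example of how
# merge_output_tables_sqlite might be invoked. The folder and file paths below are
# placeholder assumptions; the defaults (V2_TABLE_LIST and a temporary SQLite file)
# are assumed to be acceptable for the run.
if __name__ == "__main__":
    # Hypothetical folder holding index.csv plus the other published tables
    example_tables_folder = Path("output") / "tables"
    # Hypothetical destination for the flat, merged CSV view
    example_output_path = Path("output") / "main.csv"

    # Leaving sqlite_file unset lets the function create a throwaway database inside a
    # temporary directory; pass an explicit path instead to keep the database for inspection.
    merge_output_tables_sqlite(example_tables_folder, example_output_path)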