def test_table_file_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"

        # Verify that all tables were imported
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, table_name=table_name)
                self._check_table_not_empty(conn, table_name)

                # Dirty hack used to compare appropriate column names. Ideally this
                # would be handled by the SQL module, which should convert the table
                # and column names to whatever they were prior to sanitizing them.
                temp_file_path_1 = workdir / f"{table_name}.1.csv"
                column_adapter = {
                    col: _safe_column_name(col).replace("[", "").replace("]", "")
                    for col in get_table_columns(table_path)
                }
                table_rename(table_path, temp_file_path_1, column_adapter)

                temp_file_path_2 = workdir / f"{table_name}.2.csv"
                table_export_csv(conn, table_name, temp_file_path_2)
                _compare_tables_equal(self, temp_file_path_1, temp_file_path_2)
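# Note on the "dirty hack" above: `_safe_column_name` appears to wrap identifiers in
# square brackets (SQLite-style quoting), which is why the brackets are stripped before
# comparing against the exported CSV header. A minimal sketch of sanitizers consistent
# with that behavior; the names below are hypothetical and the SQL module's actual
# implementation may differ:
import re

def _sketch_safe_table_name(name: str) -> str:
    # Replace characters that are not valid in an unquoted SQLite identifier
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)

def _sketch_safe_column_name(name: str) -> str:
    # Quote the sanitized column name using SQLite's square-bracket syntax
    return f"[{re.sub(r'[^a-zA-Z0-9_]', '_', name)}]"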
def test_create_sqlite_db_file(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        sqlite_file = workdir / "tmp.sqlite"
        with create_sqlite_database(db_file=sqlite_file) as conn:
            self.assertEqual(conn.execute("SELECT 1").fetchone()[0], 1)
def test_table_records_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        schema = {_safe_column_name(col): dtype for col, dtype in get_schema().items()}
        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, table_name=table_name, schema=schema)

                # Export the records to a list
                records_output_1 = list(table_select_all(conn, table_name))

                # Import the list of records
                table_name_2 = table_name + "_new"
                table_import_from_records(conn, table_name_2, records_output_1, schema=schema)

                # Re-export the records as a list
                records_output_2 = list(table_select_all(conn, table_name_2))

                for record1, record2 in zip(records_output_1, records_output_2):
                    self.assertDictEqual(record1, record2)
def test_import_tables_into_sqlite(self):
    with temporary_directory() as workdir:
        intermediate = workdir / "intermediate"
        intermediate.mkdir(parents=True, exist_ok=True)

        # Copy all test tables into the temporary directory
        publish_global_tables(SRC / "test" / "data", intermediate, use_table_names=V3_TABLE_LIST)

        # Create the SQLite file and open it
        sqlite_output = workdir / "database.sqlite"
        table_paths = list(intermediate.glob("*.csv"))
        import_tables_into_sqlite(table_paths, sqlite_output)
        with create_sqlite_database(sqlite_output) as conn:

            # Verify that each table contains all the data
            for table in table_paths:
                temp_path = workdir / f"{table.stem}.csv"
                table_export_csv(conn, _safe_table_name(table.stem), temp_path)
                _compare_tables_equal(self, table, temp_path)
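# `_compare_tables_equal` is the shared helper used by the reimport tests above. A
# minimal sketch of what it might do, assuming both files are CSV tables with a header
# row; the real helper may normalize column order or data types differently:
import csv

def _sketch_compare_tables_equal(test_case, table_path_1, table_path_2):
    with open(table_path_1, newline="") as f1, open(table_path_2, newline="") as f2:
        rows1 = list(csv.DictReader(f1))
        rows2 = list(csv.DictReader(f2))
    # Both tables must have the same number of records, with identical contents
    test_case.assertEqual(len(rows1), len(rows2))
    for row1, row2 in zip(rows1, rows2):
        test_case.assertDictEqual(row1, row2)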
def import_tables_into_sqlite(table_paths: List[Path], output_path: Path) -> None:
    """
    Import all the given CSV tables into a SQLite database at the provided path.

    Arguments:
        table_paths: List of CSV files to import as individual tables.
        output_path: Output path for the resulting SQLite file.
    """
    # Import all tables into a database on disk at the provided path
    with create_sqlite_database(output_path) as conn:
        # Get the data types for all known columns
        schema = get_schema()
        for table_file_path in table_paths:
            table_name = table_file_path.stem
            _logger.log_info(f"Importing {table_name} into SQLite")
            table_columns = get_table_columns(table_file_path)
            # Default to str for columns not present in the known schema
            table_schema = {col: schema.get(col, str) for col in table_columns}
            table_import_from_file(conn, table_file_path, table_name=table_name, schema=table_schema)
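# Example usage of `import_tables_into_sqlite`; the folder and file names below are
# illustrative, not part of the repository:
#
#   table_paths = list(Path("tables").glob("*.csv"))
#   import_tables_into_sqlite(table_paths, Path("output/database.sqlite"))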
def test_create_sqlite_db_in_memory(self):
    with create_sqlite_database() as conn:
        self.assertEqual(conn.execute("SELECT 1").fetchone()[0], 1)
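# `create_sqlite_database` supports both in-memory (no arguments) and file-backed
# databases, as exercised by the two tests above. A minimal sketch of a compatible
# context manager, assuming it simply wraps `sqlite3.connect`; the module's actual
# implementation may add pragmas or other cleanup logic:
import sqlite3
from contextlib import contextmanager

@contextmanager
def _sketch_create_sqlite_database(db_file=None):
    # Default to an in-memory database when no file path is given
    conn = sqlite3.connect(str(db_file) if db_file else ":memory:")
    try:
        yield conn
        conn.commit()
    finally:
        conn.close()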
def _test_table_merge(self, how_sqlite: str, how_pandas: str):
    test_data_1 = DataFrame.from_records([
        {"col1": "a", "col2": "1"},
        {"col1": "a", "col2": "2"},
        {"col1": "b", "col2": "3"},
        {"col1": "b", "col2": "4"},
        {"col1": "c", "col2": "5"},
        {"col1": "c", "col2": "6"},
    ])
    test_data_2 = DataFrame.from_records([
        {"col1": "a", "col3": "foo"},
        {"col1": "b", "col3": "bar"},
        {"col1": "c", "col3": "baz"},
    ])
    test_data_3 = DataFrame.from_records([
        {"col1": "a", "col4": "apple"},
        {"col1": "b", "col4": "banana"},
        {"col1": "c", "col4": "orange"},
    ])

    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        sqlite_file = workdir / "tmp.sqlite"
        with create_sqlite_database(db_file=sqlite_file) as conn:
            table_name_1 = "_1"
            table_name_2 = "_2"
            table_name_3 = "_3"
            table_create(conn, table_name_1, {"col1": "TEXT", "col2": "TEXT"})
            table_create(conn, table_name_2, {"col1": "TEXT", "col3": "TEXT"})
            table_create(conn, table_name_3, {"col1": "TEXT", "col4": "TEXT"})

            table_import_from_records(conn, table_name_1, _dataframe_records_iterator(test_data_1))
            table_import_from_records(conn, table_name_2, _dataframe_records_iterator(test_data_2))
            table_import_from_records(conn, table_name_3, _dataframe_records_iterator(test_data_3))

            self._check_table_not_empty(conn, table_name_1)
            self._check_table_not_empty(conn, table_name_2)
            self._check_table_not_empty(conn, table_name_3)

            expected = table_merge_pandas(
                [test_data_1, test_data_2, test_data_3], on=["col1"], how=how_pandas
            )

            # Merge and output as an iterable
            result1 = DataFrame.from_records(
                table_merge_sql(
                    conn,
                    [table_name_1, table_name_2, table_name_3],
                    on=["col1"],
                    how=how_sqlite,
                )
            )
            self._compare_dataframes_equal(result1, expected)

            # Merge into a table, and output its data
            table_name_merged = "_merged"
            table_merge_sql(
                conn,
                [table_name_1, table_name_2, table_name_3],
                on=["col1"],
                how=how_sqlite,
                into_table=table_name_merged,
            )
            result2 = DataFrame.from_records(table_select_all(conn, table_name_merged))
            self._compare_dataframes_equal(result2, expected)
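# `table_merge_pandas` produces the expected result that `table_merge_sql` is compared
# against. A minimal sketch, assuming it simply chains pairwise pandas merges over the
# list of DataFrames; the sketch name is hypothetical:
from functools import reduce

def _sketch_table_merge_pandas(dataframes, on, how):
    # Fold the list of DataFrames into a single one by merging them pairwise
    return reduce(lambda left, right: left.merge(right, on=on, how=how), dataframes)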
def _test_table_join(self, how_sqlite: str, how_pandas: str):
    test_data_left = DataFrame.from_records([
        {"col1": "a", "col2": "1"},
        {"col1": "a", "col2": "2"},
        {"col1": "b", "col2": "3"},
        {"col1": "b", "col2": "4"},
        {"col1": "c", "col2": "5"},
        {"col1": "c", "col2": "6"},
    ])
    test_data_right = DataFrame.from_records([
        {"col1": "a", "col3": "foo"},
        {"col1": "b", "col3": "bar"},
        {"col1": "c", "col3": "baz"},
    ])

    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        sqlite_file = workdir / "tmp.sqlite"
        with create_sqlite_database(db_file=sqlite_file) as conn:
            table_name_left = "_left"
            table_name_right = "_right"
            table_create(conn, table_name_left, {"col1": "TEXT", "col2": "TEXT"})
            table_create(conn, table_name_right, {"col1": "TEXT", "col3": "TEXT"})

            table_import_from_records(conn, table_name_left, _dataframe_records_iterator(test_data_left))
            table_import_from_records(conn, table_name_right, _dataframe_records_iterator(test_data_right))

            self._check_table_not_empty(conn, table_name_left)
            self._check_table_not_empty(conn, table_name_right)

            expected = test_data_left.merge(test_data_right, on=["col1"], how=how_pandas)

            # Merge and output as an iterable
            result1 = DataFrame.from_records(
                table_join_sql(conn, table_name_left, table_name_right, on=["col1"], how=how_sqlite)
            )
            self._compare_dataframes_equal(result1, expected)

            # Merge into a table, and output its data
            table_name_merged = "_merged"
            table_join_sql(
                conn,
                table_name_left,
                table_name_right,
                on=["col1"],
                how=how_sqlite,
                into_table=table_name_merged,
            )
            result2 = DataFrame.from_records(table_select_all(conn, table_name_merged))
            self._compare_dataframes_equal(result2, expected)
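# `_dataframe_records_iterator` feeds DataFrame rows into `table_import_from_records`
# in both tests above. A minimal sketch, assuming the importer expects one dict per row
# (equivalent to pandas' `DataFrame.to_dict(orient="records")`, but lazy):
def _sketch_dataframe_records_iterator(dataframe):
    for _, row in dataframe.iterrows():
        yield row.to_dict()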
def merge_output_tables_sqlite(
    tables_folder: Path,
    output_path: Path,
    sqlite_file: Path = None,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Folder containing the CSV files to join into a single table.
        output_path: Output path for the resulting CSV file.
        sqlite_file: Path for the SQLite database to use for importing data, defaults to a
            temporary database on disk.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Subset of table names to merge, defaults to `V2_TABLE_LIST`.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use two temporary tables as I/O for intermediate operations
        temp_table_input, temp_table_output = "tmp_table_name_1", "tmp_table_name_2"

        # Start with all combinations of <location key x date>
        keys_and_dates_table_path = workdir / f"{temp_table_input}.csv"
        _logger.log_info("Creating keys and dates table")
        index_table = [table for table in table_paths if table.stem == "index"][0]
        _make_location_key_and_date_table(index_table, keys_and_dates_table_path)

        # Create an SQLite database
        _logger.log_info("Importing all tables into SQLite")
        database_file = sqlite_file or workdir / "database.sqlite"
        import_tables_into_sqlite([keys_and_dates_table_path] + table_paths, database_file)

        with create_sqlite_database(database_file) as conn:
            _logger.log_info("Merging all tables into a flat output")
            for table in table_paths:
                _logger.log_info(f"Merging {table.stem}")

                # Read the table's header to determine how to merge it
                table_name = _safe_table_name(table.stem)
                table_columns = get_table_columns(table)
                join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

                # Join with the current intermediate table
                table_join_sql(
                    conn,
                    left=temp_table_input,
                    right=table_name,
                    on=join_on,
                    how="left outer",
                    into_table=temp_table_output,
                )

                # Flip-flop the I/O tables to avoid a copy
                temp_table_input, temp_table_output = temp_table_output, temp_table_input

            sort_values = ("location_key", "date")
            _logger.log_info("Exporting output as CSV")
            table_export_csv(conn, temp_table_input, output_path=output_path, sort_by=sort_values)

            # Remove the intermediate tables from the SQLite database
            table_drop(conn, temp_table_input)
            table_drop(conn, temp_table_output)
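# Example usage of `merge_output_tables_sqlite`, assuming `tables_folder` contains the
# v2 tables including index.csv; the paths below are illustrative:
#
#   merge_output_tables_sqlite(Path("v2"), Path("v2/main.csv"))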