Example #1
    def test_table_file_reimport(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            sqlite_file = workdir / "tmp.sqlite"
            tables_folder = SRC / "test" / "data"

            # Import all tables and verify that they are not empty
            with create_sqlite_database(db_file=sqlite_file) as conn:
                for table_path in tables_folder.glob("*.csv"):
                    table_name = _safe_table_name(table_path.stem)
                    table_import_from_file(conn,
                                           table_path,
                                           table_name=table_name)
                    self._check_table_not_empty(conn, table_name)

                    # Dirty hack used to compare appropriate column names. Ideally this would be
                    # handled by the SQL module, which should convert the table and column names to
                    # whatever they were prior to sanitizing them.
                    temp_file_path_1 = workdir / f"{table_name}.1.csv"
                    column_adapter = {
                        col: _safe_column_name(col).replace("[", "").replace("]", "")
                        for col in get_table_columns(table_path)
                    }
                    table_rename(table_path, temp_file_path_1, column_adapter)

                    temp_file_path_2 = workdir / f"{table_name}.2.csv"
                    table_export_csv(conn, table_name, temp_file_path_2)
                    _compare_tables_equal(self, temp_file_path_1,
                                          temp_file_path_2)
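Note: _safe_table_name and _safe_column_name are internal helpers of the module under test. As a rough sketch of the kind of sanitization they presumably perform (the regex below is an assumption, not the repo's actual implementation; the column variant apparently also wraps names in brackets, which is why the test strips "[" and "]" before comparing):

import re

def _safe_name_sketch(name: str) -> str:
    # Hypothetical sanitizer: lowercase the name and replace every character
    # that is not a valid identifier character with an underscore, producing
    # a name that can be used directly in SQL statements.
    return re.sub(r"[^a-zA-Z0-9_]", "_", name.lower())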
Example #2
    def test_create_sqlite_db_file(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            sqlite_file = workdir / "tmp.sqlite"
            with create_sqlite_database(db_file=sqlite_file) as conn:
                self.assertEqual(conn.execute("SELECT 1").fetchone()[0], 1)
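create_sqlite_database is the helper exercised by this test. A minimal sketch of such a context manager, assuming it opens an on-disk database when db_file is given and an in-memory one otherwise (the real helper may also configure pragmas or error handling):

import sqlite3
from contextlib import contextmanager
from pathlib import Path

@contextmanager
def create_sqlite_database_sketch(db_file: Path = None):
    # Hypothetical equivalent: connect to the file if one is provided,
    # otherwise to a transient in-memory database, committing on success
    # and always closing the connection.
    conn = sqlite3.connect(str(db_file) if db_file else ":memory:")
    try:
        yield conn
        conn.commit()
    finally:
        conn.close()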
Example #3
    def test_table_records_reimport(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            schema = {
                _safe_column_name(col): dtype
                for col, dtype in get_schema().items()
            }
            sqlite_file = workdir / "tmp.sqlite"
            tables_folder = SRC / "test" / "data"
            with create_sqlite_database(db_file=sqlite_file) as conn:
                for table_path in tables_folder.glob("*.csv"):
                    table_name = _safe_table_name(table_path.stem)
                    table_import_from_file(conn,
                                           table_path,
                                           table_name=table_name,
                                           schema=schema)

                    # Export the records to a list
                    records_output_1 = list(table_select_all(conn, table_name))

                    # Import the list of records
                    table_name_2 = table_name + "_new"
                    table_import_from_records(conn,
                                              table_name_2,
                                              records_output_1,
                                              schema=schema)

                    # Re-export the records as a list
                    records_output_2 = list(
                        table_select_all(conn, table_name_2))

                    for record1, record2 in zip(records_output_1,
                                                records_output_2):
                        self.assertDictEqual(record1, record2)
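table_select_all is expected to yield one dict per row, which is what makes the assertDictEqual comparison above possible. A minimal sketch of that behavior, assuming plain sqlite3 underneath (hypothetical, not the repo's implementation):

import sqlite3
from typing import Dict, Iterator

def table_select_all_sketch(conn: sqlite3.Connection, table_name: str) -> Iterator[Dict]:
    # Hypothetical equivalent: stream each row of the table as a dict keyed
    # by column name, in column order.
    cursor = conn.execute(f"SELECT * FROM [{table_name}]")
    columns = [desc[0] for desc in cursor.description]
    for row in cursor:
        yield dict(zip(columns, row))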
Example #4
    def test_import_tables_into_sqlite(self):
        with temporary_directory() as workdir:
            intermediate = workdir / "intermediate"
            intermediate.mkdir(parents=True, exist_ok=True)

            # Copy all test tables into the temporary directory
            publish_global_tables(
                SRC / "test" / "data", intermediate, use_table_names=V3_TABLE_LIST
            )

            # Create the SQLite file and open it
            sqlite_output = workdir / "database.sqlite"
            table_paths = list(intermediate.glob("*.csv"))
            import_tables_into_sqlite(table_paths, sqlite_output)
            with create_sqlite_database(sqlite_output) as conn:

                # Verify that each table contains all the data
                for table in table_paths:
                    temp_path = workdir / f"{table.stem}.csv"
                    table_export_csv(conn, _safe_table_name(table.stem), temp_path)

                    _compare_tables_equal(self, table, temp_path)
Example #5
def import_tables_into_sqlite(table_paths: List[Path],
                              output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        table_paths: List of CSV files to join into a single table.
        output_path: Output path for the resulting SQLite file.
    """
    # Import all tables into a database on disk at the provided path
    with create_sqlite_database(output_path) as conn:

        # Get the schema used to determine the type of each table's columns
        schema = get_schema()
        for table_file_path in table_paths:
            table_name = table_file_path.stem
            _logger.log_info(f"Importing {table_name} into SQLite")
            table_columns = get_table_columns(table_file_path)
            table_schema = {col: schema.get(col, str) for col in table_columns}
            table_import_from_file(conn,
                                   table_file_path,
                                   table_name=table_name,
                                   schema=table_schema)
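A typical call looks like the following; the folder path is illustrative, and any set of CSV files works since columns missing from get_schema() default to str:

from pathlib import Path

# Illustrative input folder containing the CSV tables to import.
csv_folder = Path("/tmp/tables")
table_paths = list(csv_folder.glob("*.csv"))
import_tables_into_sqlite(table_paths, csv_folder / "database.sqlite")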
Example #6
    def test_create_sqlite_db_in_memory(self):
        with create_sqlite_database() as conn:
            self.assertEqual(conn.execute("SELECT 1").fetchone()[0], 1)
Example #7
    def _test_table_merge(self, how_sqlite: str, how_pandas: str):
        test_data_1 = DataFrame.from_records([
            {"col1": "a", "col2": "1"},
            {"col1": "a", "col2": "2"},
            {"col1": "b", "col2": "3"},
            {"col1": "b", "col2": "4"},
            {"col1": "c", "col2": "5"},
            {"col1": "c", "col2": "6"},
        ])

        test_data_2 = DataFrame.from_records([
            {"col1": "a", "col3": "foo"},
            {"col1": "b", "col3": "bar"},
            {"col1": "c", "col3": "baz"},
        ])

        test_data_3 = DataFrame.from_records([
            {"col1": "a", "col4": "apple"},
            {"col1": "b", "col4": "banana"},
            {"col1": "c", "col4": "orange"},
        ])

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            sqlite_file = workdir / "tmp.sqlite"
            with create_sqlite_database(db_file=sqlite_file) as conn:
                table_name_1 = "_1"
                table_name_2 = "_2"
                table_name_3 = "_3"

                table_create(conn, table_name_1, {"col1": "TEXT", "col2": "TEXT"})
                table_create(conn, table_name_2, {"col1": "TEXT", "col3": "TEXT"})
                table_create(conn, table_name_3, {"col1": "TEXT", "col4": "TEXT"})
                table_import_from_records(
                    conn, table_name_1,
                    _dataframe_records_iterator(test_data_1))
                table_import_from_records(
                    conn, table_name_2,
                    _dataframe_records_iterator(test_data_2))
                table_import_from_records(
                    conn, table_name_3,
                    _dataframe_records_iterator(test_data_3))

                self._check_table_not_empty(conn, table_name_1)
                self._check_table_not_empty(conn, table_name_2)
                self._check_table_not_empty(conn, table_name_3)

                expected = table_merge_pandas(
                    [test_data_1, test_data_2, test_data_3],
                    on=["col1"],
                    how=how_pandas)

                # Merge and output as an iterable
                result1 = DataFrame.from_records(
                    table_merge_sql(
                        conn,
                        [table_name_1, table_name_2, table_name_3],
                        on=["col1"],
                        how=how_sqlite,
                    ))
                self._compare_dataframes_equal(result1, expected)

                # Merge into a table, and output its data
                table_name_merged = "_merged"
                table_merge_sql(
                    conn,
                    [table_name_1, table_name_2, table_name_3],
                    on=["col1"],
                    how=how_sqlite,
                    into_table=table_name_merged,
                )
                result2 = DataFrame.from_records(
                    table_select_all(conn, table_name_merged))
                self._compare_dataframes_equal(result2, expected)
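For reference, the table_merge_pandas baseline that the SQL merge is compared against can be thought of as a fold of pairwise merges; this is a sketch of the expected semantics, not necessarily the repo's implementation:

from functools import reduce
from typing import List
from pandas import DataFrame

def table_merge_pandas_sketch(dataframes: List[DataFrame], on: List[str], how: str) -> DataFrame:
    # Merge the tables pairwise on the shared key columns, left to right,
    # mirroring the multi-table SQL merge being tested above.
    return reduce(lambda left, right: left.merge(right, on=on, how=how), dataframes)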
Example #8
    def _test_table_join(self, how_sqlite: str, how_pandas: str):
        test_data_left = DataFrame.from_records([
            {"col1": "a", "col2": "1"},
            {"col1": "a", "col2": "2"},
            {"col1": "b", "col2": "3"},
            {"col1": "b", "col2": "4"},
            {"col1": "c", "col2": "5"},
            {"col1": "c", "col2": "6"},
        ])

        test_data_right = DataFrame.from_records([
            {"col1": "a", "col3": "foo"},
            {"col1": "b", "col3": "bar"},
            {"col1": "c", "col3": "baz"},
        ])

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            sqlite_file = workdir / "tmp.sqlite"
            with create_sqlite_database(db_file=sqlite_file) as conn:
                table_name_left = "_left"
                table_name_right = "_right"

                table_create(conn, table_name_left, {"col1": "TEXT", "col2": "TEXT"})
                table_create(conn, table_name_right, {"col1": "TEXT", "col3": "TEXT"})
                table_import_from_records(
                    conn, table_name_left,
                    _dataframe_records_iterator(test_data_left))
                table_import_from_records(
                    conn, table_name_right,
                    _dataframe_records_iterator(test_data_right))

                self._check_table_not_empty(conn, table_name_left)
                self._check_table_not_empty(conn, table_name_right)

                expected = test_data_left.merge(test_data_right,
                                                on=["col1"],
                                                how=how_pandas)

                # Merge and output as an iterable
                result1 = DataFrame.from_records(
                    table_join_sql(conn,
                                   table_name_left,
                                   table_name_right,
                                   on=["col1"],
                                   how=how_sqlite))
                self._compare_dataframes_equal(result1, expected)

                # Merge into a table, and output its data
                table_name_merged = "_merged"
                table_join_sql(
                    conn,
                    table_name_left,
                    table_name_right,
                    on=["col1"],
                    how=how_sqlite,
                    into_table=table_name_merged,
                )
                result2 = DataFrame.from_records(
                    table_select_all(conn, table_name_merged))
                self._compare_dataframes_equal(result2, expected)
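The statement a helper like table_join_sql presumably issues for the into_table case is a CREATE TABLE ... AS SELECT over a join; a hedged sketch (the builder function itself is hypothetical, and table/column names are assumed to be pre-sanitized):

from typing import List

def build_join_sql_sketch(left: str, right: str, on: List[str], how: str, into_table: str) -> str:
    # Join the two tables on the shared key columns and materialize the
    # result into a new table. USING(...) keeps a single copy of the join
    # columns in the output, matching the pandas merge semantics.
    keys = ", ".join(on)
    return (f"CREATE TABLE [{into_table}] AS SELECT * "
            f"FROM [{left}] {how.upper()} JOIN [{right}] USING ({keys})")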
Example #9
def merge_output_tables_sqlite(
    tables_folder: Path,
    output_path: Path,
    sqlite_file: Path = None,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        table_paths: List of CSV files to join into a single table.
        output_path: Output path for the resulting CSV file.
        sqlite_path: Path for the SQLite database to use for importing data, defaults to a temporary
            database on disk.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use two temporary tables as I/O for intermediate operations
        temp_table_input, temp_table_output = "tmp_table_name_1", "tmp_table_name_2"

        # Start with all combinations of <location key x date>
        keys_and_dates_table_path = workdir / f"{temp_table_input}.csv"
        _logger.log_info("Creating keys and dates table")
        index_table = [table for table in table_paths if table.stem == "index"][0]
        _make_location_key_and_date_table(index_table,
                                          keys_and_dates_table_path)

        # Create an SQLite database
        _logger.log_info("Importing all tables into SQLite")
        database_file = sqlite_file or workdir / "database.sqlite"
        import_tables_into_sqlite([keys_and_dates_table_path] + table_paths,
                                  database_file)

        with create_sqlite_database(database_file) as conn:

            _logger.log_info(f"Merging all tables into a flat output")
            for table in table_paths:
                _logger.log_info(f"Merging {table.stem}")

                # Read the table's header to determine how to merge it
                table_name = _safe_table_name(table.stem)
                table_columns = get_table_columns(table)
                join_on = [
                    col for col in ("key", "location_key", "date")
                    if col in table_columns
                ]

                # Join with the current intermediate table
                sql_table_join(
                    conn,
                    left=temp_table_input,
                    right=table_name,
                    on=join_on,
                    how="left outer",
                    into_table=temp_table_output,
                )

                # Flip-flop the I/O tables to avoid a copy
                temp_table_input, temp_table_output = temp_table_output, temp_table_input

            sort_values = ("location_key", "date")
            _logger.log_info("Exporting output as CSV")
            sql_export_csv(conn,
                           temp_table_input,
                           output_path=output_path,
                           sort_by=sort_values)

            # Remove the intermediate tables from the SQLite database
            sql_table_drop(conn, temp_table_input)
            sql_table_drop(conn, temp_table_output)
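A typical invocation; the paths are illustrative, and tables_folder must contain index.csv along with the CSV tables named in V2_TABLE_LIST (or in use_table_names, when given):

from pathlib import Path

# Illustrative paths for a v2-style dataset folder.
merge_output_tables_sqlite(
    tables_folder=Path("/tmp/v2"),
    output_path=Path("/tmp/v2/main.csv"),
)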