    def test_table_rename(self):
        test_csv = """col1,col2,col3
        a,1,foo
        b,2,bar
        c,3,foo
        d,4,bar
        """

        expected = """cola,colb
        a,1
        b,2
        c,3
        d,4
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            input_file = workdir / "in.csv"
            with open(input_file, "w") as fd:
                for line in test_csv.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")
            output_file = workdir / "out.csv"
            table_rename(input_file, output_file, {"col1": "cola", "col2": "colb", "col3": None})

            for line1, line2 in zip(expected.split("\n"), read_lines(output_file)):
                self.assertEqual(line1.strip(), line2.strip())
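The adapter argument of `table_rename` maps existing column names to new ones, and a value of `None` marks a column for removal; the test above exercises exactly that convention (col3 is dropped). As a rough sketch of the behavior under test, assuming nothing about the project's actual implementation, the rename could be written with the standard csv module:

import csv
from pathlib import Path
from typing import Dict, Optional

def rename_csv_columns(input_file: Path, output_file: Path, adapter: Dict[str, Optional[str]]) -> None:
    # Sketch only: keep columns mapped to a new name, drop columns mapped to None
    with open(input_file, newline="") as fd_in, open(output_file, "w", newline="") as fd_out:
        reader = csv.DictReader(fd_in)
        kept = [col for col in reader.fieldnames if adapter.get(col, col) is not None]
        writer = csv.DictWriter(fd_out, fieldnames=[adapter.get(col, col) for col in kept])
        writer.writeheader()
        for row in reader:
            writer.writerow({adapter.get(col, col): row[col] for col in kept})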
Example #2
def publish_global_tables(
    tables_folder: Path,
    output_folder: Path,
    use_table_names: List[str],
    column_adapter: Dict[str, str],
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names to the
    requested schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Names of the tables to copy over.
        column_adapter: Map from the input column names to their names in the requested schema.
    """
    # Only process the tables that were requested by name
    table_paths = _get_tables_in_folder(tables_folder, use_table_names)

    # Whether it's "key" or "location_key" depends on the schema
    location_key = "location_key" if "location_key" in column_adapter.values() else "key"

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, column_adapter)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       [location_key])
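A call to this variant needs both the table subset and the column adapter for the target schema; the folder names, table names and adapter entries below are made up purely for illustration:

# Hypothetical invocation; paths, table names and adapter entries are illustrative only.
publish_global_tables(
    tables_folder=Path("intermediate"),
    output_folder=Path("output/v3"),
    use_table_names=["epidemiology", "hospitalizations"],
    column_adapter={"key": "location_key", "date": "date"},
)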
Example #3
    def test_table_file_reimport(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            sqlite_file = workdir / "tmp.sqlite"
            tables_folder = SRC / "test" / "data"

            # Verify that all tables were imported
            with create_sqlite_database(db_file=sqlite_file) as conn:
                for table_path in tables_folder.glob("*.csv"):
                    table_name = _safe_table_name(table_path.stem)
                    table_import_from_file(conn,
                                           table_path,
                                           table_name=table_name)
                    self._check_table_not_empty(conn, table_name)

                    # Dirty hack used to compare appropriate column names. Ideally this would be
                    # handled by the SQL module, which should convert the table and column names to
                    # whatever they were prior to sanitizing them.
                    temp_file_path_1 = workdir / f"{table_name}.1.csv"
                    column_adapter = {
                        col: _safe_column_name(col).replace("[", "").replace("]", "")
                        for col in get_table_columns(table_path)
                    }
                    table_rename(table_path, temp_file_path_1, column_adapter)

                    temp_file_path_2 = workdir / f"{table_name}.2.csv"
                    table_export_csv(conn, table_name, temp_file_path_2)
                    _compare_tables_equal(self, temp_file_path_1,
                                          temp_file_path_2)
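The test relies on a `_compare_tables_equal` helper that is not shown in this snippet. A minimal stand-in, assuming it simply asserts that both CSV files contain the same header and rows, could look like this:

import csv

def _compare_tables_equal(test_case, table1, table2) -> None:
    # Sketch only: assert that both CSV files hold identical headers and rows
    with open(table1, newline="") as fd1, open(table2, newline="") as fd2:
        rows1 = list(csv.reader(fd1))
        rows2 = list(csv.reader(fd2))
    test_case.assertEqual(rows1, rows2)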
Example #4
def publish_global_tables(tables_folder: Path,
                          output_folder: Path,
                          use_table_names: List[str] = None) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names to the
    latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Names of the tables to copy over; defaults to `V2_TABLE_LIST` when not given.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names
                                        or V2_TABLE_LIST)

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name,
                         OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       ["location_key"])
def publish_subset_latest(tables_folder: Path,
                          output_folder: Path,
                          key: str = "location_key",
                          **tqdm_kwargs) -> Iterable[Path]:
    """
    Output the latest record by date per location key for each of the input tables, yielding the
    path of every file written (including the final aggregated table).

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
        tqdm_kwargs: Extra keyword arguments passed to the progress bar.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [
        table for table in tables_folder.glob("*.csv")
        if table.stem != agg_table_name
    ]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter),
                    desc="Creating latest subsets",
                    **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv")
                     if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
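Since the function is a generator, callers drain it to drive the work; the directories below are illustrative only:

# Hypothetical usage; directory names are illustrative only.
for output_path in publish_subset_latest(Path("output/v3"), Path("output/v3/latest")):
    print(f"wrote {output_path}")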
Example #6
    def test_table_rename(self):
        test_csv = _make_test_csv_file(
            """
            col1,col2,col3
            a,1,foo
            b,2,bar
            c,3,foo
            d,4,bar
            """
        )
        expected = _make_test_csv_file(
            """
            cola,colb
            a,1
            b,2
            c,3
            d,4
            """
        )

        with temporary_file() as output_file:
            table_rename(test_csv, output_file, {"col1": "cola", "col2": "colb", "col3": None})
            _compare_tables_equal(self, output_file, expected)
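This version of the test depends on `_make_test_csv_file` and `temporary_file` helpers that are not reproduced here. A rough stand-in for the first, assuming it only writes the dedented text to a temporary CSV file and returns its path, might be:

import tempfile
from pathlib import Path
from textwrap import dedent

def _make_test_csv_file(contents: str) -> Path:
    # Sketch only: write the dedented CSV text to a temporary file and return its path
    fd, name = tempfile.mkstemp(suffix=".csv")
    with open(fd, "w") as handle:
        handle.write(dedent(contents).strip() + "\n")
    return Path(name)
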
def publish_global_tables(tables_folder: Path, output_folder: Path) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names to the
    latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    table_paths = list(tables_folder.glob("*.csv"))

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name,
                         OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       ["location_key"])