Example #1
def publish_global_tables(
    tables_folder: Path,
    output_folder: Path,
    use_table_names: List[str],
    column_adapter: Dict[str, str],
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder`, converting the column names to
    the requested schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
        use_table_names: Tables which should be included in the output.
        column_adapter: Mapping of input column names to output column names.
    """
    # Get the paths for all the requested tables in the input folder
    table_paths = _get_tables_in_folder(tables_folder, use_table_names)

    # Whether it's "key" or "location_key" depends on the schema
    location_key = "location_key" if "location_key" in column_adapter.values(
    ) else "key"

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, column_adapter)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       [location_key])
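
A minimal usage sketch for the signature above; the folder paths, table names, and adapter entries are hypothetical:

from pathlib import Path

# Hypothetical adapter: rename "key" to "location_key" for the target schema
column_adapter = {"key": "location_key", "date": "date"}

publish_global_tables(
    tables_folder=Path("output/tables"),
    output_folder=Path("output/public"),
    use_table_names=["index", "epidemiology"],
    column_adapter=column_adapter,
)
# Because "location_key" appears in column_adapter.values(), the published
# tables are sorted by "location_key"; otherwise they would be sorted by "key".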
Example #2
def publish_global_tables(tables_folder: Path,
                          output_folder: Path,
                          use_table_names: Optional[List[str]] = None) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names to the
    latest schema, and join all the tables into a single main.csv file.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names
                                        or V2_TABLE_LIST)

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name,
                         OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       ["location_key"])
Example #3
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: Optional[List[str]] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Tables which should be included in the combined output.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)
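
The two scratch files swapped after every step ("flip-flop") avoid copying the intermediate table back after each join. The pattern generalizes to any chain of file-to-file transforms; a minimal standalone sketch, assuming each step reads one path and writes another:

from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Iterable

def run_pipeline(
    input_path: Path,
    output_path: Path,
    steps: Iterable[Callable[[Path, Path], None]],
) -> None:
    """Apply a chain of file-to-file transforms using two scratch files."""
    with TemporaryDirectory() as tmp:
        temp_input = Path(tmp) / "tmp.1.csv"
        temp_output = Path(tmp) / "tmp.2.csv"
        temp_input.write_bytes(input_path.read_bytes())
        for step in steps:
            step(temp_input, temp_output)
            # Swap the scratch files instead of copying the result back
            temp_input, temp_output = temp_output, temp_input
        output_path.write_bytes(temp_input.read_bytes())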
Example #4
    def test_table_sort(self):
        test_csv = """col1,col2,col3
        a,1,foo
        d,4,bar
        c,3,foo
        b,2,bar
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            input_file = workdir / "in.csv"
            with open(input_file, "w") as fd:
                for line in test_csv.split("\n"):
                    if line.strip():
                        fd.write(f"{line.strip()}\n")

            # Sort using the default (first) column
            output_file_1 = workdir / "out.csv"
            table_sort(input_file, output_file_1)

            output_file_2 = workdir / "pandas.csv"
            read_table(input_file).sort_values(["col1"]).to_csv(output_file_2, index=False)

            for line1, line2 in zip(read_lines(output_file_1),
                                    read_lines(output_file_2)):
                self.assertEqual(line1.strip(), line2.strip())

            # Sort by each column in order
            for sort_column in ("col1", "col2", "col3"):

                output_file_1 = workdir / "out.csv"
                table_sort(input_file, output_file_1, [sort_column])

                output_file_2 = workdir / "pandas.csv"
                read_table(input_file).sort_values([sort_column]).to_csv(
                    output_file_2, index=False)

                for line1, line2 in zip(read_lines(output_file_1),
                                        read_lines(output_file_2)):
                    self.assertEqual(line1.strip(), line2.strip())
Example #5
    def test_table_sort(self):
        test_csv = _make_test_csv_file(
            """
            col1,col2,col3
            a,1,foo
            d,4,bar
            c,3,foo
            b,2,bar
            """
        )

        with temporary_directory() as workdir:

            # Sort using the default (first) column
            output_file_1 = workdir / "out.csv"
            table_sort(test_csv, output_file_1)

            test_csv.seek(0)
            output_file_2 = workdir / "pandas.csv"
            read_table(test_csv, file_type="csv").sort_values(["col1"]).to_csv(
                output_file_2, index=False
            )

            _compare_tables_equal(self, output_file_1, output_file_2)

            # Sort by each column in order
            for sort_column in ("col1", "col2", "col3"):

                output_file_1 = workdir / f"1.{sort_column}.csv"
                table_sort(test_csv, output_file_1, [sort_column])

                test_csv.seek(0)
                output_file_2 = workdir / f"2.{sort_column}.csv"
                read_table(test_csv, file_type="csv").sort_values([sort_column]).to_csv(
                    output_file_2, index=False
                )

                _compare_tables_equal(self, output_file_1, output_file_2)
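
The helpers used by Example #5 are not shown; plausible implementations, assuming `_make_test_csv_file` returns a seekable file-like object and `_compare_tables_equal` asserts line-by-line equality:

from io import StringIO
from textwrap import dedent

def _make_test_csv_file(csv_data: str) -> StringIO:
    # Remove the indentation and surrounding blank lines from the literal
    return StringIO(dedent(csv_data).strip() + "\n")

def _compare_tables_equal(test_case, path1, path2) -> None:
    # Compare the two files line by line, ignoring trailing whitespace
    with open(path1) as fd1, open(path2) as fd2:
        lines1 = [line.rstrip() for line in fd1]
        lines2 = [line.rstrip() for line in fd2]
    test_case.assertEqual(lines1, lines2)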
Example #6
def publish_global_tables(tables_folder: Path, output_folder: Path) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names to the
    latest schema.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    table_paths = list(tables_folder.glob("*.csv"))

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name,
                         OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       ["location_key"])
Example #7
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = [
            date.date().isoformat()
            for date in date_range("2020-01-01", max_date)
        ]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"],
                   main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on,
                           temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")