Beispiel #1
0
    def _test_join_pair(
        self,
        read_table_: Callable,
        schema: Dict[str, str],
        left: Path,
        right: Path,
        on: List[str],
        how: str,
    ):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            tmpfile = workdir / "tmpfile.csv"

            table_join(left, right, on, tmpfile, how=how)
            test_result = export_csv(read_table_(tmpfile), schema=schema)
            pandas_how = how.replace("outer", "left")
            pandas_result = export_csv(read_table_(left).merge(
                read_table_(right), on=on, how=pandas_how),
                                       schema=schema)

            # Converting to a CSV in memory sometimes produces out-of-order values
            test_result_lines = sorted(test_result.split("\n"))
            pandas_result_lines = sorted(pandas_result.split("\n"))

            for line1, line2 in zip(test_result_lines, pandas_result_lines):
                self.assertEqual(line1, line2)
Beispiel #2
0
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output directory for the resulting main.csv file.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        exclude_table_names: Tables which should be removed from the combined output.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)
Beispiel #3
0
    def _test_join_pair(
        self,
        read_table_: Callable,
        schema: Dict[str, str],
        left: Path,
        right: Path,
        on: List[str],
        how_mem: str,
        how_pandas: str,
    ):
        with temporary_directory() as workdir:
            output_file_1 = workdir / "output.1.csv"
            output_file_2 = workdir / "output.2.csv"

            # Join using our memory efficient method
            table_join(left, right, on, output_file_1, how=how_mem)

            # Join using the pandas method
            pandas_result = read_table_(left).merge(read_table_(right), on=on, how=how_pandas)
            export_csv(pandas_result, output_file_2, schema=schema)

            _compare_tables_equal(self, output_file_1, output_file_2)
Beispiel #4
0
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
    Returns:
        DataFrame: Flat table with all data combined.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = [
            date.date().isoformat()
            for date in date_range("2020-01-01", max_date)
        ]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"],
                   main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on,
                           temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted ans write to output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")