Example #1
    def _test_join_all(self, how_mem: str, how_pandas: str):

        # Create a custom function used to read tables casting to the expected schema
        read_table_ = partial(read_table, schema=SCHEMA, low_memory=False)

        # Test joining the index table with every other table
        left = SRC / "test" / "data" / "index.csv"
        for right in pbar([*(SRC / "test" / "data").glob("*.csv")],
                          leave=False):
            if left.name == right.name:
                continue

            left_columns = get_table_columns(left)
            right_columns = get_table_columns(right)

            if not "date" in right_columns:
                self._test_join_pair(read_table_, SCHEMA, left, right, ["key"],
                                     how_mem, how_pandas)

            if "date" in left_columns and not "date" in right_columns:
                self._test_join_pair(read_table_, SCHEMA, left, right, ["key"],
                                     how_mem, how_pandas)

            if "date" in left_columns and "date" in right_columns:
                self._test_join_pair(read_table_, SCHEMA, left, right,
                                     ["key", "date"], how_mem, how_pandas)
Example #2
    def _test_make_main_table_helper(self, main_table_path: Path,
                                     column_adapter: Dict[str, str]):
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            for column_name in pipeline.schema.keys():
                column_name = column_adapter.get(column_name)
                if column_name is not None:
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = list(read_lines(main_table_path))[1:]
        self.assertListEqual(main_table_records,
                             list(sorted(main_table_records)))

        # Make sure that all columns present in the index table are in the main table
        main_table_columns = set(get_table_columns(main_table_path))
        index_table_columns = set(
            get_table_columns(SRC / "test" / "data" / "index.csv"))
        for column in index_table_columns:
            column = column_adapter.get(column, column)
            self.assertTrue(column in main_table_columns,
                            f"{column} not in main")

        # Make the main table easier to deal with since we optimize for memory usage
        location_key = "location_key" if "location_key" in main_table.columns else "key"
        main_table.set_index(location_key, inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        column_prefixes = ("new", "total", "cumulative")
        column_filter = lambda col: col.split("_")[0] in column_prefixes and "age" not in col
        columns = list(filter(column_filter, main_table.columns))
        self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
        main_table = main_table[["date"] + columns]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", "2020-09-01",
                                "2020-12-31")

        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01",
                                "2020-12-31")
def _make_location_key_and_date_table(index_table: Path,
                                      output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # Index table will determine if we use "key" or "location_key" as column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(
                f"{value}\n"
                for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n"
                          for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
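
`_make_location_key_and_date_table` delegates the expansion to `table_cross_product`, which by its usage appears to write every combination of rows from the two single-column inputs. Below is a hedged, stdlib-only sketch of that operation; the library's own version may stream rows instead of loading both files into memory.

import csv
import itertools
from pathlib import Path

def cross_product_sketch(left_csv: Path, right_csv: Path, output_csv: Path) -> None:
    """Write the combined header once, then every pairing of data rows from both inputs."""
    with open(left_csv, newline="") as fl, open(right_csv, newline="") as fr:
        left_rows, right_rows = list(csv.reader(fl)), list(csv.reader(fr))
    with open(output_csv, "w", newline="") as fo:
        writer = csv.writer(fo)
        writer.writerow(left_rows[0] + right_rows[0])
        for left, right in itertools.product(left_rows[1:], right_rows[1:]):
            writer.writerow(left + right)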
    def _compare_tables_equal(self, table1: Path, table2: Path) -> None:
        cols1 = get_table_columns(table1)
        cols2 = get_table_columns(table2)
        self.assertEqual(set(cols1), set(cols2))

        # Converting to a CSV in memory sometimes produces out-of-order values
        records1 = list(read_lines(table1))
        records2 = list(read_lines(table2))
        self.assertEqual(len(records1), len(records2))

        reader1 = csv.reader(records1)
        reader2 = csv.reader(records2)
        for record1, record2 in zip(reader1, reader2):
            record1 = {col: val for col, val in zip(cols1, record1)}
            record2 = {col: val for col, val in zip(cols2, record2)}
            self.assertEqual(record1, record2)
Example #5
    def test_table_file_reimport(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            sqlite_file = workdir / "tmp.sqlite"
            tables_folder = SRC / "test" / "data"

            # Verify that all tables were imported
            with create_sqlite_database(db_file=sqlite_file) as conn:
                for table_path in tables_folder.glob("*.csv"):
                    table_name = _safe_table_name(table_path.stem)
                    table_import_from_file(conn,
                                           table_path,
                                           table_name=table_name)
                    self._check_table_not_empty(conn, table_name)

                    # Dirty hack used to compare appropriate column names. Ideally this would be
                    # handled by the SQL module, which should convert the table and column names to
                    # whatever they were prior to sanitizing them.
                    temp_file_path_1 = workdir / f"{table_name}.1.csv"
                    column_adapter = {
                        col: _safe_column_name(col).replace("[", "").replace("]", "")
                        for col in get_table_columns(table_path)
                    }
                    table_rename(table_path, temp_file_path_1, column_adapter)

                    temp_file_path_2 = workdir / f"{table_name}.2.csv"
                    table_export_csv(conn, table_name, temp_file_path_2)
                    _compare_tables_equal(self, temp_file_path_1,
                                          temp_file_path_2)
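
The "dirty hack" above exists because table and column names are sanitized on import via `_safe_table_name` and `_safe_column_name`. Their exact rules are not shown in this listing; the sketch below only illustrates the general kind of sanitization being assumed (lower-casing and replacing characters that are awkward in SQL identifiers) and is a hypothetical stand-in, not the repository's helper.

import re

def safe_identifier_sketch(name: str) -> str:
    """Hypothetical sanitizer: lower-case and replace non-alphanumeric characters with '_'."""
    return re.sub(r"[^a-z0-9_]", "_", name.lower())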
Example #6
def read_source_output(data_pipeline: DataPipeline,
                       data_source: DataSource) -> DataFrame:
    with temporary_directory() as workdir:
        output_path = workdir / f"{data_source.uuid(data_pipeline.table)}.csv"
        try:
            download_file(GCS_BUCKET_TEST, f"intermediate/{output_path.name}",
                          output_path)
            columns = get_table_columns(output_path)
            if "date" in columns:
                dates = list(table_read_column(output_path, "date"))
            else:
                dates = [None]
            return {
                "pipeline": data_pipeline.name,
                "data_source": f"{data_source.__module__}.{data_source.name}",
                "columns": ",".join(columns),
                "first_date": min(dates),
                "last_date": max(dates),
                "location_keys": ",".join(sorted(set(table_read_column(output_path, "key")))),
            }
        except Exception as exc:
            print(exc, file=sys.stderr)
            return []
Example #7
def _compare_tables_equal(test_case: ProfiledTestCase, table1: Path, table2: Path) -> None:
    cols1 = get_table_columns(table1)
    cols2 = get_table_columns(table2)
    test_case.assertEqual(set(cols1), set(cols2))

    # Converting to a CSV in memory sometimes produces out-of-order values
    with open_file_like(table1) as fd1, open_file_like(table2) as fd2:
        records1 = list(line_reader(fd1, skip_empty=True))
        records2 = list(line_reader(fd2, skip_empty=True))
        test_case.assertEqual(len(records1), len(records2))

        reader1 = csv.reader(records1)
        reader2 = csv.reader(records2)
        for record1, record2 in zip(reader1, reader2):
            record1 = {col: val for col, val in zip(cols1, record1)}
            record2 = {col: val for col, val in zip(cols2, record2)}
            test_case.assertEqual(record1, record2)
Example #8
def _subset_latest(output_folder: Path, csv_file: Path) -> Path:
    output_file = output_folder / csv_file.name
    # Degenerate case: table has no "date" column
    columns = get_table_columns(csv_file)
    if "date" not in columns:
        shutil.copyfile(csv_file, output_file)
    else:
        table_grouped_tail(csv_file, output_file, ["key"])
    return output_file
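
`table_grouped_tail` is used here to keep only the most recent row per key. A minimal sketch of that idea with the standard library follows, assuming the input rows are already ordered by date within each key (an assumption for illustration, not a documented guarantee of the library call, which also accepts a list of group columns rather than a single name).

import csv
from pathlib import Path
from typing import Dict, List

def grouped_tail_sketch(input_csv: Path, output_csv: Path, group_by: str) -> None:
    """Keep the last row seen for each value of `group_by`, preserving the header."""
    latest: Dict[str, List[str]] = {}
    with open(input_csv, newline="") as fd:
        reader = csv.reader(fd)
        header = next(reader)
        group_idx = header.index(group_by)
        for row in reader:
            latest[row[group_idx]] = row
    with open(output_csv, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        writer.writerows(latest.values())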
def _latest_date_by_group(tables_folder: Path, group_by: str = "location_key") -> Dict[str, str]:
    groups: Dict[str, str] = {}
    for table_file in tables_folder.glob("*.csv"):
        table_columns = get_table_columns(table_file)
        if "date" in table_columns:
            iter1 = table_read_column(table_file, "date")
            iter2 = table_read_column(table_file, group_by)
            for date, key in zip(iter1, iter2):
                groups[key] = max(groups.get(key, date), date)
    return groups
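
Note that `_latest_date_by_group` never parses dates: the values are ISO-8601 strings, which compare chronologically as plain strings, so `max()` per key is enough. A tiny self-contained check of that property:

# ISO-8601 date strings sort chronologically as plain strings, so no date parsing is needed.
assert max("2020-09-01", "2020-12-31", "2020-10-15") == "2020-12-31"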
def publish_subset_latest(tables_folder: Path,
                          output_folder: Path,
                          key: str = "location_key",
                          **tqdm_kwargs) -> List[Path]:
    """
    This method outputs the latest record by date per location key for each of the input tables.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [
        table for table in tables_folder.glob("*.csv")
        if table.stem != agg_table_name
    ]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter),
                    desc="Creating latest subsets",
                    **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv")
                     if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
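
Although it is annotated as returning `List[Path]`, `publish_subset_latest` is a generator, so nothing happens until it is consumed. A hypothetical usage sketch follows; the folder names are assumptions, not the repository's actual layout.

from pathlib import Path

tables_dir = Path("v3")            # assumed input folder with per-table CSV files
latest_dir = Path("v3/latest")     # assumed output folder for the "latest" subset
latest_dir.mkdir(parents=True, exist_ok=True)

# Drain the generator so every subset, plus the final aggregated.csv, gets written.
for published_path in publish_subset_latest(tables_dir, latest_dir, key="location_key"):
    print(f"published {published_path.name}")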
Example #11
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
        drop_empty_columns: Flag determining whether columns with only null values should be
            removed from the output.
        use_table_names: Tables to include in the combined output; defaults to V2_TABLE_LIST.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)
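
The flip-flop between `temp_input` and `temp_output` lets each left join read one file and write the other, so only two intermediate files ever exist no matter how many tables are merged. A hypothetical invocation is shown below; the paths and table names are assumptions for illustration.

from pathlib import Path

merge_output_tables(
    tables_folder=Path("v2"),                   # must contain index.csv
    output_path=Path("v2/main.csv"),
    drop_empty_columns=True,
    use_table_names=["index", "epidemiology"],  # assumed subset of V2_TABLE_LIST
)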
Example #12
def import_tables_into_sqlite(table_paths: List[Path],
                              output_path: Path) -> None:
    """
    Import each of the provided CSV tables into a SQLite database at the given output path.

    Arguments:
        table_paths: List of CSV files to join into a single table.
        output_path: Output path for the resulting SQLite file.
    """
    # Import all tables into a database on disk at the provided path
    with create_sqlite_database(output_path) as conn:

        # Get a list of all tables indexed by <location_key> or by <location_key, date>
        schema = get_schema()
        for table_file_path in table_paths:
            table_name = table_file_path.stem
            _logger.log_info(f"Importing {table_name} into SQLite")
            table_columns = get_table_columns(table_file_path)
            table_schema = {col: schema.get(col, str) for col in table_columns}
            table_import_from_file(conn,
                                   table_file_path,
                                   table_name=table_name,
                                   schema=table_schema)
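
Once the CSV files have been imported, the resulting file is a regular SQLite database, so it can be queried with the standard library. A hedged usage sketch follows; the folder, database path, and table name are assumptions based on the files seen elsewhere in these examples.

import sqlite3
from pathlib import Path

csv_files = sorted(Path("v3").glob("*.csv"))   # assumed folder of per-table CSV files
import_tables_into_sqlite(csv_files, Path("covid19.sqlite"))

with sqlite3.connect("covid19.sqlite") as conn:
    # "epidemiology" is an assumed table name, mirroring the CSV files above.
    row_count = conn.execute("SELECT COUNT(*) FROM epidemiology").fetchone()[0]
    print(f"epidemiology rows: {row_count}")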
Example #13
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting CSV file.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = [
            date.date().isoformat()
            for date in date_range("2020-01-01", max_date)
        ]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"],
                   main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on,
                           temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")
Example #14
def merge_output_tables_sqlite(
    tables_folder: Path,
    output_path: Path,
    sqlite_file: Path = None,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting CSV file.
        sqlite_file: Path for the SQLite database to use for importing data; defaults to a temporary
            database on disk.
        drop_empty_columns: Flag determining whether columns with only null values should be
            removed from the output.
        use_table_names: Tables to include in the combined output; defaults to V2_TABLE_LIST.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names
                                        or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use two temporary tables as I/O for intermediate operations
        temp_table_input, temp_table_output = "tmp_table_name_1", "tmp_table_name_2"

        # Start with all combinations of <location key x date>
        keys_and_dates_table_path = workdir / f"{temp_table_input}.csv"
        _logger.log_info("Creating keys and dates table")
        index_table = [table for table in table_paths if table.stem == "index"][0]
        _make_location_key_and_date_table(index_table,
                                          keys_and_dates_table_path)

        # Create an SQLite database
        _logger.log_info("Importing all tables into SQLite")
        database_file = sqlite_file or workdir / "database.sqlite"
        import_tables_into_sqlite([keys_and_dates_table_path] + table_paths,
                                  database_file)

        with create_sqlite_database(database_file) as conn:

            _logger.log_info(f"Merging all tables into a flat output")
            for table in table_paths:
                _logger.log_info(f"Merging {table.stem}")

                # Read the table's header to determine how to merge it
                table_name = _safe_table_name(table.stem)
                table_columns = get_table_columns(table)
                join_on = [
                    col for col in ("key", "location_key", "date")
                    if col in table_columns
                ]

                # Join with the current intermediate table
                sql_table_join(
                    conn,
                    left=temp_table_input,
                    right=table_name,
                    on=join_on,
                    how="left outer",
                    into_table=temp_table_output,
                )

                # Flip-flop the I/O tables to avoid a copy
                temp_table_input, temp_table_output = temp_table_output, temp_table_input

        sort_values = ("location_key", "date")
        _logger.log_info(f"Exporting output as CSV")
        sql_export_csv(conn,
                       temp_table_input,
                       output_path=output_path,
                       sort_by=sort_values)

        # Remove the intermediate tables from the SQLite database
        sql_table_drop(conn, temp_table_input)
        sql_table_drop(conn, temp_table_output)
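
Passing an explicit `sqlite_file` keeps the imported database around after the merge instead of discarding it with the temporary directory, which can be useful when the same tables will be queried again. A hypothetical invocation follows; the paths are assumptions for illustration.

from pathlib import Path

merge_output_tables_sqlite(
    tables_folder=Path("v3"),                  # must contain index.csv
    output_path=Path("v3/aggregated.csv"),
    sqlite_file=Path("v3/covid19.sqlite"),     # reusable on-disk database
    drop_empty_columns=True,
)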