def _make_location_key_and_date_table(index_table: Path,
                                      output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # Index table will determine if we use "key" or "location_key" as column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(
                f"{value}\n"
                for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n"
                          for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
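For intuition, here is a minimal sketch of the <key x date> cross product this helper produces, written with plain csv and itertools instead of the repo's table_cross_product helper (the column header and exact output format are assumptions):

import csv
import itertools

def cross_product_sketch(keys_csv, dates_csv, output_csv):
    # Read the single-column inputs, skipping their header rows
    with open(keys_csv) as fd:
        keys = [row[0] for row in csv.reader(fd) if row][1:]
    with open(dates_csv) as fd:
        dates = [row[0] for row in csv.reader(fd) if row][1:]

    # Emit every <key, date> combination, which is what the cross product amounts to
    with open(output_csv, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow(["location_key", "date"])
        writer.writerows(itertools.product(keys, dates))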
Example #2
def read_source_output(data_pipeline: DataPipeline,
                       data_source: DataSource) -> Dict[str, Any]:
    with temporary_directory() as workdir:
        output_path = workdir / f"{data_source.uuid(data_pipeline.table)}.csv"
        try:
            download_file(GCS_BUCKET_TEST, f"intermediate/{output_path.name}",
                          output_path)
            columns = get_table_columns(output_path)
            dates = list(table_read_column(
                output_path, "date")) if "date" in columns else [None]
            return {
                "pipeline": data_pipeline.name,
                "data_source": f"{data_source.__module__}.{data_source.name}",
                "columns": ",".join(columns),
                "first_date": min(dates),
                "last_date": max(dates),
                "location_keys": ",".join(
                    sorted(set(table_read_column(output_path, "key")))),
            }
        except Exception as exc:
            print(exc, file=sys.stderr)
            return {}
def publish_v3_main_table() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the global tables into our local storage
        download_folder(
            GCS_BUCKET_PROD,
            "v3",
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x)
                                                 for token in ("/", "main.")),
        )

        file_name = "covid-19-open-data.csv"
        with ZipFile(output_folder / f"{file_name}.zip",
                     mode="w",
                     compression=ZIP_DEFLATED) as zip_archive:
            with zip_archive.open(file_name, "w") as output_file:
                merge_output_tables_sqlite(input_folder,
                                           TextIOWrapper(output_file),
                                           use_table_names=V3_TABLE_LIST)

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
Example #4
def publish_global_tables(tables_folder: Path,
                          output_folder: Path,
                          use_table_names: List[str] = None) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names to the
    latest schema, and join all the tables into a single main.csv file.

    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names
                                        or V2_TABLE_LIST)

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name,
                         OUTPUT_COLUMN_ADAPTER)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       ["location_key"])
Example #5
def main(output_folder: Path,
         tables_folder: Path,
         use_table_names: List[str] = None) -> None:
    """
    This script takes the processed outputs located in `tables_folder` and publishes them into the
    output folder by performing the following operations:

        1. Copy all the tables from `tables_folder` to `output_folder`, renaming fields if
           necessary.
        2. Create different slices of data, such as the latest known record for each region, files
           for the last day of data, files for each individual region.
        3. Produce a main table, created by iteratively performing left outer joins on all other
           tables for each slice of data (but not for the global tables).
    """
    # Wipe the output folder first
    for item in output_folder.glob("*"):
        if item.name.startswith("."):
            continue
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Create the folder which will be published using a stable schema
    output_folder = output_folder / "v3"
    output_folder.mkdir(exist_ok=True, parents=True)

    # Publish the tables containing all location keys
    publish_global_tables(tables_folder, output_folder)

    # Create a temporary folder which will host all the location breakouts
    with temporary_directory() as breakout_folder:

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(output_folder,
                                   breakout_folder,
                                   use_table_names=use_table_names)

        # Create a folder which will host all the location aggregates
        location_aggregates_folder = output_folder / "location"
        location_aggregates_folder.mkdir(exist_ok=True, parents=True)

        # Aggregate the tables for each location independently
        location_keys = table_read_column(output_folder / "index.csv",
                                          "location_key")
        publish_location_aggregates(
            breakout_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=use_table_names,
        )

    # Create the aggregated table and put it in a compressed file
    agg_file_path = output_folder / "aggregated.csv.gz"
    with gzip.open(agg_file_path, "wt") as compressed_file:
        merge_location_breakout_tables(location_aggregates_folder,
                                       compressed_file)

    # Convert all CSV files to JSON using values format
    convert_tables_to_json(output_folder, output_folder)
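A typical invocation of this entry point might look like the sketch below; the folder paths and table subset are hypothetical:

from pathlib import Path

# Publish a couple of tables into ./public/v3; paths and table subset are illustrative
main(
    output_folder=Path("public"),
    tables_folder=Path("tables"),
    use_table_names=["epidemiology", "hospitalizations"],
)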
Example #6
    def test_update_bad_pipeline_name(self):
        with temporary_directory() as workdir:
            bad_pipeline_name = "does_not_exist"
            with self.assertRaises(AssertionError):
                update_data(workdir, only=[bad_pipeline_name])
            with self.assertRaises(AssertionError):
                update_data(workdir, exclude=[bad_pipeline_name])
def update_table(table_name: str = None,
                 job_group: str = None,
                 parallel_jobs: int = 8) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)
    # Default to 1 if invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and avoid
        # downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate",
                                                 intermediate_results)
        intermediate_files = list(
            map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      workdir / "intermediate")

    return Response("OK", status=200)
Example #8
    def test_table_grouped_tail_real_data(self):
        with temporary_directory() as workdir:

            for table_path in (SRC / "test" / "data").glob("*.csv"):
                table = read_table(table_path, schema=SCHEMA)
                test_output = workdir / f"latest_{table_path.name}"
                pandas_output = workdir / f"latest_pandas_{table_path.name}"

                # Create the latest slice of the given table
                table_grouped_tail(table_path, test_output, ["key"])

                # Create a latest slice using pandas grouping
                table = table.groupby("key").aggregate(
                    agg_last_not_null).reset_index()
                export_csv(table, path=pandas_output, schema=SCHEMA)

                # Converting to a CSV in memory sometimes produces out-of-order values
                with open_file_like(test_output) as fd1, open_file_like(
                        pandas_output) as fd2:
                    test_result_lines = list(sorted(fd1))
                    pandas_result_lines = list(sorted(fd2))

                self.assertEqual(len(test_result_lines),
                                 len(pandas_result_lines))
                for line1, line2 in zip(test_result_lines,
                                        pandas_result_lines):
                    self.assertEqual(line1, line2)
Example #9
def publish_sources(prod_folder: str = "v3") -> Response:
    """Publishes a table with the source of each datapoint."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:

        # Get the data sources and write a JSON file summarizing them to disk
        metadata = create_metadata_dict()
        with open(workdir / "metadata.json", "w") as fh:
            json.dump(metadata, fh)

        # Iterate over the individual tables and build their sources file
        # TODO: create source map for all tables, not just a hand-picked subset
        for table_name in ("epidemiology", "hospitalizations", "vaccinations",
                           "by-age"):
            data_sources = metadata["sources"]
            pipeline = DataPipeline.load(table_name.replace("-", "_"))
            source_map = map_table_sources_to_index(data_sources,
                                                    pipeline,
                                                    prod_folder=prod_folder)
            output_table_sources(source_map,
                                 workdir / f"{table_name}.sources.csv")

        # Upload to root folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
Example #10
def publish_v3_latest_tables() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            "v3",
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x)
                                                 for token in forbid_tokens),
        )
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Create subsets for easy API-like access to slices of data
        list(publish_subset_latest(input_folder, output_folder))
        logger.log_info("Table subsets created")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3/latest", output_folder)

    return Response("OK", status=200)
Example #11
def publish_v3_main_table() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Get a list of valid location keys
        location_keys = list(
            table_read_column(SRC / "data" / "metadata.csv", "key"))

        # Download all the location breakout tables into our local storage
        download_folder(GCS_BUCKET_PROD, "v3", input_folder,
                        lambda x: "location/" in str(x))
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Create the aggregated table and put it in a compressed file
        agg_file_path = output_folder / "aggregated.csv.gz"
        with gzip.open(agg_file_path, "wt") as compressed_file:
            merge_location_breakout_tables(input_folder, compressed_file,
                                           location_keys)

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
Example #12
def publish_versions(prod_folder: str = "v3") -> Response:
    """Lists all the blobs in the bucket with generation."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    # Enumerate all the versions for each of the global tables
    prefix = prod_folder + "/"
    blob_index: Dict[str, List[str]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for table_name in ["aggregated", "main"] + list(get_table_names()):
        blobs = bucket.list_blobs(prefix=prefix + table_name, versions=True)
        for blob in blobs:
            fname = blob.name.replace(prefix, "")
            blob_index[fname] = blob_index.get(fname, [])
            blob_index[fname].append(blob.generation)

    # Repeat the process for the intermediate tables
    bucket = get_storage_bucket(GCS_BUCKET_TEST)
    blobs = bucket.list_blobs(prefix="intermediate/", versions=True)
    for blob in blobs:
        # Keep the "intermediate/" prefix to distinguish from the tables
        fname = blob.name
        blob_index[fname] = blob_index.get(fname, [])
        blob_index[fname].append(blob.generation)

    with temporary_directory() as workdir:
        # Write it to disk
        fname = workdir / "versions.json"
        with open(fname, "w") as fh:
            json.dump(blob_index, fh)

        # Upload to root folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
Example #13
def publish_global_tables_(prod_folder: str = "v2") -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        tables_folder = workdir / "tables"
        public_folder = workdir / "public"
        tables_folder.mkdir(parents=True, exist_ok=True)
        public_folder.mkdir(parents=True, exist_ok=True)

        # Download all the combined tables into our local storage
        download_folder(GCS_BUCKET_TEST, "tables", tables_folder)

        # Publish the tables containing all location keys
        table_names, column_adapter = None, None
        if prod_folder == "v2":
            table_names, column_adapter = V2_TABLE_LIST, {}
        if prod_folder == "v3":
            table_names, column_adapter = V3_TABLE_LIST, OUTPUT_COLUMN_ADAPTER
        assert table_names is not None and column_adapter is not None
        publish_global_tables(tables_folder,
                              public_folder,
                              use_table_names=table_names,
                              column_adapter=column_adapter)

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, public_folder)

    return Response("OK", status=200)
def publish_json_tables(prod_folder: str = "v2") -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            prod_folder,
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x)
                                                 for token in forbid_tokens),
        )
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        logger.log_info("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example #15
def publish_global_tables(
    tables_folder: Path,
    output_folder: Path,
    use_table_names: List[str],
    column_adapter: Dict[str, str],
) -> None:
    """
    Copy all the tables from `tables_folder` into `output_folder` converting the column names to the
    requested schema.
    Arguments:
        tables_folder: Input directory containing tables as CSV files.
        output_folder: Directory where the output tables will be written.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names)

    # Whether it's "key" or "location_key" depends on the schema
    location_key = "location_key" if "location_key" in column_adapter.values(
    ) else "key"

    with temporary_directory() as workdir:

        for csv_path in table_paths:
            # Copy all output files to a temporary folder, renaming columns if necessary
            _logger.log_info(f"Renaming columns for {csv_path.name}")
            table_rename(csv_path, workdir / csv_path.name, column_adapter)

        for csv_path in table_paths:
            # Sort output files by location key, since the following breakout step requires it
            _logger.log_info(f"Sorting {csv_path.name}")
            table_sort(workdir / csv_path.name, output_folder / csv_path.name,
                       [location_key])
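To make the column_adapter argument concrete, a hedged sketch with a hypothetical two-entry adapter that renames the v2 "key" column to the v3 "location_key"; the real OUTPUT_COLUMN_ADAPTER is assumed to be far more complete:

from pathlib import Path

# Hypothetical column adapter: input column names map to output names. The real
# OUTPUT_COLUMN_ADAPTER covers every published table.
column_adapter = {"key": "location_key", "date": "date"}

publish_global_tables(
    tables_folder=Path("tables"),
    output_folder=Path("public/v3"),
    use_table_names=["index", "epidemiology"],
    column_adapter=column_adapter,
)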
    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    name, suffix = file_path.name, file_path.suffix

                    # If it's an extension we should compress, upload compressed file
                    if suffix[1:] in COMPRESS_EXTENSIONS:
                        with temporary_directory() as workdir:
                            gzipped_file = workdir / name
                            gzip_file(file_path, gzipped_file)
                            blob.content_encoding = "gzip"
                            return blob.upload_from_filename(gzipped_file)

                    # Otherwise upload the file as-is
                    else:
                        return blob.upload_from_filename(file_path)

                except Exception as exc:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message,
                                       traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2**i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)
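The retry loop above sleeps 2**i seconds between attempts; the same exponential back-off pattern in isolation, with a hypothetical callable and retry limit standing in for BLOB_OP_MAX_RETRIES:

import time

MAX_RETRIES = 5  # hypothetical limit standing in for BLOB_OP_MAX_RETRIES

def call_with_backoff(func, *args):
    # Retry with exponentially growing delays: 1s, 2s, 4s, 8s, 16s
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args)
        except Exception:
            time.sleep(2 ** attempt)
    # Give up once every retry has failed
    raise IOError("operation failed after retries")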
Example #17
def publish_json_tables(prod_folder: str = "v2") -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the processed tables into our local storage
        download_folder(
            GCS_BUCKET_PROD,
            prod_folder,
            input_folder,
            lambda x: all(token not in str(x) for token in ("/", "main.")),
        )

        # Convert all files to JSON
        list(convert_tables_to_json(input_folder, output_folder))
        logger.log_info("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example #18
    def test_open_file_like_file(self):
        with temporary_directory() as workdir:
            temp_file_path = workdir / "temp.txt"

            with open_file_like(temp_file_path, "w") as fd:
                fd.write("hello world")

            self._assert_file_contents_equal(temp_file_path, "hello world")
Example #19
    def test_update_only_pipeline(self):
        with temporary_directory() as workdir:
            quick_pipeline_name = "index"  # Pick a pipeline that is quick to run
            update_data(workdir, only=[quick_pipeline_name])
            self.assertSetEqual(
                set(subfolder.name for subfolder in workdir.iterdir()),
                {"intermediate", "tables", "snapshot"},
            )
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    logger.log_info(f"Combining data sources for {table_name}")

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(
                f"{data_source.uuid(data_pipeline.table)}.csv")
        logger.log_info(
            f"Downloading intermediate tables {intermediate_file_names}")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            workdir / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(
            workdir / "intermediate")
        logger.log_info(
            f"Loaded intermediate tables {intermediate_file_names}")

        # Limit the number of processes to avoid OOM in big datasets
        process_count = 4

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results,
                                                process_count=process_count)
        logger.log_info(f"Combined intermediate tables into {table_name}")

        # Output combined data to disk
        output_path = workdir / "tables" / f"{table_name}.csv"
        export_csv(pipeline_output, output_path, schema=data_pipeline.schema)
        logger.log_info(f"Exported combined {table_name} to CSV")

        # Upload results to the test bucket because these are not prod files
        # They will be copied to prod in the publish step, so main.csv is in sync
        logger.log_info(f"Uploading combined {table_name}...")
        upload_folder(GCS_BUCKET_TEST, "tables", workdir / "tables")

    return Response("OK", status=200)
def publish_json_locations(prod_folder: str = "v2",
                           location_key_from: str = None,
                           location_key_until: str = None) -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)
    location_key_from = _get_request_param("location_key_from",
                                           location_key_from)
    location_key_until = _get_request_param("location_key_until",
                                            location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Convert the tables to JSON for each location independently
        location_keys = list(
            table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [
                key for key in location_keys if key >= location_key_from
            ]
        if location_key_until is not None:
            location_keys = [
                key for key in location_keys if key <= location_key_until
            ]
        logger.log_info(
            f"Converting {len(location_keys)} location subsets to JSON "
            f"from {location_keys[0]} until {location_keys[-1]}")

        # Download all the processed tables into our local storage
        def match_path(table_path: Path) -> bool:
            try:
                if prod_folder == "v2":
                    location_key, table_name = str(table_path).split("/", 1)
                    return table_name == "main.csv" and location_key in location_keys
                elif prod_folder == "v3":
                    location_path, location_key = table_path.parent.name, table_path.stem
                    return location_path == "location" and location_key in location_keys
            except Exception:
                return False

        download_folder(GCS_BUCKET_PROD, prod_folder, input_folder, match_path)
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        converted_count = sum(1 for _ in output_folder.glob("**/*.json"))
        logger.log_info(f"Converted {converted_count} files to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example #22
def _test_data_source(pipeline_name: str,
                      data_source_idx: int,
                      random_seed: int = 0):
    # Re-load the data pipeline and data source
    # It seems inefficient but it's necessary because we can't move these objects across
    # processes
    pipeline = DataPipeline.load(pipeline_name)
    data_source = pipeline.data_sources[data_source_idx]

    # Replace the error logging function to keep logs cleaner during tests
    data_source.log_error = _log_nothing
    data_source.log_warning = _log_nothing

    # Load the real cache files
    cache = requests.get("{}/sitemap.json".format(CACHE_URL)).json()

    data_source_name = data_source.__class__.__name__
    data_source_opts = data_source.config
    if data_source_opts.get("test", {}).get("skip"):
        return

    # Make a copy of all auxiliary files
    aux = {
        name: table.copy()
        for name, table in pipeline.auxiliary_tables.items()
    }

    # If we have a hint for the expected keys, use only those from metadata
    metadata_query = data_source_opts.get("test", {}).get("metadata_query")
    if metadata_query:
        aux["metadata"] = aux["metadata"].query(metadata_query)

    # Get a small sample of metadata, since we are testing for whether a source produces
    # _any_ output, not if the output is exhaustive
    sample_size = min(len(aux["metadata"]), METADATA_SAMPLE_SIZE)
    aux["metadata"] = aux["metadata"].sample(sample_size,
                                             random_state=random_seed)

    # Build the failure message to log the config of this data source
    failure_message = (
        f"{data_source_name} from {pipeline_name} pipeline failed with options {data_source_opts} "
        f"and using metadata keys {aux['metadata']['key'].values.tolist()}")

    # Use a different temporary working directory for each data source
    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        try:
            output_data = data_source.run(workdir, cache, aux)
        except Exception as exc:
            traceback.print_exc()
            raise RuntimeError(failure_message) from exc

        # Run our battery of tests against the output data to ensure it looks correct

        # Data source has at least one row in output
        assert len(output_data) >= 1, failure_message
Example #23
def load_combined_table(pipeline: DataPipeline, prod_folder: str) -> DataFrame:
    table_name = pipeline.table
    with temporary_directory() as workdir:
        output_path = workdir / f"{table_name}.csv"
        download_file(GCS_BUCKET_PROD, f"{prod_folder}/{table_name}.csv",
                      output_path)
        combined_table = read_table(output_path)
        index_columns = (["date"] if "date" in combined_table.columns else
                         []) + ["location_key"]
        return combined_table.set_index(index_columns)
    def test_make_main_table_v3(self):
        with temporary_directory() as workdir:

            # Copy all test tables into the temporary directory
            publish_global_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            merge_output_tables(workdir, main_table_path, use_table_names=V3_TABLE_LIST)

            self._test_make_main_table_helper(main_table_path, OUTPUT_COLUMN_ADAPTER)
    def test_make_main_table(self):
        with temporary_directory() as workdir:

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            merge_output_tables(workdir, main_table_path)

            self._test_make_main_table_helper(main_table_path, {})
def publish_v3_location_subsets(
    location_key_from: str = None, location_key_until: str = None
) -> Response:
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        intermediate_folder = workdir / "temp"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]
        logger.log_info(
            f"Publishing {len(location_keys)} location subsets "
            f"from {location_keys[0]} until {location_keys[-1]}"
        )

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            "v3",
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x) for token in forbid_tokens),
        )
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(input_folder, intermediate_folder, use_table_names=V3_TABLE_LIST)
        logger.log_info("Created all table location breakouts")

        # Create a folder which will host all the location aggregates
        location_aggregates_folder = output_folder / "location"
        location_aggregates_folder.mkdir(parents=True, exist_ok=True)

        # Aggregate the tables for each location independently
        publish_location_aggregates(
            intermediate_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=V3_TABLE_LIST,
        )
        logger.log_info("Aggregated all table breakouts by location")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
Example #27
def publish_subset_latest(tables_folder: Path,
                          output_folder: Path,
                          key: str = "location_key",
                          **tqdm_kwargs) -> Iterator[Path]:
    """
    This method outputs the latest record by date per location key for each of the input tables.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [
        table for table in tables_folder.glob("*.csv")
        if table.stem != agg_table_name
    ]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter),
                    desc="Creating latest subsets",
                    **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv")
                     if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
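Because the function yields its outputs, callers drain the generator to force the work (as publish_v3_latest_tables does above); a minimal usage sketch with illustrative paths:

from pathlib import Path

# Paths are illustrative; draining the generator forces the per-table work and
# yields the final aggregated.csv last.
for subset_path in publish_subset_latest(Path("v3"), Path("v3/latest")):
    print(f"Wrote {subset_path}")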
Example #28
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output directory for the resulting main.csv file.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Tables which should be included in the combined output; defaults to
            V2_TABLE_LIST when not given.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)
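A hedged usage sketch of the merge; the folder layout is illustrative and, as the docstring notes, index.csv must already be present under tables_folder:

from pathlib import Path

# Build a flat main.csv from the per-table outputs; paths are illustrative.
merge_output_tables(
    tables_folder=Path("tables"),
    output_path=Path("main.csv"),
    drop_empty_columns=True,
    use_table_names=V3_TABLE_LIST,
)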
Example #29
    def test_fetch_skip_existing(self):
        src = DummyDataSouce()
        original_fetch_func = src.fetch

        def monkey_patch_fetch(output_folder: Path, cache: Dict[str, str],
                               fetch_opts: List[Dict[str, Any]]):
            self.assertEqual(
                True, fetch_opts[0].get("opts", {}).get("skip_existing"))
            return original_fetch_func(output_folder, cache, fetch_opts)

        src.fetch = monkey_patch_fetch
        with temporary_directory() as workdir:
            src.run(workdir, {}, DUMMY_DATA_SOURCE_AUX, skip_existing=True)
Example #30
    def _test_table_merge(self, how_mem: str, how_pandas: str):
        test_data_1 = DataFrame.from_records(
            [
                {"col1": "a", "col2": "1"},
                {"col1": "a", "col2": "2"},
                {"col1": "b", "col2": "3"},
                {"col1": "b", "col2": "4"},
                {"col1": "c", "col2": "5"},
                {"col1": "c", "col2": "6"},
            ]
        )

        test_data_2 = DataFrame.from_records(
            [
                {"col1": "a", "col3": "foo"},
                {"col1": "b", "col3": "bar"},
                {"col1": "c", "col3": "baz"},
            ]
        )

        test_data_3 = DataFrame.from_records(
            [
                {"col1": "a", "col4": "apple"},
                {"col1": "b", "col4": "banana"},
                {"col1": "c", "col4": "orange"},
            ]
        )

        with temporary_directory() as workdir:

            test_file_1 = workdir / "test.1.csv"
            test_file_2 = workdir / "test.2.csv"
            test_file_3 = workdir / "test.3.csv"

            export_csv(test_data_1, test_file_1)
            export_csv(test_data_2, test_file_2)
            export_csv(test_data_3, test_file_3)

            output_file_1 = workdir / "output.1.csv"
            output_file_2 = workdir / "output.2.csv"

            expected = table_merge_pandas(
                [test_data_1, test_data_2, test_data_3], on=["col1"], how=how_pandas
            )
            export_csv(expected, path=output_file_1)

            table_merge_mem(
                [test_file_1, test_file_2, test_file_3], output_file_2, on=["col1"], how=how_mem
            )

            _compare_tables_equal(self, output_file_1, output_file_2)