Example #1
def update_table(table_name: str = None,
                 job_group: str = None,
                 parallel_jobs: int = 8) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)
    # Default to 1 if invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and avoid
        # downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate",
                                                 intermediate_results)
        intermediate_files = list(
            map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      workdir / "intermediate")

    return Response("OK", status=200)
Example #2
def publish_sources(prod_folder: str = "v3") -> Response:
    """Publishes a table with the source of each datapoint."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:

        # Get the data sources and write a JSON file summarizing them to disk
        metadata = create_metadata_dict()
        with open(workdir / "metadata.json", "w") as fh:
            json.dump(metadata, fh)

        # Iterate over the individual tables and build their sources file
        # TODO: create source map for all tables, not just a hand-picked subset
        for table_name in ("epidemiology", "hospitalizations", "vaccinations",
                           "by-age"):
            data_sources = metadata["sources"]
            pipeline = DataPipeline.load(table_name.replace("-", "_"))
            source_map = map_table_sources_to_index(data_sources,
                                                    pipeline,
                                                    prod_folder=prod_folder)
            output_table_sources(source_map,
                                 workdir / f"{table_name}.sources.csv")

        # Upload to root folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
Example #3
def _test_data_source(pipeline_name: str,
                      data_source_idx: int,
                      random_seed: int = 0):
    # Re-load the data pipeline and data source
    # It seems inefficient but it's necessary because we can't move these objects across
    # processes
    pipeline = DataPipeline.load(pipeline_name)
    data_source = pipeline.data_sources[data_source_idx]

    # Replace the error logging function to keep logs cleaner during tests
    data_source.log_error = _log_nothing
    data_source.log_warning = _log_nothing

    # Load the real cache files
    cache = requests.get("{}/sitemap.json".format(CACHE_URL)).json()

    data_source_name = data_source.__class__.__name__
    data_source_opts = data_source.config
    if data_source_opts.get("test", {}).get("skip"):
        return

    # Make a copy of all auxiliary files
    aux = {
        name: table.copy()
        for name, table in pipeline.auxiliary_tables.items()
    }

    # If we have a hint for the expected keys, use only those from metadata
    metadata_query = data_source_opts.get("test", {}).get("metadata_query")
    if metadata_query:
        aux["metadata"] = aux["metadata"].query(metadata_query)

    # Get a small sample of metadata, since we are testing for whether a source produces
    # _any_ output, not if the output is exhaustive
    sample_size = min(len(aux["metadata"]), METADATA_SAMPLE_SIZE)
    aux["metadata"] = aux["metadata"].sample(sample_size,
                                             random_state=random_seed)

    # Build the failure message to log the config of this data source
    failure_message = (
        f"{data_source_name} from {pipeline_name} pipeline failed with options {data_source_opts} "
        f"and using metadata keys {aux['metadata']['key'].values.tolist()}")

    # Use a different temporary working directory for each data source
    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        try:
            output_data = data_source.run(workdir, cache, aux)
        except Exception as exc:
            traceback.print_exc()
            raise RuntimeError(failure_message) from exc

        # Run our battery of tests against the output data to ensure it looks correct

        # Data source has at least one row in output
        assert len(output_data) >= 1, failure_message
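A few names used above are assumed but not shown: `_log_nothing`, which discards messages to keep test logs clean, plus module-level constants such as `CACHE_URL` and `METADATA_SAMPLE_SIZE`. A sketch with placeholder values (the real configuration will differ):

# Placeholder values for illustration only.
CACHE_URL = "https://example.com/cache"
METADATA_SAMPLE_SIZE = 10

def _log_nothing(*args, **kwargs) -> None:
    # Swallow error and warning messages so test output stays readable.
    pass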
Example #4
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    logger.log_info(f"Combining data sources for {table_name}")

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(
                f"{data_source.uuid(data_pipeline.table)}.csv")
        logger.log_info(
            f"Downloading intermediate tables {intermediate_file_names}")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            workdir / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(
            workdir / "intermediate")
        logger.log_info(
            f"Loaded intermediate tables {intermediate_file_names}")

        # Limit the number of processes to avoid OOM in big datasets
        process_count = 4

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results,
                                                process_count=process_count)
        logger.log_info(f"Combined intermediate tables into {table_name}")

        # Output combined data to disk
        output_path = workdir / "tables" / f"{table_name}.csv"
        export_csv(pipeline_output, output_path, schema=data_pipeline.schema)
        logger.log_info(f"Exported combined {table_name} to CSV")

        # Upload results to the test bucket because these are not prod files
        # They will be copied to prod in the publish step, so main.csv is in sync
        logger.log_info(f"Uploading combined {table_name}...")
        upload_folder(GCS_BUCKET_TEST, "tables", workdir / "tables")

    return Response("OK", status=200)
Example #5
def _test_data_pipeline(pipeline_name: str, random_seed: int = 0):

    # Load the data pipeline to get the number of data sources
    data_pipeline = DataPipeline.load(pipeline_name)

    # Load the data pipeline, iterate over each data source and run it to get its output
    pipeline_count = len(data_pipeline.data_sources)
    map_func = partial(_test_data_source, pipeline_name, random_seed=random_seed)
    results = thread_map(map_func, range(pipeline_count), total=pipeline_count, max_workers=4)

    # Consume the results in case thread_map returns a lazy iterator
    list(results)
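The imports this helper appears to rely on, assuming `thread_map` is tqdm's thread-based map (a project-local wrapper with the same signature would work equally well), plus a hypothetical invocation:

from functools import partial
from tqdm.contrib.concurrent import thread_map

# Hypothetical invocation using a pipeline name mentioned elsewhere in these examples
_test_data_pipeline("epidemiology", random_seed=0)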
Example #6
def update_table(table_name: str = None, job_group: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, process_count=1)
        data_pipeline._save_intermediate_results(workdir / "intermediate",
                                                 intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      workdir / "intermediate")

    return Response("OK", status=200)
Example #7
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(f"{data_source.uuid(data_pipeline.table)}.csv")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            output_folder / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(
            output_folder / "intermediate"
        )

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results)

        # Output combined data to disk
        export_csv(
            pipeline_output,
            output_folder / "tables" / f"{table_name}.csv",
            schema=data_pipeline.schema,
        )

        # Upload results to the test bucket because these are not prod files
        # They will be copied to prod in the publish step, so main.csv is in sync
        upload_folder(GCS_BUCKET_TEST, "tables", output_folder / "tables")

    return Response("OK", status=200)
Example #8
def update_table(table_name: str = None, job_group: str = None) -> str:
    table_name = table_name or request.args.get("table")
    assert table_name in list(get_table_names())
    job_group = job_group or request.args.get("job_group")
    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
        (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]
            assert (
                data_pipeline.data_sources
            ), f"No data sources matched job group {job_group} for table {table_name}"

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        print(f"Data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(output_folder,
                                                   process_count=1)
        data_pipeline._save_intermediate_results(
            output_folder / "intermediate", intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", output_folder / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      output_folder / "intermediate")

    return "OK"
Example #9
def get_source_configs(pipeline_names: List[str]) -> Iterator[Dict]:
    """Map a list of pipeline names to their source configs."""

    for pipeline_name in pipeline_names:
        data_pipeline = DataPipeline.load(pipeline_name)

        for data_source in data_pipeline.data_sources:
            data_source_config = data_source.config

            data_source_name = data_source_config.get("name")
            data_source_fetch_params = data_source_config.get("fetch", [])

            for fetch_param in data_source_fetch_params:
                file_ext = fetch_param.get("opts", {}).get("ext", "")
                data_source_url = fetch_param.get("url")

                yield {
                    "pipeline_name": pipeline_name,
                    "source_name": data_source_name,
                    "url": data_source_url,
                    "ext": file_ext,
                }
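A possible way to consume the generator, e.g. to print every fetched URL per pipeline (the pipeline names are just examples):

for source in get_source_configs(["epidemiology", "vaccinations"]):
    print(f"{source['pipeline_name']}: {source['source_name']} -> {source['url']}")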
Example #10
    # Create a schema to output integers and avoid pandas converting to floating point
    schema_map = dict(date="str", location_key="str")
    schema = {col: schema_map.get(col, "int") for col in source_table.columns}

    # Output the table at the requested location
    export_csv(source_table, output_path, schema)


if __name__ == "__main__":
    # To authenticate with Cloud locally, run the following commands:
    # > $env:GOOGLE_CLOUD_PROJECT = "github-open-covid-19"
    # > $env:GCS_SERVICE_ACCOUNT = "*****@*****.**"
    # > $env:GCP_TOKEN = $(gcloud auth application-default print-access-token)

    # Create the output directory where the sources will go
    output_directory = SRC / ".." / "output" / "sources"
    output_directory.mkdir(exist_ok=True, parents=True)

    # Get the data sources and write a JSON file summarizing them to disk
    metadata = create_metadata_dict()
    with open(output_directory / "metadata.json", "w") as fh:
        json.dump(metadata, fh)

    # Iterate over the individual tables and build their sources file
    for table_name in ("epidemiology", "hospitalizations", "vaccinations",
                       "by-age"):
        pipeline = DataPipeline.load(table_name.replace("-", "_"))
        source_map = map_table_sources_to_index(metadata["sources"], pipeline)
        output_table_sources(source_map,
                             output_directory / f"{table_name}.sources.csv")
Example #11
def main(
        output_folder: Path,
        verify: str = None,
        only: List[str] = None,
        exclude: List[str] = None,
        process_count: int = cpu_count(),
        show_progress: bool = True,
) -> None:
    """
    Executes the data pipelines and places all outputs into `output_folder`. This is typically
    followed by publishing of the contents of the output folder to a server.

    Args:
        output_folder: Root folder where snapshot, intermediate and tables will be placed.
        verify: Run anomaly detection on the outputs using this strategy. Value must be one of:
            - None: (default) perform no anomaly detection
            - "simple": perform only fast anomaly detection
            - "full": perform exhaustive anomaly detection (can be very slow)
        only: If provided, only pipelines with a name appearing in this list will be run.
        exclude: If provided, pipelines with a name appearing in this list will not be run.
        process_count: Maximum number of processes to use during the data pipeline execution.
        show_progress: Display progress for the execution of individual DataSources within this
            pipeline.
    """

    assert not (only is not None and exclude is not None
                ), "--only and --exclude options cannot be used simultaneously"

    # Ensure that there is an output folder to put the data in
    (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
    (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)
    (output_folder / "tables").mkdir(parents=True, exist_ok=True)

    # A pipeline chain is any subfolder not starting with "_" in the pipelines folder
    all_pipeline_names = []
    for item in (ROOT / "src" / "pipelines").iterdir():
        if not item.name.startswith("_") and not item.is_file():
            all_pipeline_names.append(item.name)

    # Verify that all of the provided pipeline names exist as pipelines
    for pipeline_name in (only or []) + (exclude or []):
        module_name = pipeline_name.replace("-", "_")
        assert module_name in all_pipeline_names, f'"{pipeline_name}" pipeline does not exist'

    # Run all the pipelines and place their outputs into the output folder
    # The output name for each pipeline chain will be the name of the directory that the chain is in
    for pipeline_name in all_pipeline_names:
        table_name = pipeline_name.replace("_", "-")
        # Skip if `exclude` was provided and this table is in it
        if exclude is not None and table_name in exclude:
            continue
        # Skip if `only` was provided and this table is not in it
        if only is not None and table_name not in only:
            continue
        data_pipeline = DataPipeline.load(pipeline_name)
        pipeline_output = data_pipeline.run(
            pipeline_name,
            output_folder,
            verify=verify,
            process_count=process_count,
            progress=show_progress,
        )
        export_csv(pipeline_output,
                   output_folder / "tables" / f"{table_name}.csv")
Example #12
def main(
        output_folder: Path,
        verify: str = None,
        only: List[str] = None,
        exclude: List[str] = None,
        location_key: str = None,
        strict_match: bool = False,
        process_count: int = cpu_count(),
        skip_download: bool = False,
) -> None:
    """
    Executes the data pipelines and places all outputs into `output_folder`. This is typically
    followed by publishing of the contents of the output folder to a server.

    Args:
        output_folder: Root folder where snapshot, intermediate and tables will be placed.
        verify: Run anomaly detection on the outputs using this strategy. Value must be one of:
            - None: (default) perform no anomaly detection
            - "simple": perform only fast anomaly detection
            - "full": perform exhaustive anomaly detection (can be very slow)
        only: If provided, only pipelines with a name appearing in this list will be run.
        exclude: If provided, pipelines with a name appearing in this list will not be run.
        location_key: If present, only run data sources which output data for this location.
        strict_match: In combination with `location_key`, filter data to only output `location_key`.
        process_count: Maximum number of processes to use during the data pipeline execution.
        skip_download: Skip downloading data sources if a cached version is available.
    """

    assert not (only is not None and exclude is not None
                ), "--only and --exclude options cannot be used simultaneously"

    # Ensure that there is an output folder to put the data in
    (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
    (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)
    (output_folder / "tables").mkdir(parents=True, exist_ok=True)

    # A pipeline chain is any subfolder not starting with "_" in the pipelines folder
    all_pipeline_names = []
    for item in (SRC / "pipelines").iterdir():
        if not item.name.startswith("_") and not item.is_file():
            all_pipeline_names.append(item.name)

    # Verify that all of the provided pipeline names exist as pipelines
    for pipeline_name in (only or []) + (exclude or []):
        module_name = pipeline_name.replace("-", "_")
        assert module_name in all_pipeline_names, f'"{pipeline_name}" pipeline does not exist'

    # Run all the pipelines and place their outputs into the output folder. The output name for
    # each pipeline chain will be the name of the directory that the chain is in.
    for pipeline_name in all_pipeline_names:
        table_name = pipeline_name.replace("_", "-")

        # Skip if `exclude` was provided and this table is in it
        if exclude is not None and table_name in exclude:
            continue

        # Skip if `only` was provided and this table is not in it
        if only is not None and table_name not in only:
            continue

        # Load data pipeline and get rid of data sources if requested
        data_pipeline = DataPipeline.load(pipeline_name)
        if location_key is not None:
            exprs = [
                src.config.get("test", {}).get("location_key_match", ".*")
                for src in data_pipeline.data_sources
            ]
            exprs = [
                expr if isinstance(expr, list) else [expr] for expr in exprs
            ]
            data_pipeline.data_sources = [
                src for src, expr in zip(data_pipeline.data_sources, exprs)
                if any(re.match(expr_, location_key) for expr_ in expr)
            ]

        # Run the data pipeline to retrieve live data
        pipeline_output = data_pipeline.run(
            output_folder,
            process_count=process_count,
            verify_level=verify,
            skip_existing=skip_download,
        )

        # Filter out data output if requested
        if location_key is not None and strict_match:
            pipeline_output = pipeline_output[pipeline_output["key"] ==
                                              location_key]

        # Export the data output to disk as a CSV file
        export_csv(
            pipeline_output,
            output_folder / "tables" / f"{table_name}.csv",
            schema=data_pipeline.schema,
        )
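To illustrate the `location_key_match` filtering above: each data source's pattern (or list of patterns) is matched against the requested location key, and the source is kept if any pattern matches. A small standalone example with made-up patterns:

import re

exprs = [["^US_.*"], [".*"], ["^FR$"]]   # per-source patterns, already normalized to lists
location_key = "US_CA"
keep = [any(re.match(pattern, location_key) for pattern in expr) for expr in exprs]
print(keep)  # [True, True, False]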
Example #13
def get_pipelines() -> Iterator[DataPipeline]:
    """ Iterator with all the available data pipelines """
    for pipeline_name in get_pipeline_names():
        yield DataPipeline.load(pipeline_name)
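A possible usage, relying only on attributes referenced elsewhere in these examples (`table` and `data_sources`):

for pipeline in get_pipelines():
    print(pipeline.table, len(pipeline.data_sources))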
Example #14
    profiler.enable()

# A pipeline chain is any subfolder not starting with "_" in the pipelines folder
all_pipeline_chains = []
for item in (ROOT / "src" / "pipelines").iterdir():
    if not item.name.startswith("_") and not item.is_file():
        all_pipeline_chains.append(item.name)

# Run all the pipelines and place their outputs into the output folder
# The output name for each pipeline chain will be the name of the directory that the chain is in
for pipeline_name in all_pipeline_chains:
    table_name = pipeline_name.replace("_", "-")
    if args.only and not table_name in args.only.split(","):
        continue
    if args.exclude and table_name in args.exclude.split(","):
        continue
    pipeline_chain = DataPipeline.load(pipeline_name)
    show_progress = not args.no_progress
    pipeline_output = pipeline_chain.run(pipeline_name,
                                         verify=args.verify,
                                         process_count=args.process_count,
                                         progress=show_progress)
    export_csv(pipeline_output,
               ROOT / "output" / "tables" / f"{table_name}.csv")

if args.profile:
    stats = Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats("cumtime")
    stats.print_stats(20)
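This fragment starts after the profiler has already been created; a minimal setup that would make it self-contained, assuming the standard library's cProfile is used (consistent with the `Stats` calls above):

from cProfile import Profile
from pstats import Stats

if args.profile:
    profiler = Profile()
    profiler.enable()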