def update_table(table_name: str = None, job_group: str = None, parallel_jobs: int = 8) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)

    # Default to 1 if invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and avoid
        # downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate", intermediate_results)
        intermediate_files = list(map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", workdir / "intermediate")

    return Response("OK", status=200)
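# NOTE: hedged sketch, not part of the original module. The helpers `_get_request_param` and
# `safe_int_cast` are referenced above but not defined here; minimal implementations could look
# roughly like the following, assuming a Flask-style global `request` object is available.
def _get_request_param(name: str, default: str = None) -> str:
    """Read a parameter from the incoming request, falling back to the given default."""
    try:
        return request.args.get(name) or default
    except Exception:
        return default


def safe_int_cast(value) -> int:
    """Cast `value` to an integer, returning None when the cast is not possible."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None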
def publish_sources(prod_folder: str = "v3") -> Response:
    """Publishes a table with the source of each datapoint."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:

        # Get the data sources and write a JSON file summarizing them to disk
        metadata = create_metadata_dict()
        with open(workdir / "metadata.json", "w") as fh:
            json.dump(metadata, fh)

        # Iterate over the individual tables and build their sources file
        # TODO: create source map for all tables, not just a hand-picked subset
        for table_name in ("epidemiology", "hospitalizations", "vaccinations", "by-age"):
            data_sources = metadata["sources"]
            pipeline = DataPipeline.load(table_name.replace("-", "_"))
            source_map = map_table_sources_to_index(
                data_sources, pipeline, prod_folder=prod_folder
            )
            output_table_sources(source_map, workdir / f"{table_name}.sources.csv")

        # Upload to root folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
def _test_data_source(pipeline_name: str, data_source_idx: int, random_seed: int = 0):
    # Re-load the data pipeline and data source
    # It seems inefficient but it's necessary because we can't move these objects across
    # processes
    pipeline = DataPipeline.load(pipeline_name)
    data_source = pipeline.data_sources[data_source_idx]

    # Replace the error logging function to keep logs cleaner during tests
    data_source.log_error = _log_nothing
    data_source.log_warning = _log_nothing

    # Load the real cache files
    cache = requests.get("{}/sitemap.json".format(CACHE_URL)).json()

    data_source_name = data_source.__class__.__name__
    data_source_opts = data_source.config
    if data_source_opts.get("test", {}).get("skip"):
        return

    # Make a copy of all auxiliary files
    aux = {name: table.copy() for name, table in pipeline.auxiliary_tables.items()}

    # If we have a hint for the expected keys, use only those from metadata
    metadata_query = data_source_opts.get("test", {}).get("metadata_query")
    if metadata_query:
        aux["metadata"] = aux["metadata"].query(metadata_query)

    # Get a small sample of metadata, since we are testing for whether a source produces
    # _any_ output, not if the output is exhaustive
    sample_size = min(len(aux["metadata"]), METADATA_SAMPLE_SIZE)
    aux["metadata"] = aux["metadata"].sample(sample_size, random_state=random_seed)

    # Build the failure message to log the config of this data source
    failure_message = (
        f"{data_source_name} from {pipeline_name} pipeline failed with options {data_source_opts} "
        f"and using metadata keys {aux['metadata']['key'].values.tolist()}"
    )

    # Use a different temporary working directory for each data source
    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        try:
            output_data = data_source.run(workdir, cache, aux)
        except Exception as exc:
            traceback.print_exc()
            raise RuntimeError(failure_message) from exc

        # Run our battery of tests against the output data to ensure it looks correct

        # Data source has at least one row in output
        assert len(output_data) >= 1, failure_message
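# NOTE: hedged sketch, not part of the original module. `_log_nothing` is referenced above to
# silence data source logging during tests; presumably it is a no-op with a permissive signature:
def _log_nothing(*args, **kwargs) -> None:
    pass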
def combine_table(table_name: str = None) -> Response: table_name = _get_request_param("table", table_name) logger.log_info(f"Combining data sources for {table_name}") # Early exit: table name not found if table_name not in list(get_table_names()): return Response(f"Invalid table name {table_name}", status=400) with temporary_directory() as workdir: (workdir / "tables").mkdir(parents=True, exist_ok=True) # Load the pipeline configuration given its name pipeline_name = table_name.replace("-", "_") data_pipeline = DataPipeline.load(pipeline_name) # Get a list of the intermediate files used by this data pipeline intermediate_file_names = [] for data_source in data_pipeline.data_sources: intermediate_file_names.append( f"{data_source.uuid(data_pipeline.table)}.csv") logger.log_info( f"Downloading intermediate tables {intermediate_file_names}") # Download only the necessary intermediate files download_folder( GCS_BUCKET_TEST, "intermediate", workdir / "intermediate", lambda x: x.name in intermediate_file_names, ) # Re-load all intermediate results intermediate_results = data_pipeline._load_intermediate_results( workdir / "intermediate") logger.log_info( f"Loaded intermediate tables {intermediate_file_names}") # Limit the number of processes to avoid OOM in big datasets process_count = 4 # Combine all intermediate results into a single dataframe pipeline_output = data_pipeline.combine(intermediate_results, process_count=process_count) logger.log_info(f"Combined intermediate tables into {table_name}") # Output combined data to disk output_path = workdir / "tables" / f"{table_name}.csv" export_csv(pipeline_output, output_path, schema=data_pipeline.schema) logger.log_info(f"Exported combined {table_name} to CSV") # Upload results to the test bucket because these are not prod files # They will be copied to prod in the publish step, so main.csv is in sync logger.log_info(f"Uploading combined {table_name}...") upload_folder(GCS_BUCKET_TEST, "tables", workdir / "tables") return Response("OK", status=200)
def _test_data_pipeline(pipeline_name: str, random_seed: int = 0):
    # Load the data pipeline to get the number of data sources
    data_pipeline = DataPipeline.load(pipeline_name)

    # Load the data pipeline, iterate over each data source and run it to get its output
    pipeline_count = len(data_pipeline.data_sources)
    map_func = partial(_test_data_source, pipeline_name, random_seed=random_seed)
    _ = thread_map(map_func, range(pipeline_count), total=pipeline_count, max_workers=4)

    # Consume the results
    list(_)
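# NOTE: hedged usage sketch, not part of the original module. Assuming a `get_pipeline_names`
# helper exists (it is used by `get_pipelines` further below), the per-pipeline smoke test could
# be driven for every available pipeline like this:
def _test_all_data_pipelines(random_seed: int = 0):
    for pipeline_name in get_pipeline_names():
        _test_data_pipeline(pipeline_name, random_seed=random_seed)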
def update_table(table_name: str = None, job_group: int = None) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, process_count=1)
        data_pipeline._save_intermediate_results(workdir / "intermediate", intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", workdir / "intermediate")

    return Response("OK", status=200)
def combine_table(table_name: str = None) -> Response: table_name = _get_request_param("table", table_name) # Early exit: table name not found if table_name not in list(get_table_names()): return Response(f"Invalid table name {table_name}", status=400) with TemporaryDirectory() as output_folder: output_folder = Path(output_folder) (output_folder / "tables").mkdir(parents=True, exist_ok=True) # Load the pipeline configuration given its name pipeline_name = table_name.replace("-", "_") data_pipeline = DataPipeline.load(pipeline_name) # Get a list of the intermediate files used by this data pipeline intermediate_file_names = [] for data_source in data_pipeline.data_sources: intermediate_file_names.append(f"{data_source.uuid(data_pipeline.table)}.csv") # Download only the necessary intermediate files download_folder( GCS_BUCKET_TEST, "intermediate", output_folder / "intermediate", lambda x: x.name in intermediate_file_names, ) # Re-load all intermediate results intermediate_results = data_pipeline._load_intermediate_results( output_folder / "intermediate" ) # Combine all intermediate results into a single dataframe pipeline_output = data_pipeline.combine(intermediate_results) # Output combined data to disk export_csv( pipeline_output, output_folder / "tables" / f"{table_name}.csv", schema=data_pipeline.schema, ) # Upload results to the test bucket because these are not prod files # They will be copied to prod in the publish step, so main.csv is in sync upload_folder(GCS_BUCKET_TEST, "tables", output_folder / "tables") return Response("OK", status=200)
def update_table(table_name: str = None, job_group: int = None) -> str:
    table_name = table_name or request.args.get("table")
    assert table_name in list(get_table_names())
    try:
        job_group = request.args.get("job_group")
    except Exception:
        pass

    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
        (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]
            assert (
                data_pipeline.data_sources
            ), f"No data sources matched job group {job_group} for table {table_name}"

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        print(f"Data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(output_folder, process_count=1)
        data_pipeline._save_intermediate_results(output_folder / "intermediate", intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", output_folder / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", output_folder / "intermediate")

    return "OK"
def get_source_configs(pipeline_names: List[str]) -> Iterator[Dict]:
    """Map a list of pipeline names to their source configs."""
    for pipeline_name in pipeline_names:
        data_pipeline = DataPipeline.load(pipeline_name)
        for data_source in data_pipeline.data_sources:
            data_source_config = data_source.config
            data_source_name = data_source_config.get("name")
            data_source_fetch_params = data_source_config.get("fetch", [])
            for fetch_param in data_source_fetch_params:
                file_ext = fetch_param.get("opts", {}).get("ext", "")
                data_source_url = fetch_param.get("url")
                yield {
                    "pipeline_name": pipeline_name,
                    "source_name": data_source_name,
                    "url": data_source_url,
                    "ext": file_ext,
                }
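# NOTE: hedged usage sketch, not part of the original module. `get_source_configs` yields one
# record per fetch parameter, so collecting the source URLs for a single pipeline could look like:
def _example_list_source_urls(pipeline_name: str = "epidemiology") -> List[str]:
    return [cfg["url"] for cfg in get_source_configs([pipeline_name]) if cfg["url"]]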
    # Create a schema to output integers and avoid pandas converting to floating point
    schema_map = dict(date="str", location_key="str")
    schema = {col: schema_map.get(col, "int") for col in source_table.columns}

    # Output the table at the requested location
    export_csv(source_table, output_path, schema)


if __name__ == "__main__":
    # To authenticate with Cloud locally, run the following commands:
    # > $env:GOOGLE_CLOUD_PROJECT = "github-open-covid-19"
    # > $env:GCS_SERVICE_ACCOUNT = "*****@*****.**"
    # > $env:GCP_TOKEN = $(gcloud auth application-default print-access-token)

    # Create the output directory where the sources will go
    output_directory = SRC / ".." / "output" / "sources"
    output_directory.mkdir(exist_ok=True, parents=True)

    # Get the data sources and write a JSON file summarizing them to disk
    metadata = create_metadata_dict()
    with open(output_directory / "metadata.json", "w") as fh:
        json.dump(metadata, fh)

    # Iterate over the individual tables and build their sources file
    for table_name in ("epidemiology", "hospitalizations", "vaccinations", "by-age"):
        pipeline = DataPipeline.load(table_name.replace("-", "_"))
        source_map = map_table_sources_to_index(metadata["sources"], pipeline)
        output_table_sources(source_map, output_directory / f"{table_name}.sources.csv")
def main(
    output_folder: Path,
    verify: str = None,
    only: List[str] = None,
    exclude: List[str] = None,
    process_count: int = cpu_count(),
    show_progress: bool = True,
) -> None:
    """
    Executes the data pipelines and places all outputs into `output_folder`.

    This is typically followed by publishing of the contents of the output folder to a server.

    Args:
        output_folder: Root folder where snapshot, intermediate and tables will be placed.
        verify: Run anomaly detection on the outputs using this strategy. Value must be one of:
            - None: (default) perform no anomaly detection
            - "simple": perform only fast anomaly detection
            - "full": perform exhaustive anomaly detection (can be very slow)
        only: If provided, only pipelines with a name appearing in this list will be run.
        exclude: If provided, pipelines with a name appearing in this list will not be run.
        process_count: Maximum number of processes to use during the data pipeline execution.
        show_progress: Display progress for the execution of individual DataSources within this
            pipeline.
    """
    assert not (
        only is not None and exclude is not None
    ), "--only and --exclude options cannot be used simultaneously"

    # Ensure that there is an output folder to put the data in
    (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
    (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)
    (output_folder / "tables").mkdir(parents=True, exist_ok=True)

    # A pipeline chain is any subfolder not starting with "_" in the pipelines folder
    all_pipeline_names = []
    for item in (ROOT / "src" / "pipelines").iterdir():
        if not item.name.startswith("_") and not item.is_file():
            all_pipeline_names.append(item.name)

    # Verify that all of the provided pipeline names exist as pipelines
    for pipeline_name in (only or []) + (exclude or []):
        module_name = pipeline_name.replace("-", "_")
        assert module_name in all_pipeline_names, f'"{pipeline_name}" pipeline does not exist'

    # Run all the pipelines and place their outputs into the output folder
    # The output name for each pipeline chain will be the name of the directory that the chain is in
    for pipeline_name in all_pipeline_names:
        table_name = pipeline_name.replace("_", "-")

        # Skip if `exclude` was provided and this table is in it
        if exclude is not None and table_name in exclude:
            continue

        # Skip if `only` was provided and this table is not in it
        if only is not None and table_name not in only:
            continue

        data_pipeline = DataPipeline.load(pipeline_name)
        pipeline_output = data_pipeline.run(
            pipeline_name,
            output_folder,
            verify=verify,
            process_count=process_count,
            progress=show_progress,
        )
        export_csv(pipeline_output, output_folder / "tables" / f"{table_name}.csv")
def main(
    output_folder: Path,
    verify: str = None,
    only: List[str] = None,
    exclude: List[str] = None,
    location_key: str = None,
    strict_match: bool = False,
    process_count: int = cpu_count(),
    skip_download: bool = False,
) -> None:
    """
    Executes the data pipelines and places all outputs into `output_folder`.

    This is typically followed by publishing of the contents of the output folder to a server.

    Args:
        output_folder: Root folder where snapshot, intermediate and tables will be placed.
        verify: Run anomaly detection on the outputs using this strategy. Value must be one of:
            - None: (default) perform no anomaly detection
            - "simple": perform only fast anomaly detection
            - "full": perform exhaustive anomaly detection (can be very slow)
        only: If provided, only pipelines with a name appearing in this list will be run.
        exclude: If provided, pipelines with a name appearing in this list will not be run.
        location_key: If present, only run data sources which output data for this location.
        strict_match: In combination with `location_key`, filter data to only output `location_key`.
        process_count: Maximum number of processes to use during the data pipeline execution.
        skip_download: Skip downloading data sources if a cached version is available.
    """
    assert not (
        only is not None and exclude is not None
    ), "--only and --exclude options cannot be used simultaneously"

    # Ensure that there is an output folder to put the data in
    (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
    (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)
    (output_folder / "tables").mkdir(parents=True, exist_ok=True)

    # A pipeline chain is any subfolder not starting with "_" in the pipelines folder
    all_pipeline_names = []
    for item in (SRC / "pipelines").iterdir():
        if not item.name.startswith("_") and not item.is_file():
            all_pipeline_names.append(item.name)

    # Verify that all of the provided pipeline names exist as pipelines
    for pipeline_name in (only or []) + (exclude or []):
        module_name = pipeline_name.replace("-", "_")
        assert module_name in all_pipeline_names, f'"{pipeline_name}" pipeline does not exist'

    # Run all the pipelines and place their outputs into the output folder. The output name for
    # each pipeline chain will be the name of the directory that the chain is in.
    for pipeline_name in all_pipeline_names:
        table_name = pipeline_name.replace("_", "-")

        # Skip if `exclude` was provided and this table is in it
        if exclude is not None and table_name in exclude:
            continue

        # Skip if `only` was provided and this table is not in it
        if only is not None and table_name not in only:
            continue

        # Load data pipeline and get rid of data sources if requested
        data_pipeline = DataPipeline.load(pipeline_name)
        if location_key is not None:
            exprs = [
                src.config.get("test", {}).get("location_key_match", ".*")
                for src in data_pipeline.data_sources
            ]
            exprs = [expr if isinstance(expr, list) else [expr] for expr in exprs]
            data_pipeline.data_sources = [
                src
                for src, expr in zip(data_pipeline.data_sources, exprs)
                if any(re.match(expr_, location_key) for expr_ in expr)
            ]

        # Run the data pipeline to retrieve live data
        pipeline_output = data_pipeline.run(
            output_folder,
            process_count=process_count,
            verify_level=verify,
            skip_existing=skip_download,
        )

        # Filter out data output if requested
        if location_key is not None and strict_match:
            pipeline_output = pipeline_output[pipeline_output["key"] == location_key]

        # Export the data output to disk as a CSV file
        export_csv(
            pipeline_output,
            output_folder / "tables" / f"{table_name}.csv",
            schema=data_pipeline.schema,
        )
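# NOTE: hedged usage sketch, not part of the original module; argument values are illustrative.
def _example_local_run(output_folder: Path = Path("output")) -> None:
    """Run only the epidemiology table for a single location, using a modest process count."""
    main(
        output_folder=output_folder,
        only=["epidemiology"],
        location_key="US_CA",
        process_count=4,
    )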
def get_pipelines() -> Iterator[DataPipeline]:
    """Iterator with all the available data pipelines."""
    for pipeline_name in get_pipeline_names():
        yield DataPipeline.load(pipeline_name)
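# NOTE: hedged usage sketch, not part of the original module. `get_pipelines` makes it easy to
# aggregate over every pipeline, for example counting the configured data sources:
def _example_count_data_sources() -> int:
    return sum(len(pipeline.data_sources) for pipeline in get_pipelines())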
profiler.enable()

# A pipeline chain is any subfolder not starting with "_" in the pipelines folder
all_pipeline_chains = []
for item in (ROOT / "src" / "pipelines").iterdir():
    if not item.name.startswith("_") and not item.is_file():
        all_pipeline_chains.append(item.name)

# Run all the pipelines and place their outputs into the output folder
# The output name for each pipeline chain will be the name of the directory that the chain is in
for pipeline_name in all_pipeline_chains:
    table_name = pipeline_name.replace("_", "-")
    if args.only and table_name not in args.only.split(","):
        continue
    if args.exclude and table_name in args.exclude.split(","):
        continue
    pipeline_chain = DataPipeline.load(pipeline_name)
    show_progress = not args.no_progress
    pipeline_output = pipeline_chain.run(
        pipeline_name,
        verify=args.verify,
        process_count=args.process_count,
        progress=show_progress,
    )
    export_csv(pipeline_output, ROOT / "output" / "tables" / f"{table_name}.csv")

if args.profile:
    stats = Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats("cumtime")
    stats.print_stats(20)