def read_source_output(data_pipeline: DataPipeline, data_source: DataSource) -> DataFrame:
    """
    Describe the intermediate output file of one data source.

    The file `intermediate/<uuid>.csv` is downloaded from `GCS_BUCKET_TEST` into a
    temporary directory, where `<uuid>` comes from `data_source.uuid(data_pipeline.table)`.

    Arguments:
        data_pipeline: Pipeline that the data source belongs to.
        data_source: Data source whose intermediate output is inspected.
    Returns:
        A dict with the keys `pipeline`, `data_source`, `columns`, `first_date`,
        `last_date` and `location_keys`. If anything fails, the exception is printed
        to stderr and an empty list is returned instead.
    """
    with temporary_directory() as workdir:
        output_path = workdir / f"{data_source.uuid(data_pipeline.table)}.csv"
        try:
            remote_name = f"intermediate/{output_path.name}"
            download_file(GCS_BUCKET_TEST, remote_name, output_path)
            columns = get_table_columns(output_path)

            # Tables without a date column report `None` as their first and last date
            if "date" in columns:
                dates = list(table_read_column(output_path, "date"))
            else:
                dates = [None]

            summary = {"pipeline": data_pipeline.name}
            summary["data_source"] = f"{data_source.__module__}.{data_source.name}"
            summary["columns"] = ",".join(columns)
            summary["first_date"] = min(dates)
            summary["last_date"] = max(dates)
            unique_keys = set(table_read_column(output_path, "key"))
            summary["location_keys"] = ",".join(sorted(unique_keys))
            return summary

        except Exception as exc:
            print(exc, file=sys.stderr)
            return []
def _latest_date_by_group(tables_folder: Path, group_by: str = "location_key") -> Dict[str, str]:
    """
    Find the greatest `date` value seen for each value of the `group_by` column.

    Only the `*.csv` files directly inside `tables_folder` are scanned, and files
    without a `date` column are ignored.

    Arguments:
        tables_folder: Folder containing the CSV tables to scan.
        group_by: Name of the column whose values become the keys of the result.
    Returns:
        Dict[str, str]: Map from each group value to the largest date paired with it.
    """
    latest: Dict[str, str] = {}
    for csv_path in tables_folder.glob("*.csv"):
        if "date" not in get_table_columns(csv_path):
            continue

        date_values = table_read_column(csv_path, "date")
        group_values = table_read_column(csv_path, group_by)
        for date, key in zip(date_values, group_values):
            current = latest.get(key, date)
            latest[key] = date if date > current else current

    return latest
def _make_location_key_and_date_table(index_table: Path, output_path: Path) -> None:
    """
    Write to `output_path` every combination of <location key x date>.

    Keys are read from `index_table`, using the `location_key` column when present
    and `key` otherwise. Dates come from `date_range("2020-01-01", <tomorrow>)`.

    Arguments:
        index_table: Path to the table listing all the location keys.
        output_path: Path where the cross product table is written.
    """
    # Intermediate single-column tables live in a temporary directory
    with temporary_directory() as workdir:

        # The index table is required to enumerate the keys
        assert index_table.exists(), "Index table not found"

        # The index table decides which of the two column names is in use
        if "location_key" in get_table_columns(index_table):
            location_key = "location_key"
        else:
            location_key = "key"

        # Single-column table holding only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            for value in table_read_column(index_table, location_key):
                fd.write(f"{value}\n")

        # Single-column table holding the dates, ending one day after today
        tomorrow = datetime.datetime.now() + datetime.timedelta(days=1)
        max_date = tomorrow.date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            for value in date_range("2020-01-01", max_date):
                fd.write(f"{value}\n")

        # Combine both tables so later left joins have a row per key and date
        table_cross_product(keys_table_path, date_table_path, output_path)
def publish_v3_main_table() -> Response:
    """
    Build `aggregated.csv.gz` from the v3 location tables and upload it.

    Files under `v3` in `GCS_BUCKET_PROD` whose path contains `location/` are
    downloaded, merged into a single gzip-compressed CSV, and the result is
    uploaded back to the `v3` folder of the same bucket.

    Returns:
        Response: An "OK" response with status 200.
    """

    def is_location_table(remote_path) -> bool:
        return "location/" in str(remote_path)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        for folder in (input_folder, output_folder):
            folder.mkdir(parents=True, exist_ok=True)

        # The metadata table provides the list of valid location keys
        metadata_path = SRC / "data" / "metadata.csv"
        location_keys = list(table_read_column(metadata_path, "key"))

        # Bring all the location breakout tables into local storage
        download_folder(GCS_BUCKET_PROD, "v3", input_folder, is_location_table)
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Merge everything directly into a compressed file
        agg_file_path = output_folder / "aggregated.csv.gz"
        with gzip.open(agg_file_path, "wt") as compressed_file:
            merge_location_breakout_tables(input_folder, compressed_file, location_keys)

        # Push the output folder to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
def main(output_folder: Path, tables_folder: Path, use_table_names: List[str] = None) -> None:
    """
    Publish the processed tables from `tables_folder` into `<output_folder>/v3`.

    The steps performed are:
    1. Remove everything in `output_folder` except entries whose name starts with a dot.
    2. Publish the global tables into the `v3` subfolder.
    3. Break out the tables by location key into a temporary folder, then publish the
       per-location aggregates into `v3/location`.
    4. Merge the per-location aggregates into `v3/aggregated.csv.gz`.
    5. Convert the CSV files under `v3` to JSON.

    Arguments:
        output_folder: Root folder that receives the published outputs.
        tables_folder: Folder containing the processed tables.
        use_table_names: Optional table names, forwarded to the breakout and aggregate steps.
    """
    # Start from a clean output folder, leaving hidden entries untouched
    visible_items = (
        item for item in output_folder.glob("*") if not item.name.startswith(".")
    )
    for item in visible_items:
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Everything is published under a folder with a stable schema
    v3_folder = output_folder / "v3"
    v3_folder.mkdir(exist_ok=True, parents=True)

    # Tables containing all location keys
    publish_global_tables(tables_folder, v3_folder)

    # Location breakouts are only needed while computing the aggregates
    with temporary_directory() as breakout_folder:
        publish_location_breakouts(v3_folder, breakout_folder, use_table_names=use_table_names)

        # Destination of the per-location aggregates
        location_aggregates_folder = v3_folder / "location"
        location_aggregates_folder.mkdir(exist_ok=True, parents=True)

        # Each location is aggregated independently
        location_keys = table_read_column(v3_folder / "index.csv", "location_key")
        publish_location_aggregates(
            breakout_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=use_table_names,
        )

    # Single aggregated table, written straight into a compressed file
    agg_file_path = v3_folder / "aggregated.csv.gz"
    with gzip.open(agg_file_path, "wt") as compressed_file:
        merge_location_breakout_tables(location_aggregates_folder, compressed_file)

    # JSON copies of all CSV files, using values format
    convert_tables_to_json(v3_folder, v3_folder)
def publish_json_locations(
    prod_folder: str = "v2", location_key_from: str = None, location_key_until: str = None
) -> Response:
    """
    Convert the per-location CSV tables of a prod folder to JSON and upload them.

    Each argument is passed through `_get_request_param` under its own name, with the
    given value supplied as the second argument.

    Arguments:
        prod_folder: Folder of `GCS_BUCKET_PROD` to process. Only "v2" and "v3" match
            any file; other values download nothing.
        location_key_from: If not None, only keys >= this value are processed.
        location_key_until: If not None, only keys <= this value are processed.
    Returns:
        Response: An "OK" response with status 200.
    """
    prod_folder = _get_request_param("prod_folder", prod_folder)
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Convert the tables to JSON for each location independently
        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]

        # The requested range may not contain any key, so never index an empty list
        if location_keys:
            key_range = f"from {location_keys[0]} until {location_keys[-1]}"
        else:
            key_range = "(no keys in the requested range)"
        logger.log_info(f"Converting {len(location_keys)} location subsets to JSON {key_range}")

        # Membership is tested once per remote file, so use a set for the lookups
        valid_keys = set(location_keys)

        # Download all the processed tables into our local storage
        def match_path(table_path: Path) -> bool:
            try:
                if prod_folder == "v2":
                    location_key, table_name = str(table_path).split("/", 1)
                    return table_name == "main.csv" and location_key in valid_keys
                if prod_folder == "v3":
                    location_path, location_key = table_path.parent.name, table_path.stem
                    return location_path == "location" and location_key in valid_keys
            except Exception:
                # Paths which cannot be parsed (e.g. no "/" in a v2 path) never match
                return False

            # Unknown prod folders do not match any file
            return False

        download_folder(GCS_BUCKET_PROD, prod_folder, input_folder, match_path)
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        converted_count = sum(1 for _ in output_folder.glob("**/*.json"))
        logger.log_info(f"Converted {converted_count} files to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
def publish_v3_location_subsets(
    location_key_from: str = None, location_key_until: str = None
) -> Response:
    """
    Publish the per-location aggregate tables for a range of location keys.

    The global v3 CSV tables are downloaded from `GCS_BUCKET_PROD`, broken out by
    location key, aggregated per location into a `location` folder, and uploaded back
    to the `v3` folder of the same bucket.

    Each argument is passed through `_get_request_param` under its own name, with the
    given value supplied as the second argument.

    Arguments:
        location_key_from: If not None, only keys >= this value are processed.
        location_key_until: If not None, only keys <= this value are processed.
    Returns:
        Response: An "OK" response with status 200.
    """
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        intermediate_folder = workdir / "temp"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]

        # The requested range may not contain any key, so never index an empty list
        if location_keys:
            key_range = f"from {location_keys[0]} until {location_keys[-1]}"
        else:
            key_range = "(no keys in the requested range)"
        logger.log_info(f"Publishing {len(location_keys)} location subsets {key_range}")

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            "v3",
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x) for token in forbid_tokens),
        )
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(input_folder, intermediate_folder, use_table_names=V3_TABLE_LIST)
        logger.log_info("Created all table location breakouts")

        # Create a folder which will host all the location aggregates
        location_aggregates_folder = output_folder / "location"
        location_aggregates_folder.mkdir(parents=True, exist_ok=True)

        # Aggregate the tables for each location independently
        publish_location_aggregates(
            intermediate_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=V3_TABLE_LIST,
        )
        logger.log_info("Aggregated all table breakouts by location")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
def publish_v3_location_subsets(
    location_key_from: str = None, location_key_until: str = None
) -> Response:
    """
    Publish the per-location aggregate tables for a range of location keys.

    Files under `v3` in `GCS_BUCKET_PROD` whose path contains no "/" are downloaded,
    broken out by location key, aggregated per location, and the output is uploaded
    back to the `v3` folder of the same bucket.

    Each argument is passed through `_get_request_param` under its own name, with the
    given value supplied as the second argument.

    Arguments:
        location_key_from: If not None, only keys >= this value are processed.
        location_key_until: If not None, only keys <= this value are processed.
    Returns:
        Response: An "OK" response with status 200.
    """
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        intermediate_folder = workdir / "temp"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]

        # The requested range may not contain any key, so never index an empty list
        if location_keys:
            key_range = f"from {location_keys[0]} until {location_keys[-1]}"
        else:
            key_range = "(no keys in the requested range)"
        logger.log_info(f"Publishing {len(location_keys)} location subsets {key_range}")

        # Download all the global tables into our local storage
        download_folder(GCS_BUCKET_PROD, "v3", input_folder, lambda x: "/" not in str(x))

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(input_folder, intermediate_folder, use_table_names=V3_TABLE_LIST)

        # Aggregate the tables for each location independently
        publish_location_aggregates(
            intermediate_folder, output_folder, location_keys, use_table_names=V3_TABLE_LIST
        )

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
def schedule_all_jobs(project_id: str, location_id: str, time_zone: str) -> None:
    """
    Remove every existing scheduled job and register the full set of jobs again.

    Arguments:
        project_id: Project identifier, forwarded to `clear_jobs` and `schedule_job`.
        location_id: Location identifier, forwarded to `clear_jobs` and `schedule_job`.
        time_zone: Time zone forwarded to `schedule_job`.
    """
    client = scheduler_v1.CloudSchedulerClient()

    # Bind the arguments shared by every job so each call only states path and schedule
    add_job = partial(
        schedule_job,
        client=client,
        project_id=project_id,
        location_id=location_id,
        time_zone=time_zone,
    )

    # Start from a clean slate
    clear_jobs(client=client, project_id=project_id, location_id=location_id)

    # Some jobs are split into chunks of the list of all known locations
    location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))

    legacy_jobs = (
        # Cache pull job runs hourly
        ("/cache_pull", "0 * * * *"),
        # Combined tables go into the prod bucket every 2 hours, offset by 30 minutes to
        # let other hourly tasks finish
        ("/publish_tables", "30 */2 * * *"),
        # Aggregate outputs are published every 4 hours from a separate, preemptible
        # instance, offset by 60 minutes to let other hourly tasks finish
        ("/deferred/publish_main_table", "0 1-23/4 * * *"),
        # Breakdown outputs are published every 4 hours, offset by 90 minutes to run
        # after publishing
        ("/deferred/publish_subset_tables", "30 1-23/4 * * *"),
    )
    for job_path, job_cron in legacy_jobs:
        add_job(path=job_path, schedule=job_cron)

    # JSON conversion is less critical but also slow, so it runs separately and in
    # chunks, offset by 120 minutes to run after subset tables are published
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v2&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        add_job(path=f"/deferred/publish_json?{job_params}", schedule="0 2-23/4 * * *")

    # Errors are reported once a day at midday
    add_job(path="/report_errors_to_github", schedule="0 12 * * *")

    # Several data sources may share a job URL, which must only be scheduled once
    scheduled_urls = set()
    for data_pipeline in get_pipelines():

        # Data sources are combined into a table hourly, offset by 15 minutes to let
        # other hourly tasks finish
        add_job(
            path=f"/deferred/combine_table?table={data_pipeline.table}",
            schedule="15 * * * *",
        )

        for position, data_source in enumerate(data_pipeline.data_sources):
            automation = data_source.config.get("automation", {})

            # Deferred jobs have a token prepended to their path
            url_prefix = "/deferred" if automation.get("deferred") else ""

            # Data sources in the same job group run in series as part of one job. By
            # default the job group is the index of the data source.
            job_group = automation.get("job_group", position)
            job_url = f"{url_prefix}/update_table?table={data_pipeline.table}&job_group={job_group}"
            if job_url in scheduled_urls:
                continue

            # Each data source is pulled hourly unless its configuration says otherwise
            scheduled_urls.add(job_url)
            add_job(path=job_url, schedule=automation.get("schedule", "0 * * * *"))

    # V3 publish jobs start here

    v3_jobs = (
        # Tables with all location keys are published every 2 hours, offset by 30 minutes
        # to let other hourly tasks finish
        ("/deferred/publish_v3_global_tables", "30 */2 * * *"),
        # The main aggregated table is published every 2 hours, offset by 60 minutes to
        # let other hourly tasks finish
        ("/deferred/publish_v3_main_table", "0 1-23/2 * * *"),
    )
    for job_path, job_cron in v3_jobs:
        add_job(path=job_path, schedule=job_cron)

    # Outputs are broken down by location key every 2 hours in chunks, offset by 60
    # minutes to let other hourly tasks finish
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"location_key_from={subset[0]}&location_key_until={subset[-1]}"
        add_job(
            path=f"/deferred/publish_v3_location_subsets?{job_params}",
            schedule="0 1-23/2 * * *",
        )

    # Outputs are converted to JSON every 2 hours in chunks, offset by 90 minutes to let
    # other hourly tasks finish
    for subset in _split_into_subsets(location_keys, bin_count=5):
        job_params = f"prod_folder=v3&location_key_from={subset[0]}&location_key_until={subset[-1]}"
        add_job(path=f"/deferred/publish_json?{job_params}", schedule="30 1-23/2 * * *")
def main(output_folder: Path, tables_folder: Path, use_table_names: List[str] = None) -> None:
    """
    This script takes the processed outputs located in `tables_folder` and publishes them into the
    output folder by performing the following operations:

    1. Copy all the tables from `tables_folder` to `output_folder`, renaming fields if necessary.
    2. Create different slices of data, such as the latest known record for each region, files for
       the last day of data, files for each individual region.
    3. Produce a main table, created by iteratively performing left outer joins on all other tables
       for each slice of data (but not for the global tables).

    Arguments:
        output_folder: Root folder that receives the published outputs. Everything in it is
            deleted first, except entries whose name starts with a dot.
        tables_folder: Folder containing the processed tables.
        use_table_names: Optional table names, forwarded to every publishing step.
    """
    # Wipe the output folder first
    for item in output_folder.glob("*"):
        if item.name.startswith("."):
            continue
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Create the folder which will be published using a stable schema
    v3_folder = output_folder / "v3"
    v3_folder.mkdir(exist_ok=True, parents=True)

    # Publish the tables containing all location keys
    publish_global_tables(tables_folder, v3_folder, use_table_names=use_table_names)

    # Break out each table into separate folders based on the location key
    publish_location_breakouts(v3_folder, v3_folder, use_table_names=use_table_names)

    # Aggregate the independent tables for each location
    location_keys = table_read_column(v3_folder / "index.csv", "location_key")
    publish_location_aggregates(
        v3_folder, v3_folder, location_keys, use_table_names=use_table_names
    )

    # Create a single table aggregating outputs from all other tables
    main_file_name = "covid-19-open-data.csv"
    main_file_zip_path = v3_folder / f"{main_file_name}.zip"
    with ZipFile(main_file_zip_path, mode="w", compression=ZIP_DEFLATED) as zip_archive:
        with zip_archive.open(main_file_name, "w") as output_file:
            # Manage the text wrapper explicitly so its buffer is flushed into the zip entry
            # before the entry is closed, rather than depending on garbage collection
            with TextIOWrapper(output_file) as text_output:
                merge_output_tables_sqlite(
                    v3_folder, text_output, use_table_names=use_table_names
                )

    # Convert all CSV files to JSON using values format
    # A set keeps the exclusion test constant-time for every file found
    global_tables = set(v3_folder.glob("*.csv"))
    location_tables = [
        table for table in v3_folder.glob("**/*.csv") if table not in global_tables
    ]
    convert_tables_to_json(location_tables, v3_folder)