def update_table(table_name: str = None, job_group: str = None, parallel_jobs: int = 8) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)
    # Default to 1 if an invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and
        # avoid downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate", intermediate_results)
        intermediate_files = list(map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", workdir / "intermediate")

    return Response("OK", status=200)
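
# The handlers in this file rely on a couple of small helpers that are not defined here.
# The sketches below are assumptions inferred from how the helpers are used, not the
# actual implementations: `_get_request_param` prefers the query parameter from the
# current request and falls back to the function argument, and `safe_int_cast` returns
# None instead of raising when the cast fails. `request` and `Optional` are assumed to
# be imported at module level, as elsewhere in this file.
def _get_request_param(name: str, default: str = None) -> str:
    """Read a query string parameter, falling back to `default` when it is missing."""
    try:
        return request.args.get(name) or default
    except RuntimeError:
        # No active request context, e.g. when the handler is called directly
        return default


def safe_int_cast(value) -> Optional[int]:
    """Cast `value` to int, returning None when the cast is not possible."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None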
def publish_versions(prod_folder: str = "v3") -> Response:
    """Lists all the blobs in the bucket with generation."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    # Enumerate all the versions for each of the global tables
    prefix = prod_folder + "/"
    blob_index: Dict[str, List[str]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for table_name in ["aggregated", "main"] + list(get_table_names()):
        blobs = bucket.list_blobs(prefix=prefix + table_name, versions=True)
        for blob in blobs:
            fname = blob.name.replace(prefix, "")
            blob_index[fname] = blob_index.get(fname, [])
            blob_index[fname].append(blob.generation)

    # Repeat the process for the intermediate tables
    bucket = get_storage_bucket(GCS_BUCKET_TEST)
    blobs = bucket.list_blobs(prefix="intermediate/", versions=True)
    for blob in blobs:
        # Keep the "intermediate/" prefix to distinguish from the tables
        fname = blob.name
        blob_index[fname] = blob_index.get(fname, [])
        blob_index[fname].append(blob.generation)

    with temporary_directory() as workdir:
        # Write the index to disk
        fname = workdir / "versions.json"
        with open(fname, "w") as fh:
            json.dump(blob_index, fh)

        # Upload to the root folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
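
# A hedged sketch of how the versions.json produced above might be consumed. The index
# maps each blob path (relative to the prod folder, or prefixed with "intermediate/")
# to the list of GCS generation numbers recorded for it; the table name used in the
# docstring example is illustrative. Dict and List are assumed to be imported from
# typing, as they are used above.
def latest_generation(blob_versions: Dict[str, List[str]], blob_name: str):
    """Return the most recent generation recorded for `blob_name`, if any.

    Example: latest_generation(json.load(fh), "epidemiology.csv")
    """
    generations = blob_versions.get(blob_name, [])
    # Generation numbers increase monotonically per object, so the largest is the latest
    return max(generations) if generations else None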
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    logger.log_info(f"Combining data sources for {table_name}")

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(f"{data_source.uuid(data_pipeline.table)}.csv")
        logger.log_info(f"Downloading intermediate tables {intermediate_file_names}")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            workdir / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(workdir / "intermediate")
        logger.log_info(f"Loaded intermediate tables {intermediate_file_names}")

        # Limit the number of processes to avoid OOM in big datasets
        process_count = 4

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results, process_count=process_count)
        logger.log_info(f"Combined intermediate tables into {table_name}")

        # Output combined data to disk
        output_path = workdir / "tables" / f"{table_name}.csv"
        export_csv(pipeline_output, output_path, schema=data_pipeline.schema)
        logger.log_info(f"Exported combined {table_name} to CSV")

        # Upload results to the test bucket because these are not prod files.
        # They will be copied to prod in the publish step, so main.csv stays in sync.
        logger.log_info(f"Uploading combined {table_name}...")
        upload_folder(GCS_BUCKET_TEST, "tables", workdir / "tables")

    return Response("OK", status=200)
def update_table(table_name: str = None, job_group: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, process_count=1)
        data_pipeline._save_intermediate_results(workdir / "intermediate", intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", workdir / "intermediate")

    return Response("OK", status=200)
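
# The job_group filter in update_table matches against each data source's parsed
# configuration. A hedged example of the expected shape is shown below; the field
# values are illustrative, not taken from a real pipeline config.
example_source_config = {
    "name": "Example data source",
    "automation": {"job_group": "1"},
}
assert example_source_config.get("automation", {}).get("job_group") == "1"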
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(f"{data_source.uuid(data_pipeline.table)}.csv")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            output_folder / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(
            output_folder / "intermediate"
        )

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results)

        # Output combined data to disk
        export_csv(
            pipeline_output,
            output_folder / "tables" / f"{table_name}.csv",
            schema=data_pipeline.schema,
        )

        # Upload results to the test bucket because these are not prod files.
        # They will be copied to prod in the publish step, so main.csv stays in sync.
        upload_folder(GCS_BUCKET_TEST, "tables", output_folder / "tables")

    return Response("OK", status=200)
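
# This version of combine_table uses TemporaryDirectory and Path directly, whereas the
# other handlers use a temporary_directory helper that is not defined in this file. A
# plausible minimal equivalent, assuming it simply wraps tempfile.TemporaryDirectory
# and yields a Path:
from contextlib import contextmanager
from pathlib import Path
from tempfile import TemporaryDirectory


@contextmanager
def temporary_directory():
    with TemporaryDirectory() as output_folder:
        yield Path(output_folder)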
def update_table(table_name: str = None, job_group: str = None) -> str:
    table_name = table_name or request.args.get("table")
    assert table_name in list(get_table_names())

    # Reading request.args raises outside of a request context, so keep the job_group
    # argument as-is when that happens
    try:
        job_group = request.args.get("job_group")
    except RuntimeError:
        pass

    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
        (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]
            assert (
                data_pipeline.data_sources
            ), f"No data sources matched job group {job_group} for table {table_name}"

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        print(f"Data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(output_folder, process_count=1)
        data_pipeline._save_intermediate_results(output_folder / "intermediate", intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", output_folder / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", output_folder / "intermediate")

    return "OK"
def publish_main_table() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download the already published tables
        allowlist_filenames = [f"{table}.csv" for table in get_table_names()]
        download_folder(
            GCS_BUCKET_PROD, "v2", input_folder, lambda x: str(x) in allowlist_filenames
        )

        # Create the joint main table for all records
        main_table_path = output_folder / "main.csv"
        merge_output_tables(input_folder, main_table_path)
        logger.log_info("Main table created")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", output_folder)

    return Response("OK", status=200)
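
# The handlers above look like Flask views that return Response objects. A minimal
# sketch of how they might be exposed as routes; the route paths and the `app` object
# are assumptions, not taken from this file.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/update_table", view_func=update_table)
app.add_url_rule("/combine_table", view_func=combine_table)
app.add_url_rule("/publish_versions", view_func=publish_versions)
app.add_url_rule("/publish_main_table", view_func=publish_main_table)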
def main():
    schema = get_schema()
    for table_name in tqdm(list(get_table_names())):
        table = fetch_table(table_name)
        table = table.sort_values([col for col in ("key", "date") if col in table.columns])
        export_csv(table, path=SRC / "test" / "data" / f"{table_name}.csv", schema=schema)
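
# Presumably this entry point is run directly to regenerate the test fixtures; the
# standard guard below is an assumption, since the original invocation is not shown.
if __name__ == "__main__":
    main()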