def update_table(table_name: str = None,
                 job_group: str = None,
                 parallel_jobs: int = 8) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)
    # Default to 1 if invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and avoid
        # downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate",
                                                 intermediate_results)
        intermediate_files = list(
            map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      workdir / "intermediate")

    return Response("OK", status=200)
def publish_versions(prod_folder: str = "v3") -> Response:
    """Lists all the blobs in the bucket with generation."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    # Enumerate all the versions for each of the global tables
    prefix = prod_folder + "/"
    blob_index: Dict[str, List[str]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for table_name in ["aggregated", "main"] + list(get_table_names()):
        blobs = bucket.list_blobs(prefix=prefix + table_name, versions=True)
        for blob in blobs:
            fname = blob.name.replace(prefix, "")
            blob_index.setdefault(fname, []).append(blob.generation)

    # Repeat the process for the intermediate tables
    bucket = get_storage_bucket(GCS_BUCKET_TEST)
    blobs = bucket.list_blobs(prefix="intermediate/", versions=True)
    for blob in blobs:
        # Keep the "intermediate/" prefix to distinguish from the tables
        fname = blob.name
        blob_index.setdefault(fname, []).append(blob.generation)

    with temporary_directory() as workdir:
        # Write the version index to disk
        fname = workdir / "versions.json"
        with open(fname, "w") as fh:
            json.dump(blob_index, fh)

        # Upload the version index to the production folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    logger.log_info(f"Combining data sources for {table_name}")

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(
                f"{data_source.uuid(data_pipeline.table)}.csv")
        logger.log_info(
            f"Downloading intermediate tables {intermediate_file_names}")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            workdir / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(
            workdir / "intermediate")
        logger.log_info(
            f"Loaded intermediate tables {intermediate_file_names}")

        # Limit the number of processes to avoid OOM in big datasets
        process_count = 4

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results,
                                                process_count=process_count)
        logger.log_info(f"Combined intermediate tables into {table_name}")

        # Output combined data to disk
        output_path = workdir / "tables" / f"{table_name}.csv"
        export_csv(pipeline_output, output_path, schema=data_pipeline.schema)
        logger.log_info(f"Exported combined {table_name} to CSV")

        # Upload results to the test bucket because these are not prod files
        # They will be copied to prod in the publish step, so main.csv is in sync
        logger.log_info(f"Uploading combined {table_name}...")
        upload_folder(GCS_BUCKET_TEST, "tables", workdir / "tables")

    return Response("OK", status=200)
def update_table(table_name: str = None, job_group: str = None) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, process_count=1)
        data_pipeline._save_intermediate_results(workdir / "intermediate",
                                                 intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      workdir / "intermediate")

    return Response("OK", status=200)
def combine_table(table_name: str = None) -> Response:
    table_name = _get_request_param("table", table_name)

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "tables").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Get a list of the intermediate files used by this data pipeline
        intermediate_file_names = []
        for data_source in data_pipeline.data_sources:
            intermediate_file_names.append(f"{data_source.uuid(data_pipeline.table)}.csv")

        # Download only the necessary intermediate files
        download_folder(
            GCS_BUCKET_TEST,
            "intermediate",
            output_folder / "intermediate",
            lambda x: x.name in intermediate_file_names,
        )

        # Re-load all intermediate results
        intermediate_results = data_pipeline._load_intermediate_results(
            output_folder / "intermediate"
        )

        # Combine all intermediate results into a single dataframe
        pipeline_output = data_pipeline.combine(intermediate_results)

        # Output combined data to disk
        export_csv(
            pipeline_output,
            output_folder / "tables" / f"{table_name}.csv",
            schema=data_pipeline.schema,
        )

        # Upload results to the test bucket because these are not prod files
        # They will be copied to prod in the publish step, so main.csv is in sync
        upload_folder(GCS_BUCKET_TEST, "tables", output_folder / "tables")

    return Response("OK", status=200)
def update_table(table_name: str = None, job_group: str = None) -> str:
    table_name = table_name or request.args.get("table")
    assert table_name in list(get_table_names()), f"Invalid table name {table_name}"
    job_group = job_group or request.args.get("job_group")
    with TemporaryDirectory() as output_folder:
        output_folder = Path(output_folder)
        (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
        (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") ==
                job_group
            ]
            assert (
                data_pipeline.data_sources
            ), f"No data sources matched job group {job_group} for table {table_name}"

        # Log the data sources being extracted
        data_source_names = [
            src.config.get("name") for src in data_pipeline.data_sources
        ]
        print(f"Data sources: {data_source_names}")

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(output_folder,
                                                   process_count=1)
        data_pipeline._save_intermediate_results(
            output_folder / "intermediate", intermediate_results)

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", output_folder / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate",
                      output_folder / "intermediate")

    return "OK"
def publish_main_table() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download the already published tables
        allowlist_filenames = [f"{table}.csv" for table in get_table_names()]
        download_folder(GCS_BUCKET_PROD, "v2", input_folder,
                        lambda x: str(x) in allowlist_filenames)

        # Create the joint main table for all records
        main_table_path = output_folder / "main.csv"
        merge_output_tables(input_folder, main_table_path)
        logger.log_info("Main table created")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", output_folder)

    return Response("OK", status=200)
def main():
    schema = get_schema()
    for table_name in tqdm(list(get_table_names())):
        table = fetch_table(table_name)
        table = table.sort_values([col for col in ("key", "date") if col in table.columns])
        export_csv(table, path=SRC / "test" / "data" / f"{table_name}.csv", schema=schema)
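
Presumably this last snippet is a standalone script used to regenerate the test fixtures; if so, it would be run through the usual entry-point guard:

if __name__ == "__main__":
    main()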