Example #1
def main(*, source_project_id, source_dataset_id, destination_project_id,
         destination_dataset_id):
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)

    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if table.view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Copy the view into the destination dataset
            source_client.copy_view(
                view=BigQueryView(dataset_id=table_ref.dataset_id,
                                  view_id=table.table_id,
                                  view_query_template=table.view_query),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset)
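
A minimal invocation sketch for the helper above; the project and dataset names are placeholders, not values from the original codebase:

# Hypothetical invocation of the view-copying helper defined above.
main(
    source_project_id="my-source-project",            # placeholder
    source_dataset_id="analytics_views",              # placeholder
    destination_project_id="my-destination-project",  # placeholder
    destination_dataset_id="analytics_views",         # placeholder
)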
Example #2
def run_export(project_id: str, dry_run: bool, state_codes: List[str],
               target_dataset: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to tables with the same names in the target dataset."""
    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f'Dataset {dataset_ref.dataset_id} does not exist')

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table, state_codes)
        logging.info(export_query)

        if not export_query:
            continue

        if target_dataset:
            if dry_run:
                logging.info(
                    "[DRY RUN] Exporting to target project.dataset.table [%s.%s.%s]",
                    project_id, target_dataset, table.table_id)
            else:
                logging.info(
                    "Exporting to target project.dataset.table [%s.%s.%s]",
                    project_id, target_dataset, table.table_id)
                copy_table_to_dataset(target_dataset, table.table_id,
                                      export_query, big_query_client)
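
A hedged usage sketch for run_export above; the project, state codes, and target dataset are placeholder values:

# Hypothetical invocation; dry_run=True only logs the export queries.
run_export(
    project_id="my-gcp-project",            # placeholder
    dry_run=True,
    state_codes=["US_XX", "US_YY"],         # placeholder state codes
    target_dataset="state_export_scratch",  # placeholder dataset
)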
def move_old_dataflow_metrics_to_cold_storage():
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We only keep the
    MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow metric table at once. All other
    output is moved to cold storage.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        filter_clause = """WHERE created_on NOT IN
                          (SELECT DISTINCT created_on FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}` 
                          ORDER BY created_on DESC
                          LIMIT {day_count_limit})""".format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_ref.table_id,
            day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE)

        cold_storage_dataset_ref = bq_client.dataset_ref_for_id(
            cold_storage_dataset)

        if bq_client.table_exists(cold_storage_dataset_ref, table_id):
            # Move data from the Dataflow metrics dataset into the cold storage dataset
            insert_job = bq_client.insert_into_table_from_table_async(
                source_dataset_id=dataflow_metrics_dataset,
                source_table_id=table_id,
                destination_dataset_id=cold_storage_dataset,
                destination_table_id=table_id,
                source_data_filter_clause=filter_clause,
                allow_field_additions=True)

            # Wait for the insert job to complete before running the delete job
            insert_job.result()
        else:
            # This table doesn't yet exist in cold storage. Create it.
            table_query = f"SELECT * FROM `{bq_client.project_id}.{dataflow_metrics_dataset}.{table_id}` " \
                          f"{filter_clause}"

            create_job = bq_client.create_table_from_query_async(
                cold_storage_dataset,
                table_id,
                table_query,
                query_parameters=[])

            # Wait for the create job to complete before running the delete job
            create_job.result()

        # Delete that data from the Dataflow dataset
        delete_job = bq_client.delete_from_table_async(
            dataflow_metrics_dataset, table_ref.table_id, filter_clause)

        # Wait for the delete job to complete before moving on
        delete_job.result()
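
To show what the filter above expands to, here is a standalone sketch that renders the same template with placeholder values (the project, dataset, table, and day limit are assumptions, not the real configuration):

# Standalone sketch: render the cold-storage filter with placeholder values.
filter_template = """WHERE created_on NOT IN
                  (SELECT DISTINCT created_on FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}`
                  ORDER BY created_on DESC
                  LIMIT {day_count_limit})"""

print(filter_template.format(
    project_id="my-gcp-project",                  # placeholder
    dataflow_metrics_dataset="dataflow_metrics",  # placeholder
    table_id="incarceration_population_metrics",  # placeholder
    day_count_limit=2,  # stand-in for MAX_DAYS_IN_DATAFLOW_METRICS_TABLE
))
# Rows whose created_on date is not among the most recent distinct dates match
# the clause, are copied to cold storage, and are then deleted from the live
# Dataflow metrics table.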
Example #4
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to CSV files with the same names as the tables to the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist")

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table,
                                                    [state_code.upper()])
        logging.info(export_query)

        if not export_query:
            continue

        export_dir = gcs_export_directory(target_bucket, today,
                                          state_code.lower())
        export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv"
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id="export_temporary_tables",
            intermediate_table_name=(
                f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}"
            ),
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config,
            )
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(
            export_configs, print_header=True)
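
A hedged invocation sketch for this GCS export variant; the bucket and state code are placeholders:

# Hypothetical invocation; dry_run=True builds the export configs but skips the export.
run_export(
    dry_run=True,
    state_code="US_XX",                # placeholder state code
    target_bucket="my-export-bucket",  # placeholder bucket name
)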
Example #5
def _copy_regional_dataset_to_multi_region(
        config: CloudSqlToBQConfig,
        dataset_override_prefix: Optional[str]) -> None:
    """Copies the unioned regional dataset for a schema to the multi-region dataset
    that contains the same data. Backs up the multi-region dataset before performing
    the copy. This backup dataset will get cleaned up if the copy succeeds, but
    otherwise will stick around for 1 week before tables expire.
    """
    bq_client = BigQueryClientImpl()

    source_dataset_id = config.unioned_regional_dataset(
        dataset_override_prefix)
    destination_dataset_id = config.unioned_multi_region_dataset(
        dataset_override_prefix)
    destination_dataset = bq_client.dataset_ref_for_id(destination_dataset_id)

    backup_dataset = bq_client.backup_dataset_tables_if_dataset_exists(
        destination_dataset_id)

    try:
        if bq_client.dataset_exists(destination_dataset):
            tables = bq_client.list_tables(destination_dataset_id)
            for table in tables:
                bq_client.delete_table(table.dataset_id, table.table_id)

        bq_client.create_dataset_if_necessary(
            destination_dataset,
            default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
            if dataset_override_prefix else None,
        )

        # Copy into the canonical unioned source datasets in the US multi-region
        bq_client.copy_dataset_tables_across_regions(
            source_dataset_id=source_dataset_id,
            destination_dataset_id=destination_dataset_id,
        )
    except Exception as e:
        logging.info(
            "Failed to flash [%s] to [%s] - contents backup can be found at [%s]",
            source_dataset_id,
            destination_dataset_id,
            backup_dataset.dataset_id if backup_dataset else "NO BACKUP",
        )
        raise e

    if backup_dataset:
        bq_client.delete_dataset(backup_dataset,
                                 delete_contents=True,
                                 not_found_ok=True)
Example #6
def copy_bq_views(
    source_project_id: str,
    source_dataset_id: str,
    destination_project_id: str,
    destination_dataset_id: str,
) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)
    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)
    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)
        view_query = table.view_query

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Remove any references to the source_project_id from the view_query
            updated_view_query = view_query.replace(source_project_id,
                                                    "{project_id}")

            # Copy the view into the destination dataset
            source_client.copy_view(
                view=BigQueryView(
                    project_id=destination_project_id,
                    dataset_id=destination_dataset_id,
                    view_id=table.table_id,
                    description=table.description,
                    view_query_template=updated_view_query,
                ),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset,
            )
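
The only rewrite applied to each copied view is the project-id substitution; a small standalone sketch with a made-up query shows the effect:

# Standalone sketch of the project-id rewrite used above (placeholder values).
source_project_id = "my-source-project"
view_query = "SELECT * FROM `my-source-project.reference_views.some_view`"
updated_view_query = view_query.replace(source_project_id, "{project_id}")
print(updated_view_query)
# SELECT * FROM `{project_id}.reference_views.some_view`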
Example #7
def _delete_empty_datasets() -> None:
    """Deletes all empty datasets in BigQuery."""
    bq_client = BigQueryClientImpl()
    datasets = bq_client.list_datasets()

    for dataset_resource in datasets:
        dataset_ref = bq_client.dataset_ref_for_id(dataset_resource.dataset_id)
        dataset = bq_client.get_dataset(dataset_ref)
        tables = peekable(bq_client.list_tables(dataset.dataset_id))
        created_time = dataset.created
        dataset_age_seconds = (datetime.datetime.now(datetime.timezone.utc) -
                               created_time).total_seconds()

        if not tables and dataset_age_seconds > DATASET_DELETION_MIN_SECONDS:
            logging.info(
                "Dataset %s is empty and was not created very recently. Deleting...",
                dataset_ref.dataset_id,
            )
            bq_client.delete_dataset(dataset_ref)
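
The emptiness check relies on more_itertools.peekable, which evaluates as falsy once the wrapped iterable has no remaining items; a quick standalone illustration:

from more_itertools import peekable

# An empty iterable wrapped in peekable() is falsy, which is what the
# `if not tables` check above relies on.
print(bool(peekable([])))                      # False
print(bool(peekable(["table_a", "table_b"])))  # True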
Example #8
def compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix: str,
    job_name_to_compare: str,
    base_output_job_id: str,
    sandbox_output_job_id: str,
    additional_columns_to_compare: List[str],
    allow_overwrite: bool = False,
) -> None:
    """Compares the output for all metrics produced by the daily pipeline job with the given |job_name_to_compare|
    between the output from the |base_output_job_id| job in the dataflow_metrics dataset and the output from the
    |sandbox_output_job_id| job in the sandbox dataflow dataset."""
    bq_client = BigQueryClientImpl()
    sandbox_dataflow_dataset_id = (sandbox_dataset_prefix + "_" +
                                   DATAFLOW_METRICS_DATASET)

    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_dataflow_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        if not allow_overwrite:
            if __name__ == "__main__":
                logging.error(
                    "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                    sandbox_comparison_output_dataset_id,
                    bq_client.project_id,
                )
                sys.exit(1)
            else:
                raise ValueError(
                    f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
                    f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
                )
        else:
            # Clean up the existing tables in the dataset
            for table in bq_client.list_tables(
                    sandbox_comparison_output_dataset_id):
                bq_client.delete_table(table.dataset_id, table.table_id)

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []

    pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts(
        "daily_pipelines")

    for pipeline in pipelines:
        if pipeline.pop("job_name", str) == job_name_to_compare:
            pipeline_metric_types = pipeline.peek_optional("metric_types", str)

            if not pipeline_metric_types:
                raise ValueError(
                    f"Pipeline job {job_name_to_compare} missing required metric_types attribute."
                )

            metric_types_for_comparison = pipeline_metric_types.split()

            for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items():
                metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[
                    metric_table].value

                if metric_type_value in metric_types_for_comparison:
                    comparison_query = _query_for_metric_comparison(
                        bq_client,
                        base_output_job_id,
                        sandbox_output_job_id,
                        sandbox_dataflow_dataset_id,
                        metric_class,
                        metric_table,
                        additional_columns_to_compare,
                    )

                    query_job = bq_client.create_table_from_query_async(
                        dataset_id=sandbox_comparison_output_dataset_id,
                        table_id=metric_table,
                        query=comparison_query,
                        overwrite=True,
                    )

                    # Add query job to the list of running jobs
                    query_jobs.append((query_job, metric_table))

    for query_job, output_table_id in query_jobs:
        # Wait for the query job to complete before looking for the output table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    metrics_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n"
    )

    if metrics_with_different_output:
        for metric_table in metrics_with_different_output:
            # This will always be true, and is here to silence mypy warnings
            assert isinstance(metric_table, bigquery.table.TableListItem)

            logging.warning(
                "Dataflow output differs for metric %s. See %s.%s for diverging rows.",
                metric_table.table_id,
                sandbox_comparison_output_dataset_id,
                metric_table.table_id,
            )
    else:
        logging.info(
            "Dataflow output identical. Deleting dataset %s.",
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
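
A hedged invocation sketch; the prefix, job name, Dataflow job IDs, and column list are placeholders:

# Hypothetical invocation comparing base and sandbox pipeline output.
compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix="my_prefix",
    job_name_to_compare="us-xx-incarceration-calculations",  # placeholder job name
    base_output_job_id="2021-01-01_00_00_00-123",            # placeholder job IDs
    sandbox_output_job_id="2021-01-02_00_00_00-456",
    additional_columns_to_compare=["person_id"],             # placeholder columns
    allow_overwrite=False,
)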
def main(dry_run: bool) -> None:
    """Moves all metrics with a PERSON methodology or a metric_period_months value greater than 1 to cold storage."""
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        logging.info("Migrating data to cold storage for table [%s]", table_id)

        filter_clause = "WHERE methodology = 'PERSON'"

        # Every metric except this one has a metric_period_months column
        if table_id != DATAFLOW_METRICS_TO_TABLES[
                ReincarcerationRecidivismRateMetric]:
            filter_clause += " OR metric_period_months != 1"

        # Query for rows to be moved to the cold storage table
        insert_query = """
            SELECT * FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {filter_clause}
        """.format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            filter_clause=filter_clause,
        )

        if dry_run:
            logging.info(
                "[DRY RUN] Would insert rows into [%s].[%s] from [%s].[%s] that match this query: %s",
                cold_storage_dataset,
                table_id,
                dataflow_metrics_dataset,
                table_id,
                insert_query,
            )
        else:
            # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary
            insert_job = bq_client.insert_into_table_from_query(
                destination_dataset_id=cold_storage_dataset,
                destination_table_id=table_id,
                query=insert_query,
                allow_field_additions=True,
                write_disposition=WriteDisposition.WRITE_APPEND,
            )

            # Wait for the insert job to complete before running the delete job
            insert_job.result()

        if dry_run:
            logging.info(
                "[DRY RUN] Would delete rows from [%s].[%s] %s",
                dataflow_metrics_dataset,
                table_id,
                filter_clause,
            )
        else:
            # Delete these rows from the Dataflow metrics table
            delete_job = bq_client.delete_from_table_async(
                dataflow_metrics_dataset,
                table_id,
                filter_clause=filter_clause)

            # Wait for the delete job to complete before moving on
            delete_job.result()

        logging.info("Done migrating data for table [%s]", table_id)
def move_old_dataflow_metrics_to_cold_storage() -> None:
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We only keep the
    MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow metric table at once. All other
    output is moved to cold storage.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        source_data_join_clause = """LEFT JOIN
                          (SELECT DISTINCT job_id AS keep_job_id FROM
                          `{project_id}.{reference_views_dataset}.most_recent_job_id_by_metric_and_state_code_materialized`)
                        ON job_id = keep_job_id
                        LEFT JOIN 
                          (SELECT DISTINCT created_on AS keep_created_date FROM
                          `{project_id}.{dataflow_metrics_dataset}.{table_id}`
                          ORDER BY created_on DESC
                          LIMIT {day_count_limit})
                        ON created_on = keep_created_date
                        """.format(
                            project_id=table_ref.project,
                            dataflow_metrics_dataset=table_ref.dataset_id,
                            reference_views_dataset=REFERENCE_VIEWS_DATASET,
                            table_id=table_id,
                            day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE
                        )

        # Exclude these columns leftover from the exclusion join from being added to the metric tables in cold storage
        columns_to_exclude_from_transfer = ['keep_job_id', 'keep_created_date']

        # This filter will return the rows that should be moved to cold storage
        insert_filter_clause = "WHERE keep_job_id IS NULL AND keep_created_date IS NULL"

        # Query for rows to be moved to the cold storage table
        insert_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            {insert_filter_clause}
        """.format(
            columns_to_exclude=', '.join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
            insert_filter_clause=insert_filter_clause
        )

        # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary
        insert_job = bq_client.insert_into_table_from_query(
            destination_dataset_id=cold_storage_dataset,
            destination_table_id=table_id,
            query=insert_query,
            allow_field_additions=True,
            write_disposition=WriteDisposition.WRITE_APPEND)

        # Wait for the insert job to complete before running the replace job
        insert_job.result()

        # This will return the rows that were not moved to cold storage and should remain in the table
        replace_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            WHERE keep_job_id IS NOT NULL OR keep_created_date IS NOT NULL
        """.format(
            columns_to_exclude=', '.join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
        )

        # Replace the Dataflow table with only the rows that should remain
        replace_job = bq_client.create_table_from_query_async(
            dataflow_metrics_dataset, table_ref.table_id, query=replace_query, overwrite=True)

        # Wait for the replace job to complete before moving on
        replace_job.result()
Example #11
def compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix: str,
    load_sandbox_views: bool,
    check_determinism: bool,
    allow_schema_changes: bool,
    dataset_id_filters: Optional[List[str]],
) -> None:
    """Compares the output of all deployed metric views to the output of the corresponding views in the sandbox
    dataset."""
    if load_sandbox_views:
        logging.info(
            "Loading views into sandbox datasets prefixed with %s",
            sandbox_dataset_prefix,
        )
        load_views_to_sandbox(sandbox_dataset_prefix)

    bq_client = BigQueryClientImpl()
    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_metric_view_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        raise ValueError(
            f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
            f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}.")

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []
    skipped_views: List[str] = []

    for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values():
        for view_builder in view_builders:
            # Only compare output of metric views
            if not isinstance(view_builder, MetricBigQueryViewBuilder):
                continue

            base_dataset_id = view_builder.dataset_id

            if dataset_id_filters and base_dataset_id not in dataset_id_filters:
                continue

            if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT:
                logging.warning(
                    "View %s.%s has known non-deterministic output. Skipping output comparison.",
                    view_builder.dataset_id,
                    view_builder.view_id,
                )
                skipped_views.append(
                    f"{view_builder.dataset_id}.{view_builder.view_id}")
                continue

            sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id

            if not bq_client.dataset_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id)):
                raise ValueError(
                    f"Trying to compare output to a sandbox dataset that does not exist: "
                    f"{bq_client.project_id}.{sandbox_dataset_id}")

            base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
            base_view_id = (view_builder.build().materialized_view_table_id
                            if view_builder.should_materialize
                            and not check_determinism else
                            view_builder.view_id)

            if not base_view_id:
                raise ValueError(
                    "Unexpected empty base_view_id. view_id or materialized_view_table_id unset"
                    f"for {view_builder}.")

            if not check_determinism and not bq_client.table_exists(
                    base_dataset_ref, base_view_id):
                logging.warning(
                    "View %s.%s does not exist. Skipping output comparison.",
                    base_dataset_ref.dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{base_dataset_id}.{base_view_id}")
                continue

            if not bq_client.table_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id),
                    base_view_id):
                logging.warning(
                    "View %s.%s does not exist in sandbox. Skipping output comparison.",
                    sandbox_dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}")
                continue
            query_job, output_table_id = _view_output_comparison_job(
                bq_client,
                view_builder,
                base_view_id,
                base_dataset_id,
                sandbox_dataset_id,
                sandbox_comparison_output_dataset_id,
                check_determinism,
                allow_schema_changes,
            )

            # Add query job to the list of running jobs
            query_jobs.append((query_job, output_table_id))

    for query_job, output_table_id in query_jobs:
        # Wait for the query job to complete before looking for the output table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the view output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    views_with_different_output = bq_client.list_tables(
        sandbox_comparison_output_dataset_id)
    views_with_different_output = peekable(views_with_different_output)

    logging.info(
        "\n*************** METRIC VIEW OUTPUT RESULTS ***************\n")

    if dataset_id_filters:
        logging.info(
            "Only compared metric view output for the following datasets: \n %s \n",
            dataset_id_filters,
        )

    logging.info(
        "Skipped output comparison for the following metric views: \n %s \n",
        skipped_views,
    )

    if views_with_different_output:
        for view in views_with_different_output:
            base_dataset_id, base_view_id = view.table_id.split("--")

            logging.warning(
                "View output differs for view %s.%s. See %s.%s for diverging rows.",
                base_dataset_id,
                base_view_id,
                sandbox_comparison_output_dataset_id,
                view.table_id,
            )
    else:
        output_message = (
            "identical between deployed views and sandbox datasets"
            if not check_determinism else "deterministic")
        logging.info(
            "View output %s. Deleting dataset %s.",
            output_message,
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
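
A hedged invocation sketch; the sandbox prefix and dataset filter are placeholders:

# Hypothetical invocation comparing deployed metric views to a sandbox.
compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix="my_prefix",      # placeholder prefix
    load_sandbox_views=False,                # assume sandbox views are already loaded
    check_determinism=False,
    allow_schema_changes=False,
    dataset_id_filters=["dashboard_views"],  # placeholder dataset filter
)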
def move_old_dataflow_metrics_to_cold_storage() -> None:
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset.
    We keep only MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days' worth of data in a Dataflow
    metrics table at once. All other output is moved to cold storage, unless it is the
    most recent job_id for a metric in a state where that metric is regularly calculated,
    and where the year and month of the output falls into the window of what is regularly
    calculated for that metric and state. See the production_calculation_pipeline_templates.yaml
    file for a list of regularly scheduled calculations.

    If a metric has been entirely decommissioned, handles the deletion of the corresponding table.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = dataflow_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    month_range_for_metric_and_state = _get_month_range_for_metric_and_state()

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        if table_id not in dataflow_config.DATAFLOW_TABLES_TO_METRIC_TYPES:
            # This metric has been deprecated. Handle the deletion of the table
            _decommission_dataflow_metric_table(bq_client, table_ref)
            continue

        is_unbounded_date_pipeline = any(
            pipeline in table_id
            for pipeline in dataflow_config.ALWAYS_UNBOUNDED_DATE_PIPELINES)

        # This means there are no currently scheduled pipelines writing metrics to
        # this table with specific month ranges
        no_active_month_range_pipelines = not month_range_for_metric_and_state[
            table_id].items()

        if is_unbounded_date_pipeline or no_active_month_range_pipelines:
            source_data_join_clause = SOURCE_DATA_JOIN_CLAUSE_STANDARD_TEMPLATE.format(
                project_id=table_ref.project,
                dataflow_metrics_dataset=table_ref.dataset_id,
                materialized_metrics_dataset=DATAFLOW_METRICS_MATERIALIZED_DATASET,
                table_id=table_id,
                day_count_limit=dataflow_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE,
            )
        else:
            month_limit_by_state = "\nUNION ALL\n".join([
                f"SELECT '{state_code}' as state_code, {month_limit} as month_limit"
                for state_code, month_limit in
                month_range_for_metric_and_state[table_id].items()
            ])

            source_data_join_clause = (
                SOURCE_DATA_JOIN_CLAUSE_WITH_MONTH_LIMIT_TEMPLATE.format(
                    project_id=table_ref.project,
                    dataflow_metrics_dataset=table_ref.dataset_id,
                    materialized_metrics_dataset=DATAFLOW_METRICS_MATERIALIZED_DATASET,
                    table_id=table_id,
                    day_count_limit=dataflow_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE,
                    month_limit_by_state=month_limit_by_state,
                ))

        # Exclude these columns leftover from the exclusion join from being added to the metric tables in cold storage
        columns_to_exclude_from_transfer = ["keep_job_id", "keep_created_date"]

        # This filter will return the rows that should be moved to cold storage
        insert_filter_clause = "WHERE keep_job_id IS NULL AND keep_created_date IS NULL"

        # Query for rows to be moved to the cold storage table
        insert_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            {insert_filter_clause}
        """.format(
            columns_to_exclude=", ".join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
            insert_filter_clause=insert_filter_clause,
        )

        # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary
        insert_job = bq_client.insert_into_table_from_query(
            destination_dataset_id=cold_storage_dataset,
            destination_table_id=table_id,
            query=insert_query,
            allow_field_additions=True,
            write_disposition=WriteDisposition.WRITE_APPEND,
        )

        # Wait for the insert job to complete before running the replace job
        insert_job.result()

        # This will return the rows that were not moved to cold storage and should remain in the table
        replace_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            WHERE keep_job_id IS NOT NULL OR keep_created_date IS NOT NULL
        """.format(
            columns_to_exclude=", ".join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
        )

        # Replace the Dataflow table with only the rows that should remain
        replace_job = bq_client.create_table_from_query_async(
            dataflow_metrics_dataset,
            table_ref.table_id,
            query=replace_query,
            overwrite=True,
        )

        # Wait for the replace job to complete before moving on
        replace_job.result()