def test_validate_no_metadata(self):
        """Every staging path fails validation when the file has no metadata."""
        fake_fs = create_autospec(DirectIngestGCSFileSystem)
        fake_fs.get_metadata.return_value = None

        validator = OptimizedMetricBigQueryViewExportValidator(fake_fs)
        for staging_path in self.staging_paths:
            self.assertFalse(validator.validate(staging_path))

        # We failed before validating the second path
        fake_fs.assert_has_calls([call.get_metadata(self.staging_paths[0])])
    def test_validate_success(self):
        mock_fs = create_autospec(DirectIngestGCSFileSystem)

        mock_fs.get_metadata.return_value = {"total_data_points": "5"}

        validator = OptimizedMetricBigQueryViewExportValidator(mock_fs)
        for path in self.staging_paths:
            result = validator.validate(path)
            self.assertTrue(result)

        mock_fs.assert_has_calls([
            call.get_metadata(self.staging_paths[0]),
            call.get_metadata(self.staging_paths[1]),
        ])
    def test_validate_not_integer(self) -> None:
        mock_fs = create_autospec(DirectIngestGCSFileSystem)

        mock_fs.get_metadata.return_value = {
            "total_data_points": "HELLO WORLD"
        }

        validator = OptimizedMetricBigQueryViewExportValidator(mock_fs)
        for path in self.staging_paths:
            result = validator.validate(path, allow_empty=False)
            self.assertFalse(result)

        # We failed before validating the second path
        mock_fs.assert_has_calls([call.get_metadata(self.staging_paths[0])])
Example #4
0
def get_delegate_export_map(
    gcsfs_client: GCSFileSystem,
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> Dict[ExportOutputFormatType, BigQueryViewExporter]:
    """Builds the delegate_export_map, mapping the csv_exporter, json_exporter, and metric_exporter
    to the correct ExportOutputFormatType.

    If |override_view_exporter| is provided, it is used for every output format.
    """
    if override_view_exporter is not None:
        # A caller-supplied exporter handles all formats uniformly.
        return {
            ExportOutputFormatType.CSV: override_view_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: override_view_exporter,
            ExportOutputFormatType.JSON: override_view_exporter,
            ExportOutputFormatType.METRIC: override_view_exporter,
        }

    bq_client = BigQueryClientImpl()

    # Some of our views intentionally export empty files (e.g. some of the
    # ingest_metadata views), so for CSV/JSON we only validate existence.
    csv_exporter = CSVBigQueryViewExporter(
        bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
    json_exporter = JsonLinesBigQueryViewExporter(
        bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
    metric_exporter = OptimizedMetricBigQueryViewExporter(
        bq_client,
        OptimizedMetricBigQueryViewExportValidator(gcsfs_client))

    return {
        ExportOutputFormatType.CSV: csv_exporter,
        ExportOutputFormatType.HEADERLESS_CSV: csv_exporter,
        ExportOutputFormatType.JSON: json_exporter,
        ExportOutputFormatType.METRIC: metric_exporter,
    }
    def test_validate_failure(self):
        mock_fs = create_autospec(DirectIngestGCSFileSystem)

        mock_fs.get_metadata.side_effect = [
            {
                "total_data_points": "5"
            },
            {
                "total_data_points": "0"
            },
        ]

        validator = OptimizedMetricBigQueryViewExportValidator(mock_fs)

        self.assertTrue(validator.validate(self.staging_paths[0]))
        self.assertFalse(validator.validate(self.staging_paths[1]))

        mock_fs.assert_has_calls([
            call.get_metadata(self.staging_paths[0]),
            call.get_metadata(self.staging_paths[1]),
        ])
Example #6
0
def export_view_data_to_cloud_storage(
    export_job_filter: str,
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> None:
    """Exports data in BigQuery metric views to cloud storage buckets.

    Optionally takes in a BigQueryViewExporter for performing the export operation. If none is
    provided, this defaults to building per-format exporters (CSV, headerless CSV, JSON lines,
    and optimized metric), each with its own export validator.

    Raises:
        ValueError: if ``export_job_filter`` matches none of the configured exports.
        Exception: re-raises any non-validation error from an individual export, after
            recording a failure metric.
    """
    # Select the export configs matching the filter, and record which view
    # namespaces those configs draw from so their views can be refreshed first.
    export_configs_for_filter: List[ExportViewCollectionConfig] = []
    bq_view_namespaces_to_update: Set[BigQueryViewNamespace] = set()
    for dataset_export_config in export_config.VIEW_COLLECTION_EXPORT_CONFIGS:
        if not dataset_export_config.matches_filter(export_job_filter):
            logging.info(
                "Skipped metric export for config [%s] with filter [%s]",
                dataset_export_config,
                export_job_filter,
            )
            continue

        export_configs_for_filter.append(dataset_export_config)
        bq_view_namespaces_to_update.add(
            dataset_export_config.bq_view_namespace)

    if not export_configs_for_filter:
        raise ValueError("Export filter did not match any export configs: ",
                         export_job_filter)

    # Refresh source views before exporting so the export reads current data.
    for bq_view_namespace_to_update in bq_view_namespaces_to_update:
        view_builders_for_views_to_update = (
            view_update_manager.
            VIEW_BUILDERS_BY_NAMESPACE[bq_view_namespace_to_update])

        # TODO(#5125): Once view update is consistently trivial, always update all views in namespace
        if (bq_view_namespace_to_update
                in export_config.NAMESPACES_REQUIRING_FULL_UPDATE):
            view_update_manager.create_dataset_and_deploy_views_for_view_builders(
                bq_view_namespace_to_update, view_builders_for_views_to_update)

        # The view deploy will only have rematerialized views that had been updated since the last deploy, this call
        # will ensure that all materialized tables get refreshed.
        view_update_manager.rematerialize_views_for_namespace(
            bq_view_namespace=bq_view_namespace_to_update,
            candidate_view_builders=view_builders_for_views_to_update,
        )

    gcsfs_client = GcsfsFactory.build()
    if override_view_exporter is None:
        bq_client = BigQueryClientImpl()

        # Some our views intentionally export empty files (e.g. some of the ingest_metadata views)
        # so we just check for existence
        csv_exporter = CSVBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        json_exporter = JsonLinesBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        metric_exporter = OptimizedMetricBigQueryViewExporter(
            bq_client,
            OptimizedMetricBigQueryViewExportValidator(gcsfs_client))

        delegate_export_map = {
            ExportOutputFormatType.CSV: csv_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: csv_exporter,
            ExportOutputFormatType.JSON: json_exporter,
            ExportOutputFormatType.METRIC: metric_exporter,
        }
    else:
        # A caller-supplied exporter handles every output format.
        delegate_export_map = {
            ExportOutputFormatType.CSV: override_view_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: override_view_exporter,
            ExportOutputFormatType.JSON: override_view_exporter,
            ExportOutputFormatType.METRIC: override_view_exporter,
        }

    project_id = metadata.project_id()

    for dataset_export_config in export_configs_for_filter:
        logging.info(
            "Starting metric export for dataset_config [%s] with filter [%s]",
            dataset_export_config,
            export_job_filter,
        )

        view_export_configs = dataset_export_config.export_configs_for_views_to_export(
            project_id=project_id)

        # The export will error if the validations fail for the set of view_export_configs. We want to log this failure
        # as a warning, but not block on the rest of the exports.
        try:
            export_views_with_exporters(gcsfs_client, view_export_configs,
                                        delegate_export_map)
        except ViewExportValidationError:
            warning_message = (
                f"Export validation failed for {dataset_export_config.export_name}"
            )

            if dataset_export_config.state_code_filter is not None:
                warning_message += (
                    f" for state: {dataset_export_config.state_code_filter}")

            logging.warning(warning_message)
            # Record a validation-failure metric tagged with export name and region.
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    dataset_export_config.export_name,
                    monitoring.TagKey.REGION:
                    dataset_export_config.state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_validation,
                                             1)

            # Do not treat validation failures as fatal errors
            continue
        except Exception as e:
            # Any other failure is fatal: record a job-failure metric, then re-raise.
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    dataset_export_config.export_name,
                    monitoring.TagKey.REGION:
                    dataset_export_config.state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_job, 1)
            raise e
Example #7
0
def export_view_data_to_cloud_storage(export_job_filter: Optional[str] = None,
                                      view_exporter: Optional[BigQueryViewExporter] = None) -> None:
    """Exports data in BigQuery metric views to cloud storage buckets.

    Optionally takes in a BigQueryViewExporter for performing the export operation. If none is provided, this defaults
    to using a CompositeBigQueryViewExporter with delegates of JsonLinesBigQueryViewExporter and
    OptimizedMetricBigQueryViewExporter.

    Raises:
        ValueError: if ``export_job_filter`` matches none of the configured exports.
        Exception: re-raises any non-validation error from an individual export, after
            recording a failure metric.
    """
    # Refresh all STATE-namespace views before exporting so exports read current data.
    view_builders_for_views_to_update = view_config.VIEW_BUILDERS_FOR_VIEWS_TO_UPDATE
    view_update_manager.create_dataset_and_update_views_for_view_builders(BigQueryViewNamespace.STATE,
                                                                          view_builders_for_views_to_update)

    if not view_exporter:
        bq_client = BigQueryClientImpl()
        gcsfs_client = GcsfsFactory.build()

        json_exporter = JsonLinesBigQueryViewExporter(bq_client,
                                                      JsonLinesBigQueryViewExportValidator(gcsfs_client))

        optimized_exporter = OptimizedMetricBigQueryViewExporter(
            bq_client, OptimizedMetricBigQueryViewExportValidator(gcsfs_client))
        delegates = [json_exporter, optimized_exporter]

        view_exporter = CompositeBigQueryViewExporter(
            bq_client,
            gcsfs_client,
            delegates
        )

    project_id = metadata.project_id()

    # If the state code is set to COVID then it will match when the state_filter is None in
    # view_config.METRIC_DATASET_EXPORT_CONFIGS
    matched_export_config = False
    for dataset_export_config in view_config.METRIC_DATASET_EXPORT_CONFIGS:
        if not dataset_export_config.matches_filter(export_job_filter):
            logging.info("Skipped metric export for config [%s] with filter [%s]", dataset_export_config,
                         export_job_filter)
            continue

        matched_export_config = True
        logging.info("Starting metric export for dataset_config [%s] with filter [%s]", dataset_export_config,
                     export_job_filter)

        view_export_configs = dataset_export_config.export_configs_for_views_to_export(project_id=project_id)

        # The export will error if the validations fail for the set of view_export_configs. We want to log this failure
        # as a warning, but not block on the rest of the exports.
        try:
            view_exporter.export_and_validate(view_export_configs)
        except ViewExportValidationError:
            warning_message = f"Export validation failed from {dataset_export_config.dataset_id}"

            if dataset_export_config.state_code_filter is not None:
                warning_message += f" for state: {dataset_export_config.state_code_filter}"

            logging.warning(warning_message)
            # Record a validation-failure metric tagged with export name and region.
            with monitoring.measurements({
                monitoring.TagKey.METRIC_VIEW_EXPORT_NAME: dataset_export_config.export_name,
                monitoring.TagKey.REGION: dataset_export_config.state_code_filter
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_validation, 1)

            # Do not treat validation failures as fatal errors
            continue
        except Exception as e:
            # Any other failure is fatal: record a job-failure metric, then re-raise.
            with monitoring.measurements({
                monitoring.TagKey.METRIC_VIEW_EXPORT_NAME: dataset_export_config.export_name,
                monitoring.TagKey.REGION: dataset_export_config.state_code_filter
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_job, 1)
            raise e

    if not matched_export_config:
        raise ValueError("Export filter did not match any export configs: ", export_job_filter)