Ejemplo n.º 1
0
def _emit_failures(
    failed_to_run_validations: List[DataValidationJob],
    failed_validations: List[DataValidationJobResult],
) -> None:
    """Logs every validation failure and records one measurement per failure.

    Args:
        failed_to_run_validations: Jobs that failed to run. Each is logged at
            error level and counted under m_failed_to_run_validations.
        failed_validations: Failed validation results. Each is logged at error
            level and counted under m_failed_validations, tagged from the
            result's validation_job.
    """

    def tags_for_job(job: DataValidationJob) -> Dict[str, Any]:
        # Monitoring tags built from the job's region and validation fields.
        return {
            monitoring.TagKey.REGION: job.region_code,
            monitoring.TagKey.VALIDATION_CHECK_TYPE: job.validation.validation_type,
            monitoring.TagKey.VALIDATION_VIEW_ID: job.validation.validation_name,
        }

    def record_one(job: DataValidationJob, measure: Any) -> None:
        # Adds 1 to |measure| under the tags derived from |job|.
        with monitoring.measurements(tags_for_job(job)) as recorder:
            recorder.measure_int_put(measure, 1)

    for unrun_job in failed_to_run_validations:
        logging.error("Failed to run data validation job: %s", unrun_job)
        record_one(unrun_job, m_failed_to_run_validations)

    for failed_result in failed_validations:
        logging.error("Failed data validation: %s", failed_result)
        record_one(failed_result.validation_job, m_failed_validations)
Ejemplo n.º 2
0
def test_measurements_with_push_tags_and_exception(mock_mmap):
    """Checks the recorded tags when the body of a pushed-tag measurement raises.

    The first measurement runs inside push_tags and raises; it is expected to
    be recorded with the pushed tag. The second runs after the push_tags block
    has exited and is expected to be recorded without the pushed tag.
    """
    tags = {'alice': 'foo'}

    # inside of pushed tag
    with pytest.raises(Exception):
        with monitoring.push_tags({'eve': 'baz'}), \
                monitoring.measurements(tags) as recorder:
            tags.update(bob='bar')
            assert recorder == mock_mmap
            raise Exception

    # outside of pushed tag
    with monitoring.measurements(tags) as recorder:
        tags.update(other='thing')
        assert recorder == mock_mmap

    expected_tags = [
        {'alice': 'foo', 'bob': 'bar', 'eve': 'baz'},
        {'alice': 'foo', 'bob': 'bar', 'other': 'thing'},
    ]
    assert_recorded_tags(mock_mmap, expected_tags)
Ejemplo n.º 3
0
def test_measurements_with_push_tags(mock_mmap):
    """Checks that pushed tags are recorded only for measurements made inside them.

    The same tags dict is reused and mutated across both measurements, so the
    second expected record still carries the "bob" tag added during the first.
    """
    tags = {"alice": "foo"}

    # inside of pushed tag
    with monitoring.push_tags({"eve": "baz"}), monitoring.measurements(
        tags
    ) as recorder:
        tags.update(bob="bar")
        assert recorder == mock_mmap

    # outside of pushed tag
    with monitoring.measurements(tags) as recorder:
        tags.update(other="thing")
        assert recorder == mock_mmap

    inside_push = {"alice": "foo", "bob": "bar", "eve": "baz"}
    outside_push = {"alice": "foo", "bob": "bar", "other": "thing"}
    assert_recorded_tags(mock_mmap, [inside_push, outside_push])
Ejemplo n.º 4
0
def test_measurements_with_push_tags_and_exception(mock_mmap) -> None:
    """Checks the recorded tags when the body of a pushed-tag measurement raises.

    The measurement that raises inside push_tags is expected to be recorded
    with the pushed tag; the later measurement, made after the push_tags block
    has exited, is expected to be recorded without it.
    """
    tags = {"alice": "foo"}

    # inside of pushed tag
    with pytest.raises(Exception):
        with monitoring.push_tags({"eve": "baz"}), monitoring.measurements(
            tags
        ) as recorder:
            tags.update(bob="bar")
            assert recorder == mock_mmap
            raise Exception

    # outside of pushed tag
    with monitoring.measurements(tags) as recorder:
        tags.update(other="thing")
        assert recorder == mock_mmap

    inside_push = {"alice": "foo", "bob": "bar", "eve": "baz"}
    outside_push = {"alice": "foo", "bob": "bar", "other": "thing"}
    assert_recorded_tags(mock_mmap, [inside_push, outside_push])
Ejemplo n.º 5
0
def do_metric_export_for_configs(
    export_configs: Dict[str, Sequence[ExportBigQueryViewConfig]],
    state_code_filter: Optional[str],
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> None:
    """Runs the export for every entry in export_configs.

    Args:
        export_configs: Maps an export name to the view export configs to run
            for that export.
        state_code_filter: Optional state code; used in log messages and as
            the REGION tag on failure measurements.
        override_view_exporter: Optional exporter passed through to
            get_delegate_export_map.

    Raises:
        Exception: Any non-validation error raised by an export is counted
            under m_failed_metric_export_job and then re-raised.
    """

    def _count_failure(measure: Any, failed_export_name: str) -> None:
        # Adds 1 to |measure|, tagged with the export name and state filter.
        failure_tags = {
            monitoring.TagKey.METRIC_VIEW_EXPORT_NAME: failed_export_name,
            monitoring.TagKey.REGION: state_code_filter,
        }
        with monitoring.measurements(failure_tags) as recorder:
            recorder.measure_int_put(measure, 1)

    gcsfs_client = GcsfsFactory.build()
    delegate_export_map = get_delegate_export_map(
        gcsfs_client, override_view_exporter
    )

    for export_name, view_export_configs in export_configs.items():
        if state_code_filter:
            log_suffix = f" for state_code [{state_code_filter}]."
        else:
            log_suffix = "."
        export_log_message = f"Starting [{export_name}] export" + log_suffix

        logging.info(export_log_message)

        # A validation failure for this set of configs is logged as a warning
        # and counted, but must not block the remaining exports.
        try:
            export_views_with_exporters(
                gcsfs_client, view_export_configs, delegate_export_map
            )
        except ViewExportValidationError as validation_error:
            warning_message = f"Export validation failed for {export_name}"
            if state_code_filter:
                warning_message = (
                    f"{warning_message} for state: {state_code_filter}"
                )

            logging.warning("%s\n%s", warning_message, str(validation_error))
            _count_failure(m_failed_metric_export_validation, export_name)

            # Do not treat validation failures as fatal errors
            continue
        except Exception as e:
            _count_failure(m_failed_metric_export_job, export_name)
            raise e
    def init_engine_for_db_instance(
        cls,
        database_key: SQLAlchemyDatabaseKey,
        db_url: URL,
        **dialect_specific_kwargs: Any,
    ) -> Engine:
        """Creates a sqlalchemy Engine for the given database key and caches it.

        Args:
            database_key: Key identifying the database; supplies the isolation
                level and pool class used to create the engine.
            db_url: URL passed to sqlalchemy.create_engine.
            **dialect_specific_kwargs: Extra keyword arguments forwarded to
                sqlalchemy.create_engine.

        Returns:
            The newly created engine, which is also stored in
            cls._engine_for_database under database_key.

        Raises:
            ValueError: If an engine is already cached for database_key.
            BaseException: Whatever create_engine raised, after the failure
                has been logged and counted.
        """
        already_initialized = database_key in cls._engine_for_database
        if already_initialized:
            raise ValueError(f"Already initialized database [{database_key}]")

        try:
            new_engine = sqlalchemy.create_engine(
                db_url,
                isolation_level=database_key.isolation_level,
                poolclass=database_key.poolclass,
                **dialect_specific_kwargs,
            )
        except BaseException as e:
            logging.error(
                "Unable to connect to postgres instance for [%s]: %s",
                database_key,
                str(e),
            )
            failure_tags = {
                monitoring.TagKey.SCHEMA_TYPE: database_key.schema_type.value,
                monitoring.TagKey.DATABASE_NAME: database_key.db_name,
            }
            with monitoring.measurements(failure_tags) as recorder:
                recorder.measure_int_put(m_failed_engine_initialization, 1)
            raise e
        else:
            # Only cache the engine once creation has succeeded.
            cls._engine_for_database[database_key] = new_engine
            return new_engine
Ejemplo n.º 7
0
def create_dataset_and_update_views_for_view_builders(
        bq_view_namespace: BigQueryViewNamespace,
        view_builders_to_update: Dict[str, Sequence[BigQueryViewBuilder]],
        dataset_overrides: Optional[Dict[str, str]] = None,
        materialized_views_only: bool = False) -> None:
    """Builds every view from the given builders and updates them in BigQuery.

    Each builder in view_builders_to_update is built with dataset_overrides,
    and the resulting views are handed to _create_dataset_and_update_views.
    When materialized_views_only is True, only views whose
    materialized_view_table_id is set are kept. Any exception is counted under
    m_failed_view_update (tagged with the namespace value) and re-raised.
    """
    set_default_table_expiration_for_new_datasets = bool(dataset_overrides)
    if set_default_table_expiration_for_new_datasets:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours.")
    try:
        # Build each view lazily and filter it right away, in builder order.
        views_to_update = [
            built_view
            for builder_group in view_builders_to_update.values()
            for built_view in (
                builder.build(dataset_overrides=dataset_overrides)
                for builder in builder_group)
            if not (materialized_views_only
                    and built_view.materialized_view_table_id is None)
        ]

        _create_dataset_and_update_views(
            views_to_update, set_default_table_expiration_for_new_datasets)
    except Exception as e:
        failure_tags = {
            monitoring.TagKey.CREATE_UPDATE_VIEWS_NAMESPACE:
            bq_view_namespace.value
        }
        with monitoring.measurements(failure_tags) as recorder:
            recorder.measure_int_put(m_failed_view_update, 1)
        raise e
Ejemplo n.º 8
0
def create_dataset_and_deploy_views_for_view_builders(
    # TODO(#5785): Clarify use case of BigQueryViewNamespace filter (see ticket for more)
    bq_view_namespace: BigQueryViewNamespace,
    view_builders_to_update: Sequence[BigQueryViewBuilder],
    dataset_overrides: Optional[Dict[str, str]] = None,
) -> None:
    """Creates or updates all views produced by the provided view builders.

    If any materialized view has been updated (or if an ancestor view has been
    updated), the view will be re-materialized to ensure the schemas remain
    consistent.

    Should only be called if we expect the views to have changed (either the
    view query or schema from querying underlying tables), e.g. at deploy time.

    Any exception is counted under m_failed_view_update, tagged with the
    namespace value, and then re-raised.
    """
    use_default_expiration = bool(dataset_overrides)
    if use_default_expiration:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours."
        )

    try:
        built_views = _build_views_to_update(
            candidate_view_builders=view_builders_to_update,
            dataset_overrides=dataset_overrides,
        )
        _create_dataset_and_deploy_views(built_views, use_default_expiration)
    except Exception as e:
        failure_tags = {
            monitoring.TagKey.CREATE_UPDATE_VIEWS_NAMESPACE: bq_view_namespace.value
        }
        with monitoring.measurements(failure_tags) as recorder:
            recorder.measure_int_put(m_failed_view_update, 1)
        raise e
Ejemplo n.º 9
0
def test_measurements(mock_mmap):
    """Checks that tags added inside the measurements block are recorded too."""
    tags = {"alice": "foo"}

    with monitoring.measurements(tags) as recorder:
        tags.update(bob="bar")
        assert recorder == mock_mmap

    expected_tags = [{"alice": "foo", "bob": "bar"}]
    assert_recorded_tags(mock_mmap, expected_tags)
Ejemplo n.º 10
0
def test_measurements(mock_mmap):
    """Checks that tags added inside the measurements block are recorded too."""
    tags = {'alice': 'foo'}

    with monitoring.measurements(tags) as recorder:
        tags.update(bob='bar')
        assert recorder == mock_mmap

    expected_tags = [{'alice': 'foo', 'bob': 'bar'}]
    assert_recorded_tags(mock_mmap, expected_tags)
Ejemplo n.º 11
0
def rematerialize_views_for_namespace(
    # TODO(#5785): Clarify use case of BigQueryViewNamespace filter (see ticket for more)
    bq_view_namespace: BigQueryViewNamespace,
    candidate_view_builders: Sequence[BigQueryViewBuilder],
    dataset_overrides: Optional[Dict[str, str]] = None,
    skip_missing_views: bool = False,
) -> None:
    """For all views in a given namespace, re-materializes any materialized views.

    This should be called only when we want to refresh the data in the
    materialized view, not when we want to update the underlying query of the
    view.

    Args:
        bq_view_namespace: Namespace whose value tags the failure measurement.
        candidate_view_builders: Builders passed to _build_views_to_update to
            produce the views that are walked.
        dataset_overrides: Optional dataset overrides applied when building the
            views. When non-empty, the default-table-expiration flag is set
            for datasets created by this call.
        skip_missing_views: When True, a materialized view for which
            bq_client.table_exists returns False is skipped instead of
            materialized.

    Raises:
        Exception: Any error raised while building or materializing views is
            counted under m_failed_view_update and then re-raised unchanged.
    """
    set_default_table_expiration_for_new_datasets = bool(dataset_overrides)
    if set_default_table_expiration_for_new_datasets:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours."
        )

    try:
        views_to_update = _build_views_to_update(
            candidate_view_builders=candidate_view_builders,
            dataset_overrides=dataset_overrides,
        )

        bq_client = BigQueryClientImpl()
        _create_all_datasets_if_necessary(
            bq_client, views_to_update, set_default_table_expiration_for_new_datasets
        )

        dag_walker = BigQueryViewDagWalker(views_to_update)

        def _materialize_view(
            v: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            # Per-view callback handed to the DAG walker.
            if not v.materialized_view_table_id:
                logging.info(
                    "Skipping non-materialized view [%s.%s].", v.dataset_id, v.view_id
                )
                return

            if skip_missing_views and not bq_client.table_exists(
                bq_client.dataset_ref_for_id(dataset_id=v.dataset_id), v.view_id
            ):
                logging.info(
                    "Skipping materialization of view [%s.%s] which does not exist",
                    v.dataset_id,
                    v.view_id,
                )
                return

            bq_client.materialize_view_to_table(v)

        dag_walker.process_dag(_materialize_view)

    except Exception:
        with monitoring.measurements(
            {monitoring.TagKey.CREATE_UPDATE_VIEWS_NAMESPACE: bq_view_namespace.value}
        ) as measurements:
            measurements.measure_int_put(m_failed_view_update, 1)
        # Bare re-raise: `raise e from e` would make the exception its own
        # __cause__, producing a self-referential chain.
        raise
Ejemplo n.º 12
0
def _log_error(threshold_type: str, error_thresholds: Dict[str, float],
               error_ratio: float) -> None:
    """Logs that a threshold was exceeded and counts the abort.

    Args:
        threshold_type: Key of the exceeded threshold in error_thresholds; also
            used as the REASON tag on the m_aborts measurement.
        error_thresholds: Maps threshold type to its threshold value.
        error_ratio: The error ratio reported in the log message.
    """
    exceeded_limit = error_thresholds[threshold_type]
    logging.error(
        "Aborting because we exceeded the [%s] threshold of [%s] with an error ratio of [%s]",
        threshold_type, exceeded_limit, error_ratio)

    abort_tags = {monitoring.TagKey.REASON: threshold_type}
    with monitoring.measurements(abort_tags) as recorder:
        recorder.measure_int_put(m_aborts, 1)
Ejemplo n.º 13
0
def test_measurements_with_exception(mock_mmap):
    """Checks that tags are still recorded when the measurements block raises."""
    tags = {"alice": "foo"}

    with pytest.raises(Exception):
        with monitoring.measurements(tags) as recorder:
            tags.update(bob="bar")
            assert recorder == mock_mmap
            raise Exception

    expected_tags = [{"alice": "foo", "bob": "bar"}]
    assert_recorded_tags(mock_mmap, expected_tags)
Ejemplo n.º 14
0
 def parse(cls: Type[ClsT], label: str,
           enum_overrides: "EnumOverrides") -> Optional[ClsT]:
     """Parses |label| via cls._parse_to_enum, counting parse failures.

     An EnumParsingError is counted under m_enum_errors, tagged with the
     class name, and then re-raised.
     """
     try:
         parsed = cls._parse_to_enum(label, enum_overrides)
     except EnumParsingError:
         error_tags = {monitoring.TagKey.ENTITY_TYPE: cls.__name__}
         with monitoring.measurements(error_tags) as recorder:
             recorder.measure_int_put(m_enum_errors, 1)
         raise
     return parsed
Ejemplo n.º 15
0
def test_measurements_with_exception(mock_mmap):
    """Checks that tags are still recorded when the measurements block raises."""
    tags = {'alice': 'foo'}

    with pytest.raises(Exception):
        with monitoring.measurements(tags) as recorder:
            tags.update(bob='bar')
            assert recorder == mock_mmap
            raise Exception

    expected_tags = [{'alice': 'foo', 'bob': 'bar'}]
    assert_recorded_tags(mock_mmap, expected_tags)
Ejemplo n.º 16
0
def read_and_persist() -> Tuple[str, HTTPStatus]:
    """Reads all of the messages from Datastore for a region and persists
    them to the database.

    The region comes from the `region` request argument. A batch measurement
    is recorded with STATUS and PERSISTED tags that are updated in place as
    the request progresses.

    Returns:
        An empty body with HTTPStatus.OK when persist_to_database returns a
        truthy value, otherwise an empty body with HTTPStatus.ACCEPTED.

    Raises:
        ValueError: If the region argument is not a string, or no session is
            found for the region.
        BatchPersistError: If persist_to_database raises.
    """
    region = request.args.get("region")
    if not isinstance(region, str):
        raise ValueError(f"Expected string region, found [{region}]")

    status_key = monitoring.TagKey.STATUS
    persisted_key = monitoring.TagKey.PERSISTED
    batch_tags = {status_key: "COMPLETED", persisted_key: False}

    # Note: measurements must be nested inside push_tags so it receives the
    # region tag.
    with monitoring.push_tags({monitoring.TagKey.REGION: region}):
        with monitoring.measurements(batch_tags) as measurements:
            measurements.measure_int_put(m_batch_count, 1)

            session = sessions.get_most_recent_completed_session(
                region, ScrapeType.BACKGROUND
            )
            if not session:
                raise ValueError(
                    f"Most recent session for region [{region}] is unexpectedly None"
                )

            scrape_type = session.scrape_type

            try:
                did_persist = persist_to_database(region, session.start)
                batch_tags[persisted_key] = did_persist
            except Exception as e:
                error_name = type(e).__name__
                logging.exception(
                    "An exception occurred in read and persist: %s", error_name
                )
                batch_tags[status_key] = "ERROR: {}".format(error_name)
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
                raise BatchPersistError(region, scrape_type) from e

            if not did_persist:
                # Nothing was persisted: mark the session done.
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
                return "", HTTPStatus.ACCEPTED

            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", next_phase, region)
                task_manager = ScraperCloudTaskManager()
                task_manager.create_scraper_phase_task(
                    region_code=region, url=url_for(next_phase)
                )
            return "", HTTPStatus.OK
Ejemplo n.º 17
0
def _should_abort(total_root_entities: int,
                  system_level: SystemLevel,
                  conversion_result: IngestInfoConversionResult,
                  entity_matching_errors: int = 0,
                  data_validation_errors: int = 0,
                  database_invariant_errors: int = 0) -> bool:
    """
    Returns true if we should abort the current attempt to persist an IngestInfo
    object, given the number of errors we've encountered.

    The checks run in order and the first one that trips wins: no root
    entities, any protected class error, then the overall, enum, entity
    matching and database invariant thresholds for the given system level.
    """
    if total_root_entities == 0:
        logging.info("Aborting because the ingest info object contains no "
                     "root entity objects to persist.")
        return True

    if conversion_result.protected_class_errors:
        logging.error(
            "Aborting because there was an error regarding a protected class")
        abort_tags = {monitoring.TagKey.REASON: 'PROTECTED_CLASS_ERROR'}
        with monitoring.measurements(abort_tags) as recorder:
            recorder.measure_int_put(m_aborts, 1)
        return True

    error_thresholds = _get_thresholds_for_system_level(system_level)

    overall_error_ratio = _calculate_overall_error_ratio(
        conversion_result, entity_matching_errors, data_validation_errors,
        total_root_entities)
    if overall_error_ratio > error_thresholds[OVERALL_THRESHOLD]:
        _log_error(OVERALL_THRESHOLD, error_thresholds, overall_error_ratio)
        return True

    enum_error_ratio = (conversion_result.enum_parsing_errors /
                        total_root_entities)
    if enum_error_ratio > error_thresholds[ENUM_THRESHOLD]:
        _log_error(ENUM_THRESHOLD, error_thresholds, enum_error_ratio)
        return True

    entity_matching_ratio = entity_matching_errors / total_root_entities
    if entity_matching_ratio > error_thresholds[ENTITY_MATCHING_THRESHOLD]:
        _log_error(ENTITY_MATCHING_THRESHOLD, error_thresholds,
                   entity_matching_ratio)
        return True

    # NOTE(review): this check compares the raw error count to the threshold
    # while the logged value is a ratio -- confirm that this is intended.
    invariant_limit = error_thresholds[DATABASE_INVARIANT_THRESHOLD]
    if database_invariant_errors > invariant_limit:
        _log_error(DATABASE_INVARIANT_THRESHOLD, error_thresholds,
                   database_invariant_errors / total_root_entities)
        return True

    return False
Ejemplo n.º 18
0
def _emit_failures(failed_validations: List[DataValidationJobResult]):
    """Logs each failed validation result and counts it under m_failed_validations.

    Every measurement is tagged with the region code, validation type and view
    id taken from the result's validation job.
    """
    for failed_result in failed_validations:
        logging.error("Failed data validation: %s", failed_result)

        job = failed_result.validation_job
        monitoring_tags: Dict[str, Any] = {
            monitoring.TagKey.REGION: job.region_code,
            monitoring.TagKey.VALIDATION_CHECK_TYPE:
            job.validation.validation_type,
            monitoring.TagKey.VALIDATION_VIEW_ID:
            job.validation.view.view_id,
        }
        with monitoring.measurements(monitoring_tags) as recorder:
            recorder.measure_int_put(m_failed_validations, 1)
Ejemplo n.º 19
0
def _should_abort(total_root_entities,
                  conversion_result: IngestInfoConversionResult,
                  entity_matching_errors=0,
                  data_validation_errors=0):
    """
    Returns true if we should abort the current attempt to persist an IngestInfo
    object, given the number of errors we've encountered.

    Aborts when there are no root entities, when any protected class error
    occurred, or when the combined error count divided by the number of root
    entities reaches ERROR_THRESHOLD.
    """

    def _count_abort(reason):
        # Adds 1 to m_aborts, tagged with the abort reason.
        with monitoring.measurements({monitoring.TagKey.REASON:
                                      reason}) as recorder:
            recorder.measure_int_put(m_aborts, 1)

    if total_root_entities == 0:
        logging.info("Aborting because the ingest info object contains no "
                     "root entity objects to persist.")
        return True

    # TODO: finalize the logic in here.
    if conversion_result.protected_class_errors:
        logging.error(
            "Aborting because there was an error regarding a protected class")
        _count_abort('PROTECTED_CLASS_ERROR')
        return True

    total_errors = (conversion_result.enum_parsing_errors +
                    conversion_result.general_parsing_errors +
                    entity_matching_errors + data_validation_errors)
    if total_errors / total_root_entities >= ERROR_THRESHOLD:
        logging.error(
            "Aborting because we exceeded the error threshold of [%s] with "
            "[%s] enum_parsing errors, [%s] general_parsing_errors, [%s] "
            "entity_matching_errors, and [%s] data_validation_errors",
            ERROR_THRESHOLD, conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors, entity_matching_errors,
            data_validation_errors)
        _count_abort('THRESHOLD')
        return True

    return False
Ejemplo n.º 20
0
def read_and_persist():
    """Reads all of the messages from Datastore for a region and persists
    them to the database.

    The region comes from the 'region' request argument. A batch measurement
    is recorded with STATUS and PERSISTED tags that are updated in place as
    the request progresses.

    Returns:
        A ('', HTTPStatus.OK) tuple when persist_to_database returns a truthy
        value, otherwise ('', HTTPStatus.ACCEPTED).

    Raises:
        BatchPersistError: If persist_to_database raises; the original
            exception is attached as the cause.
    """

    region = request.args.get('region')
    batch_tags = {
        monitoring.TagKey.STATUS: 'COMPLETED',
        monitoring.TagKey.PERSISTED: False
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags({monitoring.TagKey.REGION: region}), \
         monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND)
        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception("An exception occurred in read and persist: %s",
                              type(e).__name__)
            batch_tags[monitoring.TagKey.STATUS] = 'ERROR: {}' \
                .format(type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type) from e

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                # Argument order must follow the format string: phase first,
                # then region.
                logging.info("Enqueueing %s for region %s.", next_phase,
                             region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=url_for(next_phase))
            return '', HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return '', HTTPStatus.ACCEPTED
    def update_views_for_state(self) -> None:
        """Create or update the up to date views dataset for a state with latest views.

        For every raw file config whose table exists in the state's raw data
        dataset, creates or updates its views; tables that do not exist are
        skipped with a warning.

        Raises:
            ValueError: On the first file whose view update fails. The failure
                is counted under m_failed_latest_views_update first. Because
                of this immediate raise, the final "Failed tables" log is
                never reached with a non-empty list.
        """
        views_dataset = f"{self.state_code}_raw_data_up_to_date_views"
        raw_data_dataset = f"{self.state_code}_raw_data"
        succeeded_tables = []
        failed_tables = []

        views_dataset_ref = self.bq_client.dataset_ref_for_id(views_dataset)
        self.bq_client.create_dataset_if_necessary(views_dataset_ref)

        all_configs = self.raw_file_region_config.raw_file_configs.values()
        for raw_file_config in all_configs:
            raw_dataset_ref = self.bq_client.dataset_ref_for_id(
                raw_data_dataset)
            if not self.bq_client.table_exists(raw_dataset_ref,
                                               raw_file_config.file_tag):
                logging.warning(
                    "Table with name [%s] does not exist in BQ... Skipping latest view update/creation",
                    raw_file_config.file_tag,
                )
                continue

            try:
                self._create_or_update_views_for_table(
                    raw_file_config=raw_file_config)
                succeeded_tables.append(raw_file_config.file_tag)
            except Exception as e:
                failure_tags = {
                    monitoring.TagKey.CREATE_UPDATE_RAW_DATA_LATEST_VIEWS_FILE_TAG:
                    raw_file_config.file_tag
                }
                with monitoring.measurements(failure_tags) as recorder:
                    recorder.measure_int_put(m_failed_latest_views_update, 1)
                failed_tables.append(raw_file_config.file_tag)
                raise ValueError(
                    f"Couldn't create/update views for file [{raw_file_config.file_tag}]"
                ) from e

        logging.info("Succeeded tables %s", succeeded_tables)
        if failed_tables:
            logging.error("Failed tables %s", failed_tables)
Ejemplo n.º 22
0
def create_managed_dataset_and_deploy_views_for_view_builders(
    view_source_table_datasets: Set[str],
    view_builders_to_update: Sequence[BigQueryViewBuilder],
    dataset_overrides: Optional[Dict[str, str]] = None,
    bq_region_override: Optional[str] = None,
    force_materialize: bool = False,
) -> None:
    """Creates or updates all views produced by the provided view builders.

    If any materialized view has been updated (or if an ancestor view has been
    updated) or the force_materialize flag is set, the view will be
    re-materialized to ensure the schemas remain consistent. Also, cleans up
    unmanaged views and datasets by deleting them from BigQuery.

    Should only be called if we expect the views to have changed (either the
    view query or schema from querying underlying tables), e.g. at deploy time.

    Any exception is counted under m_failed_view_update (with no extra tags)
    and then re-raised.
    """
    use_default_expiration = bool(dataset_overrides)
    if use_default_expiration:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours.")

    try:
        built_views = _build_views_to_update(
            view_source_table_datasets=view_source_table_datasets,
            candidate_view_builders=view_builders_to_update,
            dataset_overrides=dataset_overrides,
        )
        _create_managed_dataset_and_deploy_views(
            built_views,
            bq_region_override,
            force_materialize,
            set_temp_dataset_table_expiration=use_default_expiration,
        )
    except Exception as e:
        with monitoring.measurements() as recorder:
            recorder.measure_int_put(m_failed_view_update, 1)
        raise e
Ejemplo n.º 23
0
    def run_inside_new_span(*args: Any, **kwargs: Any) -> None:
        """Calls the wrapped `func` inside a new tracing span and times it.

        The span is named after func.__qualname__ and annotated with the
        function's module and the stringified args and kwargs. The call
        duration in seconds is recorded under m_duration_s, tagged with the
        module, function name and current recursion depth.

        NOTE(review): the return annotation is None, but the result of `func`
        is returned -- confirm the intended annotation.
        """
        tracer: tracer_module.Tracer = execution_context.get_opencensus_tracer()
        with tracer.span(name=func.__qualname__) as new_span:
            new_span.add_attribute("recidiviz.function.module", func.__module__)
            new_span.add_attribute("recidiviz.function.args", str(args))
            new_span.add_attribute("recidiviz.function.kwargs", str(kwargs))

            # Recursion depth is how many times this function's id already
            # appears in the stack, read before this call is pushed onto it.
            with monitoring.measurements(
                {
                    monitoring.TagKey.MODULE: func.__module__,
                    monitoring.TagKey.FUNCTION: func.__qualname__,
                    monitoring.TagKey.RECURSION_DEPTH: stack.get().count(id(func)),
                }
            ) as measurements:
                # Push this call onto the stack; the token restores the
                # previous value afterwards.
                # NOTE(review): `stack` looks like a contextvars.ContextVar
                # (get/set/reset with a token) -- confirm at its definition.
                stack_token = stack.set(stack.get() + [id(func)])
                start = time.perf_counter()

                try:
                    return func(*args, **kwargs)
                finally:
                    # Runs whether func returned or raised: record the
                    # duration, then pop this call off the stack.
                    measurements.measure_float_put(
                        m_duration_s, time.perf_counter() - start
                    )
                    stack.reset(stack_token)
Ejemplo n.º 24
0
def export_view_data_to_cloud_storage(
    export_job_filter: str,
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> None:
    """Exports data in BigQuery metric views to cloud storage buckets.

    Optionally takes in a BigQueryViewExporter for performing the export operation. If none is provided, this defaults
    to using a CompositeBigQueryViewExporter with delegates of JsonLinesBigQueryViewExporter and
    OptimizedMetricBigQueryViewExporter.

    Steps:
      1. Collects the export configs that match export_job_filter.
      2. For each namespace of the matching configs, deploys its views when the namespace requires a full update,
         then rematerializes the namespace's views.
      3. Exports the views of each matching config with the exporter chosen for its output format.

    Args:
        export_job_filter: Filter matched against each config in export_config.VIEW_COLLECTION_EXPORT_CONFIGS.
        override_view_exporter: When provided, used for every output format instead of the default exporters.

    Raises:
        ValueError: If no export config matches export_job_filter.
        Exception: Any non-validation error raised by an export is counted under m_failed_metric_export_job and
            re-raised. Validation errors are logged, counted, and skipped.
    """
    export_configs_for_filter: List[ExportViewCollectionConfig] = []
    bq_view_namespaces_to_update: Set[BigQueryViewNamespace] = set()
    for dataset_export_config in export_config.VIEW_COLLECTION_EXPORT_CONFIGS:
        if not dataset_export_config.matches_filter(export_job_filter):
            logging.info(
                "Skipped metric export for config [%s] with filter [%s]",
                dataset_export_config,
                export_job_filter,
            )
            continue

        export_configs_for_filter.append(dataset_export_config)
        bq_view_namespaces_to_update.add(
            dataset_export_config.bq_view_namespace)

    if not export_configs_for_filter:
        # Build one message string; passing the filter as a second positional
        # argument would make the error render as a tuple.
        raise ValueError(
            f"Export filter did not match any export configs: {export_job_filter}"
        )

    for bq_view_namespace_to_update in bq_view_namespaces_to_update:
        view_builders_for_views_to_update = (
            view_update_manager.
            VIEW_BUILDERS_BY_NAMESPACE[bq_view_namespace_to_update])

        # TODO(#5125): Once view update is consistently trivial, always update all views in namespace
        if (bq_view_namespace_to_update
                in export_config.NAMESPACES_REQUIRING_FULL_UPDATE):
            view_update_manager.create_dataset_and_deploy_views_for_view_builders(
                bq_view_namespace_to_update, view_builders_for_views_to_update)

        # The view deploy will only have rematerialized views that had been updated since the last deploy, this call
        # will ensure that all materialized tables get refreshed.
        view_update_manager.rematerialize_views_for_namespace(
            bq_view_namespace=bq_view_namespace_to_update,
            candidate_view_builders=view_builders_for_views_to_update,
        )

    gcsfs_client = GcsfsFactory.build()
    if override_view_exporter is None:
        bq_client = BigQueryClientImpl()

        # Some of our views intentionally export empty files (e.g. some of the ingest_metadata views)
        # so we just check for existence
        csv_exporter = CSVBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        json_exporter = JsonLinesBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        metric_exporter = OptimizedMetricBigQueryViewExporter(
            bq_client,
            OptimizedMetricBigQueryViewExportValidator(gcsfs_client))

        delegate_export_map = {
            ExportOutputFormatType.CSV: csv_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: csv_exporter,
            ExportOutputFormatType.JSON: json_exporter,
            ExportOutputFormatType.METRIC: metric_exporter,
        }
    else:
        # The override exporter handles every output format.
        delegate_export_map = {
            ExportOutputFormatType.CSV: override_view_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: override_view_exporter,
            ExportOutputFormatType.JSON: override_view_exporter,
            ExportOutputFormatType.METRIC: override_view_exporter,
        }

    project_id = metadata.project_id()

    for dataset_export_config in export_configs_for_filter:
        logging.info(
            "Starting metric export for dataset_config [%s] with filter [%s]",
            dataset_export_config,
            export_job_filter,
        )

        view_export_configs = dataset_export_config.export_configs_for_views_to_export(
            project_id=project_id)

        # The export will error if the validations fail for the set of view_export_configs. We want to log this failure
        # as a warning, but not block on the rest of the exports.
        try:
            export_views_with_exporters(gcsfs_client, view_export_configs,
                                        delegate_export_map)
        except ViewExportValidationError:
            warning_message = (
                f"Export validation failed for {dataset_export_config.export_name}"
            )

            if dataset_export_config.state_code_filter is not None:
                warning_message += (
                    f" for state: {dataset_export_config.state_code_filter}")

            logging.warning(warning_message)
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    dataset_export_config.export_name,
                    monitoring.TagKey.REGION:
                    dataset_export_config.state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_validation,
                                             1)

            # Do not treat validation failures as fatal errors
            continue
        except Exception as e:
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    dataset_export_config.export_name,
                    monitoring.TagKey.REGION:
                    dataset_export_config.state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_job, 1)
            raise e
Ejemplo n.º 25
0
def increment_error(entity_name: str) -> None:
    """Record a single matching error for the given entity type.

    Opens a monitoring measurement scope tagged with ENTITY_TYPE set to
    `entity_name` and puts the value 1 on the `m_matching_errors` measure.
    """
    error_tags = {monitoring.TagKey.ENTITY_TYPE: entity_name}
    with monitoring.measurements(error_tags) as error_measurements:
        error_measurements.measure_int_put(m_matching_errors, 1)
Ejemplo n.º 26
0
def write(ingest_info, metadata):
    """
    Convert the given ingest_info into persistence entities and, when
    persistence is enabled, entity-match them and commit them to the database.

    Persistence is gated on _should_persist() (per the original notes: in prod
    or when 'PERSIST_LOCALLY' is set to true). When it is disabled the
    converted people are only counted and logged.

    Returns:
        False when _should_abort() is true after conversion or after entity
        matching; True when persistence is disabled or the commit succeeded.

    Raises:
        Any exception raised while matching, writing or committing is logged,
        tagged by type, counted on m_errors, followed by a session rollback,
        and then re-raised.
    """
    ingest_info_validator.validate(ingest_info)

    mtags = {
        monitoring.TagKey.SHOULD_PERSIST: _should_persist(),
        monitoring.TagKey.PERSISTED: False
    }
    total_people = _get_total_people(ingest_info, metadata)
    with monitoring.measurements(mtags) as measurements:

        # Conversion keeps its own tallies of parsing errors on the result.
        conversion_result: IngestInfoConversionResult = (
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, metadata))

        people, data_validation_errors = entity_validator.validate(
            conversion_result.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors,
            conversion_result.protected_class_errors,
            data_validation_errors)
        measurements.measure_int_put(m_people, len(people))

        abort_after_conversion = _should_abort(
            total_root_entities=total_people,
            conversion_result=conversion_result,
            data_validation_errors=data_validation_errors)
        if abort_after_conversion:
            #  TODO(#1665): remove once dangling PERSIST session investigation
            #   is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        # Nothing more to do when persistence is switched off.
        if not _should_persist():
            return True

        session = SessionFactory.for_schema_base(
            schema_base_for_system_level(metadata.system_level))

        try:
            logging.info("Starting entity matching")

            matching_output = entity_matching.match(
                session, metadata.region, people)
            matched_people = matching_output.people
            if metadata.system_level == SystemLevel.COUNTY:
                total_root_entities = total_people
            else:
                total_root_entities = matching_output.total_root_entities
            logging.info("Completed entity matching with [%s] errors",
                         matching_output.error_count)
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB", len(matched_people))

            abort_after_matching = _should_abort(
                total_root_entities=total_root_entities,
                conversion_result=conversion_result,
                entity_matching_errors=matching_output.error_count,
                data_validation_errors=data_validation_errors)
            if abort_after_matching:
                #  TODO(#1665): remove once dangling PERSIST session
                #   investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database.write_people(
                session,
                matched_people,
                metadata,
                orphaned_entities=matching_output.orphaned_entities)
            logging.info("Successfully wrote to the database")
            session.commit()

            # Flip the tag only once the commit has gone through.
            mtags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            logging.exception("An exception was raised in write(): [%s]",
                              type(e).__name__)
            # Record the error type that happened and increment the counter
            mtags[monitoring.TagKey.ERROR] = type(e).__name__
            measurements.measure_int_put(m_errors, 1)
            session.rollback()
            raise
        finally:
            # Runs on success, on abort (early return) and on error alike.
            session.close()
        return True
Ejemplo n.º 27
0
def export_view_data_to_cloud_storage(export_job_filter: Optional[str] = None,
                                      view_exporter: Optional[BigQueryViewExporter] = None) -> None:
    """Exports data in BigQuery metric views to cloud storage buckets.

    The views in view_config.VIEW_BUILDERS_FOR_VIEWS_TO_UPDATE are created/updated first. Then every config in
    view_config.METRIC_DATASET_EXPORT_CONFIGS that matches export_job_filter is exported and validated.

    Args:
        export_job_filter: Value handed to each config's matches_filter() to select the configs to export.
        view_exporter: Exporter used for the export-and-validate step. If none is provided, this defaults to using a
            CompositeBigQueryViewExporter with delegates of JsonLinesBigQueryViewExporter and
            OptimizedMetricBigQueryViewExporter.

    Raises:
        ValueError: If export_job_filter did not match any export config.
        Exception: Any failure from the exporter other than ViewExportValidationError is re-raised after the
            m_failed_metric_export_job measure is recorded. A ViewExportValidationError is logged as a warning,
            recorded on m_failed_metric_export_validation, and does not stop the remaining exports.
    """
    view_builders_for_views_to_update = view_config.VIEW_BUILDERS_FOR_VIEWS_TO_UPDATE
    view_update_manager.create_dataset_and_update_views_for_view_builders(BigQueryViewNamespace.STATE,
                                                                          view_builders_for_views_to_update)

    if not view_exporter:
        bq_client = BigQueryClientImpl()
        gcsfs_client = GcsfsFactory.build()

        json_exporter = JsonLinesBigQueryViewExporter(bq_client,
                                                      JsonLinesBigQueryViewExportValidator(gcsfs_client))

        optimized_exporter = OptimizedMetricBigQueryViewExporter(
            bq_client, OptimizedMetricBigQueryViewExportValidator(gcsfs_client))
        delegates = [json_exporter, optimized_exporter]

        view_exporter = CompositeBigQueryViewExporter(
            bq_client,
            gcsfs_client,
            delegates
        )

    project_id = metadata.project_id()

    # If the state code is set to COVID then it will match when the state_filter is None in
    # view_config.METRIC_DATASET_EXPORT_CONFIGS
    matched_export_config = False
    for dataset_export_config in view_config.METRIC_DATASET_EXPORT_CONFIGS:
        if not dataset_export_config.matches_filter(export_job_filter):
            logging.info("Skipped metric export for config [%s] with filter [%s]", dataset_export_config,
                         export_job_filter)
            continue

        matched_export_config = True
        logging.info("Starting metric export for dataset_config [%s] with filter [%s]", dataset_export_config,
                     export_job_filter)

        view_export_configs = dataset_export_config.export_configs_for_views_to_export(project_id=project_id)

        # Both failure paths below report against the same export name / region tags.
        failure_tags = {
            monitoring.TagKey.METRIC_VIEW_EXPORT_NAME: dataset_export_config.export_name,
            monitoring.TagKey.REGION: dataset_export_config.state_code_filter
        }

        # The export will error if the validations fail for the set of view_export_configs. We want to log this failure
        # as a warning, but not block on the rest of the exports.
        try:
            view_exporter.export_and_validate(view_export_configs)
        except ViewExportValidationError:
            warning_message = f"Export validation failed from {dataset_export_config.dataset_id}"

            if dataset_export_config.state_code_filter is not None:
                warning_message += f" for state: {dataset_export_config.state_code_filter}"

            logging.warning(warning_message)
            with monitoring.measurements(failure_tags) as measurements:
                measurements.measure_int_put(m_failed_metric_export_validation, 1)

            # Do not treat validation failures as fatal errors
            continue
        except Exception:
            with monitoring.measurements(failure_tags) as measurements:
                measurements.measure_int_put(m_failed_metric_export_job, 1)
            # Bare raise re-raises the active exception with its original traceback.
            raise

    if not matched_export_config:
        # Format the filter into the message; passing it as a second argument would make str(error) a tuple repr.
        raise ValueError(f"Export filter did not match any export configs: {export_job_filter}")
Ejemplo n.º 28
0
def write(
    ingest_info: IngestInfo,
    ingest_metadata: IngestMetadata,
    run_txn_fn: Callable[
        [Session, MeasurementMap, Callable[[Session], bool], Optional[int]],
        bool] = retry_transaction,
) -> bool:
    """
    If in prod or if 'PERSIST_LOCALLY' is set to true, persist each person in
    the ingest_info. If a person with the given surname/birthday already exists,
    then update that person.

    Otherwise, simply log the given ingest_infos for debugging

    `run_txn_fn` is exposed primarily for testing and should typically be left as `retry_transaction`. `run_txn_fn`
    must handle the coordination of the transaction including, when to run the body of the transaction and when to
    commit, rollback, or close the session.

    Args:
        ingest_info: The ingest info to validate, convert and persist.
        ingest_metadata: Supplies the region, system level and database key
            used during conversion, entity matching and the write.
        run_txn_fn: Called with the session, the measurement map, the
            transaction body and the literal 5; its falsy result makes this
            function return False.

    Returns:
        False when `_should_abort` is true after conversion or when
        `run_txn_fn` returns a falsy value; True when `should_persist()` is
        false or when the transaction ran successfully.

    Raises:
        Any exception raised inside the session/transaction block is logged,
        tagged by type, counted on `m_errors` and re-raised.
    """
    ingest_info_validator.validate(ingest_info)

    # The dict is mutated later (PERSISTED / ERROR) while the measurement
    # scope opened on it below is still active.
    mtags: Dict[str, Union[bool, str]] = {
        monitoring.TagKey.SHOULD_PERSIST: should_persist(),
        monitoring.TagKey.PERSISTED: False,
    }
    total_people = _get_total_people(ingest_info, ingest_metadata)
    with monitoring.measurements(mtags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        conversion_result: IngestInfoConversionResult = (
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, ingest_metadata))

        people, data_validation_errors = entity_validator.validate(
            conversion_result.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            conversion_result.enum_parsing_errors,
            conversion_result.general_parsing_errors,
            conversion_result.protected_class_errors,
            data_validation_errors,
        )
        measurements.measure_int_put(m_people, len(people))

        # First abort check: conversion and data validation errors only.
        if _should_abort(
                total_root_entities=total_people,
                system_level=ingest_metadata.system_level,
                conversion_result=conversion_result,
                region_code=ingest_metadata.region,
                data_validation_errors=data_validation_errors,
        ):
            #  TODO(#1665): remove once dangling PERSIST session investigation
            #   is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        # Persistence disabled: conversion succeeded, so report success
        # without opening a session.
        if not should_persist():
            return True

        # Transaction body handed to run_txn_fn. It closes over `people`,
        # `total_people` and `conversion_result` from the enclosing scope.
        # NOTE(review): presumably run_txn_fn may invoke this more than once
        # when retrying - confirm against retry_transaction.
        @trace.span
        def match_and_write_people(session: Session) -> bool:
            logging.info("Starting entity matching")

            entity_matching_output = entity_matching.match(
                session, ingest_metadata.region, people)
            output_people = entity_matching_output.people
            # County-level ingest uses the pre-matching people count; all
            # other system levels use the count reported by entity matching.
            total_root_entities = (total_people if ingest_metadata.system_level
                                   == SystemLevel.COUNTY else
                                   entity_matching_output.total_root_entities)
            logging.info(
                "Completed entity matching with [%s] errors",
                entity_matching_output.error_count,
            )
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB",
                len(output_people),
            )
            # NOTE(review): unlike the first check, data_validation_errors is
            # not passed here - confirm this is intentional.
            if _should_abort(
                    total_root_entities=total_root_entities,
                    system_level=ingest_metadata.system_level,
                    conversion_result=conversion_result,
                    region_code=ingest_metadata.region,
                    entity_matching_errors=entity_matching_output.error_count,
            ):
                #  TODO(#1665): remove once dangling PERSIST session
                #   investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database_invariant_errors = (
                database_invariant_validator.validate_invariants(
                    session,
                    ingest_metadata.system_level,
                    ingest_metadata.region,
                    output_people,
                ))

            # Third abort check: database invariant errors only.
            if _should_abort(
                    total_root_entities=total_root_entities,
                    system_level=ingest_metadata.system_level,
                    conversion_result=conversion_result,
                    region_code=ingest_metadata.region,
                    database_invariant_errors=database_invariant_errors,
            ):
                logging.info(
                    "_should_abort_ was true after database invariant validation"
                )
                return False

            database.write_people(
                session,
                output_people,
                ingest_metadata,
                orphaned_entities=entity_matching_output.orphaned_entities,
            )
            logging.info("Successfully wrote to the database")
            return True

        try:
            with SessionFactory.using_database(ingest_metadata.database_key,
                                               autocommit=False) as session:
                # The literal 5 fills the Optional[int] slot of run_txn_fn.
                # NOTE(review): presumably a retry limit - confirm.
                if not run_txn_fn(session, measurements,
                                  match_and_write_people, 5):
                    return False
            # Only reached when the transaction reported success and the
            # session context exited without raising.
            mtags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            logging.exception("An exception was raised in write(): [%s]",
                              type(e).__name__)
            # Record the error type that happened and increment the counter
            mtags[monitoring.TagKey.ERROR] = type(e).__name__
            measurements.measure_int_put(m_errors, 1)
            raise
        return True
Ejemplo n.º 29
0
def work(region):
    """POST request handler that runs one chunk of scraper work.

    Thin shim between the task queue and the regional scraper: it decodes the
    task payload, looks up the named function on the region's ingestor and
    calls it with the decoded parameters. Per the original notes, all scraper
    work that hits a third-party website goes through this handler as small
    discrete tasks so that the task queue's throttling and retry support
    applies, and authentication is enforced in app.yaml because the handler is
    never called manually.

    The request body must be a JSON object with these keys:
        region: (string) Region code; must equal the `region` from the URL.
        task: (string) Name of the function to call on the scraper.
        params: (dict) Serialized QueueRequest passed to that function.

    Args:
        region: (string) Region code for the scraper in question, taken from
            the URL.

    Returns:
        ("", HTTPStatus.INTERNAL_SERVER_ERROR) when the
        X-AppEngine-QueueName header is missing; ("", HTTPStatus.OK) when the
        task was skipped (no current session) or ran to completion.

    Raises:
        ValueError: The region in the payload differs from the URL region.
        RequestProcessingError: The scraper task raised; chained from the
            original exception.
    """
    # Verify this was actually a task queued by our app
    if "X-AppEngine-QueueName" not in request.headers:
        logging.error("Couldn't validate task was legit, exiting.")
        return ("", HTTPStatus.INTERNAL_SERVER_ERROR)
    queue_name = request.headers.get("X-AppEngine-QueueName")

    payload = json.loads(request.get_data(as_text=True))
    task = payload["task"]
    params = QueueRequest.from_serializable(payload["params"])

    task_region = payload["region"]
    if region != task_region:
        raise ValueError(
            "Region specified in task {} does not match region from url {}.".
            format(task_region, region))

    task_tags = {monitoring.TagKey.STATUS: "COMPLETED"}
    # Note: measurements must be the inner scope so it receives the region tag.
    with monitoring.push_tags({monitoring.TagKey.REGION: region}):
        with monitoring.measurements(task_tags) as measurements:
            measurements.measure_int_put(m_tasks, 1)

            current_session = sessions.get_current_session(
                ScrapeKey(region, params.scrape_type))
            if not current_session:
                task_tags[monitoring.TagKey.STATUS] = "SKIPPED"
                logging.info(
                    "Queue [%s], skipping task [%s] for [%s] because it "
                    "is not in the current session.",
                    queue_name,
                    task,
                    region,
                )
                return ("", HTTPStatus.OK)

            logging.info("Queue [%s], processing task [%s] for [%s].",
                         queue_name, task, region)

            scraper = regions.get_region(region).get_ingestor()
            scraper_task = getattr(scraper, task)

            try:
                scraper_task(params)
            except Exception as e:
                # Surface the failure type on the status tag before re-raising.
                task_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(
                    type(e).__name__)
                raise RequestProcessingError(region, task, params) from e

            # Respond to the task queue to mark this task as done
            return ("", HTTPStatus.OK)
Ejemplo n.º 30
0
 def inner(_region_code):
     # Tags handed to monitoring.measurements; the dict is mutated below
     # while the measurement scope is still open.
     tags = {'alice': 'foo'}
     with monitoring.measurements(tags) as mmap:
         # Added after the scope was entered.
         tags['bob'] = 'bar'
         # mock_mmap is not defined in this snippet; presumably it comes from
         # the enclosing test scope - confirm.
         assert mmap == mock_mmap