def validate_ingest(
        self,
        ingest_info: IngestInfo,
        expected_ingest_info: IngestInfo,
        metadata: IngestMetadata,
    ) -> IngestInfo:
        """Check a computed ingest_info against the expected one.

        When an expectation is supplied, the computed object is first run
        through proto conversion, proto validation, conversion to
        persistence entities and entity validation; it is then diffed
        against the expectation and any difference is reported through
        `self.fail`.

        Args:
            ingest_info: the computed ingest info object.
            expected_ingest_info: the ingest info the computed object must
                match. If it is `None`, the computed object is only asserted
                to compare equal to it and none of the conversion steps run.
            metadata: the ingest metadata struct handed to the entity
                converter.

        Returns:
            `ingest_info` itself, so the caller can perform any extra
            validations on it.

        """
        # Nothing expected: the computed value just has to equal `None`.
        if expected_ingest_info is None:
            assert ingest_info == expected_ingest_info
            return ingest_info

        # Convert the ingest_info to its proto form, validate that proto,
        # then convert the proto into entity objects (which includes parsing
        # strings into types). Each step must succeed before the diff runs.
        proto = serialization.convert_ingest_info_to_proto(ingest_info)
        validate(proto)
        converted = ingest_info_converter.convert_to_persistence_entities(
            proto, metadata)

        # The conversion must not have recorded any parsing errors.
        assert converted.enum_parsing_errors == 0
        assert converted.general_parsing_errors == 0
        assert converted.protected_class_errors == 0

        entity_validator.validate(converted.people)

        mismatches = diff_ingest_infos(expected_ingest_info, ingest_info)
        if not mismatches:
            return ingest_info

        # The trailing repr is meant to be pasted back into the test as the
        # new expected object.
        template = (
            "IngestInfo objects do not match.\n"
            "Expected:\n{}\n"
            "Actual:\n{}\n"
            "Differences:\n{}\n\n"
            "(paste the following) scraped object:"
            "\n{}"
        )
        message = template.format(
            expected_ingest_info,
            ingest_info,
            "\n".join(mismatches),
            repr(ingest_info),
        )
        self.fail(message)  # type: ignore[attr-defined]

        return ingest_info
Esempio n. 2
0
def write(ingest_info, metadata):
    """Validate, convert and, when persistence is enabled, store ingest_info.

    The ingest_info is validated, converted into persistence entities and the
    resulting people are validated. If `_should_persist()` is true (described
    by the original documentation as: in prod, or 'PERSIST_LOCALLY' set to
    true) the people are run through entity matching and written to the
    database in a single session; the original documentation describes this
    as updating a person whose surname/birthday already exists. Otherwise
    the conversion results are only logged for debugging.

    Returns:
        False when `_should_abort` is true after conversion or after entity
        matching; True otherwise, including when persistence is disabled.

    Raises:
        Any exception raised while matching or writing is logged, recorded in
        the monitoring tags, followed by a session rollback, and re-raised.
    """
    ingest_info_validator.validate(ingest_info)

    tags = {
        monitoring.TagKey.SHOULD_PERSIST: _should_persist(),
        monitoring.TagKey.PERSISTED: False,
    }
    people_count = _get_total_people(ingest_info, metadata)

    with monitoring.measurements(tags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        converted: IngestInfoConversionResult = (
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, metadata))

        people, validation_error_count = entity_validator.validate(
            converted.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            converted.enum_parsing_errors,
            converted.general_parsing_errors,
            converted.protected_class_errors,
            validation_error_count,
        )
        measurements.measure_int_put(m_people, len(people))

        abort_after_conversion = _should_abort(
            total_root_entities=people_count,
            conversion_result=converted,
            data_validation_errors=validation_error_count,
        )
        if abort_after_conversion:
            #  TODO(#1665): remove once dangling PERSIST session investigation
            #   is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        # Persistence disabled: conversion succeeded, nothing more to do.
        if not _should_persist():
            return True

        schema_base = schema_base_for_system_level(metadata.system_level)
        session = SessionFactory.for_schema_base(schema_base)

        try:
            logging.info("Starting entity matching")

            matching = entity_matching.match(session, metadata.region, people)
            people = matching.people

            # County-level ingest keeps the pre-matching people count as the
            # root-entity total; other levels use the count from matching.
            if metadata.system_level == SystemLevel.COUNTY:
                root_entity_count = people_count
            else:
                root_entity_count = matching.total_root_entities

            matching_error_count = matching.error_count
            logging.info("Completed entity matching with [%s] errors",
                         matching_error_count)
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB",
                len(people),
            )

            abort_after_matching = _should_abort(
                total_root_entities=root_entity_count,
                conversion_result=converted,
                entity_matching_errors=matching_error_count,
                data_validation_errors=validation_error_count,
            )
            if abort_after_matching:
                #  TODO(#1665): remove once dangling PERSIST session
                #   investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            database.write_people(
                session,
                people,
                metadata,
                orphaned_entities=matching.orphaned_entities,
            )
            logging.info("Successfully wrote to the database")
            session.commit()

            tags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            error_name = type(e).__name__
            logging.exception("An exception was raised in write(): [%s]",
                              error_name)
            # Record the error type that happened and increment the counter
            tags[monitoring.TagKey.ERROR] = error_name
            measurements.measure_int_put(m_errors, 1)
            session.rollback()
            raise
        finally:
            # Runs on every path out of the try, including the early return.
            session.close()

        # Only reachable after a successful commit.
        return True
Esempio n. 3
0
def write(
    ingest_info: IngestInfo,
    ingest_metadata: IngestMetadata,
    run_txn_fn: Callable[
        [Session, MeasurementMap, Callable[[Session], bool], Optional[int]],
        bool] = retry_transaction,
) -> bool:
    """
    Validate, convert and, when persistence is enabled, store ingest_info.

    The ingest_info is validated, converted into persistence entities and the
    resulting people are validated. If `should_persist()` is true (described
    by the original documentation as: in prod, or 'PERSIST_LOCALLY' set to
    true) the people are entity-matched, checked against database invariants
    and written to the database; the original documentation describes this as
    updating a person whose surname/birthday already exists. Otherwise the
    conversion results are only logged for debugging.

    `run_txn_fn` is exposed primarily for testing and should typically be left as `retry_transaction`. `run_txn_fn`
    must handle the coordination of the transaction including, when to run the body of the transaction and when to
    commit, rollback, or close the session.

    Returns False when `_should_abort` is true at any checkpoint or when
    `run_txn_fn` returns a falsy value; True otherwise, including when
    persistence is disabled. Exceptions raised while running the transaction
    are logged, recorded in the monitoring tags and re-raised.
    """
    ingest_info_validator.validate(ingest_info)

    tags: Dict[str, Union[bool, str]] = {
        monitoring.TagKey.SHOULD_PERSIST: should_persist(),
        monitoring.TagKey.PERSISTED: False,
    }
    people_count = _get_total_people(ingest_info, ingest_metadata)

    with monitoring.measurements(tags) as measurements:

        # Convert the people one at a time and count the errors as they happen.
        converted: IngestInfoConversionResult = (
            ingest_info_converter.convert_to_persistence_entities(
                ingest_info, ingest_metadata))

        people, validation_error_count = entity_validator.validate(
            converted.people)
        logging.info(
            "Converted [%s] people with [%s] enum_parsing_errors, [%s]"
            " general_parsing_errors, [%s] protected_class_errors and "
            "[%s] data_validation_errors",
            len(people),
            converted.enum_parsing_errors,
            converted.general_parsing_errors,
            converted.protected_class_errors,
            validation_error_count,
        )
        measurements.measure_int_put(m_people, len(people))

        abort_after_conversion = _should_abort(
            total_root_entities=people_count,
            system_level=ingest_metadata.system_level,
            conversion_result=converted,
            region_code=ingest_metadata.region,
            data_validation_errors=validation_error_count,
        )
        if abort_after_conversion:
            #  TODO(#1665): remove once dangling PERSIST session investigation
            #   is complete.
            logging.info("_should_abort_ was true after converting people")
            return False

        # Persistence disabled: conversion succeeded, nothing more to do.
        if not should_persist():
            return True

        # Transaction body handed to `run_txn_fn`. The function name is kept
        # as-is because the `trace.span` decorator is applied to it.
        @trace.span
        def match_and_write_people(session: Session) -> bool:
            logging.info("Starting entity matching")

            matching = entity_matching.match(
                session, ingest_metadata.region, people)
            matched_people = matching.people

            # County-level ingest keeps the pre-matching people count as the
            # root-entity total; other levels use the count from matching.
            if ingest_metadata.system_level == SystemLevel.COUNTY:
                root_entity_count = people_count
            else:
                root_entity_count = matching.total_root_entities

            matching_error_count = matching.error_count
            logging.info(
                "Completed entity matching with [%s] errors",
                matching_error_count,
            )
            logging.info(
                "Completed entity matching and have [%s] total people "
                "to commit to DB",
                len(matched_people),
            )

            abort_after_matching = _should_abort(
                total_root_entities=root_entity_count,
                system_level=ingest_metadata.system_level,
                conversion_result=converted,
                region_code=ingest_metadata.region,
                entity_matching_errors=matching_error_count,
            )
            if abort_after_matching:
                #  TODO(#1665): remove once dangling PERSIST session
                #   investigation is complete.
                logging.info("_should_abort_ was true after entity matching")
                return False

            invariant_error_count = (
                database_invariant_validator.validate_invariants(
                    session,
                    ingest_metadata.system_level,
                    ingest_metadata.region,
                    matched_people,
                ))

            abort_after_invariants = _should_abort(
                total_root_entities=root_entity_count,
                system_level=ingest_metadata.system_level,
                conversion_result=converted,
                region_code=ingest_metadata.region,
                database_invariant_errors=invariant_error_count,
            )
            if abort_after_invariants:
                logging.info(
                    "_should_abort_ was true after database invariant validation"
                )
                return False

            database.write_people(
                session,
                matched_people,
                ingest_metadata,
                orphaned_entities=matching.orphaned_entities,
            )
            logging.info("Successfully wrote to the database")
            return True

        try:
            with SessionFactory.using_database(
                    ingest_metadata.database_key,
                    autocommit=False) as session:
                txn_succeeded = run_txn_fn(
                    session, measurements, match_and_write_people, 5)
                if not txn_succeeded:
                    return False
            tags[monitoring.TagKey.PERSISTED] = True
        except Exception as e:
            error_name = type(e).__name__
            logging.exception("An exception was raised in write(): [%s]",
                              error_name)
            # Record the error type that happened and increment the counter
            tags[monitoring.TagKey.ERROR] = error_name
            measurements.measure_int_put(m_errors, 1)
            raise

        return True