    def test_write_errors(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))

        start_time = datetime.now()
        batch_ingest_info_data = datastore_ingest_info.write_error(
            region="us_state_county",
            session_start_time=start_time,
            error="error string",
            trace_id="trace",
            task_hash=task_hash,
        )

        results = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time)

        assert results == [batch_ingest_info_data]
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

    def test_batch_delete_over_500_ingest_infos_for_region(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))
        start_time = datetime.now()

        # The Datastore limit for entity writes in one call is 500. Confirm
        # that batch delete is properly handled when more than 500 entities
        # exist for the same region.
        for i in range(600):
            datastore_ingest_info.write_ingest_info(
                region="us_state_county",
                session_start_time=start_time,
                ingest_info=sample_ingest_info(str(i)),
                task_hash=task_hash,
            )

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

        assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time) == [])
def persist_to_database(
    region_code: str, session_start_time: datetime.datetime
) -> bool:
    """Reads all of the ingest infos from Datastore for a region and persists
    them to the database.
    """
    region = regions.get_region(region_code)
    overrides = region.get_scraper_enum_overrides()

    ingest_info_data_list = _get_batch_ingest_info_list(region_code, session_start_time)

    logging.info("Received %s total ingest infos", len(ingest_info_data_list))
    if ingest_info_data_list:
        proto, failed_tasks = _get_proto_from_batch_ingest_info_data_list(
            ingest_info_data_list
        )

        if not proto.people:
            logging.error("Scrape session returned 0 people.")
            return False

        for batch_ingest_info_datum in failed_tasks.values():
            logging.error(
                "Task with trace_id %s failed with error %s",
                batch_ingest_info_datum.trace_id,
                batch_ingest_info_datum.error,
            )
        if _should_abort(len(failed_tasks), len(proto.people)):
            logging.error(
                "Too many scraper tasks failed(%s), aborting write", len(failed_tasks)
            )
            return False

        metadata = IngestMetadata(
            region=region_code,
            jurisdiction_id=region.jurisdiction_id,
            ingest_time=session_start_time,
            facility_id=region.facility_id,
            enum_overrides=overrides,
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )

        did_write = persistence.write(proto, metadata)
        if did_write:
            datastore_ingest_info.batch_delete_ingest_infos_for_region(region_code)

        return did_write

    logging.error("No ingest infos received from Datastore")
    return False
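

# Hypothetical usage sketch for persist_to_database (this call site is
# illustrative, not from the source; it assumes the module's existing
# datetime and logging imports). Note that the Datastore batch is only
# cleaned up when the write succeeds, so a failed session can be retried.
session_start = datetime.datetime(2021, 1, 1)
if persist_to_database("us_state_county", session_start):
    logging.info("Scrape session for us_state_county persisted")
else:
    logging.warning("Persist failed; ingest infos retained for retry")
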
    def test_batch_delete_ingest_infos_for_region(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))
        start_time = datetime.now()

        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("1"),
            task_hash=task_hash,
        )
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("2"),
            task_hash=task_hash,
        )
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("3"),
            task_hash=task_hash,
        )
        unrelated = datastore_ingest_info.write_ingest_info(
            region="unrelated_us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("n/a"),
            task_hash=task_hash,
        )

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

        assert (datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time) == [])

        actual = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "unrelated_us_state_county", start_time)
        assert actual == [unrelated]

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "unrelated_us_state_county")
    def test_write_single_ingest_info(self):
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT,
                ).to_serializable(),
                sort_keys=True,
            ))

        start_time = datetime.now()
        ingest_info = datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("1"),
            task_hash=task_hash,
        )
        results = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time)
        assert results == [ingest_info]
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")
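

# The sample_ingest_info fixture used throughout these tests is defined
# elsewhere in the test module. A hypothetical sketch of what it might build,
# assuming recidiviz's IngestInfo model and its create_person builder (the
# actual fixture may differ):
from recidiviz.ingest.models.ingest_info import IngestInfo


def sample_ingest_info(person_id: str) -> IngestInfo:
    """Builds a minimal IngestInfo containing a single person."""
    info = IngestInfo()
    info.create_person(person_id=person_id)
    return info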