def test_write_errors(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        )
    )
    start_time = datetime.now()
    batch_ingest_info_data = datastore_ingest_info.write_error(
        region="us_state_county",
        session_start_time=start_time,
        error="error string",
        trace_id="trace",
        task_hash=task_hash,
    )
    results = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time
    )
    assert results == [batch_ingest_info_data]
    datastore_ingest_info.batch_delete_ingest_infos_for_region("us_state_county")
def test_batch_delete_over_500_ingest_infos_for_region(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        )
    )
    start_time = datetime.now()
    # The Datastore limit for entity writes in one call is 500. Confirm
    # that batch delete is properly handled when more than 500 entities
    # exist for the same region.
    for i in range(600):
        datastore_ingest_info.write_ingest_info(
            region="us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info(str(i)),
            task_hash=task_hash,
        )
    datastore_ingest_info.batch_delete_ingest_infos_for_region("us_state_county")
    assert (
        datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time
        )
        == []
    )
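# A minimal sketch of the chunking a batch delete needs in order to stay under
# Datastore's limit of 500 mutations per commit (the limit the test above
# exercises with 600 entities). Illustrative only: the helper name
# `_delete_keys_in_chunks` and the direct google.cloud.datastore usage are
# assumptions, not the implementation under test.
from google.cloud import datastore


def _delete_keys_in_chunks(client: datastore.Client, keys: list) -> None:
    # delete_multi accepts at most 500 keys per call, so slice the key list
    # into chunks of 500 and issue one delete per chunk.
    for i in range(0, len(keys), 500):
        client.delete_multi(keys[i : i + 500])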
def persist_to_database(
    region_code: str, session_start_time: datetime.datetime
) -> bool:
    """Reads all of the ingest infos from Datastore for a region and
    persists them to the database.
    """
    region = regions.get_region(region_code)
    overrides = region.get_scraper_enum_overrides()

    ingest_info_data_list = _get_batch_ingest_info_list(
        region_code, session_start_time
    )

    logging.info("Received %s total ingest infos", len(ingest_info_data_list))
    if ingest_info_data_list:
        proto, failed_tasks = _get_proto_from_batch_ingest_info_data_list(
            ingest_info_data_list
        )

        if not proto.people:
            logging.error("Scrape session returned 0 people.")
            return False

        for batch_ingest_info_datum in failed_tasks.values():
            logging.error(
                "Task with trace_id %s failed with error %s",
                batch_ingest_info_datum.trace_id,
                batch_ingest_info_datum.error,
            )
        if _should_abort(len(failed_tasks), len(proto.people)):
            logging.error(
                "Too many scraper tasks failed (%s), aborting write",
                len(failed_tasks),
            )
            return False

        metadata = IngestMetadata(
            region=region_code,
            jurisdiction_id=region.jurisdiction_id,
            ingest_time=session_start_time,
            facility_id=region.facility_id,
            enum_overrides=overrides,
            system_level=SystemLevel.COUNTY,
            database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
        )

        did_write = persistence.write(proto, metadata)
        if did_write:
            datastore_ingest_info.batch_delete_ingest_infos_for_region(region_code)

        return did_write

    logging.error("No ingest infos received from Datastore")
    return False
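# `_should_abort` is called above but not shown in this excerpt. A plausible
# sketch, assuming the decision is a simple failure-ratio threshold; the
# constant FAILED_TASK_THRESHOLD and the exact ratio logic are assumptions
# for illustration, not the project's actual policy.
FAILED_TASK_THRESHOLD = 0.1  # hypothetical: abort if >10% of results failed


def _should_abort(failed_tasks: int, people_count: int) -> bool:
    # Weigh the number of failed tasks against the number of people that
    # were successfully scraped; abort the write when failures dominate.
    total = failed_tasks + people_count
    return total > 0 and failed_tasks / total > FAILED_TASK_THRESHOLD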
def test_batch_delete_ingest_infos_for_region(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        )
    )
    start_time = datetime.now()
    datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("1"),
        task_hash=task_hash,
    )
    datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("2"),
        task_hash=task_hash,
    )
    datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("3"),
        task_hash=task_hash,
    )
    unrelated = datastore_ingest_info.write_ingest_info(
        region="unrelated_us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("n/a"),
        task_hash=task_hash,
    )
    datastore_ingest_info.batch_delete_ingest_infos_for_region("us_state_county")
    assert (
        datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time
        )
        == []
    )
    actual = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "unrelated_us_state_county", start_time
    )
    assert actual == [unrelated]
    datastore_ingest_info.batch_delete_ingest_infos_for_region(
        "unrelated_us_state_county"
    )
def test_write_single_ingest_info(self):
    task_hash = hash(
        json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        )
    )
    start_time = datetime.now()
    ingest_info = datastore_ingest_info.write_ingest_info(
        region="us_state_county",
        session_start_time=start_time,
        ingest_info=sample_ingest_info("1"),
        task_hash=task_hash,
    )
    results = datastore_ingest_info.batch_get_ingest_infos_for_region(
        "us_state_county", start_time
    )
    assert results == [ingest_info]
    datastore_ingest_info.batch_delete_ingest_infos_for_region("us_state_county")
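# The tests above rely on a `sample_ingest_info` helper that is not shown in
# this excerpt. A sketch of what it plausibly builds, assuming the IngestInfo
# object model used elsewhere in this codebase; the exact fields set here are
# assumptions for illustration.
def sample_ingest_info(person_id: str) -> IngestInfo:
    # Build a minimal IngestInfo containing a single person, so that each
    # write in the tests above is distinguishable by its person_id.
    ingest_info = IngestInfo()
    ingest_info.create_person(person_id=person_id)
    return ingest_info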