def testConvert_CannotConvertField_RaisesValueError(self):
    """Conversion raises ValueError for a state person with birthdate 'NOT_A_DATE'."""
    # Arrange
    state_metadata = IngestMetadata.new_with_defaults(
        system_level=SystemLevel.STATE)
    bad_ingest_info = IngestInfo()
    bad_ingest_info.state_people.add(birthdate='NOT_A_DATE')

    # Act + Assert
    with self.assertRaises(ValueError):
        self._convert_and_throw_on_errors(bad_ingest_info, state_metadata)
def test_scrape_data_and_more_no_persist_second_time_persist(
        self, mock_get_more, mock_fetch, mock_populate, mock_write):
    """Nothing is written while persist=False; re-running the queued task
    with persist=True writes exactly once."""
    follow_up_task = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
    mock_get_more.return_value = [follow_up_task]
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=False,
    )

    start_time = datetime.datetime.now()
    first_task = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
    first_request = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=first_task,
        scraper_start_time=start_time,
    )

    scraper = FakeScraper('test')
    scraper.BATCH_WRITES = False

    # First pass: populate_data reports persist=False.
    scraper._generic_scrape(first_request)

    # The ingest info must travel with the queued request because it was
    # not persisted.
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=follow_up_task,
            scraper_start_time=start_time,
            ingest_info=self.ii,
        ),
    ]

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_get_more.assert_called_once_with(TEST_HTML, first_task)
    self.assertCountEqual(expected_tasks, scraper.tasks)

    # Second pass: same ingest info, but now populate_data asks to persist.
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    queued_request = scraper.tasks[0]
    scraper._generic_scrape(queued_request)

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 2)
    self.assertEqual(mock_write.call_count, 1)

    scraper_region = scraper.region
    expected_metadata = IngestMetadata(
        scraper_region.region_code,
        scraper_region.jurisdiction_id,
        start_time,
        scraper.get_enum_overrides(),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
def testConvert_MultipleOpenBookings_RaisesValueError(self):
    """Conversion raises ValueError for a person holding two bookings that
    each have an admission date and no release date."""
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(
        ingest_time=_INGEST_TIME)
    two_booking_info = IngestInfo()
    two_booking_info.people.add(booking_ids=["BOOKING_ID1", "BOOKING_ID2"])
    two_booking_info.bookings.add(
        booking_id="BOOKING_ID1", admission_date="3/14/2020")
    two_booking_info.bookings.add(
        booking_id="BOOKING_ID2", admission_date="3/16/2020")

    # Act + Assert
    with self.assertRaises(ValueError):
        self._convert_and_throw_on_errors(two_booking_info, ingest_metadata)
def validate_and_return_populate_data(self, content,
                                      expected_ingest_info=None,
                                      expected_single_counts=None,
                                      expected_persist=True,
                                      task=None, info=None):
    """Runs `populate_data` on the scraper and validates its output.

    Args:
        content: the content of the page to pass into `populate_data`.
        expected_ingest_info: the ingest info expected on the object
            returned from `populate_data`. If `None`, the returned
            object's `ingest_info` is expected to be `None`.
        expected_single_counts: the list of SingleCounts expected to be
            returned from `populate_data`; `None` is treated as an empty
            list.
        expected_persist: the expected value of `persist` on the object
            returned from `populate_data`.
        task: the task that is being processed, optional. Defaults to a
            SCRAPE_DATA task with an empty endpoint.
        info: an ingest_info to use if provided; otherwise a fresh
            `IngestInfo` is created.

    Returns:
        The result from `populate_data` in case the user needs to do any
        extra validations on the output.
    """
    info = info or ingest_info.IngestInfo()
    task = task or Task(task_type=constants.TaskType.SCRAPE_DATA,
                        endpoint='')

    scrape_data = self.scraper.populate_data(content, task, info)

    # Dump both sides so a mismatch can be inspected in the test output.
    print('FINAL')
    print(scrape_data.ingest_info)
    print('EXPECTED')
    print(expected_ingest_info)

    # Use unittest assertions rather than bare `assert`: they are not
    # stripped when Python runs with -O and they report both operands.
    self.assertEqual(scrape_data.ingest_info, expected_ingest_info)
    self.assertCountEqual(scrape_data.single_counts,
                          expected_single_counts or [])

    metadata = IngestMetadata(self.scraper.region.region_code,
                              self.scraper.region.jurisdiction_id,
                              _FAKE_SCRAPER_START_TIME,
                              self.scraper.get_enum_overrides())

    self.validate_ingest(scrape_data.ingest_info, expected_ingest_info,
                         metadata)

    self.assertEqual(scrape_data.persist, expected_persist)

    return scrape_data
def write_record() -> Tuple[str, HTTPStatus]:
    """Stub endpoint: writes placeholder (all-None) data and reports
    NOT_IMPLEMENTED."""
    region = None
    jurisdiction_id = None
    last_scraped_time = None
    ingest_info = None

    region_tags = {monitoring.TagKey.REGION: region}
    with monitoring.push_tags(region_tags):
        metadata = IngestMetadata(region, jurisdiction_id, last_scraped_time)  # type: ignore
        persistence.write(ingest_info, metadata)  # type: ignore

        return "", HTTPStatus.NOT_IMPLEMENTED
def write_record():
    """Stub endpoint: writes placeholder (all-None) data and reports
    NOT_IMPLEMENTED."""
    # TODO: Something like `ingest_info = protobuf.read(request.data)`
    ingest_info = last_scraped_time = region = jurisdiction_id = None

    region_tags = {monitoring.TagKey.REGION: region}
    with monitoring.push_tags(region_tags):
        metadata = IngestMetadata(
            region, jurisdiction_id, last_scraped_time)
        persistence.write(ingest_info, metadata)

        return '', HTTPStatus.NOT_IMPLEMENTED
def testParsePerson_NoNames_FullNameIsNone(self):
    """A person proto with only a person_id builds an entity with just an
    external_id set."""
    # Arrange
    default_metadata = IngestMetadata.new_with_defaults()
    proto_person = ingest_info_pb2.Person(person_id="1234")

    # Act
    person.copy_fields_to_builder(
        self.subject, proto_person, default_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(external_id="1234"),
    )
def for_state(
    cls,
    region: str,
    enum_overrides: Optional[EnumOverrides] = None,
) -> IngestMetadata:
    """Build state-level IngestMetadata for `region`.

    Uses an empty jurisdiction id, a fixed ingest time and the canonical
    STATE database key. Falsy `enum_overrides` are replaced with
    `EnumOverrides.empty()`.
    """
    fixed_ingest_time = datetime.datetime(2020, 4, 14, 12, 31, 0)
    if not enum_overrides:
        enum_overrides = EnumOverrides.empty()
    state_database_key = SQLAlchemyDatabaseKey.canonical_for_schema(
        SchemaType.STATE
    )

    return IngestMetadata(
        region=region,
        jurisdiction_id="",
        ingest_time=fixed_ingest_time,
        enum_overrides=enum_overrides,
        system_level=SystemLevel.STATE,
        database_key=state_database_key,
    )
def persist_to_database(
    region_code: str, session_start_time: datetime.datetime
) -> bool:
    """Persist a region's batched ingest infos from Datastore to the database.

    Returns False without writing when no ingest infos were found, when the
    merged proto has no people, or when `_should_abort` rejects the number
    of failed tasks. Otherwise returns the result of `persistence.write`,
    and on a successful write deletes the region's batched ingest infos.
    """
    region = regions.get_region(region_code)
    overrides = region.get_scraper_enum_overrides()

    batch_data = _get_batch_ingest_info_list(region_code, session_start_time)
    logging.info("Received %s total ingest infos", len(batch_data))

    # Guard: nothing to do when Datastore returned no ingest infos.
    if not batch_data:
        logging.error("No ingest infos received from Datastore")
        return False

    proto, failed_tasks = _get_proto_from_batch_ingest_info_data_list(batch_data)

    if not proto.people:
        logging.error("Scrape session returned 0 people.")
        return False

    for failed_datum in failed_tasks.values():
        logging.error(
            "Task with trace_id %s failed with error %s",
            failed_datum.trace_id,
            failed_datum.error,
        )

    failed_count = len(failed_tasks)
    if _should_abort(failed_count, len(proto.people)):
        logging.error(
            "Too many scraper tasks failed(%s), aborting write", failed_count
        )
        return False

    metadata = IngestMetadata(
        region=region_code,
        jurisdiction_id=region.jurisdiction_id,
        ingest_time=session_start_time,
        facility_id=region.facility_id,
        enum_overrides=overrides,
        system_level=SystemLevel.COUNTY,
        database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
    )

    did_write = persistence.write(proto, metadata)
    if did_write:
        # Only clear the batched data once it has been written.
        datastore_ingest_info.batch_delete_ingest_infos_for_region(region_code)
    return did_write
def _commit_person(person: SchemaPersonType,
                   system_level: SystemLevel,
                   ingest_time: datetime.datetime):
    """Merge `person` into a new session, update snapshots and commit.

    Args:
        person: the schema person to merge into the session.
        system_level: selects the schema base used to open the session and
            is recorded on the IngestMetadata.
        ingest_time: the ingest time recorded on the IngestMetadata passed
            to `update_historical_snapshots`.
    """
    act_session = SessionFactory.for_schema_base(
        schema_base_for_system_level(system_level))
    try:
        merged_person = act_session.merge(person)

        metadata = IngestMetadata(region='somewhere',
                                  jurisdiction_id='12345',
                                  ingest_time=ingest_time,
                                  system_level=system_level)
        update_historical_snapshots(
            act_session, [merged_person], [], metadata)

        act_session.commit()
    finally:
        # Always release the session, even when merge, the snapshot update
        # or commit raises; otherwise the session would leak.
        act_session.close()
def testParsePerson_NoiseInPlaceOfResidence_ParsesResidencyStatus(self):
    """'transient moves around' is parsed as a TRANSIENT residency status."""
    # Arrange
    allen_metadata = IngestMetadata.new_with_defaults(region='us_ky_allen')
    proto_person = ingest_info_pb2.Person(
        place_of_residence='transient moves around')

    # Act
    person.copy_fields_to_builder(
        self.subject, proto_person, allen_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            residency_status=ResidencyStatus.TRANSIENT,
            region='us_ky_allen'),
    )
def test_scrape_data_and_more_yes_persist(
    self,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_populate: Mock,
    mock_write: Mock,
) -> None:
    """With persist=True the ingest info is written once and the queued
    follow-up request carries no ingest info."""
    mock_get_more.return_value = [TEST_TASK]
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )

    start_time = datetime.datetime.now()
    scrape_and_more_task = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE
    )
    request = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=scrape_and_more_task,
        scraper_start_time=start_time,
    )

    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(request)

    # The ingest info was persisted, so the queued request is built
    # without one.
    follow_up_request = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=TEST_TASK,
        scraper_start_time=start_time,
    )
    expected_tasks = [follow_up_request]

    scraper_region = scraper.region
    expected_metadata = IngestMetadata(
        region=scraper_region.region_code,
        jurisdiction_id=scraper_region.jurisdiction_id,
        ingest_time=start_time,
        enum_overrides=scraper.get_enum_overrides(),
        system_level=SystemLevel.COUNTY,
        database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 1)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
    self.assertCountEqual(expected_tasks, scraper.tasks)
def testParsePerson_InfersBirthdateFromAge(self, mock_datetime):
    """An age of '27' yields a birthdate of January 1st, 27 years before
    the mocked current year, flagged as inferred."""
    # Arrange
    mock_datetime.now.return_value = _NOW
    default_metadata = IngestMetadata.new_with_defaults()
    proto_person = ingest_info_pb2.Person(age='27')

    # Act
    person.copy_fields_to_builder(
        self.subject, proto_person, default_metadata)
    built_person = self.subject.build()

    # Assert
    inferred_birthdate = datetime(
        year=_NOW.year - 27, month=1, day=1).date()
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            birthdate=inferred_birthdate,
            birthdate_inferred_from_age=True),
    )
def infer_release_on_open_bookings(region_code: str,
                                   last_ingest_time: datetime.datetime,
                                   custody_status: CustodyStatus) -> None:
    """Infer releases for a region's open bookings not seen since
    `last_ingest_time`.

    Reads the people in `region_code` with open bookings scraped before
    `last_ingest_time`, strips their PII, applies an inferred release
    (dated `last_ingest_time`, with the given custody status) to their
    bookings and writes the result back. The session is rolled back on any
    exception and is always closed.

    Args:
        region_code: the region_code
        last_ingest_time: The last time complete data was ingested for this
            region. In the normal ingest pipeline, this is the last start
            time of a background scrape for the region.
        custody_status: The custody status to be marked on the found open
            bookings.
    """
    session = SessionFactory.for_schema_base(JailsBase)
    try:
        logging.info("Reading all bookings that happened before [%s]",
                     last_ingest_time)
        people = county_dao.read_people_with_open_bookings_scraped_before_time(
            session, region_code, last_ingest_time)

        logging.info(
            "Found [%s] people with bookings that will be inferred released",
            len(people),
        )
        for open_booking_person in people:
            persistence_utils.remove_pii_for_person(open_booking_person)
            _infer_release_date_for_bookings(
                open_booking_person.bookings, last_ingest_time, custody_status)

        db_people = converter.convert_entity_people_to_schema_people(people)
        write_metadata = IngestMetadata(
            region=region_code,
            jurisdiction_id="",
            ingest_time=last_ingest_time,
        )
        database.write_people(session, db_people, write_metadata)
        session.commit()
    except Exception:
        # Undo any partial work before propagating the failure.
        session.rollback()
        raise
    finally:
        session.close()
def for_county(
    cls,
    region: str,
    jurisdiction_id: Optional[str] = None,
    ingest_time: Optional[datetime.datetime] = None,
    enum_overrides: Optional[EnumOverrides] = None,
    facility_id: Optional[str] = None,
) -> IngestMetadata:
    """Build county-level IngestMetadata for `region`.

    Falsy optional arguments fall back to placeholders: the literal
    "jurisdiction_id", a fixed ingest time, and `EnumOverrides.empty()`.
    `facility_id` is passed through unchanged.
    """
    if not jurisdiction_id:
        jurisdiction_id = "jurisdiction_id"
    if not ingest_time:
        ingest_time = datetime.datetime(2020, 4, 14, 12, 31, 0)
    if not enum_overrides:
        enum_overrides = EnumOverrides.empty()
    jails_database_key = SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS)

    return IngestMetadata(
        region=region,
        jurisdiction_id=jurisdiction_id,
        ingest_time=ingest_time,
        enum_overrides=enum_overrides,
        facility_id=facility_id,
        system_level=SystemLevel.COUNTY,
        database_key=jails_database_key,
    )
def testParsePerson_ResidenceAndStatusCombined(self):
    """'42164 homeless' yields both resident_of_region=True and a HOMELESS
    residency status."""
    # Arrange
    allen_metadata = IngestMetadata.new_with_defaults(region="us_ky_allen")
    proto_person = ingest_info_pb2.Person(place_of_residence="42164 homeless")

    # Act
    person.copy_fields_to_builder(self.subject, proto_person, allen_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            resident_of_region=True,
            residency_status=ResidencyStatus.HOMELESS,
            region="us_ky_allen",
        ),
    )
def test_write_noPeople(self):
    """Writing an empty IngestInfo leaves no people to read back."""
    # Arrange
    most_recent_scrape_time = SCRAPER_START_DATETIME + timedelta(days=1)
    write_metadata = IngestMetadata.new_with_defaults(
        region=REGION_1,
        jurisdiction_id=JURISDICTION_ID,
        ingest_time=most_recent_scrape_time)
    empty_ingest_info = IngestInfo()

    # Act
    persistence.write(empty_ingest_info, write_metadata)

    # Assert
    read_session = SessionFactory.for_schema_base(JailsBase)
    stored_people = county_dao.read_people(read_session)
    self.assertFalse(stored_people)
def testConvert_TotalBondWithMultipleBonds_ThrowsException(self):
    """Conversion raises ValueError when a booking has a total bond amount
    and its charges reference two different bonds."""
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(
        ingest_time=_INGEST_TIME)

    multi_bond_info = IngestInfo()
    multi_bond_info.people.add(booking_ids=['BOOKING_ID'])
    multi_bond_info.bookings.add(
        booking_id='BOOKING_ID',
        total_bond_amount='$100',
        charge_ids=['CHARGE_ID', 'CHARGE_ID_2'])
    multi_bond_info.charges.add(charge_id='CHARGE_ID', bond_id='BOND_ID')
    multi_bond_info.charges.add(charge_id='CHARGE_ID_2', bond_id='BOND_ID_2')
    multi_bond_info.bonds.add(bond_id='BOND_ID')
    multi_bond_info.bonds.add(bond_id='BOND_ID_2')

    # Act + Assert
    with self.assertRaises(ValueError):
        self._convert_and_throw_on_errors(multi_bond_info, ingest_metadata)
def testParseStatePerson_TakesLastZipCodeMatch(self):
    """An address with a 5-digit street number and a trailing zip code is
    upper-cased and parsed as a PERMANENT residency."""
    # Arrange
    nd_metadata = IngestMetadata.new_with_defaults(region='us_nd')
    # 5-digit address could be mistaken for a zip code
    proto_person = ingest_info_pb2.StatePerson(
        current_address='12345 Main 58503')

    # Act
    state_person.copy_fields_to_builder(
        self.subject, proto_person, nd_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.StatePerson.new_with_defaults(
            current_address='12345 MAIN 58503',
            residency_status=ResidencyStatus.PERMANENT),
    )
def testParsePerson_NotResidentOfCounty(self):
    """A zip code outside the county region yields resident_of_region=False."""
    # Arrange
    allen_metadata = IngestMetadata.new_with_defaults(region='us_ky_allen')
    # 40601 is in Frankfort
    proto_person = ingest_info_pb2.Person(
        place_of_residence='123 Main 40601')

    # Act
    person.copy_fields_to_builder(
        self.subject, proto_person, allen_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            resident_of_region=False,
            residency_status=ResidencyStatus.PERMANENT,
            region='us_ky_allen'),
    )
def testParseStatePerson_NoiseInPlaceOfResidence_ParsesResidency(self):
    """'transient moves around' is upper-cased and parsed as a TRANSIENT
    residency status for a state person."""
    # Arrange
    xx_metadata = IngestMetadata.new_with_defaults(region='us_xx')
    proto_person = ingest_info_pb2.StatePerson(
        current_address='transient moves around')

    # Act
    state_person.copy_fields_to_builder(
        self.subject, proto_person, xx_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.StatePerson.new_with_defaults(
            current_address='TRANSIENT MOVES AROUND',
            residency_status=ResidencyStatus.TRANSIENT,
            state_code='US_XX'),
    )
def testParseBooking_SetsDefaults(self):
    """An empty booking proto gets its admission date and seen times from
    the ingest time, plus a PRESENT_WITHOUT_INFO custody status."""
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(
        ingest_time=_INGEST_TIME)
    empty_booking = ingest_info_pb2.Booking()

    # Act
    booking.copy_fields_to_builder(
        self.subject, empty_booking, ingest_metadata)
    built_booking = self.subject.build()

    # Assert
    self.assertEqual(
        built_booking,
        entities.Booking.new_with_defaults(
            admission_date=_INGEST_TIME.date(),
            admission_date_inferred=True,
            last_seen_time=_INGEST_TIME,
            first_seen_time=_INGEST_TIME,
            custody_status=CustodyStatus.PRESENT_WITHOUT_INFO),
    )
def testParsePerson_TakesLastZipCodeMatch(self):
    """With a 5-digit street number and a trailing zip code, the person is
    parsed as a permanent resident of the region."""
    # Arrange
    allen_metadata = IngestMetadata.new_with_defaults(region="us_ky_allen")
    # 5-digit address could be mistaken for a zip code
    proto_person = ingest_info_pb2.Person(
        place_of_residence="12345 Main 42164"
    )

    # Act
    person.copy_fields_to_builder(self.subject, proto_person, allen_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            resident_of_region=True,
            residency_status=ResidencyStatus.PERMANENT,
            region="us_ky_allen",
        ),
    )
def testParsePerson_NotResidentOfState(self):
    """A zip code outside the state region yields resident_of_region=False."""
    # Arrange
    ky_metadata = IngestMetadata.new_with_defaults(region="us_ky")
    # 10011 is in New York
    proto_person = ingest_info_pb2.Person(
        place_of_residence="123 Main 10011"
    )

    # Act
    person.copy_fields_to_builder(self.subject, proto_person, ky_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            resident_of_region=False,
            residency_status=ResidencyStatus.PERMANENT,
            region="us_ky",
        ),
    )
def testParsePerson_ResidentOfCounty(self):
    """A zip code inside the county region yields resident_of_region=True."""
    # Arrange
    allen_metadata = IngestMetadata.new_with_defaults(region="us_ky_allen")
    # 42164 is in Allen
    proto_person = ingest_info_pb2.Person(
        place_of_residence="123 Main 42164"
    )

    # Act
    person.copy_fields_to_builder(self.subject, proto_person, allen_metadata)
    built_person = self.subject.build()

    # Assert
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(
            resident_of_region=True,
            residency_status=ResidencyStatus.PERMANENT,
            region="us_ky_allen",
        ),
    )
def _commit_person(
    person: SchemaPersonType,
    system_level: SystemLevel,
    ingest_time: datetime.datetime,
):
    """Merge `person` into a session on the canonical database for
    `system_level` and update historical snapshots for it.

    The session comes from the `SessionFactory.using_database` context
    manager, which owns its lifecycle.
    """
    schema_type = system_level.schema_type()
    db_key = SQLAlchemyDatabaseKey.canonical_for_schema(schema_type)

    with SessionFactory.using_database(db_key) as act_session:
        merged_person = act_session.merge(person)
        snapshot_metadata = IngestMetadata(
            region="somewhere",
            jurisdiction_id="12345",
            ingest_time=ingest_time,
            system_level=system_level,
            database_key=db_key,
        )
        update_historical_snapshots(
            act_session, [merged_person], [], snapshot_metadata
        )
def testConvert_TotalBondWithCharge_SetsTotalBondOnCharge(self):
    """A booking's total bond amount is converted into a CASH bond on its
    single charge."""
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(ingest_time=_INGEST_TIME)
    total_bond_info = IngestInfo()
    total_bond_info.people.add(booking_ids=["BOOKING_ID"])
    total_bond_info.bookings.add(
        booking_id="BOOKING_ID",
        total_bond_amount="$100",
        charge_ids=["CHARGE_ID"],
    )
    total_bond_info.charges.add(charge_id="CHARGE_ID")

    # Act
    converted_people = self._convert_and_throw_on_errors(
        total_bond_info, ingest_metadata
    )

    # Assert
    # Build the expected entity tree from the innermost entity outwards.
    expected_bond = Bond.new_with_defaults(
        amount_dollars=100,
        status=BondStatus.PRESENT_WITHOUT_INFO,
        bond_type=BondType.CASH,
    )
    expected_charge = Charge.new_with_defaults(
        external_id="CHARGE_ID_COUNT_1",
        status=ChargeStatus.PRESENT_WITHOUT_INFO,
        bond=expected_bond,
    )
    expected_booking = Booking.new_with_defaults(
        external_id="BOOKING_ID",
        admission_date=_INGEST_TIME.date(),
        admission_date_inferred=True,
        first_seen_time=_INGEST_TIME,
        last_seen_time=_INGEST_TIME,
        custody_status=CustodyStatus.PRESENT_WITHOUT_INFO,
        charges=[expected_charge],
    )
    expected_people = [Person.new_with_defaults(bookings=[expected_booking])]

    self.assertEqual(converted_people, expected_people)
def testParsePerson_WithSurnameAndGivenNames_UsesFullNameAsJson(self):
    """Name parts are combined into a JSON full_name, with double quotes in
    the surname escaped."""
    # Arrange
    default_metadata = IngestMetadata.new_with_defaults()
    proto_person = ingest_info_pb2.Person(
        surname='UNESCAPED,SURNAME"WITH-CHARS"',
        given_names='GIVEN_NAMES',
        middle_names='MIDDLE_NAMES')

    # Act
    person.copy_fields_to_builder(
        self.subject, proto_person, default_metadata)
    built_person = self.subject.build()

    # Assert
    full_name_template = (
        '{{"given_names": "{}", "middle_names": "{}", "surname": "{}"}}')
    expected_full_name = full_name_template.format(
        'GIVEN_NAMES', 'MIDDLE_NAMES', 'UNESCAPED,SURNAME\\"WITH-CHARS\\"')
    self.assertEqual(
        built_person,
        entities.Person.new_with_defaults(full_name=expected_full_name),
    )
def testParseBooking(self):
    """A fully populated booking proto is parsed into dates, enums and
    upper-cased raw text, with seen times taken from the ingest time."""
    # Arrange
    ingest_metadata = IngestMetadata.new_with_defaults(ingest_time=_INGEST_TIME)
    proto_booking = ingest_info_pb2.Booking(
        booking_id="BOOKING_ID",
        admission_date="2/3/1000",
        admission_reason="New Commitment",
        release_date="1/2/2017",
        projected_release_date="5/20/2020",
        release_reason="Transfer",
        custody_status="Held Elsewhere",
        classification="Low",
    )

    # Act
    booking.copy_fields_to_builder(self.subject, proto_booking, ingest_metadata)
    built_booking = self.subject.build()

    # Assert
    expected_admission = date(year=1000, month=2, day=3)
    expected_release = date(year=2017, month=1, day=2)
    expected_projected_release = date(year=2020, month=5, day=20)
    expected_booking = entities.Booking.new_with_defaults(
        external_id="BOOKING_ID",
        admission_date=expected_admission,
        admission_reason=AdmissionReason.NEW_COMMITMENT,
        admission_reason_raw_text="NEW COMMITMENT",
        admission_date_inferred=False,
        release_date=expected_release,
        release_date_inferred=False,
        projected_release_date=expected_projected_release,
        release_reason=ReleaseReason.TRANSFER,
        release_reason_raw_text="TRANSFER",
        custody_status=CustodyStatus.HELD_ELSEWHERE,
        custody_status_raw_text="HELD ELSEWHERE",
        classification=Classification.LOW,
        classification_raw_text="LOW",
        last_seen_time=_INGEST_TIME,
        first_seen_time=_INGEST_TIME,
    )
    self.assertEqual(built_booking, expected_booking)
def testParseCharge_MapAcrossFields(self):
    """Enum overrides can map a raw degree value onto the charge class and
    a raw class value onto the degree."""
    # Arrange
    cross_field_overrides = EnumOverrides.Builder()
    cross_field_overrides.add('FELONY', ChargeClass.FELONY, ChargeDegree)
    cross_field_overrides.add('FIRST DEGREE', ChargeDegree.FIRST, ChargeClass)
    override_metadata = IngestMetadata.new_with_defaults(
        enum_overrides=cross_field_overrides.build())
    proto_charge = ingest_info_pb2.Charge(
        charge_class='first degree', degree='felony')

    # Act
    charge.copy_fields_to_builder(
        self.subject, proto_charge, override_metadata)
    built_charge = self.subject.build()

    # Assert
    self.assertEqual(
        built_charge,
        entities.Charge.new_with_defaults(
            degree=ChargeDegree.FIRST,
            degree_raw_text='FELONY',
            charge_class=ChargeClass.FELONY,
            class_raw_text='FIRST DEGREE',
            status=ChargeStatus.PRESENT_WITHOUT_INFO),
    )