def test_good_table(self):
    """Checks extraction from a well modelled table.

    The expected result is a single person whose only populated field is
    the birthdate.
    """
    expected = IngestInfo()
    expected.create_person(birthdate="1/15/2048")

    extracted = self.extract("good_table.html", "good_table.yaml")

    self.assertEqual(expected, extracted)
def testParse(self):
    """Parses the roster fixture and validates it against a hand-built
    IngestInfo holding two people, their bookings and their charges."""
    middlesex = regions.get_region("us_ma_middlesex", is_direct_ingest=True)
    ingestor = middlesex.get_ingestor()
    metadata = IngestMetadata(
        middlesex.region_code,
        middlesex.jurisdiction_id,
        _FAKE_START_TIME,
        ingestor.get_enum_overrides(),
    )

    parsed_info = UsMaMiddlesexParser().parse(_ROSTER_JSON)

    expected_info = IngestInfo()

    # First person: one booking with two charges (the second carrying a
    # bond) and one hold.
    first_person = expected_info.create_person(
        person_id="12345 ",
        birthdate="1111-01-01 00:00:00.000",
        gender="M",
        ethnicity="HISPANIC",
        place_of_residence="123 ST DORCHESTER MA 01234 ",
    )
    first_booking = first_person.create_booking(
        booking_id="1.0",
        admission_date="2017-01-01 00:00:00.000",
        admission_reason="BAIL MITTIMUS",
        facility="MAIN ",
    )
    first_booking.create_charge(
        charge_id="1245.0",
        statute="90/24/K",
        name="OUI-LIQUOR, 2ND OFFENSE c90 ss24",
        case_number="111.0",
        court_type="Middlesex SC (81)",
        charge_notes="Other",
    )
    dismissed_charge = first_booking.create_charge(
        charge_id="1502.0",
        offense_date="2017-01-28 00:00:00",
        statute="90/23/J",
        name="OUI while license suspended for OUI",
        case_number="222.0",
        court_type="Middlesex SC (81)",
        charge_notes="Drug or Alcohol",
        status="DISMISSED",
    )
    dismissed_charge.create_bond(bond_id="12345.0")
    first_booking.create_hold(
        hold_id="00000.0", jurisdiction_name="Middlesex SC (81)"
    )

    # Second person: one booking with an arrest and a single charge.
    second_person = expected_info.create_person(
        person_id="10472 ",
        birthdate="1111-02-02 00:00:00.000",
        gender="M",
        race="BLACK or AFRICAN AMERICAN",
        place_of_residence="456 ST MALDEN MA 98765 ",
    )
    second_booking = second_person.create_booking(
        booking_id="333.0",
        admission_date="2018-02-02 00:00:00.000",
        admission_reason="SENTENCE MITTIMUS",
        facility="MAIN ",
    )
    second_booking.create_arrest(agency="Cambridge PD")
    second_booking.create_charge(
        charge_id="12341234.0",
        statute="269/10/J",
        name="FIREARM, CARRY WITHOUT LICENSE c269 ss10",
        case_number="555.0",
        charge_notes="Other",
        court_type="Cambridge DC (52)",
    )

    self.validate_ingest(parsed_info, expected_info, metadata)
def test_child_first(self):
    """Tests that in multi_key mappings (columns in a table), parent
    objects are created where needed.

    The expectation is one person with two bookings, each booking holding
    exactly one charge.
    """
    expected = IngestInfo()
    person = expected.create_person()
    for admitted_on, charge_name in (("111", "AAA"), ("222", "BBB")):
        booking = person.create_booking(admission_date=admitted_on)
        booking.create_charge(name=charge_name)

    extracted = self.extract("child_first.html", "child_first.yaml")

    self.assertEqual(expected, extracted)
def test_bond_multi_key(self) -> None:
    """Expects three charges under a single booking, each with its own
    bond, when extracting bonds.html with bonds.yaml."""
    expected = IngestInfo()
    shared_booking = expected.create_person().create_booking()
    for bond_id, amount in (("1", "10"), ("2", "20"), ("3", "30")):
        charge = shared_booking.create_charge()
        charge.create_bond(bond_id=bond_id, amount=amount)

    extracted = self.extract("bonds.html", "bonds.yaml")

    self.assertEqual(expected, extracted)
def test_child_first(self) -> None:
    """Tests that in multi_key mappings (columns in a table), parent
    objects are created where needed."""
    expected = IngestInfo()
    person = expected.create_person()

    # Each charge sits under its own booking.
    first_booking = person.create_booking(admission_date="111")
    first_booking.create_charge(name="AAA")
    second_booking = person.create_booking(admission_date="222")
    second_booking.create_charge(name="BBB")

    extracted = self.extract("child_first.html", "child_first.yaml")

    self.assertEqual(expected, extracted)
def test_th_rows(self) -> None:
    """Tests a yaml file with <th> keys in rows."""
    expected = IngestInfo()
    expected.create_person(race="WHITE", gender="M")

    extracted = self.extract("th_rows.html", "th_rows.yaml")

    self.assertEqual(expected, extracted)
def test_bond_multi_key(self):
    """Expects one booking carrying three charges, each of which owns a
    single bond with its own id and amount."""
    expected = IngestInfo()
    booking = expected.create_person().create_booking()

    first_charge = booking.create_charge()
    first_charge.create_bond(bond_id="1", amount="10")
    second_charge = booking.create_charge()
    second_charge.create_bond(bond_id="2", amount="20")
    third_charge = booking.create_charge()
    third_charge.create_bond(bond_id="3", amount="30")

    extracted = self.extract("bonds.html", "bonds.yaml")

    self.assertEqual(expected, extracted)
def test_th_rows(self):
    """Tests a yaml file with <th> keys in rows.

    Only the person's race and gender are expected to be populated.
    """
    expected = IngestInfo()
    expected.create_person(gender="M", race="WHITE")

    extracted = self.extract("th_rows.html", "th_rows.yaml")

    self.assertEqual(expected, extracted)
def test_readPeopleWithOpenBookings(self):
    """Stores three people and checks that only the one with a matching
    full name and an open booking is returned by the DAO."""
    admitted = datetime.datetime(2018, 6, 20)
    released = datetime.date(2018, 7, 20)

    in_custody_booking = Booking(
        custody_status=CustodyStatus.IN_CUSTODY.value,
        admission_date=admitted,
        first_seen_time=admitted,
        last_seen_time=admitted,
    )
    released_booking = Booking(
        custody_status=CustodyStatus.RELEASED.value,
        admission_date=admitted,
        release_date=released,
        first_seen_time=admitted,
        last_seen_time=admitted,
    )

    # Open booking, but no full name.
    person_no_match = Person(
        person_id=1,
        region=_REGION,
        jurisdiction_id=_JURISDICTION_ID,
        bookings=[deepcopy(in_custody_booking)],
    )
    # Open booking and the full name that is searched for.
    person_match_full_name = Person(
        person_id=2,
        region=_REGION,
        jurisdiction_id=_JURISDICTION_ID,
        bookings=[deepcopy(in_custody_booking)],
        full_name=_FULL_NAME,
    )
    # Matching full name, but the only booking is closed.
    person_no_open_bookings = Person(
        person_id=6,
        region=_REGION,
        jurisdiction_id=_JURISDICTION_ID,
        full_name=_FULL_NAME,
        bookings=[released_booking],
    )

    with SessionFactory.using_database(
        self.database_key, autocommit=False
    ) as session:
        for stored_person in (
            person_no_match,
            person_no_open_bookings,
            person_match_full_name,
        ):
            session.add(stored_person)
        session.commit()

        ingested = IngestInfo()
        ingested.create_person(full_name=_FULL_NAME, person_id=_EXTERNAL_ID)
        people = dao.read_people_with_open_bookings(
            session, _REGION, ingested.people
        )

        expected_people = [
            converter.convert_schema_object_to_entity(person_match_full_name)
        ]
        self.assertCountEqual(people, expected_people)
def test_one_to_many(self):
    """Extracts a single 'Sentence Length' cell and expects it to populate
    both min_length and max_length of one sentence."""
    relative_yaml = "../testdata/data_extractor/yaml/one_to_many.yaml"
    yaml_path = os.path.join(os.path.dirname(__file__), relative_yaml)
    extractor = HtmlDataExtractor(yaml_path)

    expected = IngestInfo()
    booking = expected.create_person().create_booking()
    booking.create_charge().create_sentence(
        min_length="1 day", max_length="1 day"
    )

    page = html.fromstring("<td>Sentence Length</td><td>1 day</td>")
    extracted = extractor.extract_and_populate_data(page)

    self.assertEqual(expected, extracted)
def test_single_page_roster(self):
    """Tests that bookings are not treated as multi-key classes, i.e. we
    assume that a person has at most one booking if they are listed in
    columns."""
    roster = (
        ("PERSON ONE", "1/1/1111", "NUMBER ONE"),
        ("PERSON TWO", "2/2/2222", "NUMBER TWO"),
        ("PERSON THREE", "3/3/3333", "NUMBER THREE"),
    )

    expected = IngestInfo()
    for full_name, birthdate, booking_id in roster:
        person = expected.create_person(full_name=full_name, birthdate=birthdate)
        person.create_booking(booking_id=booking_id)

    extracted = self.extract("single_page_roster.html", "single_page_roster.yaml")

    self.assertEqual(expected, extracted)
def test_no_multi_key_parent(self):
    """Tests that parent classes are created properly when a field is
    scraped whose parent is a multi-key class that has not been scraped.

    In this example, charges are multi-key classes, but a bond is scraped
    from a booking with no charge information.
    """
    expected = IngestInfo()
    booking = expected.create_person().create_booking()
    placeholder_charge = booking.create_charge()
    placeholder_charge.create_bond(bond_id="1111")

    # The extractor will warn that 'Charge Description' cannot be found.
    # This is necessary because we need a field under multi_key_mappings
    # so that charge is treated as a multi_key class.
    extracted = self.extract("no_charges.html", "charge_multi_key.yaml")

    self.assertEqual(expected, extracted)
def test_persist_duplicates_to_db(self, mock_write, _mock_region, mock_session_return):
    """Tests that duplicate ingest_info.Person objects are merged before
    write.

    Three ingest infos are written for the same scrape key: one person with
    a booking, a second distinct person, and a deep copy of the first. The
    proto handed to the mocked write is expected to contain each distinct
    person once.
    """
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    # Arrange
    ii = IngestInfo()
    ii.create_person(person_id=TEST_ID, full_name=TEST_NAME).create_booking(
        booking_id=TEST_ID
    )

    # The second person must live on ii_2. Creating it on ii left ii_2
    # empty, so the merge of a distinct ingest info was never exercised.
    ii_2 = IngestInfo()
    ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

    ii_1_dup = copy.deepcopy(ii)

    t1, t2, t3 = (
        Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT + str(i),
            response_type=constants.ResponseType.TEXT,
        )
        for i in range(3)
    )

    # Act
    batch_persistence.write(ii, scrape_key, t1)
    batch_persistence.write(ii_2, scrape_key, t2)
    batch_persistence.write(ii_1_dup, scrape_key, t3)
    batch_persistence.persist_to_database(
        scrape_key.region_code, mock_session.start
    )

    # Assert
    # NOTE(review): assumes merged people keep write order (ii, then ii_2)
    # -- confirm against persist_to_database.
    expected_ii = IngestInfo(people=ii.people + ii_2.people)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(expected_ii)
    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)
def test_jailtracker_person(self):
    """Runs the JSON extractor over the jailtracker person fixture and
    expects a single person with id, birthdate, age and race set."""
    yaml_path = os.path.join(
        os.path.dirname(__file__), "fixtures/jailtracker_person.yaml"
    )
    extractor = JsonDataExtractor(yaml_path)

    expected = IngestInfo()
    expected.create_person(
        person_id="012345",
        birthdate="12/12/0001",
        age="2018",
        race="WHITE",
    )

    extracted = extractor.extract_and_populate_data(_JT_PERSON)

    self.assertEqual(extracted, expected)
def test_jailtracker_person(self) -> None:
    """Loads the jailtracker person JSON fixture, runs the extractor on it
    and compares the result with a single hand-built person."""
    mapping_name = "fixtures/jailtracker_person.yaml"
    mapping_path = os.path.join(os.path.dirname(__file__), mapping_name)
    extractor = JsonDataExtractor(mapping_path)

    expected = IngestInfo()
    expected.create_person(
        person_id="012345", birthdate="12/12/0001", age="2018", race="WHITE"
    )

    person_json = fixtures.as_dict("extractor", "jailtracker_person.json")
    extracted = extractor.extract_and_populate_data(person_json)

    self.assertEqual(extracted, expected)
def test_persist_to_db_different_regions(self, mock_write, _mock_region, mock_session_return):
    """Writes and persists one ingest info for each of two regions and
    checks that each persist call hands the matching proto to the mocked
    write, for two write calls in total."""
    first_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    second_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    first_info = IngestInfo()
    first_person = first_info.create_person(person_id=TEST_ID, full_name=TEST_NAME)
    first_person.create_booking(booking_id=TEST_ID)

    second_info = IngestInfo()
    second_person = second_info.create_person(
        person_id=TEST_ID, full_name=TEST_NAME2
    )
    second_person.create_booking(booking_id=TEST_ID)

    first_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    second_task = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    # --- first region ---
    first_session = create_mock_session()
    mock_session_return.return_value = first_session

    batch_persistence.write(first_info, first_key, first_task)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(first_info)
    batch_persistence.persist_to_database(
        first_key.region_code, first_session.start
    )

    written_proto = mock_write.call_args[0][0]
    self.assertEqual(written_proto, expected_proto)

    # We expect the region that we persisted to have no more ingest infos.
    remaining_first = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], first_session.start
    )
    self.assertEqual(len(remaining_first), 0)

    # --- second region ---
    second_session = create_mock_session()
    mock_session_return.return_value = second_session

    batch_persistence.write(second_info, second_key, second_task)
    pending_second = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[1], second_session.start
    )
    self.assertEqual(len(pending_second), 1)

    expected_proto = ingest_utils.convert_ingest_info_to_proto(second_info)
    batch_persistence.persist_to_database(
        second_key.region_code, second_session.start
    )

    written_proto = mock_write.call_args[0][0]
    self.assertEqual(written_proto, expected_proto)

    self.assertEqual(mock_write.call_count, 2)
def test_content_is_not_modified(self):
    """Tests that the HtmlDataExtractor does not mutate |content|."""
    relative_yaml = "../testdata/data_extractor/yaml/text_label.yaml"
    yaml_path = os.path.join(os.path.dirname(__file__), relative_yaml)
    extractor = HtmlDataExtractor(yaml_path)

    expected = IngestInfo()
    expected.create_person(birthdate="1/1/1111")

    page = html.fromstring("<html><div>DOB: 1/1/1111</div></html>")
    extracted = extractor.extract_and_populate_data(page)

    self.assertEqual(expected, extracted)
    # The input had no <td> elements; none should exist after extraction.
    self.assertFalse(page.cssselect("td"))
def testParseColFail(self):
    """Expects parsing of the MDC_VERA_20200303_02 fixture to raise a
    DirectIngestError complaining about too many columns in a charge row."""
    # Both charges share the same offense date and case number.
    charges = [
        Charge(
            offense_date="02/27/2020",
            name=charge_name,
            case_number="D202CR201802134",
        )
        for charge_name in ("FAIL TO COMPLY", "AGGRAVATED DWI-3")
    ]
    booking = Booking(
        booking_id="130877687",
        admission_date="02/27/2020 14:51",
        custody_status="IN CUSTODY",
        facility="BERNALILLO COUNTY METRO DETENTION CENTER",
        arrest=Arrest(agency="/BSO"),
        charges=charges,
    )
    person = Person(
        person_id="100041685",
        gender="M",
        age="41",
        race="AMERICAN INDIAN",
        bookings=[booking],
    )
    expected_info = IngestInfo(people=[person])

    with pytest.raises(DirectIngestError) as raised:
        self.run_parse_file_test(expected_info, "MDC_VERA_20200303_02")

    # Checked after the context manager exits so the assertion really runs.
    assert str(raised.value) == "Found more columns than expected in charge row"
def write_ingest_info(
    region: str, task_hash: int, session_start_time: datetime, ingest_info: IngestInfo
) -> BatchIngestInfoData:
    """Stores a new ingest info entity for the given region.

    Args:
        region: The region the ingest info is being added for.
        task_hash: The hash of the task associated with the ingest info.
        session_start_time: The start time of the scraper session that
            produced the ingest info.
        ingest_info: The ingest info data to store.

    Returns:
        The BatchIngestInfoData built from the entity that was written.

    Raises:
        DatastoreWriteIngestInfoError: If the put still raises after going
            through retry_grpc; the underlying exception is chained as the
            cause.
    """
    people_count = len(ingest_info.get_all_people())
    logging.info(
        "Writing a new ingest info (with %d people) for region: [%s]",
        people_count,
        region,
    )

    datastore_record = _DatastoreIngestInfo.new(
        key=ds().key(INGEST_INFO_KIND),
        session_start_time=session_start_time,
        region=region,
        ingest_info=ingest_info,
        task_hash=task_hash,
    )
    entity = datastore_record.to_entity()

    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, entity)
    except Exception as write_error:
        raise DatastoreWriteIngestInfoError(ingest_info, region) from write_error

    return _DatastoreIngestInfo.get_batch_ingest_info_data(entity)
def test_partial_table(self) -> None:
    """Tests a page with a table as well as unstructured data."""
    expected = IngestInfo()
    person = expected.create_person(
        age="38", place_of_residence="WICHITA FALLS", race="HISPANIC"
    )
    booking = person.create_booking(admission_date="08/18/2017")

    charge = booking.create_charge(name="FIRST CHARGE")
    charge.charging_entity = "WICHITA FALLS PD"
    charge.create_bond(amount="25,000.00")

    extracted = self.extract("partial_table.html", "partial_table.yaml")

    self.assertEqual(expected, extracted)
def test_three_levels_multi_key(self):
    """Expects one person with four bookings, each booking holding one
    charge that in turn holds one bond."""
    expected = IngestInfo()
    person = expected.create_person()

    # Only the first booking has a release date and a bond agent.
    first_booking = person.create_booking(
        admission_date="01/01/2011", release_date="02/02/2012"
    )
    first_charge = first_booking.create_charge(name="Charge1")
    first_charge.create_bond(amount="$1.00", bond_agent="AGENT 1")

    # The remaining bookings all share the same admission date.
    later_rows = (
        ("Charge2", "$2.00"),
        ("Charge3", "$3.00"),
        ("Charge4", "$4.00"),
    )
    for charge_name, bond_amount in later_rows:
        booking = person.create_booking(admission_date="03/03/2013")
        booking.create_charge(name=charge_name).create_bond(amount=bond_amount)

    extracted = self.extract(
        "three_levels_multi_key.html", "three_levels_multi_key.yaml"
    )

    self.assertEqual(expected, extracted)
def test_single_page_roster(self):
    """Tests that bookings are not treated as multi-key classes, i.e. we
    assume that a person has at most one booking if they are listed in
    columns."""
    expected = IngestInfo()

    person_one = expected.create_person(
        full_name="PERSON ONE", birthdate="1/1/1111"
    )
    person_one.create_booking(booking_id="NUMBER ONE")

    person_two = expected.create_person(
        full_name="PERSON TWO", birthdate="2/2/2222"
    )
    person_two.create_booking(booking_id="NUMBER TWO")

    person_three = expected.create_person(
        full_name="PERSON THREE", birthdate="3/3/3333"
    )
    person_three.create_booking(booking_id="NUMBER THREE")

    extracted = self.extract("single_page_roster.html", "single_page_roster.yaml")

    self.assertEqual(expected, extracted)
def test_three_levels_multi_key(self) -> None:
    """Expects a person -> booking -> charge -> bond hierarchy with four
    bookings, built from three_levels_multi_key.html."""
    expected = IngestInfo()
    person = expected.create_person()

    booking_one = person.create_booking(
        admission_date="01/01/2011", release_date="02/02/2012"
    )
    charge_one = booking_one.create_charge(name="Charge1")
    charge_one.create_bond(amount="$1.00", bond_agent="AGENT 1")

    booking_two = person.create_booking(admission_date="03/03/2013")
    charge_two = booking_two.create_charge(name="Charge2")
    charge_two.create_bond(amount="$2.00")

    booking_three = person.create_booking(admission_date="03/03/2013")
    charge_three = booking_three.create_charge(name="Charge3")
    charge_three.create_bond(amount="$3.00")

    booking_four = person.create_booking(admission_date="03/03/2013")
    charge_four = booking_four.create_charge(name="Charge4")
    charge_four.create_bond(amount="$4.00")

    extracted = self.extract(
        "three_levels_multi_key.html", "three_levels_multi_key.yaml"
    )

    self.assertEqual(expected, extracted)
def test_partial_table(self):
    """Tests a page with a table as well as unstructured data.

    The expectation is one person with one booking, one charge and one
    bond.
    """
    expected = IngestInfo()

    person = expected.create_person(
        race="HISPANIC", place_of_residence="WICHITA FALLS", age="38"
    )
    booking = person.create_booking(admission_date="08/18/2017")
    charge = booking.create_charge()
    charge.name = "FIRST CHARGE"
    charge.charging_entity = "WICHITA FALLS PD"
    bond = charge.create_bond()
    bond.amount = "25,000.00"

    extracted = self.extract("partial_table.html", "partial_table.yaml")

    self.assertEqual(expected, extracted)
def test_labeled_fields(self) -> None:
    """Tests a page with field values in <span>s labeled by <label>s."""
    expected = IngestInfo()
    person = expected.create_person(person_id="11111", race="White", gender="Male")
    booking = person.create_booking(
        admission_date="11/12/2018 5:04 PM",
        facility="Walla Walla County Corrections Department",
    )

    charge = booking.create_charge(
        name="DUI", offense_date="9/21/2018 5:34 PM", status="Time Served"
    )
    charge.charge_class = "Gross Misdemeanor"
    # NOTE(review): create_charge presumably already attaches the charge to
    # the booking, so this append would list it a second time -- confirm
    # this is intentional.
    booking.charges.append(charge)

    extracted = self.extract("labeled_fields.html", "labeled_fields.yaml")

    self.assertEqual(expected, extracted)
def test_labeled_fields(self):
    """Tests a page with field values in <span>s labeled by <label>s.

    Builds the expected person, booking and charge field by field before
    comparing with the extractor output.
    """
    expected = IngestInfo()

    person = expected.create_person(person_id="11111")
    person.race = "White"
    person.gender = "Male"

    booking = person.create_booking(admission_date="11/12/2018 5:04 PM")
    booking.facility = "Walla Walla County Corrections Department"

    charge = booking.create_charge(name="DUI")
    charge.offense_date = "9/21/2018 5:34 PM"
    charge.charge_class = "Gross Misdemeanor"
    charge.status = "Time Served"
    # NOTE(review): the charge is appended again after create_charge;
    # presumably create_charge already registers it on the booking --
    # verify that the duplicate entry is expected.
    booking.charges.append(charge)

    extracted = self.extract("labeled_fields.html", "labeled_fields.yaml")

    self.assertEqual(expected, extracted)
def test_jailtracker_booking(self):
    """Runs the JSON extractor over the jailtracker booking fixture and
    expects one person with two bookings."""
    yaml_path = os.path.join(
        os.path.dirname(__file__), "fixtures/jailtracker_booking.yaml"
    )
    extractor = JsonDataExtractor(yaml_path)

    expected = IngestInfo()
    person = expected.create_person()
    # In each booking the admission and release dates are identical.
    for booking_id, booking_date in (("123098", "1/1/2001"), ("123099", "1/1/2002")):
        person.create_booking(
            booking_id=booking_id,
            admission_date=booking_date,
            release_date=booking_date,
        )

    extracted = extractor.extract_and_populate_data(_JT_BOOKING)

    self.assertEqual(extracted, expected)
def test_sort(self):
    """Two IngestInfos whose bookings differ only in order compare unequal
    before sort() and equal after both have been sorted."""
    first_booking = ingest_info.Booking(admission_date="1")
    second_booking = ingest_info.Booking(admission_date="2")

    in_order = IngestInfo(
        people=[ingest_info.Person(bookings=[first_booking, second_booking])]
    )
    swapped = IngestInfo(
        people=[ingest_info.Person(bookings=[second_booking, first_booking])]
    )

    self.assertNotEqual(in_order, swapped)

    in_order.sort()
    swapped.sort()

    self.assertEqual(in_order, swapped)
def test_person_with_charges(self):
    """Runs the JSON extractor over the person-with-charges fixture and
    expects one person with two bookings holding three charges in total."""
    yaml_path = os.path.join(
        os.path.dirname(__file__), "fixtures/person_with_charges.yaml"
    )
    extractor = JsonDataExtractor(yaml_path)

    expected = IngestInfo()
    person = expected.create_person(
        person_id="3245", full_name="AAA AAAB", race="BLACK"
    )

    first_booking = person.create_booking(
        booking_id="324567", admission_date="1/1/1111"
    )
    for charge_id, charge_name in (
        ("345309", "charge name 1"),
        ("894303", "charge name 2"),
    ):
        first_booking.create_charge(charge_id=charge_id, name=charge_name)

    second_booking = person.create_booking(
        booking_id="3245", admission_date="2/2/2222"
    )
    second_booking.create_charge(charge_id="42309", name="charge name 3")

    extracted = extractor.extract_and_populate_data(_PERSON_WITH_CHARGES)

    self.assertEqual(extracted, expected)
def extract_and_populate_data(self, content: Union[Dict, List], ingest_info: IngestInfo = None):
    """Populates an ingest data model from already parsed JSON content.

    The content is handed to self._extract together with the target
    IngestInfo and a fresh defaultdict(set); the pruned result is returned.

    Args:
        content: An already parsed JSON object or array.
        ingest_info: An IngestInfo object to fill in. When None, a new one
            is created.

    Returns:
        The result of calling prune() on the populated ingest info.
    """
    target_info = IngestInfo() if ingest_info is None else ingest_info
    # Fresh per call, so nothing carries over between extractions.
    extraction_state = defaultdict(set)
    self._extract(content, target_info, extraction_state)
    return target_info.prune()