def test_start_scrape_no_docket_item(
    self,
    mock_pubsub: Mock,
    mock_get_region: Mock,
    mock_tracker: Mock,
    mock_sessions: Mock,
    mock_task_manager: Mock,
) -> None:
    """When no docket item exists, start_scrape enqueues no scrape task."""
    region = "us_nd"
    scrape_type = constants.ScrapeType.BACKGROUND
    queue_name = "us_nd_scraper"
    initial_task = "trash_it"

    mock_get_region.return_value = mock_region(region, queue_name)
    # Empty docket / session lookups simulate "nothing to scrape".
    mock_tracker.return_value = None
    mock_sessions.return_value = None

    scraper = FakeScraper(region, initial_task)
    scraper.start_scrape(scrape_type)

    mock_get_region.assert_called_with(region)
    mock_tracker.assert_called_with(ScrapeKey(region, scrape_type))
    mock_sessions.assert_called_with(ScrapeKey(region, scrape_type))
    # Pubsub setup still happens even though the scrape shuts down.
    mock_pubsub.assert_called_with(ScrapeKey(region, scrape_type), BATCH_PUBSUB_TYPE)
    mock_task_manager.return_value.create_scrape_task.assert_not_called()
def test_stop_no_session(
    self, mock_sessions, mock_task_manager, mock_region, mock_supported
):
    """With no open sessions, /stop succeeds but stops and enqueues nothing."""
    mock_sessions.return_value = None
    mock_scraper = create_autospec(BaseScraper)
    mock_region.return_value = fake_region(ingestor=mock_scraper)
    mock_supported.return_value = ["us_ca", "us_ut"]

    request_args = {
        "region": "all",
        "scrape_type": "all",
        "respect_is_stoppable": "false",
    }
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/stop", query_string=request_args, headers=headers)
    assert response.status_code == 200

    # Every region/scrape-type combination should have been checked.
    mock_sessions.assert_has_calls(
        [
            call(ScrapeKey("us_ca", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("us_ca", constants.ScrapeType.SNAPSHOT)),
            call(ScrapeKey("us_ut", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)),
        ],
        any_order=True,
    )
    mock_scraper.stop_scrape.assert_not_called()
    mock_supported.assert_called_with(stripes=[], timezone=None)
    mock_task_manager.return_value.create_scraper_phase_task.assert_not_called()
def test_repr() -> None:
    """repr(ScrapeKey) includes both the region code and the scrape type."""
    scrape_key = ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)
    # Use the repr() builtin rather than calling the __repr__ dunder directly.
    representation = repr(scrape_key)
    assert representation == (
        "<ScrapeKey region_code: us_ut, scrape_type: ScrapeType.SNAPSHOT>"
    )
def test_stop_no_session(self, mock_sessions, mock_enqueue, mock_region,
                         mock_supported, client):
    """With no open sessions, /stop succeeds but stops and enqueues nothing."""
    mock_sessions.return_value = []
    mock_region.return_value = fake_region()
    mock_supported.return_value = ['us_ca', 'us_ut']

    request_args = {
        'region': 'all',
        'scrape_type': 'all',
        'respect_is_stoppable': 'false'
    }
    headers = {'X-Appengine-Cron': "test-cron"}
    response = client.get('/stop',
                          query_string=request_args,
                          headers=headers)
    assert response.status_code == 200

    # Every region/scrape-type combination should have been checked.
    mock_sessions.assert_has_calls([
        call(ScrapeKey('us_ca', constants.ScrapeType.BACKGROUND)),
        call(ScrapeKey('us_ca', constants.ScrapeType.SNAPSHOT)),
        call(ScrapeKey('us_ut', constants.ScrapeType.BACKGROUND)),
        call(ScrapeKey('us_ut', constants.ScrapeType.SNAPSHOT))
    ], any_order=True)
    # Without a session, the region's scraper must never be stopped.
    assert not mock_region.return_value.get_ingestor().\
        stop_scrape.called
    mock_supported.assert_called_with(timezone=None)
    assert not mock_enqueue.called
def test_stop_no_session(self, mock_sessions, mock_task_manager,
                         mock_region, mock_supported):
    """With no open sessions, /stop succeeds but stops and enqueues nothing."""
    mock_sessions.return_value = None
    mock_scraper = create_autospec(BaseScraper)
    mock_region.return_value = fake_region(ingestor=mock_scraper)
    mock_supported.return_value = ['us_ca', 'us_ut']

    request_args = {
        'region': 'all',
        'scrape_type': 'all',
        'respect_is_stoppable': 'false'
    }
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.get('/stop',
                               query_string=request_args,
                               headers=headers)
    assert response.status_code == 200

    # Every region/scrape-type combination should have been checked.
    mock_sessions.assert_has_calls([
        call(ScrapeKey('us_ca', constants.ScrapeType.BACKGROUND)),
        call(ScrapeKey('us_ca', constants.ScrapeType.SNAPSHOT)),
        call(ScrapeKey('us_ut', constants.ScrapeType.BACKGROUND)),
        call(ScrapeKey('us_ut', constants.ScrapeType.SNAPSHOT))
    ])
    mock_scraper.stop_scrape.assert_not_called()
    mock_supported.assert_called_with(stripes=[], timezone=None)
    mock_task_manager.return_value.create_scraper_phase_task.\
        assert_not_called()
def test_iterate_docket_item_no_matching_items(self):
    """A docket item queued for one region is not served to another's session."""
    populated_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    empty_region_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)
    self.create_session(empty_region_key)
    docket.add_to_query_docket(populated_key, get_payload()).result()
    assert not tracker.iterate_docket_item(empty_region_key)
def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                         mock_session_return):
    """Persisting one region's batch must not touch another region's data."""
    scrape_key1 = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    scrape_key2 = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    ii = ingest_info.IngestInfo()
    ii.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

    ii2 = ingest_info.IngestInfo()
    ii2.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    t2 = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    mock_session_1 = mock_session_return.return_value = create_mock_session()

    batch_persistence.write(ii, scrape_key1, t)
    expected_proto = serialization.convert_ingest_info_to_proto(ii)
    batch_persistence.persist_to_database(scrape_key1.region_code,
                                          mock_session_1.start)

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)

    # We expect the region that we persisted to have no more ingest infos.
    ingest_infos_1 = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], mock_session_1.start)
    self.assertEqual(len(ingest_infos_1), 0)

    mock_session_2 = mock_session_return.return_value = create_mock_session()

    batch_persistence.write(ii2, scrape_key2, t2)
    # The second region's batch must still be present and untouched.
    ingest_infos_2 = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[1], mock_session_2.start)
    self.assertEqual(len(ingest_infos_2), 1)

    expected_proto = serialization.convert_ingest_info_to_proto(ii2)
    batch_persistence.persist_to_database(scrape_key2.region_code,
                                          mock_session_2.start)

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)

    self.assertEqual(mock_write.call_count, 2)
def test_stop_respects_region_is_not_stoppable(
    self,
    mock_sessions,
    mock_close,
    mock_phase,
    mock_task_manager,
    mock_region,
    mock_supported,
):
    """Exercises /stop when the region reports is_stoppable=False."""
    session = sessions.ScrapeSession.new(
        key=None,
        region="us_xx",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )
    mock_sessions.return_value = session
    mock_close.return_value = [session]
    mock_scraper = create_autospec(BaseScraper)
    mock_region.return_value = fake_region(ingestor=mock_scraper)
    mock_region.return_value.is_stoppable = False
    mock_supported.return_value = ["us_ca", "us_ut"]

    # No respect_is_stoppable override in the request (unlike sibling tests).
    request_args = {"region": "all", "scrape_type": "all"}
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/stop", query_string=request_args, headers=headers)
    assert response.status_code == 200

    mock_sessions.assert_has_calls(
        [
            call(ScrapeKey("us_ca", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("us_ca", constants.ScrapeType.SNAPSHOT)),
            call(ScrapeKey("us_ut", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)),
        ]
    )
    mock_phase.assert_has_calls(
        [call(session, scrape_phase.ScrapePhase.PERSIST)] * 4
    )
    # stop_scrape is still invoked per scrape type; its second argument
    # here is None (no respect_is_stoppable value in the request).
    assert mock_scraper.stop_scrape.mock_calls == [
        call(constants.ScrapeType.BACKGROUND, None),
        call().__bool__(),
        call(constants.ScrapeType.SNAPSHOT, None),
        call().__bool__(),
        call(constants.ScrapeType.BACKGROUND, None),
        call().__bool__(),
        call(constants.ScrapeType.SNAPSHOT, None),
        call().__bool__(),
    ]
    mock_supported.assert_called_with(stripes=[], timezone=None)
    mock_task_manager.return_value.create_scraper_phase_task.assert_has_calls(
        [
            call(region_code="us_ca", url="/read_and_persist"),
            call(region_code="us_ut", url="/read_and_persist"),
        ],
        any_order=True,
    )
def test_check_for_finished_scrapers(
    self,
    mock_region: Mock,
    mock_validate_regions: Mock,
    mock_session: Mock,
    mock_task_manager: Mock,
) -> None:
    """Only a region in SCRAPE phase with no queued tasks is told to stop."""
    mock_validate_regions.return_value = ["region_x", "region_y", "region_z"]
    mock_session.side_effect = [
        # Session still in START, shouldn't be stopped
        sessions.ScrapeSession.new(
            key=None,
            region="region_x",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.START,
        ),
        # Session in SCRAPE, should be stopped
        sessions.ScrapeSession.new(
            key=None,
            region="region_y",
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.SCRAPE,
        ),
        # No session, shouldn't be stopped
        None,
    ]
    fake_region_x = create_autospec(Region)
    fake_region_x.region_code = "region_y"
    fake_region_x.get_queue_name.return_value = "queue"
    mock_region.side_effect = [fake_region_x]
    # An empty task list means region_y's scrape has finished.
    mock_task_manager.return_value.list_scrape_tasks.return_value = []

    request_args = {"region": "all"}
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get(
        "/check_finished", query_string=request_args, headers=headers
    )
    assert response.status_code == 200

    mock_validate_regions.assert_called_with(["all"])
    mock_session.assert_has_calls(
        [
            call(ScrapeKey("region_x", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("region_y", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("region_z", constants.ScrapeType.BACKGROUND)),
        ]
    )
    mock_region.assert_called_with("region_y")
    mock_task_manager.return_value.list_scrape_tasks.assert_called_with(
        region_code="region_y", queue_name="queue"
    )
    mock_task_manager.return_value.create_scraper_phase_task.assert_called_with(
        region_code="region_y", url="/stop"
    )
def test_get_new_docket_item_no_matching_items(self):
    """A docket item queued for one region is not served for another."""
    producer_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    consumer_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)
    pubsub_helper.create_topic_and_subscription(producer_key, docket.PUBSUB_TYPE)
    docket.add_to_query_docket(producer_key, get_payload()).result()
    assert not docket.get_new_docket_item(consumer_key, return_immediately=True)
def test_purge_query_docket_nothing_matching(self):
    """Purging one region's docket leaves it empty while another has items."""
    purge_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    populated_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)
    pubsub_helper.create_topic_and_subscription(populated_key, docket.PUBSUB_TYPE)
    docket.add_to_query_docket(populated_key, get_payload()).result()
    docket.purge_query_docket(purge_key)
    assert not docket.get_new_docket_item(purge_key, return_immediately=True)
def test_stop_timezone(
    self,
    mock_sessions,
    mock_close,
    mock_phase,
    mock_task_manager,
    mock_region,
    mock_supported,
):
    """/stop with a timezone filter only acts on regions in that timezone."""
    session = sessions.ScrapeSession.new(
        key=None,
        region="us_ut",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )
    mock_sessions.return_value = session
    mock_close.return_value = [session]
    mock_scraper = create_autospec(BaseScraper)
    mock_region.return_value = fake_region(ingestor=mock_scraper)
    # _MockSupported presumably narrows the region list by timezone —
    # defined elsewhere in this file.
    mock_supported.side_effect = _MockSupported

    request_args = {
        "region": "all",
        "scrape_type": "all",
        "timezone": "America/New_York",
        "respect_is_stoppable": "false",
    }
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/stop", query_string=request_args, headers=headers)
    assert response.status_code == 200

    # Only us_ut (the timezone-matching region) is processed.
    mock_sessions.assert_has_calls(
        [
            call(ScrapeKey("us_ut", constants.ScrapeType.BACKGROUND)),
            call(ScrapeKey("us_ut", constants.ScrapeType.SNAPSHOT)),
        ]
    )
    mock_phase.assert_has_calls(
        [call(session, scrape_phase.ScrapePhase.PERSIST)] * 2
    )
    mock_region.assert_has_calls([call("us_ut")])
    mock_scraper.stop_scrape.assert_called_with(
        constants.ScrapeType.SNAPSHOT, "false"
    )
    mock_supported.assert_called_with(
        stripes=[], timezone=pytz.timezone("America/New_York")
    )
    mock_task_manager.return_value.create_scraper_phase_task.assert_called_with(
        region_code="us_ut", url="/read_and_persist"
    )
def test_stop_respects_region_is_not_stoppable(self, mock_sessions, mock_close,
                                               mock_phase, mock_task_manager,
                                               mock_region, mock_supported):
    """Exercises /stop when the region reports is_stoppable=False."""
    session = sessions.ScrapeSession.new(
        key=None,
        region='us_xx',
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.SCRAPE)
    mock_sessions.return_value = session
    mock_close.return_value = [session]
    mock_scraper = create_autospec(BaseScraper)
    mock_region.return_value = fake_region(ingestor=mock_scraper)
    mock_region.return_value.is_stoppable = False
    mock_supported.return_value = ['us_ca', 'us_ut']

    # No respect_is_stoppable override in the request (unlike sibling tests).
    request_args = {'region': 'all', 'scrape_type': 'all'}
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.get('/stop',
                               query_string=request_args,
                               headers=headers)
    assert response.status_code == 200

    mock_sessions.assert_has_calls([
        call(ScrapeKey('us_ca', constants.ScrapeType.BACKGROUND)),
        call(ScrapeKey('us_ca', constants.ScrapeType.SNAPSHOT)),
        call(ScrapeKey('us_ut', constants.ScrapeType.BACKGROUND)),
        call(ScrapeKey('us_ut', constants.ScrapeType.SNAPSHOT))
    ])
    mock_phase.assert_has_calls(
        [call(session, scrape_phase.ScrapePhase.PERSIST)] * 4)
    # stop_scrape is still invoked per scrape type; its second argument
    # here is None (no respect_is_stoppable value in the request).
    assert mock_scraper.stop_scrape.mock_calls == [
        call(constants.ScrapeType.BACKGROUND, None),
        call().__bool__(),
        call(constants.ScrapeType.SNAPSHOT, None),
        call().__bool__(),
        call(constants.ScrapeType.BACKGROUND, None),
        call().__bool__(),
        call(constants.ScrapeType.SNAPSHOT, None),
        call().__bool__(),
    ]
    mock_supported.assert_called_with(stripes=[], timezone=None)
    mock_task_manager.return_value.create_scraper_phase_task.\
        assert_has_calls([
            call(region_code='us_ca', url='/read_and_persist'),
            call(region_code='us_ut', url='/read_and_persist'),
        ], any_order=True)
def test_check_for_finished_scrapers_not_done(mock_region, mock_validate_regions,
                                              mock_session, mock_list_scrape_tasks,
                                              mock_enqueue, client):
    """A region with scrape tasks still queued is not told to stop."""
    region_code = 'region_x'
    mock_session.return_value = sessions.ScrapeSession.new(
        key=None,
        region=region_code,
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.SCRAPE)
    mock_validate_regions.return_value = [region_code]
    fake_region = create_autospec(Region)
    fake_region.region_code = region_code
    fake_region.get_queue_name.return_value = 'queue'
    mock_region.return_value = fake_region
    # A non-empty task list means the scrape is still in progress.
    mock_list_scrape_tasks.return_value = ['fake_task']

    request_args = {'region': 'all'}
    headers = {'X-Appengine-Cron': "test-cron"}
    response = client.get('/check_finished',
                          query_string=request_args,
                          headers=headers)
    assert response.status_code == 200

    mock_validate_regions.assert_called_with(['all'])
    mock_session.assert_called_with(
        ScrapeKey(region_code, constants.ScrapeType.BACKGROUND))
    mock_region.assert_called_with(region_code)
    mock_list_scrape_tasks.assert_called_with(region_code=region_code,
                                              queue_name='queue')
    mock_enqueue.assert_not_called()
def test_get_current_session(self):
    """get_current_session returns the newest open session for the key."""
    # older
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 17)))
    current = self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 18)))
    # closed
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)),
        end=fix_dt(datetime(2009, 6, 21)))
    # different scrape type
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)))
    # different region
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2009, 6, 19)))

    result = sessions.get_current_session(
        ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND))
    assert result.to_entity() == current.to_entity()
def test_get_more_tasks_failure_batch(
    self,
    mock_flask: Mock,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_batch_error: Mock,
) -> None:
    """A get_more_tasks failure is recorded once in the batch error store."""
    mock_fetch.return_value = ("TEST", {})
    mock_get_more.side_effect = ValueError("TEST ERROR")
    mock_flask_get = Mock()
    mock_flask_get.return_value = "TRACE ID"
    mock_flask.headers.get = mock_flask_get

    start_time = datetime.datetime.now()
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=TEST_TASK,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    # The underlying ValueError is surfaced as a ScraperGetMoreTasksError.
    with self.assertRaises(ScraperGetMoreTasksError):
        scraper._generic_scrape(req)
    self.assertEqual(mock_batch_error.call_count, 1)

    scrape_key = ScrapeKey(
        region_code="test", scrape_type=constants.ScrapeType.BACKGROUND
    )
    mock_batch_error.assert_called_once_with(
        error="TEST ERROR",
        trace_id="TRACE ID",
        task=TEST_TASK,
        scrape_key=scrape_key,
    )
def test_scrape_data_no_more_tasks_batch(
    self,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_populate: Mock,
    mock_write: Mock,
    mock_batch_write: Mock,
) -> None:
    """A SCRAPE_DATA task with no follow-ups writes once via the batch writer."""
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    start_time = datetime.datetime.now()
    t = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
    req = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=t,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper._generic_scrape(req)
    scrape_key = ScrapeKey("test", constants.ScrapeType.BACKGROUND)

    # Data flows through the batch writer, not the direct writer.
    self.assertEqual(mock_get_more.call_count, 0)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_batch_write.assert_called_once_with(
        ingest_info=self.ii,
        task=t,
        scrape_key=scrape_key,
    )
    self.assertEqual(len(scraper.tasks), 0)
def test_write_to_datastore(self, mock_session_return):
    """write() stores a single BatchIngestInfoData keyed by the task hash."""
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    ii = ingest_info.IngestInfo()
    ii.create_person(full_name=TEST_NAME).create_booking(
        booking_id=TEST_ID)
    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )
    # The stored batch carries a stable hash of the serialized task.
    task_hash = hash(json.dumps(t.to_serializable(), sort_keys=True))
    expected_batch = BatchIngestInfoData(ingest_info=ii,
                                         task_hash=task_hash)

    batch_persistence.write(ii, scrape_key, t)

    batch_ingest_info_list = batch_persistence._get_batch_ingest_info_list(
        scrape_key.region_code, mock_session.start)
    self.assertEqual(len(batch_ingest_info_list), 1)
    self.assertEqual(expected_batch, batch_ingest_info_list[0])
def test_load_target_list_full_names(self, mock_region: Mock) -> None:
    """Loading a last+first names file queues one docket item per name."""
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_and_first.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    names = []
    for _ in range(8):
        item = docket.get_new_docket_item(scrape_key)
        assert item is not None
        # Each docket payload is a JSON-encoded [last, first] pair.
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))
    assert names == [
        ["Smith", "James"],
        ["Smith", "Michael"],
        ["Smith", "Robert"],
        ["Smith", "David"],
        ["Johnson", "James"],
        ["Johnson", "Michael"],
        ["Smith", "William"],
        ["Williams", "James"],
    ]
    # Exactly eight items: the docket should now be empty.
    assert not docket.get_new_docket_item(scrape_key)
def test_persist_to_db(self, mock_write, _mock_region, mock_session_return):
    """persist_to_database writes the batched proto and clears Datastore."""
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    ii = ingest_info.IngestInfo()
    ii.create_person(
        person_id=TEST_ID,
        full_name=TEST_NAME).create_booking(booking_id=TEST_ID)
    t = Task(
        task_type=constants.TaskType.SCRAPE_DATA,
        endpoint=TEST_ENDPOINT,
        response_type=constants.ResponseType.TEXT,
    )

    batch_persistence.write(ii, scrape_key, t)
    expected_proto = serialization.convert_ingest_info_to_proto(ii)
    batch_persistence.persist_to_database(scrape_key.region_code,
                                          mock_session.start)

    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)

    # After we persist, there should no longer be ingest infos on Datastore
    ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
        REGIONS[0], mock_session.start)
    self.assertEqual(len(ingest_infos), 0)
def test_get_recent_sessions(self): first = self.create_session( region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND, phase=scrape_phase.ScrapePhase.START, start=fix_dt(datetime(2009, 6, 17)), ) # different scrape type self.create_session( region_code="us_ny", scrape_type=constants.ScrapeType.SNAPSHOT, phase=scrape_phase.ScrapePhase.START, start=fix_dt(datetime(2009, 6, 18)), ) third = self.create_session( region_code="us_ny", scrape_type=constants.ScrapeType.BACKGROUND, phase=scrape_phase.ScrapePhase.START, start=fix_dt(datetime(2009, 6, 19)), end=fix_dt(datetime(2009, 6, 21)), ) # different region, scrape type self.create_session( region_code="us_fl", scrape_type=constants.ScrapeType.SNAPSHOT, phase=scrape_phase.ScrapePhase.START, start=fix_dt(datetime(2009, 6, 19)), ) results = sessions.get_recent_sessions( ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND) ) assert to_entities(results) == to_entities([third, first])
def test_create_session_with_existing(self, mock_datetime, mock_client,
                                      mock_query):
    """create_session ends the existing session and stores the new one."""
    mock_datetime.now.return_value = fixed_now
    existing_session = ScrapeSession.new(
        key=datastore.key.Key("session", "existing", project=0),
        start=fixed_now,
        scrape_type=constants.ScrapeType.BACKGROUND,
        region="us_ny",
        phase=scrape_phase.ScrapePhase.START,
    )
    new_key = datastore.key.Key("session", "new", project=0)
    new_session = ScrapeSession.new(
        key=new_key,
        start=fixed_now,
        scrape_type=constants.ScrapeType.BACKGROUND,
        region="us_wy",
        phase=scrape_phase.ScrapePhase.START,
    )
    client = mock_client.return_value
    client.key.return_value = new_key
    wire_sessions_to_query(mock_client, mock_query, [existing_session])

    scrape_key = ScrapeKey("us_wy", constants.ScrapeType.BACKGROUND)
    sessions.create_session(scrape_key)

    # The pre-existing session should be closed (end timestamp set)...
    existing_session.end = fixed_now
    client.put.assert_any_call(existing_session.to_entity())
    # ...and the new session should be written alongside it.
    client.put.assert_any_call(new_session.to_entity())
    assert client.put.call_count == 2
def teardown_method(self, _test_method):
    """Cleans up docket items, sessions, and patches created by each test."""
    # Purge any leftover docket items for every region used in the tests.
    for region in REGIONS:
        docket.purge_query_docket(
            ScrapeKey(region, constants.ScrapeType.BACKGROUND)
        )
    sessions.ds().delete_multi(self.sessions_to_delete)
    self.project_id_patcher.stop()
def start_scrape(self, scrape_type):
    """Start new scrape session / query against corrections site

    Retrieves first docket item, enqueues task for initial search page
    scrape to start the new scraping session.

    Args:
        scrape_type: (ScrapeType) The type of scrape to start

    Returns:
        N/A
    """
    docket_item = self.iterate_docket_item(scrape_type)
    scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
    # Ensure that the topic and subscription are created on start.
    pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
    if not docket_item:
        # Nothing to work on: close the session rather than enqueueing
        # an initial task that would have no docket item to process.
        logging.error(
            "Found no %s docket items for %s, shutting down.",
            scrape_type,
            self.get_region().region_code,
        )
        sessions.close_session(scrape_key)
        return

    self.add_task(
        self.get_initial_task_method(),
        QueueRequest(
            scrape_type=scrape_type,
            scraper_start_time=datetime.now(),
            next_task=self.get_initial_task(),
        ),
    )
def test_persist_duplicates_to_db(self, mock_write, _mock_region,
                                  mock_session_return):
    """Tests that duplicate ingest_info.Person objects are merged before write."""
    mock_session = mock_session_return.return_value = create_mock_session()
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    # Arrange
    ii = IngestInfo()
    ii.create_person(person_id=TEST_ID, full_name=TEST_NAME) \
        .create_booking(booking_id=TEST_ID)

    ii_2 = IngestInfo()
    # Fix: the second person belongs on ii_2. Previously this line called
    # ii.create_person(...), which left ii_2 permanently empty and put both
    # people into ii (and into the deepcopy below), defeating the point of
    # a separate second batch.
    ii_2.create_person(person_id=TEST_ID2, full_name=TEST_NAME2)

    # Exact duplicate of the first batch; the merge step should collapse it.
    ii_1_dup = copy.deepcopy(ii)

    t1, t2, t3 = (Task(task_type=constants.TaskType.SCRAPE_DATA,
                       endpoint=TEST_ENDPOINT + str(i),
                       response_type=constants.ResponseType.TEXT)
                  for i in range(3))

    batch_persistence.write(ii, scrape_key, t1)
    batch_persistence.write(ii_2, scrape_key, t2)
    batch_persistence.write(ii_1_dup, scrape_key, t3)

    batch_persistence.persist_to_database(scrape_key.region_code,
                                          mock_session.start)

    # The duplicate person is merged away; only the two distinct people remain.
    expected_ii = IngestInfo(people=ii.people + ii_2.people)
    expected_proto = ingest_utils.convert_ingest_info_to_proto(expected_ii)
    result_proto = mock_write.call_args[0][0]
    self.assertEqual(result_proto, expected_proto)
def test_load_target_list_last_names(self, mock_region):
    """Loading a last-name-only file queues names with empty first names."""
    mock_region.return_value.names_file = \
        '../recidiviz/tests/ingest/testdata/docket/names/last_only.csv'
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    names = []
    for _ in range(12):
        item = docket.get_new_docket_item(scrape_key)
        # Each docket payload is a JSON-encoded [last, first] pair.
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))
    assert names == [
        ['SMITH', ''],
        ['JOHNSON', ''],
        ['WILLIAMS', ''],
        ['BROWN', ''],
        ['JONES', ''],
        ['MILLER', ''],
        ['DAVIS', ''],
        ['GARCIA', ''],
        ['RODRIGUEZ', ''],
        ['WILSON', ''],
        ['MARTINEZ', ''],
        ['ANDERSON', ''],
    ]
    # Exactly twelve items: the docket should now be empty.
    assert not docket.get_new_docket_item(scrape_key)
def test_get_sessions_with_leased_none_for_scrape_type(self):
    """No leased-docket sessions match a key with a different scrape type."""
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2016, 11, 20)),
        docket_ack_id="a")
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2016, 11, 20)),
        docket_ack_id="b")
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2016, 11, 20)),
        docket_ack_id="c")
    # No docket_ack_id: no leased docket item on this session.
    self.create_session(
        region_code="us_ny",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2016, 11, 20)),
        docket_ack_id=None)
    self.create_session(
        region_code="us_fl",
        scrape_type=constants.ScrapeType.BACKGROUND,
        phase=scrape_phase.ScrapePhase.START,
        start=fix_dt(datetime(2016, 11, 20)),
        docket_ack_id="d")

    # us_fl only has a BACKGROUND session, so a SNAPSHOT query finds nothing.
    results = sessions.get_sessions_with_leased_docket_items(
        ScrapeKey("us_fl", constants.ScrapeType.SNAPSHOT))
    assert not to_entities(results)
def test_iterate_docket_item_no_matching_items(self, mock_docket):
    """When the docket yields nothing, iterate_docket_item returns falsy."""
    mock_docket.return_value = None
    scrape_key = ScrapeKey("us_fl", constants.ScrapeType.BACKGROUND)
    assert not tracker.iterate_docket_item(scrape_key)
def test_load_target_list_last_names(self, mock_region):
    """Loading a last-name-only file queues names with empty first names."""
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_only.csv")
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    names = []
    for _ in range(12):
        item = docket.get_new_docket_item(scrape_key)
        # Each docket payload is a JSON-encoded [last, first] pair.
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))
    assert names == [
        ["SMITH", ""],
        ["JOHNSON", ""],
        ["WILLIAMS", ""],
        ["BROWN", ""],
        ["JONES", ""],
        ["MILLER", ""],
        ["DAVIS", ""],
        ["GARCIA", ""],
        ["RODRIGUEZ", ""],
        ["WILSON", ""],
        ["MARTINEZ", ""],
        ["ANDERSON", ""],
    ]
    # Exactly twelve items: the docket should now be empty.
    assert not docket.get_new_docket_item(scrape_key)
def test_add_item_happy_path(self, mock_client, mock_query):
    """A docket ack id is attached to the current session and persisted."""
    current_session_key = datastore.key.Key("session", "current", project=0)
    current_session_vars = {
        "region": "us_va",
        "scrape_type": constants.ScrapeType.SNAPSHOT,
        "phase": scrape_phase.ScrapePhase.START,
        "start": fix_dt(datetime(2014, 8, 31)),
    }
    current_session = ScrapeSession.new(current_session_key,
                                        **current_session_vars)
    # Older session for a different region; must not receive the ack id.
    prior_session = ScrapeSession.new(
        datastore.key.Key("session", "prior", project=0),
        region="us_ny",
        scrape_type=constants.ScrapeType.SNAPSHOT,
        start=fix_dt(datetime(2014, 8, 17)),
        phase=scrape_phase.ScrapePhase.SCRAPE,
    )
    wire_sessions_to_query(
        mock_client, mock_query, [current_session, prior_session]
    )

    assert sessions.add_docket_item_to_current_session(
        "alpha", ScrapeKey("us_va", constants.ScrapeType.SNAPSHOT)
    )

    # The persisted entity is the current session plus the new ack id.
    current_session_vars.update({"docket_ack_id": "alpha"})
    expected_session = ScrapeSession.new(
        current_session_key, **current_session_vars
    )
    mock_client.return_value.put.assert_called_with(expected_session.to_entity())