def test_load_target_list_last_names(self, mock_region: Mock) -> None:
    """Loads a last-name-only CSV and verifies every name is enqueued in order.

    NOTE(review): a test with this exact name is defined again later in the
    file; inside a class the later definition shadows this one — confirm
    which copy is intended to survive.
    """
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_only.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    names = []
    for _ in range(12):
        item = docket.get_new_docket_item(scrape_key)
        # Guard against a None item so an empty docket fails with a clear
        # assertion instead of an AttributeError on `.message` (matches
        # the style of test_load_target_list_full_names).
        assert item is not None
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))

    assert names == [
        ["SMITH", ""],
        ["JOHNSON", ""],
        ["WILLIAMS", ""],
        ["BROWN", ""],
        ["JONES", ""],
        ["MILLER", ""],
        ["DAVIS", ""],
        ["GARCIA", ""],
        ["RODRIGUEZ", ""],
        ["WILSON", ""],
        ["MARTINEZ", ""],
        ["ANDERSON", ""],
    ]
    assert not docket.get_new_docket_item(scrape_key)
def test_load_target_list_last_names(self, mock_region: Mock) -> None:
    """Loads a last-name-only CSV and verifies every name is enqueued in order.

    NOTE(review): this duplicates an earlier test of the same name; the
    duplicate should probably be removed, but the later definition is the
    one a class body keeps.
    """
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_only.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    names = []
    for _ in range(12):
        item = docket.get_new_docket_item(scrape_key)
        # Fail with a clear assertion rather than an AttributeError on
        # `.message` if the docket runs dry early (same guard as
        # test_load_target_list_full_names).
        assert item is not None
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))

    assert names == [
        ["SMITH", ""],
        ["JOHNSON", ""],
        ["WILLIAMS", ""],
        ["BROWN", ""],
        ["JONES", ""],
        ["MILLER", ""],
        ["DAVIS", ""],
        ["GARCIA", ""],
        ["RODRIGUEZ", ""],
        ["WILSON", ""],
        ["MARTINEZ", ""],
        ["ANDERSON", ""],
    ]
    assert not docket.get_new_docket_item(scrape_key)
def test_load_target_list_full_names(self, mock_region: Mock) -> None:
    """Loads a last+first-name CSV and checks the docket yields names in order."""
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_and_first.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    expected = [
        ["Smith", "James"],
        ["Smith", "Michael"],
        ["Smith", "Robert"],
        ["Smith", "David"],
        ["Johnson", "James"],
        ["Johnson", "Michael"],
        ["Smith", "William"],
        ["Williams", "James"],
    ]

    seen = []
    for _ in expected:
        item = docket.get_new_docket_item(scrape_key)
        assert item is not None
        seen.append(json.loads(item.message.data.decode()))

    assert seen == expected
    # The docket must be exhausted after the expected eight items.
    assert not docket.get_new_docket_item(scrape_key)
def test_load_target_list_last_names_with_bad_query(self, mock_region: Mock) -> None:
    """A surname absent from the names file is still enqueued as a docket item."""
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_only.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key, surname="GARBAGE")

    item = docket.get_new_docket_item(scrape_key)
    # Fail with a clear assertion instead of an AttributeError on
    # `.message` if the docket is unexpectedly empty (matches the guard in
    # test_load_target_list_full_names).
    assert item is not None
    assert item.message.data.decode() == json.dumps(("GARBAGE", ""))
    assert not docket.get_new_docket_item(scrape_key)
def test_load_target_list_background_no_names_file(self, mock_region: Mock) -> None:
    """With no names file configured, a single "empty" sentinel is enqueued."""
    mock_region.return_value.names_file = None
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    item = docket.get_new_docket_item(scrape_key)
    # Fail with a clear assertion instead of an AttributeError on
    # `.message` if nothing was enqueued (matches the guard in
    # test_load_target_list_full_names).
    assert item is not None
    assert item.message.data.decode() == json.dumps("empty")
def test_add_to_query_docket_background(self) -> None:
    """Publishes two payloads to the docket and verifies both are retrievable.

    NOTE(review): hoists get_payload() into a local — the original called
    it four times; this assumes get_payload() is deterministic, which the
    original's comparisons already relied on.
    """
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    pubsub_helper.create_topic_and_subscription(scrape_key, docket.PUBSUB_TYPE)

    payloads = get_payload()
    docket.add_to_query_docket(scrape_key, payloads[0]).result()
    docket.add_to_query_docket(scrape_key, payloads[1]).result()

    items = [
        docket.get_new_docket_item(scrape_key),
        docket.get_new_docket_item(scrape_key),
    ]
    assert len(items) == 2
    for i, item in enumerate(items):
        # Guard so an empty pull fails with a clear assertion rather than
        # an AttributeError on `.message`.
        assert item is not None
        assert item.message.data.decode() == json.dumps(payloads[i])
def test_purge_query_docket(self):
    """Purging one region's docket must not disturb another region's docket."""
    purge_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    keep_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    # Set up both dockets identically: topic/subscription plus one payload.
    for key in (purge_key, keep_key):
        pubsub_helper.create_topic_and_subscription(key, docket.PUBSUB_TYPE)
        docket.add_to_query_docket(key, get_payload()).result()

    docket.purge_query_docket(purge_key)

    assert not docket.get_new_docket_item(purge_key, return_immediately=True)
    assert docket.get_new_docket_item(keep_key, return_immediately=True)
def test_load_target_list_last_names_with_query(self, mock_region: Mock) -> None:
    """Loading with a matching surname enqueues that name and the ones after it."""
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_only.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key, surname="WILSON")

    names = []
    for _ in range(3):
        item = docket.get_new_docket_item(scrape_key)
        # Fail with a clear assertion rather than an AttributeError on
        # `.message` if the docket runs dry early (matches the guard in
        # test_load_target_list_full_names).
        assert item is not None
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))

    assert names == [
        ["WILSON", ""],
        ["MARTINEZ", ""],
        ["ANDERSON", ""],
    ]
    assert not docket.get_new_docket_item(scrape_key)
def test_get_new_docket_item_no_matching_items(self):
    """An item published for one region is not returned for a different region."""
    publish_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    other_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

    pubsub_helper.create_topic_and_subscription(publish_key, docket.PUBSUB_TYPE)
    docket.add_to_query_docket(publish_key, get_payload()).result()

    # Pulling against the other region's key should come back empty.
    assert not docket.get_new_docket_item(other_key, return_immediately=True)
def test_load_target_list_full_names(self, mock_region: Mock) -> None:
    """Loads a last+first-name CSV and verifies names are enqueued in order.

    NOTE(review): this duplicates an earlier test of the same name; inside
    a class this later definition shadows the earlier one — confirm which
    copy is intended to survive.
    """
    mock_region.return_value.names_file = (
        "../recidiviz/tests/ingest/testdata/docket/names/last_and_first.csv"
    )
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.load_target_list(scrape_key)

    names = []
    for _ in range(8):
        item = docket.get_new_docket_item(scrape_key)
        # Guard against a None item so an empty docket fails with a clear
        # assertion instead of an AttributeError on `.message`.
        assert item is not None
        name_serialized = item.message.data.decode()
        names.append(json.loads(name_serialized))

    assert names == [
        ["Smith", "James"],
        ["Smith", "Michael"],
        ["Smith", "Robert"],
        ["Smith", "David"],
        ["Johnson", "James"],
        ["Johnson", "Michael"],
        ["Smith", "William"],
        ["Williams", "James"],
    ]
    assert not docket.get_new_docket_item(scrape_key)
def iterate_docket_item(scrape_key, return_immediately=False):
    """Leases a new docket item, records it on the current session, returns it.

    Pulls an arbitrary new item from the docket for the given scraper,
    registers its ack id with the current scrape session, and returns the
    decoded payload. The payload should be an entity fit to scrape, or
    information suitable for retrieving an entity fit to scrape, depending
    on the scrape type.

    Args:
        scrape_key: (ScrapeKey) The scraper to retrieve a docket item for
        return_immediately: (bool) Whether to return immediately or to wait
            for a bounded period of time for a message to enter the docket.

    Returns:
        The payload of the next docket item, if successfully retrieved and
        added to the current session for this region and scrape type.
        Returns None if no item was retrieved or it could not be added to
        the session.
    """
    item = docket.get_new_docket_item(
        scrape_key, return_immediately=return_immediately
    )
    if not item:
        logging.info("No items in docket for [%s]. Ending scrape.", scrape_key)
        return None

    payload = json.loads(item.message.data.decode())

    # Track the leased item on the session so it can be acked/returned later.
    if not sessions.add_docket_item_to_current_session(item.ack_id, scrape_key):
        logging.error(
            "Failed to update session for scraper [%s] "
            "with docket item [%s].",
            scrape_key,
            str(payload),
        )
        return None

    return payload
def test_purge_query_docket_already_empty(self):
    """Purging a docket that has no items is a no-op and leaves it empty."""
    key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

    docket.purge_query_docket(key)

    item = docket.get_new_docket_item(key, return_immediately=True)
    assert not item
def test_get_new_docket_item_no_items_at_all(self):
    """With nothing ever enqueued, an immediate pull returns nothing."""
    key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    assert not docket.get_new_docket_item(key, return_immediately=True)