def test_purge_docket_and_session(self):
    scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
    docket.add_to_query_docket(scrape_key, get_payload()).result()
    self.create_session(scrape_key)

    tracker.purge_docket_and_session(scrape_key)

    assert not tracker.iterate_docket_item(scrape_key)
# The @patch decorators that supply these mocks are elided here; with
# unittest.mock, decorators apply bottom-up, so the bottom-most patch maps
# to mock_sessions and the top-most to mock_purge.
def test_purge_docket_and_session(self, mock_sessions, mock_remove, mock_purge):
    scrape_key = ScrapeKey("us_va", constants.ScrapeType.BACKGROUND)
    mock_sessions.return_value = ["us_va_1", "us_va_2"]

    tracker.purge_docket_and_session(scrape_key)

    mock_purge.assert_called_with(scrape_key)
    mock_remove.assert_has_calls([call("us_va_1"), call("us_va_2")])
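# A minimal sketch of the behavior the two tests above pin down, assuming
# hypothetical helper names (purge_query_docket,
# get_sessions_with_leased_docket_items, remove_docket_item_from_session);
# the real module may expose different internals, but the tests require
# that the docket is purged for the scrape key and that every session
# returned has its leased docket item removed.
def purge_docket_and_session(scrape_key):
    # Drop everything from this scrape key's query docket.
    docket.purge_query_docket(scrape_key)
    # Release any docket items still leased to open sessions, so a fresh
    # scrape starts with a clean slate.
    for session in sessions.get_sessions_with_leased_docket_items(scrape_key):
        sessions.remove_docket_item_from_session(session)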
def _start_scraper(region, scrape_type):
    scrape_key = ScrapeKey(region, scrape_type)

    most_recent_session = next(
        sessions.get_sessions(
            region_code=scrape_key.region_code,
            include_closed=True,
            most_recent_only=True,
            scrape_type=scrape_key.scrape_type,
        ),
        None,
    )
    if most_recent_session and not most_recent_session.phase.has_persisted():
        raise Exception(
            "Session already running for region [%s]. Could "
            "not start a new session" % region
        )

    logging.info(
        "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
        scrape_key,
        BATCH_PUBSUB_TYPE,
    )
    pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

    logging.info("Starting new scraper for: [%s]", scrape_key)
    scraper = regions.get_region(region).get_scraper()

    current_session = sessions.create_session(scrape_key)

    # Help avoid a race condition between creating the new session and
    # updating it with the first task.
    time.sleep(1)

    # Clear the prior query docket for this scrape type and start adding new
    # items in a background thread. When the names list is large, loading it
    # can take some time; loading in the background lets us start the scraper
    # before the docket is fully loaded.
    tracker.purge_docket_and_session(scrape_key)
    # Note: the request context isn't copied when launching this thread, so
    # any logs from within `load_target_list` will not be associated with
    # the start-scraper request. `given_names` and `surname` are expected to
    # come from an enclosing scope (e.g., the request handler that defines
    # this function).
    load_docket_thread = threading.Thread(
        target=structured_logging.with_context(docket.load_target_list),
        args=(scrape_key, given_names, surname),
    )
    load_docket_thread.start()

    # Start the scraper; if the docket is empty this will wait a bounded
    # period of time (~90 seconds) for an item to be published.
    logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
    scraper.start_scrape(scrape_type)

    sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

    # Wait for the docket to finish loading.
    load_docket_thread.join()
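# A hedged usage sketch: how a caller might fan _start_scraper out across
# several regions, one thread per region, reusing the same
# structured_logging.with_context pattern _start_scraper itself uses so logs
# stay associated with the originating request. The function name and the
# regions_to_start parameter are assumptions for illustration, not the
# codebase's actual entry point.
def scraper_start(regions_to_start, scrape_type):
    threads = []
    for region in regions_to_start:
        thread = threading.Thread(
            target=structured_logging.with_context(_start_scraper),
            args=(region, scrape_type),
        )
        thread.start()
        threads.append(thread)
    # Block until every region's scraper has started and its docket load has
    # finished (each _start_scraper joins its own docket-loading thread).
    for thread in threads:
        thread.join()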