Example #1
0
    def test_purge_docket_and_session(self):
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        docket.add_to_query_docket(scrape_key, get_payload()).result()
        self.create_session(scrape_key)

        tracker.purge_docket_and_session(scrape_key)

        assert not tracker.iterate_docket_item(scrape_key)
Example #2
0
    def test_purge_docket_and_session(self, mock_sessions, mock_remove, mock_purge):
        scrape_key = ScrapeKey("us_va", constants.ScrapeType.BACKGROUND)
        mock_sessions.return_value = ["us_va_1", "us_va_2"]

        tracker.purge_docket_and_session(scrape_key)

        mock_purge.assert_called_with(scrape_key)
        mock_remove.assert_has_calls([call("us_va_1"), call("us_va_2")])
Example #3
0
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted(
        ):
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()