Example #1
    def test_create_session_with_existing(self, mock_datetime, mock_client, mock_query):
        mock_datetime.now.return_value = fixed_now

        # An open session already exists for a different region (us_ny).
        existing_session = ScrapeSession.new(
            key=datastore.key.Key("session", "existing", project=0),
            start=fixed_now,
            scrape_type=constants.ScrapeType.BACKGROUND,
            region="us_ny",
            phase=scrape_phase.ScrapePhase.START,
        )
        new_key = datastore.key.Key("session", "new", project=0)
        # The session we expect create_session to persist for us_wy.
        new_session = ScrapeSession.new(
            key=new_key,
            start=fixed_now,
            scrape_type=constants.ScrapeType.BACKGROUND,
            region="us_wy",
            phase=scrape_phase.ScrapePhase.START,
        )

        client = mock_client.return_value
        client.key.return_value = new_key
        wire_sessions_to_query(mock_client, mock_query, [existing_session])

        scrape_key = ScrapeKey("us_wy", constants.ScrapeType.BACKGROUND)
        sessions.create_session(scrape_key)

        # create_session should close the pre-existing session (stamping its
        # end time) and persist both the closed session and the new one.
        existing_session.end = fixed_now
        client.put.assert_any_call(existing_session.to_entity())
        client.put.assert_any_call(new_session.to_entity())
        assert client.put.call_count == 2
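
Example #1 relies on a wire_sessions_to_query test helper that is not shown here. A minimal sketch of what it plausibly does, assuming the production code reads sessions via client.query(...).fetch() (the exact wiring is an assumption, inferred from how the mocks are used above):

def wire_sessions_to_query(mock_client, mock_query, session_list):
    # Assumed helper: route the mocked query back through the mocked client
    # and have fetch() yield the canned session entities.
    client = mock_client.return_value
    query = mock_query.return_value
    client.query.return_value = query
    query.fetch.return_value = (s.to_entity() for s in session_list)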
Example #2
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        # `given_names` and `surname` come from the enclosing request
        # handler's scope.
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()
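
_start_scraper handles a single region/scrape-type pair. A plausible caller (a sketch under assumptions, not necessarily the project's actual handler) fans the pairs out onto worker threads so one slow region does not block the rest:

# Sketch only: `scrape_regions` and `scrape_types` are assumed to be
# validated lists, as in the resume handler in Example #4.
for region in scrape_regions:
    for scrape_type in scrape_types:
        threading.Thread(
            target=structured_logging.with_context(_start_scraper),
            args=(region, scrape_type),
        ).start()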
Example #3
    def test_create_session(self, mock_datetime, mock_client):
        mock_datetime.now.return_value = fixed_now

        # Must use a full key so that the entities are equal.
        key = datastore.key.Key("session", "key", project=0)

        client = mock_client.return_value
        client.key.return_value = key

        scrape_key = ScrapeKey("us_ok", constants.ScrapeType.SNAPSHOT)
        sessions.create_session(scrape_key)

        session = ScrapeSession.new(
            key=key,
            start=fixed_now,
            scrape_type=constants.ScrapeType.SNAPSHOT,
            region="us_ok",
            phase=scrape_phase.ScrapePhase.START,
        )
        client.put.assert_called_with(session.to_entity())
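
Both session tests assume that datetime and the datastore client (and, in Example #1, the query class) are patched, but the decorator stack is not shown. A sketch of what it might look like; the patch targets are assumptions, and with unittest.mock the bottom-most decorator supplies the first mock argument after self:

from unittest.mock import patch

@patch("google.cloud.datastore.query.Query")    # -> mock_query
@patch("google.cloud.datastore.client.Client")  # -> mock_client
@patch("path.to.sessions.datetime")             # hypothetical module path -> mock_datetime
def test_create_session_with_existing(self, mock_datetime, mock_client, mock_query):
    ...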
Example #4
def scraper_resume():
    """Request handler to resume one or several stopped scrapers

    Resumes scraping for each region and scrape type in request.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        Tuple of response body and HTTP status code
    """
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args))
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in scrape_regions:
        for scrape_type in scrape_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Help avoid race condition with new session info
            # vs updating that w/first task.
            time.sleep(5)

            scraper = regions.get_region(region).get_scraper()
            scraper.resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)