Code Example #1
def scraper_resume():
    """Request handler to resume one or several stopped scrapers

    Resumes scraping for each region and scrape type in the request.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        A tuple of (response body, HTTP status code)
    """
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args))
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in scrape_regions:

        for scrape_type in scrape_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Brief pause to avoid a race between creating the new session
            # and the first task updating it.
            time.sleep(5)

            scraper = regions.get_region(region).get_scraper()
            scraper.resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)
Code Example #2
def test_validate_scrape_types_empty(self):
    assert ingest_utils.validate_scrape_types([]) == [
        constants.ScrapeType.BACKGROUND
    ]
Code Example #3
def test_validate_scrape_types_multiple_all_invalid(self):
    assert not ingest_utils.validate_scrape_types(["all", "invalid"])
Code Example #4
def test_validate_scrape_types_multiple_all(self):
    assert ingest_utils.validate_scrape_types(
        [constants.ScrapeType.BACKGROUND.value, "all"]
    ) == [constants.ScrapeType.BACKGROUND, constants.ScrapeType.SNAPSHOT]
Code Example #5
def test_validate_scrape_types_multiple_invalid(self):
    assert not ingest_utils.validate_scrape_types(
        [constants.ScrapeType.BACKGROUND.value, "invalid"]
    )
Code Example #6
def test_validate_scrape_types_one_invalid(self):
    assert not ingest_utils.validate_scrape_types(["When You Were Young"])
Code Example #7
def test_validate_scrape_types_one_ok(self):
    assert ingest_utils.validate_scrape_types(
        [constants.ScrapeType.SNAPSHOT.value]
    ) == [constants.ScrapeType.SNAPSHOT]
Code Example #8
def test_validate_scrape_types_one_all(self):
    assert ingest_utils.validate_scrape_types(['all']) == [
        constants.ScrapeType.BACKGROUND, constants.ScrapeType.SNAPSHOT
    ]
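
Taken together, the tests above pin down the behavior expected of ingest_utils.validate_scrape_types: an empty list defaults to a background scrape, 'all' expands to every scrape type, and any unrecognized value makes the whole call return a falsy result. The following is only a sketch inferred from those tests, not the project's actual implementation; ScrapeType here is a local stand-in for constants.ScrapeType and is assumed to have exactly the two members shown.

# Sketch inferred from the tests above; not the real ingest_utils code.
import enum
import logging
from typing import List


class ScrapeType(enum.Enum):
    # Stand-in for constants.ScrapeType; members and order are assumptions.
    BACKGROUND = "background"
    SNAPSHOT = "snapshot"


def validate_scrape_types(
        scrape_type_strings: List[str]) -> List[ScrapeType]:
    """Returns the requested ScrapeTypes, or [] if any value is invalid."""
    if not scrape_type_strings:
        # No scrape_type parameter given: default to a background scrape.
        return [ScrapeType.BACKGROUND]

    scrape_types = []
    wants_all = False
    for type_string in scrape_type_strings:
        if type_string == "all":
            wants_all = True
        else:
            try:
                scrape_types.append(ScrapeType(type_string))
            except ValueError:
                logging.error("Invalid scrape type value: [%s]", type_string)
                return []

    return list(ScrapeType) if wants_all else scrape_types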
Code Example #9
def scraper_start():
    """Request handler to start one or several running scrapers

    Kicks off new scrape session for each region and scrape type in request

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string, optional) If provided, only regions in this
            timezone are started; otherwise all regions are started.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Args:
        N/A

    Returns:
        A tuple of (response body, HTTP status code)
    """

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = \
            next(sessions.get_sessions(region_code=scrape_key.region_code,
                                       include_closed=True,
                                       most_recent_only=True,
                                       scrape_type=scrape_key.scrape_type),
                 None)
        if most_recent_session and not \
                most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key, BATCH_PUBSUB_TYPE)
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_ingestor()

        current_session = sessions.create_session(scrape_key)

        # Brief pause to avoid a race between creating the new session
        # and the first task updating it.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        #
        # TODO(#1045): Either kill this, or ensure logs are correlated and
        # exceptions are passed up to the parent thread.
        load_docket_thread = threading.Thread(
            target=docket.load_target_list,
            args=(scrape_key, given_names, surname))
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided, start all regions. If it was, only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, or no regions found, see logs.',
                HTTPStatus.BAD_REQUEST)

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = \
            {executor.submit(_start_scraper, region_code, scrape_type): \
                (region_code, scrape_type)
             for scrape_type in scrape_types
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when starting region [%s] for '
                        '[%s]',
                        region_code, scrape_type)
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info('Finished starting region [%s] for [%s].',
                                 region_code, scrape_type)

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose session
        # was opened during this call will be immediately skipped in the next
        # call when we check for open sessions. Any regions we failed to start
        # likely still had sessions opened and thus will be skipped, but it is
        # worth retrying anyway.
        return ('Failed to start regions: {}'.format(failed_starts),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
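
Both scraper_start above and scraper_stop below use the same concurrent.futures fan-out: submit one task per work item, then iterate futures.as_completed so that one region's failure is logged and recorded without aborting the rest. Stripped of the scraper specifics, the pattern looks roughly like the following; start_one and the work items are placeholders, not project code.

# Generic sketch of the submit / as_completed pattern used above.
import logging
from concurrent import futures


def start_one(region_code: str, scrape_type: str) -> None:
    """Placeholder for the per-item work (e.g. _start_scraper)."""
    if region_code == "us_zz":
        raise RuntimeError("simulated failure")


work_items = [("us_ny", "background"), ("us_zz", "background")]
failed = []

with futures.ThreadPoolExecutor() as executor:
    # Start all of the calls.
    future_to_args = {
        executor.submit(start_one, region, scrape_type): (region, scrape_type)
        for region, scrape_type in work_items
    }
    # Wait for all the calls to finish, collecting failures as we go.
    for future in futures.as_completed(future_to_args):
        region, scrape_type = future_to_args[future]
        try:
            future.result()
        except Exception:
            logging.exception("Failed [%s]/[%s]", region, scrape_type)
            failed.append((region, scrape_type))

print(failed)  # [('us_zz', 'background')]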
Code Example #10
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, we don't call stop_scrape
    individually for each scrape type. Doing so could create a race
    condition: each call would notice the other scrape type running, kick
    off a delayed resume effort for it, and then the second call (meant to
    kill that other type) would miss the resume task because it was delayed
    and not yet in the task queue, effectively failing to stop the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string, optional) If provided, only regions in this
            timezone are stopped; otherwise all regions are stopped.
        respect_is_stoppable: (string, optional) Forwarded to
            Scraper.stop_scrape.

    Args:
        N/A

    Returns:
        A tuple of (response body, HTTP status code)
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value("respect_is_stoppable",
                                               request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, see service logs.',
                HTTPStatus.BAD_REQUEST)

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = \
            {executor.submit(_stop_scraper, region_code): region_code
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when stopping region [%s] for '
                        '[%s]',
                        region_code, scrape_types)
                    failed_stops.append(region_code)
                else:
                    logging.info('Finished stopping region [%s] for [%s].',
                                 region_code, scrape_types)

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose session
        # was closed during this call will be immediately skipped in the next
        # call as we won't find any sessions to close. Any regions we failed to
        # stop likely still had their sessions closed and thus will be skipped,
        # but it is worth retrying anyway.
        return ('Failed to stop regions: {}'.format(failed_stops),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
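
As with resuming, stopping is triggered with a GET to /scraper_control/stop. A minimal client-side sketch follows; the base URL is a placeholder, and the exact value formats accepted for timezone and respect_is_stoppable depend on ingest_utils.lookup_timezone and Scraper.stop_scrape, so the values shown here are assumptions.

# Hypothetical client call; BASE_URL and the timezone/respect_is_stoppable
# values are placeholders, not values confirmed by the project.
import requests

BASE_URL = "https://scraper-service.example.com"

response = requests.get(
    BASE_URL + "/scraper_control/stop",
    params={
        "region": "all",
        "scrape_type": "all",
        "timezone": "America/New_York",
        "respect_is_stoppable": "true",
    },
)
# Per the handler above: 200 OK when every region stopped cleanly,
# 400 for missing/invalid parameters, and 500 with the failed regions
# listed in the body when any stop failed.
print(response.status_code, response.text)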