Example #1
def infer_release():
    """Runs infer release for the given regions."""
    region_codes = validate_regions(
        get_str_param_values("region", request.args))
    regions = [get_region(region_code) for region_code in region_codes]

    for region in regions:
        with monitoring.push_tags(
            {monitoring.TagKey.REGION: region.region_code}):
            if region.agency_type != "jail":
                continue

            session = sessions.get_most_recent_completed_session(
                region.region_code)
            if session:
                logging.info(
                    "Got most recent completed session for [%s] with "
                    "start time [%s]",
                    region.region_code,
                    session.start,
                )
                persistence.infer_release_on_open_bookings(
                    region.region_code, session.start,
                    _get_custody_status(region))
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

    return "", HTTPStatus.OK
Example #2
    def test_validate_regions_one_all(self, _mock_modules):
        assert ingest_utils.validate_regions(['all']) == {
            'us_ny',
            'us_pa',
            'us_vt',
            'us_pa_greene',
        }
Example #3
    def test_validate_regions_one_all(self, _mock_modules):
        assert ingest_utils.validate_regions(["all"]) == {
            "us_ny",
            "us_pa",
            "us_vt",
            "us_pa_greene",
        }
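Examples 2 and 3 are the same test in two quoting styles; both assert that passing ['all'] expands to every configured region code. A minimal sketch of that expansion alone (the environment filtering exercised later in Example 8 is ignored here), using a hypothetical _SUPPORTED_REGION_CODES table in place of however the real codebase enumerates its regions:

from typing import Set

# Hypothetical stand-in for the codebase's region registry; the tests above
# expect exactly these four codes.
_SUPPORTED_REGION_CODES = {"us_ny", "us_pa", "us_vt", "us_pa_greene"}


def expand_all(requested: Set[str]) -> Set[str]:
    """Sketch: expand the literal 'all' into the full set of configured codes."""
    if "all" in requested:
        return set(_SUPPORTED_REGION_CODES)
    return set(requested)


assert expand_all({"all"}) == {"us_ny", "us_pa", "us_vt", "us_pa_greene"}
assert expand_all({"us_pa", "all"}) == {"us_ny", "us_pa", "us_vt", "us_pa_greene"}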
Example #4
def run_scraper(args: argparse.Namespace) -> None:
    use_in_memory_sqlite_database(JailsBase)

    region_codes = validate_regions(args.region.split(","))
    if not region_codes:
        sys.exit(1)
    failed_regions = []
    valid_region_codes = cast(Set[str], region_codes)
    for region_code in valid_region_codes:
        logging.info("***")
        logging.info("***")
        logging.info("Starting scraper for region: [%s]", region_code)
        logging.info("***")
        logging.info("***")
        try:
            run_scraper_for_region(regions.get_region(region_code), args)
        except Exception:
            print(traceback.format_exc())
            failed_regions.append(region_code)

    if failed_regions:
        logging.info("***")
        logging.info(
            "The following regions raised errors during scraping: " "[%s]",
            failed_regions,
        )
Example #5
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when checking region [%s]", region_code
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
Example #6
    def test_validate_regions_multiple_all(self, mock_region, mock_env, _mock_modules):
        fake_region = Mock()
        mock_region.return_value = fake_region
        fake_region.environment = "production"
        mock_env.return_value = "production"

        assert ingest_utils.validate_regions(["us_pa", "all"]) == {
            "us_ny",
            "us_pa",
            "us_vt",
            "us_pa_greene",
        }
Example #7
    def test_validate_regions_multiple_all(self, mock_region, mock_env,
                                           _mock_modules):
        fake_region = Mock()
        mock_region.return_value = fake_region
        fake_region.environment = 'production'
        mock_env.return_value = 'production'

        assert ingest_utils.validate_regions(['us_pa', 'all']) == {
            'us_ny',
            'us_pa',
            'us_vt',
            'us_pa_greene',
        }
Example #8
    def test_validate_regions_environments(self, mock_region, mock_env,
                                           _mock_modules):
        region_prod, region_staging, region_none = Mock(), Mock(), Mock()
        region_prod.environment = 'production'
        region_staging.environment = 'staging'
        region_none.environment = False

        mock_region.side_effect = [
            region_prod, region_none, region_prod, region_staging
        ]
        mock_env.return_value = 'production'

        assert len(ingest_utils.validate_regions(['all'])) == 2
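Example 8 suggests that when 'all' is expanded, a region is kept only if its declared environment matches the current one: of the four mocked regions, only the two marked 'production' survive when the environment is 'production'. A minimal sketch of that filter, using a hypothetical region-to-environment table that mirrors the mocks (the real code reads this off each region's configuration):

from typing import Set

# Hypothetical table mirroring the mocks in Example 8; False means the
# region declares no environment.
_REGION_ENVIRONMENTS = {
    "us_ny": "production",
    "us_pa": False,
    "us_vt": "production",
    "us_pa_greene": "staging",
}


def filter_by_environment(codes: Set[str], current_env: str) -> Set[str]:
    """Sketch: keep only regions whose declared environment matches."""
    return {code for code in codes if _REGION_ENVIRONMENTS.get(code) == current_env}


assert len(filter_by_environment(set(_REGION_ENVIRONMENTS), "production")) == 2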
Example #9
def scraper_resume():
    """Request handler to resume one or several stopped scrapers

    Resumes scraping for each region and scrape type in request.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args))
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in scrape_regions:
        for scrape_type in scrape_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Help avoid race condition with new session info
            # vs updating that w/first task.
            time.sleep(5)

            scraper = regions.get_region(region).get_scraper()
            scraper.resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)
Example #10
def run_scraper(args):
    use_in_memory_sqlite_database(JailsBase)

    region_codes = validate_regions(args.region.split(','))
    if not region_codes:
        sys.exit(1)
    failed_regions = []
    for region_code in region_codes:
        logging.info('***')
        logging.info('***')
        logging.info("Starting scraper for region: [%s]", region_code)
        logging.info('***')
        logging.info('***')
        try:
            run_scraper_for_region(regions.get_region(region_code), args)
        except Exception:
            print(traceback.format_exc())
            failed_regions.append(region_code)

    if failed_regions:
        logging.info('***')
        logging.info("The following regions raised errors during scraping: "
                     "[%s]",
                     failed_regions)
Example #11
    def test_validate_regions_multiple_ok(self, _mock_modules):
        assert ingest_utils.validate_regions(['us_pa', 'us_ny']) == {'us_pa', 'us_ny'}
Example #12
    def test_validate_regions_multiple_invalid(self, _mock_modules):
        assert not ingest_utils.validate_regions(["us_pa", "invalid"])
Example #13
    def test_validate_regions_multiple_ok(self, _mock_modules):
        assert ingest_utils.validate_regions(["us_pa", "us_ny"]) == {"us_pa", "us_ny"}
Example #14
    def test_validate_regions_one_invalid(self, _mock_modules):
        assert not ingest_utils.validate_regions(["ca_bc"])
Example #15
    def test_validate_regions_empty(self, _mock_modules):
        assert ingest_utils.validate_regions([]) == set()
Example #16
    def test_validate_regions_multiple_invalid(self, _mock_modules):
        assert not ingest_utils.validate_regions(['us_pa', 'invalid'])
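Examples 12, 14, and 16 show that any unknown code makes validate_regions return a falsy value, while Example 15 shows an empty input simply yields an empty set; the handlers in Examples 4 and 9 treat any falsy result as bad input. A minimal sketch of that caller-side guard (whether the failure value is None or an empty set is not pinned down by these tests):

from http import HTTPStatus


def handle_regions(region_codes):
    # Sketch of the guard used in Examples 4 and 9: any falsy result from
    # validate_regions is treated as missing or invalid input.
    if not region_codes:
        return "Missing or invalid parameters, see service logs.", HTTPStatus.BAD_REQUEST
    return "", HTTPStatus.OK


assert handle_regions(None)[1] == HTTPStatus.BAD_REQUEST
assert handle_regions({"us_ny"})[1] == HTTPStatus.OK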
Example #17
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape isn't called
    individually for each scrape type. Doing so could create a race
    condition: each call would notice that the other scrape type was still
    running and kick off a delayed resume, and our second call to kill the
    other type would then miss the (delayed, not yet in the task queue)
    resume call, effectively not stopping the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value("respect_is_stoppable",
                                               request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, see service logs.',
                HTTPStatus.BAD_REQUEST)

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = \
            {executor.submit(_stop_scraper, region_code): region_code
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when stopping region [%s] for '
                        '[%s]',
                        region_code, scrape_types)
                    failed_stops.append(region_code)
                else:
                    logging.info('Finished stopping region [%s] for [%s].',
                                 region_code, scrape_types)

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose session
        # was closed during this call will be immediately skipped in the next
        # call as we won't find any sessions to close. Any regions we failed to
        # stop likely still had their sessions closed and thus will be skipped,
        # but it is worth retrying anyway.
        return ('Failed to stop regions: {}'.format(failed_stops),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
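Example 17 (and Example 18 below) passes a timezone keyword to validate_regions so that, when a timezone is supplied, only regions in that timezone are acted on. A hedged sketch of that filtering, using a hypothetical region-to-timezone table; the real lookup goes through ingest_utils.lookup_timezone and each region's configuration, neither of which is shown in these examples:

from typing import Optional, Set

# Hypothetical mapping of region code to timezone name.
_REGION_TIMEZONES = {
    "us_ny": "America/New_York",
    "us_pa": "America/New_York",
    "us_mt": "America/Denver",
}


def filter_by_timezone(codes: Set[str], timezone: Optional[str]) -> Set[str]:
    """Sketch: with no timezone every region passes; otherwise keep matches."""
    if timezone is None:
        return set(codes)
    return {code for code in codes if _REGION_TIMEZONES.get(code) == timezone}


assert filter_by_timezone({"us_ny", "us_mt"}, None) == {"us_ny", "us_mt"}
assert filter_by_timezone({"us_ny", "us_mt"}, "America/Denver") == {"us_mt"}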
Example #18
def scraper_start():
    """Request handler to start one or several running scrapers

    Kicks off new scrape session for each region and scrape type in request

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string) The timezone to scrape.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Args:
        N/A

    Returns:
        N/A
    """

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = \
            next(sessions.get_sessions(region_code=scrape_key.region_code,
                                       include_closed=True,
                                       most_recent_only=True,
                                       scrape_type=scrape_key.scrape_type),
                 None)
        if most_recent_session and not \
                most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key, BATCH_PUBSUB_TYPE)
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_ingestor()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        #
        # TODO(#1045): Either kill this, or ensure logs are correlated and
        # exceptions are passed up to the parent thread.
        load_docket_thread = threading.Thread(
            target=docket.load_target_list,
            args=(scrape_key, given_names, surname))
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided, start all regions. If it was, only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, or no regions found, see logs.',
                HTTPStatus.BAD_REQUEST)

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = \
            {executor.submit(_start_scraper, region_code, scrape_type): \
                (region_code, scrape_type)
             for scrape_type in scrape_types
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when starting region [%s] for '
                        '[%s]',
                        region_code, scrape_type)
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info('Finished starting region [%s] for [%s].',
                                 region_code, scrape_type)

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose session
        # was opened during this call will be immediately skipped in the next
        # call when we check for open sessions. Any regions we failed to start
        # likely still had sessions opened and thus will be skipped, but it is
        # worth retrying anyway.
        return ('Failed to start regions: {}'.format(failed_starts),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
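The comments in Example 18 explain why the docket is loaded on a background thread: a large names list can take a while, so the scraper is started immediately and the handler only joins the loader thread at the end. A minimal, self-contained sketch of that start-then-join pattern, with a short sleep standing in for docket.load_target_list:

import threading
import time


def load_target_list(scrape_key, given_names, surname):
    # Stand-in for docket.load_target_list: pretend loading a large names
    # list takes a while.
    time.sleep(0.1)


# Kick off the (potentially slow) docket load in the background ...
load_docket_thread = threading.Thread(
    target=load_target_list, args=("us_ny/background", "", ""))
load_docket_thread.start()

# ... start scraping right away (a print stands in for start_scrape), then
# wait for the docket load to finish, as scraper_start does before returning.
print("Starting scrape...")
load_docket_thread.join()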