Example #1
def infer_release():
    """Runs infer release for the given regions."""
    region_codes = validate_regions(
        get_str_param_values("region", request.args))
    regions = [get_region(region_code) for region_code in region_codes]

    for region in regions:
        with monitoring.push_tags(
            {monitoring.TagKey.REGION: region.region_code}):
            if region.agency_type != "jail":
                continue

            session = sessions.get_most_recent_completed_session(
                region.region_code)
            if session:
                logging.info(
                    "Got most recent completed session for [%s] with "
                    "start time [%s]",
                    region.region_code,
                    session.start,
                )
                persistence.infer_release_on_open_bookings(
                    region.region_code, session.start,
                    _get_custody_status(region))
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

    return "", HTTPStatus.OK
Example #2
def deliver_emails_for_batch() -> Tuple[str, HTTPStatus]:
    """Deliver a batch of generated emails.

    Validates email addresses provided in the query params.

    Query parameters:
        batch_id: (required) Identifier for this batch
        redirect_address: (optional) An email address to which all emails will
            be sent. This can be used for redirecting all of the reports to a
            supervisor.
        cc_address: (optional) An email address to which all emails will be
            CC'd. This can be used for sending a batch of reports to multiple
            recipients. Multiple cc_address params can be given.
            Example:
            ?batch_id=123&cc_address=cc-one%40test.org&cc_address=cc_two%40test.org&cc_address=cc_three%40test.org
        subject_override: (optional) Override for the subject being sent.

    Returns:
        Text indicating the results of the run and an HTTP status

    Raises:
        Nothing. Catches everything so that we can always return a response to the request.
    """

    try:
        batch_id = get_only_str_param_value("batch_id", request.args)
        redirect_address = get_only_str_param_value("redirect_address",
                                                    request.args)
        cc_addresses = get_str_param_values("cc_address", request.args)
        subject_override = get_only_str_param_value("subject_override",
                                                    request.args,
                                                    preserve_case=True)

        validate_email_address(redirect_address)
        for cc_address in cc_addresses:
            validate_email_address(cc_address)
    except ValueError as error:
        logging.error(error)
        return str(error), HTTPStatus.BAD_REQUEST

    if not batch_id:
        msg = "Query parameter 'batch_id' not received"
        logging.error(msg)
        return msg, HTTPStatus.BAD_REQUEST

    success_count, failure_count = email_delivery.deliver(
        batch_id,
        redirect_address=redirect_address,
        cc_addresses=cc_addresses,
        subject_override=subject_override,
    )

    redirect_text = (f"to the redirect email address {redirect_address}"
                     if redirect_address else "")
    cc_addresses_text = (f"CC'd {','.join(cc_addresses)}."
                         if cc_addresses else "")

    return (
        f"Sent {success_count} emails {redirect_text}. {cc_addresses_text} "
        f"{failure_count} emails failed to send",
        HTTPStatus.OK,
    )
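
The handler above collects batch_id, redirect_address, and any number of repeated cc_address values from the query string. As a rough illustration only, a caller could exercise it as in the sketch below; the host and route are placeholders, since the actual URL routing is not shown in these examples.

# A hypothetical caller for the handler above. The host and route below are
# placeholders; only the query parameters mirror the docstring's example.
import requests

response = requests.get(
    "https://example-host.appspot.com/reporting/deliver_emails_for_batch",
    params={
        "batch_id": "123",
        "redirect_address": "supervisor@test.org",
        # A list value makes requests repeat the cc_address parameter, which
        # get_str_param_values("cc_address", request.args) reads back as a list.
        "cc_address": ["cc-one@test.org", "cc_two@test.org", "cc_three@test.org"],
    },
    timeout=30,
)
print(response.status_code, response.text)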
Example #3
def scraper_resume():
    """Request handler to resume one or several stopped scrapers

    Resumes scraping for each region and scrape type in request.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        A response message and an HTTP status code.
    """
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args))
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in scrape_regions:

        for scrape_type in scrape_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Sleep briefly to help avoid a race condition between creating
            # the new session and updating it with the first task.
            time.sleep(5)

            scraper = regions.get_region(region).get_scraper()
            scraper.resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)
Example #4
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                cloud_task_manager.create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when checking region [%s]", region_code
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
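
check_for_finished_scrapers, scraper_start, and scraper_stop all share the same fan-out structure: submit one job per region to a ThreadPoolExecutor, map each future back to its region, and collect failures as the futures complete. The stand-alone sketch below isolates that structure with a placeholder check function; it illustrates the pattern rather than reproducing the project's code.

# Stand-alone sketch of the submit/as_completed fan-out used in the handlers
# above. check_region is a placeholder for the real per-region work.
from concurrent import futures
from typing import List


def check_region(region_code: str) -> None:
    if region_code == "us_xx":
        raise RuntimeError("simulated failure")


def check_all(region_codes: List[str]) -> List[str]:
    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        # Map each future back to the region it was submitted for.
        future_to_region = {
            executor.submit(check_region, code): code for code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            try:
                future.result()  # Re-raises any exception from the worker.
            except Exception:
                failed_regions.append(region_code)
    return failed_regions


print(check_all(["us_mo", "us_wa", "us_xx"]))  # ['us_xx']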
Example #5
 def test_get_str_param_values(self):
     self.assertEqual(
         params.get_str_param_values('region', PARAMS), ['us_mo', 'us_wa'])
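
This test suggests the helper's contract: when a key appears multiple times in the query parameters, every value comes back as a list. A minimal stand-in with that behavior, assuming a werkzeug-style MultiDict for the params object, might look like the following; it is not the project's actual implementation.

# Hypothetical stand-in for get_str_param_values, assuming MultiDict-style args.
from typing import List
from werkzeug.datastructures import MultiDict


def get_str_param_values(name: str, args: MultiDict) -> List[str]:
    # getlist returns every value supplied for the key, or [] if it is absent.
    return args.getlist(name)


PARAMS = MultiDict([("region", "us_mo"), ("region", "us_wa")])
assert get_str_param_values("region", PARAMS) == ["us_mo", "us_wa"]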
Example #6
def scraper_start():
    """Request handler to start one or several running scrapers

    Kicks off new scrape session for each region and scrape type in request

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string, optional) If provided, only regions in this
            timezone are started.
        surname: (string, optional) Name to start the scrape at. Required if
            given_names is provided.
        given_names: (string, optional) Name to start the scrape at.

    Args:
        N/A

    Returns:
        A response message and an HTTP status code.
    """

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if most_recent_session and not most_recent_session.phase.has_persisted():
            raise Exception(
                "Session already running for region [%s]. Could "
                "not start a new session" % region
            )

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key, BATCH_PUBSUB_TYPE)
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_ingestor()

        current_session = sessions.create_session(scrape_key)

        # Sleep briefly to help avoid a race condition between creating
        # the new session and updating it with the first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        #
        # TODO(#1045): Either kill this, or ensure logs are correlated and
        # exceptions are passed up to the parent thread.
        load_docket_thread = threading.Thread(
            target=docket.load_target_list,
            args=(scrape_key, given_names, surname))
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided, start all regions. If it was, only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, or no regions found, see logs.',
                HTTPStatus.BAD_REQUEST)

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = {
            executor.submit(_start_scraper, region_code, scrape_type): (
                region_code,
                scrape_type,
            )
            for scrape_type in scrape_types
            for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when starting region [%s] for '
                        '[%s]',
                        region_code, scrape_type)
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info('Finished starting region [%s] for [%s].',
                                 region_code, scrape_type)

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose session
        # was opened during this call will be immediately skipped in the next
        # call when we check for open sessions. Any regions we failed to start
        # likely still had sessions opened and thus will be skipped, but it is
        # worth retrying anyway.
        return ('Failed to start regions: {}'.format(failed_starts),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
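
The comments inside _start_scraper above explain why the names docket is loaded on a background thread: with a large names list, loading takes a while, and starting the scraper first lets it begin as soon as items appear. The sketch below isolates that start-then-join pattern; load_target_list and start_scrape here are placeholders, not the project's real APIs.

# Stand-alone sketch of the background-loading pattern described above:
# kick off a slow load on a thread, start the main work without waiting
# for it, then join before returning.
import threading
import time


def load_target_list(names):
    time.sleep(2)  # Simulate a large names list taking a while to load.
    print(f"Loaded {len(names)} names into the docket")


def start_scrape():
    print("Scraper started; it will wait briefly for docket items to appear")


def start_with_background_docket(names):
    loader = threading.Thread(target=load_target_list, args=(names,))
    loader.start()  # Load the docket in the background.
    start_scrape()  # Start scraping before the docket is fully loaded.
    loader.join()   # Wait for the load to finish before returning.


start_with_background_docket(["smith", "jones", "garcia"])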
Example #7
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape isn't called
    individually for each scrape type. Doing so could create a race
    condition: each call would notice that the other scrape type was still
    running, kick off a resume effort with a delay, and the second call
    would then arrive to kill the other type but miss the (delayed, not yet
    in the task queue) resume call, effectively failing to stop the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string, optional) If provided, only regions in this
            timezone are stopped.
        respect_is_stoppable: (string, optional) Passed through to
            Scraper.stop_scrape.

    Args:
        N/A

    Returns:
        A response message and an HTTP status code.
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value("respect_is_stoppable",
                                               request.args)

    # If a timezone wasn't provided, stop all regions. If it was, only stop
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, see service logs.',
                HTTPStatus.BAD_REQUEST)

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = {
            executor.submit(_stop_scraper, region_code): region_code
            for region_code in scrape_regions
        }

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when stopping region [%s] for '
                        '[%s]',
                        region_code, scrape_types)
                    failed_stops.append(region_code)
                else:
                    logging.info('Finished stopping region [%s] for [%s].',
                                 region_code, scrape_types)

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose session
        # was closed during this call will be immediately skipped in the next
        # call as we won't find any sessions to close. Any regions we failed to
        # stop likely still had their sessions closed and thus will be skipped,
        # but it is worth retrying anyway.
        return ('Failed to stop regions: {}'.format(failed_stops),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
Example #8
 def test_get_str_param_values(self) -> None:
     self.assertEqual(params.get_str_param_values("region", PARAMS),
                      ["us_mo", "us_wa"])