def infer_release():
    """Run release inference for each requested jail region.

    Reads the ``region`` query parameters, validates them and resolves each
    code to a region. For every region whose ``agency_type`` is ``"jail"``,
    the most recent completed scrape session is looked up; when one exists,
    releases are inferred on open bookings relative to that session's start
    time and the session is moved to the ``DONE`` phase.

    Returns:
        A tuple of an empty body and ``HTTPStatus.OK``.
    """
    requested_codes = validate_regions(
        get_str_param_values("region", request.args))
    # Resolve every region up front so a bad code fails before any work runs.
    target_regions = list(map(get_region, requested_codes))

    for region in target_regions:
        code = region.region_code
        with monitoring.push_tags({monitoring.TagKey.REGION: code}):
            # Skip any region that is not a jail.
            if region.agency_type != "jail":
                continue

            latest = sessions.get_most_recent_completed_session(code)
            if not latest:
                continue

            logging.info(
                "Got most recent completed session for [%s] with "
                "start time [%s]",
                code,
                latest.start,
            )
            persistence.infer_release_on_open_bookings(
                code, latest.start, _get_custody_status(region))
            sessions.update_phase(latest, scrape_phase.ScrapePhase.DONE)

    return "", HTTPStatus.OK
def deliver_emails_for_batch() -> Tuple[str, HTTPStatus]:
    """Deliver a batch of generated emails.

    Validates the email addresses provided in the query params before
    handing the batch to ``email_delivery.deliver``.

    Query parameters:
        batch_id: (required) Identifier for this batch.
        redirect_address: (optional) An email address to which all emails
            will be sent. This can be used for redirecting all of the reports
            to a supervisor.
        cc_address: (optional) An email address to which all emails will be
            CC'd. This can be used for sending a batch of reports to multiple
            recipients. Multiple cc_address params can be given.
            Example:
            ?batch_id=123&cc_address=cc-one%40test.org&cc_address=cc_two%40test.org&cc_address=cc_three%40test.org
        subject_override: (optional) Override for the subject being sent.
            Its case is preserved when it is read.

    Returns:
        Text indicating the results of the run and an HTTP status:
        ``BAD_REQUEST`` when parameter parsing or address validation raises
        ``ValueError`` or when ``batch_id`` is missing, otherwise ``OK``.

    Raises:
        Nothing from parameter handling: ``ValueError`` is caught there and
        turned into a response. NOTE(review): the call to
        ``email_delivery.deliver`` is not wrapped, so anything it raises
        propagates to the caller -- confirm this is intended.
    """
    args = request.args
    try:
        batch_id = get_only_str_param_value("batch_id", args)
        redirect_address = get_only_str_param_value("redirect_address", args)
        cc_addresses = get_str_param_values("cc_address", args)
        subject_override = get_only_str_param_value(
            "subject_override", args, preserve_case=True)

        validate_email_address(redirect_address)
        for address in cc_addresses:
            validate_email_address(address)
    except ValueError as error:
        logging.error(error)
        return str(error), HTTPStatus.BAD_REQUEST

    if not batch_id:
        msg = "Query parameter 'batch_id' not received"
        logging.error(msg)
        return msg, HTTPStatus.BAD_REQUEST

    success_count, failure_count = email_delivery.deliver(
        batch_id,
        redirect_address=redirect_address,
        cc_addresses=cc_addresses,
        subject_override=subject_override,
    )

    # Optional fragments of the summary; each is empty when not applicable.
    if redirect_address:
        redirect_text = f"to the redirect email address {redirect_address}"
    else:
        redirect_text = ""

    if cc_addresses:
        cc_addresses_text = f"CC'd {','.join(cc_addresses)}."
    else:
        cc_addresses_text = ""

    summary = (
        f"Sent {success_count} emails {redirect_text}. {cc_addresses_text} "
        f"{failure_count} emails failed to send"
    )
    return summary, HTTPStatus.OK
def scraper_resume():
    """Request handler to resume one or several stopped scrapers.

    For each requested region and scrape type, a new session is created and
    the region's scraper is asked to resume that scrape type.

    Example query:
        /scraper_control/resume?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Returns:
        ``BAD_REQUEST`` with an explanatory message when either parameter
        fails validation or is missing, otherwise an empty body with ``OK``.
    """
    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args))
    requested_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not (region_codes and requested_types):
        return (
            "Missing or invalid parameters, see service logs.",
            HTTPStatus.BAD_REQUEST,
        )

    for region in region_codes:
        for scrape_type in requested_types:
            logging.info("Resuming [%s] scrape for [%s].", scrape_type, region)

            sessions.create_session(ScrapeKey(region, scrape_type))

            # Pause so the new session info is in place before the first
            # task tries to update it (helps avoid a race condition).
            time.sleep(5)

            regions.get_region(region).get_scraper().resume_scrape(scrape_type)

    return ("", HTTPStatus.OK)
def check_for_finished_scrapers():
    """Check each requested region for a finished scraper and enqueue the next phase.

    Every region from the ``region`` query parameters is checked on a thread
    pool. A region is only examined when its current BACKGROUND session is
    actively scraping; if its scraper has finished and a next phase exists
    for this endpoint, a scraper-phase task is created for that region.

    Returns:
        ``INTERNAL_SERVER_ERROR`` listing the regions whose check raised,
        otherwise an empty body with ``OK``.
    """
    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = None
    if next_phase:
        next_phase_url = url_for(next_phase)
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        """Enqueue the next phase for one region if its scrape has finished."""
        key = ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        current = sessions.get_current_session(key)

        # Nothing to check unless a session is currently scraping.
        if not (current and current.phase.is_actively_scraping()):
            return

        if not is_scraper_finished(region_code, cloud_task_manager):
            return

        logging.info("Region [%s] has finished scraping.", region_code)

        if not next_phase:
            return

        logging.info(
            "Enqueueing [%s] for region [%s].", next_phase, region_code
        )
        ScraperCloudTaskManager().create_scraper_phase_task(
            region_code=region_code, url=next_phase_url
        )

    codes_to_check = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        # Fan out one check per region, remembering which future is which.
        pending = {}
        for code in codes_to_check:
            task = executor.submit(
                structured_logging.with_context(_check_finished), code
            )
            pending[task] = code

        # Collect results as they finish; record failures, do not abort.
        for done in futures.as_completed(pending):
            code = pending[done]
            with monitoring.push_tags({monitoring.TagKey.REGION: code}):
                try:
                    done.result()
                except Exception:
                    logging.exception(
                        "An exception occured when checking region [%s]", code
                    )
                    failed_regions.append(code)

    if failed_regions:
        return (
            f"Failed to check regions: {failed_regions}",
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
def test_get_str_param_values(self):
    """get_str_param_values returns both 'region' values held in PARAMS."""
    actual = params.get_str_param_values('region', PARAMS)
    expected = ['us_mo', 'us_wa']
    self.assertEqual(actual, expected)
def scraper_start():
    """Request handler to start scrapers for one or several regions.

    Kicks off a new scrape session for each region and scrape type in the
    request. Each (region, scrape type) pair is started on a thread pool.

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string) The timezone to scrape. When given, only regions
            matching the timezone are started.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Returns:
        ``BAD_REQUEST`` when regions or scrape types are missing/invalid,
        ``INTERNAL_SERVER_ERROR`` listing the (region, scrape type) pairs
        that failed to start, otherwise an empty body with ``OK``.
    """

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        """Open a session for one region/scrape type and start its scraper."""
        scrape_key = ScrapeKey(region, scrape_type)

        session_iter = sessions.get_sessions(
            region_code=scrape_key.region_code,
            include_closed=True,
            most_recent_only=True,
            scrape_type=scrape_key.scrape_type,
        )
        latest_session = next(session_iter, None)

        # Refuse to start while the latest session has not yet persisted.
        if latest_session and not latest_session.phase.has_persisted():
            raise Exception(
                "Session already running for region [%s]. Could "
                "not start a new session" % region
            )

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_ingestor()

        new_session = sessions.create_session(scrape_key)

        # Pause so the new session info is in place before the first task
        # tries to update it (helps avoid a race condition).
        time.sleep(1)

        # Clear the prior query docket for this scrape type, then load new
        # items on a background thread. A large names list can take a while
        # to load; doing it in the background lets the scraper start before
        # the docket is fully loaded.
        tracker.purge_docket_and_session(scrape_key)

        # Note, the request context isn't copied when launching this thread,
        # so any logs from within `load_target_list` will not be associated
        # with the start scraper request.
        #
        # TODO(#1045): Either kill this, or ensure logs are correlated and
        # exceptions are passed up to the parent thread.
        docket_loader = threading.Thread(
            target=docket.load_target_list,
            args=(scrape_key, given_names, surname),
        )
        docket_loader.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(new_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        docket_loader.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    region_value = get_str_param_values("region", request.args)
    # Without a timezone every requested region is started; with one, only
    # the regions that match it.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not (scrape_regions and scrape_types):
        return (
            'Missing or invalid parameters, or no regions found, see logs.',
            HTTPStatus.BAD_REQUEST,
        )

    # Read by the `_start_scraper` closure above when it loads the docket.
    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls, remembering the arguments of each future.
        future_to_args = {}
        for scrape_type in scrape_types:
            for region_code in scrape_regions:
                task = executor.submit(
                    _start_scraper, region_code, scrape_type)
                future_to_args[task] = (region_code, scrape_type)

        # Wait for all the calls to finish.
        for finished in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[finished]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    finished.result()
                except Exception:
                    logging.exception(
                        'An exception occured when starting region [%s] for '
                        '[%s]',
                        region_code,
                        scrape_type,
                    )
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info(
                        'Finished starting region [%s] for [%s].',
                        region_code,
                        scrape_type,
                    )

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose
        # session was opened during this call will be immediately skipped in
        # the next call when we check for open sessions. Any regions we
        # failed to start likely still had sessions opened and thus will be
        # skipped, but it is worth retrying anyway.
        return (
            f'Failed to start regions: {failed_starts}',
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )

    return ('', HTTPStatus.OK)
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape is not called once
    per scrape type. Doing so could create a race condition: each call would
    notice the other scrape type running, kick off a delayed resume for it,
    and the second call would then miss that delayed (not yet in taskqueue)
    resume -- effectively not stopping the scrape. Instead, the full list of
    scrape types to stop is sent, and Scraper.stop_scrape is responsible for
    fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string, optional) When given, only regions matching the
            timezone are stopped.
        respect_is_stoppable: (string, optional) Passed through unchanged to
            the region scraper's ``stop_scrape``.

    Returns:
        ``BAD_REQUEST`` when regions or scrape types are missing/invalid,
        ``INTERNAL_SERVER_ERROR`` listing the regions that failed to stop,
        otherwise an empty body with ``OK``.
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value(
        "respect_is_stoppable", request.args)

    # Without a timezone every requested region is stopped; with one, only
    # the regions that match it.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = None
    if next_phase:
        next_phase_url = url_for(next_phase)

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        """Close a region's sessions, stop its scraper, enqueue the next phase."""
        # Close the session of every requested scrape type first.
        closed_sessions = [
            closed
            for scrape_type in scrape_types
            for closed in sessions.close_session(
                ScrapeKey(region, scrape_type))
        ]
        # No session was closed, so there is nothing to stop.
        if not closed_sessions:
            return

        for closed in closed_sessions:
            sessions.update_phase(closed, scrape_phase.ScrapePhase.PERSIST)

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            ingestor = regions.get_region(region).get_ingestor()
            was_stopped = ingestor.stop_scrape(
                scrape_types, respect_is_stoppable)
        finally:
            # Only move on to the next phase when the stop really happened.
            if next_phase and was_stopped:
                logging.info(
                    "Enqueueing %s for region [%s].", next_phase, region)
                queues.enqueue_scraper_phase(
                    region_code=region, url=next_phase_url)

    if not (scrape_regions and scrape_types):
        return (
            'Missing or invalid parameters, see service logs.',
            HTTPStatus.BAD_REQUEST,
        )

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls, remembering the region of each future.
        future_to_regions = {}
        for code in scrape_regions:
            future_to_regions[executor.submit(_stop_scraper, code)] = code

        # Wait for all the calls to finish.
        for finished in futures.as_completed(future_to_regions):
            region_code = future_to_regions[finished]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    finished.result()
                except Exception:
                    logging.exception(
                        'An exception occured when stopping region [%s] for '
                        '[%s]',
                        region_code,
                        scrape_types,
                    )
                    failed_stops.append(region_code)
                else:
                    logging.info(
                        'Finished stopping region [%s] for [%s].',
                        region_code,
                        scrape_types,
                    )

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose
        # session was closed during this call will be immediately skipped in
        # the next call as we won't find any sessions to close. Any regions
        # we failed to stop likely still had their sessions closed and thus
        # will be skipped, but it is worth retrying anyway.
        return (
            f'Failed to stop regions: {failed_stops}',
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )

    return ('', HTTPStatus.OK)
def test_get_str_param_values(self) -> None:
    """Both 'region' values in PARAMS come back from get_str_param_values."""
    region_values = params.get_str_param_values("region", PARAMS)
    self.assertEqual(first=region_values, second=["us_mo", "us_wa"])