Example #1
def infer_release():
    """Runs infer release for the given regions."""
    region_codes = validate_regions(
        get_str_param_values("region", request.args))
    regions = [get_region(region_code) for region_code in region_codes]

    for region in regions:
        with monitoring.push_tags(
            {monitoring.TagKey.REGION: region.region_code}):
            if region.agency_type != "jail":
                continue

            session = sessions.get_most_recent_completed_session(
                region.region_code)
            if session:
                logging.info(
                    "Got most recent completed session for [%s] with "
                    "start time [%s]",
                    region.region_code,
                    session.start,
                )
                persistence.infer_release_on_open_bookings(
                    region.region_code, session.start,
                    _get_custody_status(region))
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

    return "", HTTPStatus.OK
Example #2
    def _stop_scraper(region: str):
        logging.info("Trying to stop scraper for region [%s].", region)
        for scrape_type in scrape_types:
            key = ScrapeKey(region_code=region, scrape_type=scrape_type)
            session = sessions.get_current_session(key)
            if not session:
                logging.info(
                    "No [%s] scrape to stop for region: [%s]", scrape_type,
                    region)
                continue

            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_type,
                                                     respect_is_stoppable)
            if was_stopped:
                closed_sessions = sessions.close_session(key)
                for closed_session in closed_sessions:
                    sessions.update_phase(closed_session,
                                          scrape_phase.ScrapePhase.PERSIST)
                if next_phase:
                    logging.info("Enqueueing %s for region [%s].",
                                 next_phase, region)
                    ScraperCloudTaskManager().create_scraper_phase_task(
                        region_code=region,
                        url=next_phase_url)
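Both stop variants key their work off a `ScrapeKey`. A minimal sketch of the shape these examples assume; the field names follow the keyword arguments used above, but the actual class definition is not shown in the source:

import enum
from dataclasses import dataclass


class ScrapeType(enum.Enum):
    # Only BACKGROUND appears in these examples; other members are guesses.
    BACKGROUND = "background"
    SNAPSHOT = "snapshot"


@dataclass(frozen=True)
class ScrapeKey:
    region_code: str
    scrape_type: ScrapeType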
Example #3
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = next(
            sessions.get_sessions(
                region_code=scrape_key.region_code,
                include_closed=True,
                most_recent_only=True,
                scrape_type=scrape_key.scrape_type,
            ),
            None,
        )
        if (most_recent_session
                and not most_recent_session.phase.has_persisted()):
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key,
            BATCH_PUBSUB_TYPE,
        )
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_scraper()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        load_docket_thread = threading.Thread(
            target=structured_logging.with_context(docket.load_target_list),
            args=(scrape_key, given_names, surname),
        )
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()
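The comment above notes that the request context is not copied onto the docket-loading thread. The `structured_logging.with_context` wrapper addresses the logging half of that problem; a rough sketch of the general pattern using `contextvars` (the real helper's implementation may differ):

import contextvars


def with_context(fn):
    # Capture the calling thread's context variables now, so that fn runs
    # with the same logging/tracing context when executed on another thread.
    ctx = contextvars.copy_context()

    def wrapper(*args, **kwargs):
        return ctx.run(fn, *args, **kwargs)

    return wrapper

Usage mirrors the example: pass `with_context(docket.load_target_list)` as the `target` of the `threading.Thread` instead of the bare function.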
Example #4
    def test_scraper_success(self, mock_get_region, mock_client):
        mock_get_region.return_value = _mock_region()

        session = sessions.ScrapeSession.new(
            key=None,
            region=_REGION_CODE,
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.RELEASE)

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

        query = SessionFactory.for_schema_base(JailsBase).query(ScraperSuccess)
        result = one(query.all())

        region = regions.get_region(_REGION_CODE)
        self.assertEqual(result.jid, region.jurisdiction_id)
        self.assertEqual(result.date, _TODAY)
Example #5
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)
Example #6
    def test_scraper_success(self, mock_get_region: Mock,
                             _mock_client: Mock) -> None:
        mock_get_region.return_value = _mock_region()

        session = sessions.ScrapeSession.new(
            key=None,
            region=_REGION_CODE,
            scrape_type=constants.ScrapeType.BACKGROUND,
            phase=scrape_phase.ScrapePhase.RELEASE,
        )
        session.start = datetime.datetime.strptime("2000-01-01 9:01",
                                                   "%Y-%m-%d %H:%M")

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

        with SessionFactory.using_database(self.database_key,
                                           autocommit=False) as db_session:
            query = db_session.query(ScraperSuccess)
            result = one(query.all())

        region = regions.get_region(_REGION_CODE)
        self.assertEqual(result.jid, region.jurisdiction_id)
        self.assertEqual(result.date, _TODAY)
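Both tests rely on a helper `one` that asserts the query returned exactly one row; it behaves like `more_itertools.one`. A self-contained sketch for reference:

def one(iterable):
    # Return the sole item of the iterable, raising if there are zero
    # items or more than one.
    it = iter(iterable)
    try:
        value = next(it)
    except StopIteration:
        raise ValueError("Expected exactly one item, got none") from None
    try:
        next(it)
    except StopIteration:
        return value
    raise ValueError("Expected exactly one item, got more than one")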
Example #7
def read_and_persist() -> Tuple[str, HTTPStatus]:
    """Reads all of the messages from Datastore for a region and persists
    them to the database.
    """

    region = request.args.get("region")

    if not isinstance(region, str):
        raise ValueError(f"Expected string region, found [{region}]")

    batch_tags = {
        monitoring.TagKey.STATUS: "COMPLETED",
        monitoring.TagKey.PERSISTED: False,
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION: region}
    ), monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND
        )

        if not session:
            raise ValueError(
                f"Most recent session for region [{region}] is unexpectedly None"
            )

        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception(
                "An exception occurred in read and persist: %s", type(e).__name__
            )
            batch_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type) from e

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", next_phase, region)
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region, url=url_for(next_phase)
                )
            return "", HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return "", HTTPStatus.ACCEPTED
Example #8
def read_and_persist():
    """Reads all of the messages from Datastore for a region and persists
    them to the database.
    """

    region = request.args.get('region')
    batch_tags = {
        monitoring.TagKey.STATUS: 'COMPLETED',
        monitoring.TagKey.PERSISTED: False
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags({monitoring.TagKey.REGION: region}), \
         monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND)
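        # Unlike the typed variant above, this version does not check
        # whether session is None before reading session.scrape_type.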
        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception("An exception occurred in read and persist: %s",
                              type(e).__name__)
            batch_tags[monitoring.TagKey.STATUS] = 'ERROR: {}'.format(
                type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type) from e

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", region,
                             next_phase)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=url_for(next_phase))
            return '', HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return '', HTTPStatus.ACCEPTED