Example 1
def test_measurements_with_push_tags(mock_mmap):
    tags = {"alice": "foo"}

    # inside of pushed tag
    with monitoring.push_tags({"eve": "baz"}):
        with monitoring.measurements(tags) as mmap:
            tags["bob"] = "bar"
            assert mmap == mock_mmap

    # outside of pushed tag
    with monitoring.measurements(tags) as mmap:
        tags["other"] = "thing"
        assert mmap == mock_mmap

    assert_recorded_tags(
        mock_mmap,
        [
            {
                "alice": "foo",
                "bob": "bar",
                "eve": "baz"
            },
            {
                "alice": "foo",
                "bob": "bar",
                "other": "thing"
            },
        ],
    )
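
The test above (and its variants later in the listing) relies on two behaviors of the monitoring helpers, which are not themselves shown: tags pushed with monitoring.push_tags are layered onto whatever monitoring.measurements records, and the tags dict is only read when the measurements block exits, which is why mutations made inside the block (tags["bob"] = "bar") still appear in assert_recorded_tags. A minimal sketch with that behavior, assuming a thread-local tag stack and a stand-in measurement map; the real Recidiviz implementation may differ:

import threading
from contextlib import contextmanager

_local = threading.local()  # per-thread stack of pushed tag dicts


def _tag_stack():
    if not hasattr(_local, "stack"):
        _local.stack = []
    return _local.stack


class _MeasurementMap:
    """Stand-in for the real measurement map (e.g. an OpenCensus-style map)."""

    def __init__(self):
        self.recorded_tags = []

    def measure_int_put(self, measure, value):
        pass  # the real map would stage `value` for `measure`

    def record(self, tags):
        self.recorded_tags.append(dict(tags))


@contextmanager
def push_tags(tags):
    """Layer extra tags onto anything recorded while the block is active."""
    _tag_stack().append(dict(tags))
    try:
        yield
    finally:
        _tag_stack().pop()


@contextmanager
def measurements(tags):
    """Yield a measurement map and record the merged tags when the block exits."""
    mmap = _MeasurementMap()
    try:
        yield mmap
    finally:
        # Recording happens in a finally block, so tags are captured even if the
        # body raises (as in the exception tests later in the listing), and any
        # mutation of `tags` made inside the block is still picked up here.
        merged = {}
        for pushed in _tag_stack():
            merged.update(pushed)
        merged.update(tags)
        mmap.record(merged)
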
Example 2
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info('Received request to process direct ingest job: [%s]',
                 request.values)
    region_code = get_str_param_value('region', request.values)

    if not region_code:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _get_ingest_args(json_data)

        if not ingest_args:
            return 'Could not parse ingest args', HTTPStatus.BAD_REQUEST
        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                if not ingest_args:
                    raise DirectIngestError(
                        msg=f"process_job was called with no IngestArgs.",
                        error_type=DirectIngestErrorType.INPUT_ERROR)

                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.run_ingest_job_and_kick_scheduler_on_completion(
                ingest_args)
    return '', HTTPStatus.OK
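
process_job wraps its body in monitoring.push_region_tag(region_code), which is also not defined in the listing. Given the handlers below that push {monitoring.TagKey.REGION: region_code} directly, it is presumably a thin convenience wrapper; a hedged sketch on top of the push_tags sketch above (the "region" key stands in for monitoring.TagKey.REGION, and the ingest_instance keyword seen in later examples is omitted):

from contextlib import contextmanager


@contextmanager
def push_region_tag(region_code):
    # Illustrative only: push the region code under the REGION tag key for the
    # duration of the block, mirroring monitoring.push_tags({TagKey.REGION: ...}).
    with push_tags({"region": region_code}):
        yield
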
Example 3
def test_measurements_with_push_tags_and_exception(mock_mmap) -> None:
    tags = {"alice": "foo"}

    # inside of pushed tag
    with pytest.raises(Exception):
        with monitoring.push_tags({"eve": "baz"}):
            with monitoring.measurements(tags) as mmap:
                tags["bob"] = "bar"
                assert mmap == mock_mmap
                raise Exception

    # outside of pushed tag
    with monitoring.measurements(tags) as mmap:
        tags["other"] = "thing"
        assert mmap == mock_mmap

    assert_recorded_tags(
        mock_mmap,
        [
            {
                "alice": "foo",
                "bob": "bar",
                "eve": "baz"
            },
            {
                "alice": "foo",
                "bob": "bar",
                "other": "thing"
            },
        ],
    )
Example 4
def infer_release():
    """Runs infer release for the given regions."""
    region_codes = validate_regions(
        get_str_param_values("region", request.args))
    regions = [get_region(region_code) for region_code in region_codes]

    for region in regions:
        with monitoring.push_tags(
            {monitoring.TagKey.REGION: region.region_code}):
            if region.agency_type != "jail":
                continue

            session = sessions.get_most_recent_completed_session(
                region.region_code)
            if session:
                logging.info(
                    "Got most recent completed session for [%s] with "
                    "start time [%s]",
                    region.region_code,
                    session.start,
                )
                persistence.infer_release_on_open_bookings(
                    region.region_code, session.start,
                    _get_custody_status(region))
                sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)

    return "", HTTPStatus.OK
Example 5
def test_measurements_with_push_tags_and_exception(mock_mmap):
    tags = {'alice': 'foo'}

    # inside of pushed tag
    with pytest.raises(Exception):
        with monitoring.push_tags({'eve': 'baz'}):
            with monitoring.measurements(tags) as mmap:
                tags['bob'] = 'bar'
                assert mmap == mock_mmap
                raise Exception

    # outside of pushed tag
    with monitoring.measurements(tags) as mmap:
        tags['other'] = 'thing'
        assert mmap == mock_mmap

    assert_recorded_tags(mock_mmap, [{
        'alice': 'foo',
        'bob': 'bar',
        'eve': 'baz'
    }, {
        'alice': 'foo',
        'bob': 'bar',
        'other': 'thing'
    }])
Example 6
def check_for_finished_scrapers():
    """Checks for any finished scrapers and kicks off next processes."""

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None
    cloud_task_manager = ScraperCloudTaskManager()

    @monitoring.with_region_tag
    def _check_finished(region_code: str):
        # If there are no sessions currently scraping, nothing to check.
        session = sessions.get_current_session(
            ScrapeKey(region_code, constants.ScrapeType.BACKGROUND)
        )
        if not session or not session.phase.is_actively_scraping():
            return

        if is_scraper_finished(region_code, cloud_task_manager):
            logging.info("Region [%s] has finished scraping.", region_code)

            if next_phase:
                logging.info(
                    "Enqueueing [%s] for region [%s].", next_phase, region_code
                )
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region_code, url=next_phase_url
                )

    region_codes = ingest_utils.validate_regions(
        get_str_param_values("region", request.args)
    )

    failed_regions = []
    with futures.ThreadPoolExecutor() as executor:
        future_to_region = {
            executor.submit(
                structured_logging.with_context(_check_finished), region_code
            ): region_code
            for region_code in region_codes
        }
        for future in futures.as_completed(future_to_region):
            region_code = future_to_region[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        "An exception occured when checking region [%s]", region_code
                    )
                    failed_regions.append(region_code)

    if failed_regions:
        return (
            "Failed to check regions: {}".format(failed_regions),
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return ("", HTTPStatus.OK)
Example 7
def read_and_persist() -> Tuple[str, HTTPStatus]:
    """Reads all of the messages from Datastore for a region and persists
    them to the database.
    """

    region = request.args.get("region")

    if not isinstance(region, str):
        raise ValueError(f"Expected string region, found [{region}]")

    batch_tags = {
        monitoring.TagKey.STATUS: "COMPLETED",
        monitoring.TagKey.PERSISTED: False,
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION: region}
    ), monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND
        )

        if not session:
            raise ValueError(
                f"Most recent session for region [{region}] is unexpectedly None"
            )

        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception(
                "An exception occurred in read and persist: %s", type(e).__name__
            )
            batch_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type) from e

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", next_phase, region)
                ScraperCloudTaskManager().create_scraper_phase_task(
                    region_code=region, url=url_for(next_phase)
                )
            return "", HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return "", HTTPStatus.ACCEPTED
Example 8
def write_record() -> Tuple[str, HTTPStatus]:
    ingest_info = None
    last_scraped_time = None
    region = None
    jurisdiction_id = None

    with monitoring.push_tags({monitoring.TagKey.REGION: region}):
        metadata = IngestMetadata(region, jurisdiction_id, last_scraped_time)  # type: ignore

        persistence.write(ingest_info, metadata)  # type: ignore

        return "", HTTPStatus.NOT_IMPLEMENTED
Example 9
def write_record():
    # TODO: Something like `ingest_info = protobuf.read(request.data)`
    ingest_info = None
    last_scraped_time = None
    region = None
    jurisdiction_id = None

    with monitoring.push_tags({monitoring.TagKey.REGION: region}):
        metadata = IngestMetadata(region, jurisdiction_id, last_scraped_time)

        persistence.write(ingest_info, metadata)

        return '', HTTPStatus.NOT_IMPLEMENTED
Example 10
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example 11
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, IngestArgs):
            raise DirectIngestError(
                msg=
                f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not ingest_args:
            return "Could not parse ingest args", HTTPStatus.BAD_REQUEST
        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example 12
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK
Example 13
def read_and_persist():
    """Reads all of the messages from Datastore for a region and persists
    them to the database.
    """

    region = request.args.get('region')
    batch_tags = {
        monitoring.TagKey.STATUS: 'COMPLETED',
        monitoring.TagKey.PERSISTED: False
    }
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags({monitoring.TagKey.REGION: region}), \
         monitoring.measurements(batch_tags) as measurements:
        measurements.measure_int_put(m_batch_count, 1)

        session = sessions.get_most_recent_completed_session(
            region, ScrapeType.BACKGROUND)
        scrape_type = session.scrape_type

        try:
            did_persist = persist_to_database(region, session.start)
            batch_tags[monitoring.TagKey.PERSISTED] = did_persist
        except Exception as e:
            logging.exception("An exception occurred in read and persist: %s",
                              type(e).__name__)
            batch_tags[monitoring.TagKey.STATUS] = 'ERROR: {}' \
                .format(type(e).__name__)
            sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
            raise BatchPersistError(region, scrape_type)

        if did_persist:
            next_phase = scrape_phase.next_phase(request.endpoint)
            sessions.update_phase(session, scrape_phase.ScrapePhase.RELEASE)
            if next_phase:
                logging.info("Enqueueing %s for region %s.", region,
                             next_phase)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=url_for(next_phase))
            return '', HTTPStatus.OK

        sessions.update_phase(session, scrape_phase.ScrapePhase.DONE)
        return '', HTTPStatus.ACCEPTED
Example 14
def scraper_start():
    """Request handler to start one or several running scrapers

    Kicks off new scrape session for each region and scrape type in request

    Example query:
        /scraper_control/start?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'
        timezone: (string) The timezone to scrape.
        surname: (string, optional) Name to start scrape at. Required if
            given_names provided
        given_names: (string, optional) Name to start scrape at

    Args:
        N/A

    Returns:
        N/A
    """

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _start_scraper(region, scrape_type):
        scrape_key = ScrapeKey(region, scrape_type)

        most_recent_session = \
            next(sessions.get_sessions(region_code=scrape_key.region_code,
                                       include_closed=True,
                                       most_recent_only=True,
                                       scrape_type=scrape_key.scrape_type),
                 None)
        if most_recent_session and not \
                most_recent_session.phase.has_persisted():
            raise Exception("Session already running for region [%s]. Could "
                            "not start a new session" % region)

        logging.info(
            "Purging pubsub queue for scrape_key: [%s] and pubsub_type: [%s]",
            scrape_key, BATCH_PUBSUB_TYPE)
        pubsub_helper.purge(scrape_key, BATCH_PUBSUB_TYPE)

        logging.info("Starting new scraper for: [%s]", scrape_key)
        scraper = regions.get_region(region).get_ingestor()

        current_session = sessions.create_session(scrape_key)

        # Help avoid race condition with new session info
        # vs updating that w/first task.
        time.sleep(1)

        # Clear prior query docket for this scrape type and start adding new
        # items in a background thread. In the case that there is a large
        # names list, loading it can take some time. Loading it in the
        # background allows us to start the scraper before it is fully
        # loaded.
        tracker.purge_docket_and_session(scrape_key)
        # Note, the request context isn't copied when launching this thread, so
        # any logs from within `load_target_list` will not be associated with
        # the start scraper request.
        #
        # TODO(#1045): Either kill this, or ensure logs are correlated and
        # exceptions are passed up to the parent thread.
        load_docket_thread = threading.Thread(
            target=docket.load_target_list,
            args=(scrape_key, given_names, surname))
        load_docket_thread.start()

        # Start scraper, if the docket is empty this will wait for a bounded
        # period of time for an item to be published (~90 seconds).
        logging.info("Starting [%s]/[%s] scrape...", region, scrape_type)
        scraper.start_scrape(scrape_type)

        sessions.update_phase(current_session, scrape_phase.ScrapePhase.SCRAPE)

        # Wait for the docket to be loaded
        load_docket_thread.join()

    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    region_value = get_str_param_values("region", request.args)
    # If a timezone wasn't provided start all regions. If it was only start
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        region_value, timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, or no regions found, see logs.',
                HTTPStatus.BAD_REQUEST)

    given_names = get_str_param_value("given_names", request.args, "")
    surname = get_str_param_value("surname", request.args, "")

    failed_starts = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_args = \
            {executor.submit(_start_scraper, region_code, scrape_type): \
                (region_code, scrape_type)
             for scrape_type in scrape_types
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_args):
            region_code, scrape_type = future_to_args[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when starting region [%s] for '
                        '[%s]',
                        region_code, scrape_type)
                    failed_starts.append((region_code, scrape_type))
                else:
                    logging.info('Finished starting region [%s] for [%s].',
                                 region_code, scrape_type)

    if failed_starts:
        # This causes the whole request to be retried. Any regions whose session
        # was opened during this call will be immediately skipped in the next
        # call when we check for open sessions. Any regions we failed to start
        # likely still had sessions opened and thus will be skipped, but it is
        # worth retrying anyway.
        return ('Failed to start regions: {}'.format(failed_starts),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
Example 15
def scraper_stop():
    """Request handler to stop one or several running scrapers.

    Note: Stopping any scrape type for a region involves purging the
    scraping task queue for that region, necessarily killing any other
    in-progress scrape types. Untargeted scrapes killed by this request
    handler will be noted and resumed a moment or two later.

    Unlike the other Scraper action methods, stop_scrape doesn't call
    individually for each scrape type. That could create a race condition,
    as each call noticed the other scrape type was running at the same
    time, kicked off a resume effort with a delay, and then our second
    call came to kill the other type and missed the (delayed / not yet
    in taskqueue) call - effectively not stopping the scrape.

    Instead, we send the full list of scrape_types to stop, and
    Scraper.stop_scrape is responsible for fan-out.

    Example query:
        /scraper_control/stop?region=us_ny&scrape_type=background

    URL parameters:
        region: (string) Region to take action for, or 'all'
        scrape_type: (string) Type of scrape to take action for, or 'all'

    Args:
        N/A

    Returns:
        N/A
    """
    timezone = ingest_utils.lookup_timezone(request.args.get("timezone"))
    respect_is_stoppable = get_str_param_value("respect_is_stoppable",
                                               request.args)

    # If a timezone wasn't provided stop all regions. If it was only stop
    # regions that match the timezone.
    scrape_regions = ingest_utils.validate_regions(
        get_str_param_values("region", request.args), timezone=timezone)
    scrape_types = ingest_utils.validate_scrape_types(
        get_str_param_values("scrape_type", request.args))

    next_phase = scrape_phase.next_phase(request.endpoint)
    next_phase_url = url_for(next_phase) if next_phase else None

    @structured_logging.copy_trace_id_to_thread
    @monitoring.with_region_tag
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)

    if not scrape_regions or not scrape_types:
        return ('Missing or invalid parameters, see service logs.',
                HTTPStatus.BAD_REQUEST)

    failed_stops = []
    with futures.ThreadPoolExecutor() as executor:
        # Start all of the calls.
        future_to_regions = \
            {executor.submit(_stop_scraper, region_code): region_code
             for region_code in scrape_regions}

        # Wait for all the calls to finish.
        for future in futures.as_completed(future_to_regions):
            region_code = future_to_regions[future]
            with monitoring.push_tags({monitoring.TagKey.REGION: region_code}):
                try:
                    future.result()
                except Exception:
                    logging.exception(
                        'An exception occurred when stopping region [%s] for '
                        '[%s]',
                        region_code, scrape_types)
                    failed_stops.append(region_code)
                else:
                    logging.info('Finished stopping region [%s] for [%s].',
                                 region_code, scrape_types)

    if failed_stops:
        # This causes the whole request to be retried. Any regions whose session
        # was closed during this call will be immediately skipped in the next
        # call as we won't find any sessions to close. Any regions we failed to
        # start likely still had their sessions closed and thus will be skipped,
        # but it is worth retrying anyway.
        return ('Failed to stop regions: {}'.format(failed_stops),
                HTTPStatus.INTERNAL_SERVER_ERROR)
    return ('', HTTPStatus.OK)
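
The @structured_logging.copy_trace_id_to_thread decorator used by the two scraper handlers above (and structured_logging.with_context in the earlier check_for_finished_scrapers example) suggests that request-scoped context, such as a trace id, is propagated onto the ThreadPoolExecutor worker threads. A generic sketch of that pattern with the standard library's contextvars; submit_with_context and request_trace_id are assumptions for illustration, not the project's actual API:

import contextvars
from concurrent import futures

# Hypothetical request-scoped value; stands in for whatever structured_logging tracks.
request_trace_id = contextvars.ContextVar("trace_id", default=None)


def submit_with_context(executor, fn, *args, **kwargs):
    # Snapshot the submitting thread's context so request-scoped values
    # (like the trace id) remain visible inside the worker thread.
    ctx = contextvars.copy_context()
    return executor.submit(ctx.run, fn, *args, **kwargs)


def _check_region(region_code):
    return region_code, request_trace_id.get()


request_trace_id.set("abc-123")
with futures.ThreadPoolExecutor() as executor:
    submitted = [submit_with_context(executor, _check_region, code)
                 for code in ("us_ny", "us_pa")]
    for future in futures.as_completed(submitted):
        print(future.result())  # each worker sees the trace id set above
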
Example 16
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    output_bucket_name = get_str_param_value("output_bucket",
                                             request.values,
                                             preserve_case=True)

    if not region_code or not output_bucket_name:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                GcsfsBucketPath(output_bucket_name)).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg=
                "raw_data_import was called with no GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if output_bucket_name != ingest_view_export_args.output_bucket_name:
            raise DirectIngestError(
                msg=
                f"Different buckets were passed in the url and request body\n"
                f"url: {output_bucket_name}\n"
                f"body: {ingest_view_export_args.output_bucket_name}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=GcsfsBucketPath(
                        ingest_view_export_args.output_bucket_name),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example 17
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=
                f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != ingest_args.file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                logging.warning(str(e))
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example 18
def work(region):
    """POST request handler to route chunk of scraper work

    Very thin shim to receive a chunk of work from the task queue, and call
    the relevant part of the specified scraper to execute it.

    All scraper work that hits a third-party website goes through this handler
    as small discrete tasks, so that we leverage the taskqueue's throttling and
    retry support for network requests to the sites (and don't DOS them).

    Because scraping will vary so significantly by region, this taskqueue
    handler is very lightweight - it really just accepts the POST for the task,
    and calls the relevant regional scraper to do whatever was asked. This
    allows it to stay agnostic to regional variation.

    Never called manually, so authentication is enforced in app.yaml.

    Form data must be a bytes-encoded JSON object with parameters listed below.

    URL Parameters:
        region: (string) Region code for the scraper in question.
        task: (string) Name of the function to call in the scraper
        params: (dict) Parameter payload to give the function being called
            (optional)

    Returns:
        Response code 200 if successful

        Any other response code will make taskqueue consider the task
        failed, and it will retry the task until it expires or succeeds
        (handling backoff logic, etc.)
    """
    # Verify this was actually a task queued by our app
    if "X-AppEngine-QueueName" not in request.headers:
        logging.error("Couldn't validate task was legit, exiting.")
        return ("", HTTPStatus.INTERNAL_SERVER_ERROR)
    queue_name = request.headers.get("X-AppEngine-QueueName")

    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    task = data["task"]
    params = QueueRequest.from_serializable(data["params"])

    if region != data["region"]:
        raise ValueError(
            "Region specified in task {} does not match region from url {}.".
            format(data["region"], region))

    task_tags = {monitoring.TagKey.STATUS: "COMPLETED"}
    # Note: measurements must be second so it receives the region tag.
    with monitoring.push_tags(
        {monitoring.TagKey.REGION:
         region}), monitoring.measurements(task_tags) as measurements:
        measurements.measure_int_put(m_tasks, 1)
        if not sessions.get_current_session(
                ScrapeKey(region, params.scrape_type)):
            task_tags[monitoring.TagKey.STATUS] = "SKIPPED"
            logging.info(
                "Queue [%s], skipping task [%s] for [%s] because it "
                "is not in the current session.",
                queue_name,
                task,
                region,
            )
            return ("", HTTPStatus.OK)
        logging.info("Queue [%s], processing task [%s] for [%s].", queue_name,
                     task, region)

        scraper = regions.get_region(region).get_ingestor()
        scraper_task = getattr(scraper, task)

        try:
            scraper_task(params)
        except Exception as e:
            task_tags[monitoring.TagKey.STATUS] = "ERROR: {}".format(
                type(e).__name__)
            raise RequestProcessingError(region, task, params) from e

        # Respond to the task queue to mark this task as done
        return ("", HTTPStatus.OK)
Example 19
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg=
                "raw_data_import was called with no GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=
                f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != data_import_args.raw_data_file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {data_import_args.raw_data_file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=data_import_args.raw_data_file_path.
                    bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK