def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info('Received request to process direct ingest job: [%s]',
                 request.values)
    region_code = get_str_param_value('region', request.values)

    if not region_code:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _get_ingest_args(json_data)

        if not ingest_args:
            return 'Could not parse ingest args', HTTPStatus.BAD_REQUEST
        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise

            controller.run_ingest_job_and_kick_scheduler_on_completion(
                ingest_args)
    return '', HTTPStatus.OK
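Every handler on this page wraps its work in monitoring.push_region_tag so that metrics emitted inside the block are tagged with the region being processed. The monitoring module itself is not shown on this page; the following is only a minimal sketch of how such a context manager could be structured, and the stack-based mechanics are an assumption:

import contextlib
from typing import Iterator, List

_region_tags: List[str] = []

@contextlib.contextmanager
def push_region_tag(region_code: str) -> Iterator[None]:
    # Tag metrics emitted inside the block with the region being processed,
    # restoring the previous tag state on exit (even if the block raises).
    _region_tags.append(region_code)
    try:
        yield
    finally:
        _region_tags.pop()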
Example #2
def upload_from_sftp() -> Tuple[str, HTTPStatus]:
    """Connects to remote SFTP servers and uploads the files in both raw and normalized form
    to GCS buckets to start the ingest process. Should only be called from a task queue scheduler.

    Args:
        region_code (Optional[str]): required as part of the request to identify the region
        date_str (Optional[str]): ISO format date string used as the lower
            bound date from which to start pulling items from the SFTP server.
            If None, uses yesterday as the default lower bound; otherwise,
            creates a datetime from the string.
        bucket_str (Optional[str]): GCS bucket name used to override the
            destination to which the SFTP assets are downloaded and then moved
            for ingest (it is therefore used by both controllers). If None,
            uses the bucket determined by |region_code|; otherwise, uses this
            destination.
    """
    logging.info("Received request for uploading files from SFTP: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    date_str = get_str_param_value("date", request.values)
    bucket_str = get_str_param_value("bucket", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        lower_bound_update_datetime = (
            datetime.datetime.fromisoformat(date_str) if date_str is not None
            else datetime.datetime.utcnow() - datetime.timedelta(1))
        sftp_controller = DownloadFilesFromSftpController(
            project_id=metadata.project_id(),
            region=region_code,
            lower_bound_update_datetime=lower_bound_update_datetime,
            gcs_destination_path=bucket_str,
        )
        downloaded_items, unable_to_download_items = sftp_controller.do_fetch()

        if downloaded_items:
            _, unable_to_upload_files = UploadStateFilesToIngestBucketController(
                paths_with_timestamps=downloaded_items,
                project_id=metadata.project_id(),
                region=region_code,
                gcs_destination_path=bucket_str,
            ).do_upload()

            sftp_controller.clean_up()

            if unable_to_download_items or unable_to_upload_files:
                return (
                    f"Unable to download the following files: {unable_to_download_items}, "
                    f"and upload the following files: {unable_to_upload_files}",
                    HTTPStatus.MULTI_STATUS,
                )
        elif unable_to_download_items:
            return (
                f"Unable to download the following files {unable_to_download_items}",
                HTTPStatus.MULTI_STATUS,
            )
        else:
            return f"No items to download for {region_code}", HTTPStatus.MULTI_STATUS
    return "", HTTPStatus.OK
Example #3
def handle_sftp_files() -> Tuple[str, HTTPStatus]:
    """Schedules the SFTP downloads into the appropriate cloud task queue."""
    logging.info("Received request for handling SFTP files: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code, ingest_instance=None):
        try:
            region = _region_for_region_code(region_code)
            direct_ingest_cloud_task_manager = DirectIngestCloudTaskManagerImpl()
            direct_ingest_cloud_task_manager.create_direct_ingest_sftp_download_task(
                region)
        except FileNotFoundError as e:
            raise DirectIngestError(
                msg=f"Region [{region_code}] has no registered manifest",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            ) from e

    return "", HTTPStatus.OK
Example #4
def scheduler() -> Tuple[str, HTTPStatus]:
    """Checks the state of the ingest instance and schedules any tasks to be run."""
    logging.info("Received request for direct ingest scheduler: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    just_finished_job = get_bool_param_value("just_finished_job",
                                             request.values,
                                             default=False)

    # The name of the ingest bucket for the instance to schedule work out of
    bucket = get_str_param_value("bucket", request.args)

    if not region_code or just_finished_job is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path, allow_unlaunched=False)
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise

        controller.schedule_next_ingest_job(just_finished_job)
    return "", HTTPStatus.OK
Example #5
def kick_all_schedulers() -> None:
    """Kicks all ingest schedulers to restart ingest"""
    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        region = _region_for_region_code(region_code=region_code)
        if not region.is_ingest_launched_in_env():
            continue
        system_level = SystemLevel.for_region(region)
        for ingest_instance in DirectIngestInstance:
            with monitoring.push_region_tag(
                    region_code, ingest_instance=ingest_instance.value):
                try:
                    ingest_instance.check_is_valid_system_level(system_level)
                except DirectIngestInstanceError:
                    continue
                ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=system_level,
                    ingest_instance=ingest_instance,
                )
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_bucket,
                    allow_unlaunched=False,
                )

                controller.kick_scheduler(just_finished_job=False)
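kick_all_schedulers iterates over every value of DirectIngestInstance, and other handlers map an ingest bucket back to an instance with DirectIngestInstance.for_ingest_bucket. Only PRIMARY appears explicitly on this page, so this sketch of the enum's shape, including the SECONDARY value and the bucket-suffix rule, is an assumption for illustration:

import enum

class DirectIngestInstance(enum.Enum):
    PRIMARY = "PRIMARY"
    SECONDARY = "SECONDARY"  # assumed second instance

    @classmethod
    def for_ingest_bucket(cls, bucket_path) -> "DirectIngestInstance":
        # Assumption: secondary ingest buckets are distinguished by a
        # "-secondary" suffix on the bucket name.
        if bucket_path.bucket_name.endswith("-secondary"):
            return cls.SECONDARY
        return cls.PRIMARY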
Example #6
    def test_run_next_bq_import_export_task(self) -> None:
        """Synchronously executes the next queued BQ import/export task, but *does not
        remove it from the queue*."""
        if not self.bq_import_export_tasks:
            raise ValueError("BQ import/export job tasks should not be empty.")

        if self.num_finished_bq_import_export_tasks:
            raise ValueError("Must first pop last finished task.")

        if not self.controller:
            raise ValueError(
                "Controller is null - did you call set_controller()?")

        task = self.bq_import_export_tasks[0]
        task_id = task[0]
        args = task[1]

        with monitoring.push_region_tag(self.controller.region.region_code,
                                        self.controller.ingest_instance.value):
            if task_id.endswith("raw_data_import"):
                if not isinstance(args, GcsfsRawDataBQImportArgs):
                    raise ValueError(f"Unexpected args type {type(args)}")

                self.controller.do_raw_data_import(data_import_args=args)
            elif task_id.endswith("ingest_view_export"):
                if not isinstance(args, GcsfsIngestViewExportArgs):
                    raise ValueError(f"Unexpected args type {type(args)}")

                self.controller.do_ingest_view_export(
                    ingest_view_export_args=args)
            else:
                raise ValueError(f"Unexpected task id [{task_id}]")
        self.num_finished_bq_import_export_tasks += 1
Example #7
    def test_run_next_scheduler_task(self) -> None:
        """Synchronously executes the next queued scheduler task, but *does not
        remove it from the queue*."""

        if not self.scheduler_tasks:
            raise ValueError("Scheduler job tasks should not be empty.")

        if self.num_finished_scheduler_tasks:
            raise ValueError("Must first pop last finished task.")

        if not self.controller:
            raise ValueError(
                "Controller is null - did you call set_controller()?")

        task = self.scheduler_tasks[0]
        task_id = task[0]

        with monitoring.push_region_tag(self.controller.region.region_code,
                                        self.controller.ingest_instance.value):
            ingest_bucket_path = task[1]
            if self.controller.ingest_bucket_path != ingest_bucket_path:
                raise ValueError(
                    f"Task request [{task_id}] for ingest bucket "
                    f"[{ingest_bucket_path}] does not match registered "
                    f"controller ingest bucket "
                    f"[{self.controller.ingest_bucket_path}].")
            if task_id.endswith("schedule"):
                self.controller.schedule_next_ingest_job(
                    just_finished_job=task[2])
            elif task_id.endswith("handle_new_files"):
                self.controller.handle_new_files(can_start_ingest=task[2])
            else:
                raise ValueError(f"Unexpected task id [{task_id}]")
        self.num_finished_scheduler_tasks += 1
Example #8
    def test_run_next_scheduler_task(self) -> None:
        """Synchronously executes the next queued scheduler task, but *does not
        remove it from the queue*."""

        if not self.scheduler_tasks:
            raise ValueError("Scheduler job tasks should not be empty.")

        if self.num_finished_scheduler_tasks:
            raise ValueError("Must first pop last finished task.")

        if not self.controller:
            raise ValueError("Controller is null - did you call set_controller()?")

        task = self.scheduler_tasks[0]
        task_id = task[0]

        with monitoring.push_region_tag(self.controller.region.region_code):
            if task_id.endswith("schedule"):
                self.controller.schedule_next_ingest_job_or_wait_if_necessary(
                    just_finished_job=task[1]
                )
            elif task_id.endswith("handle_new_files"):
                if not isinstance(self.controller, GcsfsDirectIngestController):
                    raise ValueError(
                        f"Unexpected controller type {type(self.controller)}"
                    )
                self.controller.handle_new_files(can_start_ingest=task[1])
            else:
                raise ValueError(f"Unexpected task id [{task_id}]")
        self.num_finished_scheduler_tasks += 1
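The three test helpers above run synchronously against fake task queues held on self. A minimal sketch of the state they assume, with tuple layouts inferred from the task[0]/task[1]/task[2] indexing and all names being assumptions, is:

from typing import Any, List, Optional, Tuple

class FakeDirectIngestCloudTaskManager:
    def __init__(self) -> None:
        # Each queued task is a (task_id, args, ...) tuple; the helpers peek
        # at index 0 and track finished counts separately so the last
        # finished task can be inspected before it is popped.
        self.scheduler_tasks: List[Tuple[Any, ...]] = []
        self.bq_import_export_tasks: List[Tuple[str, Any]] = []
        self.process_job_tasks: List[Tuple[str, Any]] = []
        self.num_finished_scheduler_tasks = 0
        self.num_finished_bq_import_export_tasks = 0
        self.num_finished_process_job_tasks = 0
        self.controller: Optional[Any] = None

    def set_controller(self, controller: Any) -> None:
        self.controller = controller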
Example #9
def ensure_all_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    logging.info(
        'Received request for direct ingest ensure_all_file_paths_normalized: '
        '%s', request.values)

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        with monitoring.push_region_tag(region_code):
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR)

            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region, can_start_ingest=can_start_ingest)
    return '', HTTPStatus.OK
Example #10
def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct ingest
    bucket. Will trigger a job that deals with normalizing and splitting the
    file as is appropriate, then start the scheduler if allowed.
    """
    region_code = get_str_param_value('region', request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value('bucket', request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value('relative_file_path',
                                             request.args,
                                             preserve_case=True)
    start_ingest = get_bool_param_value('start_ingest', request.args,
                                        default=False)

    if (not region_code or not bucket
            or not relative_file_path or start_ingest is None):
        return f'Bad parameters [{request.args}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        controller = controller_for_region_code(region_code,
                                                allow_unlaunched=True)
        if not isinstance(controller, GcsfsDirectIngestController):
            raise DirectIngestError(
                msg=f"Unexpected controller type [{type(controller)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path)

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return '', HTTPStatus.OK
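handle_direct_ingest_file only acts when the parsed path is a GcsfsFilePath, which suggests GcsfsPath.from_bucket_and_blob_name can also return a directory-like path. The class shapes and the trailing-slash dispatch rule below are assumptions sketched for illustration:

from dataclasses import dataclass
from typing import Union

@dataclass(frozen=True)
class GcsfsFilePath:
    bucket_name: str
    blob_name: str

@dataclass(frozen=True)
class GcsfsDirectoryPath:
    bucket_name: str
    relative_path: str

def from_bucket_and_blob_name(
        bucket_name: str,
        blob_name: str) -> Union[GcsfsFilePath, GcsfsDirectoryPath]:
    # Assumption: a trailing "/" marks a directory placeholder object, so
    # callers must isinstance-check before treating the result as a file.
    if blob_name.endswith("/"):
        return GcsfsDirectoryPath(bucket_name, blob_name)
    return GcsfsFilePath(bucket_name, blob_name)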
Example #11
def ensure_all_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all file paths in the ingest buckets for all direct ingest states have properly normalized
    file names, to ensure that repeat uploads of files into those buckets don't fail or overwrite data."""
    logging.info(
        "Received request for direct ingest ensure_all_file_paths_normalized: "
        "%s",
        request.values,
    )

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        with monitoring.push_region_tag(region_code):
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region, can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
Example #12
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.
    """
    logging.info('Received request for direct ingest handle_new_files: %s',
                 request.values)
    region_code = get_str_param_value('region', request.values)
    can_start_ingest = get_bool_param_value('can_start_ingest',
                                            request.values, default=False)

    if not region_code or can_start_ingest is None:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        try:
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
        except DirectIngestError as e:
            if e.is_bad_request():
                return str(e), HTTPStatus.BAD_REQUEST
            raise

        if not isinstance(controller, GcsfsDirectIngestController):
            raise DirectIngestError(
                msg=f"Unexpected controller type [{type(controller)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return '', HTTPStatus.OK
Example #13
def update_raw_data_latest_views_for_state() -> Tuple[str, HTTPStatus]:
    """Updates raw data tables for a given state"""
    logging.info("Received request to do direct ingest raw data update: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        bq_client = BigQueryClientImpl(project_id=metadata.project_id())
        controller = DirectIngestRawDataTableLatestViewUpdater(
            region_code, metadata.project_id(), bq_client)
        controller.update_views_for_state()
    return "", HTTPStatus.OK
Example #14
    def test_run_next_process_job_task(self) -> None:
        """Synchronously executes the next queued process job task, but *does
        not remove it from the queue*."""
        if not self.process_job_tasks:
            raise ValueError("Process job tasks should not be empty.")

        if self.num_finished_process_job_tasks:
            raise ValueError("Must first pop last finished task.")

        if not self.controller:
            raise ValueError("Controller is null - did you call set_controller()?")

        task = self.process_job_tasks[0]

        with monitoring.push_region_tag(self.controller.region.region_code):
            self.controller.run_ingest_job_and_kick_scheduler_on_completion(task[1])
        self.num_finished_process_job_tasks += 1
Example #15
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=f"ingest_view_export was called with incorrect args type "
                f"[{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                    raise

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example #16
def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct ingest
    bucket. Will trigger a job that deals with normalizing and splitting the
    file as is appropriate, then start the scheduler if allowed.
    """
    region_code = get_str_param_value("region", request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value("relative_file_path",
                                             request.args,
                                             preserve_case=True)
    start_ingest = get_bool_param_value("start_ingest",
                                        request.args,
                                        default=False)

    if not region_code or not bucket or not relative_file_path or start_ingest is None:
        response = f"Bad parameters [{request.args}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path)

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return "", HTTPStatus.OK
Example #17
def ensure_all_raw_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all files in the ingest buckets for all direct ingest states have
    properly normalized  file names, to ensure that repeat uploads of files into those
    buckets don't fail or overwrite data. This provides a layer of protection against
    cloud function failures.
    """
    logging.info(
        "Received request for direct ingest ensure_all_raw_file_paths_normalized: "
        "%s",
        request.values,
    )

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        # The only type of file that wouldn't be normalized is a raw file, which
        # should only ever be in the PRIMARY bucket.
        ingest_instance = DirectIngestInstance.PRIMARY
        with monitoring.push_region_tag(region_code,
                                        ingest_instance=ingest_instance.value):
            ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.for_region(
                    _region_for_region_code(region_code)),
                ingest_instance=ingest_instance,
            )
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_bucket,
                    allow_unlaunched=True,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region,
                ingest_instance=controller.ingest_instance,
                ingest_bucket=controller.ingest_bucket_path,
                can_start_ingest=can_start_ingest,
            )
    return "", HTTPStatus.OK
Example #18
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no "
                "GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=f"raw_data_import was called with incorrect args type "
                f"[{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                    raise

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK
Example #19
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, IngestArgs):
            raise DirectIngestError(
                msg=f"process_job was called with incorrect args type "
                f"[{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example #20
def scheduler() -> Tuple[str, HTTPStatus]:
    logging.info('Received request for direct ingest scheduler: %s',
                 request.values)
    region_code = get_str_param_value('region', request.values)
    just_finished_job = get_bool_param_value('just_finished_job',
                                             request.values, default=False)

    if not region_code or just_finished_job is None:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        try:
            controller = controller_for_region_code(region_code)
        except DirectIngestError as e:
            if e.is_bad_request():
                return str(e), HTTPStatus.BAD_REQUEST
            raise

        controller.schedule_next_ingest_job_or_wait_if_necessary(
            just_finished_job)
    return '', HTTPStatus.OK
Example #21
def create_raw_data_latest_view_update_tasks() -> Tuple[str, HTTPStatus]:
    """Creates tasks for every direct ingest region with SQL preprocessing
    enabled to update the raw data table latest views.
    """
    raw_update_ctm = DirectIngestRawUpdateCloudTaskManager(
        metadata.project_id())

    for region_code in get_existing_region_dir_names():
        with monitoring.push_region_tag(region_code):
            region = get_region(region_code, is_direct_ingest=True)
            if region.are_raw_data_bq_imports_enabled_in_env():
                logging.info(
                    'Creating raw data latest view update task for region [%s]',
                    region_code)
                raw_update_ctm.create_raw_data_latest_view_update_task(
                    region_code)
            else:
                logging.info(
                    'Skipping raw data latest view update for region [%s] - raw data imports not enabled.',
                    region_code)
    return '', HTTPStatus.OK
Example #22
def create_raw_data_latest_view_update_tasks() -> Tuple[str, HTTPStatus]:
    """Creates tasks for every direct ingest region with SQL preprocessing
    enabled to update the raw data table latest views.
    """
    raw_update_ctm = DirectIngestRawUpdateCloudTaskManager()

    for region_code in get_existing_region_dir_names():
        with monitoring.push_region_tag(region_code, ingest_instance=None):
            region = _region_for_region_code(region_code)
            if region.is_ingest_launched_in_env():
                logging.info(
                    "Creating raw data latest view update task for region [%s]",
                    region_code,
                )
                raw_update_ctm.create_raw_data_latest_view_update_task(
                    region_code)
            else:
                logging.info(
                    "Skipping raw data latest view update for region [%s] - ingest not enabled.",
                    region_code,
                )
    return "", HTTPStatus.OK
Example #23
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.
    """
    logging.info("Received request for direct ingest handle_new_files: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    can_start_ingest = get_bool_param_value("can_start_ingest",
                                            request.values,
                                            default=False)
    bucket = get_str_param_value("bucket", request.values)

    if not region_code or can_start_ingest is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
Example #24
def kick_all_schedulers() -> None:
    """Kicks all ingest schedulers to restart ingest"""
    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        with monitoring.push_region_tag(region_code):
            region = region_for_region_code(region_code=region_code)
            if not region.is_ingest_launched_in_env():
                continue
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=False)
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            controller.kick_scheduler(just_finished_job=False)
Example #25
def wrapped_fn(*args: Any, **kwargs: Any) -> None:
    with monitoring.push_region_tag(region_code):
        fn(*args, **kwargs)
Example #26
def wrapped_fn(*args, **kwargs):
    with monitoring.push_region_tag(region_code):
        fn(*args, **kwargs)
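Examples #25 and #26 show only the inner function of a decorator; fn and region_code come from an enclosing scope that this page does not include. A plausible enclosing decorator, whose structure is an assumption, might look like:

import functools
from typing import Any, Callable

def with_region_tag(region_code: str) -> Callable[[Callable], Callable]:
    def decorator(fn: Callable) -> Callable:
        @functools.wraps(fn)
        def wrapped_fn(*args: Any, **kwargs: Any) -> None:
            # monitoring is assumed to be imported by the enclosing module.
            with monitoring.push_region_tag(region_code):
                fn(*args, **kwargs)
        return wrapped_fn
    return decorator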
Example #27
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path",
                                    request.values,
                                    preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                gcsfs_path.bucket_path).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=f"process_job was called with incorrect args type "
                f"[{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if gcsfs_path != ingest_args.file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                logging.warning(str(e))
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
Example #28
def upload_from_sftp() -> Tuple[str, HTTPStatus]:
    """Connects to remote SFTP servers and uploads the files in both raw and normalized form
    to GCS buckets to start the ingest process. Should only be called from a task queue scheduler.

    Args:
        region_code (Optional[str]): required as part of the request to identify the region
        date_str (Optional[str]): ISO format date string used as the lower
            bound date from which to start pulling items from the SFTP server.
            If None, uses yesterday as the default lower bound; otherwise,
            creates a datetime from the string.
        bucket_str (Optional[str]): GCS bucket name used to override the
            destination to which the SFTP assets are downloaded and then moved
            for ingest (it is therefore used by both controllers). If None,
            uses the bucket determined by |region_code|; otherwise, uses this
            destination.
    """
    logging.info("Received request for uploading files from SFTP: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    date_str = get_str_param_value("date", request.values)
    bucket_str = get_str_param_value("bucket", request.values)
    gcs_destination_path = GcsfsBucketPath(bucket_str) if bucket_str else None

    if not region_code:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code, ingest_instance=None):
        lower_bound_update_datetime = (
            datetime.datetime.fromisoformat(date_str) if date_str is not None
            else datetime.datetime.now(tz=pytz.UTC) - datetime.timedelta(1))
        sftp_controller = DownloadFilesFromSftpController(
            project_id=metadata.project_id(),
            region=region_code,
            lower_bound_update_datetime=lower_bound_update_datetime,
            gcs_destination_path=gcs_destination_path,
        )
        download_result = sftp_controller.do_fetch()

        with monitoring.measurements(
                {TagKey.SFTP_TASK_TYPE: "download"}) as download_measurements:
            download_measurements.measure_int_put(
                m_sftp_attempts,
                len(download_result.successes) + len(download_result.failures),
            )
            download_measurements.measure_int_put(
                m_sftp_errors, len(download_result.failures))

        unable_to_download_text = (
            f"Unable to download the following files: {download_result.failures}"
            if download_result.failures else "")
        skipped_download_text = (
            f"Skipped downloading the following files: {download_result.skipped}"
            if download_result.skipped else "")

        if not download_result.successes and download_result.failures:
            return (
                f"All files failed to download. {unable_to_download_text}",
                HTTPStatus.BAD_REQUEST,
            )

        if not download_result.successes and not download_result.skipped:
            return f"No items to download for {region_code}", HTTPStatus.BAD_REQUEST

        if not download_result.successes and download_result.skipped:
            return f"All files skipped. {skipped_download_text}", HTTPStatus.OK

        if not download_result.successes:
            raise ValueError("Expected non-empty successes here.")

        upload_controller = UploadStateFilesToIngestBucketController(
            paths_with_timestamps=download_result.successes,
            project_id=metadata.project_id(),
            region=region_code,
            gcs_destination_path=gcs_destination_path,
        )
        upload_result = upload_controller.do_upload()

        with monitoring.measurements(
                {TagKey.SFTP_TASK_TYPE: "upload"}) as upload_measurements:
            upload_measurements.measure_int_put(
                m_sftp_attempts,
                len(upload_result.successes) + len(upload_result.failures),
            )
            upload_measurements.measure_int_put(m_sftp_errors,
                                                len(upload_result.failures))

        sftp_controller.clean_up()

        unable_to_upload_text = (
            f"Unable to upload the following files: {upload_result.failures}"
            if upload_result.failures else "")
        skipped_text = (
            f"Skipped uploading the following files: {upload_controller.skipped_files}"
            if upload_result.skipped else "")
        if not upload_result.successes and not upload_result.skipped:
            return (
                f"{unable_to_download_text}"
                f" All files failed to upload. {unable_to_upload_text}"
                f"{skipped_text}",
                HTTPStatus.BAD_REQUEST,
            )

        if download_result.failures or upload_result.failures:
            return (
                f"{unable_to_download_text}"
                f" {unable_to_upload_text}"
                f"{skipped_text}",
                HTTPStatus.MULTI_STATUS,
            )

        if not upload_result.successes and upload_result.skipped:
            return f"All files skipped. {skipped_text}", HTTPStatus.OK

        if upload_result.skipped:
            return (
                f"{unable_to_download_text}"
                f" {unable_to_upload_text}"
                f"{skipped_text}",
                HTTPStatus.MULTI_STATUS,
            )

        # Trigger ingest to handle copied files (in case queue has emptied already while
        # ingest was paused).
        direct_ingest_cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        direct_ingest_cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=_region_for_region_code(region_code),
            ingest_instance=DirectIngestInstance.PRIMARY,
            ingest_bucket=upload_controller.destination_ingest_bucket,
            can_start_ingest=True,
        )

        return "", HTTPStatus.OK
Example #29
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    output_bucket_name = get_str_param_value("output_bucket",
                                             request.values,
                                             preserve_case=True)

    if not region_code or not output_bucket_name:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                GcsfsBucketPath(output_bucket_name)).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="ingest_view_export was called with no "
                "GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=f"ingest_view_export was called with incorrect args type "
                f"[{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if output_bucket_name != ingest_view_export_args.output_bucket_name:
            raise DirectIngestError(
                msg="Different buckets were passed in the url and request body\n"
                f"url: {output_bucket_name}\n"
                f"body: {ingest_view_export_args.output_bucket_name}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=GcsfsBucketPath(
                        ingest_view_export_args.output_bucket_name),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                    raise

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
Example #30
def wrapped_fn(*args: Any, **kwargs: Any) -> None:
    with monitoring.push_region_tag(region_code, ingest_instance.value):
        fn(*args, **kwargs)