コード例 #1
0
    def test_collect_and_build_ingest_view_builders(
        self, _name: str, project_id: str, environment: GCPEnvironment
    ) -> None:
        """Collects and builds every ingest view for each region directory.

        The GCP environment lookup and the project id are patched to the
        supplied values for the duration of the test. `_name` is not used in
        the body.
        """
        with patch(
            "recidiviz.utils.environment.get_gcp_environment",
            return_value=environment,
        ), patch(
            "recidiviz.utils.metadata.project_id",
            return_value=project_id,
        ):
            for region_code in self.region_dir_names:
                region = get_region(
                    region_code,
                    is_direct_ingest=True,
                    region_module_override=self.region_module_override,
                )

                # The controller is always constructed under a fixed project
                # id, temporarily overriding the outer project id patch.
                with patch(
                    "recidiviz.utils.metadata.project_id",
                    return_value="recidiviz-456",
                ):
                    primary_bucket = self.primary_ingest_bucket_for_region(
                        region)
                    controller = DirectIngestControllerFactory.build(
                        ingest_bucket_path=primary_bucket,
                        allow_unlaunched=True,
                    )

                collector = DirectIngestPreProcessedIngestViewCollector(
                    region, controller.get_file_tag_rank_list())
                for view_builder in collector.collect_view_builders():
                    view_builder.build()
コード例 #2
0
 def test_build_throws_in_prod_region_only_launched_in_staging(self) -> None:
     """Checks that build() rejects a region whose environment is staging.

     The expected error message names the production environment, so the
     test presumably runs with production configured outside this method
     (e.g. by a decorator or fixture) -- TODO confirm.
     """
     staging_only_region = fake_region(
         region_code="us_xx",
         environment="staging",
         is_direct_ingest=True,
         region_module=templates,
     )
     region_lookup = Mock(return_value=staging_only_region)
     with patch("recidiviz.utils.regions.get_region", region_lookup):
         bucket_path = gcsfs_direct_ingest_bucket_for_region(
             region_code=staging_only_region.region_code,
             system_level=SystemLevel.for_region(staging_only_region),
             ingest_instance=DirectIngestInstance.PRIMARY,
         )
         with self.assertRaises(DirectIngestError) as raised:
             DirectIngestControllerFactory.build(
                 ingest_bucket_path=bucket_path,
                 allow_unlaunched=False,
             )
         self.assertEqual(
             str(raised.exception),
             "Bad environment [production] for region [us_xx].",
         )
コード例 #3
0
def kick_all_schedulers() -> None:
    """Kicks all ingest schedulers to restart ingest.

    Walks every supported direct ingest region code, skips regions whose
    ingest is not launched in the current environment, and for each ingest
    instance that is valid for the region's system level builds a controller
    and kicks its scheduler with just_finished_job=False.
    """
    for region_code in get_supported_direct_ingest_region_codes():
        region = _region_for_region_code(region_code=region_code)
        if region.is_ingest_launched_in_env():
            system_level = SystemLevel.for_region(region)
            for instance in DirectIngestInstance:
                with monitoring.push_region_tag(
                    region_code, ingest_instance=instance.value
                ):
                    try:
                        instance.check_is_valid_system_level(system_level)
                    except DirectIngestInstanceError:
                        # This instance does not apply to the region's system
                        # level; nothing to kick.
                        pass
                    else:
                        bucket_path = gcsfs_direct_ingest_bucket_for_region(
                            region_code=region_code,
                            system_level=system_level,
                            ingest_instance=instance,
                        )
                        controller = DirectIngestControllerFactory.build(
                            ingest_bucket_path=bucket_path,
                            allow_unlaunched=False,
                        )
                        controller.kick_scheduler(just_finished_job=False)
コード例 #4
0
def scheduler() -> Tuple[str, HTTPStatus]:
    """Checks the state of the ingest instance and schedules any tasks to be run.

    Reads `region` and `just_finished_job` from the request values and
    `bucket` from the request args.

    Returns:
        A (body, status) tuple: an empty body with OK on success, or an error
        description with BAD_REQUEST when a parameter is missing or the
        controller build fails with a bad-request DirectIngestError.

    Raises:
        DirectIngestError: if building the controller fails with an error
            that is not a bad request.
    """
    logging.info(
        "Received request for direct ingest scheduler: %s", request.values
    )
    region_code = get_str_param_value("region", request.values)
    just_finished_job = get_bool_param_value(
        "just_finished_job", request.values, default=False
    )
    # The bucket name for ingest instance to schedule work out of
    bucket = get_str_param_value("bucket", request.args)

    has_required_params = (
        region_code and just_finished_job is not None and bucket
    )
    if not has_required_params:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket)
    ingest_instance = DirectIngestInstance.for_ingest_bucket(bucket_path)

    with monitoring.push_region_tag(
        region_code, ingest_instance=ingest_instance.value
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=False,
            )
        except DirectIngestError as e:
            # Only bad-request errors are turned into a response; anything
            # else propagates to the caller.
            if not e.is_bad_request():
                raise e
            logging.error(str(e))
            return str(e), HTTPStatus.BAD_REQUEST

        controller.schedule_next_ingest_job(just_finished_job)
    return "", HTTPStatus.OK
コード例 #5
0
 def test_build_for_unsupported_region_throws(self) -> None:
     """Checks that build() raises DirectIngestError for region us_xx.

     The error text is expected to name both the region and the project.
     """
     bucket_path = gcsfs_direct_ingest_bucket_for_region(
         region_code="us_xx",
         system_level=SystemLevel.STATE,
         ingest_instance=DirectIngestInstance.PRIMARY,
     )
     expected_message = (
         "Unsupported direct ingest region [us_xx] in project [recidiviz-456]"
     )
     with self.assertRaises(DirectIngestError) as raised:
         DirectIngestControllerFactory.build(
             ingest_bucket_path=bucket_path,
             allow_unlaunched=False,
         )
     self.assertEqual(str(raised.exception), expected_message)
コード例 #6
0
    def test_build_gcsfs_ingest_controller(self) -> None:
        """Checks that the factory returns the controller from the region module.

        The us_nd controller module is swapped in sys.modules for a mock whose
        UsNdController() call returns a known autospec'd controller.
        """
        expected_controller = create_autospec(spec=UsNdController)
        fake_module = Mock()
        fake_module.UsNdController.return_value = expected_controller
        module_overrides = {
            "recidiviz.ingest.direct.regions.us_nd.us_nd_controller":
            fake_module,
        }

        with patch.dict("sys.modules", module_overrides):
            built = DirectIngestControllerFactory.build_gcsfs_ingest_controller(
                region_code="us_nd",
                fs=Mock(),
            )
            assert built is expected_controller
コード例 #7
0
 def test_region_controller_builds(self) -> None:
     """Builds a controller for every region directory and checks its type.

     Each build happens with the project id patched to a fixed value and with
     unlaunched regions allowed.
     """
     project_id_target = "recidiviz.utils.metadata.project_id"
     for region_code in self.region_dir_names:
         region = get_region(
             region_code,
             is_direct_ingest=True,
             region_module_override=self.region_module_override,
         )
         with patch(project_id_target, return_value="recidiviz-456"):
             primary_bucket = self.primary_ingest_bucket_for_region(region)
             built = DirectIngestControllerFactory.build(
                 ingest_bucket_path=primary_bucket,
                 allow_unlaunched=True,
             )
             self.test.assertIsNotNone(built)
             self.test.assertIsInstance(built, BaseDirectIngestController)
コード例 #8
0
def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct ingest
    bucket. Will trigger a job that deals with normalizing and splitting the
    file as is appropriate, then start the scheduler if allowed.

    Returns:
        A (body, status) tuple: an empty body with OK on success, or an error
        description with BAD_REQUEST when a parameter is missing or the
        controller build fails with a bad-request DirectIngestError.

    Raises:
        DirectIngestError: if building the controller fails with an error
            that is not a bad request.
    """
    region_code = get_str_param_value("region", request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value(
        "relative_file_path", request.args, preserve_case=True
    )
    start_ingest = get_bool_param_value(
        "start_ingest", request.args, default=False
    )

    missing_str_param = not (region_code and bucket and relative_file_path)
    if missing_str_param or start_ingest is None:
        response = f"Bad parameters [{request.args}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)
    ingest_instance = DirectIngestInstance.for_ingest_bucket(bucket_path)

    with monitoring.push_region_tag(
        region_code, ingest_instance=ingest_instance.value
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            # Only bad-request errors are turned into a response; anything
            # else propagates to the caller.
            if not e.is_bad_request():
                raise e
            logging.error(str(e))
            return str(e), HTTPStatus.BAD_REQUEST

        new_path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket,
            blob_name=relative_file_path,
        )

        # Paths that do not resolve to a file are ignored.
        if isinstance(new_path, GcsfsFilePath):
            controller.handle_file(new_path, start_ingest=start_ingest)

    return "", HTTPStatus.OK
コード例 #9
0
    def test_build_gcsfs_ingest_controller_all_regions(self) -> None:
        """Builds a controller for every region / ingest instance pair.

        Builds are done with allow_unlaunched=False and each controller must
        report the bucket path it was built from.
        """
        for region_code in get_existing_region_dir_names():
            region = get_region(region_code, is_direct_ingest=True)
            for instance in DirectIngestInstance:
                bucket_path = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=SystemLevel.for_region(region),
                    ingest_instance=instance,
                )
                built = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=False,
                )

                self.assertIsNotNone(built)
                self.assertIsInstance(built, BaseDirectIngestController)
                self.assertEqual(bucket_path, built.ingest_bucket_path)
コード例 #10
0
def ensure_all_raw_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all files in the ingest buckets for all direct ingest states have
    properly normalized file names, to ensure that repeat uploads of files into those
    buckets don't fail or overwrite data. This provides a layer of protection against
    cloud function failures.

    For each supported region a handle-new-files task is created for the
    PRIMARY ingest bucket.

    Returns:
        A (body, status) tuple: an empty body with OK once every region has
        been handled, or the error text with BAD_REQUEST as soon as one
        controller build fails with a bad-request DirectIngestError.

    Raises:
        DirectIngestError: if building a controller fails with an error that
            is not a bad request.
    """
    logging.info(
        "Received request for direct ingest ensure_all_raw_file_paths_normalized: "
        "%s",
        request.values,
    )

    for region_code in get_supported_direct_ingest_region_codes():
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        # The only type of file that wouldn't be normalized is a raw file, which
        # should only ever be in the PRIMARY bucket.
        primary_instance = DirectIngestInstance.PRIMARY
        with monitoring.push_region_tag(
            region_code, ingest_instance=primary_instance.value
        ):
            region = _region_for_region_code(region_code)
            bucket_path = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.for_region(region),
                ingest_instance=primary_instance,
            )
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=True,
                )
            except DirectIngestError as e:
                # Only bad-request errors are turned into a response; anything
                # else propagates to the caller.
                if not e.is_bad_request():
                    raise e
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            task_manager = controller.cloud_task_manager
            task_manager.create_direct_ingest_handle_new_files_task(
                controller.region,
                ingest_instance=controller.ingest_instance,
                ingest_bucket=controller.ingest_bucket_path,
                can_start_ingest=can_start_ingest,
            )
    return "", HTTPStatus.OK
コード例 #11
0
    def test_build_gcsfs_ingest_controller_all_regions_do_not_allow_launched(
        self,
    ) -> None:
        """Builds a controller for every region / ingest instance pair.

        Builds are done with allow_unlaunched=True and each controller must
        report the bucket path it was built from.
        """
        for region_code in get_existing_region_dir_names():
            region = get_region(region_code, is_direct_ingest=True)
            for instance in DirectIngestInstance:
                bucket_path = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=SystemLevel.for_region(region),
                    ingest_instance=instance,
                )
                built = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=True,
                )

                # Should still succeed for all controllers in the test environment
                self.assertIsNotNone(built)
                self.assertIsInstance(built, BaseDirectIngestController)
                self.assertEqual(bucket_path, built.ingest_bucket_path)
コード例 #12
0
 def test_build_succeeds_in_staging_region_launched_in_prod(self) -> None:
     """Checks that build() succeeds for a region whose environment is production.

     The region lookup is patched to return a fake us_xx region and the
     controller is built with allow_unlaunched=False.
     """
     prod_region = fake_region(
         region_code="us_xx",
         environment="production",
         is_direct_ingest=True,
         region_module=templates,
     )
     region_lookup = Mock(return_value=prod_region)
     with patch("recidiviz.utils.regions.get_region", region_lookup):
         bucket_path = gcsfs_direct_ingest_bucket_for_region(
             region_code=prod_region.region_code,
             system_level=SystemLevel.for_region(prod_region),
             ingest_instance=DirectIngestInstance.PRIMARY,
         )
         built = DirectIngestControllerFactory.build(
             ingest_bucket_path=bucket_path,
             allow_unlaunched=False,
         )
         self.assertIsNotNone(built)
         self.assertIsInstance(built, BaseDirectIngestController)
         self.assertEqual(bucket_path, built.ingest_bucket_path)
コード例 #13
0
    def test_region_controller_exists_and_builds(self) -> None:
        """Checks each region directory has a controller file and class.

        For every region directory, a `<region_code>_controller.py` file must
        exist and the factory must return a controller class whose
        region_code() matches the directory name.
        """
        for dir_path in self.region_dir_paths:
            region_code = os.path.basename(dir_path)
            controller_file_name = f"{region_code}_controller.py"
            controller_path = os.path.join(dir_path, controller_file_name)
            controller_file_exists = os.path.exists(controller_path)
            self.test.assertTrue(
                controller_file_exists,
                f"Path [{controller_path}] does not exist.",
            )

            region = get_region(
                region_code,
                is_direct_ingest=True,
                region_module_override=self.region_module_override,
            )
            with patch(
                "recidiviz.utils.metadata.project_id",
                return_value="recidiviz-456",
            ):
                found_class = DirectIngestControllerFactory.get_controller_class(
                    region
                )
                self.test.assertIsNotNone(found_class)
                self.test.assertEqual(region_code, found_class.region_code())
コード例 #14
0
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.

    Returns:
        A (body, status) tuple: an empty body with OK on success, or an error
        description with BAD_REQUEST when a parameter is missing or the
        controller build fails with a bad-request DirectIngestError.

    Raises:
        DirectIngestError: if building the controller fails with an error
            that is not a bad request.
    """
    logging.info(
        "Received request for direct ingest handle_new_files: %s",
        request.values,
    )
    region_code = get_str_param_value("region", request.values)
    can_start_ingest = get_bool_param_value(
        "can_start_ingest", request.values, default=False
    )
    bucket = get_str_param_value("bucket", request.values)

    has_required_params = (
        region_code and can_start_ingest is not None and bucket
    )
    if not has_required_params:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)
    ingest_instance = DirectIngestInstance.for_ingest_bucket(bucket_path)

    with monitoring.push_region_tag(
        region_code, ingest_instance=ingest_instance.value
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            # Only bad-request errors are turned into a response; anything
            # else propagates to the caller.
            if not e.is_bad_request():
                raise e
            logging.error(str(e))
            return str(e), HTTPStatus.BAD_REQUEST

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
コード例 #15
0
    def test_raw_files_yaml_parses_all_regions(self) -> None:
        """Loads the raw file config for every region and checks file tags.

        For regions that have ingest view builders or raw file configs, the
        raw file configs must be non-empty and no two configs may share a
        file_tag.
        """
        for region_code in self.region_dir_names:
            region = get_region(
                region_code,
                is_direct_ingest=True,
                region_module_override=self.region_module_override,
            )

            with patch(
                "recidiviz.utils.metadata.project_id",
                return_value="recidiviz-456",
            ):
                primary_bucket = self.primary_ingest_bucket_for_region(region)
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=primary_bucket,
                    allow_unlaunched=True,
                )

            collector = DirectIngestPreProcessedIngestViewCollector(
                region, controller.get_file_tag_rank_list()
            )
            builders = collector.collect_view_builders()

            raw_file_manager = DirectIngestRegionRawFileConfig(
                region_code=region.region_code,
                region_module=self.region_module_override,
            )

            if not (builders or raw_file_manager.raw_file_configs):
                continue

            # NOTE(review): is_ingest_launched_in_env() is used as a boolean
            # elsewhere, so this `is not None` check is presumably always
            # true -- confirm whether a plain truthiness check was intended.
            if region.is_ingest_launched_in_env() is not None:
                self.test.assertTrue(raw_file_manager.raw_file_configs)

            seen_file_tags = set()
            for config in raw_file_manager.raw_file_configs.values():
                is_new_tag = config.file_tag not in seen_file_tags
                self.test.assertTrue(
                    is_new_tag,
                    f"Multiple raw file configs defined with the same "
                    f"file_tag [{config.file_tag}]",
                )
                seen_file_tags.add(config.file_tag)
コード例 #16
0
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System to its corresponding raw data
    table in BQ.

    The import args are parsed from the request body and must agree with the
    `file_path` request parameter.

    Returns:
        A (body, status) tuple: an empty body with OK on success, or an error
        description with BAD_REQUEST when a parameter is missing or the
        controller build fails with a bad-request DirectIngestError.

    Raises:
        DirectIngestError: with INPUT_ERROR when the body args are missing,
            of the wrong type, or name a different path than the url; also
            re-raised when the controller build fails with an error that is
            not a bad request.
    """
    logging.info(
        "Received request to do direct ingest raw data import: [%s]",
        request.values,
    )
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value(
        "file_path", request.values, preserve_case=True
    )

    if not (region_code and file_path):
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)
    ingest_instance = DirectIngestInstance.for_ingest_bucket(
        gcsfs_path.bucket_path
    )

    with monitoring.push_region_tag(
        region_code, ingest_instance=ingest_instance.value
    ):
        data_import_args = _parse_cloud_task_args(
            request.get_data(as_text=True)
        )

        if not data_import_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        # The path in the url must match the path carried in the body.
        if gcsfs_path != data_import_args.raw_data_file_path:
            path_mismatch_msg = (
                f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {data_import_args.raw_data_file_path.uri()}"
            )
            raise DirectIngestError(
                msg=path_mismatch_msg,
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}
        ):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=(
                        data_import_args.raw_data_file_path.bucket_path
                    ),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                # Only bad-request errors are turned into a response;
                # anything else propagates to the caller.
                if not e.is_bad_request():
                    raise e
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK
コード例 #17
0
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.

    The ingest args are parsed from the request body and must agree with the
    `file_path` request parameter.

    Returns:
        A (body, status) tuple: an empty body with OK on success; an error
        description with BAD_REQUEST when a parameter is missing or the
        controller build fails with a bad-request DirectIngestError; or the
        error text with CONFLICT when the job raises
        GCSPseudoLockAlreadyExists.

    Raises:
        DirectIngestError: with INPUT_ERROR when the body args are missing,
            of the wrong type, or name a different path than the url; also
            re-raised when the controller build fails with an error that is
            not a bad request.
    """
    logging.info(
        "Received request to process direct ingest job: [%s]", request.values
    )
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value(
        "file_path", request.values, preserve_case=True
    )

    if not (region_code and file_path):
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)
    ingest_instance = DirectIngestInstance.for_ingest_bucket(
        gcsfs_path.bucket_path
    )

    with monitoring.push_region_tag(
        region_code, ingest_instance=ingest_instance.value
    ):
        ingest_args = _parse_cloud_task_args(request.get_data(as_text=True))

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        # The path in the url must match the path carried in the body.
        if gcsfs_path != ingest_args.file_path:
            path_mismatch_msg = (
                f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}"
            )
            raise DirectIngestError(
                msg=path_mismatch_msg,
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}
        ):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                # Only bad-request errors are turned into a response;
                # anything else propagates to the caller.
                if not e.is_bad_request():
                    raise e
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args
                )
            except GCSPseudoLockAlreadyExists as lock_error:
                logging.warning(str(lock_error))
                return str(lock_error), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
コード例 #18
0
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest bucket that is ready to be
    processed and ingested into our Recidiviz DB.

    The export args are parsed from the request body and must agree with the
    `output_bucket` request parameter.

    Returns:
        A (body, status) tuple: an empty body with OK on success, or an error
        description with BAD_REQUEST when a parameter is missing or the
        controller build fails with a bad-request DirectIngestError.

    Raises:
        DirectIngestError: with INPUT_ERROR when the body args are missing,
            of the wrong type, or name a different bucket than the url; also
            re-raised when the controller build fails with an error that is
            not a bad request.
    """
    logging.info(
        "Received request to do direct ingest view export: [%s]",
        request.values,
    )
    region_code = get_str_param_value("region", request.values)
    output_bucket_name = get_str_param_value(
        "output_bucket", request.values, preserve_case=True
    )

    if not region_code or not output_bucket_name:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(
            GcsfsBucketPath(output_bucket_name)
        ).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        # These messages previously named raw_data_import (copy-paste from
        # that endpoint); they now name the endpoint that actually failed.
        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="ingest_view_export was called with no GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=f"ingest_view_export was called with incorrect args type [{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        # The bucket in the url must match the bucket carried in the body.
        if output_bucket_name != ingest_view_export_args.output_bucket_name:
            raise DirectIngestError(
                msg=f"Different buckets were passed in the url and request body\n"
                f"url: {output_bucket_name}\n"
                f"body: {ingest_view_export_args.output_bucket_name}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
            }
        ):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=GcsfsBucketPath(
                        ingest_view_export_args.output_bucket_name
                    ),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                # Only bad-request errors are turned into a response;
                # anything else propagates to the caller.
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK