Esempio n. 1
0
def controller_for_region_code(
        region_code: str,
        allow_unlaunched: bool = False) -> BaseDirectIngestController:
    """Returns an instance of the region's controller, if one exists.

    Args:
        region_code: Code of the region whose controller is requested. It must
            be one of the codes returned by
            get_supported_direct_ingest_region_codes().
        allow_unlaunched: When False, a region whose
            is_ingest_launched_in_env() is falsy is handed to
            check_is_region_launched_in_env() before the controller is built.

    Returns:
        The object returned by the region's get_ingestor(), which is verified
        to be a BaseDirectIngestController.

    Raises:
        DirectIngestError: With error type INPUT_ERROR when the region code is
            not supported, when looking up the region raises
            FileNotFoundError, or when the ingestor has an unexpected type.
    """
    if region_code not in get_supported_direct_ingest_region_codes():
        raise DirectIngestError(
            msg=
            f"Unsupported direct ingest region [{region_code}] in project [{metadata.project_id()}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    try:
        region = regions.get_region(region_code, is_direct_ingest=True)
    except FileNotFoundError as e:
        # Chain the original error explicitly so the missing file stays visible
        # as the cause in tracebacks.
        raise DirectIngestError(
            msg=f"Region [{region_code}] has no registered manifest",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        ) from e

    if not allow_unlaunched and not region.is_ingest_launched_in_env():
        # NOTE(review): presumably this raises for unlaunched regions - its
        # implementation is not visible here.
        check_is_region_launched_in_env(region)

    controller = region.get_ingestor()

    if not isinstance(controller, BaseDirectIngestController):
        raise DirectIngestError(
            msg=
            f"Controller for direct ingest region [{region_code}] has unexpected type [{type(controller)}]",
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )

    return controller
Esempio n. 2
0
    def build(
        cls, *, ingest_bucket_path: GcsfsBucketPath, allow_unlaunched: bool
    ) -> BaseDirectIngestController:
        """Builds the direct ingest controller that serves the given ingest bucket.

        Args:
            ingest_bucket_path: Bucket whose name is used to derive the region
                code; it is also passed to the controller's constructor.
            allow_unlaunched: When False, a region whose
                is_ingest_launched_in_env() is falsy is handed to
                check_is_region_launched_in_env() before the controller is built.

        Returns:
            An instance of the region's direct ingest controller class (e.g.,
            UsNdController), constructed for the input bucket.

        Raises:
            DirectIngestError: With error type INPUT_ERROR when no region code
                can be derived from the bucket name or the code is unsupported.
            ValueError: When the constructed controller is not a
                BaseDirectIngestController.
        """
        bucket_name = ingest_bucket_path.bucket_name
        region_code = get_region_code_from_direct_ingest_bucket(bucket_name)

        is_supported = (
            region_code is not None
            and region_code in get_supported_direct_ingest_region_codes()
        )
        if not is_supported:
            raise DirectIngestError(
                msg=f"Unsupported direct ingest region [{region_code}] in "
                f"project [{metadata.project_id()}]",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        region = cls._region_for_bucket(ingest_bucket_path)
        # The launch status is only consulted when unlaunched regions are not
        # explicitly allowed.
        if not (allow_unlaunched or region.is_ingest_launched_in_env()):
            check_is_region_launched_in_env(region)

        controller = cls.get_controller_class(region)(
            ingest_bucket_path=ingest_bucket_path
        )
        if isinstance(controller, BaseDirectIngestController):
            return controller

        raise ValueError(f"Unexpected controller class type [{type(controller)}]")
Esempio n. 3
0
def ensure_all_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Schedules a handle-new-files task for every supported direct ingest region.

    For each supported region code, the region's controller is fetched (unlaunched
    regions are allowed). Controllers that are not GcsfsDirectIngestController
    instances are skipped. For the rest, a task is created through the
    controller's cloud task manager, with can_start_ingest taken from the
    region's is_ingest_launched_in_env().

    Returns:
        An empty body and HTTPStatus.OK once every region has been processed.

    Raises:
        DirectIngestError: Propagated unchanged from controller_for_region_code,
            or raised with INPUT_ERROR when the controller has an unexpected type.
    """
    logging.info(
        'Received request for direct ingest ensure_all_file_paths_normalized: '
        '%s', request.values)

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        with monitoring.push_region_tag(region_code):
            # A DirectIngestError raised here propagates to the caller as-is; no
            # catch-and-re-raise wrapper is needed for that.
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR)

            # Only GCSFS-based controllers get a handle-new-files task.
            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.\
                create_direct_ingest_handle_new_files_task(
                    controller.region, can_start_ingest=can_start_ingest)
    return '', HTTPStatus.OK
def ensure_all_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all file paths in the ingest buckets for all direct ingest states have properly normalized
    file names, to ensure that repeat uploads of files into those buckets don't fail or overwrite data.

    For each supported region code, the region's controller is fetched (unlaunched
    regions are allowed). Controllers that are not GcsfsDirectIngestController
    instances are skipped. For the rest, a handle-new-files task is created
    through the controller's cloud task manager, with can_start_ingest taken
    from the region's is_ingest_launched_in_env().

    Returns:
        An empty body and HTTPStatus.OK once every region has been processed.

    Raises:
        DirectIngestError: Propagated unchanged from controller_for_region_code,
            or raised with INPUT_ERROR when the controller has an unexpected type.
    """
    logging.info(
        "Received request for direct ingest ensure_all_file_paths_normalized: "
        "%s",
        request.values,
    )

    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        with monitoring.push_region_tag(region_code):
            # A DirectIngestError raised here propagates to the caller as-is; no
            # catch-and-re-raise wrapper is needed for that.
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            # Only GCSFS-based controllers get a handle-new-files task.
            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            can_start_ingest = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region, can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
Esempio n. 5
0
def kick_all_schedulers() -> None:
    """Kicks the ingest scheduler of every launched region to restart ingest.

    Regions whose is_ingest_launched_in_env() is falsy are skipped. For each
    remaining region, every DirectIngestInstance is tried; instances that fail
    check_is_valid_system_level() for the region's system level are skipped.
    For the rest, a controller is built for the matching ingest bucket and its
    scheduler is kicked with just_finished_job=False.
    """
    for region_code in get_supported_direct_ingest_region_codes():
        region = _region_for_region_code(region_code=region_code)
        if not region.is_ingest_launched_in_env():
            continue

        level = SystemLevel.for_region(region)
        for instance in DirectIngestInstance:
            with monitoring.push_region_tag(
                region_code, ingest_instance=instance.value
            ):
                try:
                    instance.check_is_valid_system_level(level)
                except DirectIngestInstanceError:
                    # This instance does not apply to the region's system level.
                    continue

                bucket_path = gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=level,
                    ingest_instance=instance,
                )
                DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=False,
                ).kick_scheduler(just_finished_job=False)
Esempio n. 6
0
def ensure_all_raw_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Ensures that all files in the ingest buckets for all direct ingest states have
    properly normalized file names, so that repeat uploads of files into those
    buckets don't fail or overwrite data. This provides a layer of protection
    against cloud function failures.

    For every supported region, a controller is built for the region's PRIMARY
    ingest bucket (unlaunched regions are allowed) and a handle-new-files task is
    created through its cloud task manager.

    Returns:
        An empty body with HTTPStatus.OK when all regions were processed, or the
        error text with HTTPStatus.BAD_REQUEST as soon as building a controller
        fails with a DirectIngestError for which is_bad_request() is truthy.

    Raises:
        DirectIngestError: Re-raised when building a controller fails with an
            error that is not a bad request.
    """
    logging.info(
        "Received request for direct ingest ensure_all_raw_file_paths_normalized: "
        "%s",
        request.values,
    )

    for region_code in get_supported_direct_ingest_region_codes():
        logging.info("Ensuring paths normalized for region [%s]", region_code)
        # The only type of file that wouldn't be normalized is a raw file, which
        # should only ever be in the PRIMARY bucket.
        primary = DirectIngestInstance.PRIMARY
        with monitoring.push_region_tag(region_code, ingest_instance=primary.value):
            region = _region_for_region_code(region_code)
            level = SystemLevel.for_region(region)
            bucket_path = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=level,
                ingest_instance=primary,
            )
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket_path,
                    allow_unlaunched=True,
                )
            except DirectIngestError as error:
                if not error.is_bad_request():
                    raise
                # Bad requests are reported to the caller instead of crashing.
                logging.error(str(error))
                return str(error), HTTPStatus.BAD_REQUEST

            task_manager = controller.cloud_task_manager
            task_manager.create_direct_ingest_handle_new_files_task(
                controller.region,
                ingest_instance=controller.ingest_instance,
                ingest_bucket=controller.ingest_bucket_path,
                can_start_ingest=controller.region.is_ingest_launched_in_env(),
            )

    return "", HTTPStatus.OK
def kick_all_schedulers() -> None:
    """Kicks all ingest schedulers to restart ingest.

    Regions whose is_ingest_launched_in_env() is falsy are skipped, as are
    regions whose controller is not a GcsfsDirectIngestController. For the rest,
    the controller's scheduler is kicked with just_finished_job=False.

    Raises:
        DirectIngestError: Propagated unchanged from controller_for_region_code,
            or raised with INPUT_ERROR when the controller has an unexpected type.
    """
    supported_regions = get_supported_direct_ingest_region_codes()
    for region_code in supported_regions:
        with monitoring.push_region_tag(region_code):
            region = region_for_region_code(region_code=region_code)
            if not region.is_ingest_launched_in_env():
                continue
            # A DirectIngestError raised here propagates to the caller as-is; no
            # catch-and-re-raise wrapper is needed for that.
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=False)
            if not isinstance(controller, BaseDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            # Only GCSFS-based controllers have their scheduler kicked here.
            if not isinstance(controller, GcsfsDirectIngestController):
                continue

            controller.kick_scheduler(just_finished_job=False)
Esempio n. 8
0
                 os.path.relpath(
                     state_versions.__file__[: -len("__init__.py")]
                 ),  # versions
             }
         )
     )
 ],
 # ingest docs
 INGEST_DOCS_KEY: [
     RequiredModificationSets(
         if_modified_files=frozenset(
             {f"recidiviz/ingest/direct/regions/{region_code}/"}
         ),
         then_modified_files=frozenset({f"docs/ingest/{region_code}/"}),
     )
     for region_code in get_supported_direct_ingest_region_codes()
 ],
 # case triage dummy data
 CASE_TRIAGE_FIXTURES_KEY: [
     RequiredModificationSets(
         if_modified_files=frozenset(
             {"recidiviz/tools/case_triage/fixtures/etl_clients.csv"}
         ),
         then_modified_files=frozenset(
             {"recidiviz/case_triage/fixtures/dummy_clients.json"}
         ),
     )
 ],
 NORMALIZED_SQL_PREPROCESSING_ROWS_KEY: [
     RequiredModificationSets.for_symmetric_check(
         frozenset(