def primary_ingest_bucket_for_region(self,
                                     region: Region) -> GcsfsBucketPath:
    """Returns the direct ingest bucket for |region| in the PRIMARY instance.

    The region code is read from the region object and the system level is
    derived from it via SystemLevel.for_region().
    """
    code = region.region_code
    level = SystemLevel.for_region(region)
    return gcsfs_direct_ingest_bucket_for_region(
        region_code=code,
        system_level=level,
        ingest_instance=DirectIngestInstance.PRIMARY,
    )
Example #2
0
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        delegate: UploadStateFilesToIngestBucketDelegate,
        destination_bucket_override: Optional[GcsfsBucketPath] = None,
    ):
        """Sets up the state needed to upload files to an ingest bucket.

        Args:
            paths_with_timestamps: (path, timestamp) pairs for the files to
                upload.
            project_id: Project whose ingest bucket is the default
                destination.
            region: Region code; stored lowercased on self.region.
            delegate: Delegate object stored on self.delegate.
            destination_bucket_override: When truthy, used as the destination
                instead of the default PRIMARY state ingest bucket.
        """
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()
        self.delegate = delegate

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        # Raw data uploads always default to primary ingest bucket. The
        # normalized (lowercased) region code is passed so that the bucket
        # name does not depend on the casing the caller happened to use.
        self.destination_ingest_bucket = (
            destination_bucket_override
            or gcsfs_direct_ingest_bucket_for_region(
                region_code=self.region,
                system_level=SystemLevel.STATE,
                ingest_instance=DirectIngestInstance.PRIMARY,
                project_id=self.project_id,
            ))

        # Bookkeeping lists, all empty at construction time.
        self.uploaded_files: List[str] = []
        self.skipped_files: List[str] = []
        self.unable_to_upload_files: List[str] = []
Example #3
0
def kick_all_schedulers() -> None:
    """Kicks the scheduler of every launched region / ingest instance pair.

    Regions whose ingest is not launched in this environment are skipped, as
    are ingest instances that are not valid for the region's system level.
    """
    for region_code in get_supported_direct_ingest_region_codes():
        region = _region_for_region_code(region_code=region_code)
        if region.is_ingest_launched_in_env():
            level = SystemLevel.for_region(region)
            for instance in DirectIngestInstance:
                with monitoring.push_region_tag(
                        region_code, ingest_instance=instance.value):
                    try:
                        instance.check_is_valid_system_level(level)
                    except DirectIngestInstanceError:
                        # This instance does not apply to the system level.
                        continue

                    scheduler_controller = DirectIngestControllerFactory.build(
                        ingest_bucket_path=gcsfs_direct_ingest_bucket_for_region(
                            region_code=region_code,
                            system_level=level,
                            ingest_instance=instance,
                        ),
                        allow_unlaunched=False,
                    )
                    scheduler_controller.kick_scheduler(
                        just_finished_job=False)
 def test_build_throws_in_prod_region_only_launched_in_staging(
         self) -> None:
     """Building a controller for a staging-only region raises.

     The region is faked with environment="staging"; the expected error
     message names "production", which presumably comes from test fixtures
     outside this method.
     """
     staging_only_region = fake_region(
         region_code="us_xx",
         environment="staging",
         is_direct_ingest=True,
         region_module=templates,
     )
     get_region_patch = patch(
         "recidiviz.utils.regions.get_region",
         Mock(return_value=staging_only_region),
     )
     with get_region_patch:
         bucket = gcsfs_direct_ingest_bucket_for_region(
             region_code=staging_only_region.region_code,
             system_level=SystemLevel.for_region(staging_only_region),
             ingest_instance=DirectIngestInstance.PRIMARY,
         )
         with self.assertRaises(DirectIngestError) as raised:
             DirectIngestControllerFactory.build(ingest_bucket_path=bucket,
                                                 allow_unlaunched=False)
         actual_message = str(raised.exception)
         self.assertEqual(
             actual_message,
             "Bad environment [production] for region [us_xx].",
         )
Example #5
0
 def test_get_county_ingest_bucket_path_for_region(self) -> None:
     """Checks the PRIMARY county bucket path built for us_tx_brazos."""
     county_bucket = gcsfs_direct_ingest_bucket_for_region(
         region_code="us_tx_brazos",
         system_level=SystemLevel.COUNTY,
         ingest_instance=DirectIngestInstance.PRIMARY,
     )
     actual_path = county_bucket.abs_path()
     self.assertEqual(
         actual_path,
         "recidiviz-123-direct-ingest-county-us-tx-brazos",
     )
Example #6
0
 def test_get_state_ingest_bucket_path_for_region_secondary(self) -> None:
     """Checks the SECONDARY state bucket path built for us_nd."""
     secondary_bucket = gcsfs_direct_ingest_bucket_for_region(
         region_code="us_nd",
         system_level=SystemLevel.STATE,
         ingest_instance=DirectIngestInstance.SECONDARY,
     )
     actual_path = secondary_bucket.abs_path()
     self.assertEqual(
         actual_path,
         "recidiviz-staging-direct-ingest-state-us-nd-secondary",
     )
    def test_from_county_ingest_bucket(self) -> None:
        """A PRIMARY county bucket maps back to the PRIMARY instance."""
        county_bucket = gcsfs_direct_ingest_bucket_for_region(
            region_code="us_xx_yyyyy",
            system_level=SystemLevel.COUNTY,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id="recidiviz-456",
        )
        resolved_instance = DirectIngestInstance.for_ingest_bucket(
            county_bucket)

        self.assertEqual(DirectIngestInstance.PRIMARY, resolved_instance)
 def setUp(self) -> None:
     """Stores a processed ingest view file path on the test instance.

     The path lives in the PRIMARY state bucket of project "recidiviz-456"
     for the module-level _REGION.
     """
     primary_bucket = gcsfs_direct_ingest_bucket_for_region(
         project_id="recidiviz-456",
         region_code=_REGION.region_code,
         system_level=SystemLevel.STATE,
         ingest_instance=DirectIngestInstance.PRIMARY,
     )
     processed_file_name = to_normalized_processed_file_name(
         "file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
     self.ingest_view_file_path = GcsfsFilePath.from_directory_and_file_name(
         primary_bucket, processed_file_name)
 def test_build_for_unsupported_region_throws(self) -> None:
     """Building a controller for region us_xx raises DirectIngestError."""
     bucket = gcsfs_direct_ingest_bucket_for_region(
         region_code="us_xx",
         system_level=SystemLevel.STATE,
         ingest_instance=DirectIngestInstance.PRIMARY,
     )
     with self.assertRaises(DirectIngestError) as raised:
         DirectIngestControllerFactory.build(ingest_bucket_path=bucket,
                                             allow_unlaunched=False)
     actual_message = str(raised.exception)
     self.assertEqual(
         actual_message,
         "Unsupported direct ingest region [us_xx] in project [recidiviz-456]",
     )
    def test_from_state_ingest_bucket(self) -> None:
        """State buckets map back to the instance they were built for.

        Checked for PRIMARY first and then SECONDARY.
        """
        instances_in_order = (
            DirectIngestInstance.PRIMARY,
            DirectIngestInstance.SECONDARY,
        )
        for expected_instance in instances_in_order:
            state_bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code="us_xx",
                system_level=SystemLevel.STATE,
                ingest_instance=expected_instance,
                project_id="recidiviz-456",
            )

            self.assertEqual(
                expected_instance,
                DirectIngestInstance.for_ingest_bucket(state_bucket),
            )
    def test_build_gcsfs_ingest_controller_all_regions(self) -> None:
        """Builds a controller for every region dir and ingest instance.

        Each controller is built with allow_unlaunched=False and must be a
        BaseDirectIngestController bound to the bucket it was built from.
        """
        for code in get_existing_region_dir_names():
            existing_region = get_region(code, is_direct_ingest=True)
            for instance in DirectIngestInstance:
                bucket = gcsfs_direct_ingest_bucket_for_region(
                    region_code=code,
                    system_level=SystemLevel.for_region(existing_region),
                    ingest_instance=instance,
                )
                built = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket, allow_unlaunched=False)

                self.assertIsNotNone(built)
                self.assertIsInstance(built, BaseDirectIngestController)
                self.assertEqual(bucket, built.ingest_bucket_path)
Example #12
0
def ensure_all_raw_file_paths_normalized() -> Tuple[str, HTTPStatus]:
    """Queues a handle-new-files task for every supported direct ingest region.

    The intent is that all files in the ingest buckets end up with properly
    normalized file names, so that repeat uploads into those buckets don't fail
    or overwrite data. This is a layer of protection against cloud function
    failures.

    Returns an empty body with HTTPStatus.OK once every region has been
    handled. If building a controller fails with a bad-request
    DirectIngestError, returns the error text with HTTPStatus.BAD_REQUEST
    immediately; any other DirectIngestError is re-raised.
    """
    logging.info(
        "Received request for direct ingest ensure_all_raw_file_paths_normalized: "
        "%s",
        request.values,
    )

    # The only type of file that wouldn't be normalized is a raw file, which
    # should only ever be in the PRIMARY bucket.
    primary_instance = DirectIngestInstance.PRIMARY

    for code in get_supported_direct_ingest_region_codes():
        logging.info("Ensuring paths normalized for region [%s]", code)
        with monitoring.push_region_tag(
                code, ingest_instance=primary_instance.value):
            level = SystemLevel.for_region(_region_for_region_code(code))
            bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code=code,
                system_level=level,
                ingest_instance=primary_instance,
            )
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket,
                    allow_unlaunched=True,
                )
            except DirectIngestError as error:
                if not error.is_bad_request():
                    raise error
                message = str(error)
                logging.error(message)
                return message, HTTPStatus.BAD_REQUEST

            # Ingest itself may only start where the region is launched.
            launched = controller.region.is_ingest_launched_in_env()
            controller.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                controller.region,
                ingest_instance=controller.ingest_instance,
                ingest_bucket=controller.ingest_bucket_path,
                can_start_ingest=launched,
            )
    return "", HTTPStatus.OK
    def test_build_gcsfs_ingest_controller_all_regions_do_not_allow_launched(
            self) -> None:
        """Builds a controller for every region dir and ingest instance.

        Same checks as the all-regions build test, but each controller is
        built with allow_unlaunched=True.
        """
        for code in get_existing_region_dir_names():
            existing_region = get_region(code, is_direct_ingest=True)
            for instance in DirectIngestInstance:
                bucket = gcsfs_direct_ingest_bucket_for_region(
                    region_code=code,
                    system_level=SystemLevel.for_region(existing_region),
                    ingest_instance=instance,
                )
                built = DirectIngestControllerFactory.build(
                    ingest_bucket_path=bucket, allow_unlaunched=True)

                # Should still succeed for all controllers in the test
                # environment
                self.assertIsNotNone(built)
                self.assertIsInstance(built, BaseDirectIngestController)
                self.assertEqual(bucket, built.ingest_bucket_path)
Example #14
0
    def __init__(
        self,
        project_id: str,
        region: str,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):
        """Stores the move parameters and resolves the buckets involved.

        Both the storage directory and the ingest bucket are resolved for the
        STATE system level and the PRIMARY ingest instance of |project_id|.
        The log output path is placed next to this source file and embeds the
        region, project, date bounds, dry-run flag and the current timestamp.
        """
        # Parameters supplied by the caller.
        self.project_id = project_id
        self.region = region
        self.state_code = StateCode(region.upper())
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        # Raw files are only ever stored in the PRIMARY storage bucket
        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=self.project_id,
        )
        # Raw files are only ever processed in the PRIMARY ingest bucket
        self.ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
            region_code=region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=self.project_id,
        )

        # Progress tracking state.
        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []

        script_dir = os.path.dirname(__file__)
        log_file_name = (
            f"move_result_{region}_{self.project_id}"
            f"_start_bound_{self.start_date_bound}"
            f"_end_bound_{self.end_date_bound}"
            f"_dry_run_{self.dry_run}"
            f"_{datetime.datetime.now().isoformat()}.txt")
        self.log_output_path = os.path.join(script_dir, log_file_name)
 def test_build_succeeds_in_staging_region_launched_in_prod(self) -> None:
     """Building a controller succeeds for a region faked as production."""
     production_region = fake_region(
         region_code="us_xx",
         environment="production",
         is_direct_ingest=True,
         region_module=templates,
     )
     get_region_patch = patch(
         "recidiviz.utils.regions.get_region",
         Mock(return_value=production_region),
     )
     with get_region_patch:
         bucket = gcsfs_direct_ingest_bucket_for_region(
             region_code=production_region.region_code,
             system_level=SystemLevel.for_region(production_region),
             ingest_instance=DirectIngestInstance.PRIMARY,
         )
         built = DirectIngestControllerFactory.build(
             ingest_bucket_path=bucket, allow_unlaunched=False)
         self.assertIsNotNone(built)
         self.assertIsInstance(built, BaseDirectIngestController)
         self.assertEqual(bucket, built.ingest_bucket_path)
    def start_ingest_run(self, state_code: StateCode,
                         instance_str: str) -> None:
        """Creates a handle-new-files cloud task for a state and instance.

        This function is called through the Ingest Operations UI in the admin
        panel.

        Requires:
        - state_code: (required) State code to start ingest for (i.e. "US_ID")
        - instance_str: (required) Name of the instance to start ingest for
          (either PRIMARY or SECONDARY)

        Raises ValueError if |instance_str| does not name a
        DirectIngestInstance member.
        """
        try:
            ingest_instance = DirectIngestInstance[instance_str]
        except KeyError as key_error:
            logging.error("Received an invalid instance: %s.", instance_str)
            raise ValueError(
                f"Invalid instance [{instance_str}] received") from key_error

        # Ingest may only start for states launched in this environment.
        is_launched = state_code in self.state_codes_launched_in_env

        region_code = state_code.value.lower()
        region = get_region(region_code, is_direct_ingest=True)

        # Get the ingest bucket for this region and instance
        bucket = gcsfs_direct_ingest_bucket_for_region(
            region_code=region_code,
            system_level=SystemLevel.for_region(region),
            ingest_instance=ingest_instance,
            project_id=self.project_id,
        )

        logging.info(
            "Creating cloud task to schedule next job and kick ingest for %s instance in %s.",
            ingest_instance,
            region_code,
        )
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=region,
            ingest_instance=ingest_instance,
            ingest_bucket=bucket,
            can_start_ingest=is_launched,
        )
def build_gcsfs_controller_for_tests(
    controller_cls: Type[CsvGcsfsDirectIngestController],
    ingest_instance: DirectIngestInstance,
    run_async: bool,
    can_start_ingest: bool = True,
    regions_module: ModuleType = fake_regions_module,
) -> BaseDirectIngestController:
    """Builds a |controller_cls| instance for tests with collaborators faked.

    While the controller is constructed, the cloud task manager, BigQuery
    client, raw file import manager and ingest view collector referenced by
    the base controller module are patched, as are GcsfsFactory.build and the
    regions module used by the raw table migration collector. The returned
    controller reads CSVs from a fake file system and is registered with both
    the fake task manager and the fake file system delegate.
    """
    fake_fs = FakeGCSFileSystem()

    def mock_build_fs() -> FakeGCSFileSystem:
        # Every GcsfsFactory.build() call gets the one shared fake.
        return fake_fs

    # Controllers named like the test controller use the fake view collector.
    uses_fake_views = "TestGcsfsDirectIngestController" in controller_cls.__name__
    view_collector_cls: Type[BigQueryViewCollector] = (
        FakeDirectIngestPreProcessedIngestViewCollector
        if uses_fake_views else DirectIngestPreProcessedIngestViewCollector)

    controller_module = BaseDirectIngestController.__module__

    with patch(f"{controller_module}.DirectIngestCloudTaskManagerImpl"
               ) as mock_task_factory_cls, \
            patch(f"{controller_module}.BigQueryClientImpl"
                  ) as mock_big_query_client_cls, \
            patch(f"{controller_module}.DirectIngestRawFileImportManager",
                  FakeDirectIngestRawFileImportManager), \
            patch(f"{controller_module}.DirectIngestPreProcessedIngestViewCollector",
                  view_collector_cls):
        task_manager = (FakeAsyncDirectIngestCloudTaskManager() if run_async
                        else FakeSynchronousDirectIngestCloudTaskManager())
        mock_task_factory_cls.return_value = task_manager
        mock_big_query_client_cls.return_value = FakeDirectIngestBigQueryClient(
            project_id=metadata.project_id(),
            fs=fake_fs,
            region_code=controller_cls.region_code(),
        )

        with patch.object(GcsfsFactory, "build", new=mock_build_fs), \
                patch.object(direct_ingest_raw_table_migration_collector,
                             "regions",
                             new=regions_module):
            ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code=controller_cls.region_code(),
                system_level=SystemLevel.for_region_code(
                    controller_cls.region_code(),
                    is_direct_ingest=True,
                ),
                ingest_instance=ingest_instance,
                project_id="recidiviz-xxx",
            )
            controller = controller_cls(ingest_bucket_path=ingest_bucket)

            # The controller and its raw file import manager share one reader.
            controller.csv_reader = GcsfsCsvReader(fake_fs)
            controller.raw_file_import_manager.csv_reader = (
                controller.csv_reader)

            task_manager.set_controller(controller)
            fake_fs.test_set_delegate(
                DirectIngestFakeGCSFileSystemDelegate(
                    controller, can_start_ingest=can_start_ingest))
            return controller
    def get_ingest_instance_summaries(
            self, state_code: StateCode) -> List[Dict[str, Any]]:
        """Returns one summary dictionary per direct ingest instance.

        Each dictionary has this shape (the contents of "ingest" and
        "operations" are produced by helper methods defined elsewhere; the
        keys listed for them are as described by the original author):
        {
            instance: the direct ingest instance,
            dbName: database name for this instance,
            storage: storage bucket absolute path,
            ingest: {
                name: bucket_name,
                unprocessedFilesRaw: how many unprocessed raw data files in the bucket,
                processedFilesRaw: how many processed raw data files are in the bucket (should be zero),
                unprocessedFilesIngestView: how many unprocessed ingest view files in the bucket,
                processedFilesIngestView: how many processed ingest view files are in the bucket (should be zero),
            },
            operations: {
                unprocessedFilesRaw: number of unprocessed raw files in the operations database
                unprocessedFilesIngestView: number of unprocessed ingest view files in the operations database
                dateOfEarliestUnprocessedIngestView: date of earliest unprocessed ingest file, if it exists
            }
        }
        """
        region_code = state_code.value.lower()

        summaries: List[Dict[str, Any]] = []
        for ingest_instance in DirectIngestInstance:
            # Information about this instance's ingest bucket.
            bucket_metadata = self._get_bucket_metadata(
                gcsfs_direct_ingest_bucket_for_region(
                    region_code=region_code,
                    system_level=SystemLevel.STATE,
                    ingest_instance=ingest_instance,
                    project_id=self.project_id,
                ))

            # Storage location for this instance.
            storage_path = gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=region_code,
                system_level=SystemLevel.STATE,
                ingest_instance=ingest_instance,
                project_id=self.project_id,
            )

            # Database name, and the operations metadata looked up with it.
            db_name = self._get_database_name_for_state(
                state_code, ingest_instance)
            operations_metadata = self._get_operations_db_metadata(
                state_code, db_name)

            summaries.append({
                "instance": ingest_instance.value,
                "storage": storage_path.abs_path(),
                "ingest": bucket_metadata,
                "dbName": db_name,
                "operations": operations_metadata,
            })

        return summaries