Example #1
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[str] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )
Example #2
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
        self.skipped_files: List[str] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            gcsfs_sftp_download_bucket_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
            if gcs_destination_path is None
            else gcs_destination_path
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )

        self.postgres_direct_ingest_file_metadata_manager = (
            PostgresDirectIngestRawFileMetadataManager(
                region,
                DirectIngestInstance.PRIMARY.database_version(
                    SystemLevel.STATE, state_code=StateCode(self.region.upper())
                ).name,
            )
        )
Example #3
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        delegate: UploadStateFilesToIngestBucketDelegate,
        destination_bucket_override: Optional[GcsfsBucketPath] = None,
    ):
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()
        self.delegate = delegate

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        # Raw data uploads always default to primary ingest bucket
        self.destination_ingest_bucket = (
            destination_bucket_override
            or gcsfs_direct_ingest_bucket_for_region(
                region_code=region,
                system_level=SystemLevel.STATE,
                ingest_instance=DirectIngestInstance.PRIMARY,
                project_id=self.project_id,
            ))

        self.uploaded_files: List[str] = []
        self.skipped_files: List[str] = []
        self.unable_to_upload_files: List[str] = []
Example #4
def normalize_raw_file_path() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a bucket that is configured to rename files but not
    ingest them. For example, a bucket that is being used for automatic data transfer testing.
    """
    # The bucket name for the file to normalize
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value("relative_file_path",
                                             request.args,
                                             preserve_case=True)

    if not bucket or not relative_file_path:
        return f"Bad parameters [{request.args}]", HTTPStatus.BAD_REQUEST

    path = GcsfsPath.from_bucket_and_blob_name(bucket_name=bucket,
                                               blob_name=relative_file_path)

    if not isinstance(path, GcsfsFilePath):
        raise ValueError(
            f"Incorrect type [{type(path)}] for path: {path.uri()}")

    fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    fs.mv_path_to_normalized_path(path,
                                  file_type=GcsfsDirectIngestFileType.RAW_DATA)

    return "", HTTPStatus.OK
Example #5
    def setUp(self) -> None:
        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self._INGEST_BUCKET_PATH,
            ["tagA", "tagB"],
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        )
Example #6
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level
            )
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path
        )

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level
                )
            )

        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path
        )

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path()
        )

        ingest_job_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None
        )
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter,
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example #7
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller."""
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
Example #8
    def setUp(self) -> None:
        self.project_id = "recidiviz-456"
        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(
            region_code="us_xx", are_raw_data_bq_imports_enabled_in_env=True)

        self.region_module_patcher = patch.object(
            direct_ingest_raw_table_migration_collector,
            "regions",
            new=controller_fixtures,
        )
        self.region_module_patcher.start()

        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name="direct/controllers/fixtures")
        self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
            self.mock_import_raw_file_to_big_query)

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )
        self.import_manager.csv_reader = _TestSafeGcsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
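Note the two ways this `setUp` shapes the autospec'd BigQuery client: `insert_into_table_from_cloud_storage_async` gets a `side_effect`, while `dataset_ref_for_id` is replaced outright with a real function so it returns deterministic values. The same replacement pattern in isolation, on a toy class:

from unittest.mock import create_autospec

class FakeClient:
    def dataset_ref_for_id(self, dataset_id: str) -> str:
        raise NotImplementedError

mock_client = create_autospec(FakeClient)
# Swap the autospec attribute for a deterministic fake, as in setUp above.
mock_client.dataset_ref_for_id = lambda dataset_id: f"my-project.{dataset_id}"

assert mock_client.dataset_ref_for_id("us_xx_raw_data") == "my-project.us_xx_raw_data"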
Example #9
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        gcs_destination_path: Optional[str] = None,
    ):
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.gcs_destination_path = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.uploaded_files: List[str] = []
        self.unable_to_upload_files: List[str] = []
Example #10
    def __init__(
        self,
        *,
        state_code: StateCode,
        sandbox_dataset_prefix: str,
        test_ingest_bucket: GcsfsBucketPath,
    ):

        check_is_valid_sandbox_bucket(test_ingest_bucket)

        super().__init__(
            region=get_region(state_code.value.lower(), is_direct_ingest=True),
            fs=DirectIngestGCSFileSystem(GcsfsFactory.build()),
            ingest_bucket_path=test_ingest_bucket,
            temp_output_directory_path=GcsfsDirectoryPath.from_dir_and_subdir(
                test_ingest_bucket, "temp_raw_data"
            ),
            big_query_client=BigQueryClientImpl(),
        )
        self.sandbox_dataset = (
            f"{sandbox_dataset_prefix}_{super()._raw_tables_dataset()}"
        )
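The sandbox dataset is simply the regular raw-tables dataset with the caller's prefix prepended, so sandbox runs never write into production datasets. Illustratively (the dataset name below is an assumed return value of `_raw_tables_dataset()`, not one taken from the code):

sandbox_dataset_prefix = "my_test"
raw_tables_dataset = "us_xx_raw_data"  # assumed return of _raw_tables_dataset()
sandbox_dataset = f"{sandbox_dataset_prefix}_{raw_tables_dataset}"
assert sandbox_dataset == "my_test_us_xx_raw_data"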
Example #11
    def setUp(self) -> None:
        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
Example #12
    def setUp(self) -> None:
        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self._INGEST_BUCKET_PATH,
            ["tagA", "tagB"],
            file_type_filter=None,
        )
Example #13
    def __init__(self, override_project_id: Optional[str] = None) -> None:
        self.project_id = (
            metadata.project_id()
            if override_project_id is None
            else override_project_id
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.cloud_tasks_client = tasks_v2.CloudTasksClient()
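The project-id fallback above is a small, reusable pattern: prefer the explicit override, otherwise ask the environment. Extracted as a standalone sketch (the helper name is ours, not the codebase's):

from typing import Optional

def resolve_project_id(override: Optional[str], metadata_project_id: str) -> str:
    # Prefer the explicit override; fall back to the metadata-derived id.
    return metadata_project_id if override is None else override

assert resolve_project_id(None, "recidiviz-456") == "recidiviz-456"
assert resolve_project_id("my-sandbox", "recidiviz-456") == "my-sandbox"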