Exemple #1
0
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[str] = None,
    ):
        """Builds the SFTP auth, download delegate and GCS destination paths.

        Args:
            project_id: Project id forwarded to the ingest directory path
                lookup when no explicit destination is supplied.
            region: Region code. Stored lower-cased on `self.region`, but
                passed as given to the auth, delegate and path helpers.
            lower_bound_update_datetime: Stored unchanged on the instance.
            gcs_destination_path: Absolute GCS path to use as the bucket; when
                None, the region's STATE ingest directory path is used.
        """
        self.project_id = project_id
        self.region = region.lower()
        self.lower_bound_update_datetime = lower_bound_update_datetime

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        # Bookkeeping lists, both start out empty.
        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []

        # Only look up the region's default path when no override was given.
        if gcs_destination_path is None:
            destination = gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        else:
            destination = gcs_destination_path
        self.bucket = GcsfsDirectoryPath.from_absolute_path(destination)

        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )
 def __init__(
     self,
     region_code: str,
     dry_run: bool,
 ):
     """Resolves the ingest/storage directories and the log path for a move.

     Args:
         region_code: Region code used for the path lookups and embedded in
             the log file name.
         dry_run: Stored on the instance and embedded in the log file name.
     """
     self.region_code = region_code
     self.dry_run = dry_run
     self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
     # The project id is fixed for this class rather than passed in.
     self.project_id = 'recidiviz-123'

     ingest_dir = gcsfs_direct_ingest_directory_path_for_region(
         region_code, SystemLevel.STATE, project_id=self.project_id)
     self.region_ingest_bucket_dir_path = \
         GcsfsDirectoryPath.from_absolute_path(ingest_dir)

     storage_raw_dir = gcsfs_direct_ingest_storage_directory_path_for_region(
         region_code,
         SystemLevel.STATE,
         GcsfsDirectIngestFileType.RAW_DATA,
         project_id=self.project_id)
     self.region_storage_raw_dir_path = \
         GcsfsDirectoryPath.from_absolute_path(storage_raw_dir)

     # The log file sits next to this module; the name carries the region,
     # the dry-run flag and the current timestamp.
     log_file_name = (
         f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
         f'{datetime.datetime.now().isoformat()}.txt')
     self.log_output_path = os.path.join(os.path.dirname(__file__), log_file_name)

     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        """Sets up the file system, directory paths and file prioritizer.

        Args:
            region_name: Forwarded to the parent constructor and used for the
                default path lookups.
            system_level: Forwarded to the parent constructor and used for the
                default path lookups.
            ingest_directory_path: Absolute ingest path; when missing or empty
                the region's default ingest directory path is used.
            storage_directory_path: Absolute storage path; when missing or
                empty the region's default storage directory path is used.
            max_delay_sec_between_files: Stored unchanged on the instance.
        """
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        # `or` only evaluates the default lookup when the argument is falsy.
        ingest_path_str = (
            ingest_directory_path
            or gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                             system_level))
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_path_str)

        storage_path_str = (
            storage_directory_path
            or gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level))
        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_path_str)

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self._get_file_tag_rank_list())

        self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None):
        """Sets up the file system, directory paths and file prioritizer.

        The directory paths are stored exactly as given or as returned by the
        default lookups; they are not wrapped in another type here.

        Args:
            region_name: Forwarded to the parent constructor and used for the
                default path lookups.
            system_level: Forwarded to the parent constructor and used for the
                default path lookups.
            ingest_directory_path: Ingest path override; when missing or empty
                the region's default ingest directory path is used.
            storage_directory_path: Storage path override; when missing or
                empty the region's default storage directory path is used.
        """
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()

        # `or` only evaluates the default lookup when the argument is falsy.
        self.ingest_directory_path = (
            ingest_directory_path
            or gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                             system_level))
        self.storage_directory_path = (
            storage_directory_path
            or gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level))

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self._get_file_tag_rank_list())
    def __init__(self,
                 project_id: str,
                 region: str,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str],
                 dry_run: bool,
                 file_filter: Optional[str]):
        """Stores the move parameters and resolves the buckets and log path.

        Args:
            project_id: Project id forwarded to the bucket path lookups and
                embedded in the log file name.
            region: Region code forwarded to the bucket path lookups and
                embedded in the log file name.
            start_date_bound: Stored unchanged; embedded in the log file name.
            end_date_bound: Stored unchanged; embedded in the log file name.
            dry_run: Stored unchanged; embedded in the log file name.
            file_filter: Stored unchanged on the instance.
        """
        self.project_id = project_id
        self.region = region
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        # Both buckets are kept as whatever the path helpers return.
        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)
        self.ingest_bucket = gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []

        # The log file sits next to this module; the name carries the run
        # parameters and the current timestamp.
        log_file_name = (
            f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.log_output_path = os.path.join(os.path.dirname(__file__), log_file_name)
Exemple #6
0
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        """Wires up the file system, directory paths and ingest managers.

        Args:
            region_name: Forwarded to the parent constructor and used for the
                default path lookups.
            system_level: Forwarded to the parent constructor and used for the
                default path lookups.
            ingest_directory_path: Absolute ingest path; when missing or empty
                the region's default ingest directory path is used.
            storage_directory_path: Absolute storage path; when missing or
                empty the region's default storage directory path is used.
            max_delay_sec_between_files: Stored unchanged on the instance.
        """
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        # `or` only evaluates the default lookup when the argument is falsy.
        ingest_path_str = (
            ingest_directory_path
            or gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                             system_level))
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_path_str)

        storage_path_str = (
            storage_directory_path
            or gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level))
        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_path_str)

        temp_output_path_str = gcsfs_direct_ingest_temporary_output_directory_path()
        self.temp_output_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(temp_output_path_str)

        # The prioritizer gets the INGEST_VIEW filter only when the region
        # reports that raw-vs-ingest file name detection is enabled.
        ingest_job_file_type_filter: Optional[GcsfsDirectIngestFileType]
        if self.region.is_raw_vs_ingest_file_name_detection_enabled():
            ingest_job_file_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW
        else:
            ingest_job_file_type_filter = None

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        # The export manager gets its own BigQuery client and a view collector
        # built from a fresh file tag rank list.
        export_big_query_client = BigQueryClientImpl()
        export_view_collector = DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list())
        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=export_big_query_client,
            view_collector=export_view_collector)
Exemple #7
0
    def __init__(
        self,
        project_id: str,
        region: str,
        file_type_to_move: GcsfsDirectIngestFileType,
        destination_file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):
        """Validates the file types and resolves the buckets and log path.

        Args:
            project_id: Project id forwarded to the bucket path lookups and
                embedded in the log file name.
            region: Region code forwarded to the bucket path lookups and
                embedded in the log file name.
            file_type_to_move: File type of the files being moved.
            destination_file_type: File type the moved files should have. Must
                equal `file_type_to_move` unless that type is UNSPECIFIED.
            start_date_bound: Stored unchanged; embedded in the log file name.
            end_date_bound: Stored unchanged; embedded in the log file name.
            dry_run: Stored unchanged; embedded in the log file name.
            file_filter: Stored unchanged on the instance.

        Raises:
            ValueError: If the two file types differ and `file_type_to_move`
                is not UNSPECIFIED.
        """

        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        # A type change is only accepted when the source type is UNSPECIFIED;
        # any other source type must keep its type. The message below
        # describes exactly the condition that triggers the error.
        if (
            self.file_type_to_move != self.destination_file_type
            and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
        ):
            raise ValueError(
                "Args file_type_to_move and destination_file_type must match "
                "if type to move is not UNSPECIFIED"
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        # The log file sits next to this module; the name carries the run
        # parameters and the current timestamp.
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
Exemple #8
0
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        gcs_destination_path: Optional[str] = None,
    ):
        """Stores the upload inputs and resolves the GCS destination path.

        Args:
            paths_with_timestamps: (path, timestamp) pairs, stored unchanged.
            project_id: Project id forwarded to the ingest directory path
                lookup when no explicit destination is supplied.
            region: Region code. Stored lower-cased on `self.region`, but
                passed as given to the path lookup.
            gcs_destination_path: Absolute GCS path to use as the destination;
                when None, the region's STATE ingest directory path is used.
        """
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        # Only look up the region's default path when no override was given.
        if gcs_destination_path is None:
            destination = gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id)
        else:
            destination = gcs_destination_path
        self.gcs_destination_path = GcsfsDirectoryPath.from_absolute_path(destination)

        # Bookkeeping lists, both start out empty.
        self.uploaded_files: List[str] = []
        self.unable_to_upload_files: List[str] = []
Exemple #9
0
    def __init__(self,
                 paths: str,
                 project_id: str,
                 region: str,
                 date: str,
                 dry_run: bool):
        """Stores the upload parameters and resolves the bucket and log path.

        Args:
            paths: Stored unchanged on the instance.
            project_id: Project id forwarded to the ingest directory path
                lookup and embedded in the log file name.
            region: Region code. Stored lower-cased on `self.region`, but
                passed as given to the path lookup and the log file name.
            date: ISO-format string parsed with `datetime.fromisoformat` and
                stored on `self.datetime`.
            dry_run: Stored unchanged; embedded in the log file name.
        """

        self.paths = paths
        self.project_id = project_id
        self.region = region.lower()
        self.datetime = datetime.datetime.fromisoformat(date)
        self.dry_run = dry_run

        ingest_dir = gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(ingest_dir)

        self.mutex = threading.Lock()
        self.move_progress: Optional[Bar] = None
        self.copies_list: List[Tuple[str, str]] = []

        # The log file sits next to this module; the name carries the run
        # parameters, the parsed date and the current timestamp.
        log_file_name = (
            f"upload_to_ingest_result_{region}_{self.project_id}_date_{self.datetime.date().isoformat()}"
            f"_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt"
        )
        self.log_output_path = os.path.join(os.path.dirname(__file__), log_file_name)
Exemple #10
0
 def test_get_state_ingest_directory_path_for_region(self) -> None:
     """Checks the ingest directory path for "us_nd" at the STATE level."""
     actual_path = gcsfs_direct_ingest_directory_path_for_region(
         "us_nd", SystemLevel.STATE)
     expected_path = "recidiviz-staging-direct-ingest-state-us-nd"
     self.assertEqual(actual_path, expected_path)
Exemple #11
0
 def test_get_county_ingest_directory_path_for_region(self) -> None:
     """Checks the ingest directory path for "us_tx_brazos" at COUNTY level."""
     actual_path = gcsfs_direct_ingest_directory_path_for_region(
         "us_tx_brazos", SystemLevel.COUNTY)
     expected_path = "recidiviz-123-direct-ingest-county/us_tx_brazos"
     self.assertEqual(actual_path, expected_path)