def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[str] = None,
):
    """Sets up auth, filesystem handles, and destination paths for an
    SFTP download run.

    Args:
        project_id: GCP project the files are downloaded into.
        region: Region code; lowercased before being stored.
        lower_bound_update_datetime: Lower-bound datetime for the run
            (stored as-is; None means no bound).
        gcs_destination_path: Optional absolute-path override for the
            destination bucket. Defaults to the region's standard
            direct-ingest bucket.
    """
    self.project_id = project_id
    self.region = region.lower()
    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.lower_bound_update_datetime = lower_bound_update_datetime

    # Use the standard region ingest bucket unless an explicit
    # destination override was supplied.
    if gcs_destination_path is None:
        self.bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
    else:
        self.bucket = GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)

    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )
def __init__(
    self,
    region_code: str,
    dry_run: bool,
):
    """Gathers the bucket paths and bookkeeping state needed to move a
    region's prod ingest files into raw storage.

    Args:
        region_code: Region whose files are being moved.
        dry_run: When True the run only records what it would move.
    """
    self.region_code = region_code
    # presumably these are legacy files that carry no type tag — confirm
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.dry_run = dry_run
    # NOTE(review): project id is hard-coded to prod ("recidiviz-123");
    # this tool appears to be intentionally prod-only — confirm.
    self.project_id = 'recidiviz-123'

    self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id))
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id))

    # Per-run log file written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
        f'{datetime.datetime.now().isoformat()}.txt')

    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    """Builds the filesystem handle, directory paths, and file
    prioritizer for this ingest controller.

    Args:
        region_name: Region this controller operates over.
        system_level: System level forwarded to the superclass.
        ingest_directory_path: Optional override for the ingest path;
            defaults to the region's standard direct-ingest path.
        storage_directory_path: Optional override for the storage path;
            defaults to the region's standard storage path.
        max_delay_sec_between_files: Optional pacing bound, stored as-is.
    """
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()
    self.max_delay_sec_between_files = max_delay_sec_between_files

    # Fall back to the standard region paths when no overrides are given.
    ingest_directory_path = (
        ingest_directory_path
        or gcsfs_direct_ingest_directory_path_for_region(region_name, system_level))
    self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
        ingest_directory_path)

    storage_directory_path = (
        storage_directory_path
        or gcsfs_direct_ingest_storage_directory_path_for_region(
            region_name, system_level))
    self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
        storage_directory_path)

    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs,
        self.ingest_directory_path,
        self._get_file_tag_rank_list())

    self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None):
    """Builds the filesystem handle, directory paths, and file
    prioritizer for this ingest controller.

    Args:
        region_name: Region this controller operates over.
        system_level: System level forwarded to the superclass.
        ingest_directory_path: Optional override for the ingest path.
        storage_directory_path: Optional override for the storage path.
    """
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()

    # NOTE(review): unlike the sibling controller variants, these paths
    # are kept as raw strings rather than wrapped in GcsfsDirectoryPath —
    # confirm that downstream consumers expect strings here.
    self.ingest_directory_path = (
        ingest_directory_path
        or gcsfs_direct_ingest_directory_path_for_region(region_name, system_level))

    self.storage_directory_path = (
        storage_directory_path
        or gcsfs_direct_ingest_storage_directory_path_for_region(
            region_name, system_level))

    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs,
        self.ingest_directory_path,
        self._get_file_tag_rank_list())
def __init__(self,
             project_id: str,
             region: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             file_filter: Optional[str]):
    """Collects run parameters, bucket paths, and progress/bookkeeping
    state for a storage-to-ingest file move.

    Args:
        project_id: GCP project the move runs against.
        region: Region whose files are moved.
        start_date_bound: Optional lower date bound (stored as-is).
        end_date_bound: Optional upper date bound (stored as-is).
        dry_run: When True the run only records what it would move.
        file_filter: Optional filter applied to candidate files.
    """
    self.project_id = project_id
    self.region = region
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region, SystemLevel.STATE, project_id=self.project_id)
    self.ingest_bucket = gcsfs_direct_ingest_directory_path_for_region(
        region, SystemLevel.STATE, project_id=self.project_id)

    # Shared progress/bookkeeping state guarded by the mutex.
    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []

    # Per-run log file written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    """Builds the filesystem handles, directory paths, and the manager
    objects (prioritizer, metadata, raw import, ingest-view export) this
    controller uses.

    Args:
        region_name: Region this controller operates over.
        system_level: System level forwarded to the superclass.
        ingest_directory_path: Optional override for the ingest path;
            defaults to the region's standard direct-ingest path.
        storage_directory_path: Optional override for the storage path;
            defaults to the region's standard storage path.
        max_delay_sec_between_files: Optional pacing bound, stored as-is.
    """
    super().__init__(region_name, system_level)
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.max_delay_sec_between_files = max_delay_sec_between_files

    # Fall back to the standard region paths when no overrides are given.
    ingest_directory_path = (
        ingest_directory_path
        or gcsfs_direct_ingest_directory_path_for_region(region_name, system_level))
    self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
        ingest_directory_path)

    storage_directory_path = (
        storage_directory_path
        or gcsfs_direct_ingest_storage_directory_path_for_region(
            region_name, system_level))
    self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
        storage_directory_path)

    self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_temporary_output_directory_path())

    # Only filter ingest jobs by file type once the region distinguishes
    # raw vs ingest-view files by name.
    if self.region.is_raw_vs_ingest_file_name_detection_enabled():
        ingest_job_file_type_filter: Optional[GcsfsDirectIngestFileType] = \
            GcsfsDirectIngestFileType.INGEST_VIEW
    else:
        ingest_job_file_type_filter = None

    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs,
        self.ingest_directory_path,
        self.get_file_tag_rank_list(),
        ingest_job_file_type_filter)

    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)

    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())

    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list()))
def __init__(
    self,
    project_id: str,
    region: str,
    file_type_to_move: GcsfsDirectIngestFileType,
    destination_file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    file_filter: Optional[str],
):
    """Collects run parameters, bucket paths, and progress/bookkeeping
    state for a storage-to-ingest file move that may also convert the
    file type.

    Args:
        project_id: GCP project the move runs against.
        region: Region whose files are moved.
        file_type_to_move: Type of the files to select for moving.
        destination_file_type: Type the moved files should end up as.
            May only differ from file_type_to_move when the latter is
            UNSPECIFIED.
        start_date_bound: Optional lower date bound (stored as-is).
        end_date_bound: Optional upper date bound (stored as-is).
        dry_run: When True the run only records what it would move.
        file_filter: Optional filter applied to candidate files.

    Raises:
        ValueError: If the two file types differ while file_type_to_move
            is not UNSPECIFIED.
    """
    self.project_id = project_id
    self.region = region
    self.file_type_to_move = file_type_to_move
    self.destination_file_type = destination_file_type

    # A type conversion is only allowed when moving UNSPECIFIED files.
    if (
        self.file_type_to_move != self.destination_file_type
        and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
    ):
        # BUGFIX: the message previously read "... if type to move is
        # UNSPECIFIED", the inverse of the guard above.
        raise ValueError(
            "Args file_type_to_move and destination_file_type must match if type to move is not UNSPECIFIED"
        )

    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )

    # Shared progress/bookkeeping state guarded by the mutex.
    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []

    # Per-run log file written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
def __init__(
    self,
    paths_with_timestamps: List[Tuple[str, datetime.datetime]],
    project_id: str,
    region: str,
    gcs_destination_path: Optional[str] = None,
):
    """Stores the upload worklist and resolves the GCS destination.

    Args:
        paths_with_timestamps: (path, timestamp) pairs to upload.
        project_id: GCP project the files are uploaded into.
        region: Region code; lowercased before being stored.
        gcs_destination_path: Optional absolute-path override for the
            destination; defaults to the region's direct-ingest bucket.
    """
    self.paths_with_timestamps = paths_with_timestamps
    self.project_id = project_id
    self.region = region.lower()
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

    # Use the standard region ingest bucket unless an explicit
    # destination override was supplied.
    if gcs_destination_path is None:
        self.gcs_destination_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))
    else:
        self.gcs_destination_path = GcsfsDirectoryPath.from_absolute_path(
            gcs_destination_path)

    self.uploaded_files: List[str] = []
    self.unable_to_upload_files: List[str] = []
def __init__(self,
             paths: str,
             project_id: str,
             region: str,
             date: str,
             dry_run: bool):
    """Collects run parameters and bookkeeping state for uploading
    files into a region's ingest bucket.

    Args:
        paths: Path argument for the upload (stored as-is).
        project_id: GCP project the upload runs against.
        region: Region code; lowercased before being stored.
        date: ISO-format date string; parsed into a datetime.
        dry_run: When True the run only records what it would copy.
    """
    self.paths = paths
    self.project_id = project_id
    self.region = region.lower()
    # Raises ValueError if `date` is not valid ISO format.
    self.datetime = datetime.datetime.fromisoformat(date)
    self.dry_run = dry_run

    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))

    # Shared progress/bookkeeping state guarded by the mutex.
    self.mutex = threading.Lock()
    self.move_progress: Optional[Bar] = None
    self.copies_list: List[Tuple[str, str]] = []

    # Per-run log file written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"upload_to_ingest_result_{region}_{self.project_id}_date_{self.datetime.date().isoformat()}"
        f"_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
def test_get_state_ingest_directory_path_for_region(self) -> None:
    """State-level regions map to a dashed top-level bucket name."""
    path = gcsfs_direct_ingest_directory_path_for_region(
        "us_nd", SystemLevel.STATE)
    self.assertEqual(path, "recidiviz-staging-direct-ingest-state-us-nd")
def test_get_county_ingest_directory_path_for_region(self) -> None:
    """County-level regions map to a shared bucket with a region subdir."""
    path = gcsfs_direct_ingest_directory_path_for_region(
        "us_tx_brazos", SystemLevel.COUNTY)
    self.assertEqual(path, "recidiviz-123-direct-ingest-county/us_tx_brazos")