def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()
    self.max_delay_sec_between_files = max_delay_sec_between_files

    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                          system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)

    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self._get_file_tag_rank_list())

    self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
def __init__(
        self,
        region_code: str,
        dry_run: bool,
):
    self.region_code = region_code
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.dry_run = dry_run
    self.project_id = 'recidiviz-123'
    self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id))
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id))
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
        f'{datetime.datetime.now().isoformat()}.txt')
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()
    self.max_delay_sec_between_files = max_delay_sec_between_files

    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                          system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)

    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

    self.temp_output_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path())

    ingest_job_file_type_filter = \
        GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)

    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)

    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())

    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list()))
def __init__(self,
             file_type: GcsfsDirectIngestFileType,
             region_code: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             project_id: str,
             file_filter: Optional[str]):
    self.file_type = file_type
    self.region_code = region_code
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.project_id = project_id
    self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, self.file_type,
            project_id=self.project_id))
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
        f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
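# A minimal sketch, not taken from the original source, of how the
# `move_list` accumulated by this controller might be written to
# `log_output_path` once all moves complete. The method name
# `write_moves_to_log_file` is an assumption for illustration only.
def write_moves_to_log_file(self):
    # Sort so the log reads in a stable order regardless of thread scheduling.
    self.move_list.sort()
    prefix = '[DRY RUN] Would move' if self.dry_run else 'Moved'
    with open(self.log_output_path, 'w') as f:
        f.writelines(f'{prefix} {original_path} -> {new_path}\n'
                     for original_path, new_path in self.move_list)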
def create_export_manager(self, region):
    metadata_manager = PostgresDirectIngestFileMetadataManager(region.region_code)
    return DirectIngestIngestViewExportManager(
        region=region,
        fs=FakeDirectIngestGCSFileSystem(),
        ingest_directory_path=GcsfsDirectoryPath.from_absolute_path('ingest_bucket'),
        big_query_client=self.mock_client,
        file_metadata_manager=metadata_manager,
        view_collector=_ViewCollector(region, controller_file_tags=['ingest_view']))
def _copy_files_for_date(self, subdir_path_str: str):
    dir_path = GcsfsDirectoryPath.from_absolute_path(subdir_path_str.rstrip('/'))

    from_path = f'gs://{self.prod_region_storage_dir_path.bucket_name}/{dir_path.relative_path}*'
    to_path = f'gs://{self.staging_region_storage_dir_path.bucket_name}/{dir_path.relative_path}'

    if not self.dry_run:
        gsutil_cp(from_path=from_path, to_path=to_path)
    with self.mutex:
        self.copy_list.append((from_path, to_path))
        if self.copy_progress:
            self.copy_progress.next()
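# A hypothetical driver for the method above, assuming `gsutil_ls` (used
# elsewhere in this codebase) returns one entry per date subdirectory of the
# prod storage bucket, and assuming `concurrent.futures.ThreadPoolExecutor`
# is imported at module level. The names `execute_copy` and
# `thread_pool_size` are illustrative, not from the original source.
def execute_copy(self, thread_pool_size: int = 12):
    subdirs_to_copy = gsutil_ls(
        f'gs://{self.prod_region_storage_dir_path.bucket_name}/'
        f'{self.prod_region_storage_dir_path.relative_path}*')

    self.copy_progress = Bar('Copying files...', max=len(subdirs_to_copy))
    # Fan the per-date copies out over a thread pool; each worker calls
    # _copy_files_for_date, which records its result under self.mutex.
    with ThreadPoolExecutor(max_workers=thread_pool_size) as executor:
        list(executor.map(self._copy_files_for_date, subdirs_to_copy))
    self.copy_progress.finish()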
def __init__(self,
             project_id: str,
             region: str,
             file_type_to_move: GcsfsDirectIngestFileType,
             destination_file_type: GcsfsDirectIngestFileType,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             file_filter: Optional[str]):
    self.project_id = project_id
    self.region = region
    self.file_type_to_move = file_type_to_move
    self.destination_file_type = destination_file_type

    if self.file_type_to_move != self.destination_file_type and \
            self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED:
        raise ValueError(
            'Args file_type_to_move and destination_file_type must match '
            'unless the type to move is UNSPECIFIED')

    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))

    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
def __init__(self,
             region_code: str,
             file_type: GcsfsDirectIngestFileType,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool):
    self.file_type = file_type
    self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id='recidiviz-123'))
    self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id='recidiviz-staging'))
    self.dry_run = dry_run
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
    self.mutex = threading.Lock()
    self.copy_list: List[Tuple[str, str]] = []
    self.copy_progress: Optional[Bar] = None
def __init__(self,
             paths: str,
             project_id: str,
             region: str,
             date: str,
             dry_run: bool):
    self.paths = paths
    self.project_id = project_id
    self.region = region.lower()
    self.datetime = datetime.datetime.fromisoformat(date)
    self.dry_run = dry_run
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))
    self.mutex = threading.Lock()
    self.move_progress: Optional[Bar] = None
    self.copies_list: List[Tuple[str, str]] = []
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'upload_to_ingest_result_{region}_{self.project_id}_date_{self.datetime.date().isoformat()}'
        f'_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
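# A sketch, not from the original source, of how each local file to upload
# might be mapped to a normalized destination path in the ingest bucket. It
# reuses `to_normalized_unprocessed_file_path` with the (path, datetime)
# signature seen in the tests below; the helper name
# `_destination_path_for_upload` is an assumption.
def _destination_path_for_upload(self, local_path: str) -> str:
    file_name = os.path.basename(local_path)
    # Normalize against the ingest bucket using the upload date supplied to
    # this controller, so the file sorts with other files from that date.
    return to_normalized_unprocessed_file_path(
        os.path.join(self.ingest_bucket.abs_path(), file_name),
        self.datetime)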
def _move_files_for_date(self, subdir_path_str: str):
    """Moves all files in the given date subdirectory, rewriting the date
    portion of the storage path into the new YYYY/MM/DD format."""
    from_dir_path = GcsfsDirectoryPath.from_absolute_path(
        subdir_path_str.rstrip('/'))

    previous_date_format = from_dir_path.relative_path.rstrip('/').split('/')[-1]
    new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")

    from_paths = gsutil_ls(f'{subdir_path_str}*.csv')
    for from_path in from_paths:
        file_name = GcsfsFilePath(
            bucket_name=self.region_storage_dir_path.bucket_name,
            blob_name=from_path).file_name

        to_file_path = os.path.join(
            'gs://',
            self.region_storage_dir_path.bucket_name,
            self.region_code,
            GcsfsDirectIngestFileType.RAW_DATA.value,
            new_date_format,
            file_name)

        normalized_to_file_path = to_normalized_processed_file_path_from_normalized_path(
            to_file_path, file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

        to_path = normalized_to_file_path

        if not self.dry_run:
            gsutil_mv(from_path=from_path, to_path=to_path)
        with self.mutex:
            self.move_list.append((from_path, to_path))
            if self.move_progress:
                self.move_progress.next()
class TestGcsfsDirectIngestJobPrioritizer(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    _DAY_1_TIME_1 = datetime.datetime(
        year=2019, month=1, day=2, hour=3, minute=4, second=5,
        microsecond=6789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(
        year=2019, month=1, day=2, hour=3, minute=4, second=5,
        microsecond=7789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(
        year=2019, month=1, day=2, hour=10, minute=4, second=5,
        microsecond=678, tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(
        year=2019, month=1, day=3, hour=3, minute=4, second=5,
        microsecond=6789, tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def _normalized_path_for_filename(self,
                                      filename: str,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename), dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        path = self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...
        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a
        gap in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA_2.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))