class TestDirectIngestGcsFileSystem(TestCase):
    """Tests for the FakeGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(
        bucket_name='storage_bucket', relative_path='region_subdir')

    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def setUp(self) -> None:
        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())

    def fully_process_file(self,
                           dt: datetime.datetime,
                           path: GcsfsFilePath,
                           file_type_differentiation_on: bool = False) -> None:
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system, path, has_fixture=False)

        start_num_total_files = len(self.fs.gcs_file_system.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None)
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if file_type_differentiation_on else GcsfsDirectIngestFileType.UNSPECIFIED

        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(
                self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ

            processed_path = self.fs.mv_path_to_processed_path(
                raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(
                self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW)))
            self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        ingest_unprocessed_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, file_type_filter=ingest_unprocessed_filter)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH, None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        ingest_move_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None
        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_move_type_filter)

        end_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.gcs_file_system.all_paths),
                         expected_final_total_files)

        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1 * splitting_factor)
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) + len(end_ingest_view_storage_paths),
                len(end_storage_paths))
            self.assertEqual(len(end_raw_storage_paths),
                             len(start_raw_storage_paths) + 1)
            self.assertEqual(len(end_ingest_view_storage_paths),
                             len(start_ingest_view_storage_paths) + 1)

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {p.abs_path() for p in start_storage_paths}:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                dir_path, storage_file_name = os.path.split(sp.abs_path())
                if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                    self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    def test_direct_ingest_multiple_file_moves(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_2.csv'))

    def test_move_to_storage_with_conflict(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)

    def test_direct_ingest_file_moves_with_file_types(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True)

    def test_direct_ingest_multiple_file_moves_with_file_types(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True)
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_2.csv'),
            file_type_differentiation_on=True)

    def test_move_to_storage_with_conflict_with_file_types(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
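
# Illustrative sketch (not an actual test in this suite): the fully_process_file
# helper above walks one file through mv_path_to_normalized_path ->
# (mv_path_to_processed_path + copy to an INGEST_VIEW path, when file types are
# on) -> mv_path_to_processed_path -> mv_processed_paths_before_date_to_storage.
# A new test would typically reuse that helper in the same way as the tests
# above, e.g.:
#
#     def test_direct_ingest_additional_file_moves(self) -> None:
#         self.fully_process_file(
#             datetime.datetime.now(),
#             GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_3.csv'))
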
class GcsfsDirectIngestController(
        BaseDirectIngestController[GcsfsIngestArgs, GcsfsFileContentsHandle]):
    """Controller for parsing and persisting a file in the GCS filesystem."""

    _MAX_STORAGE_FILE_RENAME_TRIES = 10
    _DEFAULT_MAX_PROCESS_JOB_WAIT_TIME_SEC = 300
    _INGEST_FILE_SPLIT_LINE_LIMIT = 2500

    def __init__(
        self,
        region_name: str,
        system_level: SystemLevel,
        ingest_directory_path: Optional[str] = None,
        storage_directory_path: Optional[str] = None,
        max_delay_sec_between_files: Optional[int] = None,
    ):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level)
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level))
        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path)

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None)
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter,
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

    # ================= #
    # NEW FILE HANDLING #
    # ================= #

    def handle_file(self, path: GcsfsFilePath, start_ingest: bool) -> None:
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename). May be called from any
        worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)
            if (parts.is_file_split and parts.file_split_size
                    and parts.file_split_size <= self.ingest_file_split_line_limit):
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split with the "
                    "correct size, kicking scheduler.",
                    path.abs_path(),
                )
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region, can_start_ingest=start_ingest)

    def _register_all_new_paths_in_metadata(
            self, paths: List[GcsfsFilePath]) -> None:
        for path in paths:
            if not self.file_metadata_manager.has_file_been_discovered(path):
                self.file_metadata_manager.mark_file_as_discovered(path)

    @trace.span
    def handle_new_files(self, can_start_ingest: bool) -> None:
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, and schedules the next ingest
        job if allowed.

        Should only be called from the scheduler queue.
        """
        if not can_start_ingest and self.region.is_ingest_launched_in_env():
            raise ValueError(
                "The can_start_ingest flag should only be used for regions where ingest is not yet launched in a "
                "particular environment. If we want to be able to selectively pause ingest processing for a state, we "
                "will first have to build a config that is respected by both the /ensure_all_file_paths_normalized "
                "endpoint and any cloud functions that trigger ingest.")

        unnormalized_paths = self.fs.get_unnormalized_file_paths(
            self.ingest_directory_path)

        unnormalized_path_file_type = (
            GcsfsDirectIngestFileType.RAW_DATA
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else GcsfsDirectIngestFileType.UNSPECIFIED)

        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(
                path, file_type=unnormalized_path_file_type)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if not can_start_ingest:
            logging.warning(
                "Ingest not configured to start post-file normalization - returning."
            )
            return

        check_is_region_launched_in_env(self.region)

        unprocessed_raw_paths = []
        ingest_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None)
        unprocessed_ingest_view_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_directory_path,
            file_type_filter=ingest_file_type_filter)

        if self.region.is_raw_vs_ingest_file_name_detection_enabled():
            unprocessed_raw_paths = self.fs.get_unprocessed_file_paths(
                self.ingest_directory_path,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
            )
            self._register_all_new_paths_in_metadata(unprocessed_raw_paths)

            if self.region.are_ingest_view_exports_enabled_in_env():
                self._register_all_new_paths_in_metadata(
                    unprocessed_ingest_view_paths)

        unprocessed_paths = unprocessed_raw_paths + unprocessed_ingest_view_paths
        did_split = False
        for path in unprocessed_ingest_view_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            if self.region.are_ingest_view_exports_enabled_in_env():
                post_split_unprocessed_ingest_view_paths = (
                    self.fs.get_unprocessed_file_paths(
                        self.ingest_directory_path,
                        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
                    ))
                self._register_all_new_paths_in_metadata(
                    post_split_unprocessed_ingest_view_paths)
            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if unprocessed_paths:
            self.schedule_next_ingest_job_or_wait_if_necessary(
                just_finished_job=False)
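
    # Illustrative sketch of the task flow this controller drives, based only
    # on the methods defined in this class (the queue/endpoint wiring that
    # triggers each step lives outside this class):
    #
    #   handle_file(path, start_ingest)     # one file lands in the bucket
    #     -> create_direct_ingest_handle_new_files_task(...)
    #   handle_new_files(can_start_ingest)
    #     -> mv_path_to_normalized_path(...)       # normalize raw uploads
    #     -> _register_all_new_paths_in_metadata(...)
    #     -> _split_file_if_necessary(...)         # split oversized ingest views
    #     -> schedule_next_ingest_job_or_wait_if_necessary(...)
    #   do_raw_data_import(args)            # raw file -> BQ, then to storage
    #   do_ingest_view_export(args)         # export ingest views for processing
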
    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in
        our metadata tables, and moving it to storage on completion.
        """
        check_is_region_launched_in_env(self.region)
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f"Raw data imports not enabled for region [{self.region.region_code}]"
            )

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted",
                data_import_args.raw_data_file_path,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                "File [%s] is already marked as processed. Skipping file processing.",
                data_import_args.raw_data_file_path.file_name,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        if not self.region.are_ingest_view_exports_enabled_in_env():
            # TODO(#3162) This is a stopgap measure for regions that have only
            #  partially launched. Delete once SQL pre-processing is enabled
            #  for all direct ingest regions.
            parts = filename_parts_from_path(
                data_import_args.raw_data_file_path)
            ingest_file_tags = self.get_file_tag_rank_list()

            if parts.file_tag in ingest_file_tags:
                self.fs.copy(
                    data_import_args.raw_data_file_path,
                    GcsfsFilePath.from_absolute_path(
                        to_normalized_unprocessed_file_path_from_normalized_path(
                            data_import_args.raw_data_file_path.abs_path(),
                            file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW,
                        )),
                )

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)

        self.file_metadata_manager.mark_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)

    def do_ingest_view_export(
            self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        check_is_region_launched_in_env(self.region)
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(
                f"Ingest view exports not enabled for region [{self.region.region_code}]. Passed args: "
                f"{ingest_view_export_args}")

        did_export = self.ingest_view_export_manager.export_view_for_args(
            ingest_view_export_args)
        if (not did_export
                or not self.file_metadata_manager.get_ingest_view_metadata_pending_export()):
            logging.info("Creating cloud task to schedule next job.")
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                region=self.region, can_start_ingest=True)

    # ============== #
    # JOB SCHEDULING #
    # ============== #

    def _schedule_any_pre_ingest_tasks(self) -> bool:
        """Schedules any tasks related to SQL preprocessing of new files in
        preparation for ingest of those files into our Postgres database.

        Returns True if any jobs were scheduled or if there were already any
        pre-ingest jobs scheduled. Returns False if there are no remaining
        pre-ingest jobs to schedule and it is safe to proceed with ingest.
        """
        if self._schedule_raw_data_import_tasks():
            logging.info("Found pre-ingest raw data import tasks to schedule.")
            return True
        # TODO(#3020): We have logic to ensure that we wait 10 min for all files
        #  to upload properly before moving on to ingest. We probably actually
        #  need this to happen between the raw data import and ingest view
        #  export steps - if we haven't seen all files yet and the most recent
        #  raw data file came in sometime in the last 10 min, we should wait to
        #  do view exports.
        if self._schedule_ingest_view_export_tasks():
            logging.info("Found pre-ingest view export tasks to schedule.")
            return True
        return False

    def _schedule_raw_data_import_tasks(self) -> bool:
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region)

        did_schedule = False
        tasks_to_schedule = [
            GcsfsRawDataBQImportArgs(path)
            for path in self.raw_file_import_manager.get_unprocessed_raw_files_to_import()
        ]
        for task_args in tasks_to_schedule:
            # If the file path has not actually been discovered by the metadata
            # manager yet, it likely was just added and a subsequent call to
            # handle_files will register it and trigger another call to this
            # function so we can schedule the appropriate job.
            discovered = self.file_metadata_manager.has_file_been_discovered(
                task_args.raw_data_file_path)
            if discovered and not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(
                    self.region, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule

    def _schedule_ingest_view_export_tasks(self) -> bool:
        """Schedules all pending ingest view export tasks for launched ingest
        view tags, if they have not already been scheduled.

        Returns True if tasks are scheduled or are still running. Otherwise,
        returns False, meaning it is safe to proceed with the next steps of
        ingest."""
        if not self.region.are_ingest_view_exports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region)
        if queue_info.has_ingest_view_export_jobs_queued():
            # Since we schedule all export jobs at once, after all raw files
            # have been processed, we wait for all of the export jobs to be
            # done before checking if we need to schedule more.
            return True

        did_schedule = False
        tasks_to_schedule = (
            self.ingest_view_export_manager.get_ingest_view_export_task_args())

        rank_list = self.get_file_tag_rank_list()
        ingest_view_name_rank = {
            ingest_view_name: i
            for i, ingest_view_name in enumerate(rank_list)
        }

        # Filter out views that aren't in ingest view tags.
        filtered_tasks_to_schedule = []
        for args in tasks_to_schedule:
            if args.ingest_view_name not in ingest_view_name_rank:
                logging.warning(
                    "Skipping ingest view task export for [%s] - not in controller ingest tags.",
                    args.ingest_view_name,
                )
                continue
            filtered_tasks_to_schedule.append(args)

        tasks_to_schedule = filtered_tasks_to_schedule

        # Sort by tag order and export datetime
        tasks_to_schedule.sort(key=lambda args: (
            ingest_view_name_rank[args.ingest_view_name],
            args.upper_bound_datetime_to_export,
        ))

        for task_args in tasks_to_schedule:
            if not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_ingest_view_export_task(
                    self.region, task_args)
                did_schedule = True

        return did_schedule

    @classmethod
    @abc.abstractmethod
    def get_file_tag_rank_list(cls) -> List[str]:
        pass

    def _get_next_job_args(self) -> Optional[GcsfsIngestArgs]:
        args = self.file_prioritizer.get_next_job_args()

        if not self.region.are_ingest_view_exports_enabled_in_env():
            return args

        if not args:
            return None

        discovered = self.file_metadata_manager.has_file_been_discovered(
            args.file_path)

        if not discovered:
            # If the file path has not actually been discovered by the
            # controller yet, it likely was just added and a subsequent call to
            # handle_files will register it and trigger another call to this
            # function so we can schedule the appropriate job.
            logging.info(
                "Found args [%s] for a file that has not been discovered by the metadata manager yet - not scheduling.",
                args,
            )
            return None

        return args

    def _wait_time_sec_for_next_args(self, args: GcsfsIngestArgs) -> int:
        if self.file_prioritizer.are_next_args_expected(args):
            # Run job immediately
            return 0

        now = datetime.datetime.utcnow()
        file_upload_time: datetime.datetime = filename_parts_from_path(
            args.file_path).utc_upload_datetime

        max_delay_sec = (self.max_delay_sec_between_files
                         if self.max_delay_sec_between_files is not None
                         else self._DEFAULT_MAX_PROCESS_JOB_WAIT_TIME_SEC)
        max_wait_from_file_upload_time = file_upload_time + datetime.timedelta(
            seconds=max_delay_sec)

        if max_wait_from_file_upload_time <= now:
            wait_time = 0
        else:
            wait_time = (max_wait_from_file_upload_time - now).seconds

        logging.info("Waiting [%s] sec for [%s]",
                     wait_time, self._job_tag(args))
        return wait_time

    def _on_job_scheduled(self, ingest_args: GcsfsIngestArgs) -> None:
        pass

    # =================== #
    # SINGLE JOB RUN CODE #
    # =================== #

    def _job_tag(self, args: GcsfsIngestArgs) -> str:
        return (f"{self.region.region_code}/{args.file_path.file_name}:"
                f"{args.ingest_time}")

    def _get_contents_handle(
            self, args: GcsfsIngestArgs) -> Optional[GcsfsFileContentsHandle]:
        return self._get_contents_handle_from_path(args.file_path)

    def _get_contents_handle_from_path(
            self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        return self.fs.download_to_temp_file(path)

    @abc.abstractmethod
    def _are_contents_empty(self, args: GcsfsIngestArgs,
                            contents_handle: GcsfsFileContentsHandle) -> bool:
        pass

    def _can_proceed_with_ingest_for_contents(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle) -> bool:
        parts = filename_parts_from_path(args.file_path)
        return self._are_contents_empty(
            args, contents_handle) or not self._must_split_contents(
                parts.file_type, args.file_path)

    def _must_split_contents(self, file_type: GcsfsDirectIngestFileType,
                             path: GcsfsFilePath) -> bool:
        if (self.region.is_raw_vs_ingest_file_name_detection_enabled()
                and file_type == GcsfsDirectIngestFileType.RAW_DATA):
            return False

        return not self._file_meets_file_line_limit(
            self.ingest_file_split_line_limit, path)

    @abc.abstractmethod
    def _file_meets_file_line_limit(self, line_limit: int,
                                    path: GcsfsFilePath) -> bool:
        """Subclasses should implement to determine whether the file meets the
        expected line limit."""

    @abc.abstractmethod
    def _parse(self, args: GcsfsIngestArgs,
               contents_handle: GcsfsFileContentsHandle) -> IngestInfo:
        pass

    def _should_split_file(self, path: GcsfsFilePath) -> bool:
        """Returns True if the file at |path| should be split, False
        otherwise."""
        parts = filename_parts_from_path(path)

        if (self.region.is_raw_vs_ingest_file_name_detection_enabled()
                and parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW):
            raise ValueError(
                f"Should not be attempting to split files other than ingest view files, found path with "
                f"file type: {parts.file_type}")

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - not splitting.",
                parts.file_tag,
                path.abs_path(),
            )
            return False

        if (parts.is_file_split and parts.file_split_size
                and parts.file_split_size <= self.ingest_file_split_line_limit):
            logging.info(
                "File [%s] already split with size [%s].",
                path.abs_path(),
                parts.file_split_size,
            )
            return False

        return self._must_split_contents(parts.file_type, path)

    @trace.span
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |ingest_file_split_line_limit|.

        Returns True if the file was split, False if splitting was not
        necessary.
        """
        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(
                path, output_dir, split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i,
                upload_path.abs_path(),
            )

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    "Threw error while copying split files from temp bucket - attempting to clean up before rethrowing."
                    " [%s]",
                    e,
                )
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register files with the metadata manager until all files
        # have been successfully copied to avoid leaving the metadata manager
        # in an inconsistent state.
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
                raise ValueError(
                    "Attempting to split a non-ingest view type file")

            logging.info(
                "Registering [%s] split files with the metadata manager.",
                len(upload_paths),
            )

            for upload_path in upload_paths:
                ingest_file_metadata = (
                    self.file_metadata_manager.register_ingest_file_split(
                        original_metadata, upload_path))
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(),
            len(split_contents_paths),
        )

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True

    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        file_type = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime),
        )
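
    # Example of the split-path naming built above (illustrative only; the
    # exact value of SPLIT_FILE_SUFFIX and the normalization applied by
    # to_normalized_unprocessed_file_path are defined elsewhere): for an
    # original stripped file name "myfile", split_num=0, extension "csv", and
    # the default line limit of 2500, rank_str is "00001" and
    # updated_file_name is "myfile_00001_<SPLIT_FILE_SUFFIX>_size2500.csv"
    # before normalization.
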
    @abc.abstractmethod
    def _split_file(self, path: GcsfsFilePath) -> List[GcsfsFilePath]:
        """Should be implemented by subclasses to split a file accessible via
        the provided path into multiple files and upload those files to GCS.

        Returns the list of upload paths."""

    def _do_cleanup(self, args: GcsfsIngestArgs) -> None:
        self.fs.mv_path_to_processed_path(args.file_path)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(args.file_path)

        parts = filename_parts_from_path(args.file_path)
        self._move_processed_files_to_storage_as_necessary(
            last_processed_date_str=parts.date_str)

    def _is_last_job_for_day(self, args: GcsfsIngestArgs) -> bool:
        """Returns True if the file handled in |args| is the last file for that
        upload date."""
        parts = filename_parts_from_path(args.file_path)
        upload_date, date_str = parts.utc_upload_datetime, parts.date_str
        more_jobs_expected = self.file_prioritizer.are_more_jobs_expected_for_day(
            date_str)
        if more_jobs_expected:
            return False
        next_job_args = self.file_prioritizer.get_next_job_args(date_str)
        if next_job_args:
            next_job_date = filename_parts_from_path(
                next_job_args.file_path).utc_upload_datetime
            return next_job_date > upload_date
        return True

    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str) -> None:
        """Moves files that have already been ingested/processed, up to and
        including the given date, into storage, if there is nothing more left
        to ingest/process, i.e. we are not expecting more files."""
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
        if not next_args:
            are_more_jobs_expected = (
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str))
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = filename_parts_from_path(
                next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info(
                    "Found a file [%s] from a date previous to our "
                    "last processed date - not moving anything to storage.",
                    next_args.file_path.abs_path())
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = \
                next_date_str != last_processed_date_str

        # Note: at this point, we expect RAW file type files to already have
        # been moved once they were imported to BQ.
        file_type_to_move = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None)

        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_directory_path,
            self.storage_directory_path,
            file_type_filter=file_type_to_move,
            date_str_bound=last_processed_date_str,
            include_bound=should_move_last_processed_date,
        )

    @staticmethod
    def file_tag(file_path: GcsfsFilePath) -> str:
        return filename_parts_from_path(file_path).file_tag
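
# Illustrative sketch (an assumption for clarity, not part of this module): a
# region-specific controller would subclass GcsfsDirectIngestController and
# fill in the abstract hooks declared above, roughly along these lines (the
# class name and tag values are hypothetical):
#
#     class UsXxController(GcsfsDirectIngestController):
#         @classmethod
#         def get_file_tag_rank_list(cls) -> List[str]:
#             return ['tag_a', 'tag_b']
#
#         def _are_contents_empty(self, args, contents_handle) -> bool:
#             ...
#
#         def _file_meets_file_line_limit(self, line_limit, path) -> bool:
#             ...
#
#         def _parse(self, args, contents_handle) -> IngestInfo:
#             ...
#
#         def _split_file(self, path) -> List[GcsfsFilePath]:
#             ...
#
# A concrete subclass would also pass its region_name and system_level through
# to __init__ above.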