def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

    If splitting is needed, the split contents are moved next to the original
    file, registered with the file metadata manager (only when ingest view
    exports are enabled for this region), and the original file is moved to
    storage.

    Args:
        path: The file to check and, if necessary, split.

    Returns:
        True if the file was split, False if splitting was not necessary.

    Raises:
        ValueError: If ingest view exports are enabled and the metadata found
            for |path| is not a DirectIngestIngestFileMetadata. This is checked
            before any split file is produced.
        Exception: Whatever error self.fs.mv raised while moving a split file
            into place, re-raised after a best-effort cleanup.
    """
    if not self._should_split_file(path):
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    # Validate the metadata type up front: failing only after the splits have
    # been moved would leave unregistered split files in the ingest directory.
    ingest_file_metadata_to_split = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        file_metadata = self.file_metadata_manager.get_file_metadata(path)
        if not isinstance(file_metadata, DirectIngestIngestFileMetadata):
            raise ValueError(
                'Attempting to split a non-ingest view type file')
        ingest_file_metadata_to_split = file_metadata

    output_dir = GcsfsDirectoryPath.from_file_path(path)

    split_contents_paths = self._split_file(path)
    upload_paths = []
    for i, split_contents_path in enumerate(split_contents_paths):
        upload_path = self._create_split_file_path(
            path, output_dir, split_num=i)

        logging.info(
            "Copying split [%s] to direct ingest directory at path [%s].",
            i, upload_path.abs_path())

        # Recorded before the move so that a partially completed move is still
        # included in the cleanup below.
        upload_paths.append(upload_path)
        try:
            self.fs.mv(split_contents_path, upload_path)
        except Exception as e:
            logging.error(
                'Threw error while copying split files from temp bucket - attempting to clean up before rethrowing.'
                ' [%s]', e)
            for p in upload_paths:
                # Cleanup is best-effort: the path whose move just failed may
                # not exist, and a failing delete must not mask the original
                # error that is re-raised below.
                try:
                    self.fs.delete(p)
                except Exception as cleanup_error:
                    logging.warning(
                        "Failed to clean up split file path [%s]: [%s]",
                        p.abs_path(), cleanup_error)
            raise

    # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
    # the metadata manager in an inconsistent state.
    if ingest_file_metadata_to_split is not None:
        logging.info(
            'Registering [%s] split files with the metadata manager.',
            len(upload_paths))

        for upload_path in upload_paths:
            ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                ingest_file_metadata_to_split, upload_path)
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)

        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(split_contents_paths))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)

    return True
def on_file_added(self, path: GcsfsFilePath) -> None:
    """Hands a newly added file to the controller if it is in the ingest directory.

    Files whose absolute path does not start with the absolute path of the
    controller's ingest directory are ignored.
    """
    added_path = path.abs_path()
    ingest_root = self.controller.ingest_directory_path.abs_path()
    if not added_path.startswith(ingest_root):
        return

    self.controller.handle_file(path, start_ingest=self.can_start_ingest)
def _test_get_local_file(file_path: GcsfsFilePath) -> str:
    """Returns the text of the local fixture file that stands in for |file_path|.

    The fixture location is built by joining file_path.abs_path() onto the
    "auth_fixtures" directory that sits next to this module.
    """
    module_dir = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
    fixtures_dir = os.path.join(module_dir, "auth_fixtures")
    fixture_path = Path(os.path.join(fixtures_dir, file_path.abs_path()))
    return fixture_path.read_text()
def delete(self, path: GcsfsFilePath) -> None:
    """Removes the entry stored under |path|'s absolute path while holding the mutex.

    A KeyError is raised if no entry is stored for the path.
    """
    with self.mutex:
        key = path.abs_path()
        del self.files[key]
def post_process_downloads(self, downloaded_path: GcsfsFilePath, _: GCSFileSystem) -> str:
    """Returns the downloaded file's absolute path; US_ID downloads need no post-processing."""
    unprocessed_path = downloaded_path.abs_path()
    return unprocessed_path
def set_content_type(self, path: GcsfsFilePath, content_type: str) -> None:
    """Replaces the entry stored for |path| with one carrying |content_type|.

    The replacement keeps the existing entry's local path. A KeyError is raised
    if no entry is stored for the path.
    """
    with self.mutex:
        key = path.abs_path()
        existing = self.files[key]
        replacement = FakeGCSFileSystemEntry(
            path, existing.local_path, content_type)
        self.files[key] = replacement
def clear_metadata(self, path: GcsfsFilePath) -> None:
    """Resets the metadata stored for |path| to an empty dict."""
    self.metadata_store[path.abs_path()] = {}
def get_metadata(self, path: GcsfsFilePath) -> Optional[Dict[str, str]]:
    """Returns the metadata stored under |path|'s absolute path.

    The lookup is a plain subscript on the metadata store, so a path with no
    stored metadata raises rather than returning None.
    """
    return self.metadata_store[path.abs_path()]