Ejemplo n.º 1
0
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        When a split is needed, the split pieces are moved into the directory derived from |path|, registered with
        the file metadata manager (only when ingest view exports are enabled for this region), and the original file
        is then moved to storage.

        Args:
            path: The file to check and, if necessary, split.

        Returns:
            True if the file was split, False if splitting was not necessary.

        Raises:
            ValueError: If ingest view exports are enabled and the metadata found for |path| is not a
                DirectIngestIngestFileMetadata. This is raised before any file is split or moved.
            Exception: Any error raised while moving a split file is re-raised after attempting to delete the split
                files uploaded so far.
        """

        if not self._should_split_file(path):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        # Stays None when ingest view exports are disabled; in that case no metadata bookkeeping happens below.
        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)
            # Validate up front so that we never move split files into the ingest directory only to discover
            # afterwards that they cannot be registered with the metadata manager.
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())

            # Recorded before the move so that the destination of a failed move is also included in the cleanup.
            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    'Threw error while copying split files from temp bucket - attempting to clean up before rethrowing.'
                    ' [%s]', e)
                for p in upload_paths:
                    self.fs.delete(p)
                # Bare raise re-raises the active exception with its original traceback.
                raise

        # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
        # the metadata manager in an inconsistent state.
        if original_metadata is not None:
            logging.info(
                'Registering [%s] split files with the metadata manager.',
                len(upload_paths))

            for upload_path in upload_paths:
                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
Ejemplo n.º 2
0
 def on_file_added(self, path: GcsfsFilePath) -> None:
     """Hands a newly added file to the controller if it lives under the ingest directory.

     Files whose absolute path does not start with the absolute path of the controller's ingest directory are
     ignored. Matching files are passed to the controller's handle_file with start_ingest set from
     self.can_start_ingest.
     """
     added_file = path.abs_path()
     ingest_root = self.controller.ingest_directory_path.abs_path()
     if not added_file.startswith(ingest_root):
         return

     self.controller.handle_file(path, start_ingest=self.can_start_ingest)
Ejemplo n.º 3
0
def _test_get_local_file(file_path: GcsfsFilePath) -> str:
    """Reads and returns the text of a file under the local "auth_fixtures" directory.

    The fixtures directory is resolved relative to the real (symlink-free) location of this module, and the
    file to read is that directory joined with file_path.abs_path().
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    fixtures_root = os.path.join(os.path.realpath(module_dir), "auth_fixtures")
    fixture_file = Path(os.path.join(fixtures_root, file_path.abs_path()))
    return fixture_file.read_text()
Ejemplo n.º 4
0
 def delete(self, path: GcsfsFilePath) -> None:
     """Removes the entry stored under the absolute path of |path| from self.files.

     The removal happens while holding self.mutex. No default is passed to pop, so a missing entry is not
     silently ignored (presumably self.files is a dict, in which case a KeyError is raised).
     """
     with self.mutex:
         file_key = path.abs_path()
         self.files.pop(file_key)
Ejemplo n.º 5
0
 def post_process_downloads(self, downloaded_path: GcsfsFilePath,
                            _: GCSFileSystem) -> str:
     """Returns the absolute path of the downloaded file as-is.

     The US_ID server doesn't require any post-processing, so the file system argument is unused.
     """
     unprocessed_path = downloaded_path.abs_path()
     return unprocessed_path
Ejemplo n.º 6
0
 def set_content_type(self, path: GcsfsFilePath, content_type: str) -> None:
     """Replaces the stored entry for |path| with one that carries the given content type.

     The new entry keeps the existing entry's local_path. The lookup and replacement both happen while
     holding self.mutex; the lookup by index means an entry for |path| must already exist.
     """
     with self.mutex:
         file_key = path.abs_path()
         existing_entry = self.files[file_key]
         self.files[file_key] = FakeGCSFileSystemEntry(
             path, existing_entry.local_path, content_type)
Ejemplo n.º 7
0
 def clear_metadata(self, path: GcsfsFilePath) -> None:
     """Resets the metadata stored for |path| to an empty dict, keyed by the path's absolute path."""
     self.metadata_store[path.abs_path()] = {}
Ejemplo n.º 8
0
 def get_metadata(self, path: GcsfsFilePath) -> Optional[Dict[str, str]]:
     """Returns the metadata stored for |path|, looked up by the path's absolute path.

     The lookup indexes self.metadata_store directly, so a path with no stored metadata is not mapped to None
     here (presumably a KeyError is raised if the store is a plain dict).
     """
     return self.metadata_store[path.abs_path()]