def test_single_file(self):
    """Parse URLs referring to a single .gz file on Azure Blob and on GCS."""
    cases = [
        (
            "azfs://storageclwsdev0/bundles/uuid/contents.gz",
            StorageType.AZURE_BLOB_STORAGE.value,
            "azfs://storageclwsdev0/bundles/uuid/index.sqlite",
        ),
        (
            "gs://codalabbucket1/uuid/contents.gz",
            StorageType.GCS_STORAGE.value,
            "gs://codalabbucket1/uuid/index.sqlite",
        ),
    ]
    for url, expected_storage_type, expected_index_path in cases:
        parsed = parse_linked_bundle_url(url)
        self.assertEqual(parsed.storage_type, expected_storage_type)
        self.assertEqual(parsed.uses_beam, True)
        # A single .gz file: the bundle path is the URL itself, with a
        # sibling index.sqlite and no subpath inside the archive.
        self.assertEqual(parsed.bundle_path, url)
        self.assertEqual(parsed.is_archive, True)
        self.assertEqual(parsed.is_archive_dir, False)
        self.assertEqual(parsed.index_path, expected_index_path)
        self.assertEqual(parsed.archive_subpath, None)
        self.assertEqual(parsed.bundle_uuid, "uuid")
def test_container(self):
    """Parse a URL referring to a container or bucket."""
    for url, expected_storage_type in [
        ("gs://codalab-test", StorageType.GCS_STORAGE.value),
        ("azfs://devstoreaccount1/bundles", StorageType.AZURE_BLOB_STORAGE.value),
    ]:
        parsed = parse_linked_bundle_url(url)
        self.assertEqual(parsed.storage_type, expected_storage_type)
def hash_directory(path, dirs_and_files=None):
    """
    Return the hash of the contents of the folder at the given path.

    The hash is independent of the path itself: if you were to move the
    directory and hash it again, you would get the same result.
    """
    if parse_linked_bundle_url(path).uses_beam:
        # On Azure Blob Storage, the directory size stands in for the hashed contents.
        return get_size(path)
    directories, files = dirs_and_files or recursive_ls(path)

    def _path_digest(relative_path):
        # Digest of one relative path, as bytes ready to feed an accumulator.
        return hashlib.sha1(relative_path.encode()).hexdigest().encode()

    # Two-level hashing keeps the overall hash unambiguous: updating with the
    # raw directory names would hash their concatenation, which different
    # trees could produce in multiple ways.
    directory_hash = hashlib.sha1()
    for directory in sorted(directories):
        directory_hash.update(_path_digest(get_relative_path(path, directory)))

    # Files use the same scheme but fold in both the name and the contents.
    file_hash = hashlib.sha1()
    for file_name in sorted(files):
        file_hash.update(_path_digest(get_relative_path(path, file_name)))
        file_hash.update(hash_file_contents(file_name).encode())

    # Finally, hash the two intermediate digests together.
    overall_hash = hashlib.sha1(directory_hash.hexdigest().encode())
    overall_hash.update(file_hash.hexdigest().encode())
    return overall_hash.hexdigest()
def __init__(self, path: str):
    """Initialize TarSubdirStream.

    Args:
        path (str): Specified path of the subdirectory on Blob Storage. Must refer to a subdirectory path within a .tar.gz file.
    """
    from codalab.worker.file_util import OpenIndexedArchiveFile
    from codalab.worker.download_util import compute_target_info_blob_descendants_flat
    self.linked_bundle_path = parse_linked_bundle_url(path)
    # We add OpenIndexedArchiveFile to self._stack so that the context manager remains open and is exited
    # only in the method self.close(). pop_all() transfers ownership of the
    # entered context out of the `with` block so it survives past this scope.
    with ExitStack() as stack:
        self.tf = stack.enter_context(
            OpenIndexedArchiveFile(self.linked_bundle_path.bundle_path))
        self._stack = stack.pop_all()
    # Keep track of descendants of the specified subdirectory and the current descendant
    self.descendants = compute_target_info_blob_descendants_flat(path)
    self.current_desc = CurrentDescendant(desc=None, pos=0, finfo=EmptyFileInfo,
                                          tinfo=tarfile.TarInfo())
    # Buffer that stores the underlying bytes of the output tar archive
    self._buffer = BytesBuffer()
    # Output tar archive (uncompressed "w:" mode; gzipping is applied downstream)
    self.output = tarfile.open(fileobj=self._buffer, mode="w:")
def create_file(self, contents=b"hello world"):
    """Creates a file on Blob (stored as a .gz with an index.sqlite index file) and returns its path."""
    bundle_uuid = str(random.random())
    bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/contents.gz"
    compressed_file = BytesIO(gzip.compress(contents))
    # TODO: Unify this code with code in BlobStorageUploader.write_fileobj().
    with FileSystems.create(
            bundle_path, compression_type=CompressionTypes.UNCOMPRESSED) as f:
        shutil.copyfileobj(compressed_file, f)
    # Rewind so the index builder reads the gzip stream from the start.
    compressed_file.seek(0)
    with tempfile.NamedTemporaryFile(suffix=".sqlite") as tmp_index_file:
        SQLiteIndexedTar(
            fileObject=compressed_file,
            tarFileName=
            "contents",  # If saving a single file as a .gz archive, this file can be accessed by the "/contents" entry in the index.
            writeIndex=True,
            clearIndexCache=True,
            # NOTE(review): other call sites in this file pass indexFilePath= —
            # confirm which keyword the installed SQLiteIndexedTar expects.
            indexFileName=tmp_index_file.name,
        )
        # Upload the index file next to the bundle as index.sqlite.
        with FileSystems.create(
                parse_linked_bundle_url(bundle_path).index_path,
                compression_type=CompressionTypes.UNCOMPRESSED,
        ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
            shutil.copyfileobj(tif, out_index_file)
    return bundle_uuid, bundle_path
def get_file_size(file_path):
    """
    Gets the size of the file, in bytes. If file is not found, raises a
    FileNotFoundError.
    """
    linked_bundle_path = parse_linked_bundle_url(file_path)
    if linked_bundle_path.uses_beam and linked_bundle_path.is_archive:
        # If no archive subpath is specified for a .tar.gz or .gz file, get the uncompressed size of the entire file,
        # or the compressed size of the entire directory.
        if not linked_bundle_path.archive_subpath:
            if linked_bundle_path.is_archive_dir:
                filesystem = FileSystems.get_filesystem(
                    linked_bundle_path.bundle_path)
                return filesystem.size(linked_bundle_path.bundle_path)
            else:
                with OpenFile(linked_bundle_path.bundle_path, 'rb') as fileobj:
                    # Seek to the end of the decompressed stream to measure it.
                    fileobj.seek(0, os.SEEK_END)
                    return fileobj.tell()
        # If the archive file is a .tar.gz file on Azure, open the specified archive subpath within the archive.
        # If it is a .gz file on Azure, open the "/contents" entry, which represents the actual gzipped file.
        with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
            # Fix: the original asserted is_archive_dir and always used
            # archive_subpath, so the .gz "/contents" case promised by the
            # comment above was unreachable (it raised AssertionError).
            # Mirror the selection logic used in OpenFile.__enter__.
            fpath = ("/" + linked_bundle_path.archive_subpath
                     if linked_bundle_path.is_archive_dir else "/contents")
            finfo = tf.getFileInfo(fpath)
            if finfo is None:
                raise FileNotFoundError(fpath)
            return finfo.size
    if not get_path_exists(file_path):
        raise FileNotFoundError(file_path)
    # Local path
    return os.stat(file_path).st_size
def write_fileobj(self, source_ext: str, source_fileobj: IO[bytes],
                  bundle_path: str, unpack_archive: bool):
    """Upload the source stream to bundle_path on Blob Storage, then build and upload its SQLite index."""
    # Normalize the source into a gzipped stream: repack the archive, or gzip
    # the raw bytes.
    output_fileobj = (zip_util.unpack_to_archive(source_ext, source_fileobj)
                      if unpack_archive else GzipStream(source_fileobj))

    # Write archive file.
    with FileSystems.create(
            bundle_path,
            compression_type=CompressionTypes.UNCOMPRESSED) as archive_out:
        shutil.copyfileobj(output_fileobj, archive_out)

    # Write index file to a temporary file, then write that file to Blob Storage.
    with FileSystems.open(
            bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
    ) as stored_archive, tempfile.NamedTemporaryFile(
            suffix=".sqlite") as index_tmp:
        SQLiteIndexedTar(
            fileObject=stored_archive,
            # If saving a single file as a .gz archive, this file can be
            # accessed by the "/contents" entry in the index.
            tarFileName="contents",
            writeIndex=True,
            clearIndexCache=True,
            indexFileName=index_tmp.name,
        )
        with FileSystems.create(
                parse_linked_bundle_url(bundle_path).index_path,
                compression_type=CompressionTypes.UNCOMPRESSED,
        ) as index_out, open(index_tmp.name, "rb") as index_src:
            shutil.copyfileobj(index_src, index_out)
def get_target_bypass_url(self, target, **kwargs):
    """
    Get SAS url with read permission. Used for bypass server downloading from
    Azure blob storage.
    """
    target_path = self._get_target_path(target)
    parsed = parse_linked_bundle_url(target_path)
    return parsed.bundle_path_bypass_url(permission='r', **kwargs)
def get_index_sas_token(self, path, **kwargs):
    """
    Get SAS token of the index file with read and write permission. Used for uploading.
    """
    sas_url = parse_linked_bundle_url(path).index_path_bypass_url(
        permission='rw', **kwargs)
    # The SAS token is the query string at the end of the SAS url.
    return sas_url.split('?')[-1]
def get_bundle_sas_token(self, path, **kwargs):
    """
    Get SAS token with write permission. Used for bypass server uploading.
    """
    sas_url = parse_linked_bundle_url(path).bundle_path_bypass_url(
        permission='rw', **kwargs)
    # The SAS token is the query string at the end of the SAS url.
    return sas_url.split('?')[-1]
def test_non_azure_file(self):
    """Should parse a non-Azure URL properly."""
    local_path = "/tmp/storageclwsdev0/bundles/uuid/contents.txt"
    parsed = parse_linked_bundle_url(local_path)
    # A plain filesystem path resolves to disk storage, outside of Beam.
    self.assertEqual(parsed.storage_type, StorageType.DISK_STORAGE.value)
    self.assertEqual(parsed.uses_beam, False)
    self.assertEqual(parsed.bundle_path, local_path)
    self.assertEqual(parsed.is_archive, False)
def get_size(path, dirs_and_files=None):
    """
    Get the size (in bytes) of the file or directory at or under the given path.
    Does not include symlinked files and directories.
    """
    if parse_linked_bundle_url(path).uses_beam:
        return get_path_size(path)
    if os.path.islink(path) or not os.path.isdir(path):
        return os.lstat(path).st_size
    entries = itertools.chain(*(dirs_and_files or recursive_ls(path)))
    # lstat (not stat) so a symlink counts as the link itself, not its target.
    return sum(os.lstat(entry).st_size for entry in entries)
def __init__(self, path: str):
    """Open the remote file at `path` and download its SQLite index to a local temporary file.

    The index is copied into a named temporary file (created with
    delete=False so it persists on disk) so SQLite can later open it by name
    via self.index_file_name.
    """
    self.f = FileSystems.open(
        path, compression_type=CompressionTypes.UNCOMPRESSED)
    self.path = path
    with tempfile.NamedTemporaryFile(suffix=".sqlite",
                                     delete=False) as index_fileobj:
        self.index_file_name = index_fileobj.name
        # Close the remote index stream after copying — the original leaked
        # the file object returned by FileSystems.open.
        with FileSystems.open(
                parse_linked_bundle_url(self.path).index_path,
                compression_type=CompressionTypes.UNCOMPRESSED,
        ) as remote_index:
            shutil.copyfileobj(remote_index, index_fileobj)
def create_directory(self):
    """Creates a directory (stored as a .tar.gz with an index.sqlite index file) and returns its path."""
    bundle_uuid = str(random.random())
    bundle_path = f"azfs://storageclwsdev0/bundles/{bundle_uuid}/contents.tar.gz"

    def writestr(tf, name, contents):
        # Add a regular-file entry with the given string contents.
        tinfo = tarfile.TarInfo(name)
        tinfo.size = len(contents)
        tf.addfile(tinfo, BytesIO(contents.encode()))

    def writedir(tf, name):
        # Add a directory entry.
        tinfo = tarfile.TarInfo(name)
        tinfo.type = tarfile.DIRTYPE
        tf.addfile(tinfo, BytesIO())

    # TODO: Unify this code with code in UploadManager.upload_to_bundle_store().
    with FileSystems.create(
        bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
    ) as out, tempfile.NamedTemporaryFile(
        suffix=".tar.gz"
    ) as tmp_tar_file, tempfile.NamedTemporaryFile(
        suffix=".sqlite"
    ) as tmp_index_file:
        with tarfile.open(name=tmp_tar_file.name, mode="w:gz") as tf:
            # We need to create separate entries for each directory, as a regular
            # .tar.gz file would have.
            writestr(tf, "./README.md", "hello world")
            writedir(tf, "./src")
            writestr(tf, "./src/test.sh", "echo hi")
            writedir(tf, "./dist")
            writedir(tf, "./dist/a")
            writedir(tf, "./dist/a/b")
            writestr(tf, "./dist/a/b/test2.sh", "echo two")
        # The tar was written through the file *name*, so tmp_tar_file's own
        # handle is still positioned at offset 0 for this copy.
        shutil.copyfileobj(tmp_tar_file, out)
        # Build the SQLite index over the finished archive.
        with open(tmp_tar_file.name, "rb") as ttf:
            SQLiteIndexedTar(
                fileObject=ttf,
                tarFileName="contents",
                writeIndex=True,
                clearIndexCache=True,
                indexFilePath=tmp_index_file.name,
            )
        # Upload the index next to the bundle as index.sqlite.
        with FileSystems.create(
            parse_linked_bundle_url(bundle_path).index_path,
            compression_type=CompressionTypes.UNCOMPRESSED,
        ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
            shutil.copyfileobj(tif, out_index_file)
    return bundle_uuid, bundle_path
def _is_available_locally(self, target):
    """Returns whether the target is accessible from the current machine.

    Returns True if the target is on an accessible disk or if the target is on
    Azure Blob Storage.
    """
    file_path = self._get_target_path(target)
    if parse_linked_bundle_url(file_path).uses_beam:
        # Return True if the URL is in Azure Blob Storage.
        return True
    bundle_state = self._bundle_model.get_bundle_state(target.bundle_uuid)
    if bundle_state not in (State.RUNNING, State.PREPARING):
        return True
    # While the bundle is still running/preparing, it is only locally
    # available when the assigned worker shares our file system.
    worker = self._bundle_model.get_bundle_worker(target.bundle_uuid)
    return worker['shared_file_system']
def test_directory_with_subpath(self):
    """Parse a URL referring to a subpath within an archived directory."""
    url = "azfs://storageclwsdev0/bundles/uuid/contents.tar.gz/a/b.txt"
    parsed = parse_linked_bundle_url(url)
    self.assertEqual(parsed.storage_type, StorageType.AZURE_BLOB_STORAGE.value)
    # The trailing "a/b.txt" is split off as a subpath inside the archive.
    self.assertEqual(parsed.bundle_path,
                     "azfs://storageclwsdev0/bundles/uuid/contents.tar.gz")
    self.assertEqual(parsed.is_archive, True)
    self.assertEqual(parsed.is_archive_dir, True)
    self.assertEqual(parsed.index_path,
                     "azfs://storageclwsdev0/bundles/uuid/index.sqlite")
    self.assertEqual(parsed.archive_subpath, "a/b.txt")
    self.assertEqual(parsed.bundle_uuid, "uuid")
def _get_normalized_target_path(bundle_path: str, target: BundleTarget) -> str:
    """Resolve `target` against `bundle_path`, verifying the result stays inside the bundle.

    Raises:
        PathException: If the resolved path escapes the bundle directory.
    """
    if parse_linked_bundle_url(bundle_path).uses_beam:
        # On Azure, don't call os.path functions on the paths (which are azfs:// URLs).
        # We can just concatenate them together.
        return f"{bundle_path}/{target.subpath}" if target.subpath else bundle_path
    else:
        real_bundle_path = os.path.realpath(bundle_path)
        normalized_target_path = os.path.normpath(
            _get_target_path(real_bundle_path, target.subpath))
        error_path = _get_target_path(target.bundle_uuid, target.subpath)
        # Fix: require exact equality or a separator-terminated prefix. A bare
        # startswith() would wrongly accept sibling paths such as
        # "/bundles/abc-evil" when the bundle is "/bundles/abc".
        if normalized_target_path != real_bundle_path and not normalized_target_path.startswith(
                real_bundle_path + os.sep):
            raise PathException('%s is not inside the bundle.' % error_path)
        return normalized_target_path
def get_target_info(bundle_path: str, target: BundleTarget, depth: int) -> TargetInfo:
    """
    Generates an index of the contents of the given path. The index contains the fields:
        name: Name of the entry.
        type: Type of the entry, one of 'file', 'directory' or 'link'.
        size: Size of the entry.
        perm: Permissions of the entry.
        link: If type is 'link', where the symbolic link points to.
        contents: If type is 'directory', a list of entries for the contents.
        For the top level entry, also contains resolved_target, a BundleTarget:

    Any entries more than depth levels deep are filtered out. Depth 0, for
    example, means only the top-level entry is included, and no contents.
    Depth 1 means the contents of the top-level are included, but nothing
    deeper.

    If the given path does not exist, raises PathException.

    If reading the given path is not secure, raises a PathException.
    """
    final_path = _get_normalized_target_path(bundle_path, target)
    not_found_message = "Path '{}' in bundle {} not found".format(
        target.subpath, target.bundle_uuid)
    if parse_linked_bundle_url(final_path).uses_beam:
        # Targets on Blob Storage need a Blob-specific lookup.
        try:
            info = _compute_target_info_blob(final_path, depth)
        except Exception:
            # Log the full traceback but surface only a terse PathException.
            logging.error("Path '{}' in bundle {} not found: {}".format(
                target.subpath, target.bundle_uuid, traceback.format_exc()))
            raise PathException(not_found_message)
    else:
        if not os.path.islink(final_path) and not os.path.exists(final_path):
            raise PathException(not_found_message)
        info = _compute_target_info_local(final_path, depth)
    info['resolved_target'] = target
    return info
def __enter__(self) -> IO[bytes]:
    """Open self.path and return a readable byte stream, gzipped when self.gzipped is set."""
    linked_bundle_path = parse_linked_bundle_url(self.path)
    if linked_bundle_path.uses_beam and linked_bundle_path.is_archive:
        # Stream an entire, single .gz file from Blob Storage. This is gzipped by default,
        # so if the user requested a gzipped version of the entire file, just read and return it.
        if not linked_bundle_path.is_archive_dir and self.gzipped:
            return FileSystems.open(
                self.path, compression_type=CompressionTypes.UNCOMPRESSED)
        # Stream an entire, single .tar.gz file from Blob Storage. This is gzipped by default,
        # and directories are always gzipped, so just read and return it.
        if linked_bundle_path.is_archive_dir and not linked_bundle_path.archive_subpath:
            if not self.gzipped:
                raise IOError("Directories must be gzipped.")
            return FileSystems.open(
                self.path, compression_type=CompressionTypes.UNCOMPRESSED)
        # If a file path is specified within an archive file on Blob Storage, open the specified path within the archive.
        # NOTE(review): the streams returned below are created inside this
        # `with`, so OpenIndexedArchiveFile.__exit__ runs before the caller
        # reads them — confirm tf remains usable after exit.
        with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
            isdir = lambda finfo: stat.S_ISDIR(finfo.mode)
            # If the archive file is a .tar.gz file, open the specified archive subpath within the archive.
            # If it is a .gz file, open the "/contents" entry, which represents the actual gzipped file.
            fpath = ("/" + linked_bundle_path.archive_subpath
                     if linked_bundle_path.is_archive_dir else "/contents")
            finfo = cast(FileInfo, tf.getFileInfo(fpath))
            if finfo is None:
                raise FileNotFoundError(fpath)
            if isdir(finfo):
                # Stream a directory from within the archive
                if not self.gzipped:
                    raise IOError("Directories must be gzipped.")
                return GzipStream(TarSubdirStream(self.path))
            else:
                # Stream a single file from within the archive
                fs = TarFileStream(tf, finfo)
                return GzipStream(fs) if self.gzipped else fs
    else:
        # Stream a directory or file from disk storage.
        if os.path.isdir(self.path):
            if not self.gzipped:
                raise IOError("Directories must be gzipped.")
            return tar_gzip_directory(self.path)
        if self.gzipped:
            raise IOError(
                "Gzipping local files from disk from OpenFile is not yet supported. Please use file_util.gzip_file instead."
            )
        return open(self.path, self.mode)
def gzip_file(file_path: str) -> IO[bytes]:
    """
    Returns a file-like object containing the gzipped version of the given file.
    Note: For right now, it's important for gzip to run in a separate process,
    otherwise things on CodaLab grind to a halt!
    """
    if parse_linked_bundle_url(file_path).uses_beam:
        try:
            # NOTE(review): the stream is returned from inside the `with`
            # block, so OpenFile.__exit__ runs before the caller reads it —
            # confirm __exit__ does not close/invalidate the returned object.
            with OpenFile(file_path, gzipped=True) as file_path_obj:
                return file_path_obj
        except Exception as e:
            raise IOError(e)
    # Local file: gzip in a subprocess (-c writes to stdout, -n omits the
    # timestamp/name so output is deterministic) and hand back its stdout pipe.
    args = ['gzip', '-c', '-n', file_path]
    try:
        proc = subprocess.Popen(args, stdout=subprocess.PIPE)
        if proc.stdout is None:
            raise IOError("Stdout is empty")
        return proc.stdout
    except subprocess.CalledProcessError as e:
        raise IOError(e.output)
def get_path_size(path, exclude_names=None, ignore_nonexistent_path=False):
    """
    Returns the size of the contents of the given path, in bytes.

    If path is a directory, any directory entries in exclude_names will be
    ignored.

    If ignore_nonexistent_path is True and the input path is nonexistent, the
    value 0 is returned. Else, an exception is raised (FileNotFoundError).
    """
    # Fix: avoid a mutable default argument; None means "exclude nothing".
    if exclude_names is None:
        exclude_names = []
    if parse_linked_bundle_url(path).uses_beam:
        # On Azure, use Apache Beam methods, not native os methods,
        # to get the path size.
        # Get the size of the specified path (file / directory).
        # This will only get the right size of files, not of directories (but we don't
        # store any bundles as directories on Azure).
        return get_file_size(path)
    try:
        result = os.lstat(path).st_size
    except FileNotFoundError:
        if ignore_nonexistent_path:
            # If we are to ignore nonexistent paths, return the size of this path as 0
            return 0
        # Raise the FileNotFoundError
        raise
    if not os.path.islink(path) and os.path.isdir(path):
        for child in os.listdir(path):
            if child not in exclude_names:
                try:
                    full_child_path = os.path.join(path, child)
                except UnicodeDecodeError:
                    # Fall back to explicitly decoding bytes paths.
                    full_child_path = os.path.join(path.decode('utf-8'),
                                                   child.decode('utf-8'))
                # Children may vanish mid-walk, so ignore missing ones.
                result += get_path_size(full_child_path,
                                        ignore_nonexistent_path=True)
    return result
def remove(path):
    """
    Remove the given path, whether it is a directory, file, or link.
    """
    if parse_linked_bundle_url(path).uses_beam:
        from apache_beam.io.filesystems import FileSystems
        # Fix: delete when the path DOES exist. The original condition was
        # inverted (`if not FileSystems.exists(path)`), so Blob Storage paths
        # were never actually deleted.
        if FileSystems.exists(path):
            FileSystems.delete([path])
        return
    check_isvalid(path, 'remove')
    set_write_permissions(path)  # Allow permissions
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except shutil.Error:
            # Best-effort removal; failure is reported below.
            pass
    else:
        os.remove(path)
    if os.path.exists(path):
        print('Failed to remove %s' % path)
def _compute_target_info_blob(
    path: str, depth: Union[int, float], return_generators=False
) -> TargetInfo:
    """Computes target info for a file that is externalized on Blob Storage, meaning that it's contained within an indexed archive file.

    Args:
        path (str): The path that refers to the specified target.
        depth (Union[int, float]): Depth until which directory contents are resolved.
        return_generators (bool, optional): If set to True, the 'contents' key of directories is equal to a generator instead of a list. Defaults to False.

    Raises:
        PathException: Path not found or invalid.

    Returns:
        TargetInfo: Target info of specified path.
    """
    linked_bundle_path = parse_linked_bundle_url(path)
    if not FileSystems.exists(linked_bundle_path.bundle_path):
        raise PathException(linked_bundle_path.bundle_path)
    if not linked_bundle_path.is_archive:
        # Single file
        raise PathException(
            "Single files on Blob Storage are not supported; only a path within an archive file is supported."
        )
    # process_contents is used to process the value of the 'contents' key (which is a generator) before it is returned.
    # If return_generators is False, it resolves the given generator into a list; otherwise, it just returns
    # the generator unchanged.
    process_contents = list if return_generators is False else lambda x: x
    with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
        # Small predicates over archive-index file-info entries.
        islink = lambda finfo: stat.S_ISLNK(finfo.mode)
        readlink = lambda finfo: finfo.linkname
        isfile = lambda finfo: not stat.S_ISDIR(finfo.mode)
        isdir = lambda finfo: stat.S_ISDIR(finfo.mode)
        listdir = lambda path: cast(Dict[str, FileInfo], tf.listDir(path) or {})

        def _get_info(path: str, depth: Union[int, float]) -> TargetInfo:
            """This function is called to get the target info of the specified path.
            If the specified path is a directory and additional depth is requested, this
            function is recursively called to retrieve the target info of files within
            the directory, much like _compute_target_info_local.
            """
            # Archive index paths are absolute within the archive.
            if not path.startswith("/"):
                path = "/" + path
            finfo = cast(FileInfo, tf.getFileInfo(path))
            if finfo is None:
                # Not found
                raise PathException("File not found.")
            result: TargetInfo = {
                'name': os.path.basename(path),  # get last part of path
                'size': finfo.size,
                'perm': finfo.mode & 0o777,
                'type': '',
            }
            if islink(finfo):
                result['type'] = 'link'
                result['link'] = readlink(finfo)
            elif isfile(finfo):
                result['type'] = 'file'
            elif isdir(finfo):
                result['type'] = 'directory'
                if depth > 0:
                    # Recurse one level shallower; "." refers to the directory itself.
                    result['contents'] = process_contents(
                        _get_info(path + "/" + file_name, depth - 1)
                        for file_name in listdir(path)
                        if file_name != "."
                    )
            return result

        if not linked_bundle_path.is_archive_dir:
            # Return the contents of the single .gz file.
            # The entry returned by ratarmount for a single .gz file is not technically part of a tar archive
            # and has a name hardcoded as "contents," so we modify the type, name, and permissions of
            # the output accordingly.
            return cast(
                TargetInfo,
                dict(
                    _get_info("/contents", depth),
                    type="file",
                    name=linked_bundle_path.bundle_uuid,
                    perm=0o755,
                ),
            )
        if linked_bundle_path.archive_subpath:
            # Return the contents of a subpath within a directory.
            return _get_info(linked_bundle_path.archive_subpath, depth)
        else:
            # No subpath, return the entire directory with the bundle
            # contents in it. The permissions of this directory
            # cannot be set by the user (the user can only set permissions
            # of files *within* this directory that are part of the bundle
            # itself), so we just return a placeholder value of 0o755
            # for this directory's permissions.
            file = FileSystems.match([path])[0].metadata_list[0]
            result: TargetInfo = {
                'name': linked_bundle_path.bundle_uuid,
                'type': 'directory',
                'size': file.size_in_bytes,
                'perm': 0o755,
            }
            if depth > 0:
                result['contents'] = process_contents(
                    _get_info(file_name, depth - 1)
                    for file_name in listdir("/")
                    if file_name != "."
                )
            return result
def get_bundle_index_url(self, path, **kwargs):
    """Return a bypass URL for the bundle's index file."""
    parsed = parse_linked_bundle_url(path)
    return parsed.index_path_bypass_url(**kwargs)
def get_bundle_signed_url(self, path, **kwargs):
    """
    Get signed url for the bundle path
    """
    parsed = parse_linked_bundle_url(path)
    return parsed.bundle_path_bypass_url(**kwargs)
def write_fileobj(
    self,
    source_ext: str,
    source_fileobj: IO[bytes],
    bundle_path: str,
    unpack_archive: bool,
    bundle_conn_str=None,
    index_conn_str=None,
    progress_callback=None,
):
    """Upload source_fileobj to bundle_path on Blob Storage, then build and upload its SQLite index.

    Args:
        source_ext: Extension of the source (used when repacking archives).
        source_fileobj: Stream containing the bundle contents.
        bundle_path: Destination URL for the bundle archive.
        unpack_archive: If True, repack the source archive; otherwise gzip raw bytes.
        bundle_conn_str: Optional connection string used while writing the bundle.
        index_conn_str: Optional connection string used while writing the index.
        progress_callback: Called with total bytes uploaded so far; returning
            a falsy value aborts the upload.

    Raises:
        Exception: If the progress callback requests an abort.
    """
    if unpack_archive:
        output_fileobj = zip_util.unpack_to_archive(source_ext, source_fileobj)
    else:
        output_fileobj = GzipStream(source_fileobj)

    # Temporarily swap in the bundle-specific connection string, remembering
    # the original value so it can be restored in the finally block.
    if bundle_conn_str is not None:
        conn_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING', '')
        os.environ['AZURE_STORAGE_CONNECTION_STRING'] = bundle_conn_str
    try:
        bytes_uploaded = 0
        CHUNK_SIZE = 16 * 1024
        # Write archive file in chunks so progress can be reported and the
        # upload can be aborted mid-stream.
        with FileSystems.create(
                bundle_path,
                compression_type=CompressionTypes.UNCOMPRESSED) as out:
            while True:
                to_send = output_fileobj.read(CHUNK_SIZE)
                if not to_send:
                    break
                out.write(to_send)
                bytes_uploaded += len(to_send)
                if progress_callback is not None:
                    should_resume = progress_callback(bytes_uploaded)
                    if not should_resume:
                        raise Exception('Upload aborted by client')
        # Build the SQLite index from the uploaded archive, then copy it up.
        with FileSystems.open(
                bundle_path, compression_type=CompressionTypes.UNCOMPRESSED
        ) as ttf, tempfile.NamedTemporaryFile(
                suffix=".sqlite") as tmp_index_file:
            SQLiteIndexedTar(
                fileObject=ttf,
                tarFileName=
                "contents",  # If saving a single file as a .gz archive, this file can be accessed by the "/contents" entry in the index.
                writeIndex=True,
                clearIndexCache=True,
                indexFilePath=tmp_index_file.name,
            )
            # The index may live under a different account/credential.
            if bundle_conn_str is not None:
                os.environ['AZURE_STORAGE_CONNECTION_STRING'] = index_conn_str
            with FileSystems.create(
                    parse_linked_bundle_url(bundle_path).index_path,
                    compression_type=CompressionTypes.UNCOMPRESSED,
            ) as out_index_file, open(tmp_index_file.name, "rb") as tif:
                while True:
                    to_send = tif.read(CHUNK_SIZE)
                    if not to_send:
                        break
                    out_index_file.write(to_send)
                    bytes_uploaded += len(to_send)
                    if progress_callback is not None:
                        should_resume = progress_callback(bytes_uploaded)
                        if not should_resume:
                            raise Exception('Upload aborted by client')
    finally:
        # Restore the original connection string. Fix: the original assigned
        # None into os.environ when there was no prior value, which raises
        # TypeError at runtime (it was marked `# type: ignore`); unset the
        # variable instead.
        if bundle_conn_str is not None:
            if conn_str != '':
                os.environ['AZURE_STORAGE_CONNECTION_STRING'] = conn_str
            else:
                os.environ.pop('AZURE_STORAGE_CONNECTION_STRING', None)