Example #1
0
class GzipStream(BytesIO):
    """A stream that gzips a file in chunks.

    Uncompressed data is pulled lazily from the wrapped file object, pushed
    through a ``gzip.GzipFile`` writer, and staged in an internal buffer from
    which ``read()`` serves its callers.
    """

    def __init__(self, fileobj: IO[bytes]):
        """Wrap ``fileobj`` so that reading from this stream yields gzipped bytes.

        Args:
            fileobj: Binary file object supplying the uncompressed data.
        """
        self.__input = fileobj
        self.__buffer = BytesBuffer()
        self.__gzip = gzip.GzipFile(None, mode='wb', fileobj=self.__buffer)

    def read(self, num_bytes=None) -> bytes:
        """Return up to ``num_bytes`` bytes of gzip-compressed data.

        Args:
            num_bytes: Maximum number of compressed bytes to return. ``None``
                or a negative value (the ``io`` convention for "everything")
                compresses the whole remaining input and returns all of it.

        Returns:
            The next compressed bytes; fewer than ``num_bytes`` only once the
            input is exhausted.
        """
        # A negative size previously skipped the loop entirely, so nothing was
        # ever compressed. Treat it as "read all", like the io module does.
        if num_bytes is not None and num_bytes < 0:
            num_bytes = None
        while num_bytes is None or len(self.__buffer) < num_bytes:
            chunk = self.__input.read(num_bytes)
            if not chunk:
                # End of input: closing the writer flushes the remaining
                # compressed data and the gzip trailer into the buffer.
                self.__gzip.close()
                break
            self.__gzip.write(chunk)
        return self.__buffer.read(num_bytes)

    def close(self):
        """Finalize the gzip writer and close the wrapped input file.

        Safe to call more than once: ``GzipFile.close()`` is a no-op on an
        already-closed writer.
        """
        try:
            self.__gzip.close()
        finally:
            # Always release the input, even if finalizing the writer fails.
            self.__input.close()
class TarFileStream(BytesIO):
    """Streams a file from a tar archive stored on Blob Storage.

    The general idea is that whenever .read() is called on this class,
    it will read the specified number of bytes through ratarmount's tf.read()
    API on the associated file and return those bytes.

    TODO (Ashwin): If we can add tf.open() support upstream to the ratarmount API
    (right now it only supports tf.read()), we may not have a need for this class anymore.
    """

    def __init__(self, tf: "SQLiteIndexedTar", finfo: "FileInfo"):
        """Initialize TarFileStream.

        Args:
            tf (SQLiteIndexedTar): Tar archive indexed by ratarmount.
            finfo (FileInfo): FileInfo object describing the file that is to be read from the aforementioned tar archive.
        """
        self.tf = tf
        self.finfo = finfo
        # Staging area between the archive reads and the caller.
        self._buffer = BytesBuffer()
        # Offset within the member file of the next byte to fetch from the archive.
        self.pos = 0

    def _read_from_tar(self, num_bytes):
        """Read the contents of the specified file from within
        the tar archive.

        Fetches at most ``num_bytes`` bytes (or everything that remains when
        ``num_bytes`` is None) starting at ``self.pos``, appends them to the
        internal buffer and advances ``self.pos``.

        Returns:
            The number of bytes that were actually fetched.
        """
        # Never request more than what is left after the current offset.
        remaining = self.finfo.size - self.pos
        size = remaining if num_bytes is None else min(remaining, num_bytes)
        contents = self.tf.read(
            fileInfo=self.finfo,
            size=size,
            offset=self.pos,
        )
        self._buffer.write(contents)
        self.pos += len(contents)
        return len(contents)

    def read(self, num_bytes=None):
        """Read the specified number of bytes from the associated file.

        Args:
            num_bytes: Maximum number of bytes to return. ``None`` or a
                negative value (the ``io`` convention) reads to the end of
                the file.

        Returns:
            The bytes read; fewer than ``num_bytes`` only at end of file.
        """
        if num_bytes is not None and num_bytes < 0:
            num_bytes = None
        while (self.pos < self.finfo.size) and (num_bytes is None or
                                                len(self._buffer) < num_bytes):
            if not self._read_from_tar(num_bytes):
                # The archive yielded nothing although data should remain;
                # stop instead of spinning forever on a truncated archive.
                break
        if num_bytes is None:
            num_bytes = len(self._buffer)
        return self._buffer.read(num_bytes)

    def seek(self, pos, whence=SEEK_SET):
        """Move the read offset, following the ``io`` seek conventions.

        NOTE(review): bytes already staged in the internal buffer are not
        discarded here; confirm callers only seek between fully drained reads.

        Args:
            pos: Offset relative to the position selected by ``whence``.
            whence: ``SEEK_SET`` (start of file), ``SEEK_CUR`` (current
                offset) or ``SEEK_END`` (end of file; ``pos`` is usually
                zero or negative).

        Returns:
            The new absolute offset.

        Raises:
            ValueError: If ``whence`` is not one of the three constants.
        """
        if whence == SEEK_SET:
            self.pos = pos
        elif whence == SEEK_CUR:
            self.pos += pos
        elif whence == SEEK_END:
            # Offsets from the end are added, so seek(-n, SEEK_END) lands
            # n bytes before the end of the file.
            self.pos = self.finfo.size + pos
        else:
            raise ValueError(f"invalid whence ({whence!r}, should be 0, 1 or 2)")
        return self.pos

    def tell(self):
        """Return the current offset within the member file."""
        return self.pos

    def __getattr__(self, name):
        """
        Proxy any attribute that is not found through normal lookup to the
        internal buffer. Behavior is undefined if other file methods are
        attempted through this proxy.
        """
        if name == "_buffer":
            # The buffer itself is missing (e.g. __init__ has not run);
            # looking it up on self again would recurse without bound.
            raise AttributeError(name)
        return getattr(self._buffer, name)
class TarSubdirStream(BytesIO):
    """Streams a subdirectory from an indexed archive file stored on Blob Storage, as its own tar archive.

    The general idea is that on initialization, this class will construct an iterator
    "descendants" that yields all files within the specified subdirectory in the tar file.
    Whenever .read() is called on this class, it will partially construct a tar file
    with the headers and contents of each descendant, up to the specified number of bytes,
    and return those bytes.

    Inspired by https://gist.github.com/chipx86/9598b1e4a9a1a7831054.
    """

    current_desc: "CurrentDescendant"

    def __init__(self, path: str):
        """Initialize TarSubdirStream.

        Args:
            path (str): Specified path of the subdirectory on Blob Storage. Must refer to a subdirectory path within a .tar.gz file.
        """
        from codalab.worker.file_util import OpenIndexedArchiveFile
        from codalab.worker.download_util import compute_target_info_blob_descendants_flat

        self.linked_bundle_path = parse_linked_bundle_url(path)

        # We add OpenIndexedArchiveFile to self._stack so that the context manager remains open and is exited
        # only in the method self.close(). The rest of the setup runs inside the "with" block so that the
        # archive is released again if any of it fails; ownership is transferred to self._stack at the very end.
        with ExitStack() as stack:
            self.tf = stack.enter_context(
                OpenIndexedArchiveFile(self.linked_bundle_path.bundle_path))

            # Keep track of descendants of the specified subdirectory and the current descendant
            self.descendants = compute_target_info_blob_descendants_flat(path)
            self.current_desc = CurrentDescendant(desc=None,
                                                  pos=0,
                                                  finfo=EmptyFileInfo,
                                                  tinfo=tarfile.TarInfo())

            # Buffer that stores the underlying bytes of the output tar archive
            self._buffer = BytesBuffer()

            # Output tar archive
            self.output = tarfile.open(fileobj=self._buffer, mode="w:")

            self._stack = stack.pop_all()

    def _read_from_tar(self, num_bytes=None) -> None:
        """Read the specified number of bytes from the tar file
        associated with the given subdirectory.

        Based on where we currently are within the subdirectory's descendants,
        either read the next descendant's header or its contents, or finish
        the current descendant by padding its data to a full tar block.

        Raises:
            StopIteration: When there is no further descendant to start.
        """
        if self.current_desc.desc is None:
            # Advance to the next descendant and read its header.
            member = next(self.descendants)

            # TODO (Ashwin): Make sure this works with symlinks, too (it should work, but add a test to ensure it).
            full_name = f"{self.linked_bundle_path.archive_subpath}/{member['name']}"
            member_finfo = cast(FileInfo, self.tf.getFileInfo("/" + full_name))
            member_tarinfo = tarfile.TarInfo(
                name="./" + member['name'] if member['name'] else '.')
            for attr in ("size", "mtime", "mode", "linkname", "uid", "gid"):
                setattr(member_tarinfo, attr, getattr(member_finfo, attr))
            # ratarmount's FileInfo does not have a type attribute, so we have
            # to manually construct it from the mode.
            if stat.S_ISDIR(member_finfo.mode):
                member_tarinfo.type = tarfile.DIRTYPE
            elif stat.S_ISLNK(member_finfo.mode):
                member_tarinfo.type = tarfile.SYMTYPE
            else:
                member_tarinfo.type = tarfile.REGTYPE

            # finfo is a ratarmount-specific data structure, while tinfo is a tarfile-specific data structure.
            # We need to store the former in order to read from the file with ratarmount and the latter in order to
            # construct the output tar archive.
            self.current_desc.desc = member
            self.current_desc.finfo = member_finfo
            self.current_desc.tinfo = member_tarinfo
            # Writes only the header block; the contents are streamed in by the branch below.
            self.output.addfile(member_tarinfo)
        elif self.current_desc.pos < self.current_desc.finfo.size:
            # Read the contents of the current descendant, never requesting
            # more than what is left after the current offset.
            remaining = self.current_desc.finfo.size - self.current_desc.pos
            chunk = self.tf.read(
                fileInfo=self.current_desc.finfo,
                size=remaining if num_bytes is None else min(remaining, num_bytes),
                offset=self.current_desc.pos,
            )
            assert self.output.fileobj is not None
            self.output.fileobj.write(chunk)
            self.current_desc.pos += len(chunk)
            # We're ignoring types here because the TarFile.offset type is missing.
            # TODO: Remove "# type: ignore" annotations once this PR is merged: https://github.com/python/typeshed/pull/5210
            self.output.offset += len(chunk)  # type: ignore
        else:
            # We've finished reading the entire current descendant.
            # Write the remainder of the block, if needed, and then reset the descendant so it is empty.
            if self.current_desc.pos > 0:
                # This code for writing the remainder of the block is taken from
                # https://github.com/python/cpython/blob/9d2c2a8e3b8fe18ee1568bfa4a419847b3e78575/Lib/tarfile.py#L2008-L2012.
                blocks, remainder = divmod(self.current_desc.tinfo.size,
                                           tarfile.BLOCKSIZE)
                if remainder > 0:
                    assert self.output.fileobj is not None
                    self.output.fileobj.write(tarfile.NUL *
                                              (tarfile.BLOCKSIZE - remainder))
                    blocks += 1
                self.output.offset += blocks * tarfile.BLOCKSIZE  # type: ignore
            self.current_desc = CurrentDescendant(
                desc=None,
                pos=0,
                finfo=EmptyFileInfo,
                tinfo=tarfile.TarInfo(),
            )

    def read(self, num_bytes=None):
        """Read the specified number of bytes from the tar version of the associated subdirectory.

        Args:
            num_bytes: Maximum number of bytes to return. ``None`` or a
                negative value (the ``io`` convention) builds and returns
                the whole remaining archive.

        Returns:
            The next bytes of the output tar archive.
        """
        if num_bytes is not None and num_bytes < 0:
            num_bytes = None
        while num_bytes is None or len(self._buffer) < num_bytes:
            try:
                self._read_from_tar(num_bytes)
            except StopIteration:
                # The next(self.descendants) function has failed, so we've gone through all
                # descendants and have finished going through the file.
                # NOTE(review): self.output is never closed, so no end-of-archive
                # marker is emitted; confirm that consumers do not require one.
                self.close()
                break
        if num_bytes is None:
            num_bytes = len(self._buffer)
        return self._buffer.read(num_bytes)

    def close(self):
        """Release the indexed archive that was opened in __init__.

        Safe to call more than once; the exit stack is empty after the first call.
        """
        # Close the OpenIndexedArchiveFile context manager that was initialized in __init__.
        # ExitStack.close() exits it with no exception in flight. The previous
        # __exit__(self, None, None) call passed this stream as the exception type.
        self._stack.close()

    def __getattr__(self, name):
        """
        Proxy any attribute that is not found through normal lookup to the
        internal buffer. Behavior is undefined if other file methods are
        attempted through this proxy.
        """
        if name == "_buffer":
            # The buffer itself is missing (e.g. __init__ failed early);
            # looking it up on self again would recurse without bound.
            raise AttributeError(name)
        return getattr(self._buffer, name)
Example #4
0
 def __init__(self, fileobj: IO[bytes]):
     """Set up the gzip pipeline around ``fileobj``.

     Stores the input file object, creates the buffer that receives the
     compressed output, and attaches a gzip writer to that buffer.

     Args:
         fileobj: Binary file object supplying the uncompressed data.
     """
     self.__input = fileobj
     # The same buffer object is both kept on the instance and handed to the
     # gzip writer as its output target.
     staging = BytesBuffer()
     self.__buffer = staging
     self.__gzip = gzip.GzipFile(filename=None, mode='wb', fileobj=staging)