Ejemplo n.º 1
0
def gzopen_without_timestamps(name, mode="r", fileobj=None, **kwargs):
    """Open a gzip-compressed tar archive whose gzip header carries no timestamp.

    The gzip stream is created with ``mtime=0`` so that ``time.time()`` is not
    written into the gzip header, which would make the checksum of otherwise
    identical archives change. The regular tarfile gzip opener cannot be used
    for this because it does not pass arguments on to the ``GzipFile``
    constructor.

    The compression level is taken from the ``CONAN_COMPRESSION_LEVEL``
    environment variable and falls back to 9.

    :param name: archive name, handed to both ``GzipFile`` and ``TarFile``.
    :param mode: ``"r"`` or ``"w"``.
    :param fileobj: optional file object to read from or write to.
    :param kwargs: extra keyword arguments forwarded to ``TarFile.taropen``.
    :returns: the opened ``tarfile.TarFile``.
    :raises ValueError: if ``mode`` is neither ``"r"`` nor ``"w"``.
    :raises tarfile.ReadError: if reading fails with an ``OSError``/``IOError``.
    """
    level = int(os.environ.get("CONAN_COMPRESSION_LEVEL", 9))

    reading = mode == "r"
    if not reading and mode != "w":
        raise ValueError("mode must be 'r' or 'w'")

    try:
        gz_stream = gzip.GzipFile(name, mode, level, fileobj, mtime=0)
    except OSError:
        if reading and fileobj is not None:
            raise tarfile.ReadError("not a gzip file")
        raise

    try:
        # The format is pinned because Python 3.8 changed the default, which
        # yields different tarfiles and therefore different tgz checksums.
        archive = tarfile.TarFile.taropen(name, mode, gz_stream,
                                          format=tarfile.GNU_FORMAT, **kwargs)
    except Exception as error:
        # Whatever went wrong, the gzip stream must not stay open.
        gz_stream.close()
        if isinstance(error, IOError) and reading:
            raise tarfile.ReadError("not a gzip file")
        raise
    # The TarFile now owns the gzip stream and closes it on close().
    archive._extfileobj = False
    return archive
Ejemplo n.º 2
0
 def check_tarfile(self):
     '''
     Verify that ``self.tar_archive`` exists and is a valid tar file.

     Raises:
         tarfile.ReadError: if the path does not exist or is not a tar archive.
     '''
     try:
         looks_like_tar = tarfile.is_tarfile(self.tar_archive)
     except FileNotFoundError:
         # the path does not exist at all
         raise tarfile.ReadError(self.tar_archive + ' is not found')
     if not looks_like_tar:
         # the path exists but its content is not a tar archive
         raise tarfile.ReadError(self.tar_archive +
                                 ' is not a tar archive')
Ejemplo n.º 3
0
        def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
            """Open an lzma compressed tar archive for reading or writing.

            The archive is wrapped in an ``lzma.LZMAFile`` (built from
            ``fileobj`` when given, otherwise from ``name``) and handed to
            ``cls.taropen``. If that fails the lzma stream is closed again.

            Attention:
                Appending is not allowed.

            Note:
               Backported from `Python 3.6
               <https://github.com/python/cpython/blob/3.6/Lib/tarfile.py>`_

            Raises:
                ValueError: if ``mode`` is not ``'r'``, ``'w'`` or ``'x'``.
                tarfile.ReadError: if reading fails with ``lzma.LZMAError``
                    or ``EOFError``.
            """
            allowed_modes = ("r", "w", "x")
            if mode not in allowed_modes:
                raise ValueError("mode must be 'r', 'w' or 'x'")

            xz_stream = lzma.LZMAFile(fileobj or name, mode, preset=preset)

            try:
                archive = cls.taropen(name, mode, xz_stream, **kwargs)
            except BaseException as error:
                # Never leave the lzma stream open behind a failed open.
                xz_stream.close()
                if mode == 'r' and isinstance(error, (lzma.LZMAError, EOFError)):
                    raise tarfile.ReadError("not an lzma file")
                raise
            # The TarFile owns the lzma stream from here on.
            archive._extfileobj = False
            return archive
Ejemplo n.º 4
0
    def extract(self, path: pathlib.Path, data_dir: pathlib.Path, file_list_file: pathlib.Path) -> None:
        """Extract the tar archive and record which files it contains.

        A JSON document describing every archive member (its tar type and, for
        regular files, its size) is written to ``file_list_file`` before the
        archive is unpacked into ``data_dir``.

        :param path: Path to the tar archive.
        :param data_dir: Path to the data dir to extract data files to.
        :param file_list_file: Path to the file that stores the list of files in the downloaded dataset.
        :raises tarfile.ReadError: The tar archive was unable to be read.
        """
        try:
            archive = tarfile.open(path)
        except tarfile.ReadError as e:
            raise tarfile.ReadError(f'Failed to unarchive tar file "{path}"\ncaused by:\n{e}')

        with archive:
            FileListFileContents = Dict[str, Dict[str, int]]
            contents: FileListFileContents = {}
            for entry in archive.getmembers():
                record = {'type': int(entry.type)}
                if entry.isreg():
                    # Only regular files carry a meaningful size
                    record['size'] = entry.size
                contents[entry.name] = record

            metadata: Dict[str, Union[str, FileListFileContents]] = {
                'type': 'application/x-tar',
                'contents': contents,
            }

            # The encoding is deliberately left unspecified so that the OS default is used, which is most likely
            # also the encoding used for accessing the filesystem.
            with open(file_list_file, mode='w') as listing:
                json.dump(metadata, listing, indent=2)
            archive.extractall(path=data_dir)
Ejemplo n.º 5
0
    def lzmaopen(cls,
                 name=None,
                 mode="r",
                 fileobj=None,
                 compressformat=lzma.FORMAT_XZ,
                 compresslevel=9,
                 **kwargs):
        """Open lzma/xz compressed tar archive name for reading or writing.
           Appending is not allowed.

        When ``fileobj`` is given it is wrapped in ``_LZMAProxy``; otherwise
        ``name`` is opened with ``lzma.LZMAFile``. ``compressformat`` selects
        the container format of that file. ``compresslevel`` is used as the
        lzma preset and only applies when writing, because ``LZMAFile``
        rejects a preset when opened for reading.

        :param name: path of the archive.
        :param mode: ``"r"`` or ``"w"``.
        :param fileobj: optional already-open file object.
        :param compressformat: ``lzma.FORMAT_*`` constant used for ``name``.
        :param compresslevel: lzma preset (0-9) used when writing ``name``.
        :param kwargs: forwarded to ``cls.taropen``.
        :raises ValueError: if ``mode`` is neither ``"r"`` nor ``"w"``.
        :raises tarfile.CompressionError: if the lzma module is missing.
        :raises tarfile.ReadError: if opening the tar stream fails with an
            ``IOError``.
        """

        # A tuple test also rejects the empty string, which the former
        # substring test (``mode not in "rw"``) let through.
        if mode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'.")

        try:
            import lzma
        except ImportError:
            raise tarfile.CompressionError("lzma module is not available")

        # Only a stream opened here is closed on failure; a caller-supplied
        # file object is left for the caller to deal with.
        owns_stream = fileobj is None
        if not owns_stream:
            fileobj = _LZMAProxy(fileobj, mode)
        elif mode == "w":
            fileobj = lzma.LZMAFile(name, mode,
                                    format=compressformat,
                                    preset=compresslevel)
        else:
            fileobj = lzma.LZMAFile(name, mode, format=compressformat)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except BaseException as error:
            if owns_stream:
                fileobj.close()
            if isinstance(error, IOError):
                raise tarfile.ReadError("not a lzma file")
            raise
        t._extfileobj = False
        return t
Ejemplo n.º 6
0
    def decompress(self):
        """Unpack the downloaded archive, keep the database file and clean up.

        The working directory is switched to ``self.__path``, the archive
        ``self.__file`` is extracted there, ``self.__db`` is copied out of the
        archive's first listed entry (its root folder) into ``self.__path``,
        and finally the extracted folder and the archive itself are deleted.

        Raises:
            FileNotFoundError: if a required file or directory is missing.
            tarfile.ReadError: if the archive cannot be read.
        """
        try:
            # Everything below works relative to the target directory
            os.chdir(self.__path)

            # Unpack the archive (GeoLite2-City.tar.gz) to get at the database
            with tarfile.open(self.__file, 'r') as archive:
                archive.extractall(self.__path)

                # The first listed name is the root folder (GeoLite2-City_aaaammdd)
                root_name = archive.getnames()[0]

                # Step into the root folder so the database can be found by name
                extracted_root = os.path.join(self.__path, root_name)
                os.chdir(extracted_root)

                # Copy the database to the assets directory, then step back out
                shutil.copy(self.__db, self.__path)
                os.chdir(self.__path)

                # The extracted folder (GeoLite2-City_aaaammdd) is no longer needed
                shutil.rmtree(root_name)
            archive.close()

            os.remove(self.__file)
        except FileNotFoundError:
            raise FileNotFoundError(
                "El sistema no puede encontrar el archivo especificado")
        except tarfile.ReadError:
            raise tarfile.ReadError(
                "El archivo no se pudo abrir correctamente")
Ejemplo n.º 7
0
    def __init__(self, name=None, mode='r', mtime=None):
        """Open ``name`` as a tar archive, converting a zip archive if needed.

        When ``mode`` does not start with ``'r'``, or ``name`` already is a
        tar file, it is opened directly. Otherwise ``name`` must be a zip
        archive: its content is extracted into a temporary directory, packed
        into a temporary tar file, and that tar file is opened instead. The
        temporary directory and file are removed again before returning.

        :param name: path of the archive.
        :param mode: mode passed to ``tarfile.TarFile``.
        :param mtime: timestamp stored in ``self.mtime``; ``None`` means
            "now" (``time.time()``). A value of ``0`` is kept as given.
        :raises tarfile.ReadError: if ``name`` is read but is neither a tar
            nor a zip archive.
        """
        if not mode.startswith('r') or tarfile.is_tarfile(name):
            self.__tar = tarfile.TarFile(name=name, mode=mode)

        else:
            # convert for tar

            if not zipfile.is_zipfile(name):
                raise tarfile.ReadError()

            # Each temporary resource gets its own try/finally so cleanup
            # never touches a name that was not created yet.
            tmp_dir = tempfile.mkdtemp()
            try:
                tmp_fd, tmp_name = tempfile.mkstemp()
                try:
                    # tarfile writes bytes, so the file must be opened in
                    # binary mode; leaving the ``with`` block flushes and
                    # closes it before the tar is read back by name.
                    with os.fdopen(tmp_fd, 'wb') as tmp_fo:
                        with zipfile.ZipFile(name) as archive:
                            archive.extractall(tmp_dir)

                        tar = tarfile.TarFile(fileobj=tmp_fo, mode='w')
                        try:
                            tar.add(tmp_dir, arcname='')
                        finally:
                            tar.close()

                    self.__tar = tarfile.TarFile(name=tmp_name, mode=mode)
                finally:
                    os.unlink(tmp_name)
            finally:
                shutil.rmtree(tmp_dir)

        # ``is None`` instead of truthiness so an explicit mtime of 0 is kept.
        self.mtime = time.time() if mtime is None else mtime
Ejemplo n.º 8
0
    def zstdopen(cls,
                 name,
                 mode="r",
                 fileobj=None,
                 cctx=None,
                 dctx=None,
                 **kwargs):  # type: ignore
        """Open a zstd compressed tar archive for reading.

        Only reading is supported. The whole compressed stream (taken from
        ``fileobj`` when given, otherwise from ``name``) is decompressed into
        memory and the tar archive is opened on that in-memory buffer.

        :param name: archive name, also passed to ``cls.taropen``.
        :param mode: must be ``"r"``.
        :param fileobj: optional file object holding the compressed data.
        :param cctx: forwarded to ``zstandard.open``.
        :param dctx: forwarded to ``zstandard.open``.
        :param kwargs: forwarded to ``cls.taropen``.
        :raises ValueError: if ``mode`` is anything other than ``"r"``.
        :raises tarfile.ReadError: if decompression fails with
            ``zstandard.ZstdError`` or ``EOFError``.
        """
        # ``("r")`` is just the string "r", so the former ``not in`` test was
        # a substring test that accepted the empty string. Compare directly.
        if mode != "r":
            raise ValueError("mode must be 'r'")

        try:
            zobj = zstandard.open(fileobj or name,
                                  mode + "b",
                                  cctx=cctx,
                                  dctx=dctx)
            with zobj:
                data = zobj.read()
        except (zstandard.ZstdError, EOFError) as e:
            raise tarfile.ReadError("not a zstd file") from e

        fileobj = io.BytesIO(data)
        t = cls.taropen(name, mode, fileobj, **kwargs)
        # The in-memory buffer belongs to the TarFile and is closed with it.
        t._extfileobj = False
        return t
Ejemplo n.º 9
0
    def zstdopen(cls, name, mode="r", fileobj=None, level=9, **kwargs):
        """
        Open a zstd compressed tar archive for reading or writing.
           Appending is not allowed.

        The archive is wrapped in a ``ZstdFile`` (built from ``fileobj`` when
        given, otherwise from ``name``) and handed to ``cls.taropen``. If that
        fails the zstd stream is closed again.

        Raises:
            ValueError: if ``mode`` is not ``'r'``, ``'w'`` or ``'x'``.
            tarfile.ReadError: if reading fails with ``OSError``/``EOFError``.
        """
        allowed_modes = ("r", "w", "x")
        if mode not in allowed_modes:
            raise ValueError("mode must be 'r', 'w' or 'x'")

        zstd_stream = ZstdFile(fileobj or name, mode, level=level)

        try:
            archive = cls.taropen(name, mode, zstd_stream, **kwargs)
        except BaseException as error:
            # Never leave the zstd stream open behind a failed open.
            zstd_stream.close()
            if mode == 'r' and isinstance(error, (OSError, EOFError)):
                raise tarfile.ReadError("not a zstd file")
            raise
        # The TarFile owns the zstd stream from here on.
        archive._extfileobj = False
        return archive
Ejemplo n.º 10
0
    def download(self, check: bool = True) -> None:
        """Downloads, extracts, and removes dataset archive. A directory write lock is held during execution.

        The archive named by the schema's ``download_url`` is saved into :attr:`._pydax_dir`, verified against the
        schema's ``sha512sum``, listed into :attr:`._file_list_file` as JSON, extracted into :attr:`._data_dir`, and
        then deleted.

        :param check: When ``True``, :meth:`.is_downloaded` is consulted first and an error is raised if the data
            files are already present in :attr:`._data_dir` (passed in via ``data_dir`` in the constructor
            :class:`Dataset`), which prevents a subsequent download. Set to ``False`` to remove this safeguard;
            subsequent calls to :meth:`.download` will then overwrite data files that were previously downloaded to
            :attr:`._data_dir`.
        :raises RuntimeError: The dataset was previously downloaded as indicated by :meth:`.is_downloaded`
            returning ``True``.
        :raises NotADirectoryError: :attr:`Dataset._data_dir` (passed in via ``data_dir`` in the constructor
            :class:`Dataset`) points to an existing file that is not a directory.
        :raises OSError: The SHA512 checksum of a downloaded dataset doesn't match the expected checksum.
        :raises tarfile.ReadError: The tar archive was unable to be read.
        :raises exceptions.DirectoryLockAcquisitionError: Failed to acquire the directory lock.
        """

        if check and self.is_downloaded():
            cls_name = self.__class__.__name__
            raise RuntimeError(
                f'{cls_name}.download() was previously called. To overwrite existing '
                f'data files, rerun {cls_name}.download() with ``check`` set to '
                f'``False``.')

        url = self._schema['download_url']
        archive_name = pathlib.Path(os.path.basename(url))

        with self._lock.locking_with_exception(write=True):
            archive_fp = self._pydax_dir / archive_name
            payload = requests.get(url, stream=True).content
            archive_fp.write_bytes(payload)

            # Verify what actually landed on disk, not the in-memory payload
            computed_hash = hashlib.sha512(archive_fp.read_bytes()).hexdigest()
            actual_hash = self._schema['sha512sum']
            if actual_hash != computed_hash:
                raise OSError(
                    f'{archive_fp} has a SHA512 checksum of: ({computed_hash}) '
                    f'which is different from the expected SHA512 checksum of: ({actual_hash}) '
                    f'the file may by corrupted.')

            # Supports tar archives only for now
            try:
                archive = tarfile.open(archive_fp)
            except tarfile.ReadError as e:
                raise tarfile.ReadError(
                    f'Failed to unarchive "{archive_fp}"\ncaused by:\n{e}')
            with archive:
                listing = {}
                for entry in archive.getmembers():
                    record = {'type': int(entry.type)}
                    if entry.isreg():
                        # Only regular files carry a meaningful size
                        record['size'] = entry.size
                    listing[entry.name] = record
                # The encoding is deliberately left unspecified so that the OS default is used, which is most
                # likely also the encoding used for accessing the filesystem.
                with open(self._file_list_file, mode='w') as out:
                    json.dump(listing, out, indent=2)
                archive.extractall(path=self._data_dir)

            os.remove(archive_fp)
Ejemplo n.º 11
0
    def test_tape_empty_when_tarfile_ReadError_with_msg_then_drive_is_empty(self, mock_rewind_tape, mock_tarfile_open):
        # Opening the tar fails with the 'empty file' ReadError
        mock_tarfile_open.side_effect = tarfile.ReadError('empty file')

        drive_is_empty = tape_empty("some_drive")

        # The drive is reported as empty and the tape is rewound afterwards
        self.assertTrue(drive_is_empty)
        mock_tarfile_open.assert_called_once()
        mock_rewind_tape.assert_called_once_with("some_drive")
Ejemplo n.º 12
0
    def test_tape_empty_when_tarfile_ReadError_and_unknown_message_then_raise_exception(self, mock_tarfile_open):
        # A ReadError that carries some message other than the expected one
        read_error = tarfile.ReadError()
        read_error.message = 'some other message'
        mock_tarfile_open.side_effect = read_error

        # The error must reach the caller
        self.assertRaises(tarfile.ReadError, tape_empty, "some_drive")

        mock_tarfile_open.assert_called_once()
Ejemplo n.º 13
0
    def open(cls, name=None, mode="r", fileobj=None, **kwargs):
        """Open an lzma compressed tar archive through ``cls.taropen``.

        The archive is wrapped in an ``lzma.LZMAFile`` first. If opening the
        tar stream raises ``lzma.LZMAError`` the lzma stream is closed and a
        ``tarfile.ReadError`` is raised instead.
        """
        lzma_stream = lzma.LZMAFile(name, mode, fileobj=fileobj)

        try:
            archive = cls.taropen(name, mode, lzma_stream, **kwargs)
        except lzma.LZMAError:
            lzma_stream.close()
            raise tarfile.ReadError("not an lzma file")
        else:
            # The TarFile owns the lzma stream from here on.
            archive._extfileobj = False
            return archive
Ejemplo n.º 14
0
    def test_upload_archive_failed(self):
        # A tar whose extract() always fails with ReadError and which yields one member
        broken_archive = MagicMock()
        broken_archive.extract.side_effect = tarfile.ReadError()
        single_member = MagicMock()
        broken_archive.__iter__.return_value = [single_member]

        tmp_dump_dir = tempfile.mkdtemp()
        with self.assertRaises(DumpInvalidException):
            self.uploader.upload_archive(tmp_dump_dir, broken_archive, '/test',
                                         schema.artist_relation_schema, self.uploader.process_json)

        # Nothing may be left behind at the destination path
        self.assertFalse(utils.path_exists('/test'))
Ejemplo n.º 15
0
    def _load(self, tarfilepath):
        """Load metadata and data from the tar archive at ``tarfilepath``.

        Returns a ``(data, type_, provenance, uuid_)`` tuple. The type, uuid
        and provenance come from ``self._load_metadata``; the data is loaded
        by an instance of that type from ``self._data_dir`` below the
        archive's root directory.

        Raises ``tarfile.ReadError`` if ``tarfilepath`` is not a tar archive.
        """
        if not tarfile.is_tarfile(tarfilepath):
            raise tarfile.ReadError("%r is not a readable tar archive file" %
                                    tarfilepath)

        root_dir = self._get_root_dir(tarfilepath)
        with tarfile.open(tarfilepath, mode='r') as archive:
            # Metadata is read from the archive root ...
            type_, uuid_, provenance = self._load_metadata(
                ArtifactDataReader(archive, root_dir))

            # ... while the payload is read from the data directory below it
            payload_dir = os.path.join(root_dir, self._data_dir)
            data = type_().load(ArtifactDataReader(archive, payload_dir))
            return data, type_, provenance, uuid_
Ejemplo n.º 16
0
    def xzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.

        ``backports.lzma`` is preferred; the standard library ``lzma`` module
        is used when the backport is not installed.

        :param name: archive name, used when ``fileobj`` is not given.
        :param mode: ``"r"`` or ``"w"``.
        :param fileobj: optional file object to read from or write to.
        :param compresslevel: lzma preset used when writing.
        :param kwargs: forwarded to ``cls.taropen``.
        :raises ValueError: if ``mode`` is neither ``"r"`` nor ``"w"``.
        :raises tarfile.CompressionError: if no lzma module is available.
        :raises tarfile.ReadError: if the data read is not valid lzma.
        """
        if mode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        try:
            try:
                from backports import lzma
            except ImportError:
                import lzma
            lzma.LZMAFile
        except (ImportError, AttributeError):
            raise tarfile.CompressionError("lzma module is not available")

        reading = mode == 'r'
        try:
            if reading:
                fileobj = lzma.LZMAFile(fileobj or name, mode)
            else:
                # A preset may only be given to LZMAFile when writing.
                fileobj = lzma.LZMAFile(fileobj or name, mode,
                                        preset=compresslevel)
        except (OSError, IOError):
            if reading:
                raise tarfile.ReadError("not an lzma file")
            raise

        if reading:
            # Probe the stream early so bad data is reported as a ReadError.
            # peek() is unavailable on a stream opened for writing, so the
            # probe must be skipped in that case.
            try:
                fileobj.peek()
            except (lzma.LZMAError, EOFError):
                fileobj.close()
                raise tarfile.ReadError("not an lzma file")

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except IOError:
            fileobj.close()
            if reading:
                raise tarfile.ReadError("not an lzma file")
            raise
        except BaseException:
            fileobj.close()
            raise
        t._extfileobj = False
        return t
Ejemplo n.º 17
0
    def extract_all(self):
        """Extract all members of the tar archive that are not present yet.

        The archive ``self.tarfilename`` inside ``self.path`` is opened (with
        up to three attempts, two seconds apart, when a ``ReadError`` occurs).
        Members whose lower-cased name contains ``gff`` are skipped, as are
        members that already exist below ``self.path``; everything else is
        extracted into ``self.path``. The archive is closed afterwards.

        Raises:
            tarfile.ReadError: if the archive cannot be opened after three
                attempts.
        """
        archive_path = os.path.join(self.path, self.tarfilename)

        self.logger.debug("Reading header from %s/%s ...", self.path,
                          self.tarfilename)

        max_attempts = 3
        tfile = None
        # Our little retry loop. Implemented due to speed-related writing errors.
        # TODO Replace / update with "tenacity" module.
        for attempt in range(1, max_attempts + 1):
            try:
                tfile = tarfile.open(archive_path, 'r')
                break
            except tarfile.ReadError as error:
                self.logger.warning(
                    'ReadError encountered when opening tar file.')
                self.logger.warning(error)
                # Only wait when another attempt will actually follow.
                if attempt < max_attempts:
                    self.logger.warning(
                        'Sleeping for 2 seconds and trying again.')
                    time.sleep(2)
        if tfile is None:
            raise tarfile.ReadError(
                'Tar file could not be read after %d attempts: %s'
                % (max_attempts, archive_path))

        # The context manager closes the archive even if extraction fails.
        with tfile:
            members_to_extract = []
            for member in tfile.getmembers():
                if 'gff' in member.name.lower():
                    self.logger.info('Skipping GFF file extraction for %s',
                                     member.name)
                    continue
                if os.path.exists(os.path.join(self.path, member.name)):
                    self.logger.info('%s/%s already exists, not extracting.',
                                     self.path, member.name)
                else:
                    self.logger.info("Extracting (%s->%s/%s)", member.name,
                                     self.path, member.name)
                    members_to_extract.append(member)

            if members_to_extract:
                tfile.extractall(self.path, members_to_extract)
Ejemplo n.º 18
0
    def download(self) -> None:
        """Downloads, extracts, and removes dataset archive. It adds a directory write lock during execution.

        The archive named by the schema's ``download_url`` is saved into :attr:`._pydax_dir`, verified against the
        schema's ``sha512sum``, listed into :attr:`._file_list_file` as JSON, extracted into :attr:`._data_dir`, and
        then deleted.

        :raises NotADirectoryError: :attr:`Dataset._data_dir` (passed in via :meth:`.__init__()`) points to an
                                    existing file that is not a directory.
        :raises OSError: The SHA512 checksum of a downloaded dataset doesn't match the expected checksum.
        :raises tarfile.ReadError: The tar archive was unable to be read.
        :raises exceptions.DirectoryLockAcquisitionError: Failed to acquire the directory lock.
        """
        download_url = self._schema['download_url']
        download_file_name = pathlib.Path(os.path.basename(download_url))

        with self._lock.locking_with_exception(write=True):
            archive_fp = self._pydax_dir / download_file_name
            response = requests.get(download_url, stream=True)
            archive_fp.write_bytes(response.content)

            computed_hash = hashlib.sha512(archive_fp.read_bytes()).hexdigest()
            actual_hash = self._schema['sha512sum']
            if actual_hash != computed_hash:
                # Adjacent f-strings instead of backslash continuations: a
                # continuation inside the literal would embed the source
                # indentation in the message.
                raise OSError(
                    f'{archive_fp} has a SHA512 checksum of: ({computed_hash}) '
                    f'which is different from the expected SHA512 checksum of: ({actual_hash}) '
                    f'the file may be corrupted.')

            # Supports tar archives only for now
            try:
                tar = tarfile.open(archive_fp)
            except tarfile.ReadError as e:
                raise tarfile.ReadError(
                    f'Failed to unarchive "{archive_fp}"\ncaused by:\n{e}')
            with tar:
                members = {}
                for member in tar.getmembers():
                    members[member.name] = {'type': int(member.type)}
                    if member.isreg():  # For regular files, we also save its size
                        members[member.name]['size'] = member.size
                with open(self._file_list_file, mode='w') as f:
                    # We do not specify 'utf-8' here to match the default encoding used by the OS, which also likely
                    # uses this encoding for accessing the filesystem.
                    json.dump(members, f, indent=2)
                tar.extractall(path=self._data_dir)

            os.remove(archive_fp)
Ejemplo n.º 19
0
    def lzmaopen(cls,
                 name=None,
                 mode="r",
                 fileobj=None,
                 compresslevel=None,
                 **kwargs):
        """Open lzma/xz compressed tar archive name for reading or writing.
           Appending is not allowed.

        The standard ``lzma`` module is preferred, with ``backports.lzma`` as
        fallback. A missing ``compresslevel`` is taken from the build
        configuration. A given ``fileobj`` is wrapped in ``_LZMAProxy``;
        otherwise ``name`` is opened with ``LZMAFile``, using
        ``compresslevel`` as preset when writing.
        """

        try:
            import lzma
        except ImportError:
            try:
                from backports import lzma
            except ImportError:
                raise tarfile.CompressionError("Lzma module is not available")

        compresslevel = compresslevel or ctx.config.values.build.compressionlevel

        if not (len(mode) <= 1 and mode in "rw"):
            raise ValueError("mode must be 'r' or 'w'.")

        if fileobj is not None:
            # A caller-supplied file object is wrapped the same way in both modes
            stream = _LZMAProxy(fileobj, mode)
        elif 'w' in mode:
            stream = lzma.LZMAFile(name, mode, preset=compresslevel)
        else:
            stream = lzma.LZMAFile(name, mode)

        try:
            archive = cls.taropen(name, mode, stream, **kwargs)
        except IOError:
            raise tarfile.ReadError(
                _(" \"{}\" is not a lzma file.").format(name))
        archive._extfileobj = False
        return archive
Ejemplo n.º 20
0
    def _open(self):
        """ Function to open the tarfile for writing

            The archive ``self.arcname + '.tar.gz'`` is opened gzip-compressed
            and stored in ``self.arcfile``. Only the ``tgz`` extension
            (optionally with a leading dot, case-insensitive) is accepted.

            Raises:
                ReadError: Is raised when the output file cannot be created
                    for writing (any error from ``tarfile.open``).
                ValueError: Is raised when the given compression type is not supported
        """
        # Raw string: in a normal string literal '\.' is an invalid escape
        # sequence that newer Python versions warn about at compile time.
        if re.match(r'^\.?tgz$', self.ext, re.IGNORECASE):
            try:
                self.arcfile = tarfile.open(self.arcname + '.tar.gz', 'w:gz')
            except Exception as e:
                logger.exception('[' + self.arcname + \
                                 '] Unable to create output file for writing ' + str(e))
                raise tarfile.ReadError(
                    'Unable to create output file for writing')
        else:
            logger.error('[' + self.arcname + '] Invalid compression type ' +
                         self.ext)
            raise ValueError('Invalid compression type ' + self.ext)
Ejemplo n.º 21
0
 def test_download_results_corrupted_compression_readerror(
         self, get_mock, tarfile_mock):
     # Fake HTTP response whose attributes are PropertyMocks, so that each
     # access can be counted afterwards.
     response = Mock()
     response_type = type(response)

     status_code_mock = PropertyMock(return_value=200)
     response_type.status_code = status_code_mock
     content_mock = PropertyMock(return_value=self.tarfile.read())
     response_type.content = content_mock
     response_type.url = PropertyMock(
         return_value="http://foo.bar.com/file.tar.xz")
     response_type.headers = PropertyMock(
         return_value={"Content-Type": "application/x-tar"})

     get_mock.return_value = response
     # Opening the downloaded archive fails with a ReadError
     tarfile_mock.side_effect = tarfile.ReadError()

     results = self.plugin._download_results(RESULT_DICT)

     status_code_mock.assert_called_once_with()
     self.assertEqual(content_mock.call_count, 2)
     self.assertEqual(self.plugin.tradefed_results_url, RESULT_URL)
     # None of the result fields may be populated
     self.assertIsNone(results.test_results)
     self.assertIsNone(results.tradefed_stdout)
     self.assertIsNone(results.tradefed_logcat)
Ejemplo n.º 22
0
    def callback(self, ch, method, properties, body):
        """Copy the file named in a queue message with rsync and process it.

        ``body`` is decoded as a UTF-8 path and copied into ``../src-logs/``
        with rsync. If the copy is a tar file, every member is extracted into
        ``../src-logs/`` and passed to ``self.split_file`` before the message
        is acknowledged. The copied file is removed afterwards. A failed copy
        rejects the message.

        If an ``OSError``, ``pika.exceptions.ConnectionClosed`` or
        ``zlib.error`` occurs while a member is being handled, an empty
        placeholder replaces that member, it is passed to ``self.split_file``
        and the message is acknowledged. If no member had been reached yet,
        the error is re-raised.

        :raises tarfile.ReadError: the copied archive could not be read.
        :raises tarfile.TarError: any other tar-related failure.
        """
        file_path = body.decode('utf-8')
        rsync_statement = "rsync " + file_path + " ../src-logs/"
        print(rsync_statement)
        # Pre-bound so the error handler below can tell how far we got.
        new_file_path = None
        tarinfo = None
        try:
            # The path comes from a message, so it is passed as a single
            # argument without a shell; "--" stops it from being read as an
            # rsync option.
            try:
                code = subprocess.call(["rsync", "--", file_path, "../src-logs/"])
            except OSError as launch_err:
                # rsync could not be started at all: treat as a failed copy.
                print(launch_err)
                code = None
            if code != 0:
                print("cann't async to file_path")
                ch.basic_reject(delivery_tag=method.delivery_tag)
                #ch.basic_ack(delivery_tag=method.delivery_tag)
                return

            file_name = file_path.split('/')[-1]
            new_file_path = "../src-logs/" + file_name
            try:
                if tarfile.is_tarfile(new_file_path):
                    with tarfile.open(new_file_path, "r:gz") as tar_file:
                        for tarinfo in tar_file:
                            print(tarinfo.name, " is ", tarinfo.size, " bytes")
                            tar_file.extract(tarinfo, path='../src-logs/')
                            self.split_file("../src-logs/" + tarinfo.name)

                    ch.basic_ack(delivery_tag=method.delivery_tag)
                os.remove(new_file_path)
            # ReadError is a subclass of TarError, so it has to be handled
            # first to be reachable.
            except tarfile.ReadError:
                print("read tarfile error")
                raise (tarfile.ReadError("read tarfile error"))
            except tarfile.TarError:
                print("tarfile error")
                raise (tarfile.TarError('tarfile tar error'))
        except (OSError, pika.exceptions.ConnectionClosed, zlib.error) as err:
            #ch.basic_reject(delivery_tag=method.delivery_tag)
            if tarinfo is None:
                # No member was being processed, so there is nothing to
                # replace with a placeholder.
                raise
            if os.path.exists(new_file_path):
                os.remove(new_file_path)
            # Replace the member that failed with an empty placeholder file.
            with open("../src-logs/" + tarinfo.name, 'w'):
                pass
            print(err)
            self.split_file("../src-logs/" + tarinfo.name)
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
Ejemplo n.º 23
0
    def __next__(self):
        """Return the next archive member, or None if no header could be read.

        A copy and modification of the next() method in tarfile module. The
        modification: an invalid header raises ``tarfile.ReadError`` when
        ``self.errorlevel`` is greater than 0 (and ``ignore_zeros`` is off).

        Every member that is read is appended to ``self.members``; when no
        member is read, ``self._loaded`` is set to True.

        Raises:
            tarfile.ReadError: on unexpected end of data, an empty file, a
                truncated first header, a bad subsequent header, or (see
                above) an invalid header.

        The copy is from tarfile.py of CPython @102457:95df96aa2f5a

        # Copyright (C) 2002 Lars Gustäbel <*****@*****.**>
        # All rights reserved.
        #
        # Permission  is  hereby granted,  free  of charge,  to  any person
        # obtaining a  copy of  this software  and associated documentation
        # files  (the  "Software"),  to   deal  in  the  Software   without
        # restriction,  including  without limitation  the  rights to  use,
        # copy, modify, merge, publish, distribute, sublicense, and/or sell
        # copies  of  the  Software,  and to  permit  persons  to  whom the
        # Software  is  furnished  to  do  so,  subject  to  the  following
        # conditions:
        #
        # The above copyright  notice and this  permission notice shall  be
        # included in all copies or substantial portions of the Software.
        """
        # NOTE(review): presumably verifies the archive is open in a mode
        # that allows reading ("r" or "a"), as in CPython - confirm.
        self._check("ra")
        # A first member that was stored earlier is handed out exactly once.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            # Seek to the byte just before the target offset and read it, so
            # that an offset beyond the end of the data is detected.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise tarfile.ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        # The loop only repeats via ``continue`` (skipping one block when
        # ignore_zeros is set). Every other path either raises or falls
        # through to the ``break`` at the bottom.
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except tarfile.EOFHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += tarfile.BLOCKSIZE
                    continue
                # Otherwise: fall through to ``break`` with tarinfo == None.
            except tarfile.InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += tarfile.BLOCKSIZE
                    continue
                # Modify here, to raise exceptions if errorlevel is bigger than 0.
                elif self.errorlevel > 0:
                    raise tarfile.ReadError(str(e))
                # With errorlevel <= 0 the invalid header is not reported
                # and iteration ends (tarinfo stays None).
            except tarfile.EmptyHeaderError:
                # Only an error at the very start of the archive.
                if self.offset == 0:
                    raise tarfile.ReadError("empty file")
            except tarfile.TruncatedHeaderError as e:
                # Only an error at the very start of the archive.
                if self.offset == 0:
                    raise tarfile.ReadError(str(e))
            except tarfile.SubsequentHeaderError as e:
                # Always an error, regardless of the offset.
                raise tarfile.ReadError(str(e))
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # Nothing more could be read: mark the member list as complete.
            self._loaded = True

        return tarinfo
Ejemplo n.º 24
0
def tariterator(fileobj,
                check_sorted=False,
                keys=base_plus_ext,
                decode=True,
                source=None,
                lcase=True,
                filename=None):
    """Iterate over samples from a tar archive read as a stream.

    Tar archives are assumed to be sorted by file name. Consecutive regular
    files that share the same prefix (as computed by ``keys``) are grouped
    into one sample: a dictionary with the extension as key and the file
    contents as value, plus ``__key__`` and ``__source__`` entries. A sample
    is yielded once the prefix changes, and the last one at the end of the
    archive, provided ``valid_sample`` accepts it and decoding does not
    return ``None``.

    :param fileobj: file object holding the tar archive
    :param bool check_sorted: verify that file names are sorted
    :param keys: function splitting a file name into (prefix, suffix)
    :param decode: ``True`` for automatic decoding, ``False`` for none, or a
        string that is passed to ``utils.autodecoder``; any other value is
        used as given
    :param source: value stored under ``__source__`` in every sample
    :param bool lcase: lower-case the extension before using it as key
    :param filename: name used in messages; defaults to the ``pipe_cmd`` or
        ``name`` attribute of ``fileobj``
    :returns: iterator over samples
    :raises tarfile.ReadError: if the archive cannot be opened
    :raises ValueError: if ``check_sorted`` is set and the keys are not in
        strictly increasing order

    """
    if filename is None:
        filename = getattr(fileobj, "pipe_cmd", getattr(fileobj, "name", "?"))
    if decode is True:
        decode = utils.autodecode
    elif decode is False:
        decode = trivial_decode
    elif isinstance(decode, str):
        decode = utils.autodecoder(decode)
    current_count = 0
    current_prefix = None
    current_sample = None
    try:
        stream = tarfile.open(fileobj=fileobj, mode="r|*")
    except tarfile.ReadError:
        info = getattr(fileobj, "pipe_cmd", getattr(fileobj, "name", "?"))
        raise tarfile.ReadError("{}: empty file".format(info))
    try:
        for tarinfo in stream:
            if not tarinfo.isreg():
                continue
            fname = tarinfo.name
            if fname is None:
                warnings.warn("tarinfo.name is None")
                continue
            prefix, suffix = keys(fname)
            if prefix is None:
                warnings.warn("prefix is None for: %s" % (tarinfo.name, ))
                continue
            if prefix != current_prefix:
                # There is nothing to compare the very first prefix with;
                # comparing a string with None would raise a TypeError.
                if (check_sorted and current_prefix is not None
                        and prefix <= current_prefix):
                    raise ValueError("[%s] -> [%s]: tar file does not contain sorted keys (%s)" % \
                                     (current_prefix, prefix, filename))
                if valid_sample(current_sample):
                    decoded = maybe_decode(current_sample,
                                           decode,
                                           current_count,
                                           info="file {}".format(filename))
                    if decoded is not None:
                        yield decoded
                current_prefix = prefix
                current_sample = dict(__key__=prefix, __source__=source)
            try:
                data = stream.extractfile(tarinfo).read()
            except tarfile.ReadError as e:
                print("tarfile.ReadError at", current_count)
                print("file:", tarinfo.name)
                print("source:", filename)
                print(e)
                # Keep going, but flag the sample as damaged.
                current_sample["__bad__"] = True
            else:
                if lcase:
                    suffix = suffix.lower()
                current_sample[suffix] = data
                current_count += 1
        if valid_sample(current_sample):
            decoded = maybe_decode(current_sample, decode, current_count)
            if decoded is not None:
                yield decoded
    finally:
        # Release the tar stream even when the consumer stops iterating early
        # or an error is raised.
        stream.close()
Ejemplo n.º 25
0
    def test_check_resource_invalid_archive(self, tarfile_open):
        # The resource exists, but opening it as a tar archive fails
        tarfile_open.side_effect = tarfile.ReadError()
        self.resource_get.return_value = 'targzfile'

        resource_ok = kerberos_keytab_utils.check_resource()

        self.assertFalse(resource_ok)