Beispiel #1
0
    def _resolve_archive(self, filename, subpath=None):
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            zf = zipfile.ZipFile(filename)
            # MacOS is found guilty of adding extra files into the Zip archives
            # it creates. The files are hidden, and in the directory __MACOSX/.
            # We remove those files from the list, since they are not real user
            # files, and have an unknown binary format.
            zff = [name for name in zf.namelist()
                   if not(name.startswith("__MACOSX/") or name.endswith("/"))]
            if subpath:
                if subpath in zff:
                    zff = [subpath]
                else:
                    raise TValueError("File `%s` does not exist in archive "
                                      "`%s`" % (subpath, filename))
            if len(zff) > 1:
                self.logger.warning("Zip file %s contains multiple compressed "
                                    "files: %r. Only the first of them will be "
                                    "used." % (filename, zff))
            if len(zff) == 0:
                raise TValueError("Zip file %s is empty" % filename)
            self._tempdir = tempfile.mkdtemp()
            if self._verbose:
                self.logger.debug("Extracting %s to temporary directory %s"
                                  % (filename, self._tempdir))
            self._tempfiles.append(zf.extract(zff[0], path=self._tempdir))
            self._file = self._tempfiles[-1]

        elif ext == ".gz":
            import gzip
            zf = gzip.GzipFile(filename, mode="rb")
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".bz2":
            import bz2
            zf = bz2.open(filename, mode="rb")
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xz":
            import lzma
            zf = lzma.open(filename, mode="rb")
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xlsx" or ext == ".xls":
            self._result = read_xls_workbook(filename, subpath)

        else:
            self._file = filename
Beispiel #2
0
def _resolve_archive(filename, subpath, tempfiles):
    logger = tempfiles._logger
    ext = os.path.splitext(filename)[1]
    if subpath and subpath[0] in ["/", "\\"]:
        subpath = subpath[1:]

    out_file = None
    out_text = None
    out_result = None
    # TODO: file extarction should be lazy
    if ext == ".zip":
        import zipfile
        with zipfile.ZipFile(filename) as zf:
            # MacOS is found guilty of adding extra files into the Zip archives
            # it creates. The files are hidden, and in the directory __MACOSX/.
            # We remove those files from the list, since they are not real user
            # files, and have an unknown binary format.
            zff = [name for name in zf.namelist()
                   if not(name.startswith("__MACOSX/") or name.endswith("/"))]
            if subpath:
                if subpath in zff:
                    filename = os.path.join(filename, subpath)
                    zff = [subpath]
                else:
                    raise IOError("File `%s` does not exist in archive `%s`"
                                   % (subpath, filename))
            extracted_files = []
            for zf_file in zff:
                if logger:
                    logger.debug("Extracting %s/%s to temporary directory %s"
                                 % (filename, zf_file, tempfiles.tempdir))
                newfile = zf.extract(zf_file, path=tempfiles.tempdir)
                srcname = os.path.join(filename, zf_file)
                tempfiles.add(newfile)
                extracted_files.append(((srcname, newfile, None, None), None))

            if len(extracted_files) == 1:
                out_file = extracted_files[0][0][1]
            else:
                return (None, None, None, None), extracted_files

    elif filename.endswith(".tar.gz") or filename.endswith(".tgz"):
        import tarfile
        zf = tarfile.open(filename, mode="r:gz")
        zff = [entry.name for entry in zf.getmembers() if entry.isfile()]
        if subpath:
            if subpath in zff:
                filename = os.path.join(filename, subpath)
                zff = [subpath]
            else:
                raise IOError("File `%s` does not exist in archive `%s`"
                              % (subpath, filename))
        extracted_files = []
        for entryname in zff:
            if logger:
                logger.debug("Extracting %s/%s to temporary directory %s"
                             % (filename, entryname, tempfiles.tempdir))
            newfile = tempfiles.create_temp_file()
            with zf.extractfile(entryname) as inp, open(newfile, "wb") as out:
                out.write(inp.read())
            srcname = os.path.join(filename, entryname)
            extracted_files.append(((srcname, newfile, None, None), None))
        if len(extracted_files) == 1:
            out_file = extracted_files[0][0][1]
        else:
            return (None, None, None, None), extracted_files

    elif ext == ".gz":
        import gzip
        zf = gzip.GzipFile(filename, mode="rb")
        if logger:
            logger.debug("Extracting %s into memory" % filename)
        out_text = zf.read()
        if logger:
            logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".bz2":
        import bz2
        with bz2.open(filename, mode="rb") as zf:
            if logger:
                logger.debug("Extracting %s into memory" % filename)
            out_text = zf.read()
            if logger:
                logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".xz":
        import lzma
        with lzma.open(filename, mode="rb") as zf:
            if logger:
                logger.debug("Extracting %s into memory" % filename)
            out_text = zf.read()
            if logger:
                logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".xlsx" or ext == ".xls":
        out_result = read_xls_workbook(filename, subpath)
        if subpath:
            filename = os.path.join(filename, subpath)

    elif ext == ".jay":
        out_result = core.open_jay(filename)

    else:
        out_file = filename
    # src, file, fileno, text, result
    return (filename, out_file, None, out_text), out_result