def _resolve_archive(self, filename, subpath=None):
    """
    Unpack `filename` if its extension marks it as an archive.

    The outcome is stored on `self` rather than returned:

    - `.zip`: one member is extracted into a fresh temporary directory
      (stored in `self._tempdir`); the extracted path is appended to
      `self._tempfiles` and stored in `self._file`.
    - `.gz`, `.bz2`, `.xz`: the decompressed bytes are stored in
      `self._text`.
    - `.xlsx`, `.xls`: `self._result` receives the value returned by
      `read_xls_workbook(filename, subpath)`.
    - anything else: `self._file` is set to `filename` unchanged.

    Parameters
    ----------
    filename: str
        Path of the file to inspect.
    subpath: str, optional
        Name of a member inside the archive. A single leading "/" is
        stripped before it is compared against the zip member names.

    Raises
    ------
    TValueError
        If `subpath` is not a member of the zip archive, or if the zip
        archive contains no usable files.
    """
    ext = os.path.splitext(filename)[1]
    if subpath and subpath[0] == "/":
        subpath = subpath[1:]

    if ext == ".zip":
        import zipfile
        # The context manager guarantees that the archive handle is
        # closed on every path, including the error paths below.
        with zipfile.ZipFile(filename) as zf:
            # MacOS is found guilty of adding extra files into the Zip
            # archives it creates. The files are hidden, and in the
            # directory __MACOSX/. We remove those files from the list,
            # since they are not real user files, and have an unknown
            # binary format. Directory entries (trailing "/") are
            # dropped as well.
            zff = [name for name in zf.namelist()
                   if not (name.startswith("__MACOSX/") or
                           name.endswith("/"))]
            if subpath:
                if subpath in zff:
                    zff = [subpath]
                else:
                    raise TValueError("File `%s` does not exist in archive "
                                      "`%s`" % (subpath, filename))
            if len(zff) > 1:
                self.logger.warning("Zip file %s contains multiple compressed "
                                    "files: %r. Only the first of them will be "
                                    "used." % (filename, zff))
            if len(zff) == 0:
                raise TValueError("Zip file %s is empty" % filename)
            self._tempdir = tempfile.mkdtemp()
            if self._verbose:
                self.logger.debug("Extracting %s to temporary directory %s"
                                  % (filename, self._tempdir))
            self._tempfiles.append(zf.extract(zff[0], path=self._tempdir))
        self._file = self._tempfiles[-1]

    elif ext in (".gz", ".bz2", ".xz"):
        # All three single-stream formats are handled the same way: the
        # whole payload is decompressed into memory. Only the opener
        # differs, and each module is imported only when needed.
        if ext == ".gz":
            import gzip
            opener = gzip.GzipFile
        elif ext == ".bz2":
            import bz2
            opener = bz2.open
        else:
            import lzma
            opener = lzma.open
        with opener(filename, mode="rb") as zf:
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
        if self._verbose:
            self.logger.debug("Extracted: size = %d" % len(self._text))

    elif ext == ".xlsx" or ext == ".xls":
        self._result = read_xls_workbook(filename, subpath)

    else:
        self._file = filename
def _resolve_archive(filename, subpath, tempfiles):
    """
    Unpack `filename` if it is an archive and describe what was found.

    Parameters
    ----------
    filename: str
        Path of the file to inspect.
    subpath: str or None
        Name of a member inside the archive. A single leading "/" or "\\"
        is stripped before it is compared against the member names.
    tempfiles: object
        Helper that owns temporary storage. This function reads its
        `_logger` and `tempdir` attributes and calls its `add(path)` and
        `create_temp_file()` methods.

    Returns
    -------
    A pair `((src, file, fileno, text), result)`:

    - `.zip`, `.tar.gz`, `.tgz` with exactly one extracted member: `src`
      is the archive path (joined with `subpath` when one was given) and
      `file` is the extracted temporary file.
    - `.zip`, `.tar.gz`, `.tgz` with any other number of members: the
      first element is `(None, None, None, None)` and `result` is a list
      with one `((src, file, None, None), None)` entry per member.
    - `.gz`, `.bz2`, `.xz`: `text` holds the decompressed bytes.
    - `.xlsx`, `.xls`: `result` is whatever `read_xls_workbook` returns.
    - `.jay`: `result` is whatever `core.open_jay` returns.
    - anything else: `file` is `filename` itself.

    `fileno` is always None.

    Raises
    ------
    IOError
        If `subpath` is given but is not a member of the zip/tar archive.
    """
    logger = tempfiles._logger
    # `filename` is rebound below to include `subpath`; keep the on-disk
    # archive path separately so per-member names and log messages are not
    # built from the already-joined name.
    archive = filename
    ext = os.path.splitext(filename)[1]
    if subpath and subpath[0] in ["/", "\\"]:
        subpath = subpath[1:]

    out_file = None
    out_text = None
    out_result = None
    # TODO: file extraction should be lazy
    if ext == ".zip":
        import zipfile
        with zipfile.ZipFile(archive) as zf:
            # MacOS is found guilty of adding extra files into the Zip
            # archives it creates. The files are hidden, and in the
            # directory __MACOSX/. We remove those files from the list,
            # since they are not real user files, and have an unknown
            # binary format. Directory entries (trailing "/") are
            # dropped as well.
            zff = [name for name in zf.namelist()
                   if not (name.startswith("__MACOSX/") or
                           name.endswith("/"))]
            if subpath:
                if subpath in zff:
                    filename = os.path.join(archive, subpath)
                    zff = [subpath]
                else:
                    raise IOError("File `%s` does not exist in archive `%s`"
                                  % (subpath, archive))
            extracted_files = []
            for zf_file in zff:
                if logger:
                    logger.debug("Extracting %s/%s to temporary directory %s"
                                 % (archive, zf_file, tempfiles.tempdir))
                newfile = zf.extract(zf_file, path=tempfiles.tempdir)
                srcname = os.path.join(archive, zf_file)
                tempfiles.add(newfile)
                extracted_files.append(((srcname, newfile, None, None), None))
            if len(extracted_files) == 1:
                out_file = extracted_files[0][0][1]
            else:
                return (None, None, None, None), extracted_files

    elif filename.endswith((".tar.gz", ".tgz")):
        import shutil
        import tarfile
        # The context manager closes the tar handle on every exit path.
        with tarfile.open(archive, mode="r:gz") as zf:
            zff = [entry.name for entry in zf.getmembers() if entry.isfile()]
            if subpath:
                if subpath in zff:
                    filename = os.path.join(archive, subpath)
                    zff = [subpath]
                else:
                    raise IOError("File `%s` does not exist in archive `%s`"
                                  % (subpath, archive))
            extracted_files = []
            for entryname in zff:
                if logger:
                    logger.debug("Extracting %s/%s to temporary directory %s"
                                 % (archive, entryname, tempfiles.tempdir))
                # NOTE(review): unlike the zip branch, the new file is not
                # passed to tempfiles.add(); presumably create_temp_file()
                # already registers it -- confirm.
                newfile = tempfiles.create_temp_file()
                with zf.extractfile(entryname) as inp, \
                        open(newfile, "wb") as out:
                    # Stream in chunks instead of holding the whole member
                    # in memory.
                    shutil.copyfileobj(inp, out)
                srcname = os.path.join(archive, entryname)
                extracted_files.append(((srcname, newfile, None, None), None))
            if len(extracted_files) == 1:
                out_file = extracted_files[0][0][1]
            else:
                return (None, None, None, None), extracted_files

    elif ext in (".gz", ".bz2", ".xz"):
        # All three single-stream formats are decompressed into memory;
        # only the opener differs, and each module is imported on demand.
        if ext == ".gz":
            import gzip
            opener = gzip.GzipFile
        elif ext == ".bz2":
            import bz2
            opener = bz2.open
        else:
            import lzma
            opener = lzma.open
        with opener(filename, mode="rb") as zf:
            if logger:
                logger.debug("Extracting %s into memory" % filename)
            out_text = zf.read()
        if logger:
            logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".xlsx" or ext == ".xls":
        out_result = read_xls_workbook(filename, subpath)
        if subpath:
            filename = os.path.join(filename, subpath)

    elif ext == ".jay":
        out_result = core.open_jay(filename)

    else:
        out_file = filename

    # (src, file, fileno, text), result
    return (filename, out_file, None, out_text), out_result