def test_data(tmpdir, monkeypatch):
    """Prepare a fresh ``base`` test tree in *tmpdir* and write its manifest.

    Changes the working directory to *tmpdir*, removes any left-over
    ``base`` directory, populates the test data and writes a manifest
    for ``base`` to ``manifest.yaml`` in the working directory.

    Returns *tmpdir* so that dependent tests know where the data lives.
    NOTE(review): presumably registered as a pytest fixture by a
    decorator outside of this view -- confirm.
    """
    monkeypatch.chdir(str(tmpdir))
    # Start from a clean slate; it is fine if "base" does not exist yet.
    shutil.rmtree("base", ignore_errors=True)
    setup_testdata(tmpdir, testdata)

    base_dir = Path("base")
    reference = Manifest(paths=[base_dir])
    reference.add_metadata(base_dir / ".manifest.yaml")
    with open("manifest.yaml", "wb") as manifest_file:
        reference.write(manifest_file)
    return tmpdir
def test_verify_missing_metadata_item(test_data, testname):
    """Verifying must fail if a metadata item listed in the manifest
    is missing from the archive.

    The archive is assembled by hand: the manifest announces two
    metadata items (``.manifest.yaml`` and ``.msg.txt``), but only the
    manifest itself is actually stored before the content of ``base``.
    """
    archive_path = archive_name(tags=[testname])
    readonly_regular = stat.S_IFREG | stat.S_IMODE(0o444)

    mf = Manifest(paths=[Path("base")])
    for md_name in (".manifest.yaml", ".msg.txt"):
        mf.add_metadata(Path("base", md_name))

    with tarfile.open(archive_path, "w") as tar:
        with tempfile.TemporaryFile(dir=str(test_data)) as buf:
            mf.write(buf)
            buf.seek(0)
            info = tar.gettarinfo(arcname="base/.manifest.yaml", fileobj=buf)
            info.mode = readonly_regular
            tar.addfile(info, buf)
        # Deliberately skip "base/.msg.txt" and continue with the content.
        tar.add("base")

    with Archive().open(Path(archive_path)) as archive:
        with pytest.raises(ArchiveIntegrityError) as excinfo:
            archive.verify()
        assert "'base/.msg.txt' not found" in str(excinfo.value)
class Archive:
    """A tar archive accompanied by a manifest of its content.

    An instance is either used to write a new archive with
    :meth:`create` or to read an existing one with :meth:`open`.  The
    first members of the tar file are the metadata items, starting
    with the manifest ``.manifest.yaml``; the archived files follow.

    The object may be used as a context manager; the underlying tar
    file is closed on exit.
    """

    def __init__(self):
        # Path of the archive file, set by create() or open().
        self.path = None
        # Relative directory that all archive members live below.
        self.basedir = None
        # Manifest object describing the content of the archive.
        self.manifest = None
        # TarFile object while the archive is open for reading.
        self._file = None
        # MetadataItem objects, in the order they appear in the archive.
        self._metadata = []

    def create(self, path, compression, paths, basedir=None, workdir=None,
               excludes=None, dedup=DedupMode.LINK, tags=None):
        """Create a new archive file.

        :param path: path of the archive file to write.  If `workdir`
            is given, it is taken relative to `workdir`.
        :param compression: compression suffix that is appended to the
            tarfile open mode.
        :param paths: path objects of the files and directories to add.
        :param basedir: common relative base directory in the archive.
            If not set, it is derived from `paths` or from the archive
            file name, see :meth:`_check_paths`.
        :param workdir: if set, the working directory is temporarily
            changed to it while the archive is created.
        :param excludes: path objects to exclude, passed to the manifest.
        :param dedup: deduplication mode, see :meth:`_check_duplicate`.
        :param tags: tags passed to the manifest.
        :return: `self`
        :raise ArchiveCreateError: if the paths are not acceptable or
            an item collides with a metadata file name.
        """
        if sys.version_info < (3, 5):
            # The 'x' (exclusive creation) mode was added to tarfile
            # in Python 3.5.
            mode = 'w:' + compression
        else:
            mode = 'x:' + compression
        if workdir:
            with tmp_chdir(workdir):
                self._create(workdir / path, mode, paths, basedir,
                             excludes, dedup, tags)
        else:
            self._create(path, mode, paths, basedir, excludes, dedup, tags)
        return self

    def _create(self, path, mode, paths, basedir, excludes, dedup, tags):
        """Do the actual work of :meth:`create`.

        Validate the paths, build the manifest, and write first the
        metadata items and then all items from the manifest to the
        tar file opened with `mode`.
        """
        self.path = path
        self._check_paths(paths, basedir, excludes)
        self.manifest = Manifest(paths=paths, excludes=excludes, tags=tags)
        # Register the metadata in the manifest in the same order as
        # they will be written: the manifest itself first, then all
        # items that have been added by the caller beforehand.
        self.manifest.add_metadata(self.basedir / ".manifest.yaml")
        for md in self._metadata:
            md.set_path(self.basedir)
            self.manifest.add_metadata(md.path)
        with tarfile.open(str(self.path), mode) as tarf:
            with tempfile.TemporaryFile() as tmpf:
                self.manifest.write(tmpf)
                tmpf.seek(0)
                # add_metadata() inserts at the front, so the manifest
                # becomes the first member of the tar file.
                self.add_metadata(".manifest.yaml", tmpf)
                md_names = self._add_metadata_files(tarf)
            # Maps a deduplication key to the archive name of the
            # first file that was stored with this key.
            dupindex = {}
            for fi in self.manifest:
                p = fi.path
                name = self._arcname(p)
                if name in md_names:
                    raise ArchiveCreateError("cannot add %s: "
                                             "this filename is reserved" % p)
                if fi.is_file():
                    ti = tarf.gettarinfo(str(p), arcname=name)
                    dup = self._check_duplicate(fi, name, dedup, dupindex)
                    if dup:
                        # Store a hard link to the earlier item
                        # instead of the content.
                        ti.type = tarfile.LNKTYPE
                        ti.linkname = dup
                        tarf.addfile(ti)
                    else:
                        # Force a regular file entry, regardless of
                        # what gettarinfo() derived from the file.
                        ti.size = fi.size
                        ti.type = tarfile.REGTYPE
                        ti.linkname = ''
                        with p.open("rb") as f:
                            tarf.addfile(ti, fileobj=f)
                else:
                    tarf.add(str(p), arcname=name, recursive=False)

    def _check_paths(self, paths, basedir, excludes):
        """Check the paths to be added to an archive for several error
        conditions.  Also sets self.basedir.

        The items of `paths` and `excludes` are used as path objects
        as they are (``is_absolute()``, ``relative_to()``, ``parts``);
        no conversion is done here.

        If `basedir` is not set, it defaults to the part of the
        archive file name before the first dot if the first path is
        absolute, or to the first component of the first path
        otherwise.

        :raise ArchiveCreateError: if `paths` is empty, basedir is
            absolute, any path is not normalized, absolute and
            relative paths are mixed, a relative path does not start
            with basedir, or (for relative paths) basedir is not a
            real directory.
        """
        if not paths:
            raise ArchiveCreateError("refusing to create an empty archive")
        if not basedir:
            p = paths[0]
            if p.is_absolute():
                self.basedir = Path(self.path.name.split('.')[0])
            else:
                self.basedir = Path(p.parts[0])
        else:
            self.basedir = basedir
        if self.basedir.is_absolute():
            raise ArchiveCreateError("basedir must be relative")
        # We allow two different cases: either
        # - all paths are absolute, or
        # - all paths are relative and start with basedir.
        # The same rules for paths also apply to excludes, if
        # provided.  So we may just iterate over the chain of both
        # lists.
        abspath = None
        for p in itertools.chain(paths, excludes or ()):
            if not _is_normalized(p):
                raise ArchiveCreateError(
                    "invalid path %s: must be normalized" % p)
            if abspath is None:
                abspath = p.is_absolute()
            else:
                if abspath != p.is_absolute():
                    raise ArchiveCreateError("mixing of absolute and relative "
                                             "paths is not allowed")
            if not p.is_absolute():
                try:
                    # This will raise ValueError if p does not start
                    # with basedir:
                    p.relative_to(self.basedir)
                except ValueError as e:
                    raise ArchiveCreateError(str(e)) from e
        if not abspath:
            if self.basedir.is_symlink() or not self.basedir.is_dir():
                raise ArchiveCreateError("basedir must be a directory")

    def _add_metadata_files(self, tarf):
        """Add the metadata files to the tar file.

        :param tarf: the TarFile object opened for writing.
        :return: the set of archive names of the metadata files.
        :raise ArchiveCreateError: if two metadata items have the same
            path.
        """
        md_names = set()
        for md in self._metadata:
            name = str(md.path)
            if name in md_names:
                raise ArchiveCreateError("duplicate metadata %s" % name)
            md_names.add(name)
            ti = tarf.gettarinfo(arcname=name, fileobj=md.fileobj)
            # Store as regular file with the permission bits of the item.
            ti.mode = stat.S_IFREG | stat.S_IMODE(md.mode)
            tarf.addfile(ti, md.fileobj)
        return md_names

    def _check_duplicate(self, fileinfo, name, dedup, dupindex):
        """Check if the archive item fileinfo should be linked to
        another item already added to the archive.

        With DedupMode.LINK, files are considered duplicates if they
        share device and inode number; files having only one link are
        never duplicates.  With DedupMode.CONTENT, files are compared
        by their checksum for the first algorithm listed in
        ``fileinfo.Checksums``.  With any other mode, nothing is
        deduplicated.

        :param dupindex: dict mapping the deduplication key to the
            archive name of the first item seen with that key.  It is
            updated in place if `fileinfo` is not a duplicate.
        :return: the archive name of the item to link to, or None if
            the file is to be stored on its own.
        """
        assert fileinfo.is_file()
        if dedup == DedupMode.LINK:
            st = fileinfo.path.stat()
            if st.st_nlink == 1:
                return None
            idxkey = (st.st_dev, st.st_ino)
        elif dedup == DedupMode.CONTENT:
            try:
                hashalg = fileinfo.Checksums[0]
            except IndexError:
                return None
            idxkey = fileinfo.checksum[hashalg]
        else:
            return None
        if idxkey in dupindex:
            return dupindex[idxkey]
        else:
            dupindex[idxkey] = name
            return None

    def add_metadata(self, name, fileobj, mode=0o444):
        """Register a metadata item to be written to the archive.

        The item is inserted at the front of the list of metadata
        items.  Its path is set relative to basedir if that is already
        known, to None otherwise.

        :param name: file name of the metadata item.
        :param fileobj: file object providing the content.
        :param mode: permission bits of the item in the archive.
        """
        path = self.basedir / name if self.basedir else None
        md = MetadataItem(name=name, path=path, fileobj=fileobj, mode=mode)
        self._metadata.insert(0, md)

    def open(self, path):
        """Open an existing archive for reading and load its manifest.

        :param path: path of the archive file.
        :return: `self`
        :raise ArchiveReadError: if opening the file fails with an
            OSError.
        :raise ArchiveIntegrityError: if the first member of the
            archive is not the manifest.
        """
        self.path = path
        try:
            self._file = tarfile.open(str(self.path), 'r')
        except OSError as e:
            raise ArchiveReadError(str(e))
        try:
            md = self.get_metadata(".manifest.yaml")
            self.basedir = md.path.parent
            self.manifest = Manifest(fileobj=md.fileobj)
            if not self.manifest.metadata:
                # Legacy: Manifest version 1.0 did not have metadata.
                self.manifest.add_metadata(self.basedir / ".manifest.yaml")
        except Exception:
            # Do not leave the tar file open behind if the archive
            # turns out to be unusable.
            self.close()
            raise
        return self

    def get_metadata(self, name):
        """Read the next member of the tar file as metadata item `name`.

        :param name: expected file name (last path component) of the
            next member.
        :return: the MetadataItem, which is also appended to the list
            of metadata items of this archive.
        :raise ArchiveIntegrityError: if there is no further member in
            the archive or if its file name is not `name`.
        """
        # TarFile.next() returns None if there are no more members.
        ti = self._file.next()
        if ti is None:
            raise ArchiveIntegrityError("%s not found" % name)
        path = Path(ti.path)
        if path.name != name:
            raise ArchiveIntegrityError("%s not found" % name)
        fileobj = self._file.extractfile(ti)
        md = MetadataItem(path=path, tarinfo=ti, fileobj=fileobj)
        self._metadata.append(md)
        return md

    def close(self):
        """Close the tar file, if open.  May be called repeatedly."""
        if self._file:
            self._file.close()
        self._file = None

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.close()

    def __del__(self):
        self.close()

    def _arcname(self, p):
        """Return the name of path `p` as member of the archive.

        Absolute paths are relocated below basedir, relative paths
        are kept as they are.
        """
        if p.is_absolute():
            return str(self.basedir / p.relative_to(p.root))
        else:
            return str(p)

    def verify(self):
        """Verify the archive against its manifest.

        :raise ValueError: if the archive is not open.
        :raise ArchiveIntegrityError: if a metadata item is missing or
            out of order, or if any item from the manifest is missing
            in the archive or differs from its manifest entry.
        """
        if not self._file:
            raise ValueError("archive is closed.")
        # Verify that all metadata items are present in the proper
        # order at the beginning of the tar file.  Start iterating for
        # TarInfo objects in the tarfile from the beginning,
        # regardless of what has already been read:
        tarf_it = iter(self._file)
        for md in self.manifest.metadata:
            # The archive may have fewer members than the manifest
            # lists metadata items; report this as a missing item
            # rather than leaking StopIteration.
            ti = next(tarf_it, None)
            if ti is None or ti.name != md:
                raise ArchiveIntegrityError("Expected metadata item '%s' "
                                            "not found" % (md))
        # Check the content of the archive.
        for fileinfo in self.manifest:
            self._verify_item(fileinfo)

    def _verify_item(self, fileinfo):
        """Compare one manifest entry with its member in the tar file.

        Mode, modification time (truncated to integer) and type are
        always checked.  Size and checksum are checked for members
        stored as regular file, but not for hard links.  The link
        target is checked for symbolic links.

        :raise ArchiveIntegrityError: if the member is missing or any
            of the checks fails.
        """

        def _check_condition(cond, item, message):
            if not cond:
                raise ArchiveIntegrityError("%s: %s" % (item, message))

        itemname = "%s:%s" % (self.path, fileinfo.path)
        try:
            tarinfo = self._file.getmember(self._arcname(fileinfo.path))
        except KeyError:
            raise ArchiveIntegrityError("%s: missing" % itemname)
        _check_condition(tarinfo.mode == fileinfo.mode,
                         itemname, "wrong mode")
        _check_condition(int(tarinfo.mtime) == int(fileinfo.mtime),
                         itemname, "wrong modification time")
        if fileinfo.is_dir():
            _check_condition(tarinfo.isdir(),
                             itemname, "wrong type, expected directory")
        elif fileinfo.is_file():
            # A deduplicated file is stored as a hard link.
            _check_condition(tarinfo.isfile() or tarinfo.islnk(),
                             itemname, "wrong type, expected regular file")
            if tarinfo.isfile():
                _check_condition(tarinfo.size == fileinfo.size,
                                 itemname, "wrong size")
                with self._file.extractfile(tarinfo) as f:
                    cs = checksum(f, fileinfo.checksum.keys())
                _check_condition(cs == fileinfo.checksum,
                                 itemname, "checksum does not match")
        elif fileinfo.is_symlink():
            _check_condition(tarinfo.issym(),
                             itemname, "wrong type, expected symbolic link")
            _check_condition(tarinfo.linkname == str(fileinfo.target),
                             itemname, "wrong link target")
        else:
            raise ArchiveIntegrityError("%s: invalid type" % (itemname))

    def extract(self, targetdir, inclmeta=False):
        """Extract the content of the archive into `targetdir`.

        :param targetdir: directory to extract into.
        :param inclmeta: if true, the metadata items listed in the
            manifest are extracted as well.
        """
        # We extract the directories last in reverse order.  This way,
        # the directory attributes, in particular the file modification
        # time, is set correctly after the file content is written into
        # the directory.
        dirstack = []
        if inclmeta:
            for mi in self.manifest.metadata:
                self._file.extract(mi, path=str(targetdir))
        for fi in self.manifest:
            if fi.is_dir():
                dirstack.append(fi.path)
            else:
                self._file.extract(self._arcname(fi.path),
                                   path=str(targetdir))
        for p in reversed(dirstack):
            self._file.extract(self._arcname(p), path=str(targetdir))