def dump_payloaddata(repo_paths, outfile="payloaddata.json.gz"):
    uids = idmap(root=0)
    gids = idmap(root=0)
    rpms = dict()
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=rpm_basename):
        envra, payload = payloadinfo(rpmfn)
        rpms[envra] = []
        for inode_ents in payload:
            f, links = combine_hardlinks(inode_ents)
            # find or allocate uid/gid
            uid = uids.add(f.stat.user)
            gid = gids.add(f.stat.group)
            # fix up the stat
            f = f._replace(stat=f.stat._replace(user=uid, group=gid))
            # add it to the list
            rpms[envra].append((f, links))
    print("dumping {}...".format(outfile))
    with gzip.open(outfile, 'wt') as outf:
        o = OrderedDict()
        u = uids.strdict()
        g = gids.strdict()
        o['counts'] = {'uid': len(u), 'gid': len(g), 'rpms': len(rpms)}
        o['uid'] = u
        o['gid'] = g
        o['rpms'] = [{'envra': envra, 'count': len(files), 'files': files}
                     for envra, files in rpms.items()]
        json.dump(o, outf)
    return uids, gids, rpms
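
# A minimal read-back sketch for the dump above. Illustrative only: it assumes
# nothing beyond what dump_payloaddata() actually writes (a gzipped JSON object
# with 'counts', 'uid', 'gid', and 'rpms' keys). The helper name is hypothetical.
def _example_read_payloaddata(infile="payloaddata.json.gz"):
    import gzip, json
    with gzip.open(infile, 'rt') as inf:
        o = json.load(inf)
    print("{rpms} rpms, {uid} uids, {gid} gids".format(**o['counts']))
    return o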
def rpmlister(dirs):
    '''List RPMs under the given dirs, grouped by .src.rpm name, sorted newest-oldest.'''
    print("Finding RPMs, one moment..")
    rpmps = [p for d in dirs for p in Path(d).glob('**/*.rpm')]
    # read RPM headers and get source RPM tuple for each package
    srctup = dict()
    for p in progress(rpmps, prefix='Reading RPM headers ', itemfmt=lambda p: p.name):
        # TODO: we should also gather header/payload sizes and warn if we're
        # probably going to blow up 32-bit offsets. (Or, like.. auto-split
        # files at that point...)
        srctup[p] = rpm(p).srctup()
    src = rpm_src_groupsort(srctup.items())
    return {name: [p for pkgs in src[name].values() for p in pkgs]
            for name in src}
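
# Sketch of one plausible way to wire rpmlister() into merge_rpms() below: each
# value is a flat list of Paths for a single source package, which is the shape
# merge_rpms() iterates over. The per-source output filename scheme here is
# hypothetical, not necessarily what the real CLI uses.
def _example_merge_by_source(dirs):
    for name, paths in rpmlister(dirs).items():
        merge_rpms(paths, name + '.dino')  # '.dino' suffix is an assumption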
def dump_sizedata(repo_paths, outfile="sizedata.json.gz"):
    sizedata = dict()
    valcount = defaultdict(Counter)
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=rpm_basename):
        r = rpmhdr(rpmfn)
        sizedata[r.envra] = [[r.sig.size, r.hdr.size, r.payloadsize],
                             [(te.tag, te.offset, te.size, te.realsize)
                              for te in r.hdr.tagent.values()]]
        for t in r.hdr.tagval:
            if t >= 1000 and t not in BIN_TAGS:
                v = r.hdr.jsonval(t)
                valcount[t].update(v if type(v) == tuple else [v])
    print("\ndumping to {}...".format(outfile))
    outdata = [sizedata, [(t, vc.most_common()) for t, vc in valcount.items()]]
    with gzip.open(outfile, 'wt') as outf:
        json.dump(outdata, outf)
    print("done!")
    return outdata
def dump_deps(repo_paths, outfile="depdata.json.gz"):
    deps = dict()
    rpmcount = 0
    depcount = 0
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=basename):
        r = rpm(rpmfn)
        # Skip duplicate ENVRAs
        if r.envra in deps:
            continue
        deps[r.envra] = r.alldeps()
        rpmcount += 1
        depcount += len(deps[r.envra])
    print("dumping {}...".format(outfile))
    with gzip.open(outfile, 'wt') as outf:
        o = OrderedDict()
        o['type'] = 'deps'
        o['version'] = 1
        o['counts'] = {'rpms': rpmcount, 'deps': depcount}
        o['deps'] = [{'envra': t, 'deps': d} for t, d in deps.items()]
        json.dump(o, outf)
    return deps
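
# Illustrative consumer for the deps dump, again assuming only the structure
# written above ('type'/'version'/'counts'/'deps'). The function name and the
# version check are hypothetical, not an established file-format contract.
def _example_load_depdata(infile="depdata.json.gz"):
    import gzip, json
    with gzip.open(infile, 'rt') as inf:
        o = json.load(inf)
    if o.get('type') != 'deps' or o.get('version') != 1:
        raise ValueError(f"{infile}: unexpected dump type/version")
    return {ent['envra']: ent['deps'] for ent in o['deps']}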
def merge_rpms(rpmiter, outfile, **dino_kwargs):
    # Start with a new header object
    d = DINORPMArchive(**dino_kwargs)
    count, rpmsize, rpmtotal = 0, 0, 0
    # separate contexts for compressing headers vs. files.
    # TODO: it might be helpful if we made dictionaries for each?
    fzst = d.dino.get_compressor(level=d.compresslevel)
    hzst = d.dino.get_compressor(level=d.compresslevel)
    # Okay let's start adding some RPMs!
    if not verbose:
        rpmiter = progress(rpmiter, prefix=Path(outfile).name+': ',
                           itemfmt=lambda p: p.name)
    for rpmfn in rpmiter:
        vprint(f'{rpmfn}:')
        r = rpm(rpmfn)
        # update stats
        count += 1
        rpmsize = r.payloadsize + r.headersize
        rpmtotal += rpmsize
        # We handle the files before the RPM header because while _nearly_
        # everything in the RPM payload can be reconstructed from the RPM
        # header itself, there are a couple tiny things that could be
        # different, like the ordering of the files in the archive.
        # NOTE: I'm almost sure we can reproduce the original _uncompressed_
        # payload, but I'm really not certain that we can get the exact
        # compression context (or timestamps or whatever else) that would be needed.
        # Grab the filenames and digests from the rpmhdr
        fnames = ["."+f for f in r.iterfiles()]
        rpmalgo = r.getval(Tag.FILEDIGESTALGO)
        digests = r.getval(Tag.FILEDIGESTS)
        # show RPM header/file sizes
        vprint(f' RPM: hdr={r.headersize-0x60:<6} files={len(fnames):<3} filesize={r.payloadsize}'
               f' compr={r.payloadsize/unc_payloadsize(r):<6.2%}')
        # Keep track of the order of the files in the payload
        payload_in_order = True
        payload_order = []
        hdridx = {f: n for n, f in enumerate(fnames)}
        # Start running through the RPM payload
        filecount, filesize, unc_filesize = 0, 0, 0
        for n, item in enumerate(r.payload_iter()):
            # Does the payload name match the corresponding header name?
            # If not, find the header index for the payload filename.
            if item.name == fnames[n]:
                idx = n
            else:
                payload_in_order = False
                idx = hdridx[item.name]
            payload_order.append(idx)
            # We only store regular files with actual data
            if not (item.isreg and item.size):
                continue
            # Set up hashers
            hashers = {algo: gethasher(algo) for algo in (rpmalgo, d.idxalgo)}
            # Uncompress file, hash it, and write it to a temporary file.
            # If the calculated file key isn't in the index, compress the
            # temporary file contents into the filedata section.
            with SpooledTemporaryFile() as tmpf:
                # Uncompress and hash the file contents
                for block in item.get_blocks():
                    # TODO: parallelize? parallelize!
                    tmpf.write(block)
                    for h in hashers.values():
                        h.update(block)
                # Check digest to make sure the file is OK
                h = hashers[rpmalgo]
                if h.hexdigest() != digests[idx]:
                    act = h.hexdigest()
                    exp = digests[idx]
                    raise VerifyError(f"{fnames[idx]}: expected {exp}, got {act}")
                # Add this if it's not already in the fileidx
                filekey = hashers[d.idxalgo].digest()
                if filekey not in d.fileidx:
                    # Write file data into its own compressed frame.
                    tmpf.seek(0)
                    offset = d.filedata.fobj.tell()
                    usize, size = fzst.copy_stream(tmpf, d.filedata.fobj, size=item.size)
                    vprint(f"wrote {size} bytes to filedata sec at offset {offset}")
                    d.fileidx.add(filekey, offset, size, usize)
                    assert d.filedata.fobj.tell() == offset + size
                    filecount += 1
                    filesize += size
                    unc_filesize += item.size
        # Okay, files are added, now we can add the rpm header.
        # FIXME: we shouldn't have to do this manually..
        hdr = None
        with open(r.name, 'rb') as fobj:
            fobj.seek(0x60)  # don't bother with the lead
            hdr = fobj.read(r.headersize-0x60)
        # Check signature header digest (if present)
        sigkey = r.sig.getval(SigTag.SHA256, '')
        if sigkey:
            h = gethasher(HashAlgo.SHA256)
            h.update(hdr[-r.hdr.size:])
            if sigkey != h.hexdigest():
                raise VerifyError(f"SHA256 mismatch in {r.name}: expected {sigkey} got {h.hexdigest()}")
        # Add the payload ordering
        if not payload_in_order:
            hdr += b''.join(struct.pack('I', i) for i in payload_order)
        # Add it to the rpmhdr section
        offset = d.rpmhdr.fobj.tell()
        usize, size = hzst.copy_stream(BytesIO(hdr), d.rpmhdr.fobj, size=len(hdr))
        assert d.rpmhdr.fobj.tell() == offset + size
        sizediff = (size+filesize)-rpmsize
        vprint(f' DINO: hdr={size:<6} files={filecount:<3} filesize={filesize}'
               f' {f"compr={filesize/unc_filesize:<6.2%}" if filesize else ""}'
               f' diff={sizediff:+} ({sizediff/rpmsize:+.1%})'
               f' {"(!)" if sizediff/rpmsize > 0.02 else ""}')
        # Generate pkgkey (TODO: maybe copy_into should do this..)
        # TODO: y'know, it might be more useful to use the sha256 of the
        # package envra - which, in theory, should also be unique, but also
        # gives us fast package lookups by name...
        #pkgid = hashlib.sha256(bytes(r.envra, 'utf8')).hexdigest()
        hasher = gethasher(d.idxalgo)
        hasher.update(hdr)
        pkgkey = hasher.digest()
        # Add package key to the index
        d.rpmidx.add(pkgkey, offset, size, usize)
    # We did it! Write the data to the output file!
    with open(outfile, 'wb') as outf:
        wrote = d.dino.write_to(outf)
    sizediff = wrote - rpmtotal
    print(f'packed {count} packages ({rpmtotal} bytes) into {outfile} '
          f'({sizediff/rpmtotal:<+.1%} -> {wrote} bytes)'
          f'{" (!)" if wrote > rpmtotal else ""}')
    return rpmtotal, wrote
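
# Sketch of how a reader could undo the payload-order trailer appended above.
# This is an assumption about the read side, not the project's actual reader:
# the stored blob is the raw RPM header immediately followed by zero or more
# native-endian uint32 indexes (struct.pack('I', ...)), so anyone who knows the
# header's length can split the blob and unpack the trailer.
def _example_split_payload_order(blob, hdrsize):
    import struct
    hdr, trailer = blob[:hdrsize], blob[hdrsize:]
    order = [i for (i,) in struct.iter_unpack('I', trailer)]
    return hdr, order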