Example #1
# stdlib imports used below; idmap, progress, iter_repo_rpms, rpm_basename,
# payloadinfo, and combine_hardlinks are helpers from the surrounding module.
import gzip
import json
from collections import OrderedDict

def dump_payloaddata(repo_paths, outfile="payloaddata.json.gz"):
    uids = idmap(root=0)
    gids = idmap(root=0)
    rpms = dict()
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=rpm_basename):
        envra, payload = payloadinfo(rpmfn)
        rpms[envra] = []
        for inode_ents in payload:
            f, links = combine_hardlinks(inode_ents)
            # find or allocate uid/gid
            uid = uids.add(f.stat.user)
            gid = gids.add(f.stat.group)
            # fix up the stat
            f = f._replace(stat=f.stat._replace(user=uid, group=gid))
            # add it to the list
            rpms[envra].append((f, links))
    print("dumping {}...".format(outfile))
    with gzip.open(outfile, 'wt') as outf:
        o = OrderedDict()
        u = uids.strdict()
        g = gids.strdict()
        o['counts'] = {'uid': len(u), 'gid': len(g), 'rpms': len(rpms)}
        o['uid'] = u
        o['gid'] = g
        o['rpms'] = [{
            'envra': envra,
            'count': len(files),
            'files': files
        } for envra, files in rpms.items()]
        json.dump(o, outf)
    return uids, gids, rpms
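
A minimal usage sketch (the repo path below is hypothetical), assuming this
function's module and its helpers are importable:

uids, gids, rpms = dump_payloaddata(["/srv/repos/f33"])  # hypothetical path
# idmap.strdict() (used above) maps the allocated ids back to names:
print(len(rpms), "packages;", len(uids.strdict()), "distinct users")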
Example #2
# Path comes from the stdlib; progress, rpm, and rpm_src_groupsort are
# helpers from the surrounding module.
from pathlib import Path

def rpmlister(dirs):
    '''list RPMs under the given dirs, grouped by .src.rpm name, sorted newest-oldest'''
    print("Finding RPMs, one moment..")
    rpmps = [p for d in dirs for p in Path(d).glob('**/*.rpm')]
    # read RPM headers and get source RPM tuple for each package
    srctup = dict()
    for p in progress(rpmps, prefix='Reading RPM headers ', itemfmt=lambda p: p.name):
        # TODO: we should also gather header/payload sizes and warn if we're
        # probably going to blow up 32-bit offsets. (Or, like.. auto-split
        # files at that point...)
        srctup[p] = rpm(p).srctup()
    src = rpm_src_groupsort(srctup.items())
    return {name: [p for pkgs in src[name].values() for p in pkgs] for name in src}
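
A usage sketch for rpmlister; the directory below is hypothetical. The result
maps each source-RPM name to a flat list of its packages, newest first:

groups = rpmlister(["/srv/repos/updates"])  # hypothetical path
for srcname, pkgs in groups.items():
    print(srcname, "->", len(pkgs), "rpms")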
Example #3
# stdlib imports used below; rpmhdr, progress, iter_repo_rpms, rpm_basename,
# and BIN_TAGS are helpers from the surrounding module.
import gzip
import json
from collections import Counter, defaultdict

def dump_sizedata(repo_paths, outfile="sizedata.json.gz"):
    sizedata = dict()
    valcount = defaultdict(Counter)
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=rpm_basename):
        r = rpmhdr(rpmfn)
        sizedata[r.envra] = [[r.sig.size, r.hdr.size, r.payloadsize],
                             [(te.tag, te.offset, te.size, te.realsize)
                              for te in r.hdr.tagent.values()]]
        for t in r.hdr.tagval:
            if t >= 1000 and t not in BIN_TAGS:
                v = r.hdr.jsonval(t)
                valcount[t].update(v if isinstance(v, tuple) else [v])
    print("\ndumping to {}...".format(outfile))
    outdata = [sizedata, [(t, vc.most_common()) for t, vc in valcount.items()]]
    with gzip.open(outfile, 'wt') as outf:
        json.dump(outdata, outf)
    print("done!")
    return outdata
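
A sketch of reading the dump back, assuming the default outfile name; the
structure mirrors what dump_sizedata wrote (per-ENVRA sizes plus per-tag
value frequency counts):

import gzip
import json

with gzip.open("sizedata.json.gz", 'rt') as f:
    sizedata, valcounts = json.load(f)
# sizedata maps ENVRA -> [[sigsize, hdrsize, payloadsize], [tag entries]]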
Example #4
# stdlib imports used below; rpm, progress, iter_repo_rpms, and basename
# (presumably os.path.basename) come from the surrounding module.
import gzip
import json
from collections import OrderedDict

def dump_deps(repo_paths, outfile="depdata.json.gz"):
    deps = dict()
    rpmcount = 0
    depcount = 0
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=basename):
        r = rpm(rpmfn)
        # Skip duplicate ENVRAs
        if r.envra in deps:
            continue
        deps[r.envra] = r.alldeps()
        rpmcount += 1
        depcount += len(deps[r.envra])
    print("dumping {}...".format(outfile))
    with gzip.open(outfile, 'wt') as outf:
        o = OrderedDict()
        o['type'] = 'deps'
        o['version'] = 1
        o['counts'] = {'rpms': rpmcount, 'deps': depcount}
        o['deps'] = [{'envra': t, 'deps': d} for t, d in deps.items()]
        json.dump(o, outf)
    return deps
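
A sketch of consuming the dep dump, assuming the default outfile name and the
'type'/'version'/'counts' fields written above:

import gzip
import json

with gzip.open("depdata.json.gz", 'rt') as f:
    data = json.load(f)
assert data['type'] == 'deps' and data['version'] == 1
print(data['counts'])  # {'rpms': ..., 'deps': ...}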
Example #5
# stdlib imports used below; DINORPMArchive, rpm, progress, vprint, verbose,
# gethasher, unc_payloadsize, Tag, SigTag, HashAlgo, and VerifyError come
# from the surrounding module.
import struct
from io import BytesIO
from pathlib import Path
from tempfile import SpooledTemporaryFile

def merge_rpms(rpmiter, outfile, **dino_kwargs):
    # Start with a new header object
    d = DINORPMArchive(**dino_kwargs)
    count, rpmsize, rpmtotal = 0, 0, 0

    # separate contexts for compressing headers vs. files.
    # TODO: it might be helpful if we made dictionaries for each?
    fzst = d.dino.get_compressor(level=d.compresslevel)
    hzst = d.dino.get_compressor(level=d.compresslevel)

    # Okay let's start adding some RPMs!
    if not verbose:
        rpmiter = progress(rpmiter, prefix=Path(outfile).name+': ', itemfmt=lambda p: p.name)
    for rpmfn in rpmiter:
        vprint(f'{rpmfn}:')
        r = rpm(rpmfn)

        # update stats
        count += 1
        rpmsize = r.payloadsize + r.headersize
        rpmtotal += rpmsize

        # We handle the files before the RPM header because while _nearly_
        # everything in the RPM payload can be reconstructed from the RPM
        # header itself, there are a couple tiny things that could be
        # different, like the ordering of the files in the archive.
        # NOTE: I'm almost sure we can reproduce the original _uncompressed_
        # payload, but I'm really not certain that we can get the exact
        # compression context (or timestamps, or whatever else) that would be
        # needed to reproduce the compressed payload bit-for-bit.

        # Grab the filenames and digests from the rpmhdr
        fnames = ["."+f for f in r.iterfiles()]
        rpmalgo = r.getval(Tag.FILEDIGESTALGO)
        digests = r.getval(Tag.FILEDIGESTS)

        # show RPM header/file sizes
        vprint(f'  RPM: hdr={r.headersize-0x60:<6} files={len(fnames):<3} filesize={r.payloadsize}'
               f' compr={r.payloadsize/unc_payloadsize(r):<6.2%}')

        # Keep track of the order of the files in the payload
        payload_in_order = True
        payload_order = []
        hdridx = {f:n for n,f in enumerate(fnames)}

        # Start running through the RPM payload
        filecount, filesize, unc_filesize = 0, 0, 0
        for n,item in enumerate(r.payload_iter()):
            # Does the payload name match the corresponding header name?
            # If not, find the header index for the payload filename.
            if item.name == fnames[n]:
                idx = n
            else:
                payload_in_order = False
                idx = hdridx[item.name]
            payload_order.append(idx)

            # We only store regular files with actual data
            if not (item.isreg and item.size):
                continue

            # Set up hashers
            hashers = {algo:gethasher(algo) for algo in (rpmalgo, d.idxalgo)}

            # Uncompress file, hash it, and write it to a temporary file.
            # If the calculated file key isn't in the index, compress the
            # temporary file contents into the filedata section.
            with SpooledTemporaryFile() as tmpf:
                # Uncompress and hash the file contents
                for block in item.get_blocks():
                    # TODO: parallelize? parallelize!
                    tmpf.write(block)
                    for h in hashers.values():
                        h.update(block)
                # Check digest to make sure the file is OK
                h = hashers[rpmalgo]
                if h.hexdigest() != digests[idx]:
                    act = h.hexdigest()
                    exp = digests[idx]
                    raise VerifyError(f"{fnames[idx]}: expected {exp}, got {act}")
                # Add this if it's not already in the fileidx
                filekey = hashers[d.idxalgo].digest()
                if filekey not in d.fileidx:
                    # Write file data into its own compressed frame.
                    tmpf.seek(0)
                    offset = d.filedata.fobj.tell()
                    usize, size = fzst.copy_stream(tmpf, d.filedata.fobj, size=item.size)
                    vprint(f"wrote {size} bytes to filedata sec at offset {offset}")
                    d.fileidx.add(filekey, offset, size, usize)
                    assert d.filedata.fobj.tell() == offset + size
                    filecount += 1
                    filesize += size
                    unc_filesize += item.size

        # Okay, files are added, now we can add the rpm header.
        # FIXME: we shouldn't have to do this manually..
        hdr = None
        with open(r.name, 'rb') as fobj:
            fobj.seek(0x60) # don't bother with the lead
            hdr = fobj.read(r.headersize-0x60)

        # Check signature header digest (if present)
        sigkey = r.sig.getval(SigTag.SHA256, '')
        if sigkey:
            h = gethasher(HashAlgo.SHA256)
            h.update(hdr[-r.hdr.size:])
            if sigkey != h.hexdigest():
                raise VerifyError(f"SHA256 mismatch in {r.name}: expected {sigkey} got {h.hexdigest()}")

        # Add the payload ordering
        if not payload_in_order:
            hdr += b''.join(struct.pack('I',i) for i in payload_order)

        # Add it to the rpmhdr section
        offset = d.rpmhdr.fobj.tell()
        usize, size = hzst.copy_stream(BytesIO(hdr), d.rpmhdr.fobj, size=len(hdr))
        assert d.rpmhdr.fobj.tell() == offset + size
        sizediff = (size+filesize)-rpmsize
        vprint(f' DINO: hdr={size:<6} files={filecount:<3} filesize={filesize}'
               f' {f"compr={filesize/unc_filesize:<6.2%}" if filesize else ""}'
               f' diff={sizediff:+} ({sizediff/rpmsize:+.1%})'
               f' {"(!)" if sizediff/rpmsize > 0.02 else ""}')

        # Generate pkgkey (TODO: maybe copy_into should do this..)
        # TODO: y'know, it might be more useful to use the sha256 of the
        # package envra - which, in theory, should also be unique, but also
        # gives us fast package lookups by name...
        #pkgid = hashlib.sha256(bytes(r.envra, 'utf8')).hexdigest()
        hasher = gethasher(d.idxalgo)
        hasher.update(hdr)
        pkgkey = hasher.digest()
        # Add package key to the index
        d.rpmidx.add(pkgkey, offset, size, usize)

    # We did it! Write the data to the output file!
    with open(outfile, 'wb') as outf:
        wrote = d.dino.write_to(outf)
    sizediff = wrote-rpmtotal
    print(f'packed {count} packages ({rpmtotal} bytes) into {outfile} '
          f'({sizediff/rpmtotal:<+.1%} -> {wrote} bytes)'
          f'{" (!)" if wrote > rpmtotal else ""}')
    return rpmtotal, wrote
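
A usage sketch for merge_rpms; the repo directory is hypothetical. rpmiter can
be any iterable of RPM paths (Path objects, since the progress formatter uses
p.name), and extra keyword arguments pass through to DINORPMArchive:

from pathlib import Path

rpms = sorted(Path("/srv/repos/f33").glob("*.rpm"))  # hypothetical path
rpmtotal, wrote = merge_rpms(rpms, "f33.dino")
print(f"{rpmtotal} bytes of RPMs -> {wrote} bytes of DINO")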