def test_hashsplit_files(tmpdir):
    """Check HashSplitter output when the inputs are real files.

    A file of ten 32 KiB runs of zero bytes is written into *tmpdir* and
    then split on its own, mixed with in-memory sample objects, and the
    sample objects are also split without any file involved.  Each case
    compares the (blob length, level) pairs against the expected list.
    """
    max_blob = 4 * 8192
    fn = os.path.join(tmpdir, b'f1')
    with open(fn, 'wb') as f:
        for _ in range(10):
            f.write(b'\x00' * max_blob)

    # Every handle given to HashSplitter is remembered here so it can be
    # closed once the checks are done, even if one of them fails.
    opened = []

    def o():
        handle = open(fn, 'rb')
        opened.append(handle)
        return handle

    def bio(n):
        return BytesIO(split_test_objs[n])

    def ex(n):
        return [(len(split_test_objs[n]), (n - 14) // 4)]

    def split(files):
        return [(len(b), lvl)
                for b, lvl in HashSplitter(files, bits=BUP_BLOBBITS)]

    try:
        res = split([o(), o(), o()])
        WVPASSEQ(res, [(32 * 1024, 0)] * 10 * 3)

        res = split([o(), bio(14), o()])
        WVPASSEQ(res,
                 10 * [(32 * 1024, 0)] + ex(14) + 10 * [(32 * 1024, 0)])

        res = split([bio(14), bio(15)])
        WVPASSEQ(res, ex(14) + ex(15))

        res = split([bio(14), bio(27)])
        WVPASSEQ(res, ex(14) + ex(27))
    finally:
        for handle in opened:
            handle.close()
def hslevels(data):
    """Return a list of (blob length, level) pairs for *data*.

    The data is fed to HashSplitter as a single in-memory stream, using
    the module-level ``hashbits`` for ``bits`` and the truncated base-2
    logarithm of the module-level ``fanout`` for ``fanbits``.
    """
    fan_bits = int(math.log(fanout, 2))
    splitter = HashSplitter([BytesIO(data)], bits=hashbits, fanbits=fan_bits)
    levels = []
    for blob, level in splitter:
        levels.append((len(blob), level))
    return levels
def _splitbuf(data):
    """Yield (size, 13 + level) pairs for the splits found in *data*.

    A slice copy of *data* is run through HashSplitter with
    ``bits=BUP_BLOBBITS`` and ``fanbits=1``.  Level-0 blobs of exactly
    ``4 << 13`` bytes are not reported on their own; their length is
    carried over and added to the size of the next blob that is yielded.
    """
    full_blob = 4 << 13
    splitter = HashSplitter([BytesIO(data[:])], bits=BUP_BLOBBITS, fanbits=1)
    carried = 0
    for chunk, level in splitter:
        size = len(chunk)
        # this isn't necessarily _quite_ right, but try to
        # reconstruct from a max blob to not having split
        if size == full_blob and level == 0:
            carried += size
        else:
            yield carried + size, 13 + level
            carried = 0
def test_hashsplit_boundaries():
    """Compare HashSplitter results with and without keep_boundaries.

    Three situations are covered: inputs with no split point at all,
    inputs that each end on a split point, and a single sample cut in
    half so that its split point is only found across the input boundary.
    """
    with no_lingering_errors():
        def sizes_and_levels(splitter):
            return [(len(chunk), level) for chunk, level in splitter]

        def zero_pages():
            return [BytesIO(b'\x00' * 8192) for _ in range(4)]

        def sample(n):
            return BytesIO(split_test_objs[n])

        def expected(n):
            shifted = n - 1 if n > 13 else n
            return (len(split_test_objs[n]), (shifted - 13) // 4)

        # No split points: without boundaries the inputs merge into one blob
        merged = HashSplitter(zero_pages(), bits=BUP_BLOBBITS,
                              keep_boundaries=False)
        WVPASSEQ(sizes_and_levels(merged), [(4 * 8192, 0)])

        # ... and with boundaries (the default) each input is its own blob
        separate = HashSplitter(zero_pages(), bits=BUP_BLOBBITS)
        WVPASSEQ(sizes_and_levels(separate), 4 * [(8192, 0)])

        # Split points at the end of each input: same result both ways
        sample_ids = (13, 14, 15)
        want = [expected(n) for n in sample_ids]

        bounded = HashSplitter([sample(n) for n in sample_ids],
                               bits=BUP_BLOBBITS)
        WVPASSEQ(sizes_and_levels(bounded), want)

        unbounded = HashSplitter([sample(n) for n in sample_ids],
                                 bits=BUP_BLOBBITS, keep_boundaries=False)
        WVPASSEQ(sizes_and_levels(unbounded), want)

        # Split point that is only found across the input boundary
        data = split_test_objs[27]
        half = len(data) // 2
        d1 = data[:half]
        d2 = data[half:]

        halves = HashSplitter([BytesIO(d1), BytesIO(d2)], bits=BUP_BLOBBITS)
        WVPASSEQ(sizes_and_levels(halves), [(len(d1), 0), (len(d2), 0)])

        joined = HashSplitter([BytesIO(d1), BytesIO(d2)], bits=BUP_BLOBBITS,
                              keep_boundaries=False, fanbits=1)
        WVPASSEQ(sizes_and_levels(joined), [(len(data), 27 - 13 - 1)])
def test_samples():
    """Validate every entry of split_test_objs against the splitter.

    For keys up to 21 the rollsum of the sample is checked directly: its
    low ``k`` bits must be ones and bit ``k`` must be zero.  Every sample
    is then split with ``bits=BUP_BLOBBITS`` and ``fanbits=1``, and the
    first blob must span the whole sample at the expected level.
    """
    for k in split_test_objs:
        sample = split_test_objs[k]

        if k <= 21:
            # First check that they have the right number of bits.
            low_ones = (1 << k) - 1
            low_mask = (1 << (k + 1)) - 1
            WVPASSEQ(_helpers.rollsum(sample) & low_mask, low_ones)

        # then also check that again, with the default (bits=13)
        extra_bits = k - 13
        # algorithm ignores 1 bit after the split bits
        expected = extra_bits - 1 if extra_bits > 0 else extra_bits

        splitter = HashSplitter([BytesIO(sample)],
                                bits=BUP_BLOBBITS, fanbits=1)
        first_blob, first_level = next(splitter)
        WVPASSEQ((k, len(first_blob), first_level),
                 (k, len(sample), expected))