def test_bloom():
    """Exercise bloom filter creation, lookup and default k selection.

    For k in (4, 5) a filter is written to 'pybuptest.bloom' in the current
    directory, reopened, checked for every inserted hash and for a low
    false-positive count on random probes, then removed.  Afterwards the
    value of k chosen for a small and a large expected size is checked on
    filters backed by temporary files.
    """
    bloom_path = 'pybuptest.bloom'
    hashes = [os.urandom(20) for i in range(100)]

    # Minimal stand-in for an idx object; only .name and .shatable are set.
    class Idx:
        pass

    fake_idx = Idx()
    fake_idx.name = 'dummy.idx'
    fake_idx.shatable = ''.join(hashes)

    for k in (4, 5):
        # Write phase.
        filt = bloom.create(bloom_path, expected=100, k=k)
        filt.add_idx(fake_idx)
        WVPASSLT(filt.pfalse_positive(), .1)
        filt.close()

        # Read phase: every inserted hash must be reported present.
        filt = bloom.ShaBloom(bloom_path)
        all_present = True
        for sha in hashes:
            all_present = all_present & filt.exists(sha)
        WVPASS(all_present)

        # Random hashes should almost never be reported present.
        probes = [os.urandom(20) for i in range(1000)]
        false_positives = sum(1 for sha in probes if filt.exists(sha))
        WVPASSLT(false_positives, 5)
        os.unlink(bloom_path)

    # Small filter on a caller-supplied file object.
    tf = tempfile.TemporaryFile()
    filt = bloom.create('bup.bloom', f=tf, expected=100)
    WVPASSEQ(filt.rwfile, tf)
    WVPASSEQ(filt.k, 5)

    # Large filter, mapped directly (delaywrite=False).
    tf = tempfile.TemporaryFile()
    filt = bloom.create('bup.bloom', f=tf, expected=2**28, delaywrite=False)
    WVPASSEQ(filt.k, 4)
def ruin_bloom(bloomfilename):
    """Deliberately corrupt a bloom filter by zeroing its bit table.

    If bloomfilename does not exist, the path is logged, an error is
    recorded via add_error() and nothing else happens.  Otherwise the
    filter is opened read-write and 2**bits bytes starting at offset 16
    of its map are overwritten with zero bytes.
    """
    rbloomfilename = git.repo_rel(bloomfilename)
    if not os.path.exists(bloomfilename):
        log(path_msg(bloomfilename) + '\n')
        add_error('bloom: %s not found to ruin\n' % path_msg(rbloomfilename))
        return
    b = bloom.ShaBloom(bloomfilename, readwrite=True, expected=1)
    try:
        # NOTE(review): offset 16 presumably skips the file header --
        # confirm against the bloom module.
        table_size = 2**b.bits
        b.map[16:16 + table_size] = b'\0' * table_size
    finally:
        # Close explicitly so the change is written out deterministically
        # rather than whenever the object happens to be collected.
        b.close()
def ruin_bloom(bloomfilename):
    """Deliberately corrupt a bloom filter by zeroing its bit table.

    If bloomfilename does not exist, the path is logged, an error is
    recorded via add_error() and nothing else happens.  Otherwise the
    filter is opened read-write and 2**bits bytes starting at offset 16
    of its map are overwritten with NUL characters.
    """
    rbloomfilename = git.repo_rel(bloomfilename)
    if not os.path.exists(bloomfilename):
        log("%s\n" % bloomfilename)
        add_error("bloom: %s not found to ruin\n" % rbloomfilename)
        return
    b = bloom.ShaBloom(bloomfilename, readwrite=True, expected=1)
    try:
        # NOTE(review): offset 16 presumably skips the file header --
        # confirm against the bloom module.
        table_size = 2**b.bits
        b.map[16:16+table_size] = '\0' * table_size
    finally:
        # Close explicitly so the change is written out deterministically
        # rather than whenever the object happens to be collected.
        b.close()
def test_bloom():
    """Exercise bloom filter creation, lookup and default k selection.

    Everything runs inside a temporary directory.  For k in (4, 5) a
    filter is written, reopened, checked for every inserted hash and for
    a low false-positive count on random probes, then removed.  Finally
    the value of k chosen for a small and a large expected size is
    checked; the large case is skipped when the mapping cannot be
    allocated on a 32-bit platform.
    """
    with no_lingering_errors():
        with test_tempdir('bup-tbloom-') as tmpdir:
            bloom_path = tmpdir + '/pybuptest.bloom'
            hashes = [os.urandom(20) for i in range(100)]

            # Minimal stand-in for an idx object.
            class Idx:
                pass

            fake_idx = Idx()
            fake_idx.name = 'dummy.idx'
            fake_idx.shatable = ''.join(hashes)

            for k in (4, 5):
                # Write phase.
                filt = bloom.create(bloom_path, expected=100, k=k)
                filt.add_idx(fake_idx)
                WVPASSLT(filt.pfalse_positive(), .1)
                filt.close()

                # Read phase: every inserted hash must be found.
                filt = bloom.ShaBloom(bloom_path)
                all_present = True
                for sha in hashes:
                    all_present = all_present & filt.exists(sha)
                WVPASS(all_present)

                # Random hashes should almost never be found.
                probes = [os.urandom(20) for i in range(1000)]
                false_positives = sum(1 for sha in probes
                                      if filt.exists(sha))
                WVPASSLT(false_positives, 5)
                os.unlink(bloom_path)

            # Small filter on a caller-supplied file object.
            tf = tempfile.TemporaryFile(dir=tmpdir)
            filt = bloom.create('bup.bloom', f=tf, expected=100)
            WVPASSEQ(filt.rwfile, tf)
            WVPASSEQ(filt.k, 5)

            # Test large (~1GiB) filter.  This may fail on s390 (31-bit
            # architecture), and anywhere else where the address space is
            # sufficiently limited.
            tf = tempfile.TemporaryFile(dir=tmpdir)
            try:
                filt = bloom.create('bup.bloom', f=tf, expected=2**28,
                                    delaywrite=False)
            except EnvironmentError as ex:
                (ptr_width, linkage) = platform.architecture()
                # Anything other than out-of-memory on 32-bit is a real
                # failure.
                if ptr_width != '32bit' or ex.errno != errno.ENOMEM:
                    raise
                WVMSG('skipping large bloom filter test (mmap probably failed) '
                      + str(ex))
            else:
                WVPASSEQ(filt.k, 4)
def test_bloom(tmpdir):
    """Exercise bloom filter creation, lookup and default k selection.

    tmpdir is a bytes directory path in which the scratch files are
    created.  For k in (4, 5) a filter is written, reopened, checked for
    every inserted hash and for a low false-positive count on random
    probes, then removed.  Finally the value of k chosen for a small and
    a large expected size is checked; the large case is skipped when the
    mapping cannot be allocated on a 32-bit platform.
    """
    bloom_path = tmpdir + b'/pybuptest.bloom'
    hashes = [os.urandom(20) for i in range(100)]

    # Minimal stand-in for an idx object; only .name and .shatable are set.
    class Idx:
        pass

    ix = Idx()
    ix.name = b'dummy.idx'
    ix.shatable = b''.join(hashes)

    for k in (4, 5):
        with bloom.create(bloom_path, expected=100, k=k) as b:
            b.add_idx(ix)
            assert b.pfalse_positive() < .1
        with bloom.ShaBloom(bloom_path) as b:
            # Every inserted hash must be reported present.
            assert all(b.exists(h) for h in hashes)
            # Random hashes should almost never be reported present.
            probes = [os.urandom(20) for i in range(1000)]
            false_positives = sum(1 for h in probes if b.exists(h))
            assert false_positives < 5
        os.unlink(bloom_path)

    tf = tempfile.TemporaryFile(dir=tmpdir)
    with bloom.create(b'bup.bloom', f=tf, expected=100) as b:
        assert b.file == tf
        assert b.k == 5

    # Test large (~1GiB) filter.  This may fail on s390 (31-bit
    # architecture), and anywhere else where the address space is
    # sufficiently limited.
    tf = tempfile.TemporaryFile(dir=tmpdir)
    try:
        with bloom.create(b'bup.bloom', f=tf, expected=2**28,
                          delaywrite=False) as b:
            assert b.k == 4
    except EnvironmentError as ex:
        ptr_width = platform.architecture()[0]
        if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
            logging.getLogger().info(
                'skipping large bloom filter test (mmap probably failed) '
                + str(ex))
        else:
            raise
def check_bloom(path, bloomfilename, idx):
    """Check that every object in idx is reported present by the bloom filter.

    path is joined onto idx when idx is a bare file name.  Nothing is
    checked (only a message is logged or an error recorded) when the bloom
    file does not exist, is invalid, or does not list the idx's base name
    among its idxnames.  Each object the filter does not report as present
    is recorded with add_error().
    """
    rbloomfilename = git.repo_rel(bloomfilename)
    ridx = git.repo_rel(idx)
    if not os.path.exists(bloomfilename):
        log('bloom: %s: does not exist.\n' % path_msg(rbloomfilename))
        return
    b = bloom.ShaBloom(bloomfilename)
    try:
        if not b.valid():
            add_error('bloom: %r is invalid.\n' % path_msg(rbloomfilename))
            return
        base = os.path.basename(idx)
        if base not in b.idxnames:
            log('bloom: %s does not contain the idx.\n'
                % path_msg(rbloomfilename))
            return
        if base == idx:
            # A bare file name was given; resolve it relative to path.
            idx = os.path.join(path, idx)
        log('bloom: bloom file: %s\n' % path_msg(rbloomfilename))
        log('bloom: checking %s\n' % path_msg(ridx))
        for objsha in git.open_idx(idx):
            if not b.exists(objsha):
                add_error('bloom: ERROR: object %s missing' % hexstr(objsha))
    finally:
        # Release the filter on every exit path, including the early returns.
        b.close()
def refresh(self, skip_midx=False):
    """Refresh the index list.

    This method verifies if .midx files were superseded (e.g. all of its
    contents are in another, bigger .midx file) and removes the superseded
    files.

    If skip_midx is True, all work on .midx files will be skipped and .midx
    files will be removed from the list.

    The instance variable 'ignore_midx' can force this function to
    always act as if skip_midx was True.

    Any bloom filter held from a previous call is closed first.  A
    'bup.bloom' file found in the directory is reopened, and kept (with
    do_bloom set) only when it is valid and has at least as many entries
    as this list; otherwise it is closed and dropped.
    """
    if self.bloom is not None:
        self.bloom.close()
    self.bloom = None  # Always reopen the bloom as it may have been replaced
    self.do_bloom = False
    skip_midx = skip_midx or self.ignore_midx
    d = dict((p.name, p) for p in self.packs
             if not skip_midx or not isinstance(p, midx.PackMidx))
    if os.path.exists(self.dir):
        if not skip_midx:
            midxl = []
            midxes = set(glob.glob(os.path.join(self.dir, b'*.midx')))
            # remove any *.midx files from our list that no longer exist
            for ix in list(d.values()):
                if not isinstance(ix, midx.PackMidx):
                    continue
                if ix.name in midxes:
                    continue
                # remove the midx
                del d[ix.name]
                ix.close()
                self.packs.remove(ix)
            # Map every idx covered by a surviving midx to that midx.
            for ix in self.packs:
                if isinstance(ix, midx.PackMidx):
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
            # Open midx files not seen before; delete any that refer to
            # an idx that is missing on disk.
            for full in midxes:
                if not d.get(full):
                    mx = midx.PackMidx(full)
                    (mxd, mxf) = os.path.split(mx.name)
                    broken = False
                    for n in mx.idxnames:
                        if not os.path.exists(os.path.join(mxd, n)):
                            log(('warning: index %s missing\n'
                                 ' used by %s\n')
                                % (path_msg(n), path_msg(mxf)))
                            broken = True
                    if broken:
                        mx.close()
                        del mx
                        unlink(full)
                    else:
                        midxl.append(mx)
            # Consider the largest (then most recently modified) first.
            midxl.sort(
                key=lambda ix: (-len(ix), -xstat.stat(ix.name).st_mtime))
            for ix in midxl:
                any_needed = False
                for sub in ix.idxnames:
                    found = d.get(os.path.join(self.dir, sub))
                    if not found or isinstance(found, PackIdx):
                        # doesn't exist, or exists but not in a midx
                        any_needed = True
                        break
                if any_needed:
                    d[ix.name] = ix
                    for name in ix.idxnames:
                        d[os.path.join(self.dir, name)] = ix
                elif not ix.force_keep:
                    debug1('midx: removing redundant: %s\n'
                           % path_msg(os.path.basename(ix.name)))
                    ix.close()
                    unlink(ix.name)
        for full in glob.glob(os.path.join(self.dir, b'*.idx')):
            if not d.get(full):
                try:
                    ix = open_idx(full)
                except GitError as e:
                    add_error(e)
                    continue
                d[full] = ix
        bfull = os.path.join(self.dir, b'bup.bloom')
        if self.bloom is None and os.path.exists(bfull):
            self.bloom = bloom.ShaBloom(bfull)
        self.packs = list(set(d.values()))
        self.packs.sort(reverse=True, key=lambda x: len(x))
        if (self.bloom and self.bloom.valid()
                and len(self.bloom) >= len(self)):
            self.do_bloom = True
        else:
            # Unusable filter: close it explicitly rather than merely
            # dropping the reference.
            if self.bloom is not None:
                self.bloom.close()
            self.bloom = None
    debug1('PackIdxList: using %d index%s.\n'
           % (len(self.packs), len(self.packs) != 1 and 'es' or ''))
def do_bloom(path, outfilename, k, force):
    """Create or update the bloom filter outfilename from path's *.idx files.

    k must be None, 4 or 5.  An existing valid filter is extended in place
    unless force is set, its entry count does not match the idx files it
    already lists, k differs from its k, or adding the new entries would
    exceed bloom.MAX_PFALSE_POSITIVE while it is below
    bloom.MAX_BLOOM_BITS for its k.  In those cases a new filter is built
    from all idx files in 'bup.tmp.bloom' under path and renamed over
    outfilename.  Returns without changes when no idx file needs adding.
    """
    global _first
    assert k in (None, 4, 5)
    b = None
    tfname = None
    try:
        if os.path.exists(outfilename) and not force:
            b = bloom.ShaBloom(outfilename)
            if not b.valid():
                debug1("bloom: Existing invalid bloom found, regenerating.\n")
                b.close()
                b = None

        # Split idx files into those already in the filter (rest) and
        # those still to be added (add).
        add = []
        rest = []
        add_count = 0
        rest_count = 0
        for i, name in enumerate(glob.glob(b'%s/*.idx' % path)):
            progress('bloom: counting: %d\r' % i)
            ix = git.open_idx(name)
            ixbase = os.path.basename(name)
            # Compare against None: the filter supports len(), so plain
            # truthiness could treat an existing empty filter as absent.
            if b is not None and (ixbase in b.idxnames):
                rest.append(name)
                rest_count += len(ix)
            else:
                add.append(name)
                add_count += len(ix)

        if not add:
            debug1("bloom: nothing to do.\n")
            return

        if b is not None:
            reuse = False
            if len(b) != rest_count:
                debug1("bloom: size %d != idx total %d, regenerating\n"
                       % (len(b), rest_count))
            elif k is not None and k != b.k:
                debug1("bloom: new k %d != existing k %d, regenerating\n"
                       % (k, b.k))
            elif (b.bits < bloom.MAX_BLOOM_BITS[b.k]
                  and b.pfalse_positive(add_count)
                  > bloom.MAX_PFALSE_POSITIVE):
                debug1("bloom: regenerating: adding %d entries gives "
                       "%.2f%% false positives.\n"
                       % (add_count, b.pfalse_positive(add_count)))
            else:
                reuse = True
            # The read-only handle is finished either way; close it before
            # reopening for writing or building a replacement.
            stale, b = b, None
            stale.close()
            if reuse:
                b = bloom.ShaBloom(outfilename, readwrite=True,
                                   expected=add_count)

        if b is None:  # Need all idxs to build from scratch
            add += rest
            add_count += rest_count
        del rest
        del rest_count

        msg = 'creating from' if b is None else 'adding'
        if not _first:
            _first = path
        dirprefix = git.repo_rel(path) + b': ' if _first != path else b''
        progress('bloom: %s%s %d file%s (%d object%s).\r'
                 % (path_msg(dirprefix), msg,
                    len(add), 's' if len(add) != 1 else '',
                    add_count, 's' if add_count != 1 else ''))

        if b is None:
            tfname = os.path.join(path, b'bup.tmp.bloom')
            b = bloom.create(tfname, expected=add_count, k=k)
        icount = 0
        for name in add:
            ix = git.open_idx(name)
            qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                      % (icount * 100.0 / add_count, icount, add_count))
            b.add_idx(ix)
            icount += len(ix)
    finally:
        # Currently, there's an open file object for tfname inside b.
        # Make sure it's closed before rename (and on any error path).
        if b is not None:
            b.close()

    if tfname:
        os.rename(tfname, outfilename)
def do_bloom(path, outfilename):
    """Create or update the bloom filter outfilename from path's *.idx files.

    Reads opt.force and opt.k.  An existing valid filter is extended in
    place unless opt.force is set, its entry count does not match the idx
    files it already lists, or adding the new entries would exceed
    bloom.MAX_PFALSE_POSITIVE while it is below bloom.MAX_BLOOM_BITS.  In
    those cases a new filter is built from all idx files in
    'bup.tmp.bloom' under path and renamed over outfilename.  Returns
    without changes when no idx file needs adding.
    """
    global _first
    b = None
    tf = None
    tfname = None
    try:
        if os.path.exists(outfilename) and not opt.force:
            b = bloom.ShaBloom(outfilename)
            if not b.valid():
                debug1("bloom: Existing invalid bloom found, regenerating.\n")
                b.close()
                b = None

        # Split idx files into those already in the filter (rest) and
        # those still to be added (add).
        add = []
        rest = []
        add_count = 0
        rest_count = 0
        for i, name in enumerate(glob.glob('%s/*.idx' % path)):
            progress('bloom: counting: %d\r' % i)
            ix = git.open_idx(name)
            ixbase = os.path.basename(name)
            # Compare against None: the filter supports len(), so plain
            # truthiness could treat an existing empty filter as absent.
            if b is not None and (ixbase in b.idxnames):
                rest.append(name)
                rest_count += len(ix)
            else:
                add.append(name)
                add_count += len(ix)

        if not add:
            debug1("bloom: nothing to do.\n")
            return

        if b is not None:
            reuse = False
            if len(b) != rest_count:
                debug1("bloom: size %d != idx total %d, regenerating\n"
                       % (len(b), rest_count))
            elif (b.bits < bloom.MAX_BLOOM_BITS
                  and b.pfalse_positive(add_count)
                  > bloom.MAX_PFALSE_POSITIVE):
                debug1("bloom: regenerating: adding %d entries gives "
                       "%.2f%% false positives.\n"
                       % (add_count, b.pfalse_positive(add_count)))
            else:
                reuse = True
            # The read-only handle is finished either way; close it before
            # reopening for writing or building a replacement.
            stale, b = b, None
            stale.close()
            if reuse:
                b = bloom.ShaBloom(outfilename, readwrite=True,
                                   expected=add_count)

        if b is None:  # Need all idxs to build from scratch
            add += rest
            add_count += rest_count
        del rest
        del rest_count

        msg = 'creating from' if b is None else 'adding'
        if not _first:
            _first = path
        dirprefix = git.repo_rel(path) + ': ' if _first != path else ''
        progress('bloom: %s%s %d file%s (%d object%s).\n'
                 % (dirprefix, msg,
                    len(add), 's' if len(add) != 1 else '',
                    add_count, 's' if add_count != 1 else ''))

        if b is None:
            tfname = os.path.join(path, 'bup.tmp.bloom')
            tf = open(tfname, 'w+')
            b = bloom.create(tfname, f=tf, expected=add_count, k=opt.k)
        icount = 0
        for name in add:
            ix = git.open_idx(name)
            qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                      % (icount * 100.0 / add_count, icount, add_count))
            b.add_idx(ix)
            icount += len(ix)
    finally:
        # Close the filter, then the temporary file backing it, so the
        # data is on disk before the rename and nothing is left open on
        # an error path.  Closing an already-closed file is harmless.
        if b is not None:
            b.close()
        if tf is not None:
            tf.close()

    if tfname:
        os.rename(tfname, outfilename)