def rotateFilters(self):
    assert self.R * self.counters[self.first] >= self.capacity_per_filter
    # clear the oldest filter and make it the new, empty "first" filter
    last = (self.first + 2) % 3
    self.cfs[last] = pydablooms.Dablooms(capacity=self.capacity_per_filter,
                                         error_rate=FPP_RATE,
                                         filepath="%d_%s" % (last, self.filepath))
    self.counters[last] = 0
    self.first = last
    self.rotations += 1
def __init__(self, gt, lt):
    self.gt = int(gt)
    self.lt = int(lt)
    # DABLOOMS_* constants and `settings` are module-level names defined
    # elsewhere in the project; fall back to no filter if pydablooms was
    # never imported.
    if 'pydablooms' in sys.modules:
        self.bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                                         error_rate=DABLOOMS_ERROR_RATE,
                                         filepath=DABLOOMS_FILEPATH)
    else:
        self.bloom = None
    host = settings.get('MONGOD_HOST', MONGOD_HOST)
    port = settings.get('MONGOD_PORT', MONGOD_PORT)
    self.db = _default_mongo(host, port, usedb='master_timeline')
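# A hedged sketch of how a caller might consult the optional filter above;
# the `seen` method name and the mid-based Mongo lookup are assumptions,
# not part of the original class.
def seen(self, mid):
    # Bloom filters have no false negatives, so a miss is definitely new.
    if self.bloom is not None and not self.bloom.check(str(mid)):
        return False
    # Confirm (or fall back) against the authoritative MongoDB store.
    return self.db.master_timeline_weibo.find_one({'mid': mid}) is not None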
def __init__(self, R=1, capacity=0, filepath="ghostlist.bf"):
    self.filepath = filepath
    # three filters, each sized to half the target capacity, so the live
    # filters together cover between 1.0x and 1.5x `capacity` keys
    self.capacity_per_filter = int(ceil(0.5 * capacity))
    self.cfs = []
    self.counters = []
    for i in xrange(3):
        cf = pydablooms.Dablooms(capacity=self.capacity_per_filter,
                                 error_rate=FPP_RATE,
                                 filepath="%d_%s" % (i, self.filepath))
        self.cfs.append(cf)
        self.counters.append(0)
    self.total_count = 0
    self.first = 0
    self.capacity = capacity
    self.ghosthits = 0
    self.ghostmisses = 0
    self.rotations = 0
    self.R = R
    print "R=%d" % R
    self.Rignored = 0
    self.Rpassed = 0
    self.ghostplus = [0.0 for c in xrange(capacity)]
    self.pdf = [0.0 for j in xrange(int(ceil(1.5 * capacity)))]
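# A standalone sketch of the three-filter rotation scheme used by the class
# above (the filter size, file names, and FPP_RATE value here are
# illustrative assumptions). With each filter at half capacity, rotation
# keeps roughly the last 1.0-1.5x capacity of keys queryable.
import pydablooms

FPP_RATE = 0.01
CAP_PER_FILTER = 500

filters = [pydablooms.Dablooms(capacity=CAP_PER_FILTER, error_rate=FPP_RATE,
                               filepath="demo_%d.bf" % i) for i in xrange(3)]
counts = [0, 0, 0]
first = 0

def remember(key, seq):
    global first
    filters[first].add(key, seq)
    counts[first] += 1
    if counts[first] >= CAP_PER_FILTER:
        # same move as rotateFilters above: recycle the oldest filter
        last = (first + 2) % 3
        filters[last] = pydablooms.Dablooms(capacity=CAP_PER_FILTER,
                                            error_rate=FPP_RATE,
                                            filepath="demo_%d.bf" % last)
        counts[last] = 0
        first = last

def recall(key):
    # a hit in any live filter counts as a ghost hit
    return any(f.check(key) for f in filters)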
#!/usr/bin/env python
import pydablooms

CAPACITY = 5000
ERROR_RATE = float(1) / CAPACITY

bloom = pydablooms.Dablooms(capacity=CAPACITY, error_rate=ERROR_RATE,
                            filepath='bloom.bin')

# Extract class names from classes.txt; matching lines look like: class 'Name'
f = open('classes.txt', 'r')
g = open('classnames.txt', 'w')
for line in f:
    fields = line.split("'")
    if fields[0] == 'class ':
        g.write(fields[1] + '\n')
f.close()
g.close()

# Add each class name to the filter, using its line number as the id.
h = open('classnames.txt', 'rb')
i = 0
for line in h:
    bloom.add(line.rstrip(), i)
    i += 1
h.close()
bloom.flush()
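# A quick hedged check sketch: query a couple of names against the filter
# just built (the names below are made up; at this error rate roughly 1 in
# 5000 absent keys will still report as present).
for name in ['SomeClassName', 'DefinitelyNotAClass']:
    print "%s: %s" % (name, bool(bloom.check(name)))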
#!/usr/bin/python
import pydablooms
import sys
import md5

sys.path.append('..')
import known_libs

from androguard.core.bytecode import *
from androguard.core.bytecodes.apk import *
from androguard.core.analysis.analysis import *

bloom = pydablooms.Dablooms(capacity=10000000, error_rate=.05,
                            filepath='libs.bbf')

def isLibraryClass(classname, libs=None):
    # Return True if classname belongs to a known library package.
    package_method = False
    if libs is None:
        for package in known_libs.known_libs:
            # known_libs stores dotted package names; convert them to the
            # Dalvik descriptor prefix used in classnames, e.g. "Lcom/foo/"
            package_name = "L" + package + "/"
            package_name = package_name.replace(".", "/")
            if package_name in classname:
                package_method = True
                break
    else:
        for package in libs:
            if package in classname:
                package_method = True
                break
    return package_method
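# A small usage sketch; the Dalvik-style class names below are made-up
# examples of the "Lcom/vendor/pkg/Class;" form the function matches on.
if __name__ == '__main__':
    # True only if 'com.google.ads' appears in known_libs.known_libs
    print isLibraryClass("Lcom/google/ads/AdView;")
    # explicit libs list: matches on plain substring
    print isLibraryClass("Lcom/foo/Bar;", libs=["Lcom/foo/"])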
# Requires time, sys and pydablooms, plus the fasta_iterator, disambiguate,
# make_deletions, make_variants, make_inserts and reverse_complement
# helpers defined elsewhere in this script.
def build_filter(bloom_filename, linear_refs, circular_refs, kmer,
                 mismatches, inserts, deletions, error_rate=0.01, rc=True):
    # Using 5e-06 is close to a set for my example, both in run time
    # (a fraction more) and the number of reads kept (9528 vs 8058
    # with sets).
    simple = set()
    del_hashes = set()
    count = 0
    t0 = time.time()
    if linear_refs:
        for fasta in linear_refs:
            sys.stderr.write("Hashing linear references in %s\n" % fasta)
            handle = open(fasta)
            for upper_seq, raw_read in fasta_iterator(handle):
                #assert set(upper_seq).issubset("ACGT"), "%s contains %s" \
                #    % (raw_read.split("\n", 1)[0], set(upper_seq).difference("ACGT"))
                # Note we do the disambiguate call on the fragments rather than
                # the whole reference to avoid too many levels of recursion.
                for i in range(0, len(upper_seq) - kmer):
                    for fragment in disambiguate(upper_seq[i:i + kmer]):
                        assert set(fragment).issubset("ACGT"), fragment
                        simple.add(fragment)
                        #bloom.add(fragment, kmer)
                        count += 1
                        #TODO - Can do this in one go from len(upper_seq)
                if deletions:
                    for i in range(0, len(upper_seq) - kmer + 1):
                        for fragment in make_deletions(upper_seq[i:i + kmer + 1]):
                            del_hashes.add(fragment)
            handle.close()
    if circular_refs:
        for fasta in circular_refs:
            sys.stderr.write("Hashing circular references in %s\n" % fasta)
            handle = open(fasta)
            for upper_seq, raw_read in fasta_iterator(handle):
                #assert set(upper_seq).issubset("ACGT"), "%s contains %s" \
                #    % (raw_read.split("\n", 1)[0], set(upper_seq).difference("ACGT"))
                # Want to consider wrapping round the origin, add k-mer length:
                upper_seq += upper_seq[:kmer]
                for i in range(0, len(upper_seq) - kmer):
                    for fragment in disambiguate(upper_seq[i:i + kmer]):
                        assert set(fragment).issubset("ACGT"), fragment
                        simple.add(fragment)
                        #bloom.add(fragment, kmer)
                        count += 1
                        #TODO - Can do this in one go from len(upper_seq)
                if deletions:
                    for i in range(0, len(upper_seq) - kmer + 1):
                        for fragment in make_deletions(upper_seq[i:i + kmer + 1]):
                            del_hashes.add(fragment)
            handle.close()
    if rc:
        # Would popping be slow? Should mean less memory at once
        temp = simple.copy()
        for fragment in temp:
            simple.add(reverse_complement(fragment))
        del temp
    if mismatches or inserts or deletions:
        sys.stderr.write("Have %i unique k-mers before considering fuzzy matches\n"
                         % len(simple))
        if deletions:
            # Do this first to avoid 3 large sets in memory!
            new = del_hashes
            del del_hashes
            new.update(simple)
            sys.stderr.write("Adding deletions brings this to %i unique k-mers\n"
                             % len(new))
        else:
            new = simple.copy()
        if mismatches:
            for fragment in simple:
                for var in make_variants(fragment, mismatches):
                    new.add(var)
            sys.stderr.write("Adding %i mis-matches per k-mer, have %i unique k-mers\n"
                             % (mismatches, len(new)))
        if inserts:
            for fragment in simple:
                for var in make_inserts(fragment):
                    new.add(var)
            sys.stderr.write("Adding inserts brings this to %i unique k-mers\n"
                             % len(new))
        simple = new
    capacity = len(simple)
    bloom = pydablooms.Dablooms(capacity, error_rate, bloom_filename)
    for fragment in simple:
        # pydablooms' add() takes (key, id); the id only matters for
        # delete(), which is never called on this filter
        bloom.add(fragment, kmer)
    bloom.flush()
    sys.stderr.write("Set and bloom filter of %i-mers created "
                     "(%i k-mers considered, %i unique)\n"
                     % (kmer, count, len(simple)))
    sys.stderr.write("Using Bloom filter with capacity %i and error rate %r\n"
                     % (capacity, error_rate))
    sys.stderr.write("Building filters took %0.1fs\n" % (time.time() - t0))
    return simple, bloom
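# A hedged sketch of how the returned (set, bloom) pair might be used when
# screening reads: check the fast bloom filter first (no false negatives),
# then optionally confirm hits against the exact set. `read_matches` and
# the accept-on-any-k-mer policy are assumptions, not the original code.
def read_matches(read_seq, bloom, simple, kmer, confirm=False):
    for i in range(len(read_seq) - kmer + 1):
        fragment = read_seq[i:i + kmer]
        if bloom.check(fragment) and (not confirm or fragment in simple):
            return True
    return False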
# -*- coding: utf-8 -*-
# Add the weibo statuses already stored in master_timeline to the dablooms set
import pydablooms
import time

from utils4scrapy.tk_maintain import _default_mongo

MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017
DABLOOMS_CAPACITY = 2000000000
DABLOOMS_ERROR_RATE = .001
DABLOOMS_FILEPATH = '/opt/scrapy_weibo/scrapy_weibo/bloom.bin'
#DABLOOMS_FILEPATH = '/tmp/bloom.bin'

bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                            error_rate=DABLOOMS_ERROR_RATE,
                            filepath=DABLOOMS_FILEPATH)
db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb='master_timeline')

# key each status by its mid, with a millisecond timestamp as the id
for status in db.master_timeline_weibo.find():
    bloom.add(status['mid'], int(time.time() * 1000))
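# A minimal sketch of the consuming side (hypothetical, not from the
# original project): the scraper can skip any status whose mid is already
# in the filter, at the cost of rare false positives at DABLOOMS_ERROR_RATE.
def is_new_status(mid):
    # no false negatives: a miss means the status was definitely never added
    return not bloom.check(mid)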
import sys

import pydablooms

capacity = 100000
error_rate = 0.05

print("pydablooms version: %s" % pydablooms.__version__)

if len(sys.argv) != 3:
    sys.stderr.write("Usage: %s <bloom_file> <words_file>\n" % sys.argv[0])
    sys.exit(1)

bloom_fname = sys.argv[1]
words_fname = sys.argv[2]

bloom = pydablooms.Dablooms(capacity=capacity, error_rate=error_rate,
                            filepath=bloom_fname)

words_file = open(words_fname, 'rb')

# add every word, using its line number as the id
i = 0
for line in words_file:
    bloom.add(line.rstrip(), i)
    i += 1

# delete every fifth word
words_file.seek(0)
i = 0
for line in words_file:
    if i % 5 == 0:
        bloom.delete(line.rstrip(), i)
    i += 1
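# A hedged continuation sketch: flush, then re-scan the words file and
# tally accuracy. Kept words must always be found (bloom filters have no
# false negatives); deleted words may still hit at roughly error_rate.
bloom.flush()

words_file.seek(0)
false_positives = 0
false_negatives = 0
i = 0
for line in words_file:
    present = bool(bloom.check(line.rstrip()))
    expected = (i % 5 != 0)
    if present and not expected:
        false_positives += 1
    elif expected and not present:
        false_negatives += 1
    i += 1
words_file.close()

print("false positives: %d (a few expected), false negatives: %d (expect 0)"
      % (false_positives, false_negatives))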