def test_derandomize_ensures_comparable_filters(self):
    """Filters built under identical derandomized seeds share raw data."""
    with bloomfilter.util.derandomize():
        first = bloomfilter.BloomFilter(10, 0.1)
    with bloomfilter.util.derandomize():
        second = bloomfilter.BloomFilter(10, 0.1)
    # Same seed, same parameters -> bit-for-bit identical backing storage.
    self.assertEqual(first.raw_data(), second.raw_data())
def test_derandomize_ensures_serialization_is_consistent(self):
    """Filters built under the same explicit seed serialize identically."""
    with bloomfilter.util.derandomize(234):
        first = bloomfilter.BloomFilter(10, 0.1)
    with bloomfilter.util.derandomize(234):
        second = bloomfilter.BloomFilter(10, 0.1)
    # An explicit seed (234) must make serialization fully reproducible.
    self.assertEqual(first.serialize(), second.serialize())
def test_membership(self):
    """Inserted keys are immediately visible via both lookup styles."""
    for ordering in itertools.permutations(strings, len(strings)):
        # Fresh filter per insertion order so orders don't interfere.
        bf = bloomfilter.BloomFilter(10, MyString)
        for key in ordering:
            bf[key] = True
            # Both __getitem__ and __contains__ must report membership.
            self.assertTrue(bf[key])
            self.assertTrue(key in bf)
def hash_value_raises_hash_func_exception(self, return_val, maxsize=10):
    """Assert that a bloom-hash method returning *return_val* raises.

    Builds a throwaway class whose decorated hash method yields
    ``return_val`` and checks that invoking it triggers
    ``BloomFilterHashFunctionException``.
    """
    class BadHash:
        @bloom_hash
        def bad_hash(self, foo, bar):
            return return_val

    # Filter construction registers BadHash's decorated hash methods.
    bloom = bloomfilter.BloomFilter(maxsize, BadHash)
    instance = BadHash()
    with self.assertRaises(bloomfilter.BloomFilterHashFunctionException):
        # NOTE(review): called without foo/bar — presumably the bloom_hash
        # decorator supplies them; confirm against the decorator's contract.
        instance.bad_hash()
def test1(): bf = bloomfilter.BloomFilter() # start_cha = 0 end_cha = 10000000 # start_not = 10000001 end_not = 20000000 #insert 1 ~ 100w for i in xrange(start_cha, end_cha): bf.Add(str(i)) print "Dump before." bf.Dump(FILE) print "Dump done."
def __init__(self):
    """Set up record buffers, a duplicate-detecting bloom filter, and a DB cursor."""
    self.raw_datas = []               # dataset: raw scraped records
    self.datasWithoutClear = []
    self.dupli_count = 0              # counter: duplicate records seen
    self.now = datetime.date.today()  # date stamped onto inserted records
    # Bloom filter for duplicate detection (error rate 0.001, capacity 1,000,000).
    self.key_infos = bloomfilter.BloomFilter(0.001, 1000000)
    try:
        self.conn = ToolsBox.get_database()
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # still propagate. Connection failure is reported but not re-raised,
        # matching the original best-effort behavior (the cursor line below
        # will then fail loudly on the missing attribute).
        print("初始化时Connect failed")
    # DictCursor: rows come back as dicts keyed by column name.
    self.cur = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
def test2(): bf = bloomfilter.BloomFilter(FILE) # start_cha = 0 end_cha = 10000000 # start_not = 10000001 end_not = 20000000 #check for FN cnt = 0 for i in xrange(start_cha, end_cha): if not bf.Test(str(i)): cnt += 1 print "FN", cnt #check for FP cnt = 0 for i in xrange(start_not, end_not): if bf.Test(str(i)): cnt += 1 print "FP", cnt
def test_bad_bits_per_table(self):
    """Zero or negative bit counts are rejected at construction time."""
    for bad_size in (0, -1234):
        with self.assertRaises(bloomfilter.BloomFilterException):
            bloomfilter.BloomFilter(bad_size, MyString)
def test_empty(self):
    """A freshly built filter reports no members at all."""
    bf = bloomfilter.BloomFilter(10, MyString)
    for key in strings:
        self.assertFalse(bf[key])
def test_class_wo_bloom_hashes(self):
    """A key class that declares no bloom-hash methods cannot back a filter."""
    class NoHashes:
        pass

    with self.assertRaises(bloomfilter.BloomFilterException):
        bloomfilter.BloomFilter(10, NoHashes)
count += 1 q.put(url) else: break crawled.set(page) varLock.release() q.task_done() start = time.time() seed = "http://www.baidu.com" q = Queue.Queue() q.put(seed) count = 1 max_page = 20 crawled = bloomfilter.BloomFilter(20*max_page, 5) THREAD_NUM = 10 threads = [] varLock = threading.Lock() for i in range(THREAD_NUM): t = threading.Thread(target = working) t.setDaemon(True) threads.append(t) t.start() q.join() print "It uses", time.time()-start,"sec to parse", max_page,"pages with", THREAD_NUM, "threads."
def make_bloomFilter(L):
    """Create a new empty bloom filter and add every element of L to it."""
    # Fixed construction parameters (100, 10) as used throughout this module;
    # presumably size and hash count — confirm against bf.BloomFilter's API.
    result = bf.BloomFilter(100, 10)
    for element in L:
        result.add(element)
    return result
def setUp(self):
    """Give each test a fresh BloomFilter built with argument 50."""
    self.bf = bloomfilter.BloomFilter(50)
import urls_create
import bloomfilter

# Fetch the URL list and report which entries the filter has already seen.
urls = urls_create.Get_urls()
# print(urls)
bf = bloomfilter.BloomFilter()

duplicates = []
for url in urls:
    if bf.isContains(url, "testurl"):
        # Already recorded under the "testurl" key: report as a duplicate.
        print(url)
        duplicates.append(url)
    else:
        bf.insert(url, "testurl")

print(len(duplicates))