class TestBloomFilter(unittest.TestCase):
    """Check ``BloomFilter.lookup`` on a filter pre-loaded through ``add``."""

    def setUp(self):
        """Create the filter under test and add four known items to it."""
        self.size = 500000
        self.hash_count = 7
        self.bf = BloomFilter(self.size, self.hash_count)
        for word in ('abc', 'xyz', 'foo', 'bar'):
            self.bf.add(word)

    def tearDown(self):
        """Release the filter after every test."""
        self._cleanup()

    def _initialize(self):
        """Placeholder hook; intentionally does nothing."""
        return None

    def _cleanup(self):
        """Drop the filter reference, leaving ``self.bf`` set to ``None``."""
        if not self.bf:
            return
        del self.bf
        self.bf = None

    def test_lookup_yes(self):
        """An item added in ``setUp`` must be reported as present."""
        found = self.bf.lookup('foo')
        self.assertEqual(found, True)

    def test_lookup_no(self):
        """An item that was never added must be reported as absent."""
        found = self.bf.lookup('hello')
        self.assertEqual(found, False)
class TestBloomFilter(unittest.TestCase):
    """Check ``BloomFilter.lookup`` for inserted and never-inserted strings."""

    def setUp(self):
        """Build a ``BloomFilter(256)`` and insert every string expected to be found."""
        self.bf = BloomFilter(256)

        self.existing_strings = [
            'tiny', 'bloom', 'rate', 'back',
            'apple', 'google', 'dijkstra', 'limiter',
            'url', 'travel', 'man', '2',
        ]
        self.non_existing_strings = [
            'multi', 'short', 'path', 'components',
            'connect', 'unit', 'test',
        ]

        for word in self.existing_strings:
            self.bf.insert(word)

    def test_of_bloomfilter(self):
        """Inserted strings are found; the others are expected to be absent."""
        lookup = self.bf.lookup

        for present in self.existing_strings:
            self.assertTrue(lookup(present))

        # A Bloom filter can report false positives, so this half of the
        # test may fail for an unlucky string.
        for absent in self.non_existing_strings:
            self.assertFalse(lookup(absent))
class DuplicatesPipeline(object):
    """Pipeline stage that drops items whose URL has already been seen.

    URLs are compared with their query string removed (everything from the
    first ``?`` onwards).  Each item that passes is recorded in the bloom
    filter, appended to the ``visitedsites`` file and handed to the search
    index.
    """

    def __init__(self):
        """Create the bloom filter, open the log file and start the index."""
        # NOTE(review): arguments look like (capacity, error rate, backing
        # file) -- confirm against the BloomFilter implementation in use.
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0  # number of items accepted so far

    @staticmethod
    def _dedup_key(url):
        """Return *url* without its query string (text before the first '?')."""
        # partition() returns the whole string when '?' is absent.  Slicing
        # with find() would use -1 in that case and chop off the last
        # character of the URL.
        return url.partition('?')[0]

    def process_item(self, item, spider):
        """Accept *item* if its URL is new, otherwise raise ``DropItem``.

        Args:
            item: mapping that provides at least ``'url'`` and ``'title'``.
            spider: unused; part of the pipeline call signature.

        Returns:
            The same *item*, after it has been recorded and indexed.

        Raises:
            DropItem: if the bloom filter reports the URL as already seen.
        """
        key = self._dedup_key(item['url'])
        if self.bf.lookup(key):
            raise DropItem("Duplicate item found: %s" % item)

        self.count_num += 1
        self.bf.add(key)
        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        # Parenthesised single-argument form prints the same on Python 2
        # and Python 3.
        print(self.count_num)
        return item

    def save_to_file(self, url, utitle):
        """Append one ``url<TAB>title`` line to the visited-sites file."""
        # NOTE(review): writing the UTF-8 encoded title to a file opened in
        # text mode relies on Python 2 file semantics -- confirm before
        # running this under Python 3.
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Close the log file and finalise the search index."""
        # __init__ may have failed before these attributes were assigned,
        # so look them up defensively instead of raising AttributeError.
        log_file = getattr(self, 'f_write', None)
        if log_file is not None:
            log_file.close()
        index = getattr(self, 'si', None)
        if index is not None:
            index.IndexDone()