Esempio n. 1
0
class TestBloomFilter(unittest.TestCase):
    """Lookup checks against a BloomFilter pre-loaded with four fixture keys."""

    def setUp(self):
        """Build a fresh filter and add the fixture keys to it."""
        self.size = 500000
        self.hash_count = 7
        self.bf = BloomFilter(self.size, self.hash_count)

        for key in ['abc', 'xyz', 'foo', 'bar']:
            self.bf.add(key)

    def tearDown(self):
        """Release the filter created by setUp."""
        self._cleanup()

    def _initialize(self):
        """Placeholder hook; intentionally does nothing."""
        pass

    def _cleanup(self):
        """Drop the filter reference, leaving ``self.bf`` set to None."""
        if not self.bf:
            return
        del self.bf
        self.bf = None

    def test_lookup_yes(self):
        # 'foo' was added in setUp, so the lookup must report it as present.
        found = self.bf.lookup('foo')
        self.assertEqual(found, True)

    def test_lookup_no(self):
        # 'hello' was never added, so the lookup is expected to report a miss.
        found = self.bf.lookup('hello')
        self.assertEqual(found, False)
class TestBloomFilter(unittest.TestCase):
    """Checks BloomFilter(256) lookups for inserted and never-inserted words."""

    def setUp(self):
        """Prepare both word lists and insert the 'existing' ones."""
        # Words inserted into the filter; each must be found afterwards.
        self.existing_strings = [
            'tiny',
            'bloom',
            'rate',
            'back',
            'apple',
            'google',
            'dijkstra',
            'limiter',
            'url',
            'travel',
            'man',
            '2',
        ]
        # Words that are never inserted; lookups on them are expected to miss.
        self.non_existing_strings = [
            'multi',
            'short',
            'path',
            'components',
            'connect',
            'unit',
            'test',
        ]

        self.bf = BloomFilter(256)
        for word in self.existing_strings:
            self.bf.insert(word)

    def test_of_bloomfilter(self):
        # Every inserted word must be reported as present.
        for word in self.existing_strings:
            found = self.bf.lookup(word)
            self.assertTrue(found)

        # A false positive from the filter makes the assertion below fail,
        # so this half of the test is only as reliable as the filter is.
        for word in self.non_existing_strings:
            found = self.bf.lookup(word)
            self.assertFalse(found)
Esempio n. 3
0
class DuplicatesPipeline(object):
    """Pipeline stage that drops items whose URL was already processed.

    Two URLs count as the same page when they are equal after removing the
    query string (everything from the first '?'). Membership is tracked in
    a BloomFilter; every item that passes is appended to the 'visitedsites'
    file, handed to the search index, and counted in ``count_num``.
    """

    def __init__(self):
        # NOTE(review): the BloomFilter arguments look like (capacity,
        # error rate, backing file) -- confirm against the BloomFilter in use.
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0

    @staticmethod
    def _strip_query(url):
        """Return *url* without its query string.

        A URL containing no '?' is returned unchanged. The previous
        ``url[:url.find('?')]`` form sliced with -1 in that case and cut off
        the last character, so distinct pages such as '/page1' and '/page2'
        shared one key and were wrongly dropped as duplicates.
        """
        return url.partition('?')[0]

    def process_item(self, item, spider):
        """Return *item* if its URL is new, otherwise raise DropItem.

        Args:
            item: mapping that provides at least 'url' and 'title'.
            spider: the spider that produced the item (unused here).

        Raises:
            DropItem: when the query-stripped URL is already in the filter.
        """
        dedup_key = self._strip_query(item['url'])
        if self.bf.lookup(dedup_key):
            raise DropItem("Duplicate item found: %s" % item)

        self.count_num += 1
        self.bf.add(dedup_key)
        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        # Parenthesised single-argument form prints the same on Python 2 and
        # also parses on Python 3.
        print(self.count_num)
        return item

    def save_to_file(self, url, utitle):
        """Append one 'url<TAB>title' line to the visited-sites file.

        The title is UTF-8 encoded before being written. The pieces are
        written separately so the URL text and the encoded title bytes are
        never concatenated with each other.
        """
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Close the output file and finalize the search index.

        Attributes are fetched defensively because __del__ also runs on an
        instance whose __init__ raised before assigning them.
        """
        out_file = getattr(self, 'f_write', None)
        if out_file is not None:
            out_file.close()
        index = getattr(self, 'si', None)
        if index is not None:
            index.IndexDone()