Beispiel #1
0
    def test_derandomize_ensures_comparable_filters(self):
        """Two filters built under derandomize() expose equal raw data."""
        built = []
        # Each filter is constructed inside its own derandomize() context.
        for _ in range(2):
            with bloomfilter.util.derandomize():
                built.append(bloomfilter.BloomFilter(10, 0.1))

        first_filter, second_filter = built
        self.assertEqual(first_filter.raw_data(), second_filter.raw_data())
Beispiel #2
0
    def test_derandomize_ensures_serialization_is_consistent(self):
        """Two filters built under derandomize(234) serialize identically."""
        built = []
        # Each filter is constructed inside its own derandomize(234) context.
        for _ in range(2):
            with bloomfilter.util.derandomize(234):
                built.append(bloomfilter.BloomFilter(10, 0.1))

        first_filter, second_filter = built
        self.assertEqual(first_filter.serialize(), second_filter.serialize())
Beispiel #3
0
 def test_membership(self):
     """Every item set on a filter is reported via indexing and ``in``.

     Repeated for every full-length ordering of ``strings``, with a fresh
     filter per ordering.
     """
     # permutations() defaults to full-length orderings of its input.
     for ordering in itertools.permutations(strings):
         fresh_filter = bloomfilter.BloomFilter(10, MyString)
         for item in ordering:
             fresh_filter[item] = True
             self.assertTrue(fresh_filter[item])
             self.assertTrue(item in fresh_filter)
Beispiel #4
0
    def hash_value_raises_hash_func_exception(self, return_val, maxsize=10):
        """Assert that a hash method returning ``return_val`` is rejected.

        Defines a throwaway class whose ``@bloom_hash``-decorated method
        returns ``return_val``, constructs a ``BloomFilter`` from ``maxsize``
        and that class, and then expects a call to the decorated method to
        raise ``bloomfilter.BloomFilterHashFunctionException``.

        Args:
            return_val: Value the decorated hash method returns.
            maxsize: First argument passed to ``bloomfilter.BloomFilter``.
        """
        class BadHash:
            @bloom_hash
            def bad_hash(self, foo, bar):
                return return_val

        # NOTE(review): bf is never read afterwards; the construction is
        # presumably needed for its effect on BadHash -- confirm before
        # removing it.
        bf = bloomfilter.BloomFilter(maxsize, BadHash)
        obj = BadHash()
        with self.assertRaises(bloomfilter.BloomFilterHashFunctionException):
            # Called without foo/bar although the undecorated signature takes
            # them; presumably @bloom_hash replaces the method -- verify.
            obj.bad_hash()
def test1():
    bf = bloomfilter.BloomFilter()
    #
    start_cha = 0
    end_cha = 10000000
    #
    start_not = 10000001
    end_not = 20000000

    #insert 1 ~ 100w
    for i in xrange(start_cha, end_cha):
        bf.Add(str(i))
    print "Dump before."
    bf.Dump(FILE)
    print "Dump done."
Beispiel #6
0
    def __init__(self):
        """Initialise data buffers, counters, the Bloom filter and the DB cursor.

        Raises:
            Exception: Whatever ``ToolsBox.get_database()`` raised; it is
                re-raised after the failure message has been printed.
        """
        self.raw_datas = []                     # dataset: raw data
        self.datasWithoutClear = []
        self.dupli_count = 0                    # counter: duplicated records
        self.now = datetime.date.today()        # field: date the record is inserted

        # Author's note: learning to use bloomfilter.
        self.key_infos = bloomfilter.BloomFilter(0.001, 1000000)

        try:
            self.conn = ToolsBox.get_database()
        except Exception:
            # Without a connection the cursor below cannot be created, so
            # report the failure and propagate the real cause instead of
            # falling through to an AttributeError on self.conn.
            print( "初始化时Connect failed")
            raise
        # DictCursor: use dictionaries for rows.
        self.cur = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
def test2():
    bf = bloomfilter.BloomFilter(FILE)
    #
    start_cha = 0
    end_cha = 10000000
    #
    start_not = 10000001
    end_not = 20000000

    #check for FN
    cnt = 0
    for i in xrange(start_cha, end_cha):
        if not bf.Test(str(i)):
            cnt += 1
    print "FN", cnt

    #check for FP
    cnt = 0
    for i in xrange(start_not, end_not):
        if bf.Test(str(i)):
            cnt += 1
    print "FP", cnt
Beispiel #8
0
 def test_bad_bits_per_table(self):
     """A first argument of 0 or -1234 raises BloomFilterException."""
     for invalid_size in (0, -1234):
         with self.assertRaises(bloomfilter.BloomFilterException):
             bloomfilter.BloomFilter(invalid_size, MyString)
Beispiel #9
0
 def test_empty(self):
     """A freshly built filter reports no entry of ``strings`` as set."""
     untouched = bloomfilter.BloomFilter(10, MyString)
     found_any = any(untouched[candidate] for candidate in strings)
     self.assertFalse(found_any)
Beispiel #10
0
    def test_class_wo_bloom_hashes(self):
        """Building a filter over a class with an empty body must fail."""
        # Deliberately empty: the class defines no methods at all.
        class Foo:
            pass

        # Callable form of assertRaises: invokes BloomFilter(10, Foo).
        self.assertRaises(bloomfilter.BloomFilterException,
                          bloomfilter.BloomFilter, 10, Foo)
                        count += 1
                        q.put(url)
                    else:
                        break
                crawled.set(page)
                varLock.release()
        q.task_done()


# Driver: seed the work queue, start the worker threads, wait for the queue
# to drain, then report the elapsed time.
start = time.time()
seed = "http://www.baidu.com"
q = Queue.Queue()
q.put(seed)
# NOTE(review): count, max_page, crawled and varLock appear to be shared with
# the worker code above (it updates count, calls crawled.set() and releases
# varLock) -- keep these names and define them before the threads start.
count = 1
max_page = 20
crawled = bloomfilter.BloomFilter(20*max_page, 5)

THREAD_NUM = 10
threads = []
varLock = threading.Lock()
for i in range(THREAD_NUM):
    # Each thread runs `working`, whose definition is only partly visible here.
    t = threading.Thread(target = working)
    # Daemon threads do not keep the interpreter alive after the main thread ends.
    t.setDaemon(True)
    threads.append(t)
    t.start()


# Block until every queued item has been marked done via q.task_done().
q.join()

print "It uses", time.time()-start,"sec to parse", max_page,"pages with", THREAD_NUM, "threads."
Beispiel #12
0
def make_bloomFilter(L):
    """Build a ``bf.BloomFilter(100, 10)`` and add every element of L to it.

    Returns the populated filter.
    """
    populated = bf.BloomFilter(100, 10)
    for element in L:
        populated.add(element)
    return populated
Beispiel #13
0
 def setUp(self):
     """Store a new ``BloomFilter(50)`` on ``self.bf``."""
     fresh_filter = bloomfilter.BloomFilter(50)
     self.bf = fresh_filter
import urls_create
import bloomfilter

# Print and collect every URL the filter already reports under "testurl";
# insert the others so that later repeats are detected.
urls = urls_create.Get_urls()
bf = bloomfilter.BloomFilter()
aa = []  # URLs the filter reported as already present
for candidate in urls:
    if not bf.isContains(candidate, "testurl"):
        # Not reported yet: record it in the filter and move on.
        bf.insert(candidate, "testurl")
        continue
    print(candidate)
    aa.append(candidate)

print(len(aa))