예제 #1
0
class BloomFilter:
    """Bloom filter using ``k`` seeded polynomial string hashes.

    The bits live in a ``Bitarray`` object (defined elsewhere in the
    project) that is only required to offer ``set(index)`` and
    ``get(index)``.
    """

    def __init__(self, n):
        """Size the filter from ``n``: ``20 * n + 1`` bits and 10 hashes."""
        # Basic parameters: k hash functions over an m-bit array.
        self.k = 10
        self.m = n * 20 + 1
        # Original author's note: this step is key -- taking the hash modulo
        # a prime greatly reduces collisions.
        # NOTE(review): 20 * n + 1 is odd but not guaranteed to be prime.
        self.bitarray = Bitarray(self.m)

    def hash_str(self, keyword, i):
        """Return the bit index of ``keyword`` under the ``i``-th hash.

        The ``i``-th hash is a polynomial hash whose multiplier is the
        decimal number "1313" followed by ``i`` copies of "13"
        (1313, 131313, 13131313, ...), reduced modulo ``self.m``.
        """
        # int() parses the digit string directly; the previous eval() call
        # executed a constructed string as code for no benefit.
        seed = int("1313" + "13" * i)
        value = 0
        # Iterate over the characters themselves so the parameter ``i`` is
        # not shadowed by a loop index.
        for char in keyword:
            value = value * seed + ord(char)
        return value % self.m

    def add_keyword(self, keyword):
        """Set the ``k`` bits selected by ``keyword``."""
        for i in range(self.k):
            self.bitarray.set(self.hash_str(keyword=keyword, i=i))

    def check_keyword(self, keyword):
        """Return False if any of ``keyword``'s ``k`` bits is unset.

        True means "possibly added" (Bloom filters allow false positives).
        """
        for i in range(self.k):
            if not self.bitarray.get(self.hash_str(keyword=keyword, i=i)):
                return False
        return True
예제 #2
0
class BloomFilter(set):
    """Bloom filter exposing a set-like ``add`` / ``in`` interface.

    Membership is tracked in ``self.bitarray`` (a project ``Bitarray``),
    not in the underlying ``set`` storage, which stays empty.
    """

    def __init__(self, size, hash_num):
        """Create a filter with ``size`` bits and ``hash_num`` hash seeds."""
        super(BloomFilter, self).__init__()
        self.bitarray = Bitarray(size)
        self.size = size
        self.hash_num = hash_num

    def __len__(self):
        """Return the number of bits, not the number of items added."""
        return self.size

    def __iter__(self):
        """Iterate over the underlying bit array."""
        return iter(self.bitarray)

    def _indexes(self, item):
        """Yield the bit index of ``item`` for each seed 0..hash_num-1."""
        for seed in range(self.hash_num):
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        """Set every bit selected by ``item`` and return ``self``."""
        for index in self._indexes(item):
            self.bitarray.set(index)

        return self

    def __contains__(self, item):
        """Return False if any bit selected by ``item`` reads as 0."""
        for index in self._indexes(item):
            # One unset bit proves the item was never added, so stop here
            # instead of computing the remaining hashes.
            if self.bitarray.get(index) == 0:
                return False

        return True
예제 #3
0
def main():
    """Estimate the false-positive rate of the bit-array filter.

    Adds 110000 distinct random strings to the filter, then draws 100000
    random strings that were not added and prints the fraction of them
    that ``check`` nevertheless reports as present.
    """
    funcs = ['BKDRHash', 'RSHash', 'JSHash', 'SDBMHash', 'DEKHash']
    bit_obj = Bitarray(800000)

    # ``words`` keeps insertion order; ``seen`` gives O(1) membership tests.
    # Checking against the list made both sampling loops quadratic.
    # Assumes create_random_string returns a hashable string -- TODO confirm.
    words = []
    seen = set()
    train_num = 110000
    while train_num > 0:
        ran_str = create_random_string(1, 10)
        if ran_str not in seen:
            seen.add(ran_str)
            words.append(ran_str)
            train_num -= 1

    for word in words:
        add_keyword(word, bit_obj, funcs)

    count = 0
    test_num = count_test = 100000

    while count_test > 0:
        ran_str = create_random_string(1, 10)
        # Only strings that were never added can be false positives.
        if ran_str not in seen:
            count_test -= 1
            if check(ran_str, bit_obj, funcs):
                count += 1

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(float(count) / test_num)
예제 #4
0
def initialize_filter(size=mod,
                      num_hash_func=k,
                      word_cnt=0,
                      ideal=IDEAL_CHOICE_OF_HASH_FUNCS):
    """Reset the module-level Bloom filter state.

    Rebinds the globals ``mod`` (bit array size), ``k`` (number of hash
    functions) and ``bit_array_obj`` (a fresh ``Bitarray(size)``).

    size          -- number of bits in the new bit array
    num_hash_func -- number of hash functions to use when the ideal count
                     is not requested or cannot be computed
    word_cnt      -- expected number of inserted words; 0 means unknown
    ideal         -- when truthy and ``word_cnt`` is non-zero, use
                     k = int(ln(2) * size / word_cnt) instead
    """
    global k, mod, bit_array_obj
    mod = size
    if ideal and word_cnt:
        # Previously this value was computed from the stale global ``mod``
        # and then unconditionally overwritten by ``num_hash_func``; with
        # the default word_cnt=0 it also raised ZeroDivisionError.
        k = int(math.log(2) * size / float(word_cnt))
    else:
        k = num_hash_func
    bit_array_obj = Bitarray(mod)
예제 #5
0
 def __init__(self,
              size=1,
              num_hash_func=13,
              word_cnt=0,
              ideal=False,
              debug_mode=False):
     """Store the filter parameters and allocate the backing bit array.

     size          -- number of bits passed to ``Bitarray``
     num_hash_func -- requested number of hash functions
     word_cnt      -- expected number of words; 0 means unknown
     ideal         -- when truthy and ``word_cnt`` is non-zero, the number
                      of hash functions becomes int(ln(2) * size / word_cnt)
     debug_mode    -- stored as ``self.DEBUG_MODE``
     """
     # Resolve the effective hash-function count before storing anything.
     if ideal and word_cnt:
         num_hash_func = int(math.log(2, math.e) * size / float(word_cnt))

     self.size = size
     self.num_hash_func = num_hash_func
     self.word_cnt = word_cnt
     self.IDEAL = ideal
     self.DEBUG_MODE = debug_mode
     self.bit_array_obj = Bitarray(size)
     # BKDR hash seeds for the non-ideal case (26 entries).
     self.BKRD_hash_seeds = [
         13, 1331, 13131, 3313, 3113, 3111, 2483, 12536, 92174,
         213, 654, 65, 367, 1252, 12356, 75634, 345, 6544,
         3456, 1827, 8125, 26154, 2345, 213, 755, 12467,
     ]
예제 #6
0
 def __init__(self, n):
     """Size the filter from ``n``: ``n * 20 + 1`` bits and 10 hashes."""
     # Original author's note: this step is key -- taking the hash modulo
     # a prime greatly reduces collisions.
     # NOTE(review): n * 20 + 1 is not guaranteed to be prime.
     bit_count = n * 20 + 1

     # Basic parameters: k hash functions over an m-bit array.
     self.k = 10
     self.m = bit_count
     self.bitarray = Bitarray(bit_count)
예제 #7
0
from GeneralHashFunctions import *
import math
import Ex3_1_Bloom_Filter_ast_BKDRHash_seeds  # 99990 numbers from 0 to 10000000

# Truthy selects the large bit array below; it is also the default value of
# the ``ideal`` parameter of initialize_filter.
IDEAL_CHOICE_OF_HASH_FUNCS = 1

# Number of hash functions (original note: k >= 10).
k = 13

# Bit array size: large in the ideal configuration, small otherwise.
mod = 5555555 if IDEAL_CHOICE_OF_HASH_FUNCS else 55555

# Module-level bit array backing the filter.
bit_array_obj = Bitarray(mod)

# BKDR hash seeds for the non-ideal case (26 entries).
# NOTE(review): 213 appears twice in this list.
BKRD_hash_seeds = [
    13, 1331, 13131, 3313, 3113, 3111, 2483, 12536, 92174,
    213, 654, 65, 367, 1252, 12356, 75634, 345, 6544,
    3456, 1827, 8125, 26154, 2345, 213, 755, 12467,
]


def initialize_filter(size=mod,
                      num_hash_func=k,
                      word_cnt=0,
                      ideal=IDEAL_CHOICE_OF_HASH_FUNCS):
    """Reconfigure the module-level Bloom filter globals.

    Only the opening statements of this function are visible in this
    excerpt, so the notes below cover just what these lines do.

    The defaults for ``size``, ``num_hash_func`` and ``ideal`` are the
    values the module globals held when this ``def`` was executed.
    """
    global k, mod, bit_array_obj, IDEAL_CHOICE_OF_HASH_FUNCS
    if ideal:
        # Sets k to int(ln(2) * mod / word_cnt).
        # NOTE(review): this reads the current global ``mod``, not the
        # ``size`` argument, and raises ZeroDivisionError when word_cnt is
        # 0 (its default) -- confirm callers always pass a word count.
        k = int(math.log(2, math.e) * mod / float(word_cnt))
예제 #8
0
 def __init__(self, size, hash_num):
     """Create a filter with ``size`` bits and ``hash_num`` hash seeds."""
     super(BloomFilter, self).__init__()
     # Record the parameters, then allocate the bit array they describe.
     self.size, self.hash_num = size, hash_num
     self.bitarray = Bitarray(size)
예제 #9
0
class BloomFilter:
    """Bloom filter built on ten fixed string hashes plus seeded BKDR hashes.

    The bits live in a project ``Bitarray``; the hash functions come from
    the module's imports (``RSHash``, ``JSHash``, ..., ``BKDRHash``).
    """

    def __init__(self,
                 size=1,
                 num_hash_func=13,
                 word_cnt=0,
                 ideal=False,
                 debug_mode=False):
        """Store the filter parameters and allocate the bit array.

        size          -- number of bits in the bit array
        num_hash_func -- requested number of hash functions
        word_cnt      -- expected number of words; 0 means unknown
        ideal         -- when truthy and ``word_cnt`` is non-zero, use
                         int(ln(2) * size / word_cnt) hash functions
        debug_mode    -- when truthy, ``test`` prints progress messages
        """
        self.size = size
        self.num_hash_func = num_hash_func
        self.word_cnt = word_cnt
        self.IDEAL = ideal
        self.DEBUG_MODE = debug_mode
        if self.IDEAL and word_cnt:
            self.num_hash_func = int(
                math.log(2, math.e) * size / float(word_cnt))
        self.bit_array_obj = Bitarray(size)
        # BKDR hash seeds for the non-ideal case (26 entries).
        # NOTE(review): 213 appears twice in this list.
        self.BKRD_hash_seeds = [
            13, 1331, 13131, 3313, 3113, 3111, 2483, 12536, 92174, 213, 654,
            65, 367, 1252, 12356, 75634, 345, 6544, 3456, 1827, 8125, 26154,
            2345, 213, 755, 12467
        ]

    def hit(self, key):
        """Return the list of bit indices that ``key`` maps to.

        The ten fixed hashes are always used. ``num_hash_func - 10`` extra
        seeded BKDR hashes are appended (none when that is not positive).
        In non-ideal mode the seeds come from ``self.BKRD_hash_seeds``, so
        asking for more than 26 extra hashes raises IndexError.
        """
        size = self.size
        indices = [
            RSHash(key) % size,
            JSHash(key) % size,
            PJWHash(key) % size,
            ELFHash(key) % size,
            SDBMHash(key) % size,
            DJBHash(key) % size,
            DEKHash(key) % size,
            BPHash(key) % size,
            FNVHash(key) % size,
            APHash(key) % size,
        ]
        for i in range(self.num_hash_func - 10):
            if self.IDEAL:
                seed = Ex3_1_Bloom_Filter_ast_BKDRHash_seeds.get_BKRD_hash_seed(
                    i)
            else:
                seed = self.BKRD_hash_seeds[i]
            indices.append(BKDRHash(key, seed) % size)

        return indices

    def add(self, key):
        """Add ``key`` by setting every bit index it maps to."""
        for idx in self.hit(key):
            self.bit_array_obj.set(idx)

    def search(self, key):
        """Return False if any bit ``key`` maps to is unset, else True.

        True means "possibly added" (false positives are possible).
        """
        for idx in self.hit(key):
            # An unset bit means the keyword was never added.
            if not self.bit_array_obj.get(idx):
                return False
        return True

    def _read_words(self, path):
        """Return the words of the text file at ``path``.

        Each line is stripped and split on single spaces, as before.
        """
        words = []
        # ``with`` closes the file even if reading fails. Iterating the
        # file object replaces xreadlines(), which Python 3 removed.
        with open(path, 'r') as f:
            for line in f:
                words.extend(line.strip().split(' '))
        return words

    def test(self, file_train, file_test):
        """Train on one text file, test on another, and report the error.

        file_train -- path of a text file whose words are added
        file_test  -- path of a text file whose words are looked up

        Prints a summary and returns the fraction of test words for which
        ``search`` disagrees with real membership in the training words.
        Returns 0.0 when the test file contains no words.
        """
        words = self._read_words(file_train)
        len_words = len(words)
        if self.DEBUG_MODE:
            print("# All keywords added")

        test_words = self._read_words(file_test)
        if self.DEBUG_MODE:
            print("# All test words read")

        # Add the keywords to the bit map.
        for word in words:
            self.add(word)

        # A set gives O(1) ground-truth lookups; scanning the list made
        # the loop cost O(len(words)) per test word.
        trained = set(words)
        total = len(test_words)
        wrong_cnt = 0
        for word in test_words:
            if (word in trained) != (self.search(word)):
                wrong_cnt += 1
        # Guard the empty test file, which used to raise ZeroDivisionError.
        false_positive_rate = float(wrong_cnt) / total if total else 0.0

        # Single-argument print() calls produce the same text on Python 2
        # and Python 3.
        print("Words added: %s" % len_words)
        print("Bit Array Size: %s" % self.size)
        print("Number of hash functions: %s" % self.num_hash_func)
        print("Wrong mapping count: %s" % wrong_cnt)
        print("Total word count: %s" % total)
        print("False positive rate: %s" % false_positive_rate)

        return false_positive_rate
예제 #10
0
                crawled.add_str(page)
                add_page_to_folder(page, content)
                outlinks = get_all_links(content, page)
                if varLock.acquire():
                    union(tocrawl, outlinks)
                    graph[page] = outlinks
                    COUNT += 1
                    varLock.release()
        tocrawl.task_done()
    tocrawl.clear()


if __name__ == '__main__':
    # Crawl starting from ``seed`` with NUM daemon worker threads running
    # ``page_working``; every name bound here is a module global.

    seed = 'http://www.baidu.com'
    max_page = 10

    tocrawl = Queue.Queue()  # tocrawl is a global working queue
    tocrawl.put(seed)
    crawled = Bitarray(20 * max_page)
    graph = {}
    COUNT = 0
    varLock = threading.Lock()
    NUM = 5

    for i in range(NUM):
        t = threading.Thread(target=page_working)
        # The ``daemon`` attribute replaces the deprecated setDaemon().
        # Daemon threads do not keep the process alive after the queue
        # has been drained.
        t.daemon = True
        t.start()

    # Block until task_done() has been called for every queued item.
    tocrawl.join()
예제 #11
0
            if f == 0 and add_page_to_folder(page, content):
                max_page -= 1

            varLock.release()

        else:
            varLock.release()  #这里释放查找不匹配情况下,查找上的锁
        q.task_done()


# Script: crawl from ``seed`` with NUM daemon worker threads running
# ``working``, then print the elapsed seconds and the final ``max_page``.
start = time.time()
NUM = 128  # number of worker threads
# To take the settings from the command line instead:
# seed = sys.argv[1]
# max_page = int(sys.argv[2])
seed = 'https://www.guancha.cn/'
max_page = 7000
constant = max_page * 20  # modulus applied to hash values
crawled = Bitarray(constant)  # same size as before (max_page * 20)
varLock = threading.Lock()
q = queue.Queue()
q.put(seed)
for i in range(NUM):
    t = threading.Thread(target=working)
    # The ``daemon`` attribute replaces the deprecated setDaemon().
    t.daemon = True
    t.start()
# Block until task_done() has been called for every queued item.
q.join()
end = time.time()
print(end - start)
print(max_page)