Esempio n. 1
0
 def __init__(self, redisName, filterName):
     """Prepare request headers, the start queue, the Bloom filter and I/O handles.

     Args:
         redisName: forwarded unchanged to ``RdsQueue`` to build ``start_queue``.
         filterName: forwarded unchanged to ``BloomFilter`` to build ``filter``.

     ``logger`` and ``conn`` are not parameters; they are picked up from the
     enclosing scope and stored on the instance. A cursor is requested from
     ``conn`` immediately.
     """
     # Browser-like User-Agent sent with every request made through self.headers.
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
     self.headers = {'User-Agent': user_agent}

     # Queue first, then filter: same construction order as before.
     self.start_queue = RdsQueue(redisName)
     self.filter = BloomFilter(filterName)

     self.lock_commit = Lock()
     self.logger = logger

     self.conn = conn
     self.cursor = self.conn.cursor()
Esempio n. 2
0
 def __init__(self, transactions, items, numReduce):
     """Build the per-item index by registering every transaction.

     Args:
         transactions: sized iterable of transactions; each one is handed to
             ``add_transaction`` together with the hashes of its position.
         items: stored as-is on the instance.
         numReduce: divisor applied to the transaction count to obtain the
             first argument given to ``BloomFilter``.
     """
     self.__numReduce = numReduce
     self.__num_transaction = len(transactions)
     self.__items = items
     self.__transaction_index_map = {}

     # A filter is built here only to borrow its list of hash functions.
     template_filter = BloomFilter(
         int(self.__num_transaction / self.__numReduce), 0.05)
     self.__listHashFunc = template_filter.listHashFunc

     # For each transaction position, apply every hash function to the
     # position rendered as a string. All hashes are computed up front,
     # before any transaction is registered.
     hashed_positions = [
         [hash_fn(str(position)) for hash_fn in self.__listHashFunc]
         for position in range(self.__num_transaction)
     ]

     for transaction, hashed in zip(transactions, hashed_positions):
         self.add_transaction(transaction, hashed)
Esempio n. 3
0
def run(seed):
    bf = BloomFilter(0.00001,1000000)   #初始化布隆过滤器
    queue = Queue.Queue(maxsize = 0)        #初始化URL队列
    urlCount = 0                        #初始化已得到URL变量
    urlList = []                        #初始化下载列表
    queue.put(seed)
    
    while(queue.empty() == False):
        currentURL = queue.get()
        urlList.append(currentURL)
        print 'currentURL',to_bytestring(currentURL)
        
        try:    #timeout处理
            html = urllib.urlopen(currentURL)
        except:
            continue

        bs_obj = BeautifulSoup(html,'html.parser')
        a_list = bs_obj.findAll('a') + bs_obj.findAll('img')

        for aa in a_list:
            if aa.attrs.get('href'):
                hrefStr = aa.attrs.get('href')
            else:
                hrefStr = aa.attrs.get('src')

            if hrefStr:
                hrefStr = is_relativeURL(hrefStr,currentURL)
                if hrefStr == -1:     #判断相对/绝对路径
                    continue
                if is_needURL(hrefStr) == True:         #判断是否需要抓取
                    if bf.is_element_exist(hrefStr) == False:   #布隆过滤
                        bf.insert_element(hrefStr)
                        print to_bytestring(hrefStr)
                        if is_resourceFile(hrefStr) == False:  #判断是否是资源文件
                            queue.put(hrefStr)
                    
                        urlList.append(hrefStr)
                        try:
                            downloadHtml(hrefStr)
                        except:
                                pass
                    urlCount = urlCount + 1
        print '所有--当前',urlCount,len(urlList)
Esempio n. 4
0
def sampleData(file1, file2, column, column2='fc20'):
    """Count how many user ids from ``file2`` are reported as present in ``file1``.

    Every id read from ``file1`` is added (as a string) to a Bloom filter;
    every id read from ``file2`` is then checked against that filter.

    Args:
        file1: first data source, passed to ``userIds`` with ``column``.
        file2: second data source, passed to ``userIds`` with ``column2``.
        column: column name holding the user ids in ``file1``.
        column2: column name holding the user ids in ``file2``. Defaults to
            ``'fc20'``, the value that used to be hard-coded.

    Returns:
        ``(same, diff)``: number of ``file2`` ids the filter reports as
        contained, and number it reports as absent.

    NOTE(review): if ``BloomFilter`` is a conventional Bloom filter, ``same``
    can be over-counted by false positives. Confirm against its
    implementation.
    """
    # Named ``seen`` rather than ``filter`` so the builtin is not shadowed.
    # NOTE(review): 13419082 and 23 are presumably the filter size and hash
    # count. Confirm against the BloomFilter constructor.
    seen = BloomFilter(13419082, 23)

    for user in userIds(file1, column):
        seen.add(str(user))

    same = 0
    diff = 0
    for user in userIds(file2, column2):
        if seen.contains(str(user)):
            same += 1
        else:
            diff += 1

    return same, diff
Esempio n. 5
0
 def add_transaction(self, transaction, hashed):
     """Record ``hashed`` in the filter of every item of ``transaction``.

     Args:
         transaction: iterable of items; each item is used as a key of the
             per-item index.
         hashed: value passed to ``addHashed`` on each item's filter.

     A filter is created for an item the first time that item is seen.
     """
     index = self.__transaction_index_map
     for item in transaction:
         if item in index:
             item_filter = index[item]
         else:
             # Sized lazily, only when a new item actually shows up.
             item_filter = BloomFilter(
                 int(self.__num_transaction / self.__numReduce), 0.05)
             index[item] = item_filter
         item_filter.addHashed(hashed)
Esempio n. 6
0
def _time_filter(bloom, input_size):
    """Time ``input_size`` adds and ``input_size`` lookups on ``bloom``.

    Keys ``"0" .. str(input_size - 1)`` are added; keys
    ``str(input_size) .. str(2 * input_size - 1)`` (never added) are looked
    up, so every hit is a false positive.

    Returns:
        ``(avg_add_time, avg_lookup_time, fp_count)`` with times in seconds
        per operation.
    """
    start_time = time.perf_counter()
    for i in range(input_size):
        bloom.add(str(i))
    avg_add_time = (time.perf_counter() - start_time) / input_size

    start_time = time.perf_counter()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in bloom:
            fp_count += 1
    avg_lookup_time = (time.perf_counter() - start_time) / input_size

    return avg_add_time, avg_lookup_time, fp_count


def _print_timings(avg_lookup_time, avg_add_time):
    """Print the average lookup/add times with 20 decimal places."""
    print("Avg lookup time :" + '{:.20f}'.format(avg_lookup_time) +
          "  Avg add time:" + '{:.20f}'.format(avg_add_time))


def _report_filter(heading, bloom, input_size, avg_add_time, avg_lookup_time,
                   fp_count):
    """Print the false-positive, timing and memory summary for one filter."""
    print()
    print(heading + "\nFalse positive count:" + str(fp_count) + "  in " +
          str(input_size) + " try. " + str(fp_count / input_size) +
          " rate of false positive")
    _print_timings(avg_lookup_time, avg_add_time)
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(bloom) + bloom.get_bitarray_size()))


def main():
    """Benchmark several Bloom filter variants against a dict and a list.

    For each container, ``input_size`` string keys are inserted and the same
    number of absent keys are looked up. Average add/lookup times, the
    false-positive count (filters only) and memory usage are printed.
    Intervals are measured with ``time.perf_counter``, a monotonic clock
    meant for measuring short durations.
    """
    input_size = 10000
    fp_rate = 0.01

    count_size = 4

    bloom_filter = BloomFilter(input_size, fp_rate)
    results = _time_filter(bloom_filter, input_size)
    print("Expected false positive rate for all calculations is :" +
          str(fp_rate))
    _report_filter("For Standard Bloom Filter : ", bloom_filter, input_size,
                   *results)

    shifting_bloom_filter = ShiftingBloomFilterM(input_size, fp_rate)
    results = _time_filter(shifting_bloom_filter, input_size)
    _report_filter("For Shifting Bloom Filter : ", shifting_bloom_filter,
                   input_size, *results)

    counting_bloom_filter = CountingBloomFilter(input_size,
                                                fp_rate,
                                                count_size=count_size)
    results = _time_filter(counting_bloom_filter, input_size)
    # Sanity check: every key that was added must still be reported present;
    # any key printed here is a false negative.
    for i in range(input_size):
        if str(i) not in counting_bloom_filter:
            print(str(i))
    _report_filter("For counting filter :", counting_bloom_filter,
                   input_size, *results)

    scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH)
    results = _time_filter(scalable_bloom_filter, input_size)
    _report_filter("For scalable filter :", scalable_bloom_filter,
                   input_size, *results)

    c_scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate,
        growth=ScalableBloomFilter.SMALL_GROWTH,
        countable=True,
        count_size=count_size)
    results = _time_filter(c_scalable_bloom_filter, input_size)
    _report_filter("For counting scalable filter :", c_scalable_bloom_filter,
                   input_size, *results)

    # Counter statistics over the sub-filters of the counting scalable filter.
    # The previous code iterated these sub-filters but read every slot from
    # the unrelated ``counting_bloom_filter``.
    # NOTE(review): assumes each sub-filter exposes get_bit_value() like
    # CountingBloomFilter does. Confirm against ScalableBloomFilter.
    size_sum = 0
    filled_bit_count = 0
    max_count = 0
    for sub_filter in c_scalable_bloom_filter.bloom_filters:
        for i in range(len(sub_filter)):
            slot_count = sub_filter.get_bit_value(i)
            if slot_count > 0:
                size_sum += slot_count
                filled_bit_count += 1
                if max_count < slot_count:
                    max_count = slot_count

    # Guard against an all-empty filter instead of dividing by zero.
    avg_size = size_sum / filled_bit_count if filled_bit_count else 0.0
    print("For counting filter -------- avg count:" + str(avg_size))
    print("For counting filter-------- max count:" + str(max_count))

    # Baseline 1: plain dict. Kept inline so no helper-call overhead is
    # added to the measured operations.
    hasmap = {}
    start_time = time.perf_counter()
    for i in range(input_size):
        hasmap[str(i)] = i
    avg_add_time = (time.perf_counter() - start_time) / input_size

    start_time = time.perf_counter()
    for i in range(input_size, input_size * 2):
        if str(i) in hasmap:
            pass
    avg_lookup_time = (time.perf_counter() - start_time) / input_size
    print()
    print("For Hashmap ")
    _print_timings(avg_lookup_time, avg_add_time)
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(hasmap)))

    # Baseline 2: plain list. The linear-time membership test is the point
    # of the comparison, so it is intentionally not replaced by a set.
    py_list = []
    start_time = time.perf_counter()
    for i in range(input_size):
        py_list.append(str(i))
    avg_add_time = (time.perf_counter() - start_time) / input_size

    start_time = time.perf_counter()
    for i in range(input_size, input_size * 2):
        if str(i) in py_list:
            pass
    avg_lookup_time = (time.perf_counter() - start_time) / input_size
    print()
    print("For List ")
    _print_timings(avg_lookup_time, avg_add_time)
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(py_list)))

    # Size of the sub-filter container versus the sum of the sizes of its
    # elements.
    temp = sum(
        memory_usage.get_obj_size(sub_filter)
        for sub_filter in c_scalable_bloom_filter.bloom_filters)
    print(
        "aaa" +
        str(memory_usage.get_obj_size(c_scalable_bloom_filter.bloom_filters)))
    print("xxx" + str(temp))
Esempio n. 7
0
 def __init__(self, filterSize, hashCount, clickedUsers):
     """Start with empty collected data and register the already-clicked users.

     Args:
         filterSize: first argument forwarded to ``BloomFilter``.
         hashCount: second argument forwarded to ``BloomFilter``.
         clickedUsers: sized collection of users; its length seeds
             ``clickedCounter`` and it is handed to ``__addUsers``.
     """
     self.allData = list()
     self.clickedCounter, self.noClickedCounter = len(clickedUsers), 0

     # The filter must exist before the users are registered.
     users_filter = BloomFilter(filterSize, hashCount)
     self.collectedDataUsersFilter = users_filter
     self.__addUsers(clickedUsers)
import socket
import random
import os
from sample_data import USERS
from server_config import NODES
from pickle_hash import serialize_GET, serialize_PUT, serialize_DELETE
from node_ring import NodeRing
from lru_cache import Lru_Node, Lru_Cache
from bloomFilter import BloomFilter

# Buffer size, in bytes, given to recvfrom() when UDPClient.send reads a
# response.
BUFFER_SIZE = 1024
# Module-level shared state. Only the initial values are visible here.
# TODO confirm how the rest of the file reads and updates them.
hash_codes = set()
has_cache = False
# NOTE(review): the cache is built with argument 0 and paired with the
# lru_cache_initialized flag; presumably it is replaced or resized later.
# Verify against the code that sets the flag.
lru_cache_obj = Lru_Cache(0)
lru_cache_initialized = False
# NOTE(review): presumably (expected item count, false-positive rate).
# Confirm against the bloomFilter.BloomFilter constructor.
bf = BloomFilter(10, 0.05)


class UDPClient():
    def __init__(self, host, port):
        self.host = host
        self.port = int(port)

    def send(self, request):
        print('Connecting to server at {}:{}:{}'.format(
            self.host, self.port, os.getpid()))
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            s.sendto(request, (self.host, self.port))
            response, ip = s.recvfrom(BUFFER_SIZE)
            return response