def __init__(self, ui):
    self.ui = ui
    self.precompile_filter = BloomFilter()
    self.keyword_filter = BloomFilter(7, 10000)
    self.plainFmt = None  # works around the format error
    self.flag_format_changed = False  # guards against infinite recursion
    self.init_filter()
    self.color = self.FontColor()
    self.plainFmt = self.ui.textEdit.currentCharFormat()
def __init__(self):
    self.words_bloom = BloomFilter(max_elements=64_000, error_rate=0.000001)
    self.parts_bloom = BloomFilter(max_elements=700_000, error_rate=0.000001)
    try:
        self.words_bloom, self.parts_bloom = pickle.load(_VOCABULARY_PATH.open('rb'))
    except Exception:
        logger.warning('Vocabulary unpickling error:\n' + traceback.format_exc())
        self._build_from_file()
        pickle.dump((self.words_bloom, self.parts_bloom), _VOCABULARY_PATH.open('wb'))
def __init__(self):
    self.bf_urls = BloomFilter(max_elements=10000000, error_rate=0.001,
                               filename="Filter_files/urls_AET_1.bf")
    self.bf_content = BloomFilter(max_elements=10000000, error_rate=0.001,
                                  filename="Filter_files/title_AET_1.bf")
    self.bf_urls1 = BloomFilter(max_elements=10000000, error_rate=0.001,
                                filename="Filter_files/urls_AET.bf")
    self.bf_content1 = BloomFilter(max_elements=10000000, error_rate=0.001,
                                   filename="Filter_files/title_AET2.bf")
def filter_anchors(query):
    non_singles = BloomFilter(max_elements=1000, error_rate=0.1)
    singles = BloomFilter(max_elements=5800, error_rate=0.1)
    priority0_x, priority1_x, priority2_x = [], [], []
    priority0_y, priority1_y, priority2_y = [], [], []
    for i in range(len(query.x)):
        d = str(((query.x[i] - query.y[i]) >> 4) << 4)
        if d in non_singles:
            priority2_x.append(query.x[i])
            priority2_y.append(query.y[i])
        else:
            non_singles.add(d)
            priority0_x.append(query.x[i])
            priority0_y.append(query.y[i])
        # Two-level variant kept for reference:
        #if d in non_singles:
        #    priority2_x.append(query.x[i])
        #    priority2_y.append(query.y[i])
        #elif d in singles:
        #    non_singles.add(d)
        #    priority1_x.append(query.x[i])
        #    priority1_y.append(query.y[i])
        #else:
        #    singles.add(d)
        #    priority0_x.append(query.x[i])
        #    priority0_y.append(query.y[i])

    # Some stats:
    filter_mem = (non_singles.num_bits_m) / 8192
    original = (128 * len(query.x)) / 8192
    filtered = (128 * len(priority2_x)) / 8192
    print("Original memory use: {}".format(original))
    print("New memory use, data only: {}".format(filtered))
    print("New memory use, with filter: {}".format(filtered + filter_mem))
    print("Savings factor: {}".format(original / (filtered + filter_mem)))

    # Plotting (disabled):
    #plt.scatter(priority0_x, priority0_y, s=10)                       # Priority 0
    #plt.scatter(priority1_x, priority1_y, s=10, color='deepskyblue')  # Priority 1
    #plt.scatter(priority2_x, priority2_y, s=10, color='r')            # Priority 2
    #plt.xlabel("Reference Position")
    #plt.ylabel("Query Position")
    #plt.title("Filtered anchors. Filter size: {}K. Num Hashes: {} False positive Rate: {}".format(filter_mem, 4, 0.1))
    #plt.show()

    return original / (filtered + filter_mem)
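# A minimal, hedged usage sketch for filter_anchors above: the real query type is
# not shown in the snippet, so a namedtuple with .x and .y lists stands in for it.
from collections import namedtuple

Query = namedtuple('Query', ['x', 'y'])

q = Query(x=[100, 116, 250, 260], y=[90, 106, 240, 251])
savings = filter_anchors(q)  # prints the memory stats and returns the savings factor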
def __init__(self, primes, num_prime_funcs, num_prime_bits, num_fp_funcs, num_fp_bits):
    self.primes_bloom_filter = BloomFilter(num_prime_funcs, num_prime_bits)
    self.fps_bloom_filter = BloomFilter(num_fp_funcs, num_fp_bits)
    print('Adding primes')
    for p in primes[:-1]:  # why ignore the last prime?
        self.primes_bloom_filter.add(p)
    print('Adding false positives..')
    for i in range(primes[0], primes[-1]):
        true_prime = i in primes
        bf_prime = self.primes_bloom_filter.contains(i)
        if true_prime and not bf_prime:
            assert False, 'False negatives NEVER happen'
        elif not true_prime and bf_prime:
            self.fps_bloom_filter.add(i)
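# A hedged sketch of how the two filters above would be queried together, assuming
# the intent is the classic two-level construction: fps_bloom_filter records the
# first filter's false positives, so membership is only reported when the primes
# filter fires and the false-positive filter does not. The method name
# is_probably_prime is hypothetical.
def is_probably_prime(self, i):
    if not self.primes_bloom_filter.contains(i):
        return False  # bloom filters have no false negatives
    # The first filter fired; consult the recorded false positives.
    return not self.fps_bloom_filter.contains(i)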
class EventValidator:
    def __init__(self):
        # Instance state (the original declared these at class level and had a
        # no-op __init__, which shares mutable lists across instances).
        self.last = -1
        self.current = -1
        self.bloom = BloomFilter(4980000, 0.01)
        self.orderErrors = []
        self.uniqueErrors = []

    def checkOrder(self, value):
        if self.last == -1:
            self.last = value
        else:
            self.current = value
            if self.last > self.current:
                self.orderErrors.append(self.current)
            else:
                self.last = self.current

    def checkUnique(self, value):
        if value in self.bloom:
            self.uniqueErrors.append(value)
        else:
            self.bloom.add(value)
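# Illustrative use of EventValidator (the value stream is made up): out-of-order
# values land in orderErrors, repeated values (probably) land in uniqueErrors.
validator = EventValidator()
for value in [1, 2, 2, 5, 4]:
    validator.checkOrder(value)
    validator.checkUnique(value)
print(validator.orderErrors)   # [4] - arrived after 5
print(validator.uniqueErrors)  # [2] - the filter had already seen it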
def crawl_city_page(url):
    bloomf = BloomFilter(10000000, 0.01)
    # print(bloomf)
    # Page listing every city nationwide
    city_page_content = crawl_dp_page(url, 'citylist')
    # print(city_page_content, '>>>>>')
    html = etree.HTML(city_page_content)
    # Links to each city's page
    city_urls = html.xpath('//div[@class="findHeight"]/a/@href')
    print(city_urls)
    for city_detail_url in city_urls:
        page = 1
        while True:
            # Shop list page
            city_detail_page_content = crawl_dp_page(
                "http:" + city_detail_url + "/ch20/g187p%s" % page, 'citydetailpage')
            # Shop detail URLs
            shop_urls = re.findall(r'href="(.+/shop/[0-9]+)"', city_detail_page_content)
            tot_shop = 0
            for shop_url in shop_urls:
                if shop_url in bloomf:
                    print("ignore duplicate " + shop_url)
                else:
                    # Shop detail page
                    shop_content = crawl_dp_page(shop_url, 'shopdetail')
                    bloomf.add(shop_url)
                    parse_shop(shop_content)
                    tot_shop += 1
            if tot_shop < 1:  # break if there is no shop on that page
                break
            page += 1
async def filter(self, existed_vid_list=None):
    # Construct a bloom filter seeded with the ids we have already stored.
    bloom = BloomFilter(max_elements=config.MAX_ESTIMATE_RECORD_NUMBER)
    for ele in (existed_vid_list or []):
        bloom.add(ele)  # add origin_id into the filter
    latest_results = []  # final result to output
    # The paging for xinpianchang is unreliable, so keep a buffer to make sure
    # we really reach the end of the new records.
    buffer = config.check_latest_buffer
    latest = await self.fetch()
    for ele in latest:
        if ele['vid'] in bloom:
            # The element is already recorded, meaning the upcoming elements are
            # probably repeats; drain the buffer before returning, because the
            # paging issue can interleave old and new records.
            if buffer == 0:
                del bloom  # release memory
                return jmespath.search('[]', latest_results) if latest_results else []
            buffer -= 1
            continue
        bloom.add(ele['vid'])  # add origin_id into the filter
        latest_results.append(ele)
    return jmespath.search('[]', latest_results) if latest_results else []
class Test:
    # These three variables are shared across threads.
    f = BloomFilter(0.0001, 10000000)
    urls = [
        'http://m.sodu.com',
    ]
    count = 0

    @classmethod
    def get_url(cls):
        url = Test.urls.pop(0)
        while Test.f.is_element_exist(url):
            url = Test.urls.pop(0)
        Test.f.insert_element(url)
        # Optional: append each crawled url to a file, at the cost of extra I/O.
        # with open('urls.txt', 'a') as file_obj:
        #     file_obj.write(url + '\n')
        return url

    @classmethod
    def get_urls(cls):
        while len(Test.urls) > 0:
            url = Test.get_url()
            try:
                Test.count += 1
                print(Test.count, url)
                analysis = PageParser(url)
                test = analysis.get_urls()
                Test.urls += test
            except Exception:
                pass
def add(self, v):
    now = int(time.time())
    if now - self.last_reset_time > self.RESET_TIME:
        logging.info("bloom filter reset")
        self.bf = BloomFilter()
        self.last_reset_time = now
    self.bf.add(v)
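# A hedged sketch of the enclosing class this add() method implies; the class
# name and the one-hour window are assumptions, since only add() appears above.
import time

class RollingBloomFilter:
    RESET_TIME = 3600  # seconds between resets (assumed value)

    def __init__(self):
        self.bf = BloomFilter()
        self.last_reset_time = int(time.time())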
def __init__(self, expected_inserts, error_rate, mode=SMALL_GROWTH):
    self.sbfilters = []
    sbfilter = BloomFilter(expected_inserts, error_rate)
    self.sbfilters.append(sbfilter)
    self.error_prob_ratio = 0.9
    self.space_scale = mode
def add_time(capacity, error_rate):
    bfilter = BloomFilter(capacity, error_rate)
    start_time = time.time()
    for i in range(capacity):
        bfilter.add(i)
    end_time = time.time()
    return end_time - start_time
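# Quick benchmark loop around add_time (capacities and error rate are arbitrary):
for capacity in (10_000, 100_000, 1_000_000):
    elapsed = add_time(capacity, 0.001)
    print(f"{capacity} inserts took {elapsed:.3f}s")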
def test_bloom_filter():
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]
    word_absent = ['facebook', 'twitter']
    for item in word_present:
        bloomfilter.add(item)
    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if bloomfilter.is_member(word):
            if word in word_absent:
                print(f"'{word}' is a false positive!")
                logging.info(f"'{word}' is a false positive!")
            else:
                print(f"'{word}' is probably present!")
                logging.info(f"'{word}' is probably present!")
        else:
            print(f"'{word}' is definitely not present!")
            logging.info(f"'{word}' is definitely not present!")
def __init__(self, n, m, dimension, is_real_size=False, point_build=[],
             discretisator=None):
    """
    :param n: number of elements that will be stored.
    :param m: size of the bloom filter in bits.
    :param point_build: list of point objects to insert into the bloom filter.
    """
    self.dimension = dimension
    if is_real_size:
        n = n * 2**dimension
        m = m * 2**dimension
    error_rate = float(math.exp((math.log(2)**2) * float(m) / float(-1 * n)))
    self.bloom_filter = BloomFilter(n, error_rate)
    # TODO version with the choice of the hash: the third parameter is the hash
    self.point_build = point_build
    self.discretisator = discretisator
    if self.discretisator:
        for point in point_build:
            for d_pt in self.discretisator.discretise_point_to_insert(point):
                self.bloom_filter.add(d_pt.to_string())
    else:
        for pt in point_build:
            self.bloom_filter.add(pt.to_string())
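# The error_rate expression above inverts the textbook sizing formula
# m = -n * ln(p) / (ln 2)^2, which assumes an optimal number of hash functions.
# A quick sanity check (standalone sketch, not part of the class):
import math

def optimal_bits(n, p):
    # Bits needed to store n elements at false-positive rate p.
    return -n * math.log(p) / (math.log(2) ** 2)

n = 1000
m = optimal_bits(n, 0.01)
p = math.exp((math.log(2) ** 2) * m / (-1 * n))  # same expression as in __init__
print(round(p, 4))  # recovers 0.01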
def test_false_positives():
    """See how well it does at false positives."""
    bloom = BloomFilter(initial_size=10, vector_type=ByteBitVector)
    stats = Counter()
    counter = 0
    logging.basicConfig(level=logging.DEBUG)
    with open('./wordlist.txt', 'r', encoding='iso-8859-1') as fp:
        for line in fp:
            term = line.strip()
            contains = term in bloom
            if contains is True:
                counter += 1
                logging.debug('%d\tTerm already set:\t%s', counter, term)
            stats.update([str(contains)])
            bloom.add(term)
            assert term in bloom  # no false negatives
    false_positives = float(stats['True'] / sum(stats.values()))
    print('Count of false positives:', stats,
          "{:0.04f}%".format(100 * false_positives))
    print('sys.getsizeof filter:', getsizeof(bloom))
    print('sys.getsizeof bitvector needed:', getsizeof(bloom.bitvector.bits))
    print('num bits in vector:', len(bloom.bitvector))
    print('num bitvector set:', bloom.bitvector.num_set())
    # A bit arbitrary - descriptions of bloom filters say 2-3% false positives
    # is pretty good.
    assert false_positives <= .03
def __init__(self, name, size=None, max_elements=1e4, error_rate=1e-8):
    self.name = name
    self.size = size
    self.max_elements = max_elements
    self.error_rate = error_rate
    self._set = BloomFilter(max_elements=max_elements, error_rate=error_rate)
def __init__(self, partition_id, src_dir, dest_dir, dry_run, flatten):
    self.partition_id = partition_id
    self.src_dir = src_dir
    self.dest_dir = dest_dir
    self.dest_bloom = BloomFilter(max_elements=EST_MAX_FILES_PER_YEAR)
    self.dry_run = dry_run
    self.flatten = flatten
def build_db(self):
    if isfile(self.tcem_index):
        os.remove(self.tcem_index)
    logger.info(f'<chunk {self.chunk_index}> Building TCEM database from {self.fasta}')
    bloom = BloomFilter(max_elements=self.bloom_size, error_rate=self.bloom_error)
    seq_counter = 0
    with Pool(self.cores) as pool, \
            gzip.open(self.fasta, 'rt') as f, \
            gzip.open(self.tcem_index, 'wt') as o:
        try:
            seqs = (seq for i, seq in enumerate(SeqIO.parse(f, 'fasta'))
                    if (i % self.total_chunks) == self.chunk_index)
            for taxa_kmer_set in pool.imap_unordered(process_one_seq, seqs,
                                                     chunksize=1000):
                seq_counter += 1
                if seq_counter % (10 * 1000) == 0:
                    logger.info(f'<chunk {self.chunk_index}> Processing seq: {seq_counter}')
                for pair in taxa_kmer_set:
                    pair_str = f'{pair[0]},{pair[1]}'
                    if pair_str in bloom:
                        continue
                    bloom.add(pair_str)
                    print(pair_str, file=o)
        except KeyboardInterrupt:
            pool.terminate()
            raise
    open(self.tcem_index + '.flag', 'w').close()
def main():
    """
    - Access to the database
    - Access to a bloom filter
    - Consumer sending a request for data
    """
    UNIQUE_SET_SIZE = 5
    data_key = 34
    data_value = "This is a value"
    db_repository = BasicDBRepository()
    hashing_function_1 = BasicHashingFunction()
    hashing_function_2 = Basic2HashingFunction()
    print("Hello world!!")
    bloom_filter = BloomFilter(db_repository, hashing_function_1,
                               hashing_function_2, UNIQUE_SET_SIZE)
    print(f'Initialized bloom filter: {bloom_filter.bit_vector}')
    bloom_filter.insert_new_data(data_key, data_value)
    print("Reached!")
    pprint(f"BloomFilter state: {bloom_filter.bit_vector}")
    pprint(db_repository.show_data(), width=1)
def __init__(self, *args, **kwargs):
    super(MyFilterSwitch, self).__init__(*args, **kwargs)
    self.swList = {}   # list of switches
    self.hostDB = {}   # maps host ID to port number
    self.bloom = BloomFilter(max_elements=10000, error_rate=0.1)
    self.randomFilter(12345678, 1000)
    self.monitor_thread = hub.spawn(self._monitor)
def __init__(self, peer_pool: PeerPool) -> None:
    super().__init__()
    self._peer_pool = peer_pool
    # 1m elements should give us 9000 blocks before the filter becomes less
    # reliable. It should take up about 1mb of memory.
    self._bloom = BloomFilter(max_elements=1000000)
    self._bloom_salt = str(uuid.uuid4())
def test_bloom_filter():
    with pytest.raises(ValueError):
        blf = BloomFilter(n=-1, p=0.1)
    with pytest.raises(ValueError):
        blf = BloomFilter(n=1, p=-0.1)
    with pytest.raises(ValueError):
        blf = BloomFilter(n=1.0, p=0.1)
    # aiming to cause a collision
    blf = BloomFilter(n=3, p=0.1)
    blf.insert("Gondor")
    assert blf.is_present("Gondor 1")
    assert not blf.is_present("Isenguard")
def __init__(self):
    self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    self.cookies = {'cookie': 'tryThing00=1098; tryThing01=5838; tryThing02=0552; optimizelyEndUserId=oeu1556099053513r0.5843061130621663; s_fid=4A92F5E10FC9F2AB-176246D5FD4ACB26; gig_hasGmid=ver2; s_vi=[CS]v1|2E60197B8507E3C9-40000113A004AC13[CE]; __gads=ID=5243df9823110dbb:T=1556099830:S=ALNI_MbcB64SpCOjLKHMNlWpA0cU3jln6A; bfp_sn_rf_8b2087b102c9e3e5ffed1c1478ed8b78=Direct; bfp_sn_rt_8b2087b102c9e3e5ffed1c1478ed8b78=1556099832385; _fbp=fb.1.1556099850902.815265183; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22abac316b-5aa7-403d-9230-f90f4f625c5f%22; _cb_ls=1; ug=5cc02ff00e7dc50a3f9cca0013c765a0; __qca=P0-689587383-1556590653464; _cb=BmyPaDKxXHCDf5t7; ugs=1; s_cc=true; s_ppv=100; countryCode=US; bounceClientVisit340v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgMYB2pApihAIakD2YAdGaS-QLZEgA0IATjBAgAvkA; _cb_svref=null; dmxRegion=false; s_sq=%5B%5BB%5D%5D; GED_PLAYLIST_ACTIVITY=W3sidSI6InA1UU4iLCJ0c2wiOjE1NTc3MTMxMzQsIm52IjoxLCJ1cHQiOjE1NTc3MTMwNzUsImx0IjoxNTU3NzEzMTMyfV0.; OptanonConsent=landingPath=NotLandingPage&datestamp=Mon+May+13+2019+10%3A05%3A38+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=4.4.0&EU=false&groups=1%3A1%2C2%3A1%2C3%3A1%2C4%3A1%2C0_37248%3A1%2C0_37215%3A1%2C0_37244%3A1%2C0_37211%3A1%2C0_37240%3A1%2C0_37207%3A1%2C0_37236%3A1%2C0_37203%3A1%2C0_37198%3A1%2C0_37231%3A1%2C0_37227%3A1%2C0_37223%3A1%2C0_37219%3A1%2C0_37216%3A1%2C0_37249%3A1%2C0_37212%3A1%2C0_37245%3A1%2C0_37208%3A1%2C0_37241%3A1%2C0_37204%3A1%2C0_37237%3A1%2C0_37232%3A1%2C0_37199%3A1%2C0_37228%3A1%2C0_37224%3A1%2C0_37220%3A1%2C0_37217%3A1%2C0_37246%3A1%2C0_37213%3A1%2C0_37242%3A1%2C0_37209%3A1%2C0_37238%3A1%2C0_37205%3A1%2C0_37234%3A1%2C0_37200%3A1%2C0_37233%3A1%2C0_37196%3A1%2C0_37229%3A1%2C0_37225%3A1%2C0_37221%3A1%2C0_37250%3A1%2C0_37214%3A1%2C0_37210%3A1%2C0_37243%3A1%2C0_37206%3A1%2C0_37239%3A1%2C0_37202%3A1%2C0_37235%3A1%2C0_37201%3A1%2C0_37230%3A1%2C0_37197%3A1%2C0_37226%3A1%2C0_37222%3A1%2C0_37218%3A1%2C8%3A1%2C101%3A1%2C102%3A1%2C103%3A1%2C104%3A1%2C105%3A1%2C106%3A1%2C107%3A1%2C108%3A1%2C109%3A1%2C110%3A1%2C111%3A1%2C112%3A1%2C113%3A1%2C114%3A1%2C115%3A1%2C116%3A1%2C117%3A1%2C118%3A1%2C119%3A1%2C120%3A1%2C121%3A1%2C122%3A1%2C123%3A1%2C124%3A1%2C125%3A1%2C126%3A1%2C127%3A1%2C128%3A1%2C129%3A1%2C130%3A1%2C131%3A1%2C133%3A1%2C134%3A1%2C135%3A1%2C136%3A1%2C137%3A1&AwaitingReconsent=false; _chartbeat2=.1557484090990.1557713169225.1001.S1zafCOkPc4t2pQ-D9by5sD3_rnb.3'}
    self.post_url = 'http://127.0.0.1:30008/crawler/article/transfer'
    self.filter_url = 'http://console.cc.clipclaps.tv/crawler/log'
    self.have_met = BloomFilter(max_elements=100000, error_rate=0.1)
    self.downloadPath = '/data/crawler'
    self.picPath = '/cnn_news/picture/'
def process_two(pipe21, pipe23):
    pid = 1
    counter = BloomFilter(n, p, 0)
    counter = recv_message(pipe21, pid, counter, 'g')
    counter = send_message(pipe21, pid, counter, 'h')
    counter = send_message(pipe23, pid, counter, 'i')
    counter = recv_message(pipe23, pid, counter, 'j')
    print_history(counter, pid)
def add(self, key):
    bfilter = self.sbfilters[-1]
    if not bfilter.can_accomodate():
        new_expected_inserts = bfilter.expected_inserts * self.space_scale
        new_error_rate = bfilter.error_rate * self.error_prob_ratio
        new_bfilter = BloomFilter(new_expected_inserts, new_error_rate)
        self.sbfilters.append(new_bfilter)
        bfilter = new_bfilter
    bfilter.add(key)
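# A hedged sketch of the matching membership test for this scalable filter: an
# element may live in any generation, so every subfilter has to be consulted.
# The newest filters are checked first on the guess that recent keys are queried
# most often; BloomFilter supporting the `in` operator is an assumption here.
def __contains__(self, key):
    return any(key in bfilter for bfilter in reversed(self.sbfilters))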
def process_one(pipe12):
    pid = 0
    counter = BloomFilter(n, p, 0)
    counter = event(pid, counter, 'b')
    counter = send_message(pipe12, pid, counter, 'c')
    counter = event(pid, counter, 'd')
    counter = recv_message(pipe12, pid, counter, 'e')
    counter = event(pid, counter, 'f')
    print_history(counter, pid)
def Create_and_fillin_BF(inputList, maxElement, Error):
    myBF = BloomFilter(max_elements=maxElement, error_rate=Error)
    for element in inputList:
        if element not in myBF:
            myBF.add(element)
        else:
            print("Element {} is already in the BF".format(element))
    print('*********** Bloom Filter is created and filled-in ***************')
    return myBF
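# Example call (the word list is illustrative); the duplicate 'beta' triggers the
# "already in the BF" message during the fill:
words = ['alpha', 'beta', 'gamma', 'beta']
bf = Create_and_fillin_BF(words, maxElement=1000, Error=0.01)
print('beta' in bf)  # True (probably - bloom filters can report false positives)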
def general_test(self):
    # Set the size of the bit vector.
    bit_vector_size = 20
    # Words to be added.
    words_saved = [
        "this", "nonsense", "senior", "story", "jokes", "a", "young", "a",
        "one", "to", "impress", "dev", "with", "trying", "a", "is", "of"
    ]
    # Words that will not be added.
    words_not_saved = [
        "These", "words", "do", "not", "exist", "in", "the", "filter"
    ]
    bloom = BloomFilter(bit_vector_size)
    # Add words to the filter.
    for word in words_saved:
        bloom.add(word)
    number_of_true_positive = 0
    number_of_true_negative = 0
    number_of_false_positive = 0
    number_of_false_negative = 0
    # Check all the added words.
    for word in words_saved:
        # If the word is found then we have a true positive result.
        if bloom.search(word):
            number_of_true_positive += 1
        # If the word is not found then there is a bug in the implementation.
        else:
            number_of_false_negative += 1
    # Check all the non-added words.
    for word in words_not_saved:
        # If the word is found then we have a false positive result.
        if bloom.search(word):
            number_of_false_positive += 1
        # If the word is not found then we have a true negative result.
        else:
            number_of_true_negative += 1
    print("Number of true positive results: ", number_of_true_positive)
    print("Number of true negative results: ", number_of_true_negative)
    print("Number of false positive results: ", number_of_false_positive)
    print("Number of false negative results: ", number_of_false_negative)
    # A bloom filter should never return a false negative; if it does, raise an error.
    if number_of_false_negative != 0:
        logging.error("general_test: FAIL. Bloom filter returned a false negative.")
        return
    logging.info("general_test: PASS")
def __init__(self, env, feature_transformer):
    self.env = env
    self.models = {}
    self.feature_transformer = feature_transformer
    for a in env.actions_available:
        self.models[a] = PassiveAggressiveRegressor(C=1.0, fit_intercept=True,
                                                    shuffle=False)
    self.bloom_states = BloomFilter(max_elements=256**2)
    self.nonseen_states = 0
def __init__(self, *args, **kwargs):
    # Root URL of the site to crawl
    self.base_url = 'http://ggzyxx.deyang.gov.cn/'
    super(DeyangSpider, self).__init__(*args, **kwargs)
    self.bloom_filter = BloomFilter(max_elements=1000000, error_rate=0.1,
                                    filename='bf.data')
    self.num = 0
    self.scrawl_mode = ScrawlMode.HISTORY
    self._stop_parse = False