def add(self, key):
    """Adds a key to this bloom filter.
    If the key already exists in this filter it will return True.
    Otherwise False.

    >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> b.add("hello")
    False
    >>> b.add("hello")
    True

    """
    if key in self:
        return True
    if not self.filters:
        filter = BloomFilter(
            capacity=self.initial_capacity,
            error_rate=self.error_rate * (1.0 - self.ratio))
        self.filters.append(filter)
    else:
        filter = self.filters[-1]
        if filter.count >= filter.capacity:
            filter = BloomFilter(
                capacity=filter.capacity * self.scale,
                error_rate=filter.error_rate * self.ratio)
            self.filters.append(filter)
    filter.add(key, skip_check=True)
    return False
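# A short usage sketch for the method above, assuming the surrounding class is
# pybloom's ScalableBloomFilter: whenever the newest internal BloomFilter
# reaches its capacity, add() appends a larger one (capacity * scale), so the
# set keeps growing without rehashing the keys already stored.
from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for i in range(1000):
    sbf.add('key-%d' % i)
print(len(sbf.filters))  # > 1: several internal filters were created
print('key-42' in sbf)   # True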
def __init__(self, url, charset=None, headers=None, response_handle=None,
             timeout=3, retry_times=30, load_wait=None, execute_js=None,
             execute_js_wait=None, retry_delta=3, http_proxy_url=None,
             force=False):
    '''
    url              target URL
    charset          character encoding
    headers          custom request headers, dict
    response_handle  callback applied to the fetched result
    timeout          timeout in seconds, int, e.g. 3
    retry_times      number of retries, int, e.g. 3
    load_wait        seconds to wait after the page loads
    execute_js       JavaScript to run once the page has loaded
    execute_js_wait  seconds to wait after running the JavaScript
    retry_delta      seconds between retries on error, int
    http_proxy_url   proxy address, e.g. "http://192.168.1.1:80"
    force            crawl the URL even if it was crawled before
    '''
    if not PhantomjsSpider._url_buff:
        PhantomjsSpider._url_buff = [BloomFilter(1000000)]
    global _queue
    _hash = md5(url)
    self.url = url
    self.timeout = timeout
    self.retry_times = retry_times
    self.retry_delta = retry_delta
    self.response_handle = response_handle
    self.charset = charset
    self.headers = headers
    self.execute_js = execute_js
    self.execute_js_wait = execute_js_wait
    self.load_wait = load_wait
    self.proxy = http_proxy_url
    if not force:
        try:
            # a hit in any existing filter means the URL was crawled already
            for bloomfilter in PhantomjsSpider._url_buff:
                assert _hash not in bloomfilter
        except AssertionError:
            pass
        else:
            try:
                PhantomjsSpider._url_buff[-1].add(_hash)
            except IndexError:
                # the newest filter is at capacity: grow by 1,000,000 slots
                PhantomjsSpider._url_buff.append(
                    BloomFilter(PhantomjsSpider._url_buff[-1].capacity + 1000000))
                PhantomjsSpider._url_buff[-1].add(_hash)
            _queue.put(self._go)
    else:
        _queue.put(self._go)
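# A minimal sketch of the dedup pattern used above, assuming pybloom's
# BloomFilter, whose add() raises IndexError once the filter is at capacity.
# A list of filters then behaves like one set that grows in 1,000,000-key
# steps (the same pattern reappears in the Spider.__init__ further below).
from pybloom import BloomFilter

url_buff = [BloomFilter(1000000)]

def seen_before(h):
    """Record hash h; return True if it was already recorded."""
    if any(h in bf for bf in url_buff):
        return True
    try:
        url_buff[-1].add(h)
    except IndexError:  # newest filter is full, append a bigger one
        url_buff.append(BloomFilter(url_buff[-1].capacity + 1000000))
        url_buff[-1].add(h)
    return False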
def __init__(self, capacity=(1 << 30), error_rate=0.0001):
    self.bloomFilters = []
    self.leaderIndex = 0
    self.followerIndex = 1
    self.bloomFilters.append(BloomFilter(capacity, error_rate))
    self.bloomFilters.append(BloomFilter(capacity, error_rate))
    self.status = 0
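# A hypothetical sketch (not from the source) of how a leader/follower pair
# like the one initialised above is commonly used: inserts go to the leader,
# and when it fills up the stale follower is recycled as the new leader, so
# the set gradually ages out old keys while lookups consult both filters.
from pybloom import BloomFilter

class RotatingBloom(object):
    def __init__(self, capacity=100000, error_rate=0.0001):
        self.capacity = capacity
        self.error_rate = error_rate
        self.filters = [BloomFilter(capacity, error_rate),
                        BloomFilter(capacity, error_rate)]
        self.leader, self.follower = 0, 1

    def add(self, key):
        self.filters[self.leader].add(key)
        if self.filters[self.leader].count >= self.capacity:
            # leader is full: recycle the stale follower as the new leader
            self.filters[self.follower] = BloomFilter(self.capacity,
                                                      self.error_rate)
            self.leader, self.follower = self.follower, self.leader

    def __contains__(self, key):
        return (key in self.filters[self.leader]
                or key in self.filters[self.follower])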
class CrawlBSF:
    request_headers = {
        'host': "www.mafengwo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    cur_level = 0
    max_level = 5
    dir_name = 'iterate/'
    iter_width = 50
    downloaded_urls = []

    du_md5_file_name = dir_name + 'download.txt'
    du_url_file_name = dir_name + 'urls.txt'

    bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01)
    bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01)

    cur_queue = deque()
    child_queue = deque()

    def __init__(self, url):
        self.root_url = url
        self.cur_queue.append(url)
        self.du_file = open(self.du_url_file_name, 'a+')
        try:
            self.dumd5_file = open(self.du_md5_file_name, 'r')
            self.downloaded_urls = self.dumd5_file.readlines()
            self.dumd5_file.close()
            for urlmd5 in self.downloaded_urls:
                self.bloom_downloaded_urls.add(urlmd5[:-2])
        except IOError:
            print "File not found"
        finally:
            self.dumd5_file = open(self.du_md5_file_name, 'a+')

    def enqueueUrl(self, url):
        # enqueue only URLs that are neither queued nor already downloaded
        # (the original referenced a global `crawler`; the class's own
        # bloom_downloaded_urls attribute is what is meant here)
        if url not in self.bloom_url_queue and hashlib.md5(
                url).hexdigest() not in self.bloom_downloaded_urls:
            self.child_queue.append(url)
            self.bloom_url_queue.add(url)

    def dequeuUrl(self):
        try:
            url = self.cur_queue.popleft()
            return url
        except IndexError:
            return None

    def close(self):
        self.dumd5_file.close()
        self.du_file.close()
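# A hypothetical driver loop (not in the source) showing how cur_queue and
# child_queue support breadth-first crawling: the current level is drained,
# newly discovered links collect in child_queue, then the queues swap and the
# level counter advances. fetch_links(url) is an assumed helper returning the
# outgoing links of a page.
def crawl(crawler, fetch_links):
    while crawler.cur_level < crawler.max_level:
        url = crawler.dequeuUrl()
        if url is None:
            # current level exhausted: promote the children to current
            crawler.cur_queue, crawler.child_queue = (
                crawler.child_queue, crawler.cur_queue)
            crawler.cur_level += 1
            continue
        for link in fetch_links(url):
            crawler.enqueueUrl(link)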
def test_intersection_k_fail():
    # note: assumes a module-level import such as
    # `from nose.tools import assert_raises as assertRaises`
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.01)

    def _run():
        new_bloom = bloom_one.intersection(bloom_two)

    assertRaises(ValueError, _run)
def test_intersection_capacity_fail(self):
    bloom_one = BloomFilter(1000, 0.001)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.intersection(bloom_two)

    self.assertRaises(ValueError, _run)
def test_union_k_fail(self):
    bloom_one = BloomFilter(100, 0.01)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.union(bloom_two)

    self.assertRaises(ValueError, _run)
def test_union_capacity_fail():
    # note: assumes a module-level import such as
    # `from nose.tools import assert_raises as assertRaises`
    bloom_one = BloomFilter(1000, 0.001)
    bloom_two = BloomFilter(100, 0.001)

    def _run():
        new_bloom = bloom_one.union(bloom_two)

    assertRaises(ValueError, _run)
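# The four failure tests above pin down pybloom's contract for set
# operations: union() and intersection() require both filters to share the
# same capacity and error_rate (hence the same bit-array layout and hash
# count), otherwise a ValueError is raised. A quick interactive check:
from pybloom import BloomFilter

a = BloomFilter(100, 0.001)
b = BloomFilter(100, 0.001)
a.add('x')
b.add('y')
u = a.union(b)                 # fine: parameters match
print('x' in u and 'y' in u)   # True
try:
    a.union(BloomFilter(200, 0.001))
except ValueError as e:
    print('mismatched filters: %s' % e)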
def __init__(self):
    self.monosyllableMorphemeBf = BloomFilter(capacity=20000,
                                              error_rate=0.0001)
    self.disyllableMorphemeBf = BloomFilter(capacity=50000,
                                            error_rate=0.0001)
    self.multisyllableMorphemeBf = BloomFilter(capacity=500000,
                                               error_rate=0.0001)
    # load the monosyllabic, disyllabic and multisyllabic morpheme tables
    self.load_morphemes()
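# A hypothetical lookup helper (not in the source) showing how the three
# filters above could back a fast morpheme test: dispatching on token length
# keeps each filter small, so each stays within its configured false-positive
# rate.
def is_morpheme(self, token):
    if len(token) == 1:
        return token in self.monosyllableMorphemeBf
    if len(token) == 2:
        return token in self.disyllableMorphemeBf
    return token in self.multisyllableMorphemeBf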
def test_union():
    # note: assumes a module-level import providing assert_,
    # e.g. `from nose.tools import assert_true as assert_`
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]  # 'a'..'z'
    for char in chars[len(chars) / 2:]:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        assert_(char in new_bloom)
def test_union(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range_fn(97, 123)]  # 'a'..'z'
    for char in chars[int(len(chars) / 2):]:
        bloom_one.add(char)
    for char in chars[:int(len(chars) / 2)]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)
    for char in chars:
        self.assertTrue(char in new_bloom)
def __init__(self, url, charset=None, data=None, headers=None, timeout=3,
             retry_times=30, retry_delta=3, http_proxy=None, force=False):
    '''
    url          target URL
    charset      character encoding
    data         POST payload, string
    headers      custom request headers, dict
    timeout      timeout in seconds, int, e.g. 3
    retry_times  number of retries, int, e.g. 3
    retry_delta  seconds between retries, int
    http_proxy   proxy address such as "192.168.1.1:3128", or a callable
                 like lambda: "192.168.1.1:3128"
    force        crawl the URL even if it was crawled before
    '''
    self.url = url
    self.data = data
    self.timeout = timeout
    self.retry_times = retry_times
    self.retry_delta = retry_delta
    self.charset = charset
    self.headers = headers
    self.http_proxy = http_proxy
    if not Spider._url_buff:
        Spider._url_buff = [BloomFilter(1000000)]
    global _queue
    if data:
        _hash = md5(url) + md5(data)
    else:
        _hash = md5(url)
    if not force:
        try:
            # a hit in any existing filter means the URL was crawled already
            for bloomfilter in Spider._url_buff:
                assert _hash not in bloomfilter
        except AssertionError:
            pass
        else:
            try:
                Spider._url_buff[-1].add(_hash)
            except IndexError:
                # the newest filter is at capacity: grow by 1,000,000 slots
                Spider._url_buff.append(
                    BloomFilter(Spider._url_buff[-1].capacity + 1000000))
                Spider._url_buff[-1].add(_hash)
            _queue.put_priority(self.__dict__, 0)
    else:
        _queue.put_priority(self.__dict__, 0)
def test_intersection(self):
    bloom_one = BloomFilter(100, 0.001)
    bloom_two = BloomFilter(100, 0.001)
    chars = [chr(i) for i in range(97, 123)]  # 'a'..'z'
    for char in chars:
        bloom_one.add(char)
    for char in chars[:len(chars) / 2]:
        bloom_two.add(char)
    new_bloom = bloom_one.intersection(bloom_two)
    for char in chars[:len(chars) / 2]:
        self.assert_(char in new_bloom)
    for char in chars[len(chars) / 2:]:
        self.assert_(char not in new_bloom)
def __init__(self, cachefile, capacity=1000000, error_rate=0.001):
    self.cachefile = cachefile
    if os.name == 'nt' or not cachefile:
        # pure-Python pybloom on Windows (or when no cache file is given);
        # fromfile() reads a binary dump, so open in 'rb' mode
        from pybloom import BloomFilter
        if self.cache():
            with open(cachefile, 'rb') as fp:
                self.filter = BloomFilter.fromfile(fp)
        else:
            self.filter = BloomFilter(capacity=capacity,
                                      error_rate=error_rate)
    elif os.name == 'posix':
        # C-backed, mmap-persisted pybloomfilter on POSIX systems
        from pybloomfilter import BloomFilter
        if self.cache():
            self.filter = BloomFilter.open(self.cachefile)
        else:
            self.filter = BloomFilter(capacity, error_rate, cachefile)
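# A minimal usage sketch, assuming the class above is named UrlCache and that
# its cache() method reports whether self.cachefile already exists on disk.
# The os.name branch is invisible to callers: both backends expose add() and
# the `in` operator on self.filter.
cache = UrlCache('seen_urls.bloom')
if 'http://example.com/' not in cache.filter:
    cache.filter.add('http://example.com/')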
def start():
    res = request_get(biqukan_url)
    index = BeautifulSoup(res, features=features)
    if os.path.exists(bf_file):
        LOG.info('loading bloom filter from file')
        bf = BloomFilter.fromfile(open(bf_file, 'rb'))
    else:
        LOG.info('creating new bloom filter')
        bf = BloomFilter(500000)
    try:
        pool = Pool(size=pool_size)
        # the Chinese literals below are section titles on the target site
        book_urls = find_wanben()
        book_urls += find_new_storage_block(index)
        book_urls += find_recommend_block(index, u'强力推荐')
        book_urls += find_type_block(index, u'玄幻小说')
        book_urls += find_type_block(index, u'修真小说')
        book_urls += find_type_block(index, u'都市小说')
        book_urls += find_type_block(index, u'穿越小说')
        book_urls += find_type_block(index, u'网游小说')
        book_urls += find_type_block(index, u'科幻小说')
        book_urls += find_new_update_block(index)
        book_num = len(book_urls)
        for i, url in enumerate(book_urls):
            pool.spawn(download_book, url, bf)
            # download_book(url, bf)
            LOG.info(u'started download %s, %s books left',
                     i + 1, book_num - i - 1)
        pool.join()
        LOG.info(u'all downloads finished')
    except Exception as e:
        LOG.exception(e)
    finally:
        bf.tofile(open(bf_file, 'wb'))
def determine_lookup_speed_threshold(self):
    from time import time
    # do each one 5 times
    bf = BloomFilter(capacity=self.bloom_size, error_rate=self.bloom_error)
    count = 1
    repetitions = 5

    self_bf_holder = self.bf
    self.bf = bf
    while True:
        bf.add('andrew_' + str(count))
        bin_faster_count = 0
        for j in xrange(repetitions):
            # Linear scan
            t1 = time()
            self.linear_scan_count('andrew')
            t2 = time()
            linear_time = t2 - t1

            # Binary search
            t1 = time()
            self.binsearch_count('andrew')
            t2 = time()
            bin_time = t2 - t1

            bin_faster_count += int(bin_time < linear_time)

        # stop once binary search wins at least 75% of the repetitions
        if 1. * bin_faster_count / repetitions >= 0.75:
            del bf
            self.bf = self_bf_holder
            return count

        count += 1
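# The routine above finds, empirically, the collection size at which binary
# search starts beating a linear scan. A generic, self-contained version of
# the same idea (with hypothetical probe functions, and timeit for steadier
# numbers) might look like this:
import timeit

def crossover_size(make_input, slow_fn, fast_fn, start=1, limit=1 << 20):
    """Return the first input size where fast_fn outruns slow_fn."""
    n = start
    while n < limit:
        data = make_input(n)
        slow = timeit.timeit(lambda: slow_fn(data), number=5)
        fast = timeit.timeit(lambda: fast_fn(data), number=5)
        if fast < slow:
            return n
        n *= 2  # grow geometrically instead of one element at a time
    return limit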
def ingest_payloads(filename):
    stats = {
        "days_per_user": {},
        "users_per_versions": {},
        "num_data_days": 0,
        "ignored_users": set(),
        "num_payloads": 0,
        "ignored_submissions": 0,
        "duplicate_submissions": 0,
        "duplicate_submission_interest": 0,
    }
    bloom_filter = BloomFilter(capacity=10000000, error_rate=0.001)

    with open(filename, "r") as infile:
        db = SQLBackend.instance()
        session = db.get_session()
        session.begin(subtransactions=True)
        for line in infile:
            payload = json.loads(line)
            try:
                ingest_payload(payload, session, stats, bloom_filter)
            except Exception:
                # a bad payload rolls back its own changes; the rest continue
                session.rollback()
        session.commit()

    sys.stdout.write("===== Payload ingestion =====")
    sys.stdout.flush()
    print "\n"
    return stats
def __init__(self, domain, first_url=None, first_url_callback=None,
             first_url_follow=True, url_amount=100000,
             requests_session=None, tls=False, max_depth=100):
    self.domain = domain
    self.max_depth = max_depth
    self.bf = BloomFilter(capacity=url_amount, error_rate=1 / url_amount)
    self.url_queue = queue.Queue()
    if first_url is None:
        if tls:
            first_url = 'https://' + domain
        else:
            first_url = 'http://' + domain
    self.url_queue.put(
        (first_url, first_url_callback, first_url_follow, 0))
    if requests_session is None:
        self.session = requests.Session()
    else:
        self.session = requests_session
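# A hypothetical worker loop (not in the source) consuming the queue seeded
# above: self.bf deduplicates URLs so each is fetched at most once, and the
# depth counter carried in each tuple enforces max_depth. extract_links is an
# assumed helper returning same-domain links from a response.
def run(self):
    while not self.url_queue.empty():
        url, callback, follow, depth = self.url_queue.get()
        if url in self.bf or depth > self.max_depth:
            continue
        self.bf.add(url)
        resp = self.session.get(url)
        if callback is not None:
            callback(resp)
        if follow:
            for link in extract_links(resp, self.domain):
                self.url_queue.put((link, callback, follow, depth + 1))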
def __init__(self):
    # mail
    # self.mailer = MailSender.from_settings(settings)

    # mongo
    self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                          settings['MONGODB_PORT'])
    db = self.connection[settings['MONGODB_DB']]
    self.collection = db[settings['MONGODB_COLLECTION']]
    self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"]

    # bloom file
    filename = settings['MONGODB_COLLECTION'] + ".blm"

    # pybloom: size the filter 10% above the expected number of records
    num = (int(settings['CrawlCar_Num']) + self.collection.count()) * 1.1
    self.df = BloomFilter(capacity=num, error_rate=0.01)

    # read hashes back from the .blm sidecar file if it exists,
    # otherwise seed the filter (and the file) from the collection
    isexists = os.path.exists(filename)
    self.fa = open(filename, "a")
    if isexists:
        fr = open(filename, "r")
        lines = fr.readlines()
        for line in lines:
            line = line.strip('\n')
            self.df.add(line)
        fr.close()
    else:
        for i in self.collection.find():
            if "status" in i.keys():
                item = i["status"]
                item = md5(item).hexdigest()
                self.df.add(item)
                self.fa.writelines(item + '\n')

    # count
    self.counts = 0
def estimate_overlap(source_files, target_files, gran='word', n=8,
                     capacity=10000, error_rate=1e-5, header=0,
                     interval=100000):
    """
    Estimate overlap of target_files with source_files using n-grams.

    gran: granularity of the token. It can be 'word' or 'char'
    header: number of lines of each file to skip. In our format the first
        line is the url.
    """
    if gran not in ('word', 'char'):
        raise ValueError("gran has to be 'word' or 'char'")

    if isinstance(source_files, str):
        source_files = [source_files]
    if isinstance(target_files, str):
        target_files = [target_files]

    bf = BloomFilter(capacity=capacity, error_rate=error_rate)
    for source_file in source_files:
        bf = build_ngram(file=source_file, bf=bf, gran=gran, n=n,
                         uncase=True, alphanumeric=True, interval=interval)

    results = []
    for file in target_files:
        print(file)
        # pass the caller's n through (the original hard-coded n=8 here)
        results.append(
            estimate_overlap_bf(bf, file, gran=gran, n=n, header=header))
    return results
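# A toy, self-contained illustration of the estimation idea above (the real
# build_ngram / estimate_overlap_bf helpers are not shown in the source):
# hash every source n-gram into a bloom filter, then report the fraction of
# target n-grams that hit it.
from pybloom import BloomFilter

def word_ngrams(text, n):
    words = text.lower().split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

source = "the quick brown fox jumps over the lazy dog"
target = "a quick brown fox leaps over a sleepy dog"

bf = BloomFilter(capacity=1000, error_rate=1e-5)
for gram in word_ngrams(source, 2):
    bf.add(gram)

hits = sum(1 for gram in word_ngrams(target, 2) if gram in bf)
total = len(word_ngrams(target, 2))
print('estimated overlap: %.2f' % (hits / float(total)))  # 0.25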
def open_spider(self, spider):
    brandName = 'mybloom'
    isexists = os.path.exists(brandName + '.blm')
    if isexists:
        self.bf = BloomFilter.fromfile(open(brandName + '.blm', 'rb'))
    else:
        self.bf = BloomFilter(100000, 0.001)
def build_bloom_filter_and_iblt(m, include_value_in_iblt=False):
    c = 8 * math.pow(math.log(2), 2)
    tau = 16.5
    n = len(selected_txs)
    alpha = n / (c * tau)
    # print(alpha * tau)
    if m <= n:
        fpr = 0.1
    else:
        # Graphene-style FPR of a / (m - n); the original wrote
        # `alpha / m - n`, which precedence turns into (alpha / m) - n
        fpr = alpha / (m - n)
    print("Mempool difference", abs(m - n))
    n_cells = int((4 / 3) * abs(m - n)) + 30
    print('n_cells', n_cells)
    logging.info("Calculated FPR: %f" % fpr)
    fpr = 0.1  # NOTE: overrides the calculated FPR with a fixed value
    b = BloomFilter(capacity=n, error_rate=fpr)
    i = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in selected_txs:
        b.add(tx['hash'])
        v = ''
        if include_value_in_iblt:
            v = tx_to_bytes(tx)
        i.insert(tx['hash'], v)
    return b, i
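# A heavily hedged sketch of the receiving side of this Graphene-style
# exchange (the IBLT methods used here are assumptions, not confirmed by the
# source): the receiver passes its own mempool through the bloom filter,
# inserts the survivors into a local IBLT of the same shape, and peels the
# symmetric difference from the two IBLTs.
def reconcile(b, i, my_mempool, n_cells):
    candidates = [tx for tx in my_mempool if tx['hash'] in b]
    local = IBLT(m=n_cells, k=3, key_size=32, value_size=0)
    for tx in candidates:
        local.insert(tx['hash'], '')
    diff = i.subtract(local)    # assumed IBLT API
    return diff.list_entries()  # assumed IBLT API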
def returnItemsWithMinSupportV3(itemSet, lenItem, transactionList, minSupport,
                                freqSet):
    _itemSet = set()
    localSet = defaultdict(int)
    if len(itemSet):
        filterCdd = BloomFilter(capacity=len(itemSet), error_rate=0.0001)
    else:
        print("As I say, ValueError: Capacity must be > 0")
        return set([])

    print("Store cdds in BF ... - %s" % getTime())
    for val in itemSet:
        # TODO: use a counting BF and skip inserting candidates that already
        # reached minSupport * len(transactionList); or, without a counting
        # BF, skip re-inserting candidates already present in the filter.
        filterCdd.add(val)

    print("Mapping cddFromTrans on BF ... - %s" % getTime())
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if cdd in filterCdd:
                freqSet[cdd] += 1    # global count
                localSet[cdd] += 1   # local count, filtered by minSupport below

    print("Filter cdds that less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
def user_init():
    users = BloomFilter(10000000, 0.001)
    # path kept verbatim from the source (a local data file of user ids)
    with open(u"D:/工作/数据美化/data/简书用户id1.txt") as f:
        for line in f:
            users.add(line.strip())
    return users
def spider_update_Init(dbname, website, carnum):
    # Mongo setting
    # spider_original_Init(dbname, website, carnum)

    # Mongo connection
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    dbdata = connection[dbname]
    collectiondata = dbdata[website]

    # pybloom: size the filter 10% above the expected number of records
    num = (int(carnum) + collectiondata.count()) * 1.1
    df = BloomFilter(capacity=num, error_rate=0.01)

    # collect each distinct, unsold URL exactly once
    urllist = []
    for i in collectiondata.find():
        if "url" in i.keys():
            item = i["url"]
            if "status" in i.keys():
                if i['status'].find('sold') != -1:  # skip cars already sold
                    continue
            itemmd5 = md5(item).hexdigest()
            # add() returns True when the hash was already present
            returndf = df.add(itemmd5)
            if not returndf:
                urllist.append(item)
    connection.close()
    return urllist
def __init__(self, password_file=pw_file, fp_rate=0.001, ignore_case=True):
    self._log = logging.getLogger('passcheck.passcheck.PassCheck')
    self._fp_rate = fp_rate
    self._pw_file = os.path.realpath(password_file)
    self._ignore_case = ignore_case

    self._log.debug('Counting items in password file')
    with open(self._pw_file, 'r') as f:
        for line_num, line in enumerate(f):
            pass
    self._num_passwords = line_num + 1

    self._log.debug('Creating BloomFilter with capacity=%d' %
                    self._num_passwords)
    self._bf = BloomFilter(capacity=self._num_passwords,
                           error_rate=self._fp_rate)

    self._log.debug('Loading passwords into BloomFilter')
    num_added = 0
    with open(self._pw_file, 'r') as f:
        for line in f:
            pw = line[:-1]  # strip the trailing newline
            if self._ignore_case:
                pw = pw.lower()
            if not self._bf.add(pw):  # add() returns False for new keys
                num_added += 1
            if num_added > self._num_passwords:
                e = Exception('Password file was modified during load')
                self._log.error(e)
                raise e

    # Handle possibility of duplicates (especially if case is ignored)
    if num_added < self._num_passwords:
        self._log.warn('Expected %d passwords, but added %d' %
                       (self._num_passwords, num_added))
        self._num_passwords = num_added
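# A short usage sketch, assuming the class above is PassCheck (only its
# __init__ appears in the source), querying its internal filter directly.
# Bloom filters have no false negatives, so a miss is a definitive "not in
# the list"; a hit may be a false positive at roughly the configured fp_rate.
pc = PassCheck(password_file='rockyou.txt', fp_rate=0.001)
candidate = 'hunter2'
if candidate.lower() in pc._bf:
    print('password appears in the list (or is a rare false positive)')
else:
    print('password is definitely not in the list')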
def global_rebuild(self):
    # print "Lookup"
    # for e in sbf.lookup:
    #     print e
    opt_m = self._global_optimal(self.num_filter, self.total_bits,
                                 self.num_insert, self.lookup)
    self.bf = []
    for i, m in enumerate(opt_m):
        if m > 0:
            self.bf.append(BloomFilter(int(m), len(self.mapping_table[i])))
        else:
            self.bf.append(None)

    # Clear counters or not
    # self.lookup = [0]*self.num_filter

    # Re-insert all keys into the sbf
    for i, m in enumerate(self.mapping_table):
        for n in m:
            if self.bf[i] is not None:
                if not is_number:
                    self.bf[i].add(padding_zero(n, item_len))
                else:
                    self.bf[i].add(int(n))

    self.target_fpp = [
        b.target_fpp if b is not None else 1.0 for b in self.bf
    ]
    self.old_fr = [float(l) / total_lookup for l in self.lookup]
def __init__(self, lines, estimated_lines, dup_proportion, truncate):
    super().__init__()
    estimated_dups = estimated_lines * dup_proportion
    self.truncate = truncate
    self.potential = BloomFilter(capacity=estimated_dups, error_rate=0.001)
    self.seen = set()
    self._find_collisions(lines, estimated_lines)
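# A hypothetical _find_collisions (only its call appears in the source): pass
# every line through the bloom filter; anything the filter claims to have
# seen is a *potential* duplicate and goes into the exact set self.seen,
# which later passes can consult without false positives. Note the filter
# above is sized for the expected duplicates, so a faithful reconstruction
# may need a larger capacity or a different insertion rule.
def _find_collisions(self, lines, estimated_lines):
    for line in lines:
        key = line[:self.truncate] if self.truncate else line
        if key in self.potential:
            self.seen.add(key)  # probable duplicate, remembered exactly
        else:
            self.potential.add(key)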
def main(argv):
    error_rate = 0.01  # assumed default; overridden by the first CLI argument
    if argv:
        error_rate = float(argv[0])
        print "[BUILDING] Using error-rate: {}".format(error_rate)
    if os.path.isfile(nsrl_path):
        print "[BUILDING] Reading in NSRL Database"
        with open(nsrl_path) as f_line:
            # Strip off header
            _ = f_line.readline()
            print "[BUILDING] Calculating number of hashes in NSRL..."
            num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            print "[BUILDING] There are %s hashes in the NSRL Database" % num_lines
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print "[BUILDING] Creating bloomfilter"
            bf = BloomFilter(num_lines, error_rate)
            print "[BUILDING] Inserting hashes into bloomfilter"
            for line in f_nsrl:
                md5_hash = line.split(",")[1].strip('"')
                if md5_hash:
                    try:
                        md5 = binascii.unhexlify(md5_hash)
                        bf.add(md5)
                    except Exception as e:
                        print "[ERROR] %s" % e
            print "[BUILDING] NSRL bloomfilter contains {} items.".format(
                len(bf))
            with open('nsrl.bloom', 'wb') as nb:
                bf.tofile(nb)
            print "[BUILDING] Complete"
    else:
        print "[ERROR] No such file or directory: %s" % nsrl_path
    return
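# A complementary lookup sketch, assuming pybloom's BloomFilter wrote the
# nsrl.bloom file above: load it back and test an MD5. The digest must be
# unhexlified to raw bytes exactly as during the build.
import binascii
from pybloom import BloomFilter

with open('nsrl.bloom', 'rb') as f:
    bf = BloomFilter.fromfile(f)

md5_hex = 'd41d8cd98f00b204e9800998ecf8427e'  # MD5 of the empty string
if binascii.unhexlify(md5_hex) in bf:
    print('hash is probably in the NSRL set')
else:
    print('hash is definitely not in the NSRL set')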
def feedBloom(row):
    f = BloomFilter(capacity=200, error_rate=0.6)
    f.add(row.src_ip)
    f.add(row.src_ip[0:5])
    f.add(row.src_ip[5:8])
    f.add(row.target_ip)
    # np.int is removed in recent NumPy; the builtin int works everywhere
    return np.array(f.bitarray.tolist(), dtype=int)
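# The function above uses a tiny bloom filter as a fixed-width feature
# hasher: each row's IP fragments set a few bits, and the raw bit array
# becomes a binary feature vector of constant length. A hypothetical row
# type to exercise it:
import collections

Row = collections.namedtuple('Row', ['src_ip', 'target_ip'])
vec = feedBloom(Row(src_ip='192.168.1.10', target_ip='10.0.0.7'))
print(vec.shape)  # fixed length, set by the filter's capacity/error_rate
print(vec.sum())  # number of bits set by the four add() calls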