class Sava_key(Thread):
    def __init__(self, save_queue, filename):
        super(Sava_key, self).__init__()
        self.save_queue = save_queue
        self.filename = filename
        self.boom = BloomFilter(capacity=1e7, error_rate=0.001)

    def run(self):
        while True:
            try:
                kw = self.save_queue.get()
                # First filter: the keyword must contain '实验室' (laboratory)
                if '实验室' in kw:
                    # Second filter: drop unwanted words
                    keywords = filter_key.clean_key(kw)
                    if keywords is None:
                        continue
                    else:
                        # Skip keywords that have already been saved
                        if keywords in self.boom:
                            continue
                        self.boom.add(keywords)
                        print('New keyword: {}'.format(keywords))
                        self.sava_txt(keywords)
            finally:
                self.save_queue.task_done()

    def sava_txt(self, kw):
        with open(self.filename, 'a', encoding='utf-8') as fs:
            fs.write('{}\n'.format(kw))
class BloomData:
    capacity = 50
    error_rate = 0.2

    def __init__(self, capacity=50, error_rate=0.2):
        """Hold a Bloom filter together with the true values it contains.

        :param capacity: Bloom filter capacity
        :param error_rate: acceptable false-positive rate
        """
        self.capacity = capacity
        self.error_rate = error_rate
        self.bloom = BloomFilter(capacity=self.capacity, error_rate=self.error_rate)
        self.Ture_Data = []

    def AddDataToBloom(self, new_data, key=0):
        """Add data to the Bloom filter.

        :param new_data: data to add
        :param key: mode 1 merges another BloomData instance; mode 0 adds a single value
        :return: number of true values now tracked
        """
        if key == 0:
            self.Ture_Data.append(new_data)
            self.bloom.add(new_data)
        if key == 1:
            # BloomFilter.union() returns a new filter rather than modifying in place
            self.bloom = self.bloom.union(new_data.bloom)
            self.Ture_Data.extend(new_data.Ture_Data)
        return len(self.Ture_Data)
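# --- Usage sketch for BloomData (illustrative values only; assumes the pybloom-style
# --- BloomFilter above, whose union() requires both filters to share capacity/error_rate):
a = BloomData()
b = BloomData()
a.AddDataToBloom('url-1')            # key=0: add a single value
b.AddDataToBloom('url-2')
total = a.AddDataToBloom(b, key=1)   # key=1: merge b's filter and true values into a
print(total)                         # -> 2
print('url-2' in a.bloom)            # -> True (membership subject to the configured error_rate)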
class TrainValBloom:
    def __init__(
        self,
        train_row_nrs: List[int],
        chosen_feats: List[int],
        full_dataset_size: int,
        val_row_nrs: List[int] = None,
    ):
        self.train_bloom = BloomFilter(capacity=len(train_row_nrs), error_rate=0.01)
        self.full_dataset_size = full_dataset_size
        self.subset_size = len(train_row_nrs)
        self.chosen_feats = chosen_feats
        all_row_nrs = list(range(0, full_dataset_size))
        if val_row_nrs is None:
            val_row_nrs = [x for x in all_row_nrs if x not in train_row_nrs]
        self.val_bloom = BloomFilter(capacity=len(val_row_nrs), error_rate=0.01)
        for row_nr in train_row_nrs:
            self.train_bloom.add(row_nr)
        for row_nr in val_row_nrs:
            self.val_bloom.add(row_nr)

    def into_dataset(self, train_set: "TrainSet", save_bloom: bool) -> "TrainValSet":
        train_inds, val_inds = self.into_indices()
        if save_bloom:
            train_val_bloom = self
        else:
            train_val_bloom = None
        return TrainValSet(
            train_set.x_train[np.ix_(train_inds, self.chosen_feats)],
            train_set.y_train[train_inds],
            train_set.x_train[np.ix_(val_inds, self.chosen_feats)],
            train_set.y_train[val_inds],
            train_val_bloom,
        )

    def into_indices(self) -> (List[int], List[int]):
        train_inds = np.zeros(self.subset_size)
        val_inds = np.zeros(self.full_dataset_size - self.subset_size)
        train_cntr = 0
        val_cntr = 0
        for i in range(0, self.full_dataset_size):
            if i in self.train_bloom and train_cntr < self.subset_size:
                train_inds[train_cntr] = i
                train_cntr += 1
            else:
                val_inds[val_cntr] = i
                val_cntr += 1
        val_inds = np.trim_zeros(val_inds, "b")  # 'b' = trim only from the back
        train_inds_lst: List[int] = train_inds.astype(int).tolist()
        val_inds_lst: List[int] = val_inds.astype(int).tolist()
        return train_inds_lst, val_inds_lst
class BloomCheckFunction(object):
    def __init__(self):
        self.filename = 'bloomFilter.blm'
        is_exist = os.path.exists(self.filename)
        if is_exist:
            self.bf = BloomFilter.fromfile(open(self.filename, 'rb'))
        else:
            self.bf = BloomFilter(100000000, 0.001)

    def process_item(self, data):
        data_encode_md5 = hashlib.md5(
            data.encode(encoding='utf-8')).hexdigest()
        if data_encode_md5 in self.bf:
            # Content has not changed: drop the item
            self.save_bloom_file()
            return False
        else:
            self.bf.add(data_encode_md5)
            self.save_bloom_file()
            return True

    def save_bloom_file(self):
        with open(self.filename, 'wb') as f:
            self.bf.tofile(f)
class Related_Key(Thread, Downloader):
    def __init__(self, key_queue, save_queue):
        super(Related_Key, self).__init__()
        self.key_queue = key_queue    # crawl queue
        self.save_queue = save_queue  # save queue
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedup filter

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()
                # Dedup the crawl: skip keywords already seen, otherwise record and fetch them
                if kw in self.bloom:
                    continue
                self.bloom.add(kw)
                # Download the page source
                source = self.download(kw)
                if source is None:
                    continue
                self.parse_html(source)
                time.sleep(0.5)  # pause 0.5s after each keyword
            finally:
                self.key_queue.task_done()

    def parse_html(self, html):
        ele = etree.HTML(html)
        keyList = ele.xpath('//table//tr//th/a/text()')
        for key in keyList:
            self.key_queue.put(key)   # feed the crawl queue
            self.save_queue.put(key)  # feed the save queue
def train_bloom_filter(elements, error_rate=0.001):
    elements = set(elements)
    bf = BloomFilter(capacity=len(elements), error_rate=error_rate)
    for element in elements:
        bf.add(element)
    return bf
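# --- Usage sketch for train_bloom_filter (hypothetical data; assumes the pybloom-style
# --- BloomFilter used throughout this collection):
words = ['alpha', 'beta', 'gamma', 'beta']        # duplicates are collapsed by set()
bf = train_bloom_filter(words, error_rate=0.01)
print('alpha' in bf)    # True: no false negatives for inserted elements
print('delta' in bf)    # usually False; True only with probability ~error_rate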
class Filter_key(Thread):
    def __init__(self, save_queue, contain, filename):
        super().__init__()
        self.save_queue = save_queue  # save queue
        self.contain = contain        # required substrings
        self.filename = filename      # output file name
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)

    def run(self):
        while True:
            wd = self.save_queue.get()
            # Check whether the keyword contains one of the required substrings
            for con in self.contain:
                if con in wd:
                    # Only keep keywords longer than 10 characters
                    if len(wd) > 10:
                        # Deduplicate keywords that pass the substring check
                        if wd in self.bloom:
                            continue
                        self.bloom.add(wd)
                        print('New keyword: {}'.format(wd))
                        self.save_file(wd)
            self.save_queue.task_done()

    # Save to file
    def save_file(self, wd):
        with open(self.filename, mode='a', encoding='utf-8') as f:
            f.write('{}\n'.format(wd))
class MyBloomUtil:
    def __init__(self, bloom_name):
        bloom_dir = './bf'
        if not os.path.exists(bloom_dir):
            os.makedirs(bloom_dir)
        self.bloom_path = '%s/%s.blm' % (bloom_dir, bloom_name)
        is_exist = os.path.exists(self.bloom_path)
        if is_exist:
            self.bf = BloomFilter.fromfile(open(self.bloom_path, 'rb'))
        else:
            self.bf = BloomFilter(20000, 0.001)

    def is_exists(self, item):
        if item in self.bf:
            print('%s is already in bloom_filter.' % item)
            return True
        return False

    def add_in_bf(self, item):
        print('add %s' % item)
        self.bf.add(item)
        self.bf.tofile(open(self.bloom_path, 'wb'))

    def process_item(self, item):
        if item in self.bf:
            logger.info('[%s] is already in bloom.' % item)
            return None
        else:
            logger.info('add [%s]' % item)
            self.bf.add(item)
            self.bf.tofile(open(self.bloom_path, 'wb'))
            return item
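# --- Usage sketch for MyBloomUtil (hypothetical values): process_item() returns the item
# --- the first time it is seen and None afterwards, persisting the filter to ./bf/<name>.blm.
util = MyBloomUtil('seen_urls')
print(util.process_item('http://example.com/a'))   # -> 'http://example.com/a' (new item)
print(util.process_item('http://example.com/a'))   # -> None (already in the filter)
print(util.is_exists('http://example.com/a'))      # -> True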
def create_DB(self, path_DB):
    db = []
    for j, filepath in enumerate(
            natsort.natsorted(os.listdir(path_DB))[:self.n]):
        dbBloom = BloomFilter(self.m, self.q)
        self.k = dbBloom.num_slices
        f = open('{}/{}'.format(path_DB, filepath), 'r')
        print(" DEBUG: Reading patient file {:3d} from: {}/{}".format(
            j, path_DB, filepath))
        for i, line in enumerate(f):
            # '#'-lines might result in too few added snps
            if not line.startswith('#'):
                snp = line.strip().split('\t')
                try:
                    dbBloom.add(snp[0] + snp[1] + snp[3] + snp[4])
                except:
                    pass
            if i + 1 >= self.m:
                break
        db.append(dbBloom.bitarray)
        f.close()
    # Update n (needs to be done since len(db) could be smaller than the specified n)
    self.n = len(db)
    # Reset Bloom filter length (the used library is slightly above the theoretical optimum)
    self.l = len(db[0])
    return db
def test_calculate_error_rate__large_test(self):
    # -- fetch test and train samples
    train, test = self.prepare_test_and_train(2)

    # -- train Bloom filter
    bf = BloomFilter(len(train), error_rate=0.1)
    for word in train:
        bf.add(word)

    assert round(calculate_error_rate(bf, test), 1) == 0.1
def create_query(self, path_QRY):
    queryBloom = BloomFilter(self.m, self.q)
    f = open(path_QRY, 'r')
    for qrySNP in f:
        if not qrySNP.startswith('#'):
            snp = qrySNP.strip().split('\t')
            try:
                queryBloom.add(snp[0] + snp[1] + snp[3] + snp[4])
            except:
                pass
    return 0, queryBloom.bitarray
def writeToFile(outpath, path, size, error):
    bloom = BloomFilter(size, error)
    dest = open(outpath, 'wb')  # bitarray.tofile() requires a binary-mode file
    f = open(path, 'r')
    for line in f:
        if not line.startswith('#'):
            snp = line.strip().split('\t')
            bloom.add(snp[0] + snp[1] + snp[3] + snp[4])
    f.close()
    bloom.bitarray.tofile(dest)
    dest.close()
class Save_key(Thread):
    def __init__(self, save_queue, contain, db_config, filename):
        super(Save_key, self).__init__()
        self.save_queue = save_queue  # save queue
        self.contain = contain        # required substrings
        self.db_config = db_config    # database configuration
        self.filename = filename      # output file name
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedup filter for saved keywords

    def run(self):
        while True:
            wd = self.save_queue.get()
            # First filter: the keyword must contain 'sem' or '竟价'
            if 'sem' in wd or '竟价' in wd:
                # Second filter: drop unwanted words
                keywords = filter_key.clean_key(wd)
                if keywords is None:
                    continue
                else:
                    # Skip keywords that have already been saved
                    if wd in self.bloom:
                        continue
                    else:
                        self.bloom.add(wd)
                        print('New keyword: {}'.format(wd))
                        self.save_file(wd)
            self.save_queue.task_done()

    def save_file(self, wd):
        # Option 1: insert into the database:
        # try:
        #     conn = pymysql.Connect(**self.db_config)
        #     try:
        #         sql = "insert ignore into sys(keywords) values(%s)"
        #         with conn.cursor() as cursor:
        #             cursor.execute(sql, args=(wd))
        #     except pymysql.err.Error as err:
        #         print('Insert failed for keyword: {}, error: {}'.format(wd, err))
        #     else:
        #         conn.commit()
        #     conn.close()
        # except pymysql.err.MySQLError:
        #     print('Failed to connect to the database!')

        # Option 2: write to a local file:
        with open(self.filename, mode='a', encoding='utf-8') as f:
            f.write('{}\n'.format(wd))
def bloom_url(url):
    is_exist = os.path.exists(r'C:\spiders\zhilian_celery\bloom.blm')
    if is_exist:
        bf = BloomFilter.fromfile(
            open(r'C:\spiders\zhilian_celery\bloom.blm', 'rb', buffering=40))
    else:
        bf = BloomFilter(10000000, 0.001)
    if url in bf:
        print(1)
        return 0
    else:
        bf.add(url)
        bf.tofile(open(r'C:\spiders\zhilian_celery\bloom.blm', 'wb'))
        return 1
class Save_key(Thread):
    def __init__(self, save_queue, contain, filename):
        super().__init__()
        self.save_queue = save_queue  # save queue
        self.contain = contain        # required substrings
        self.filename = filename      # output file name
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)

    def run(self):
        while True:
            wd = self.save_queue.get()
            # Check whether the keyword contains one of the required substrings
            for con in self.contain:
                if con in wd:
                    # Only keep keywords longer than 5 characters
                    if len(wd) > 5:
                        # Deduplicate keywords that pass the substring check
                        if wd in self.bloom:
                            continue
                        self.bloom.add(wd)
                        print('New keyword: {}'.format(wd))
                        self.save_file(wd)
            self.save_queue.task_done()
            # Free resources
            gc.collect()

    # Save to file
    def save_file(self, wd):
        # Option: insert into the database instead of a local file:
        # try:
        #     conn = pymysql.Connect(**self.db_config)
        #     try:
        #         sql = "insert ignore into shiyanshi_key(keywords) values(%s)"
        #         with conn.cursor() as cursor:
        #             cursor.execute(sql, args=(wd))
        #     except pymysql.err.Error as err:
        #         print('Insert failed for keyword: {}, error: {}'.format(wd, err))
        #     else:
        #         conn.commit()
        #     conn.close()
        # except pymysql.err.MySQLError:
        #     print('Failed to connect to the database!')
        with open(self.filename, mode='a', encoding='utf-8') as f:
            f.write('{}\n'.format(wd))
        # Free resources
        gc.collect()
def train_bloom_filter():
    # -- training the Bloom filter
    hot_display_names = set()
    with open('./resources/0.xml', 'rb') as f:
        for line in f:
            user = row_to_dict(line)
            hot_display_names.add(user['displayname'])
    bf = BloomFilter(len(hot_display_names), error_rate=0.001)
    for name in hot_display_names:
        bf.add(name)
    with open('./resources/hot_names_bloom_filter', 'wb') as f:
        bf.tofile(f)
    return bf
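# --- Sketch of reloading the persisted filter with BloomFilter.fromfile (the same pybloom-style
# --- API used elsewhere in this collection; the path mirrors the one written above, and the
# --- queried display name is purely illustrative):
with open('./resources/hot_names_bloom_filter', 'rb') as f:
    hot_names_bf = BloomFilter.fromfile(f)
print('some_display_name' in hot_names_bf)   # membership test, subject to the 0.001 error rate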
class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = r'Z:/朱靖/布隆滤波器过滤文件/carpicture/BloomFiltercnki.blm'
        self.bf = None
        self.cap_begin = 0
        self.cap_end = 0
        self.cnt = 0

    def open_spider(self, spider):
        if os.path.exists(self.file_name):
            self.bf = BloomFilter.fromfile(open(self.file_name, 'rb'))
            self.cap_begin = len(self.bf)  # record the initial count when loading the .blm file
            print('open blm file success')
            print('Initial size: %d' % self.cap_begin)
        else:
            self.bf = BloomFilter(100000000, 0.001)
            print('blm file not found, creating a new one')

    def process_item(self, item, spider):
        if item['image_url'] in self.bf:
            print('dropping item %s: already exists' % item['title'])
            raise DropItem('drop an item %s for exists' % item['title'])
        else:
            try:
                self.bf.add(item['image_url'])
                self.cnt += 1
            except Exception as reason:
                print('BloomFilter Error------:%s' % reason)
            # Persist the .blm file after every 10,000 new URLs
            if self.cnt > 10000:
                self.save_blm()
                self.cnt = 0
            return item

    def save_blm(self):
        print('Save Blm File ******')
        self.cap_end = len(self.bf)
        print('Images stored this run: %d' % (self.cap_end - self.cap_begin))
        self.bf.tofile(open(self.file_name, 'wb'))

    def close_spider(self, spider):
        print('close_spider tofile------')
        self.cap_end = len(self.bf)
        print('Images stored this run: %d' % (self.cap_end - self.cap_begin))
        self.bf.tofile(open(self.file_name, 'wb'))
class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = 'Z:/朱靖/布隆滤波器过滤文件/学科网/bloomfilter_xuekew.blm'
        self.bf = None
        self.cap_begin = 0
        self.cap_end = 0
        self.cnt = 0

    def open_spider(self, spider):
        if os.path.exists(self.file_name):
            self.bf = BloomFilter.fromfile(open(self.file_name, 'rb'))
            self.cap_begin = len(self.bf)
            print('open blm file success')
            print('Initial size: %d' % self.cap_begin)
        else:
            self.bf = BloomFilter(100000000, 0.001)
            print('blm file not found')

    def process_item(self, item, spider):
        if item['url'] in self.bf:
            print('dropping item %s: already exists' % item['title'])
            raise DropItem('drop an item %s for exists' % item['title'])
        else:
            try:
                self.bf.add(item['url'])
                self.cnt += 1
            except Exception as reason:
                print("BloomFilter Error------:%s" % reason)
            if self.cnt > 10000:
                self.save_blm()
                self.cnt = 0
            return item

    def save_blm(self):
        print('Save Blm File ******')
        self.cap_end = len(self.bf)
        print('Articles stored this run: %d' % (self.cap_end - self.cap_begin))
        self.bf.tofile(open(self.file_name, 'wb'))

    def close_spider(self, spider):
        print('close spider tofile-------')
        self.cap_end = len(self.bf)
        print('Articles stored this run: %d' % (self.cap_end - self.cap_begin))
        self.bf.tofile(open(self.file_name, 'wb'))
class Spider(Thread, Downloads):
    def __init__(self, key_queue, save_queue):
        super(Spider, self).__init__()
        self.key_queue = key_queue    # crawl keyword queue
        self.save_queue = save_queue  # keyword save queue
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedup filter for crawled keywords

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()  # take one keyword from the queue
                # Skip keywords already in the dedup filter, otherwise record them
                if kw in self.bloom:
                    continue
                self.bloom.add(kw)
                # Download the page source
                source = self.download(kw)
                if source is None:
                    continue
                # Extract content from the page source
                self.parse_html(source)
            finally:
                self.key_queue.task_done()  # always mark the queue task as done

    def parse_html(self, source):
        # Recommended: //div[@class="hint-mid"]/a/text()
        # Related:     //div[@class="hintBox"]//table//tr/td/p/a/text()
        elt = etree.HTML(source)
        # Recommended keywords
        recommend_list = elt.xpath('//div[@class="hint-mid"]/a/text()')
        # Push every extracted keyword onto both the crawl queue and the save queue
        for recommend in recommend_list:
            self.key_queue.put(recommend)
            self.save_queue.put(recommend)
        # Related keywords
        related_list = elt.xpath('//div[@class="hintBox"]//table//tr/td/p/a/text()')
        for related in related_list:
            self.key_queue.put(related)
            self.save_queue.put(related)
class BLOOMDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(2000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
class MyBloomUtil:
    def __init__(self, bloom_name):
        self.bloom_path = '%s.blm' % bloom_name
        is_exist = os.path.exists(self.bloom_path)
        if is_exist:
            self.bf = BloomFilter.fromfile(open(self.bloom_path, 'rb'))
        else:
            self.bf = BloomFilter(20000, 0.001)

    def process_item(self, item):
        if item in self.bf:
            logger.info('[%s] is already in bloom.' % item)
            return None
        else:
            print('add one')
            self.bf.add(item)
            self.bf.tofile(open(self.bloom_path, 'wb'))
            return item
def parse(self, response):
    # fname = "/media/common/娱乐/Electronic_Design/Coding/Python/Scrapy/tutorial/tutorial/spiders/temp"
    #
    # html = response.xpath('//html').extract()[0]
    # fobj = open(fname, 'w')
    # fobj.writelines(html.encode('utf-8'))
    # fobj.close()

    # bloom = BloomFilter(100, 10)
    bloom = BloomFilter(1000, 0.001)
    animals = [
        'dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle', 'bird',
        'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear', 'chicken',
        'dolphin', 'donkey', 'crow', 'crocodile'
    ]
    # First insertion of animals into the bloom filter
    for animal in animals:
        bloom.add(animal)

    # Membership existence for already inserted animals
    # There should not be any false negatives
    for animal in animals:
        if animal in bloom:
            print('{} is in bloom filter as expected'.format(animal))
        else:
            print('Something went terribly wrong for {}'.format(animal))
            print('FALSE NEGATIVE!')

    # Membership existence for not inserted animals
    # There could be false positives
    other_animals = [
        'badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox', 'whale',
        'shark', 'fish', 'turkey', 'duck', 'dove', 'deer', 'elephant',
        'frog', 'falcon', 'goat', 'gorilla', 'hawk'
    ]
    for other_animal in other_animals:
        if other_animal in bloom:
            print('{} is not in the bloom filter, but reported as a false positive'.format(
                other_animal))
        else:
            print('{} is not in the bloom filter as expected'.format(
                other_animal))
class Spider(Thread, Downloads):
    def __init__(self, key_queue, save_queue):
        super(Spider, self).__init__()
        self.key_queue = key_queue
        self.save_queue = save_queue
        self.boom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedup filter for crawled keywords

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()
                # Skip keywords already in the dedup filter
                if kw in self.boom:
                    continue
                # Otherwise record them in the filter
                self.boom.add(kw)
                source = self.download(kw)
                if source is None:
                    continue
                self.parse_html(source)
                # Sleep 3 seconds after each keyword
                # time.sleep(3)
            finally:
                self.key_queue.task_done()

    def parse_html(self, source):
        ele = etree.HTML(source)
        searchList = ele.xpath('//div[@class="c-row row-item row-item2"]/div/a/span/text()')
        for search in searchList:
            # print('{}'.format(search))
            # lxml nodes keep the whole tree alive; storing plain strings lets the tree be garbage-collected
            str_search = str(search)
            self.key_queue.put(str_search)
            self.save_queue.put(str_search)
        relatedList = ele.xpath('//div[@class="rw-list-new rw-list-new2"]/a/span/text()')
        for related in relatedList:
            # print('{}'.format(related))
            str_related = str(related)
            self.key_queue.put(str_related)
            self.save_queue.put(str_related)
def domain(cls, domain_url):
    """Check the domain URL against the adult-site list and the bad-word list.

    :return: True if the domain is found (so it should be rejected),
             otherwise False, meaning the domain can be added.
    """
    bf = BloomFilter(10000000)
    path = os.path.dirname(os.path.abspath(__file__))
    file = open(path + "/data/porn_sites_list.txt", "r+")
    files = file.readlines()
    for item in files:
        bf.add(item.strip())
    file.close()
    result = domain_url in bf
    if result:
        return True
    # else:
    #     for word in bad_domains_words:
    #         if domain_url.__contains__(word):
    #             return True
    return False
def filter_url(self, url):
    """Deduplicate URLs so that repeated requests are not issued.

    :param url: the URL to check for duplicates
    :return: True if the URL is new, False if it was seen before
    """
    bloom_path = '{}.blm'.format(self.name)
    # Check whether a persisted filter file already exists
    is_exist = os.path.exists(bloom_path)
    if is_exist:
        bf = BloomFilter.fromfile(open(bloom_path, 'rb'))
    else:
        # Create a new in-memory filter
        bf = BloomFilter(1000000, 0.01)
    if url in bf:
        return False
    # Unseen URL: add it and persist the filter
    bf.add(url)
    bf.tofile(open(bloom_path, 'wb'))
    return True
def build(
    infile,
    outfile,
    error_rate=0.0001,
    delim=None,
    column=1,
    skip_first=False,
    unhex=False,
    comment_prefix=None,
    num_items=None,
):
    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(infile):
        print("[BUILDING] Reading in Hashset: {}".format(infile))
        print("[BUILDING] Calculating number of hashes...")
        if not num_items:
            num_items = get_number_of_items(infile, skip_first, comment_prefix)
        print("[BUILDING] There are {} hashes in the Hashset".format(num_items))
        print("[BUILDING] Creating bloomfilter")
        bf = BloomFilter(num_items, error_rate)
        print("[BUILDING] Inserting hashes into bloomfilter")
        for item in get_items(
            infile,
            delim=delim,
            column=column,
            skip_first=skip_first,
            unhex=unhex,
            comment_prefix=comment_prefix,
        ):
            try:
                bf.add(item)
            except Exception as e:
                print("[ERROR] {}".format(e), file=sys.stderr)
        print("[BUILDING] Hashset bloomfilter contains {} items.".format(len(bf)))
        with open(outfile, "wb") as fh:
            bf.tofile(fh)
        print("[BUILDING] Complete")
    else:
        print("[ERROR] No such file or directory: {}".format(infile), file=sys.stderr)
        return
class BLOOMDupeFilter(BaseDupeFilter):
    """Bloom-filter based request duplicate filter.

    Useful when crawling a very large number of URLs, since it needs far
    less memory than storing every request fingerprint exactly.
    """

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(2000000, 0.00001)

    @classmethod
    def from_settings(cls, settings):
        return cls(job_dir(settings))

    def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)

    def close(self, reason):
        self.fingerprints = None
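# --- To enable BLOOMDupeFilter in a Scrapy project, point DUPEFILTER_CLASS at it in settings.py.
# --- The module path below is an assumption; adjust it to wherever this class actually lives.
# settings.py
DUPEFILTER_CLASS = 'myproject.dupefilters.BLOOMDupeFilter'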
def crawl(url, seen=None):
    print(f'crawling: {url}')
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    with Timeout(5, False):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks.delay()
class Spider(Thread, Downloads):
    def __init__(self, key_queue, save_queue):
        super(Spider, self).__init__()
        self.key_queue = key_queue    # crawl keyword queue
        self.save_queue = save_queue  # keyword save queue
        self.bloom = BloomFilter(capacity=1e7, error_rate=0.001)  # dedup filter for crawled keywords

    def run(self):
        while True:
            try:
                kw = self.key_queue.get()  # take one keyword from the queue
                # Skip keywords already in the dedup filter, otherwise record them
                if kw in self.bloom:
                    continue
                self.bloom.add(kw)
                # Download the page source
                source = self.download(kw)
                if source is None:
                    continue
                # Extract content from the page source
                self.parse_html(source)
            finally:
                self.key_queue.task_done()  # always mark the queue task as done

    def parse_html(self, source):
        # Related keywords: //table//tr//th/a/text()
        elt = etree.HTML(source)
        related_list = elt.xpath('//table//tr//th/a/text()')
        for related in related_list:
            # Convert the lxml result to str so the parsed tree can be garbage-collected
            str_related = str(related)
            self.key_queue.put(str_related)
            self.save_queue.put(str_related)
def bloom_file_init():
    path = '../spiders/sites.blm'
    is_exist = os.path.exists(path)  # check whether the bloom file exists
    # Load it if it already exists
    if is_exist:
        bf = BloomFilter.fromfile(open(path, 'rb'))
    # Otherwise create a new filter object and save it to file at the end
    else:
        bf = BloomFilter(10000000, 0.01)
    with MongoClient(get_project_settings()['MONGODB_URL']) as client:
        sites_coll = client.site.sites
        sites_unverified_coll = client.site.sites_unverified
        for x in sites_coll.find():
            result = bf.add(x['url'])
            print(x['url'], ' ', result)
        for x in sites_unverified_coll.find({}):
            result = bf.add(x['url'])
            print(x['url'], ' ', result)
        bf.tofile(open(path, 'wb'))
# Calculate sourmash estimate of Jaccard index
E1 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
E2 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
E1.add_sequence(seq1)
E2.add_sequence(seq2)
estimate_jaccard = E1.jaccard(E2)
estimate_jaccards[it] = estimate_jaccard

# Containment version.
# Bloom filter
f = BloomFilter(capacity=i_size + n1, error_rate=p)
len_kmers_1 = 0
for val in kmers1:
    if val not in f:
        len_kmers_1 += 1
        f.add(val)
# len_kmers_1 *= (1 - p)  # adjust for the false positive rate; shouldn't be needed since we're just adding elements

int_est = 0
for val in E2._kmers:
    # if val in f:  # in python2, no distinguishing between byte and utf-8 string
    if val:  # skip empty k-mers
        if val.decode("utf-8") in f:
            int_est += 1
int_est -= p * h  # adjust for the false positive rate
containment_est = int_est / float(h)

# Calculate the containment estimate of Jaccard. len(kmers2) is exact, since in practice this is
# part of the training database and so only needs to be computed once (and the genomes are
# relatively small, so this is no big cost).
containment_est_jaccard = \
    len(kmers2) * containment_est / \
    (len(kmers2) + len_kmers_1 - len(kmers2) * containment_est)