def __init__(self, bloom_name):
    """Load the Bloom filter saved as ``<bloom_name>.blm`` or create a new one.

    Args:
        bloom_name: Base name of the filter file; ``.blm`` is appended and
            the resulting path is stored in ``self.bloom_path``.
    """
    self.bloom_path = '%s.blm' % bloom_name
    if os.path.exists(self.bloom_path):
        # Use a context manager so the handle is closed as soon as the
        # filter has been deserialized instead of lingering until GC.
        with open(self.bloom_path, 'rb') as bloom_file:
            self.bf = BloomFilter.fromfile(bloom_file)
    else:
        # No saved filter yet: start empty (capacity 20000, error rate 0.001).
        self.bf = BloomFilter(20000, 0.001)
def __init__(self,
             endpoint=config.config['general']['dbpedia']['endpoint'],
             one_hop_bloom_file=config.config['general']['dbpedia']['one_hop_bloom_file'],
             two_hop_bloom_file=config.config['general']['dbpedia']['two_hop_bloom_file']):
    """Set up the endpoint and load the one-hop and two-hop Bloom filters.

    Args:
        endpoint: Passed unchanged to the parent class constructor.
        one_hop_bloom_file: Path of a serialized ``BloomFilter``. When the
            file is missing, ``self.one_hop_bloom`` is ``None``.
        two_hop_bloom_file: Path template for the two-hop filters. For each
            of ``True`` and ``False`` the substring ``'spo2'`` is replaced by
            ``'spo2True'`` / ``'spo2False'`` to obtain the actual file path.
    """
    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"

    # One-hop filter is optional: no file on disk means no filter.
    if not os.path.exists(one_hop_bloom_file):
        self.one_hop_bloom = None
    else:
        with open(one_hop_bloom_file, 'rb') as one_hop_handle:
            self.one_hop_bloom = BloomFilter.fromfile(one_hop_handle)

    self.two_hop_bloom_file = two_hop_bloom_file
    self.two_hop_bloom = {}
    for flag in (True, False):
        flagged_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(flag))
        if not os.path.exists(flagged_path):
            # Nothing saved for this flag yet: start with an empty,
            # growable filter.
            self.two_hop_bloom[flag] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
            continue
        with open(flagged_path, 'rb') as two_hop_handle:
            self.two_hop_bloom[flag] = ScalableBloomFilter.fromfile(
                two_hop_handle)

    self.two_hop_bloom_counter = 0
def __init__(self):
    """Load the Bloom filter from ``bloomFilter.blm`` or create a new one.

    The file name is stored in ``self.filename`` and the filter in
    ``self.bf``.
    """
    self.filename = 'bloomFilter.blm'
    if os.path.exists(self.filename):
        # Close the handle once the filter is loaded; the original left the
        # file object open until garbage collection.
        with open(self.filename, 'rb') as bloom_file:
            self.bf = BloomFilter.fromfile(bloom_file)
    else:
        # No saved filter: start empty (capacity 100000000, error rate 0.001).
        self.bf = BloomFilter(100000000, 0.001)
def open_spider(self, spider):
    """Load the Bloom filter from ``self.file_name`` or create a new one.

    When the file exists, the filter is deserialized and the number of
    entries it already holds is recorded in ``self.cap_begin``. Otherwise an
    empty filter (capacity 100000000, error rate 0.001) is created.

    Args:
        spider: Unused by this method.
    """
    if os.path.exists(self.file_name):
        # Close the handle as soon as the filter is loaded instead of
        # leaving it open until garbage collection.
        with open(self.file_name, 'rb') as bloom_file:
            self.bf = BloomFilter.fromfile(bloom_file)
        # Record how many entries the filter held when it was opened.
        self.cap_begin = len(self.bf)
        print('open blm file success')
        print('初始容量:%d' % self.cap_begin)
    else:
        self.bf = BloomFilter(100000000, 0.001)
        # Define the attribute on this path too, so later reads of
        # ``cap_begin`` do not depend on whether the file existed.
        self.cap_begin = len(self.bf)
        print('Not find the blm file, creat one')
def test_train_bloom_filter__from_file(self):
    """Check the saved hot-names filter against known and made-up names.

    Ten names sampled from ``self.names`` must be reported as present. Each
    of them, prefixed with ten random ASCII letters, must be reported as
    absent.
    """
    with open('./resources/hot_names_bloom_filter', 'rb') as handle:
        bloom = BloomFilter.fromfile(handle)

    for known_name in sample(self.names, 10):
        assert known_name in bloom
        # Build a name that is not expected to be in the filter.
        random_prefix = ''.join(sample(ascii_letters, 10))
        made_up_name = f'{random_prefix}{known_name}'
        assert made_up_name not in bloom
def __init__(self, bloom_name):
    """Load the Bloom filter ``./bf/<bloom_name>.blm`` or create a new one.

    The ``./bf`` directory is created when it does not exist yet.

    Args:
        bloom_name: Base name of the filter file; the full path is stored in
            ``self.bloom_path``.
    """
    bloom_dir = './bf'
    if not os.path.exists(bloom_dir):
        os.makedirs(bloom_dir)
    self.bloom_path = '%s/%s.blm' % (bloom_dir, bloom_name)
    if os.path.exists(self.bloom_path):
        # Close the handle once the filter is loaded; the original left the
        # file object open until garbage collection.
        with open(self.bloom_path, 'rb') as bloom_file:
            self.bf = BloomFilter.fromfile(bloom_file)
    else:
        # No saved filter yet: start empty (capacity 20000, error rate 0.001).
        self.bf = BloomFilter(20000, 0.001)
def bloom_readfrom_db(self):
    """Load this task's Bloom filter from MongoDB, or create an empty one.

    Looks up the document whose ``_id`` equals ``self.taskCode`` in
    ``self.myMongo["bloom"]``. When one is found, its ``bloom_data`` payload
    is written to the scratch file ``tempFile`` and deserialized from there.
    Otherwise a new filter with capacity 1000000 and error rate 0.00001 is
    created. The result is stored in ``self.bloom``.
    """
    bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})
    if not bloom_dict:
        # Nothing stored for this task. The scratch file is no longer
        # created here; the original opened it first and never closed it
        # on this path.
        self.bloom = BloomFilter(capacity=1000000, error_rate=0.00001)
        return

    # A stored filter exists: round-trip it through the scratch file, closing
    # each handle before the next step.
    with open("tempFile", "wb") as temp_file:
        temp_file.write(bloom_dict["bloom_data"])
    with open("tempFile", "rb") as bloom_file:
        self.bloom = BloomFilter.fromfile(bloom_file)
def bloom_url(url):
    """Record *url* in the on-disk Bloom filter and report whether it is new.

    The filter is read from ``C:\\spiders\\zhilian_celery\\bloom.blm`` when
    that file exists; otherwise an empty filter (capacity 10000000, error
    rate 0.001) is used. A new URL is added and the filter is written back.

    Args:
        url: The URL to check.

    Returns:
        0 if the filter reports the URL as already present (``1`` is also
        printed in that case), 1 if the URL was added and the filter saved.
    """
    bloom_path = r'C:\spiders\zhilian_celery\bloom.blm'
    if os.path.exists(bloom_path):
        # Close the handle after loading; the original leaked it.
        with open(bloom_path, 'rb', buffering=40) as bloom_file:
            bf = BloomFilter.fromfile(bloom_file)
    else:
        bf = BloomFilter(10000000, 0.001)

    if url in bf:
        print(1)
        return 0

    bf.add(url)
    # Closing the handle guarantees the data is flushed to disk before
    # returning.
    with open(bloom_path, 'wb') as bloom_file:
        bf.tofile(bloom_file)
    return 1
def filter_url(self, url):
    """De-duplicate URLs using a Bloom filter saved as ``<self.name>.blm``.

    Many requests may be issued, so URLs that were already seen are
    filtered out.

    Args:
        url: The URL to check for duplication.

    Returns:
        False if the filter reports the URL as already present, True if the
        URL was new (it is then added and the filter is written to disk).
    """
    bloom_path = '{}.blm'.format(self.name)
    if os.path.exists(bloom_path):
        # Close the handle after loading; the original leaked it.
        with open(bloom_path, 'rb') as bloom_file:
            bf = BloomFilter.fromfile(bloom_file)
    else:
        # No file yet: create a new in-memory filter.
        bf = BloomFilter(1000000, 0.01)

    if url in bf:
        return False

    # Not present: add the URL and persist the filter, closing the handle so
    # the data is flushed before returning.
    bf.add(url)
    with open(bloom_path, 'wb') as bloom_file:
        bf.tofile(bloom_file)
    return True
def bloom_file_init():
    """Fill the sites Bloom filter from MongoDB and save it to disk.

    Loads ``../spiders/sites.blm`` when it exists, otherwise starts with an
    empty filter (capacity 10000000, error rate 0.01). Every ``url`` from the
    ``site.sites`` and ``site.sites_unverified`` collections is added, and
    each URL is printed together with the value returned by ``bf.add``. The
    filter is then written back to the same path.
    """
    path = '../spiders/sites.blm'
    if os.path.exists(path):
        # Bloom file exists: load it, closing the handle afterwards (the
        # original leaked it).
        with open(path, 'rb') as bloom_file:
            bf = BloomFilter.fromfile(bloom_file)
    else:
        # No file yet: create the filter; it is saved at the end.
        bf = BloomFilter(10000000, 0.01)

    with MongoClient(get_project_settings()['MONGODB_URL']) as client:
        # Both collections are handled identically, verified sites first.
        for collection in (client.site.sites, client.site.sites_unverified):
            for doc in collection.find():
                result = bf.add(doc['url'])
                print(doc['url'], ' ', result)

    # Closing the handle guarantees the filter is flushed to disk.
    with open(path, 'wb') as bloom_file:
        bf.tofile(bloom_file)
def Bulon():
    """Return the Bloom filter for ``DATABASE``.

    Returns:
        The filter loaded from ``布隆文件/<DATABASE>.blm`` when that file
        exists, otherwise a new empty filter with capacity 1000000 and error
        rate 0.001.
    """
    bloom_path = '布隆文件/{}.blm'.format(DATABASE)
    if not os.path.exists(bloom_path):
        return BloomFilter(1000000, 0.001)
    # Close the handle after loading; the original leaked it.
    with open(bloom_path, 'rb') as bloom_file:
        return BloomFilter.fromfile(bloom_file)
def mapper_init(self):
    """Load the hot-user-id Bloom filter into ``self.filter``.

    The filter is read from ``resources/hot_user_ids.bf`` under ``basedir``.
    """
    filter_path = os.path.join(basedir, 'resources/hot_user_ids.bf')
    with open(filter_path, 'rb') as handle:
        loaded_filter = BloomFilter.fromfile(handle)
    self.filter = loaded_filter
# Import libraries
import os

from pybloom_live import BloomFilter

# Sample data set
animals = [
    'dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle', 'bird',
    'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear', 'chicken',
    'dolphin', 'donkey', 'crow', 'crocodile', 'testadd'
]

# Check whether the filter file exists: load it if so, otherwise create a
# new filter object.
is_exist = os.path.exists('test.blm')
if is_exist:
    # Close the handle once the filter is loaded instead of leaking it.
    with open('test.blm', 'rb') as bloom_file:
        bf = BloomFilter.fromfile(bloom_file)
else:
    bf = BloomFilter(20000, 0.001)

# Skip values that are already present; add and persist the others.
for i in range(10):
    if i in bf:
        print('pass')
    else:
        print('add %s' % i)
        bf.add(i)
        # Closing the handle guarantees each save is flushed to disk.
        with open('test.blm', 'wb') as bloom_file:
            bf.tofile(bloom_file)

# Check whether values exist
# NOTE(review): this is a fragment; `items`, `bloom`, `blooms_path` and `args`
# are defined outside the visible code, and the nesting shown here is
# reconstructed from a whitespace-collapsed source. Confirm against the file.

# Drop the last element of the current record.
items = items[:-1]
# Consider the two adjacent pairs (items[0], items[1]) and (items[1], items[2]).
for i in range(2):
    # Only pairs whose members are both wrapped in '<' ... '>' are indexed.
    if not all([
            item.startswith('<') and item.endswith('>')
            for item in items[i:i + 2]
    ]):
        continue
    # Key format: the two values with their angle brackets stripped, joined
    # by ':'.
    key = ':'.join([items[i][1:-1], items[i + 1][1:-1]])
    bloom.add(key)
# Save the filter, then read it back from the same file.
with open(os.path.join(blooms_path, 'spo1.bloom'), 'wb') as f:
    bloom.tofile(f)
with open(os.path.join(blooms_path, 'spo1.bloom'), 'rb') as f:
    one_hop_bloom = BloomFilter.fromfile(f)
# Load and parse the dataset, then print every where-clause key that is
# missing from the reloaded filter.
ds = LC_Qaud_Linked(
    path=os.path.join(args.base_path, args.dataset_path))
ds.load()
ds.parse()
for row in ds.qapairs:
    for item in row.sparql.where_clause:
        # Build the key from whichever adjacent pair starts with a
        # bracketed value; use '' when neither item[0] nor item[2] does.
        if item[0].startswith('<'):
            key = ':'.join([item[0][1:-1], item[1][1:-1]])
        elif item[2].startswith('<'):
            key = ':'.join([item[1][1:-1], item[2][1:-1]])
        else:
            key = ''
        # Skip keys containing '#type' and empty keys; report the rest when
        # the filter does not contain them.
        if '#type' not in key and key != '' and key not in one_hop_bloom:
            print(key)
def _open_bloom(infile):
    """Return the Bloom filter deserialized from the file at *infile*.

    Args:
        infile: Path of a file written by ``BloomFilter.tofile``.

    Returns:
        The loaded ``BloomFilter``.
    """
    # Close the handle once the filter is loaded; the original returned
    # without ever closing it.
    with open(infile, "rb") as bloom_file:
        return BloomFilter.fromfile(bloom_file)
import os

from pybloom_live import BloomFilter

# Disabled alternative data source (kept from the original script):
# coon = pymysql.connect(host='127.0.0.1', user='******', passwd='qwer', db='haining')
# cur = coon.cursor()
# cur.execute("SELECT room_id from haining_room")
# room_urls = cur.fetchall()
ls = ["1049be49dc584707"]

os.chdir(r'E:\Myproject\Scan\chizhou\chizhou\spiders')

# Check whether the bloom file exists: load it if so, otherwise create a new
# filter object that is saved at the end.
is_exist = os.path.exists('chizhou.blm')
if is_exist:
    # Close the handle once the filter is loaded instead of leaking it.
    with open('chizhou.blm', 'rb') as bloom_file:
        bf = BloomFilter.fromfile(bloom_file)
else:
    bf = BloomFilter(1000000, 0.0000001)

# `i` counts how many entries have been added so far (1-based in messages).
i = 1
for room_url in ls:
    if room_url in bf:
        print('pass')
    else:
        # Add to the bloom filter.
        bf.add(room_url)
        print('添加了 %s 个' % i)
        i += 1

# Create and write the bloom file (single write)