def __init__(self, endpoint=config.config['general']['dbpedia']['endpoint'],
             one_hop_bloom_file=config.config['general']['dbpedia']['one_hop_bloom_file'],
             two_hop_bloom_file=config.config['general']['dbpedia']['two_hop_bloom_file']):
    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    if os.path.exists(one_hop_bloom_file):
        # Serialized bloom filters are binary blobs; open in binary mode.
        with open(one_hop_bloom_file, 'rb') as bloom_file:
            self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
    else:
        self.one_hop_bloom = None
    self.two_hop_bloom_file = two_hop_bloom_file
    self.two_hop_bloom = dict()
    for item in [True, False]:
        file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        if os.path.exists(file_path):
            with open(file_path, 'rb') as bloom_file:
                self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(bloom_file)
        else:
            self.two_hop_bloom[item] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    self.two_hop_bloom_counter = 0
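# A minimal usage sketch for the constructor above. Assumption: the one-hop
# filter stores keys in some "subject:predicate"-style string format; the real
# key layout is not shown in this snippet and the key below is illustrative only.
kb = DBpedia()
if kb.one_hop_bloom is not None:
    print("dbr:Berlin:dbo:country" in kb.one_hop_bloom)  # hypothetical key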
def boot1(self):
    try:
        self.multiFile.seek(0)
        return ScalableBloomFilter.fromfile(self.multiFile)
    except Exception:
        # Fall back to a fresh filter if the file is empty or corrupt.
        # Note: mode must be passed as a keyword; passed positionally it
        # would be interpreted as initial_capacity.
        return ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
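# A minimal companion sketch, assuming self.multiFile is a seekable binary
# file handle owned by this class (not shown in the original): persist the
# filter back to the same handle so boot1() can reload it on the next run.
def persist1(self, sbf):
    self.multiFile.seek(0)
    self.multiFile.truncate()
    sbf.tofile(self.multiFile)
    self.multiFile.flush()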
def __init__(self, filterfile):
    self.filterfile = filterfile
    # If filterfile is present, load the bloom filter from it; else create a new one.
    if os.path.exists(filterfile):
        with open(filterfile, "rb") as f:
            self.bf = ScalableBloomFilter.fromfile(f)
        print("available signatures = %d" % len(self.bf))
    else:
        self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
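# Hypothetical counterpart (not in the original): write the filter back to
# self.filterfile so the signature set survives restarts.
def save(self):
    with open(self.filterfile, "wb") as f:
        self.bf.tofile(f)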
def _load_from_file(self):
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
        self.logger_.error('bloom cache file not found, creating a new one instead.')
        self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
        # The cache file is binary, so it must be opened with 'rb', not 'r'.
        with open('data/bloom.data', 'rb') as f:
            self.deduper_ = ScalableBloomFilter.fromfile(f)
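# Hypothetical counterpart (not in the original): flush the dedupe filter
# back to the cache file that _load_from_file reads.
def _save_to_file(self):
    self.logger_.info('saving data to cache file...')
    with open('data/bloom.data', 'wb') as f:
        self.deduper_.tofile(f)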
def open_spider(self, spider):
    self.fileName = spider.name + self.fileName
    if os.path.exists(self.fileName):
        with open(self.fileName, 'rb') as f:
            self.sbf = ScalableBloomFilter.fromfile(f)
    else:
        self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
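# Hedged sketch of the matching Scrapy pipeline hook (close_spider is a real
# Scrapy hook; the assumption is that the original class persists the filter
# on shutdown the same way open_spider loads it).
def close_spider(self, spider):
    with open(self.fileName, 'wb') as f:
        self.sbf.tofile(f)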
def ParseQueue():
    # Load checked-URLs file.
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
        print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000, error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")
    # Get each item from the deque.
    i = 1
    # URL_QUEUE.put_nowait(None)  # sign the end of Queue
    # for item in iter(URL_QUEUE.get_nowait, None):
    #     cur_url = item[2]
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]
        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                # URL_DEQUE.appendleft(cur_url)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                        + "Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1] + item[0][0:128] + ".txt",
                          'w', encoding='utf-8') as resf:
                    resf.write(buffer)
                print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1
    with open(path_checked_url_file, 'wb') as wf:
        checked_url_pool.tofile(wf)
def load_existing_users():
    obj = s3.get_object(
        Bucket=existing_user_bucket,
        Key=existing_user_key,
    )
    # The serialized filter is binary, so buffer it in io.BytesIO
    # (requires `import io`), not StringIO.
    f = io.BytesIO(obj['Body'].read())
    f.seek(0)
    bloom = ScalableBloomFilter.fromfile(f)
    start_sqn = obj['Metadata'].get('start_sequence_number')
    return bloom, int(start_sqn) if start_sqn else None
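# Hedged counterpart sketch (assumptions: s3 is a boto3-style client and the
# same bucket/key names as above): serialize the filter and upload it with the
# sequence number stored as object metadata, mirroring what
# load_existing_users reads back.
def save_existing_users(bloom, start_sqn):
    buf = io.BytesIO()
    bloom.tofile(buf)
    s3.put_object(
        Bucket=existing_user_bucket,
        Key=existing_user_key,
        Body=buf.getvalue(),
        Metadata={'start_sequence_number': str(start_sqn)},
    )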
def __init__(self, bloomfile, spider_name):
    self.bloomfile = bloomfile
    self.spider_name = spider_name
    # Items crawled before.
    logger.info("loading crawled items before...")
    if os.path.isfile(self.bloomfile):
        # The serialized filter is binary; open with 'rb'.
        with open(self.bloomfile, 'rb') as f:
            self.item_crawled = ScalableBloomFilter.fromfile(f)
    else:
        self.item_crawled = ScalableBloomFilter(
            100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    cnt = self.item_crawled.count
    logger.info("pipeline read %d crawled items" % cnt)
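# A minimal sketch of how this pipeline might dedupe items (assumptions: items
# carry a unique 'url' field, and DropItem is imported from scrapy.exceptions;
# the real dedupe key in the original may differ).
def process_item(self, item, spider):
    if item['url'] in self.item_crawled:
        raise DropItem("already crawled: %s" % item['url'])
    self.item_crawled.add(item['url'])
    return item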
@classmethod
def load(cls, filename):
    t = cls.transformer
    size = t.size
    with open(filename, "rb") as serialized_digest:
        readdata = serialized_digest.read(size)
        if len(readdata) != size:
            msg = 'invalid amount read from file for format %r: %r (should have been %d)'
            Logger("digest.load").log(msg % (t.format, readdata, size))
            raise ValueError(msg % (t.format, readdata, size))
        nonce, maxcapacity, urlcount, meta = t.unpack(readdata)
        # If meta has a conversion from its string repr, use it.
        # (The original referenced `self`, which does not exist in a
        # classmethod; `cls` converting the unpacked meta is the likely intent.)
        if hasattr(cls, 'meta_from_string'):
            meta = cls.meta_from_string(meta)
        filterS = ScalableBloomFilter.fromfile(serialized_digest)
    digest = cls(maxcapacity, meta, filename, filterS=filterS, nonce=nonce)
    digest.urlcount = urlcount
    return digest
def load(cls, filename): """ This overrides the base class method to unpack using the siginfo. """ #import pdb; pdb.set_trace() t = cls.transformer size = t.size with open(filename, "rb") as serialized_digest: readdata = serialized_digest.read(size) if len(readdata) != size: msg = 'invalid amount read from file for format %r: %r (should have been %d)' Logger("scandigest.load").log(msg % (t.format, readdata, size)) raise ValueError nonce, maxcapacity, urlcount, scannervv, sigversion, sigtimestamp = t.unpack(readdata) # Read the datetime as non-utc, since that's how we wrote it with mktime. siginfo = SigInfo(scannervv, sigversion, datetime.datetime.fromtimestamp(sigtimestamp)) filterS = ScalableBloomFilter.fromfile(serialized_digest) scandigest = cls(maxcapacity, siginfo, filename, filterS=filterS, nonce=nonce) scandigest.urlcount = urlcount return scandigest
fieldnames = [
    # ... (the start of this list is truncated in the source)
    'notes', 'vaccine status', 'nonvaccine', 'last updated date',
]
with open(CVX_PATH, encoding='utf-16') as handle:
    reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)
    for row in reader:
        bf.add(CVX + '|' + row['cvx code'].strip())

try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
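# Hedged continuation (assumption, not shown in the fragment: after the
# import_* calls the script persists the populated filter back to BF_PATH,
# the same path it tried to load from above).
with open(BF_PATH, 'wb') as handle:
    bf.tofile(handle)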
class BloomMiddleware(object):
    # This setup runs once, at class-definition time.
    logger.info('Creating Bloomfilter')
    # If there is no bloom file, create one by default.
    try:
        bloom_path = os.path.abspath(settings['BLOOM_FILE'])
        bloom_file = open(bloom_path, 'rb')
        # A file that was not produced by tofile() will fail to parse here.
        bloomfilter = ScalableBloomFilter.fromfile(bloom_file)
    except Exception:
        logger.warning('No Bloom File')
        bloom_file = open('bloom', 'wb')
        bloomfilter = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        bloom_path = os.path.abspath('./bloom')
    if settings['URL_FILE']:
        # Seed the filter from a plain-text URL list.
        try:
            url_file = open(settings['URL_FILE'], 'r')
        except OSError:
            raise Exception('URL FILE ERROR')
        bloomfilter = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        for x in url_file.read().split('\n'):
            bloomfilter.add(x)
        url_file.close()
    bloom_file.close()
    logger.info('Create Bloomfilter Complete')

    def __init__(self):
        # Time of the last write to disk.
        self.last_write_time = time.time()
        self.count = 0
        # Interval between disk writes, in seconds.
        if not isinstance(settings['BLOOM_WRITE_TIME'], int):
            self.write_time = 300
        else:
            self.write_time = settings['BLOOM_WRITE_TIME']
        # Also flush the bloomfilter to disk when the crawler exits.
        atexit.register(self.write_to_disk)

    def process_request(self, request, spider):
        # Deduplicate only requests flagged with the 'Bloom' meta key.
        if request.meta.get('Bloom'):
            tid = request.meta['item']['goods_id']
            # If the id is already in the bloomfilter, drop this request;
            # otherwise record it.
            if tid in self.bloomfilter:
                self.count += 1
                logger.info('IGNORE Request [goods_id:%s] ' % tid)
                logger.info(self.count)
                raise IgnoreRequest
            else:
                logger.debug('[id:%s] not in bloom file' % tid)
                self.bloomfilter.add(tid)
                return None
        # Periodically flush the bloomfilter to disk.
        if time.time() - self.last_write_time > self.write_time:
            self.last_write_time = time.time()
            self.write_to_disk()
        return None

    def write_to_disk(self):
        logger.info('WRITE TO DISK')
        with open(self.bloom_path, 'wb') as save_file:
            self.bloomfilter.tofile(save_file)
        logger.info('WRITE COMPLETE')
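# Hedged usage sketch for enabling the middleware in a Scrapy project.
# The setting names BLOOM_FILE / BLOOM_WRITE_TIME / URL_FILE come from the
# class above; the module path 'myproject.middlewares' and the priority 543
# are hypothetical placeholders.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.BloomMiddleware': 543,
}
BLOOM_FILE = 'bloom'
BLOOM_WRITE_TIME = 300
URL_FILE = None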
def load_from_file(self):
    # Rebinding self.filter releases the old filter; the explicit del
    # beforehand was unnecessary.
    with open(self.filterfile, 'rb') as f:
        self.filter = ScalableBloomFilter.fromfile(f)
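# Hypothetical counterpart (not in the original): persist the in-memory
# filter back to the same file load_from_file reads.
def save_to_file(self):
    with open(self.filterfile, 'wb') as f:
        self.filter.tofile(f)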
RECOGNIZED = [LOINC, SNOMED, RXNORM, ICD9, ICD10, CPT, CVX, UNITS_OF_MEASURE]

# Enumerating all the FHIR systems here would be a waste of time,
# so load them from the constructed json file.
VALUE_SETS = []
with open('./data/fhir/systems.json') as fhir_handle:
    RECOGNIZED += json.load(fhir_handle)
with open('./data/fhir/daf.json') as daf_handle:
    VALUE_SETS += json.load(daf_handle)
with open('./data/fhir/argo.json') as argo_handle:
    VALUE_SETS += json.load(argo_handle)

# Instantiate the bloom filter.
try:
    with open('./data/codes.bf', 'rb') as handle:
        BLOOM = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # Generated filter not found, just instantiate an empty one.
    BLOOM = ScalableBloomFilter()


def validate_coding(coding):
    """ If the coding system is recognized, check the code. """
    if coding.get('system') not in RECOGNIZED:
        raise SystemNotRecognized(coding.get('system'))
    if not coding.get('code'):
        return False
    key = coding['system'] + '|' + coding['code']
    # Membership test against the pre-built filter of 'system|code' keys.
    return key in BLOOM
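# Hypothetical usage of validate_coding. The LOINC system URL is real, but
# the code value is illustrative and assumes LOINC codes were imported into
# the filter with the same 'system|code' key scheme.
coding = {'system': 'http://loinc.org', 'code': '8867-4'}
print(validate_coding(coding))  # True only if the key is in the filter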
#!/usr/bin/python
import json
import sys

from flask import request
from flask import Flask
from gevent.pywsgi import WSGIServer
from pybloom import ScalableBloomFilter

print("loading blooms")
try:
    # The serialized filters are binary; open with 'rb'.
    with open('./blooms/wikidatabloom1hoppredicate.pickle', 'rb') as f:
        bloom1hoppred = ScalableBloomFilter.fromfile(f)
    with open('./blooms/wikidatabloom1.5hopqualifiers.pickle', 'rb') as f:
        bloomqualifier = ScalableBloomFilter.fromfile(f)  # ihoppred_qualifier
    with open('./blooms/wikidatabloom1hopentity.pickle', 'rb') as f:
        bloom1hopentity = ScalableBloomFilter.fromfile(f)
    with open('./blooms/bloom1hoptypeofentity.pickle', 'rb') as f:
        bloom1hoptypeofentity = ScalableBloomFilter.fromfile(f)
except Exception as e:
    print(e)
    sys.exit(1)
print("Blooms loaded")

app = Flask(__name__)


@app.route('/bloomconnections', methods=['POST'])
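# Hedged sketch of a handler for the route above, since the snippet ends at
# the decorator. Assumptions: the POST body is JSON with an 'items' list and
# keys are checked against the 1-hop predicate filter; the real payload shape
# and response format are not shown in the source.
def bloomconnections():
    payload = json.loads(request.data)
    results = [item in bloom1hoppred for item in payload.get('items', [])]
    return json.dumps(results)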