Example #1
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
        if os.path.exists(one_hop_bloom_file):
            with open(one_hop_bloom_file, 'rb') as bloom_file:
                self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.one_hop_bloom = None
        self.two_hop_bloom_file = two_hop_bloom_file

        self.two_hop_bloom = dict()
        for item in [True, False]:
            file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            if os.path.exists(file_path):
                with open(file_path, 'rb') as bloom_file:
                    self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                        bloom_file)
            else:
                self.two_hop_bloom[item] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)

        self.two_hop_bloom_counter = 0
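
The examples in this collection share one load-or-create pattern: open the serialized filter in binary mode and rebuild it with ScalableBloomFilter.fromfile(), fall back to a fresh ScalableBloomFilter when no file exists yet, and persist it again with tofile(). Below is a minimal sketch of that round trip, assuming pybloom is installed; the path and the capacity/error-rate values are illustrative, not taken from any example.

import os

from pybloom import ScalableBloomFilter

CACHE_PATH = 'cache.bloom'  # illustrative path, not from the examples above


def load_or_create(path=CACHE_PATH):
    # fromfile() expects a file written by tofile(), opened in binary mode.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return ScalableBloomFilter.fromfile(f)
    # No cache yet: start with an empty, growable filter.
    return ScalableBloomFilter(initial_capacity=1000, error_rate=0.001,
                               mode=ScalableBloomFilter.SMALL_SET_GROWTH)


def persist(sbf, path=CACHE_PATH):
    # tofile() writes the binary format that fromfile() reads back.
    with open(path, 'wb') as f:
        sbf.tofile(f)


sbf = load_or_create()
if 'http://example.com' not in sbf:
    sbf.add('http://example.com')
persist(sbf)
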
Example #2
    def boot1(self):
        try:
            self.multiFile.seek(0)
            return ScalableBloomFilter.fromfile(self.multiFile)
        except Exception:
            return ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    def __init__(self, filterfile):
        self.filterfile = filterfile
        # If filterfile is present, load the bloom filter from that file; else create a new one.
        if os.path.exists(filterfile):
            with open(filterfile, "rb") as f:
                self.bf = ScalableBloomFilter.fromfile(f)
            print("available signatures = %d" % len(self.bf))
        else:
            self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    def _load_from_file(self):
        self.logger_.info('loading data from cache file...')
        if not os.path.isfile('data/bloom.data'):
            self.logger_.error('bloom cache file not found, creating a new one instead.')
            self.deduper_ = ScalableBloomFilter(100000, 0.0001,
                                                ScalableBloomFilter.LARGE_SET_GROWTH)
        else:
            with open('data/bloom.data', 'rb') as f:
                self.deduper_ = ScalableBloomFilter.fromfile(f)
Example #5
    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
Example #6
def ParseQueue():
    # Load Checked Urls File
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
            print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000,
            error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")

    # Get each Item from Queue
    i = 1
    # URL_QUEUE.put_nowait(None)  # sign the end of Queue
    # for item in iter(URL_QUEUE.get_nowait, None):
    #     cur_url = item[2]
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]

        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                # URL_DEQUE.appendleft(cur_url)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time())) +
                        "Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1] +
                          item[0][0:128] + ".txt",
                          'w',
                          encoding='utf-8') as resf:
                    resf.write(buffer)
                    print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1

        with open(path_checked_url_file, 'wb') as wf:
            checked_url_pool.tofile(wf)
Example #7
def load_existing_users():
    obj = s3.get_object(
        Bucket=existing_user_bucket,
        Key=existing_user_key,
    )

    f = io.BytesIO(obj['Body'].read())
    f.seek(0)

    bloom = ScalableBloomFilter.fromfile(f)

    start_sqn = obj['Metadata'].get('start_sequence_number')

    return bloom, int(start_sqn) if start_sqn else None
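
Example #7 reads the serialized filter out of an S3 object body rather than a local file; fromfile() only needs a binary file-like object positioned at the start. Here is a minimal sketch of that in-memory round trip with io.BytesIO; the identifiers are illustrative, not taken from the example.

import io

from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
sbf.add('user-123')

# Serialize into an in-memory buffer; the resulting bytes could be uploaded
# (for example as the Body of an S3 put_object call) instead of written to disk.
buf = io.BytesIO()
sbf.tofile(buf)
payload = buf.getvalue()

# Rebuild the filter from the raw bytes, as Example #7 does with obj['Body'].read().
restored = ScalableBloomFilter.fromfile(io.BytesIO(payload))
assert 'user-123' in restored
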
Example #8
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name

        # item crawled before
        logger.info("loading crawled items before...")

        if os.path.isfile(self.bloomfile):
            with open(self.bloomfile, 'rb') as f:
                self.item_crawled = ScalableBloomFilter.fromfile(f)
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        cnt = self.item_crawled.count
        logger.info("pipeline read %d crawled items" % cnt)
Example #9
    def load(cls, filename):
        #import pdb; pdb.set_trace()
        t = cls.transformer
        size = t.size
        with open(filename, "rb") as serialized_digest:
            readdata = serialized_digest.read(size)
            if len(readdata) != size:
                msg = 'invalid amount read from file for format %r: %r (should have been %d)'
                Logger("digest.load").log(msg % (t.format, readdata, size))
                raise ValueError
            nonce, maxcapacity, urlcount, meta = t.unpack(readdata)

            # If the class provides a conversion from the string repr, use it.
            if hasattr(cls, 'meta_from_string'):
                meta = cls.meta_from_string(meta)
            filterS = ScalableBloomFilter.fromfile(serialized_digest)
        digest = cls(maxcapacity, meta, filename, filterS=filterS, nonce=nonce)
        digest.urlcount = urlcount
        return digest
Example #10
    def load(cls, filename):
        """
        This overrides the base class method to unpack using the siginfo.
        """
        #import pdb; pdb.set_trace()
        t = cls.transformer
        size = t.size
        with open(filename, "rb") as serialized_digest:
            readdata = serialized_digest.read(size)
            if len(readdata) != size:
                msg = 'invalid amount read from file for format %r: %r (should have been %d)'
                Logger("scandigest.load").log(msg % (t.format, readdata, size))
                raise ValueError
            nonce, maxcapacity, urlcount, scannervv, sigversion, sigtimestamp = t.unpack(readdata)

            # Read the datetime as non-utc, since that's how we wrote it with mktime.
            siginfo = SigInfo(scannervv, sigversion,
                              datetime.datetime.fromtimestamp(sigtimestamp))
            filterS = ScalableBloomFilter.fromfile(serialized_digest)
        scandigest = cls(maxcapacity, siginfo, filename, filterS=filterS, nonce=nonce)
        scandigest.urlcount = urlcount
        return scandigest
Example #11
        'notes',
        'vaccine status',
        'nonvaccine',
        'last updated date',
    ]
    with open(CVX_PATH, encoding='utf-16') as handle:
        reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)

        for row in reader:
            bf.add(CVX + '|' + row['cvx code'].strip())


try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
class BloomMiddleware(object):

    logger.info('Creating Bloomfilter')

    # If there is no bloom file yet, create one by default.
    try:
        bloom_path = os.path.abspath(settings['BLOOM_FILE'])
        bloom_file = open(bloom_path, 'rb')
        # A file that was not produced by tofile() will fail to parse here.
        bloomfilter = ScalableBloomFilter.fromfile(bloom_file)
    except Exception:
        logger.warning('No Bloom File')
        bloom_file = open('bloom', 'wb')
        bloomfilter = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        bloom_path = os.path.abspath('./bloom')
    if settings['URL_FILE']:
        # read the URLs
        try:
            url_file = open(settings['URL_FILE'], 'r')
        except OSError:
            raise Exception('URL FILE ERROR')
        bloomfilter = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        for x in url_file.read().split('\n'):
            bloomfilter.add(x)
        url_file.close()

    bloom_file.close()

    logger.info('Create Bloomfilter Complete')

    def __init__(self):
        # timestamp of the last write to disk
        self.last_write_time = time.time()
        self.count = 0
        # interval between writes to disk, in seconds
        if not isinstance(settings['BLOOM_WRITE_TIME'], int):
            self.write_time = 300
        else:
            self.write_time = settings['BLOOM_WRITE_TIME']
        # also write the bloomfilter contents to disk when the spider exits
        atexit.register(self.write_to_disk)

    def process_request(self, request, spider):
        # Deduplicate only requests that carry the 'Bloom' meta flag.
        if request.meta.get('Bloom'):
            tid = request.meta['item']['goods_id']
            # If the goods_id is already in the bloomfilter, drop this request;
            # otherwise add it to the bloomfilter.
            if tid in self.bloomfilter:
                self.count += 1
                logger.info('IGNORE Request [goods_id:%s] ' % tid)
                logger.info(self.count)
                raise IgnoreRequest
            else:
                logger.debug('[id:%s] not in bloom file' % tid)
                self.bloomfilter.add(tid)
                return None
        # Periodically write the bloomfilter to disk.
        if time.time() - self.last_write_time > self.write_time:
            self.last_write_time = time.time()
            self.write_to_disk()
        return None

    def write_to_disk(self):
        logger.info('WRITE TO DISK')
        with open(self.bloom_path, 'wb') as save_file:
            self.bloomfilter.tofile(save_file)
        logger.info('WRITE COMPLETE')
    def load_from_file(self):
        with open(self.filterfile, 'rb') as f:
            self.filter = ScalableBloomFilter.fromfile(f)
RECOGNIZED = [LOINC, SNOMED, RXNORM, ICD9, ICD10, CPT, CVX, UNITS_OF_MEASURE]

# Enumerating all the FHIR systems here would be a waste of time,
# so load them from the constructed json file.
VALUE_SETS = []
with open('./data/fhir/systems.json') as fhir_handle:
    RECOGNIZED += json.load(fhir_handle)
with open('./data/fhir/daf.json') as daf_handle:
    VALUE_SETS += json.load(daf_handle)
with open('./data/fhir/argo.json') as argo_handle:
    VALUE_SETS += json.load(argo_handle)

# Instantiate the bloom filter.
try:
    with open('./data/codes.bf', 'rb') as handle:
        BLOOM = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # Generated filter not found, just instantiate an empty one.
    BLOOM = ScalableBloomFilter()


def validate_coding(coding):
    """ If the coding system is recognized, check the code.
    """
    if coding.get('system') not in RECOGNIZED:
        raise SystemNotRecognized(coding.get('system'))

    if not coding.get('code'):
        return False

    key = coding['system'] + '|' + coding['code']
Example #15
#!/usr/bin/python
import json
from flask import request
from flask import Flask
from gevent.pywsgi import WSGIServer
from pybloom import ScalableBloomFilter
import sys

print("loading blooms")
try:
    with open('./blooms/wikidatabloom1hoppredicate.pickle', 'rb') as f:
        bloom1hoppred = ScalableBloomFilter.fromfile(f)
    with open('./blooms/wikidatabloom1.5hopqualifiers.pickle', 'rb') as f:
        bloomqualifier = ScalableBloomFilter.fromfile(f)  # ihoppred_qualifier
    with open('./blooms/wikidatabloom1hopentity.pickle', 'rb') as f:
        bloom1hopentity = ScalableBloomFilter.fromfile(f)
    with open('./blooms/bloom1hoptypeofentity.pickle', 'rb') as f:
        bloom1hoptypeofentity = ScalableBloomFilter.fromfile(f)
except Exception as e:
    print(e)
    sys.exit(1)
print("Blooms loaded")

app = Flask(__name__)


@app.route('/bloomconnections', methods=['POST'])