Code Example #1
File: pipelines.py Project: tousyou/SocialSpider
class BloomPipeline(object):
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name

        # item crawled before
        logger.info("loading crawled items before...")

        if os.path.isfile(self.bloomfile):
            # bloom filter serialization is binary; open in 'rb' mode
            with open(self.bloomfile, 'rb') as f:
                self.item_crawled = ScalableBloomFilter.fromfile(f)
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        cnt = self.item_crawled.count
        logger.info("pipline read %d crawled items" % cnt)

    def __del__(self):
        # persist the filter in binary mode when the pipeline is destroyed
        with open(self.bloomfile, 'wb') as f:
            self.item_crawled.tofile(f)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            #mongo_uri=crawler.settings.get('MONGODB_ADDRESS'),
            bloomfile=crawler.settings.get('BLOOM_FILE'),
            #bloomfile = "/root/dev/SocialSpider/data/weibotv/bloomfile",
            spider_name=crawler.spidercls.name)

    def process_item(self, item, spider):
        #if not item['md5']:
        #    md5 = hashlib.md5("%s%s%s"%(item['title'].encode('utf-8'),item['url'].encode('utf-8'))).hexdigest()
        #    item['md5'] = md5

        valid = True
        item_id = ''
        if self.spider_name == 'weibotv':
            item_id = item['mid']
        elif self.spider_name == 'toutiao':
            item_id = item['Url']
            #item_id = hashlib.md5("%s"%(item['Url'].encode('utf-8'))).hexdigest()
        elif self.spider_name == 'anyvspider':
            item_id = item['pid']
        else:
            pass

        if self.item_crawled.add(item_id):
            valid = False
        else:
            valid = True

        if valid:
            logger.info("item: %s wrote to bloomfile %s" %
                        (item_id.encode('utf-8'), self.bloomfile))
            return item
        else:
            logger.info("item droped %s " % item_id.encode('utf-8'))
Code Example #2
class FilterHandler(object):
  def __init__(self, logger):
    self.logger_ = logger
    self._load_from_file()


  def url_seen(self, url):
    if self.deduper_.add(url):
      self.logger_.info('url duplicated: %s', url)
      return True
    return False


  def _load_from_file(self):
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
      self.logger_.error('bloom cache file not found, creating a new one instead.')
      self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
      with open('data/bloom.data', 'rb') as f:
        self.deduper_ = ScalableBloomFilter.fromfile(f)


  def _dump_to_file(self):
    self.logger_.info('dumping data...')
    if not os.path.isdir('data'):
      os.mkdir('data')
    with open('data/bloom.data', 'wb') as f:
      self.deduper_.tofile(f)
    self.logger_.info('dump data finished.')


  def close(self):
    self._dump_to_file()
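A brief usage sketch (the logger setup and URLs below are illustrative assumptions, not part of the original handler):

# usage sketch
import logging
logging.basicConfig(level=logging.INFO)
handler = FilterHandler(logging.getLogger('dedup'))
for url in ('http://example.com/a', 'http://example.com/a'):
    if not handler.url_seen(url):
        pass  # first time seen: schedule the url for crawling
handler.close()  # dumps the filter to data/bloom.data
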
Code Example #3
File: main.py Project: ttttttboy/py1
def ParseQueue():
    # Load Checked Urls File
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
            print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000,
            error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")

    # Get each Item from Queue
    i = 1
    # URL_QUEUE.put_nowait(None)  # sign the end of Queue
    # for item in iter(URL_QUEUE.get_nowait, None):
    #     cur_url = item[2]
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]

        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                # URL_DEQUE.appendleft(cur_url)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time())) +
                        "Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1] +
                          item[0][0:128] + ".txt",
                          'w',
                          encoding='utf-8') as resf:
                    resf.write(buffer)
                    print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1

        with open(path_checked_url_file, 'wb') as wf:
            checked_url_pool.tofile(wf)
Code Example #4
class DuplicateItemFilterPipeline(Pipeline):  # bloom filter serialization
    fileName = "DuplicateItemFilter.dat"

    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def close_spider(self, spider):
        with open(self.fileName, 'wb') as f:
            self.sbf.tofile(f)  # tofile returns None, so don't reassign self.sbf

    def process_item(self, item, spider):  # bloom filter de-duplication
        fp = hashlib.sha1()
        for key in item.keys():
            if key not in ['curlDate', 'reference'] \
                    and item[key] is not None:  # skip crawl time and source url
                fp.update(str(item[key]).encode('utf-8'))  # hash field values as bytes
        fpValue = fp.hexdigest()
        if not self.sbf.add(fpValue):
            return item
        else:
            raise DropItem("duplicate item:\n%s" % item)
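This excerpt relies on a few imports that fall outside the snippet; the ones below are standard library or named directly in the code (the Pipeline base class is project-specific and is omitted here):

# imports assumed by the excerpt above
import hashlib
import os
from pybloom import ScalableBloomFilter
from scrapy.exceptions import DropItem
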
Code Example #5
import redis
from pybloom import ScalableBloomFilter

d = {}
red = redis.StrictRedis(host='localhost', port=6379, db=0)
#bloom1hop  = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)#capacity=200000000, error_rate=0.0001)
bloomreiqual = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)  #capacity=200000000, error_rate=0.0001
count = 0
red.set('linesread3', 0)
with open('wikidata-raw-2018.08.01_reifiedqualifiers.ttl') as infile:
    for line in infile:
        try:
            line = line.strip()
            if line[0] == '#':
                continue
            red.incr('linesread3')
            tokens = line.split(' ')
            s = tokens[0][1:-1]
            p = tokens[1][1:-1]
            o = tokens[2][1:-1]
            # strip the fixed-length URI prefixes to get bare identifiers
            _s, _p, _o = s[37:].split('_')
            _qualrel = p[31:]
            _qualent = p[37:]
            bloomreiqual.add(_s + ':' + _qualrel + '_' + _qualent)
            bloomreiqual.add(_o + ':' + _qualrel + '_' + _qualent)
        except Exception as e:
            print(e)

f = open('bloom/bloomreifiedqualifiers.pickle', 'wb')
bloomreiqual.tofile(f)
f.close()
Code Example #6
File: manager.py Project: Pengsicong/WebSiteSpider
def manager(initUrlList, max_deep=MAX_DEEP, max_pageNum=MAX_PAGENUM,
            crawl_type=CRAWL_TYPE, proxies=PROXIES):

    redis.set('success', 0)

    # number of pages crawled
    page_num = 0

    htmlQueue = Queue()

    if isinstance(initUrlList, list):
        initUrl = initUrlList[0]
        for url in initUrlList:
            htmlQueue.put(url)
    elif isinstance(initUrlList, str):
        initUrl = initUrlList
        htmlQueue.put(initUrl)

    if max_pageNum == 0:
        max_pageNum = -1

    if max_deep == 0:
        max_deep = 9999

    try:
        with open('urlBloomfilter.bloom', 'rb') as f:
            sbf = ScalableBloomFilter.fromfile(f)
            print('bloomfilter loaded successfully!')
    except Exception:
        sbf = ScalableBloomFilter(initial_capacity=10000,
                                  error_rate=0.00001,
                                  mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    for deep in range(max_deep):
        gList = []
        while not htmlQueue.empty():
            url = htmlQueue.get()
            if url not in sbf or deep == 0:
                # htmlrun(url)
                gList.append(
                    gevent.spawn(htmlrun, url, crawl_type, 'localhost:8087'))

            max_pageNum -= 1
            page_num += 1
            if max_pageNum == 0:
                continue
        gevent.joinall(gList)

        while redis.scard('STATUS') > 0:
            url = redis.spop('STATUS').decode()
            sbf.add(url)

        if max_pageNum == 0:
            break

        while redis.scard('HTML') > 0:
            url = redis.spop('HTML')
            url = url.decode()
            htmlQueue.put(url)

        # no more urls to crawl
        if htmlQueue.empty():
            break

    # download CSS files
    while redis.scard('CSS') > 0:
        url = redis.spop('CSS').decode()
        url = parse.urljoin(initUrl, url)
        gList.append(gevent.spawn(cssrun, url))
    gevent.joinall(gList)

    # finally, save the Bloom filter file
    with open('urlBloomfilter.bloom', 'wb') as f:
        sbf.tofile(f)

    return page_num
Code Example #7
class BloomAutoYara:
  def __init__(self,filterfile):
    self.filterfile = filterfile
    # if filterfile is present, load the bloom filter from that file, else create a new one
    if os.path.exists(filterfile):
      self.bf = ScalableBloomFilter.fromfile(open(filterfile,"rb"))
      print "available signatures = %d"%len(self.bf)
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

  def save_filter(self):
    print "saving filter to file %s "%self.filterfile
    self.bf.tofile(open(self.filterfile,"wb"))

  def add_string(self,str):
    self.bf.add(str)

  def search_string(self,str):
    if str in self.bf:
      return True
    else:
      return False

  def extractlines(self,filename,min_len=4):
    chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
    shortest_run = 4
    regexp = '[%s]{%d,}' % (chars, shortest_run)
    pattern = re.compile(regexp)
    fp = open(filename,"rb")
    data = fp.read()
    lines = pattern.findall(data)
    s = set(lines)
    fp.close()
    return list(s)
   
  def build_filter(self,dirname,extensions=[]):
    print extensions
    total = 0
    for (dir, _, files) in os.walk(dirname):
      for f in files:
        ext = f.split(".")[-1]
        
        if len(extensions) != 0 and ext not in extensions:
          continue
          
        print "processing file %s"%f
        total += 1
        path = os.path.join(dir, f)
        lines = self.extractlines(path)
        for line in lines:
          self.add_string(line)
  
    print "creating bloom filter done. Total files = %d (Total entries = %d). Overwriting to bloom filter output file %s"%(total,len(self.bf),self.filterfile)
    self.save_filter()
    
  def find_file_topn(self,filename,topn=10):
    tmp = []
    lines = self.extractlines(filename)
    print "total unique strings in file %s = %d"%(filename,len(lines))
    for line in lines:
      if self.search_string(line) == False:
        tmp.append(line)
    tmp.sort(key=len)
    print "total strings which can be used for signature = %d"%len(tmp)
    tmp = tmp[-topn:]
    tmp.reverse()
    return tmp
    
  def find_dir_topn(self,dirname,topn=10):
    tmplist = []
    for (dir, _, files) in os.walk(dirname):
      for f in files:
        path = os.path.join(dir, f)
        lines = self.extractlines(path)
        for line in lines:
          if self.search_string(line) == False:
            tmplist.append(line) 
    
    counts = Counter(list(tmplist))
    return counts.most_common(topn)

  def escapechars(self,str):
    for c in "\/.^$*+-?()[]{}|":
      str = str.replace(c,"\\"+c)
    return str
    
  def list_to_rule(self,list,rulename,threshold=0.5):
    tmp = "rule " + rulename + "{\n"
    tmp += " strings:\n"
    
    for i in xrange(0,len(list)):
      esc = self.escapechars(list[i])
      tmp = tmp + "$str%d = "%i + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/"
      tmp += "\n"
    
    tmp += "condition:\n"
    tmp += str(int(len(list)*threshold))
    tmp += " of ("
    for i in xrange(0,len(list)):
      tmp += "$str"+ str(i)
      if i != (len(list) - 1):
        tmp += ","
    
    tmp += ")\n}"
    
    print "rule = %s.yara is written to disk "%rulename
    fp = open(rulename+".yara","w")
    fp.write(tmp)
    fp.close()
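A brief usage sketch for this class (the directory, file names, and rule name are illustrative assumptions):

# usage sketch (Python 2, matching the class above)
b = BloomAutoYara('whitelist.bloom')
b.build_filter('clean_samples', extensions=['exe', 'dll'])      # index strings from known-good files
candidates = b.find_file_topn('suspicious.exe', topn=10)        # strings not present in the whitelist
b.list_to_rule(candidates, 'suspicious_sample', threshold=0.5)  # writes suspicious_sample.yara
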
Code Example #8
with open("wikidata-raw-2018.08.01.ttl") as infile:
    for line in infile:
        try:
            line = line.strip()
            if line[0] == '#':
                continue
            red.incr('linesread5')
            tokens = line.split(' ')
            url1 = tokens[0][1:-1]
            if 'resource' not in url1:
                continue
            sid = url1[37:]
            url2 = tokens[1][1:-1]
            if 'entity' not in url2:
                continue
            pid = url2[31:]
            bloom1.add(sid + ':' + pid)
            url3 = tokens[2][1:-1]
            if 'resource' not in url3:
                continue
            oid = url3[37:]
            bloom2.add(sid + ':' + oid)
        except Exception as e:
            print(e)
f = open('bloom/wikidatabloom1hoppredicate.pickle', 'wb')
bloom1.tofile(f)
f.close()
f = open('bloom/wikidatabloom1hopentity.pickle', 'wb')
bloom2.tofile(f)
f.close()
Code Example #9
File: build_bf.py Project: narrasr/test-suite
        reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)

        for row in reader:
            bf.add(CVX + '|' + row['cvx code'].strip())


try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
import_cvx(bf)

if __name__ == '__main__':
    with open(BF_PATH, 'wb') as handle:
        bf.tofile(handle)
Code Example #10
import sys
from pybloom import ScalableBloomFilter
import redis

red = redis.StrictRedis(host='localhost', port=6379, db=0)
bloom1hop = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)  #capacity=200000000, error_rate=0.0001
count = 0
red.set('linesread7', 0)
with open('wikidata-instance-types-2018.08.01.ttl') as infile:
    for line in infile:
        try:
            line = line.strip()
            if line[0] == '#':
                continue
            red.incr('linesread7')
            tokens = line.split(' ')
            s = tokens[0][1:-1][37:]
            o = tokens[2][1:-1][28:]
            bloom1hop.add(s + ':' + o)
        except Exception as e:
            print(e)

f = open('bloom/bloom1hoptypeofentity.pickle', 'wb')
bloom1hop.tofile(f)
f.close()
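The pickled filter can later be read back with ScalableBloomFilter.fromfile and queried with the same 'subject:object' key format used above; a minimal sketch (the looked-up IDs are illustrative assumptions):

# read-back sketch
with open('bloom/bloom1hoptypeofentity.pickle', 'rb') as f:
    restored = ScalableBloomFilter.fromfile(f)
print('Q42:Q5' in restored)  # True if this pair was added, subject to the false-positive rate
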
Code Example #11
class BloomFilter:
  def __init__(self, datafile, filterfile):
    # https://github.com/jaybaird/python-bloomfilter/blob/master/pybloom/pybloom.py
    self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.datafile = datafile
    self.filterfile = filterfile
    self.datafilesize = None
    self.filterfilesize = None
    self.change = None

  def add_to_filter(self, update=False):
    # https://github.com/bigsnarfdude/Malware-Probabilistic-Data-Structres/blob/master/Mandiant_MD5_BloomFilter.py
    def stream_lines(filename):
      file = open(filename)
      while True:
        line = file.readline()
        if not line:
          file.close()
          break
        yield line.strip()

    def load_file(filename):
      lines = stream_lines(filename)
      templist = []
      for line in lines:
        templist.append(line)

      return templist

    itemlist = load_file(self.datafile)
    self.itemcount = len(itemlist)

    if not update:
      # reinitialize filter before adding a new set of items
      self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    for item in itemlist:
      _ = self.filter.add(item)

  def update_filter(self):
    # simulate an update by re-adding items without resetting the filter
    self.add_to_filter(update=True)

  def save_to_file(self):
    if self.filter:
      f = open(self.filterfile, 'wb')
      self.filter.tofile(f)
      f.close()

  def load_from_file(self):
    del self.filter
    f = open(self.filterfile, 'rb')
    self.filter = ScalableBloomFilter.fromfile(f)
    f.close()

  def search_filter(self, item):
    return item in self.filter

  def get_stats(self):
    if self.filter:
      self.datafilesize = file_size(self.datafile)
      self.filterfilesize = file_size(self.filterfile)
      self.change = 100 * (self.filterfilesize - self.datafilesize) / self.datafilesize

      return {
        "initial_capacity": self.filter.initial_capacity,
        "capacity": self.filter.capacity,
        "count": self.filter.count,
        "ratio": self.filter.ratio,
        "scale": self.filter.scale,
        "datafile": self.datafile,
        "filterfile": self.filterfile,
        "datafilesize": self.datafilesize,
        "filterfilesize": self.filterfilesize,
        "change": self.change
      }
    else:
      return None
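A brief usage sketch for this wrapper (the data file, filter file, and looked-up hash are illustrative assumptions; file_size is an external helper used by get_stats):

# usage sketch
bf = BloomFilter('md5_hashes.txt', 'md5_hashes.bloom')
bf.add_to_filter()    # read one item per line from the data file
bf.save_to_file()     # serialize the filter to md5_hashes.bloom
bf.load_from_file()   # reload it from disk
print(bf.search_filter('d41d8cd98f00b204e9800998ecf8427e'))
print(bf.get_stats())
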
Code Example #12
            d1[s].add(p)
        if o not in d1:
            d1[o] = set()
            d1[o].add(p)
        else:
            d1[o].add(p)

red.set('linesread7', 0)
with open("wikidata-raw-2018.08.01.ttl") as infile:
    for line in infile:
        line = line.strip()
        if line[0] == '#':
            continue
        red.incr('linesread7')
        tokens = line.split(' ')
        s = tokens[0][1:-1]
        p = tokens[1][1:-1]
        o = tokens[2][1:-1]
        if 'wikidata.dbpedia.org/resource' not in o:
            continue
        if o in d1:
            for pred in d1[o]:
                bloom.add(s + ':' + pred)
        if s in d1:
            for pred in d1[s]:
                bloom.add(o + ':' + pred)

f = open('bloom/bloom2hoppredicate.pickle', 'wb')
bloom.tofile(f)
f.close()
Code Example #13
    with open(CVX_PATH, encoding='utf-16') as handle:
        reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)

        for row in reader:
            bf.add(CVX + '|' + row['cvx code'].strip())

try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
import_cvx(bf)

if __name__ == '__main__':
    with open(BF_PATH, 'wb') as handle:
        bf.tofile(handle)