Example #1
class FileBloom(object):
    def __init__(self):
        self.file_path = "bloom/bloom_weibo.txt"
        self.bloom_filter = ScalableBloomFilter(initial_capacity=10000,
                                                error_rate=0.001)

    def read_bloom(self):
        if os.path.exists(self.file_path):
            f = open(self.file_path, "r")
            ids = f.readlines()
            for id in ids:
                id_s = id.strip()
                self.bloom_filter.add(id_s)
            f.close()
        else:
            f = open(self.file_path, "w")
            f.close()

    def to_file(self):
        pass

    def update_bloom_file(self, m_id):
        f = open(self.file_path, "a")
        f.write(str(m_id) + "\n")
        f.close()

    def update_bloom(self, m_id):
        self.bloom_filter.add(m_id)

    def has_id(self, m_id):
        if m_id in self.bloom_filter:
            return True
        else:
            return False
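A minimal usage sketch for the FileBloom class above (not from the original project; it assumes `import os` and `from pybloom import ScalableBloomFilter` are in scope, that the bloom/ directory exists, and that the weibo id is a made-up placeholder):

fb = FileBloom()
fb.read_bloom()                     # warm the filter from bloom/bloom_weibo.txt
weibo_id = "4104012345678901"       # hypothetical id
if not fb.has_id(weibo_id):
    fb.update_bloom(weibo_id)       # remember it in memory
    fb.update_bloom_file(weibo_id)  # and append it to the backing file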
Example #2
def dedup_lines_bloom(text,
                      just_words=True,
                      zero_digits=True,
                      capacity=100000,
                      error=0.00001):
    sbf = ScalableBloomFilter(initial_capacity=capacity,
                              error_rate=error,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    for line in text:
        if not isinstance(line, str):
            raise TypeError(
                'Expected "text" to contain strings, found: {}'.format(
                    type(line)))

        key = line.strip()
        if not key:
            yield line
            continue

        key = normalize('NFKD', key)

        if just_words:
            key = ' '.join(re.findall(r'\w+', key))
        if zero_digits:
            key = re.sub(r'\d', '0', key)

        if key in sbf:
            line = ''
        else:
            sbf.add(key)

        yield line
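A hypothetical run of dedup_lines_bloom (assuming `re`, `unicodedata.normalize` and ScalableBloomFilter are imported, as the function requires). Duplicates come back as empty strings so the output stays aligned with the input, and with zero_digits=True lines that differ only in digits collapse onto the same key:

lines = ["User 42 logged in\n", "User 99 logged in\n", "User 42 logged in\n"]
print(list(dedup_lines_bloom(lines)))
# ['User 42 logged in\n', '', ''] -- the second line is dropped as well,
# because zeroing digits maps both user ids to the same key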
Example #3
  def add_to_filter(self, update=False):
    # https://github.com/bigsnarfdude/Malware-Probabilistic-Data-Structres/blob/master/Mandiant_MD5_BloomFilter.py
    def stream_lines(filename):
      file = open(filename)
      while True:
        line = file.readline()
        if not line:
          file.close()
          break
        yield line.strip()

    def load_file(filename):
      lines = stream_lines(filename)
      templist = []
      for line in lines:
        templist.append(line)

      return templist

    itemlist = load_file(self.datafile)
    self.itemcount = len(itemlist)

    if not update:
      # reinitialize filter before adding a new set of items
      self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    for item in itemlist:
      _ = self.filter.add(item)
Example #4
 def boot1(self):
     try:
         self.multiFile.seek(0)
         a = ScalableBloomFilter.fromfile(self.multiFile)
         return a
     except:
         return ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
Example #5
class DuplicateItemFilterPipeline(Pipeline):  # bloom filter serialization
    fileName = "DuplicateItemFilter.dat"

    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        pass

    def close_spider(self, spider):
        with open(self.fileName, 'wb') as f:
            self.sbf.tofile(f)
        pass

    def process_item(self, item, spider):  # bloom filter
        fp = hashlib.sha1()
        for key in item.keys():
            if key not in ['curlDate', 'reference'] \
                    and item[key] is not None:  # skip the crawl time and source url
                fp.update(item[key])
        fpValue = fp.hexdigest()
        if not self.sbf.add(fpValue):
            return item
        else:
            raise DropItem("duplicate item:\n%s" % item)
Example #6
class BloomPipeline(object):
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name

        # item crawled before
        logger.info("loading crawled items before...")

        if os.path.isfile(self.bloomfile):
            f = open(self.bloomfile, 'rb')
            self.item_crawled = ScalableBloomFilter.fromfile(f)
            f.close()
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        cnt = self.item_crawled.count
        logger.info("pipline read %d crawled items" % cnt)

    def __del__(self):
        f = open(self.bloomfile, 'w')
        self.item_crawled.tofile(f)
        f.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            #mongo_uri=crawler.settings.get('MONGODB_ADDRESS'),
            bloomfile=crawler.settings.get('BLOOM_FILE'),
            #bloomfile = "/root/dev/SocialSpider/data/weibotv/bloomfile",
            spider_name=crawler.spidercls.name)

    def process_item(self, item, spider):
        #if not item['md5']:
        #    md5 = hashlib.md5("%s%s%s"%(item['title'].encode('utf-8'),item['url'].encode('utf-8'))).hexdigest()
        #    item['md5'] = md5

        valid = True
        item_id = ''
        if self.spider_name == 'weibotv':
            item_id = item['mid']
        elif self.spider_name == 'toutiao':
            item_id = item['Url']
            #item_id = hashlib.md5("%s"%(item['Url'].encode('utf-8'))).hexdigest()
        elif self.spider_name == 'anyvspider':
            item_id = item['pid']
        else:
            pass

        if self.item_crawled.add(item_id):
            valid = False
        else:
            valid = True

        if valid:
            logger.info("item: %s wrote to bloomfile %s" %
                        (item_id.encode('utf-8'), self.bloomfile))
            return item
        else:
            logger.info("item droped %s " % item_id.encode('utf-8'))
Example #7
    def __init__(self,
                 endpoint=config.config['general']['dbpedia']['endpoint'],
                 one_hop_bloom_file=config.config['general']['dbpedia']
                 ['one_hop_bloom_file'],
                 two_hop_bloom_file=config.config['general']['dbpedia']
                 ['two_hop_bloom_file']):
        super(DBpedia, self).__init__(endpoint)
        self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
        if os.path.exists(one_hop_bloom_file):
            with open(one_hop_bloom_file) as bloom_file:
                self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
        else:
            self.one_hop_bloom = None
        self.two_hop_bloom_file = two_hop_bloom_file

        self.two_hop_bloom = dict()
        for item in [True, False]:
            file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
            if os.path.exists(file_path):
                with open(file_path) as bloom_file:
                    self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(
                        bloom_file)
            else:
                self.two_hop_bloom[item] = ScalableBloomFilter(
                    mode=ScalableBloomFilter.LARGE_SET_GROWTH)

        self.two_hop_bloom_counter = 0
Example #8
class URLFilter(object):

    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        for key_word in self.forbidden_keys:
            if key_word in url:
                log.debug('## FORBIDDEN: {}'.format(url))
                return False
        return True

    @staticmethod
    def is_english(url):
        try:
            url.decode('ascii')
        except UnicodeDecodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        else:
            return True

    def pass_check(self, url):
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
Example #9
    def __init__(self, tasks=2, loop=None):
        self.tasks = tasks
        self.loop = loop or asyncio.get_event_loop()
        self.redis_cookie = RedisCookie()
        self.redis_job = RedisJob()
        self.bloom_filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.weibo_limit = True
        self.time_current_pattern = re.compile(r'(\d*)分钟前')
        self.time_today_pattern = re.compile(r'今天\s*(\d*):(\d*)')
        self.time_year_pattern = re.compile(r'(\d*)月(\d*)日\s*(\d*):(\d*)')
        self.user_id_pattern = re.compile(r'https://weibo.cn/u/(\d*)')
        self.weibo_host = 'https://weibo.cn'
        self.follow_url = self.weibo_host + '/%s/follow'

        self.fan_url = self.weibo_host + '/%s/fans'
        self.user_info_url = self.weibo_host + '/%s/info'
        self.user_tweet_url = self.weibo_host + '/%s'
        self.user_tweet_url2 = self.weibo_host + '/%s?page=%d'
        self.user_repost_url = self.weibo_host + '/repost/%s'
        self.user_repost_url2 = self.weibo_host + '/repost/%s?page=%d'
        self.tweet_comment_url = self.weibo_host + '/comment/%s'
        self.tweet_comment_url2 = self.weibo_host + '/comment/%s?page=%d'
        self.weibo_producer = WeiboProcuder(['localhost:9092'], 'sinaweibo')
        self.search_url = 'https://weibo.cn/search/?pos=search'
        self.get_search_url = 'https://weibo.cn/search/mblog/?keyword=%s&filter=hasori'
Example #10
class FilterHandler(object):
  def __init__(self, logger):
    self.logger_ = logger
    self._load_from_file()


  def url_seen(self, url):
    if self.deduper_.add(url):
      self.logger_.info('url duplicated: %s', url)
      return True
    return False


  def _load_from_file(self):
    self.logger_.info('loading data from cache file...')
    if not os.path.isfile('data/bloom.data'):
      self.logger_.error('bloom cache file not found, creating a new one instead.')
      self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
    else:
      with open('data/bloom.data', 'rb') as f:
        self.deduper_ = ScalableBloomFilter.fromfile(f)


  def _dump_to_file(self):
    self.logger_.info('dumping data...')
    if not os.path.isdir('data'):
      os.mkdir('data')
    with open('data/bloom.data', 'wb') as f:
      self.deduper_.tofile(f)
    self.logger_.info('dump data finished.')


  def close(self):
    self._dump_to_file()
Example #11
	def vacuum_all(self, limit=None):
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()

		self.session.begin(subtransactions=True)
		
		ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
		
		#s = set()
		s = ScalableBloomFilter()
		query = self.session.query(SupplierCatalogModel.id)
		for (supplier_catalog_id, ) in query.yield_per(100):
			s.add(supplier_catalog_id)
		
		
		for plug in self.plugins.itervalues():
			supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
			model_name = plug.version_model()  + 'Model'
			VersionModel = getattr(model, model_name)
			query = self.session.query(VersionModel)
			if limit:
				query = query.order_by(VersionModel.vacuumed.nullsfirst())
				query = query.limit(limit)

			ts['sub_done'] = 0
			ts['sub_total'] = query.count()
			for supplier_catalog_item_version in query.yield_per(10):
				if supplier_catalog_item_version.supplier_catalog_id not in s:
					logger.debug("Deleting %s %s", model_name, supplier_catalog_item_version.id)
					self.session.delete(supplier_catalog_item_version)
				ts['sub_done'] += 1
			ts['done'] += 1
		self.session.commit()
		ts.finish()
		logger.debug('End vacuum_all()')
Example #12
class kmer_store:
    def __init__(self):
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=1000000,
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.kmers = {}

    def update(self, item):
        if item in self.bloom_filter:
            if item in self.kmers:
                self.kmers[item] += 1
            else:
                self.kmers[item] = 2
        else:
            self.bloom_filter.add(item)

    def __iter__(self):
        for key in self.kmers:
            yield key

    def __getitem__(self, key):
        return self.kmers[key]

    def __repr__(self):
        return str(self.kmers)

    def __str__(self):
        return str(self.kmers)
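A short sketch of how kmer_store counts: k-mers seen only once live solely in the Bloom filter, and the dictionary count jumps to 2 on the second sighting (so a false positive can start a k-mer at 2 one occurrence early):

ks = kmer_store()
for kmer in ["ACGT", "ACGT", "ACGT", "TTTT"]:
    ks.update(kmer)
print(ks)          # {'ACGT': 3} -- 'TTTT' was seen only once, so it is not stored
print(ks["ACGT"])  # 3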
Example #13
 def __init__(self, datafile, filterfile):
   # https://github.com/jaybaird/python-bloomfilter/blob/master/pybloom/pybloom.py
   self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
   self.datafile = datafile
   self.filterfile = filterfile
   self.datafilesize = None
   self.filterfilesize = None
   self.change = None
Example #14
  def __init__(self,filterfile):
    self.filterfile = filterfile
    # if filterfile is present, load the bloom filter from that file, else create a new one
    if os.path.exists(filterfile):
      self.bf = ScalableBloomFilter.fromfile(open(filterfile,"rb"))
      print "available signatures = %d"%len(self.bf)
    else:
      self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
Example #15
 def _load_from_file(self):
   self.logger_.info('loading data from cache file...')
   if not os.path.isfile('data/bloom.data'):
     self.logger_.error('bloom cache file not found, creating a new one instead.')
     self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
   else:
     with open('data/bloom.data', 'rb') as f:
       self.deduper_ = ScalableBloomFilter.fromfile(f)
Example #16
class BloomMembership(GenericMembership):
    def __init__(self, max_size: int, error_rate: float):
        self.bloom_filter = ScalableBloomFilter(max_size, error_rate)

    def add(self, key: str):
        self.bloom_filter.add(key)

    def __contains__(self, key: str) -> bool:
        return key in self.bloom_filter
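A small usage sketch for BloomMembership (GenericMembership is assumed to be an abstract base class from the surrounding project; only add and membership tests are exercised here):

members = BloomMembership(max_size=10000, error_rate=0.001)
members.add("user:alice")
print("user:alice" in members)  # True
print("user:bob" in members)    # False, barring a false positive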
Example #17
 def open_spider(self, spider):
     self.fileName = spider.name + self.fileName
     if os.path.exists(self.fileName):
         with open(self.fileName, 'rb') as f:
             self.sbf = ScalableBloomFilter.fromfile(f)
     else:
         self.sbf = ScalableBloomFilter(
             mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     pass
Example #18
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
Example #19
def second_new_warn_entity():
    minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE,
                           ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectFromBigTable(90, TABLE_REPORT_ILLEGAL, RISK_LEVEL,
                                        ILLEGAL_SCORE, 'all', 0, 0, 'all',
                                        'all', TABLE_LOGS, 'all')
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if not each['entity_id'] in resultIds:
            resultIds.append(each['entity_id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
Example #20
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
Example #21
def get_city_rank(table, table4, field, province_name, risk_level):
    cur = defaultDatabase()
    city_list = []
    list = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start_time = start_time.strftime("%Y-%m-%d")
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % (
        table, table4, table4, start_time, end_time, risk_level)
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % (
        table, table4, table4, start_time1, end_time, risk_level)
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['city'] not in b:
            b.add(p['city'])
            city_list.append({'province': p['province'], 'city': p['city']})
    for d in city_list:
        if not d['province'] in province_list:
            province_list.append(d['province'])
    if province_name:
        for d in city_list:
            if d['province'] == province_name and d['city']:
                pro_dict = {"province": d['province'], "city": d['city']}
                for dict in result1:
                    if dict['city'] == d['city']:
                        pro_dict.update({'count7': dict['count']})
                for dict in result2:
                    if dict['city'] == d['city']:
                        pro_dict.update({'count30': dict['count']})
                list.append(pro_dict)
    if not province_name:
        for p in province_list:
            if p:
                pro_dict = {"province": p}
                count = 0
                for dict in result1:
                    if dict['province'] == p:
                        count += dict['count']
                pro_dict.update({"count": count})
                list.append(pro_dict)
    return list
Example #22
    def __init__(self, base_url: str = 'https://news.sina.com.cn', n_news: int = 100, n_producers: int = 5, n_consumers: int = 1):
        self.base_url = base_url
        self.n_news = n_news
        self.n_producers = n_producers
        self.n_consumers = n_consumers

        self.urls = gevent.queue.Queue()
        self.soups = gevent.queue.Queue(maxsize=n_producers)
        self.urls.put(URL(base_url))

        self.visited_urls = Filter(initial_capacity=n_news)
        self.news_filter = Filter(initial_capacity=n_news)
Example #23
 def __init__(self, domain, threads, depth, times, headers, father):
     self.domain = domain
     if self.domain[self.domain.__len__() - 1] == '/':
         self.domain = self.domain[0:self.domain.__len__() - 1]
     self.threads = threads
     self.times = times
     self.cookies = {}
     self.headers = {}
     self.count = 0
     self.controlthread = 0
     self.depth = depth
     self.father = father
     self.realdomain = ''
     self.payload = Payload()
     self.encode = Encode()
     if headers != '':
         self.setheader(headers)
     if 'https' in self.domain:
         self.domain1 = self.domain.replace('https://', '')
         self.domain2 = 'http://' + self.domain1
         self.domain3 = 'http%3A%2F%2F' + self.domain1
         self.domain4 = 'https%3A%2F%2F' + self.domain1
     elif 'http' in self.domain:
         self.domain1 = self.domain.replace('http://', '')
         self.domain2 = 'https://' + self.domain1
         self.domain3 = 'http%3A%2F%2F' + self.domain1
         self.domain4 = 'https%3A%2F%2F' + self.domain1
     else:
         self.domain1 = 'http://' + self.domain
         self.domain2 = 'https://' + self.domain
         self.domain3 = 'http%3A%2F%2F' + self.domain
         self.domain4 = 'https%3A%2F%2F' + self.domain
     self.queue = Queue()
     self.urlqueue = Queue()
     self.lock = threading.RLock()
     self.lock2 = threading.RLock()
     self.lock3 = threading.RLock()
     self.lock4 = threading.RLock()
     self.lock5 = threading.RLock()
     self.bloomfilter = ScalableBloomFilter(
         initial_capacity=10000,
         error_rate=0.001,
         mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     self.bloomfilter2 = ScalableBloomFilter(
         initial_capacity=10000,
         error_rate=0.001,
         mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     self.blacklist = [
         '<', '{', '\'', '"', '.css', '.jpg', '.mp4', '.png', '.gif',
         '.avi', '.jpeg', '.ico', '.mp3', '.pdf', 'docx', 'doc', 'bmp',
         '.rmvb', '.zip', '.rar', '.exe', '.ppt', '.pptx', 'xls'
     ]
     self.rule = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
Example #24
class URLBloomFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode("utf8"))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
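To plug a dupefilter like this into Scrapy, only a settings entry is needed; the module path below is a placeholder for wherever the class actually lives:

# settings.py (hypothetical module path)
DUPEFILTER_CLASS = 'myproject.dupefilters.URLBloomFilter'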
Example #25
    def test_bloom_string(self):
        f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for i in xrange(0, 10000):
            rnd = ''.join(random.choice(string.letters) for i in xrange(40))
            _ = f.add(rnd)

        self.assertEqual(rnd in f, True)

        for i in string.letters:
            self.assertEqual(i in f, False)

        self.assertEqual(rnd in f, True)
Example #26
class RequestFilter(object):
    """ RequestFilter """
    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """request seen
        """
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
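A rough sketch of RequestFilter on its own, assuming `from scrapy.http import Request` and that request_fingerprint comes from scrapy.utils.request as in the other examples:

rf = RequestFilter()
req = Request('https://example.com/page?id=1')
print(rf.request_seen(req))  # False -- first time, the fingerprint is recorded
print(rf.request_seen(req))  # True  -- duplicate request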
Example #27
class URLBloomFilter(RFPDupeFilter):
    """根据urlhash_bloom过滤"""
    def __init__(self,path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
Example #28
    def __init__(self):
        super(StreamingTriangles, self).__init__()

        # Set up connection to Redis server
        self.redis_server = 'localhost'
        self.redis_db = redis.StrictRedis(host=self.redis_server,
                                          port=6379,
                                          db=0)

        # Initialize reservoir sizes
        self.edge_res_size = 40000
        self.wedge_res_size = 40000

        # Set Scalable Bloom Filter for ignoring repeated edges
        self.bloom_filter = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        # Init counters and arrays for Streaming-Triangles algorithm
        self.edge_count = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0}

        self.total_wedges = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0}

        self.edge_res = {
            RED: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)],
            BLUE: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)],
            YELLOW: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)],
            GREEN: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)]
        }

        self.wedge_res = {
            RED: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)],
            BLUE:
            [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)],
            YELLOW:
            [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)],
            GREEN:
            [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)]
        }

        self.is_closed = {
            RED: [False for _ in xrange(self.wedge_res_size)],
            BLUE: [False for _ in xrange(self.wedge_res_size)],
            YELLOW: [False for _ in xrange(self.wedge_res_size)],
            GREEN: [False for _ in xrange(self.wedge_res_size)]
        }

        # Track percent of uncategorized transactions
        self.num_missed = 0
        self.num_colored = 0
Example #29
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter
    
    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)

    return bloom
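Hypothetical usage of to_bloomfilter: build a filter from an iterable of ids and probe it for membership:

seen = to_bloomfilter(["id-1", "id-2", "id-3"], init_cap=1000, err_rate=0.0001)
print("id-2" in seen)  # True
print("id-9" in seen)  # False, up to the configured false-positive rate
print(len(seen))       # 3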
Example #30
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter
    
    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)

    return bloom
Example #31
    def test_bloom_int(self):
        f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        for i in xrange(0, 10000):
             _ = f.add(i)

        for i in xrange(0, 10000):
            self.assertEqual(i in f, True)

        for i in xrange(0, 10000 / 2 ):
            r = random.randint(0,10000-1)
            self.assertEqual(r in f, True)

        for i in xrange(0, 10000 / 2 ):
            r = random.randint(10000,10000 * 2)
            self.assertEqual(r in f, False)
Example #32
    def generate_task(
        self,
        generate_func_name,
        g_kw={},
        sleep=180,
        times=20,
    ):
        '''
        params: generate_func_name -> name of the task-generating function
        params: g_kw -> keyword arguments for generate_func
        params: sleep, times -> run generate_func once every `sleep` seconds, for `times` iterations

        Task generator: a single call can run generate_func many times, so there is no need to call this repeatedly; just set times (use times=1 to run it once)
        '''
        if self.is_filter:
            self.sbf = ScalableBloomFilter()
        else:
            self.sbf = None

        table = Table(logger=self.logger)
        generate_func = getattr(table, generate_func_name)
        e_kw = dict(
            generate_func=generate_func,
            g_kw=g_kw,
        )

        self.loop_task(execute_func=self.core_generate_task,
                       e_kw=e_kw,
                       flag=1,
                       sleep=sleep,
                       times=times)
        table.close()
Example #33
	def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier):
		"""Category Conversion"""
		if self.category_conversion_filter is None:
			self.category_conversion_filter = ScalableBloomFilter()
			query = self.session.query(
				CategoryConversionModel.supplier_id,
				CategoryConversionModel.manufacturer_id,
				CategoryConversionModel.needle
			)
			for row in query.yield_per(100):
				self.category_conversion_filter.add(row)
		
		row = (supplier_id, manufacturer_id, category_identifier)
		if row in self.category_conversion_filter:
			query = self.session.query(CategoryConversionModel)
			query = query.filter(CategoryConversionModel.supplier_id == supplier_id)
			query = query.filter(CategoryConversionModel.manufacturer_id == manufacturer_id)
			query = query.filter(CategoryConversionModel.needle == category_identifier)
			try:
				category_conversion = query.one()
				return category_conversion
			except NoResultFound:
				pass

		category_conversion = CategoryConversionModel()
		category_conversion.manufacturer_id = manufacturer_id
		category_conversion.supplier_id = supplier_id
		category_conversion.needle = category_identifier
		self.session.add(category_conversion)
		self.category_conversion_filter.add(row)
		return category_conversion
Example #34
class UrlFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):

        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode('utf-8'))
        url_sha1 = fp.hexdigest()
        if url_sha1 not in self.urls_sbf and not mysqldb.queryItem(
                request.url):
            self.urls_sbf.add(url_sha1)
        else:
            return True
Example #35
class RedisJob(object):
    redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=1)
    url_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    @classmethod
    def push_job(cls, job_type, job_info):

        if 'url' in job_info:
            if job_info['url'] not in cls.url_filter:
                cls.url_filter.add(job_info['url'])
                r = redis.Redis(connection_pool=cls.redis_pool)
                r.lpush(str(job_type), json.dumps(job_info))
                LOGGER.info("push %s job into redis: %s" %
                            (job_type, str(job_info)))
            else:
                LOGGER.warn("%s job filtered. %s" % (job_type, str(job_info)))
        else:
            r = redis.Redis(connection_pool=cls.redis_pool)
            r.lpush(str(job_type), json.dumps(job_info))
            LOGGER.info("push %s job into redis: %s" %
                        (job_type, str(job_info)))

    @classmethod
    def fetch_job(cls, job_type):
        r = redis.Redis(connection_pool=cls.redis_pool)
        job_info = r.lpop(job_type)
        if job_info:
            LOGGER.info('fetched job: %s' % job_info)
            return json.loads(job_info)
        else:
            return None
Example #36
class RequestFilter(object):

    """ RequestFilter """

    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """request seen
        """
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
Example #37
class URLBloomFilter(RFPDupeFilter):
    # filter duplicate requests using a Bloom filter over the url hash
    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        # create a sha1 hash object
        fp = hashlib.sha1()
        # hash the canonicalized url (the same page may be requested with differently formatted urls)
        fp.update(canonicalize_url(request.url))
        # sha1 digest of the url
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
Example #38
    def __init__(self, city):
        """豆瓣页面抓取,抓取正在上映列表和电影介绍页。

        :param city: 抓取影片数据的城市。
        """
        self._url = 'https://movie.douban.com/cinema/nowplaying/{}/'.format(
            city.lower())
        # request headers for the movie list page
        self._list_headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Host':
            'movie.douban.com',
            'Referer':
            'https://movie.douban.com/',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
        }
        # request headers for the movie detail page
        self._info_headers = self._list_headers.copy()
        self._info_headers.update({'Referer': self._url})
        # deduplicate with a Bloom filter
        self._bf = ScalableBloomFilter()

        cfg = ConfigParser()
        cfg.read('config.ini')
        db_host = cfg.get('database', 'host')
        db_port = cfg.getint('database', 'port')
        db_dbname = cfg.get('database', 'database')
        db_collection = cfg.get('database', 'collection')

        self._db = MongoClient(db_host, db_port)[db_dbname][db_collection]
        for movie in self._db.find({}):
            self.logger.debug('get {} in database'.format(movie['url']))
            self._bf.add(movie['url'])
Example #39
 def __init__(self, initial_capacity=1000, error_rate=0.0001):
     self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
             error_rate=error_rate,
             mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     # False positives in the Bloom filter will cause us to fail to
     # garbage-collect an object.  Salt the Bloom filter to ensure
     # that we get a different set of false positives on every run.
     self._bloom_salt = os.urandom(2)
Example #40
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name

        # item crawled before
        logger.info("loading crawled items before...")

        if os.path.isfile(self.bloomfile):
            f = open(self.bloomfile, 'rb')
            self.item_crawled = ScalableBloomFilter.fromfile(f)
            f.close()
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)

        cnt = self.item_crawled.count
        logger.info("pipline read %d crawled items" % cnt)
Example #41
 def __init__(self, withDistinct=None):
     super(DistinctElementCount, self).__init__()
     self.count = 0
     self.bloom = None
     self.set = None
     if withDistinct:
         self.bloom = ScalableBloomFilter(error_rate=0.00001)
         self.distinct = 0
         self.set = set([])
Example #42
    def add_sbf(self, query=None):
        '''
        params: query --> MySQL query statement
        Used to filter already-processed task results
        '''

        if query is None:
            return None

        sbf = ScalableBloomFilter()
        table = Table(logger=self.logger)
        result_dict = table.execute(query=query)
        data = result_dict.get('data')
        for each in data:
            id = each.get('id')
            sbf.add(int(id))
        table.close()
        return sbf
Example #43
 def __init__(self, source_image):
     self.source_image = source_image
     self.bloom_filter = ScalableBloomFilter(
         initial_capacity=source_image.tiles.count(),
         error_rate=0.0001,  # 1 in 10,000
     )
     existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
     for tile_id, existing_match_id in existing_matches:
         self.bloom_filter.add((tile_id, existing_match_id))
Example #44
 def __init__(self, spider):
     super(BFSFrontier, self).__init__(spider)
     self._spider = spider
     self.args = {'rules': [],
                  'order': 'bfs'}
     self.redis = RediSugar.getConnection()
     self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
     self.todo = spider.name + '-todo'
     self.visited = spider.name + '-visited'
     self._feedfilter()
Example #45
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Count the number of distinct elements from an iterable. This implementation uses a bloomfilter to approximate
    the number of distinct values found in this iterable.
    
    :param iterable:
    :param init_cap:
    :param err_rate:
    """

    counter = 0

    set_of_distinct_values = ScalableBloomFilter(init_cap, err_rate)

    for element in iterable:
        if element not in set_of_distinct_values:
            set_of_distinct_values.add(element)
            counter += 1

    return counter
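A quick sketch of count_distinct_approx on a small stream with repeats; on large inputs the result can undercount slightly whenever the filter reports a false positive:

values = ["a", "b", "a", "c", "b", "a"]
print(count_distinct_approx(values))        # 3
print(count_distinct_approx(range(10000)))  # close to 10000, possibly a little low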
Example #46
def main(args):
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        if not ln:
            continue
        fetchedUrl = json.loads(ln)

        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue

        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])

        # extract links and filter out some urls by url filter.
        outlinks = url_filter(extract_links(fetchedUrl))

        # analyze

        print "[postproc]%s" % fetchedUrl["url"]
Example #47
class BloomSet(object):
    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                error_rate=error_rate,
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object.  Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)

    def add(self, name):
        self._set.add(self._bloom_key(name))

    def __contains__(self, name):
        # May return false positives.
        return self._bloom_key(name) in self._set

    def _bloom_key(self, name):
        if isinstance(name, unicode):
            name = name.encode('utf-8')
        return self._bloom_salt + name
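A minimal sketch of BloomSet in use (Python 2, since _bloom_key special-cases `unicode`); the random salt only changes which false positives occur from run to run, not the answers for names that were actually added:

names = BloomSet(initial_capacity=500, error_rate=0.001)
names.add(u'object-1')
print u'object-1' in names  # True
print u'object-2' in names  # False, barring a false positive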
Example #48
	def vacuum_all(self, limit=None):
		logger.debug('Begin vacuum_all(limit=%s)', limit)
		self.plugins = self.load_plugins()
		ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
		tx = transaction.get()
		
		try:
			#s = set()
			s = ScalableBloomFilter()
			query = DBSession.query(SupplierSpecialModel.id)
			for (supplier_special_id, ) in query.yield_per(100):
				s.add(supplier_special_id)
			
			for plug in self.plugins.itervalues():
				supplier_special_filter_id = plug.supplier_special_filter_id()
				model_name = plug.version_model()  + 'Model'
				VersionModel = getattr(model, model_name)
				query = DBSession.query(VersionModel)
				if limit:
					query = query.order_by(VersionModel.vacuumed.nullsfirst())
					query = query.limit(limit)

				ts['sub_done'] = 0
				ts['sub_total'] = query.count()
				for supplier_special_item_version in query.yield_per(10):
					if supplier_special_item_version.supplier_special_id not in s:
						logger.debug("Deleting %s %s", model_name, supplier_special_item_version.id)
						DBSession.delete(supplier_special_item_version)
					ts['sub_done'] += 1
					if ts['sub_done'] % 1000 == 0:
						DBSession.flush()
				DBSession.flush()
				ts['done'] += 1
		except Exception:
			logger.exception('Caught Exception: ')
			tx.abort()
		finally:
			ts.finish()
		transaction.commit()
		logger.debug('End vacuum_all()')
Example #49
	def get_scale_conversion(self, supplier_id, scale_identifier):
		"""Scale Conversion"""
		
		if scale_identifier is None:
			return None
		if supplier_id is None:
			return None


		if self.scale_conversion_filter is None:
			self.scale_conversion_filter = ScalableBloomFilter()
			query = self.session.query(
				ScaleConversionModel.supplier_id,
				ScaleConversionModel.scale_identifier
			)
			for row in query.yield_per(100):
				self.scale_conversion_filter.add(row)
		
		row = (supplier_id, scale_identifier)
		if row in self.scale_conversion_filter:
			
			query = self.session.query(ScaleConversionModel)
			query = query.filter(ScaleConversionModel.supplier_id == supplier_id)
			query = query.filter(ScaleConversionModel.scale_identifier == scale_identifier)
			
			try:
				scale_conversion = query.one()
				return scale_conversion
			except NoResultFound:
				pass

		query = self.session.query(ScaleModel)
		query = query.filter(ScaleModel.name == scale_identifier)

		try:
			scale = query.one()
		except NoResultFound:
			scale = None

		if scale is not None:
			scale_conversion = ScaleConversionModel()
			scale_conversion.scale_id = scale.id
			return scale_conversion
		else:
			scale_conversion = ScaleConversionModel()
			scale_conversion.scale_id = None
			scale_conversion.supplier_id = supplier_id
			scale_conversion.scale_identifier = scale_identifier
			self.session.add(scale_conversion)
			self.scale_conversion_filter.add(row)
			self.session.flush()
			return scale_conversion
Example #50
	def get_price_control(self, supplier_id, manufacturer_id, retail, preorder, special):
		"""Price Control"""
		if self.price_control_filter is None:
			self.price_control_filter = ScalableBloomFilter()
			query = self.session.query(
				PriceControlModel.supplier_id,
				PriceControlModel.manufacturer_id
			)
			for row in query.yield_per(100):
				self.price_control_filter.add(row)
		
		row = (supplier_id, manufacturer_id)
		if row in self.price_control_filter:
			query = self.session.query(PriceControlModel)
			query = query.filter(PriceControlModel.supplier_id == supplier_id)
			query = query.filter(PriceControlModel.manufacturer_id == manufacturer_id)
			if preorder:
				query = query.filter(PriceControlModel.preorder == True)
				
			if special:
				query = query.filter(PriceControlModel.special == True)
			
			if (not preorder) and (not special):
				query = query.filter(PriceControlModel.normal == True)
			
			query = query.filter(PriceControlModel.retail_low <= retail)
			query = query.filter(PriceControlModel.retail_high >= retail)
			query = query.filter(PriceControlModel.enable == True)
			try:
				price_control = query.one()
				return price_control
			except NoResultFound:
				#logger.warning(
				#	"No PriceControl found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", 
				#	supplier_id, 
				#	manufacturer_id, 
				#	retail, 
				#	preorder, 
				#	special
				#)
				return None
			except MultipleResultsFound:
				logger.warning(
					"Duplicate PriceControls found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", 
					supplier_id, 
					manufacturer_id, 
					retail, 
					preorder, 
					special
				)
		return None
Example #51
class kmer_store:
	def __init__(self):
		self.bloom_filter = ScalableBloomFilter(initial_capacity=1000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)
		self.kmers = {}

	def update(self, item):
		if item in self.bloom_filter:
			if item in self.kmers:
				self.kmers[item] += 1
			else:
				self.kmers[item] = 2
		else:
			self.bloom_filter.add(item)

	def __iter__(self):
		for key in self.kmers:
			yield key
	def __getitem__(self, key):
		return self.kmers[key]
	def __repr__(self):
		return str(self.kmers)
	def __str__(self):
		return str(self.kmers)
Example #52
class StockTileExclusions(object):
    """
    Object that keeps track of which stock tiles have already been used.
    """
    def __init__(self, source_image):
        self.source_image = source_image
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=source_image.tiles.count(),
            error_rate=0.0001,  # 1 in 10,000
        )
        existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
        for tile_id, existing_match_id in existing_matches:
            self.bloom_filter.add((tile_id, existing_match_id))

    def __contains__(self, key):
        if key in self.bloom_filter:
            return True
        elif self.source_image.tiles.filter(stock_tile_match_id=key[1]).exists():
            self.add(key)
            return True
        return False

    def add(self, key):
        self.bloom_filter.add(key)
Example #53
    def __init__(self, ioloop=None, start_url=None, max_depth=5):
        super().__init__()

        self.ioloop = ioloop or tornado.ioloop.IOLoop.instance()
        self.start_url = start_url or {}
        self.fetch_queue = Queue()
        self.fetched = []
        self.fetched_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.fetch_finished = []

        for u in start_url:
            self.fetch_queue.put(u)

        self.fetching = 0
        self.max_depth = max_depth
Example #54
File: digest.py Project: F3DS/f3ds
    def load(cls, filename):
        #import pdb; pdb.set_trace()
        t = cls.transformer
        size = t.size
        with open(filename, "rb") as serialized_digest:
            readdata = serialized_digest.read(size)
            if len(readdata) != size:
                msg = 'invalid amount read from file for format %r: %r (should have been %d)'
                Logger("digest.load").log(msg % (t.format, readdata, size))
                raise ValueError
            nonce, maxcapacity, urlcount, meta = t.unpack(readdata)

            # If meta has a conversion from string repr, use it.
            if hasattr(cls, 'meta_from_string'):
                meta = cls.meta_from_string(meta)
            filterS = ScalableBloomFilter.fromfile(serialized_digest)
        digest = cls(maxcapacity, meta, filename, filterS=filterS, nonce=nonce)
        digest.urlcount = urlcount
        return digest
Example #55
    def load(cls, filename):
        """
        This overrides the base class method to unpack using the siginfo.
        """
        #import pdb; pdb.set_trace()
        t = cls.transformer
        size = t.size
        with open(filename, "rb") as serialized_digest:
            readdata = serialized_digest.read(size)
            if len(readdata) != size:
                msg = 'invalid amount read from file for format %r: %r (should have been %d)'
                Logger("scandigest.load").log(msg % (t.format, readdata, size))
                raise ValueError
            nonce, maxcapacity, urlcount, scannervv, sigversion, sigtimestamp = t.unpack(readdata)

            # Read the datetime as non-utc, since that's how we wrote it with mktime.
            siginfo = SigInfo(scannervv, sigversion,
                              datetime.datetime.fromtimestamp(sigtimestamp))
            filterS = ScalableBloomFilter.fromfile(serialized_digest)
        scandigest = cls(maxcapacity, siginfo, filename, filterS=filterS, nonce=nonce)
        scandigest.urlcount = urlcount
        return scandigest
Example #56
	def get_manufacturer_conversion(self, supplier_id, manufacturer_identifier):
		"""Manufacturer Conversion"""
		if self.manufacturer_conversion_filter is None:
			self.manufacturer_conversion_filter = ScalableBloomFilter()
			query = self.session.query(
				ManufacturerConversionModel.supplier_id,
				ManufacturerConversionModel.manufacturer_identifier
			)
			for row in query.yield_per(100):
				self.manufacturer_conversion_filter.add(row)
		
		row = (supplier_id, manufacturer_identifier)
		if row in self.manufacturer_conversion_filter:
			query = self.session.query(ManufacturerConversionModel)
			query = query.filter(ManufacturerConversionModel.supplier_id == supplier_id)
			query = query.filter(ManufacturerConversionModel.manufacturer_identifier == manufacturer_identifier)
			try:
				manufacturer_conversion = query.one()
				return manufacturer_conversion
			except NoResultFound:
				pass
			
		query = self.session.query(ManufacturerModel)
		query = query.filter(ManufacturerModel.identifier == manufacturer_identifier)
		try:
			manufacturer = query.one()
		except NoResultFound:
			logger.warning("No ManufacturerConversion found for supplier_id '%s' manufacturer_identifier '%s'", supplier_id, manufacturer_identifier)
			return None
		
		manufacturer_conversion = ManufacturerConversionModel()
		manufacturer_conversion.manufacturer_id = manufacturer.id
		manufacturer_conversion.supplier_id = supplier_id
		manufacturer_conversion.manufacturer_identifier = manufacturer_identifier
		#self.session.add(manufacturer_conversion)
		return manufacturer_conversion
Example #57
 def __init__(self):
     self.products = dict()
     self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
     self.redis_queue = None
Example #58
class ProductSpider(RedisSpider):
    name = "product"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )

    prefix = ''

    def __init__(self):
        self.products = dict()
        self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.redis_queue = None

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        ProductSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:product'.format(ProductSpider.prefix)

        self.redis_queue = self.get_queue()

        db = MongoClient().aliexpress
        for product in db['{}product'.format(ProductSpider.prefix)].find():
            self.ids.add(product['url'][product['url'].rfind('/') + 1:product['url'].rfind('.')])

        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None

            if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
                break

        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def parse(self, response):
        self.log('product url: {}'.format(response.url), logging.INFO)

        try:
            store_url = response.css('.shop-name').xpath('a/@href').extract()[0]
            self.log('crawl store url: {}'.format(store_url), logging.INFO)

            store_item = UrlItem()
            store_item['prefix'] = ProductSpider.prefix
            store_item['type'] = 'store'
            store_item['url'] = store_url
            yield store_item

            feedback_base_url = response.xpath('//div[@id="feedback"]/iframe/@thesrc').extract()[0]
            parsed = urlparse.urlparse(feedback_base_url)
            product_id = urlparse.parse_qs(parsed.query)['productId'][0]

            try:
                percent_num = response.css('.percent-num').xpath('text()').extract()[0]
                rantings_text = response.css('.rantings-num').xpath('text()').extract()[0]
                rantings_num = rantings_text[1:rantings_text.index(' ')]
                order_text = response.css('.order-num').xpath('text()').extract()[0]
                order_num = order_text[:order_text.index(' ')]
            except:
                percent_num = 0
                rantings_num = 0
                order_num = 0

            product_item = ProductItem()
            product_item['prefix'] = ProductSpider.prefix
            product_item['_id'] = product_id
            product_item['store'] = store_url
            product_item['url'] = response.url
            product_item['percent_num'] = percent_num
            product_item['rantings_num'] = rantings_num
            product_item['order_num'] = order_num
            yield product_item

            feedback_item = UrlItem()
            feedback_item['prefix'] = ProductSpider.prefix
            feedback_item['type'] = 'feedback'
            feedback_item['url'] = feedback_base_url
            yield feedback_item

            order_item = UrlItem()
            order_item['prefix'] = ProductSpider.prefix
            order_item['type'] = 'order'
            order_item[
                'url'] = 'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?productId={}&type=default'.format(
                product_id)
            yield order_item
        except:
            try:
                product_url = response.meta['redirect_urls'][0]
            except:
                product_url = response.url
                self.log('strange product url: {}'.format(product_url), logging.ERROR)
            finally:
                self.log('meet anti-spider, back product: {}'.format(product_url), logging.INFO)

                url_item = UrlItem()
                url_item['prefix'] = ProductSpider.prefix
                url_item['type'] = 'product'
                url_item['url'] = product_url
                yield url_item
Example #59
 def __init__(self):
     self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                            'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
     self.seen = ScalableBloomFilter(initial_capacity=10000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)
Example #60
File: cli.py Project: longrw/fastqp
def run(args):
    """ read FASTQ or SAM and tabulate basic metrics """
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)

    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq','.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")

    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq','.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry,
                                                                    len=mean_len,
                                                                    est=est_nlines))
    elif ext  == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry,
                                                                    len=mean_len,
                                                                    est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write("Reading from gzipped file, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        else:
            sys.stderr.write("Gzipped file detected. Reading file to determine bin size (-s).\n")
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write("Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        est_nlines = None
    if est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {'C': defaultdict(lambda: defaultdict(int)),
                      'G': defaultdict(lambda: defaultdict(int)),
                      'A': defaultdict(lambda: defaultdict(int)),
                      'T': defaultdict(lambda: defaultdict(int))}

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit+i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit+i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass


        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write("Approximately {0:n}% complete at "
                                 "read {1:,} in {2}\n".format(percent_complete,
                                                              act_nlines,
                                                              time.strftime('%H:%M:%S',
                                                                            time.gmtime(time.time()-time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]

    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    #nbasecalls = [ '\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases]) for p in sorted(cycle_nuc.keys())]
    # apply the padding eagerly; a bare map() call would be lazy (and a no-op) on Python 3
    pad = padbases(bases)
    for nuc_counts in cycle_nuc.values():
        pad(nuc_counts)

    quantile_values = [0.05,0.25,0.5,0.75,0.95]
    quantiles = []
    ## replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()): ## py3 keys are iterator, so build a tuple to avoid recursion
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]
    pos_gc = [sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([cycle_nuc[i]['C'],
                                                              cycle_nuc[i]['G'],
                                                              cycle_nuc[i]['A'],
                                                              cycle_nuc[i]['T']]) * 100 for i in positions]

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='reads', pos='None', value=act_nlines))

    for cycle, count in read_len.items():
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_len', pos=cycle,
                                                               value=count))

    for i, position in enumerate(positions):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q05', pos=position,
                                                               value=quantiles[i][0]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q25', pos=position,
                                                               value=quantiles[i][1]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q50', pos=position,
                                                               value=quantiles[i][2]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q75', pos=position,
                                                               value=quantiles[i][3]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='q95', pos=position,
                                                               value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                                   column=base, pos=position,
                                                                   value=cycle_nuc[position][base]))
    for position in positions:
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='cycle_gc', pos=position,
                                                               value=cycle_gc[position]))
    for i in range(101):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name,
                                                               column='read_gc', pos=i,
                                                               value=cycle_gc[i]))

    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name,
                                                               column=kmer, pos='None',
                                                               value=obs_exp))

    if args.count_duplicates:
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='duplicate', pos='None', value=duplicates/act_nlines))


    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize':(8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive, [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)
    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write("There were {counts:,} reads in the file. Analysis finished in {sec}.\n".format(counts=act_nlines,
                                                                                                                       sec=time.strftime('%H:%M:%S',
                                                                                                                                         time.gmtime(elapsed))
        ))
        if len(bad_kmers) > 0:
            for kmer, slope, p_value in bad_kmers:
                sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile "
                                 "(slope = %s, p = %s).\n" % (kmer, slope, p_value))
        if median_qual < args.median_qual:
            sys.stderr.write("QualityWarning: median base quality score is %s.\n" % median_qual)