class FileBloom(object):
    def __init__(self):
        self.file_path = "bloom/bloom_weibo.txt"
        self.bloom_filter = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)

    def read_bloom(self):
        if os.path.exists(self.file_path):
            f = open(self.file_path, "r")
            ids = f.readlines()
            for id in ids:
                id_s = id.strip()
                self.bloom_filter.add(id_s)
            f.close()
        else:
            f = open(self.file_path, "w")
            f.close()

    def to_file(self):
        pass

    def update_bloom_file(self, m_id):
        f = open(self.file_path, "a")
        f.write(str(m_id) + "\n")
        f.close()

    def update_bloom(self, m_id):
        self.bloom_filter.add(m_id)

    def has_id(self, m_id):
        if m_id in self.bloom_filter:
            return True
        else:
            return False
def dedup_lines_bloom(text, just_words=True, zero_digits=True, capacity=100000, error=0.00001):
    sbf = ScalableBloomFilter(initial_capacity=capacity, error_rate=error,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    for line in text:
        if not isinstance(line, str):
            raise TypeError(
                'Expected "text" to contain strings, found: {}'.format(type(line)))
        key = line.strip()
        if not key:
            # Pass blank lines through untouched and skip further processing,
            # otherwise they would be yielded twice and added to the filter.
            yield line
            continue
        key = normalize('NFKD', key)
        if just_words:
            key = ' '.join(re.findall(r'\w+', key))
        if zero_digits:
            key = re.sub(r'\d', '0', key)
        if key in sbf:
            line = ''
        else:
            sbf.add(key)
        yield line
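# Usage sketch for dedup_lines_bloom above (the sample data is hypothetical;
# assumes the pybloom/pybloom_live ScalableBloomFilter plus the `re` and
# unicodedata.normalize imports the function relies on):
sample = ["Call 555-1234 now\n", "Call 555-9876 now\n", "something else\n"]
for out in dedup_lines_bloom(sample, just_words=True, zero_digits=True):
    # with zero_digits=True both "Call ..." lines map to the same key,
    # so the second one comes back as an empty string
    print(repr(out))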
def add_to_filter(self, update=False):
    # https://github.com/bigsnarfdude/Malware-Probabilistic-Data-Structres/blob/master/Mandiant_MD5_BloomFilter.py
    def stream_lines(filename):
        file = open(filename)
        while True:
            line = file.readline()
            if not line:
                file.close()
                break
            yield line.strip()

    def load_file(filename):
        lines = stream_lines(filename)
        templist = []
        for line in lines:
            templist.append(line)
        return templist

    itemlist = load_file(self.datafile)
    self.itemcount = len(itemlist)
    if not update:
        # reinitialize filter before adding a new set of items
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for item in itemlist:
        _ = self.filter.add(item)
def boot1(self):
    try:
        self.multiFile.seek(0)
        a = ScalableBloomFilter.fromfile(self.multiFile)
        return a
    except Exception:
        # Fall back to a fresh filter. Note that the growth mode must be passed
        # by keyword; passed positionally it would be taken as initial_capacity.
        return ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
class DuplicateItemFilterPipeline(Pipeline):
    # bloom filter serialization
    fileName = "DuplicateItemFilter.dat"

    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def close_spider(self, spider):
        with open(self.fileName, 'wb') as f:
            # tofile() returns None, so don't rebind self.sbf here
            self.sbf.tofile(f)

    def process_item(self, item, spider):
        fp = hashlib.sha1()
        for key in item.keys():
            if key not in ['curlDate', 'reference'] \
                    and item[key] is not None:
                # skip the crawl timestamp and the source url
                # (on Python 3, item[key] must be bytes before hashing)
                fp.update(item[key])
        fpValue = fp.hexdigest()
        if not self.sbf.add(fpValue):
            return item
        else:
            raise DropItem("duplicate item:\n %s" % item)
class BloomPipeline(object):
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name
        # items crawled before
        logger.info("loading crawled items before...")
        if os.path.isfile(self.bloomfile):
            f = open(self.bloomfile, 'rb')  # fromfile expects a binary stream
            self.item_crawled = ScalableBloomFilter.fromfile(f)
            f.close()
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        cnt = self.item_crawled.count
        logger.info("pipeline read %d crawled items" % cnt)

    def __del__(self):
        f = open(self.bloomfile, 'wb')  # tofile writes binary data
        self.item_crawled.tofile(f)
        f.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            #mongo_uri=crawler.settings.get('MONGODB_ADDRESS'),
            bloomfile=crawler.settings.get('BLOOM_FILE'),
            #bloomfile = "/root/dev/SocialSpider/data/weibotv/bloomfile",
            spider_name=crawler.spidercls.name)

    def process_item(self, item, spider):
        #if not item['md5']:
        #    md5 = hashlib.md5("%s%s%s"%(item['title'].encode('utf-8'),item['url'].encode('utf-8'))).hexdigest()
        #    item['md5'] = md5
        valid = True
        item_id = ''
        if self.spider_name == 'weibotv':
            item_id = item['mid']
        elif self.spider_name == 'toutiao':
            item_id = item['Url']
            #item_id = hashlib.md5("%s"%(item['Url'].encode('utf-8'))).hexdigest()
        elif self.spider_name == 'anyvspider':
            item_id = item['pid']
        else:
            pass
        # add() returns True when the id was already in the filter
        if self.item_crawled.add(item_id):
            valid = False
        else:
            valid = True
        if valid:
            logger.info("item: %s wrote to bloomfile %s" % (item_id.encode('utf-8'), self.bloomfile))
            return item
        else:
            logger.info("item dropped %s " % item_id.encode('utf-8'))
def __init__(self, endpoint=config.config['general']['dbpedia']['endpoint'],
             one_hop_bloom_file=config.config['general']['dbpedia']['one_hop_bloom_file'],
             two_hop_bloom_file=config.config['general']['dbpedia']['two_hop_bloom_file']):
    super(DBpedia, self).__init__(endpoint)
    self.type_uri = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    if os.path.exists(one_hop_bloom_file):
        with open(one_hop_bloom_file, 'rb') as bloom_file:  # fromfile expects a binary stream
            self.one_hop_bloom = BloomFilter.fromfile(bloom_file)
    else:
        self.one_hop_bloom = None
    self.two_hop_bloom_file = two_hop_bloom_file
    self.two_hop_bloom = dict()
    for item in [True, False]:
        file_path = two_hop_bloom_file.replace('spo2', 'spo2' + str(item))
        if os.path.exists(file_path):
            with open(file_path, 'rb') as bloom_file:
                self.two_hop_bloom[item] = ScalableBloomFilter.fromfile(bloom_file)
        else:
            self.two_hop_bloom[item] = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    self.two_hop_bloom_counter = 0
class URLFilter(object):
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram', 'tv',
                               'amazon', 'ebay', 'photo', 'image', 'game', 'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000,
                                        mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        for key_word in self.forbidden_keys:
            if key_word in url:
                log.debug('## FORBIDDEN: {}'.format(url))
                return False
        return True

    @staticmethod
    def is_english(url):
        try:
            url.decode('ascii')
        except UnicodeDecodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        else:
            return True

    def pass_check(self, url):
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
def __init__(self, tasks=2, loop=None):
    self.tasks = tasks
    self.loop = loop or asyncio.get_event_loop()
    self.redis_cookie = RedisCookie()
    self.redis_job = RedisJob()
    self.bloom_filter = ScalableBloomFilter(
        mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.weibo_limit = True
    self.time_current_pattern = re.compile(r'(\d*)分钟前')
    self.time_today_pattern = re.compile(r'今天\s*(\d*):(\d*)')
    self.time_year_pattern = re.compile(r'(\d*)月(\d*)日\s*(\d*):(\d*)')
    self.user_id_pattern = re.compile(r'https://weibo.cn/u/(\d*)')
    self.weibo_host = 'https://weibo.cn'
    self.follow_url = self.weibo_host + '/%s/follow'
    self.fan_url = self.weibo_host + '/%s/fans'
    self.user_info_url = self.weibo_host + '/%s/info'
    self.user_tweet_url = self.weibo_host + '/%s'
    self.user_tweet_url2 = self.weibo_host + '/%s?page=%d'
    self.user_repost_url = self.weibo_host + '/repost/%s'
    self.user_repost_url2 = self.weibo_host + '/repost/%s?page=%d'
    self.tweet_comment_url = self.weibo_host + '/comment/%s'
    self.tweet_comment_url2 = self.weibo_host + '/comment/%s?page=%d'
    self.weibo_producer = WeiboProcuder(['localhost:9092'], 'sinaweibo')
    self.search_url = 'https://weibo.cn/search/?pos=search'
    self.get_search_url = 'https://weibo.cn/search/mblog/?keyword=%s&filter=hasori'
class FilterHandler(object):
    def __init__(self, logger):
        self.logger_ = logger
        self._load_from_file()

    def url_seen(self, url):
        # add() returns True when the url was already in the filter
        if self.deduper_.add(url):
            self.logger_.info('url duplicated: %s', url)
            return True
        return False

    def _load_from_file(self):
        self.logger_.info('loading data from cache file...')
        if not os.path.isfile('data/bloom.data'):
            self.logger_.error('bloom cache file not found, creating a new filter instead.')
            self.deduper_ = ScalableBloomFilter(100000, 0.0001, 4)
        else:
            with open('data/bloom.data', 'rb') as f:  # fromfile reads binary data
                self.deduper_ = ScalableBloomFilter.fromfile(f)

    def _dump_to_file(self):
        self.logger_.info('dumping data...')
        if not os.path.isdir('data'):
            os.mkdir('data')
        with open('data/bloom.data', 'wb') as f:  # tofile writes binary data
            self.deduper_.tofile(f)
        self.logger_.info('dump data finished.')

    def close(self):
        self._dump_to_file()
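# Minimal usage sketch for FilterHandler above; the logging setup is an
# assumption and not part of the original snippet:
import logging
logging.basicConfig(level=logging.INFO)
handler = FilterHandler(logging.getLogger('dedup'))
for url in ('http://example.com/a', 'http://example.com/a'):
    print(url, 'seen before:', handler.url_seen(url))  # False, then True
handler.close()  # persists the filter to data/bloom.data for the next run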
def vacuum_all(self, limit=None):
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    self.session.begin(subtransactions=True)
    ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
    #s = set()
    s = ScalableBloomFilter()
    query = self.session.query(SupplierCatalogModel.id)
    for (supplier_catalog_id, ) in query.yield_per(100):
        s.add(supplier_catalog_id)
    for plug in self.plugins.itervalues():
        supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
        model_name = plug.version_model() + 'Model'
        VersionModel = getattr(model, model_name)
        query = self.session.query(VersionModel)
        if limit:
            query = query.order_by(VersionModel.vacuumed.nullsfirst())
            query = query.limit(limit)
        ts['sub_done'] = 0
        ts['sub_total'] = query.count()
        for supplier_catalog_item_version in query.yield_per(10):
            if supplier_catalog_item_version.supplier_catalog_id not in s:
                logger.debug("Deleting %s %s", model_name, supplier_catalog_item_version.id)
                self.session.delete(supplier_catalog_item_version)
            ts['sub_done'] += 1
        ts['done'] += 1
    self.session.commit()
    ts.finish()
    logger.debug('End vacuum_all()')
class kmer_store:
    def __init__(self):
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=1000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.kmers = {}

    def update(self, item):
        # The first sighting only goes into the bloom filter; a k-mer is
        # promoted into the dict (with count 2) the second time it is seen,
        # so singleton k-mers never consume a dict slot.
        if item in self.bloom_filter:
            if item in self.kmers:
                self.kmers[item] += 1
            else:
                self.kmers[item] = 2
        else:
            self.bloom_filter.add(item)

    def __iter__(self):
        for key in self.kmers:
            yield key

    def __getitem__(self, key):
        return self.kmers[key]

    def __repr__(self):
        return str(self.kmers)

    def __str__(self):
        return str(self.kmers)
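# Usage sketch for kmer_store above: only k-mers seen at least twice end up in
# the dict. The 4-mer windowing below is illustrative, not part of the class.
seq = "ACGTACGTAC"
store = kmer_store()
for i in range(len(seq) - 3):
    store.update(seq[i:i + 4])
for kmer in store:
    print(kmer, store[kmer])  # e.g. ACGT 2, CGTA 2, GTAC 2 (TACG stays a singleton)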
def __init__(self, datafile, filterfile):
    # https://github.com/jaybaird/python-bloomfilter/blob/master/pybloom/pybloom.py
    self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.datafile = datafile
    self.filterfile = filterfile
    self.datafilesize = None
    self.filterfilesize = None
    self.change = None
def __init__(self, filterfile):
    self.filterfile = filterfile
    # if filterfile is present, load the bloom filter from that file, else create a new one
    if os.path.exists(filterfile):
        self.bf = ScalableBloomFilter.fromfile(open(filterfile, "rb"))
        print "available signatures = %d" % len(self.bf)
    else:
        self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
class BloomMembership(GenericMembership):
    def __init__(self, max_size: int, error_rate: float):
        self.bloom_filter = ScalableBloomFilter(max_size, error_rate)

    def add(self, key: str):
        self.bloom_filter.add(key)

    def __contains__(self, key: str) -> bool:
        return key in self.bloom_filter
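# Usage sketch for BloomMembership above (GenericMembership is assumed to be an
# abstract base defined elsewhere in the same project):
membership = BloomMembership(max_size=100000, error_rate=0.001)
membership.add('user:42')
print('user:42' in membership)  # True
print('user:43' in membership)  # almost certainly False (false positives are possible)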
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
def second_new_warn_entity():
    minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE, ILLEGAL_TYPE, TABLE_REPORT_ILLEGAL)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        dValue = int((row_monitor_date - dateTime).total_seconds()) / 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectFromBigTable(90, TABLE_REPORT_ILLEGAL, RISK_LEVEL, ILLEGAL_SCORE, 'all', 0, 0,
                                        'all', 'all', TABLE_LOGS, 'all')
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if not each['entity_id'] in resultIds:
            resultIds.append(each['entity_id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
def get_city_rank(table, table4, field, province_name, risk_level): cur = defaultDatabase() city_list = [] list = [] province_list = [] sql = "select max(date) from %s" % table cur.execute(sql) end_time = cur.fetchall()[0][0] start_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7) start_time = start_time.strftime("%Y-%m-%d") start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30) start_time1 = start1_time.strftime("%Y-%m-%d") sql1 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % ( table, table4, table4, start_time, end_time, risk_level) cur.execute(sql1) res1 = cur.fetchall() result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1] sql2 = 'select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd inner join %s as gs on pd.entity_id=gs.entity_id where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" and illegal_type>0 and risk_level>%d group by province,city' % ( table, table4, table4, start_time1, end_time, risk_level) cur.execute(sql2) res2 = cur.fetchall() result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2] result = result1 + result2 b = ScalableBloomFilter(1000000, 0.001) for p in result: if not p['city'] in b: [b.add(p['city'])] city_list.append({'province': p['province'], 'city': p['city']}) for d in city_list: if not d['province'] in province_list: province_list.append(d['province']) if province_name: for d in city_list: if d['province'] == province_name and d['city']: pro_dict = {"province": d['province'], "city": d['city']} for dict in result1: if dict['city'] == d['city']: pro_dict.update({'count7': dict['count']}) for dict in result2: if dict['city'] == d['city']: pro_dict.update({'count30': dict['count']}) list.append(pro_dict) if not province_name: for p in province_list: if p: pro_dict = {"province": p} count = 0 for dict in result1: if dict['province'] == p: count += dict['count'] pro_dict.update({"count": count}) list.append(pro_dict) return list
def __init__(self, base_url: str = 'https://news.sina.com.cn', n_news: int = 100,
             n_producers: int = 5, n_consumers: int = 1):
    self.base_url = base_url
    self.n_news = n_news
    self.n_producers = n_producers
    self.n_consumers = n_consumers
    self.urls = gevent.queue.Queue()
    self.soups = gevent.queue.Queue(maxsize=n_producers)
    self.urls.put(URL(base_url))
    self.visited_urls = Filter(initial_capacity=n_news)
    self.news_filter = Filter(initial_capacity=n_news)
def __init__(self, domain, threads, depth, times, headers, father): self.domain = domain if self.domain[self.domain.__len__() - 1] == '/': self.domain = self.domain[0:self.domain.__len__() - 1] self.threads = threads self.times = times self.cookies = {} self.headers = {} self.count = 0 self.controlthread = 0 self.depth = depth self.father = father self.realdomain = '' self.payload = Payload() self.encode = Encode() if headers != '': self.setheader(headers) if 'https' in self.domain: self.domain1 = self.domain.replace('https://', '') self.domain2 = 'http://' + self.domain1 self.domain3 = 'http%3A%2F%2F' + self.domain1 self.domain4 = 'https%3A%2F%2F' + self.domain1 elif 'http' in self.domain: self.domain1 = self.domain.replace('http://', '') self.domain2 = 'https://' + self.domain1 self.domain3 = 'http%3A%2F%2F' + self.domain1 self.domain4 = 'https%3A%2F%2F' + self.domain1 else: self.domain1 = 'http://' + self.domain self.domain2 = 'https://' + self.domain self.domain3 = 'http%3A%2F%2F' + self.domain self.domain4 = 'https%3A%2F%2F' + self.domain self.queue = Queue() self.urlqueue = Queue() self.lock = threading.RLock() self.lock2 = threading.RLock() self.lock3 = threading.RLock() self.lock4 = threading.RLock() self.lock5 = threading.RLock() self.bloomfilter = ScalableBloomFilter( initial_capacity=10000, error_rate=0.001, mode=ScalableBloomFilter.LARGE_SET_GROWTH) self.bloomfilter2 = ScalableBloomFilter( initial_capacity=10000, error_rate=0.001, mode=ScalableBloomFilter.LARGE_SET_GROWTH) self.blacklist = [ '<', '{', '\'', '"', '.css', '.jpg', '.mp4', '.png', '.gif', '.avi', '.jpeg', '.ico', '.mp3', '.pdf', 'docx', 'doc', 'bmp', '.rmvb', '.zip', '.rar', '.exe', '.ppt', '.pptx', 'xls' ] self.rule = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
class URLBloomFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode("utf8"))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
            return False
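# A dupefilter like the URLBloomFilter above is enabled through Scrapy's
# DUPEFILTER_CLASS setting; the dotted path below is a placeholder assumption
# for wherever the class lives in the project:
# settings.py
DUPEFILTER_CLASS = 'myproject.dupefilters.URLBloomFilter'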
def test_bloom_string(self):
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in xrange(0, 10000):
        rnd = ''.join(random.choice(string.letters) for i in xrange(40))
        _ = f.add(rnd)
        self.assertEqual(rnd in f, True)
    for i in string.letters:
        self.assertEqual(i in f, False)
    self.assertEqual(rnd in f, True)
class RequestFilter(object):
    """ RequestFilter """

    def __init__(self):
        self.sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """request seen"""
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
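# The check-then-add pattern above can also be collapsed into a single call,
# because ScalableBloomFilter.add() returns True when the key was already
# present and False when it was newly inserted. A quick standalone check,
# assuming the pybloom (or pybloom_live) package:
from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
print(sbf.add('fingerprint-1'))  # False -> first time seen
print(sbf.add('fingerprint-1'))  # True  -> duplicate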
class URLBloomFilter(RFPDupeFilter):
    """Deduplicate requests with a bloom filter over the url hash."""
    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
def __init__(self): super(StreamingTriangles, self).__init__() # Set up connection to Redis server self.redis_server = 'localhost' self.redis_db = redis.StrictRedis(host=self.redis_server, port=6379, db=0) # Initialize reservoir sizes self.edge_res_size = 40000 self.wedge_res_size = 40000 # Set Scalable Bloom Filter for ignoring repeated edges self.bloom_filter = ScalableBloomFilter( mode=ScalableBloomFilter.SMALL_SET_GROWTH) # Init counters and arrays for Streaming-Triangles algorithm self.edge_count = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0} self.total_wedges = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0} self.edge_res = { RED: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)], BLUE: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)], YELLOW: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)], GREEN: [list(tuple((0, 0))) for _ in xrange(self.edge_res_size)] } self.wedge_res = { RED: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)], BLUE: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)], YELLOW: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)], GREEN: [list(tuple((0, 0, 0))) for _ in xrange(self.wedge_res_size)] } self.is_closed = { RED: [False for _ in xrange(self.wedge_res_size)], BLUE: [False for _ in xrange(self.wedge_res_size)], YELLOW: [False for _ in xrange(self.wedge_res_size)], GREEN: [False for _ in xrange(self.wedge_res_size)] } # Track percent of uncategorized transactions self.num_missed = 0 self.num_colored = 0
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Converts the iterable into a ScalableBloomFilter

    :rtype : pybloom.ScalableBloomFilter
    :param iterable:
    :param init_cap:
    :param err_rate:
    """
    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)
    return bloom
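# Usage sketch for to_bloomfilter above:
seen = to_bloomfilter(['a', 'b', 'c'])
print('a' in seen)  # True
print('z' in seen)  # False, up to the configured error rate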
def test_bloom_int(self):
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in xrange(0, 10000):
        _ = f.add(i)
    for i in xrange(0, 10000):
        self.assertEqual(i in f, True)
    for i in xrange(0, 10000 / 2):
        r = random.randint(0, 10000 - 1)
        self.assertEqual(r in f, True)
    for i in xrange(0, 10000 / 2):
        r = random.randint(10000, 10000 * 2)
        self.assertEqual(r in f, False)
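# A possible companion test, not part of the original suite, covering the
# tofile()/fromfile() round-trip used by several snippets in this file
# (assumes `os` and `tempfile` are importable):
def test_bloom_serialization_roundtrip(self):
    import os
    import tempfile
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in range(1000):
        _ = f.add(str(i))
    path = os.path.join(tempfile.mkdtemp(), 'bloom.dump')
    with open(path, 'wb') as out:
        f.tofile(out)
    with open(path, 'rb') as src:
        g = ScalableBloomFilter.fromfile(src)
    for i in range(1000):
        self.assertEqual(str(i) in g, True)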
def generate_task(
        self,
        generate_func_name,
        g_kw={},
        sleep=180,
        times=20,
):
    '''
    params: generate_func_name -> name of the task-generating function
    params: g_kw -> keyword arguments for generate_func
    params: sleep, times -> run generate_func once every `sleep` seconds, `times` times in total
    Task generator: generate_func may be executed repeatedly; if that is not needed,
    simply set times to 1.
    '''
    if self.is_filter:
        self.sbf = ScalableBloomFilter()
    else:
        self.sbf = None
    table = Table(logger=self.logger)
    generate_func = getattr(table, generate_func_name)
    e_kw = dict(
        generate_func=generate_func,
        g_kw=g_kw,
    )
    self.loop_task(execute_func=self.core_generate_task, e_kw=e_kw, flag=1,
                   sleep=sleep, times=times)
    table.close()
def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier): """Category Conversion""" if self.category_conversion_filter is None: self.category_conversion_filter = ScalableBloomFilter() query = self.session.query( CategoryConversionModel.supplier_id, CategoryConversionModel.manufacturer_id, CategoryConversionModel.needle ) for row in query.yield_per(100): self.category_conversion_filter.add(row) row = (supplier_id, manufacturer_id, category_identifier) if row in self.category_conversion_filter: query = self.session.query(CategoryConversionModel) query = query.filter(CategoryConversionModel.supplier_id == supplier_id) query = query.filter(CategoryConversionModel.manufacturer_id == manufacturer_id) query = query.filter(CategoryConversionModel.needle == category_identifier) try: category_conversion = query.one() return category_conversion except NoResultFound: pass category_conversion = CategoryConversionModel() category_conversion.manufacturer_id = manufacturer_id category_conversion.supplier_id = supplier_id category_conversion.needle = category_identifier self.session.add(category_conversion) self.category_conversion_filter.add(row) return category_conversion
class UrlFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode('utf-8'))
        url_sha1 = fp.hexdigest()
        if url_sha1 not in self.urls_sbf and not mysqldb.queryItem(request.url):
            self.urls_sbf.add(url_sha1)
        else:
            return True
class RedisJob(object):
    redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=1)
    url_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    @classmethod
    def push_job(cls, job_type, job_info):
        if 'url' in job_info:
            if job_info['url'] not in cls.url_filter:
                cls.url_filter.add(job_info['url'])
                r = redis.Redis(connection_pool=cls.redis_pool)
                r.lpush(str(job_type), json.dumps(job_info))
                LOGGER.info("push %s job into redis: %s" % (job_type, str(job_info)))
            else:
                LOGGER.warn("%s job filtered. %s" % (job_type, str(job_info)))
        else:
            r = redis.Redis(connection_pool=cls.redis_pool)
            r.lpush(str(job_type), json.dumps(job_info))
            LOGGER.info("push %s job into redis: %s" % (job_type, str(job_info)))

    @classmethod
    def fetch_job(cls, job_type):
        r = redis.Redis(connection_pool=cls.redis_pool)
        job_info = r.lpop(job_type)
        if job_info:
            LOGGER.info('fetched job: %s' % job_info)
            return json.loads(job_info)
        else:
            return None
class URLBloomFilter(RFPDupeFilter):
    # deduplicate by a bloom filter over the url hash
    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        # create a sha1 hash instance
        fp = hashlib.sha1()
        # hash the canonicalized url (the same page can be requested with differently formatted urls)
        fp.update(canonicalize_url(request.url))
        # sha1 digest of the url
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
def __init__(self, city):
    """Douban page scraper: fetches the now-playing list and the movie detail pages.

    :param city: city whose movie data should be scraped.
    """
    self._url = 'https://movie.douban.com/cinema/nowplaying/{}/'.format(city.lower())
    # request headers for the movie list page
    self._list_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep - alive',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
    }
    # request headers for the movie detail page
    self._info_headers = self._list_headers.copy()
    self._info_headers.update({'Referer': self._url})
    # bloom filter used for deduplication
    self._bf = ScalableBloomFilter()
    cfg = ConfigParser()
    cfg.read('config.ini')
    db_host = cfg.get('database', 'host')
    db_port = cfg.getint('database', 'port')
    db_dbname = cfg.get('database', 'database')
    db_collection = cfg.get('database', 'collection')
    self._db = MongoClient(db_host, db_port)[db_dbname][db_collection]
    for movie in self._db.find({}):
        self.logger.debug('get {} in database'.format(movie['url']))
        self._bf.add(movie['url'])
def __init__(self, withDistinct=None):
    super(DistinctElementCount, self).__init__()
    self.count = 0
    self.bloom = None
    self.set = None
    if withDistinct:
        self.bloom = ScalableBloomFilter(error_rate=0.00001)
        self.distinct = 0
        self.set = set([])
def add_sbf(self, query=None):
    '''
    params: query -> MySQL query string; its results seed the filter so that
    already-processed tasks can be skipped
    '''
    if query is None:
        return None
    sbf = ScalableBloomFilter()
    table = Table(logger=self.logger)
    result_dict = table.execute(query=query)
    data = result_dict.get('data')
    for each in data:
        id = each.get('id')
        sbf.add(int(id))
    table.close()
    return sbf
def __init__(self, spider):
    super(BFSFrontier, self).__init__(spider)
    self._spider = spider
    self.args = {'rules': [], 'order': 'bfs'}
    self.redis = RediSugar.getConnection()
    self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.todo = spider.name + '-todo'
    self.visited = spider.name + '-visited'
    self._feedfilter()
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Count the number of distinct elements from an iterable.

    This implementation uses a bloomfilter to approximate the number of
    distinct values found in this iterable.

    :param iterable:
    :param init_cap:
    :param err_rate:
    """
    counter = 0
    set_of_distinct_values = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        if element not in set_of_distinct_values:
            set_of_distinct_values.add(element)
            counter += 1
    return counter
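# Usage sketch for count_distinct_approx above; the count is approximate, and
# bloom-filter false positives can only under-count, never over-count:
values = ['a', 'b', 'a', 'c', 'b', 'a']
print(count_distinct_approx(values))  # 3, with very high probability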
def main(args):
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        if not ln:
            continue
        fetchedUrl = json.loads(ln)
        # continue if we've seen this url already.
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue
        # add unseen url to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])
        # extract links and filter out some urls by url filter.
        outlinks = url_filter(extract_links(fetchedUrl))
        # analyze
        print "[postproc]%s" % fetchedUrl["url"]
class BloomSet(object):
    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                                        error_rate=error_rate,
                                        mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object. Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)

    def add(self, name):
        self._set.add(self._bloom_key(name))

    def __contains__(self, name):
        # May return false positives.
        return self._bloom_key(name) in self._set

    def _bloom_key(self, name):
        if isinstance(name, unicode):
            name = name.encode('utf-8')
        return self._bloom_salt + name
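# Usage sketch for BloomSet above (Python 2, matching the `unicode` check in
# _bloom_key):
names = BloomSet()
names.add(u'object-1')
print u'object-1' in names  # True
print u'object-2' in names  # False, except for rare false positives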
def vacuum_all(self, limit=None): logger.debug('Begin vacuum_all(limit=%s)', limit) self.plugins = self.load_plugins() ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins)) tx = transaction.get() try: #s = set() s = ScalableBloomFilter() query = DBSession.query(SupplierSpecialModel.id) for (supplier_special_id, ) in query.yield_per(100): s.add(supplier_special_id) for plug in self.plugins.itervalues(): supplier_special_filter_id = plug.supplier_special_filter_id() model_name = plug.version_model() + 'Model' VersionModel = getattr(model, model_name) query = DBSession.query(VersionModel) if limit: query = query.order_by(VersionModel.vacuumed.nullsfirst()) query = query.limit(limit) ts['sub_done'] = 0 ts['sub_total'] = query.count() for supplier_special_item_version in query.yield_per(10): if supplier_special_item_version.supplier_special_id not in s: logger.debug("Deleting %s %s", model_name, supplier_special_item_version.id) DBSession.delete(supplier_special_item_version) ts['sub_done'] += 1 if ts['sub_done'] % 1000 == 0: DBSession.flush() DBSession.flush() ts['done'] += 1 except Exception: logger.exception('Caught Exception: ') tx.abort() finally: ts.finish() transaction.commit() logger.debug('End vacuum_all()')
def get_scale_conversion(self, supplier_id, scale_identifier): """Scale Conversion""" if scale_identifier is None: return None if supplier_id is None: return None if self.scale_conversion_filter is None: self.scale_conversion_filter = ScalableBloomFilter() query = self.session.query( ScaleConversionModel.supplier_id, ScaleConversionModel.scale_identifier ) for row in query.yield_per(100): self.scale_conversion_filter.add(row) row = (supplier_id, scale_identifier) if row in self.scale_conversion_filter: query = self.session.query(ScaleConversionModel) query = query.filter(ScaleConversionModel.supplier_id == supplier_id) query = query.filter(ScaleConversionModel.scale_identifier == scale_identifier) try: scale_conversion = query.one() return scale_conversion except NoResultFound: pass query = self.session.query(ScaleModel) query = query.filter(ScaleModel.name == scale_identifier) try: scale = query.one() except NoResultFound: scale = None if scale is not None: scale_conversion = ScaleConversionModel() scale_conversion.scale_id = scale.id return scale_conversion else: scale_conversion = ScaleConversionModel() scale_conversion.scale_id = None scale_conversion.supplier_id = supplier_id scale_conversion.scale_identifier = scale_identifier self.session.add(scale_conversion) self.scale_conversion_filter.add(row) self.session.flush() return scale_conversion
def get_price_control(self, supplier_id, manufacturer_id, retail, preorder, special): """Price Control""" if self.price_control_filter is None: self.price_control_filter = ScalableBloomFilter() query = self.session.query( PriceControlModel.supplier_id, PriceControlModel.manufacturer_id ) for row in query.yield_per(100): self.price_control_filter.add(row) row = (supplier_id, manufacturer_id) if row in self.price_control_filter: query = self.session.query(PriceControlModel) query = query.filter(PriceControlModel.supplier_id == supplier_id) query = query.filter(PriceControlModel.manufacturer_id == manufacturer_id) if preorder: query = query.filter(PriceControlModel.preorder == True) if special: query = query.filter(PriceControlModel.special == True) if (not preorder) and (not special): query = query.filter(PriceControlModel.normal == True) query = query.filter(PriceControlModel.retail_low <= retail) query = query.filter(PriceControlModel.retail_high >= retail) query = query.filter(PriceControlModel.enable == True) try: price_control = query.one() return price_control except NoResultFound: #logger.warning( # "No PriceControl found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", # supplier_id, # manufacturer_id, # retail, # preorder, # special #) return None except MultipleResultsFound: logger.warning( "Duplicate PriceControls found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", supplier_id, manufacturer_id, retail, preorder, special ) return None
class StockTileExclusions(object):
    """
    Object that keeps track of which stock tiles have already been used.
    """
    def __init__(self, source_image):
        self.source_image = source_image
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=source_image.tiles.count(),
            error_rate=0.0001,  # 1 in 10,000
        )
        existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
        for tile_id, existing_match_id in existing_matches:
            self.bloom_filter.add((tile_id, existing_match_id))

    def __contains__(self, key):
        if key in self.bloom_filter:
            return True
        elif self.source_image.tiles.filter(stock_tile_match_id=key[1]).exists():
            self.add(key)
            return True
        return False

    def add(self, key):
        self.bloom_filter.add(key)
def __init__(self, ioloop=None, start_url=None, max_depth=5):
    super().__init__()
    self.ioloop = ioloop or tornado.ioloop.IOLoop.instance()
    self.start_url = start_url or {}
    self.fetch_queue = Queue()
    self.fetched = []
    self.fetched_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    self.fetch_finished = []
    # iterate over the defaulted attribute so a None start_url doesn't blow up
    for u in self.start_url:
        self.fetch_queue.put(u)
    self.fetching = 0
    self.max_depth = max_depth
def load(cls, filename):
    #import pdb; pdb.set_trace()
    t = cls.transformer
    size = t.size
    with open(filename, "rb") as serialized_digest:
        readdata = serialized_digest.read(size)
        if len(readdata) != size:
            msg = 'invalid amount read from file for format %r: %r (should have been %d)'
            Logger("digest.load").log(msg % (t.format, readdata, size))
            raise ValueError
        nonce, maxcapacity, urlcount, meta = t.unpack(readdata)
        # If meta has a conversion from string repr, use it.
        # (this is a classmethod, so the hook must be looked up on cls rather
        # than on an undefined `self`)
        if hasattr(cls, 'meta_from_string'):
            meta = cls.meta_from_string()
        filterS = ScalableBloomFilter.fromfile(serialized_digest)
    digest = cls(maxcapacity, meta, filename, filterS=filterS, nonce=nonce)
    digest.urlcount = urlcount
    return digest
def load(cls, filename):
    """
    This overrides the base class method to unpack using the siginfo.
    """
    #import pdb; pdb.set_trace()
    t = cls.transformer
    size = t.size
    with open(filename, "rb") as serialized_digest:
        readdata = serialized_digest.read(size)
        if len(readdata) != size:
            msg = 'invalid amount read from file for format %r: %r (should have been %d)'
            Logger("scandigest.load").log(msg % (t.format, readdata, size))
            raise ValueError
        nonce, maxcapacity, urlcount, scannervv, sigversion, sigtimestamp = t.unpack(readdata)
        # Read the datetime as non-utc, since that's how we wrote it with mktime.
        siginfo = SigInfo(scannervv, sigversion,
                          datetime.datetime.fromtimestamp(sigtimestamp))
        filterS = ScalableBloomFilter.fromfile(serialized_digest)
    scandigest = cls(maxcapacity, siginfo, filename, filterS=filterS, nonce=nonce)
    scandigest.urlcount = urlcount
    return scandigest
def get_manufacturer_conversion(self, supplier_id, manufacturer_identifier): """Manufacturer Conversion""" if self.manufacturer_conversion_filter is None: self.manufacturer_conversion_filter = ScalableBloomFilter() query = self.session.query( ManufacturerConversionModel.supplier_id, ManufacturerConversionModel.manufacturer_identifier ) for row in query.yield_per(100): self.manufacturer_conversion_filter.add(row) row = (supplier_id, manufacturer_identifier) if row in self.manufacturer_conversion_filter: query = self.session.query(ManufacturerConversionModel) query = query.filter(ManufacturerConversionModel.supplier_id == supplier_id) query = query.filter(ManufacturerConversionModel.manufacturer_identifier == manufacturer_identifier) try: manufacturer_conversion = query.one() return manufacturer_conversion except NoResultFound: pass query = self.session.query(ManufacturerModel) query = query.filter(ManufacturerModel.identifier == manufacturer_identifier) try: manufacturer = query.one() except NoResultFound: logger.warning("No ManufacturerConversion found for supplier_id '%s' manufacturer_identifier '%s'", supplier_id, manufacturer_identifier) return None manufacturer_conversion = ManufacturerConversionModel() manufacturer_conversion.manufacturer_id = manufacturer.id manufacturer_conversion.supplier_id = supplier_id manufacturer_conversion.manufacturer_identifier = manufacturer_identifier #self.session.add(manufacturer_conversion) return manufacturer_conversion
class ProductSpider(RedisSpider): name = "product" allowed_domains = ["aliexpress.com"] start_urls = ( 'http://www.aliexpress.com/', ) prefix = '' def __init__(self): self.products = dict() self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH) self.redis_queue = None def get_queue(self): for value in set(self.server.smembers(self.redis_key)): yield value def start_requests(self): ProductSpider.prefix = self.settings['prefix'] self.redis_key = '{}:product'.format(ProductSpider.prefix) self.redis_queue = self.get_queue() db = MongoClient().aliexpress for product in db['{}product'.format(ProductSpider.prefix)].find(): self.ids.add(product['url'][product['url'].rfind('/') + 1:product['url'].rfind('.')]) yield self.next_request() def next_request(self): while True: try: url = next(self.redis_queue) except StopIteration: url = None if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])): break if url: return self.make_requests_from_url(url) else: raise CloseSpider('redis queue has no url to request') def parse(self, response): self.log('product url: {}'.format(response.url), logging.INFO) try: store_url = response.css('.shop-name').xpath('a/@href').extract()[0] self.log('crawl store url: {}'.format(store_url), logging.INFO) store_item = UrlItem() store_item['prefix'] = ProductSpider.prefix store_item['type'] = 'store' store_item['url'] = store_url yield store_item feedback_base_url = response.xpath('//div[@id="feedback"]/iframe/@thesrc').extract()[0] parsed = urlparse.urlparse(feedback_base_url) product_id = urlparse.parse_qs(parsed.query)['productId'][0] try: percent_num = response.css('.percent-num').xpath('text()').extract()[0] rantings_text = response.css('.rantings-num').xpath('text()').extract()[0] rantings_num = rantings_text[1:rantings_text.index(' ')] order_text = response.css('.order-num').xpath('text()').extract()[0] order_num = order_text[:order_text.index(' ')] except: percent_num = 0 rantings_num = 0 order_num = 0 product_item = ProductItem() product_item['prefix'] = ProductSpider.prefix product_item['_id'] = product_id product_item['store'] = store_url product_item['url'] = response.url product_item['percent_num'] = percent_num product_item['rantings_num'] = rantings_num product_item['order_num'] = order_num yield product_item feedback_item = UrlItem() feedback_item['prefix'] = ProductSpider.prefix feedback_item['type'] = 'feedback' feedback_item['url'] = feedback_base_url yield feedback_item order_item = UrlItem() order_item['prefix'] = ProductSpider.prefix order_item['type'] = 'order' order_item[ 'url'] = 'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?productId={}&type=default'.format( product_id) yield order_item except: try: product_url = response.meta['redirect_urls'][0] except: product_url = response.url self.log('strange product url: {}'.format(product_url), logging.ERROR) finally: self.log('meet anti-spider, back product: {}'.format(product_url), logging.INFO) url_item = UrlItem() url_item['prefix'] = ProductSpider.prefix url_item['type'] = 'product' url_item['url'] = product_url yield url_item
def run(args): """ read FASTQ or SAM and tabulate basic metrics """ time_start = time.time() if args.input.name != '<stdin>': bsize = os.path.getsize(args.input.name) est_counter = int() sample_lengths = list() sample_binsizes = list() act_nlines = int() name, ext = os.path.splitext(args.input.name) if (args.leftlimit > 0) and (args.rightlimit > 0): if args.rightlimit < args.leftlimit: sys.exit("Left limit must be less than right limit.\n") if args.type: ext = '.' + args.type if ext not in ['.fq','.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>': sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n") if args.name: sample_name = args.name else: sample_name = args.input.name # estimate the number of lines in args.input if we can if ext in ['.fastq','.fq']: with FastqReader(open(args.input.name)) as fh: for read in fh: sample_lengths.append(len(read)) sample_binsizes.append(len(str(read))) est_counter += 1 if est_counter == 10000: break mean_bentry = mean(sample_binsizes) mean_len = mean(sample_lengths) est_nlines = int(bsize / mean_bentry) if not args.quiet: sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length " "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry, len=mean_len, est=est_nlines)) elif ext == '.sam': with Reader(open(args.input.name)) as fh: for read in fh: sample_lengths.append(len(read)) sample_binsizes.append(len(str(read))) est_counter += 1 if est_counter == 10000: break mean_bentry = mean(sample_binsizes) mean_len = mean(sample_lengths) est_nlines = int(bsize / mean_bentry) if not args.quiet: sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length " "we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry, len=mean_len, est=est_nlines)) elif ext == '.bam': est_nlines = sum(bam_read_count(args.input.name)) if not args.quiet: sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines)) elif ext == '.gz': if args.binsize: n = args.binsize est_nlines = None if not args.quiet: sys.stderr.write("Reading from gzipped file, bin size (-s) set to {binsize:n}.\n".format(binsize=n)) else: sys.stderr.write("Gzipped file detected. 
Reading file to determine bin size (-s).\n") p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE) p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE) est_nlines, _ = p2.communicate() est_nlines = int(est_nlines) // 4 if not args.quiet: sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines)) elif name == '<stdin>': if args.binsize: n = args.binsize else: n = 1 if not args.quiet: sys.stderr.write("Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".format(binsize=n)) est_nlines = None if est_nlines is not None: # set up factor for sampling bin size if args.binsize: n = args.binsize else: nf = math.floor(est_nlines / args.nreads) if nf >= 1: n = int(nf) else: n = 1 if not args.quiet: sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n)) if ext in ['.sam', '.bam']: infile = Reader(args.input) else: infile = FastqReader(args.input, ext=ext) read_len = defaultdict(int) cycle_nuc = defaultdict(lambda: defaultdict(int)) cycle_qual = defaultdict(lambda: defaultdict(int)) cycle_gc = defaultdict(int) cycle_kmers = defaultdict(lambda: defaultdict(int)) cycle_mismatch = {'C': defaultdict(lambda: defaultdict(int)), 'G': defaultdict(lambda: defaultdict(int)), 'A': defaultdict(lambda: defaultdict(int)), 'T': defaultdict(lambda: defaultdict(int))} if args.count_duplicates: try: from pybloom import ScalableBloomFilter bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) except ImportError: sys.exit("--count-duplicates option requires 'pybloom' package.\n") duplicates = 0 percent_complete = 10 reads = infile.subsample(n) for read in reads: if isinstance(read, Sam): if args.aligned_only and not read.mapped: continue elif args.unaligned_only and read.mapped: continue if read.reverse: seq = read.seq[::-1] qual = read.qual[::-1] else: seq = read.seq qual = read.qual else: seq = read.seq qual = read.qual # Set up limits if (args.leftlimit == 1) and (args.rightlimit < 0): pass elif (args.leftlimit >= 1) and (args.rightlimit > 0): try: seq = seq[args.leftlimit - 1:args.rightlimit] qual = qual[args.leftlimit - 1:args.rightlimit] except IndexError: act_nlines += n continue elif (args.leftlimit > 1) and (args.rightlimit < 0): try: seq = seq[args.leftlimit - 1:] qual = qual[args.leftlimit - 1:] except IndexError: act_nlines += n continue if len(seq) == 0: act_nlines += n continue cycle_gc[gc(seq)] += 1 if args.count_duplicates: if seq in bloom_filter: duplicates += 1 else: bloom_filter.add(seq) for i, (s, q) in enumerate(zip(seq, qual)): cycle_nuc[args.leftlimit + i][s] += 1 cycle_qual[args.leftlimit + i][q] += 1 read_len[len(qual)] += 1 for i, kmer in enumerate(window(seq, n=args.kmer)): cycle_kmers[args.leftlimit+i][kmer] += 1 if isinstance(read, Sam) and read.mapped: try: ref = read.parse_md() for i, (s, r) in enumerate(zip(seq, ref)): if s != r: try: cycle_mismatch[r][args.leftlimit+i][s] += 1 except KeyError: pass except KeyError: pass if est_nlines is not None: if (act_nlines / est_nlines) * 100 >= percent_complete: sys.stderr.write("Approximately {0:n}% complete at " "read {1:,} in {2}\n".format(percent_complete, act_nlines, time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))) percent_complete += 10 act_nlines += n positions = [k for k in sorted(cycle_qual.keys())] depths = [read_len[k] for k in sorted(read_len.keys())] basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())] bases = set(list(itertools.chain.from_iterable(basecalls))) #nbasecalls = [ '\t'.join([str(cycle_nuc[p].get(k, 0)) for 
k in bases]) for p in sorted(cycle_nuc.keys())] map(padbases(bases), cycle_nuc.values()) quantile_values = [0.05,0.25,0.5,0.75,0.95] quantiles = [] ## replace ASCII quality with integer for _, v in sorted(cycle_qual.items()): for q in tuple(v.keys()): ## py3 keys are iterator, so build a tuple to avoid recursion v[ord(str(q)) - 33] = v.pop(q) line = [percentile(v, p) for p in quantile_values] quantiles.append(line) # build kmer set of known adapter sequences adapter_kmers = set() for adapter in all_adapter_sequences: for kmer in window(adapter, n=args.kmer): adapter_kmers.add(kmer) # test for nonuniform kmer profiles and calculate obs/exp observed_expected = dict() all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())] kmers = set(list(itertools.chain.from_iterable(all_kmers))) bad_kmers = [] sequenced_bases = sum((l * n for l, n in read_len.items())) priors = tuple(map(float, args.base_probs.split(','))) for kmer in kmers: kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())] expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1) expected = expected_fraction * sequenced_bases observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts)) if abs(slope) > 2 and p_value < 0.05: bad_kmers.append((kmer, slope, p_value)) bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10] pos_gc = [sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([cycle_nuc[i]['C'], cycle_nuc[i]['G'], cycle_nuc[i]['A'], cycle_nuc[i]['T']]) * 100 for i in positions] # see http://vita.had.co.nz/papers/tidy-data.pdf sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='reads', pos='None', value=act_nlines)) for cycle, count in read_len.items(): sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_len', pos=cycle, value=count)) for i, position in enumerate(positions): sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q05', pos=position, value=quantiles[i][0])) sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q25', pos=position, value=quantiles[i][1])) sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q50', pos=position, value=quantiles[i][2])) sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q75', pos=position, value=quantiles[i][3])) sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q95', pos=position, value=quantiles[i][4])) for base in bases: for position in positions: sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column=base, pos=position, value=cycle_nuc[position][base])) for position in positions: sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='cycle_gc', pos=position, value=cycle_gc[position])) for i in range(101): sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_gc', pos=i, value=cycle_gc[i])) for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]): sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column=kmer, pos='None', value=obs_exp)) if args.count_duplicates: sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='duplicate', pos='None', value=duplicates/act_nlines)) from zipfile import 
ZipFile with ZipFile(args.output + '.zip', mode='w') as zip_archive: fig_kw = {'figsize':(8, 6)} qualplot(positions, quantiles, zip_archive, fig_kw) median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw) qualmap(cycle_qual, zip_archive, fig_kw) depthplot(read_len, zip_archive, fig_kw) gcplot(positions, pos_gc, zip_archive, fig_kw) gcdist(cycle_gc, zip_archive, fig_kw) nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw) kmerplot(positions, cycle_kmers, zip_archive, [fields[0] for fields in bad_kmers], fig_kw) adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw) if isinstance(infile, Reader): mismatchplot(positions , cycle_mismatch, zip_archive, fig_kw) time_finish = time.time() elapsed = time_finish - time_start if not args.quiet: sys.stderr.write("There were {counts:,} reads in the file. Analysis finished in {sec}.\n".format(counts=act_nlines, sec=time.strftime('%H:%M:%S', time.gmtime(elapsed)) )) if len(bad_kmers) > 0: for kmer in bad_kmers: sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile (slope = %s, p = %s).\n" % (kmer)) if median_qual < args.median_qual: sys.stderr.write("QualityWarning: median base quality score is %s.\n" % median_qual)