class A25ppSpider(scrapy.Spider):
    name = '25pp'
    allowed_domains = ['25pp.com']
    start_urls = [
        'https://www.25pp.com/android/',
        'https://www.25pp.com/android/game/'
    ]
    base_url = 'https://www.25pp.com'

    def __init__(self, checkpoint=None, *a, **kw):
        super(A25ppSpider, self).__init__(*a, **kw)
        self.bf = BloomFilter(capacity=10000000)      # de-duplicates visited urls
        self.apkbf = BloomFilter(capacity=100000000)  # de-duplicates apk ids
        self.checkpoint = checkpoint
        if checkpoint is not None:
            # Seed the apk filter with ids collected in a previous run.
            with open(checkpoint, 'r') as fd:
                for line in fd:
                    self.apkbf.add(line.strip())

    def start_requests(self):
        for url in self.start_urls:
            self.bf.add(url)
            yield Request(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                dont_filter=True)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        categorypattern = re.compile(r'fenlei/[0-9]+')
        for aitem in soup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if categorypattern.search(href) is None:
                continue
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)

    def parse_category(self, response):
        print(response.url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        category = soup.select('.active')[2].get_text()
        print(category)
        applist = soup.select('.app-list')[0]
        pagelist = soup.select('.page-wrap')[0]
        for aitem in applist.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                meta={'category': category},
                callback=self.parse_detail)
        for aitem in pagelist.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            yield Request(
                url=href,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)

    def parse_detail(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        appinfo = soup.select('.app-info')[0]
        commonname = appinfo.select('.app-title')[0].get_text()
        pls = soup.select('.permission-list')
        permissionlist = list()
        if len(pls) != 0:
            for perm in pls[0].select('.clearfix')[0].find_all('li'):
                permissionlist.append(perm.get_text())
        category = response.meta['category']
        detail_info = soup.select('.app-detail-info')[0].select('strong')
        size = detail_info[1].get_text()
        updatetime = detail_info[0].get_text()
        version = detail_info[2].get_text()
        urllink = soup.select('.btn-install')[0]['appdownurl']
        platform = self.name
        detailpattern = re.compile(r'detail_[0-9]+')
        idpattern = re.compile(r'[0-9]+')
        detailstring = detailpattern.search(response.url).group()
        apkid = idpattern.search(detailstring).group()
        packagename = commonname
        if apkid in self.apkbf:
            return
        self.apkbf.add(apkid)  # remember this apk id, as the other spiders do
        print("apkid%s" % apkid)
        item = ItemLoader(item=ApkspiderItem(), response=response)
        item.add_value('commonname', commonname)
        item.add_value('apkid_specifiedbyplaform', apkid)
        item.add_value('apkplaform', platform)
        item.add_value('category', category)
        item.add_value('packagename', packagename)
        item.add_value('updatetime', updatetime)
        item.add_value('size', size)
        item.add_value('version', version)
        item.add_value('permission', permissionlist)
        item.add_value('urllink', urllink)
        item.add_value('file_urls', urllink)
        item.add_value('checkpoint', self.checkpoint)
        yield item.load_item()
def rawincount(filename):
    # Fast newline count: stream the file in 1 MiB chunks and count b'\n'.
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x,
                       (f.raw.read(1024 * 1024) for _ in repeat(None)))
    return sum(buf.count(b'\n') for buf in bufgen)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate bloom filter files')
    parser.add_argument("--base_path", default='../')
    parser.add_argument("--path", help="dataset path",
                        default="data/LC-QUAD/linked_answer6.json",
                        dest="dataset_path")
    parser.add_argument('--create', action='store_true')
    args = parser.parse_args()

    bloom = BloomFilter(capacity=200000000, error_rate=0.000001)
    dbpedia_path = os.path.join(args.base_path, 'data', 'dbpedia')
    blooms_path = os.path.join(args.base_path, 'data', 'blooms')

    if args.create:
        for ttl_file in os.listdir(dbpedia_path):
            if '.ttl' not in ttl_file:
                continue
            print(ttl_file)
            file_path = os.path.join(dbpedia_path, ttl_file)
            with open(file_path, 'r') as f:
                for line in tqdm(f, total=rawincount(file_path)):
                    items = line.split(' ')
                    if len(items) != 4:
                        continue
                    items = items[:-1]
basedir = os.path.dirname(__file__)


def row_to_dict(row):
    row = row.strip()
    record = dict(xmltodict.parse(row)['row'])
    return {k.replace('@', '').lower(): v for k, v in record.items()}


#
# TRAIN BLOOM FILTER
#

# -- training the Bloom filter
bf = BloomFilter(capacity=10**5, error_rate=0.01)
with open('./resources/0.xml', 'r') as f:
    for line in f:
        user = row_to_dict(line)
        bf.add(user['displayname'])

with open('./resources/hot_displayname.bf', 'wb') as f:
    bf.tofile(f)


#
# MAP REDUCE JOB USING THE FILTER
#
class NotHotFilterJob(MRJob):
    def mapper_init(self):
#     'connection': 'keep-alive',
#     'cache-control': 'no-cache',
#     'upgrade-insecure-requests': '1',
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36 QQBrowser/4.2.4976.400',
#     'Accept': 'text/html, */*; q=0.01',
#     'Accept-Language': 'zh-CN,zh;q=0.8',
# }
# req = request.Request(url, headers=request_headers)
# response = request.urlopen(req)
# return response.read()

city_home_pages = []
city_ids = []
dirname = 'mafengwo_notes/'
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)


def download_city_notes(id):
    for i in range(1, 999):
        url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i)
        if url in download_bf:
            continue
        print('open url' + url)
        htmlcontent = do_request(url).decode('utf-8')
        # All travel notes on this page; a regex is used because the matches
        # can be reused directly as URLs.
        city_notes = re.findall('href="/i/\d{7}.html', htmlcontent)
        if len(city_notes) == 0:
def __init__(self, path=None):
    self.file = None
    self.fingerprints = BloomFilter(2000000, 0.00001)
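# A minimal sketch (not part of the original snippet) of how a Scrapy-style
# duplicate filter would typically consult this Bloom filter of fingerprints;
# request_fingerprint is the helper from scrapy.utils.request, and its use
# here is an assumption about the surrounding project.
from scrapy.utils.request import request_fingerprint


def request_seen(self, request):
    fp = request_fingerprint(request)
    if fp in self.fingerprints:
        return True            # probably a duplicate, skip the request
    self.fingerprints.add(fp)  # remember this fingerprint
    return False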
def __init__(self, save_queue, filename):
    super(Sava_key, self).__init__()
    self.save_queue = save_queue
    self.filename = filename
    self.boom = BloomFilter(capacity=1e7, error_rate=0.001)
def __init__(self, disk_filepath, block_size, n_blocks, n_input_data, growth_factor=2, pointer_density=0.1): super(FractionalBloomCola, self).__init__(disk_filepath, block_size, n_blocks, n_input_data) self.g = int(growth_factor) self.p = float(pointer_density) self.bloom_filter = BloomFilter(capacity=self.n_input_data, error_rate=ERROR_RATE) # compute the number of levels needed to store all input data self.n_levels = 1 n_elements = 1 while n_elements < self.n_input_data: level_size = 2 * (self.g - 1) * self.g**(self.n_levels - 1) level_n_lookahead = int( math.floor(2 * self.p * (self.g - 1) * self.g**(self.n_levels - 1))) n_elements += (level_size - level_n_lookahead) self.n_levels += 1 self.n_levels += 1 # compute the number of lookahead pointers self.level_sizes = [1] + [(2 * (self.g - 1) * self.g**(i - 1)) for i in range(1, self.n_levels)] self.level_n_lookaheads = [0] + [ int(math.floor(2 * self.p * (self.g - 1) * self.g**(i - 1))) for i in range(1, self.n_levels) ] self.level_n_items = np.zeros(self.n_levels, dtype=int) self.disk_size = np.sum(self.level_sizes) self.level_start_idxs = np.zeros(self.n_levels, dtype=int) for i in range(1, self.n_levels ): # preform prefix sum to get start idxs for the level self.level_start_idxs[i] = self.level_start_idxs[ i - 1] + self.level_sizes[i - 1] # create storage file. if os.path.exists(disk_filepath): os.remove(disk_filepath) else: dirname = os.path.dirname(disk_filepath) if not os.path.exists(dirname): os.makedirs(dirname) disk = h5py.File(self.disk_filepath, 'w') disk.create_dataset('dataset', shape=(self.disk_size, ), dtype=np.int) disk.create_dataset('is_lookaheads', shape=(self.disk_size, ), dtype=np.bool) disk.create_dataset('references', shape=(self.disk_size, ), dtype=np.int) disk.close() self.disk = h5py.File(self.disk_filepath, 'r+') self.data = self.disk['dataset'] self.is_lookaheads = self.disk['is_lookaheads'] self.references = self.disk['references'] self.n_items = 0 self.final_insert_level = 0
from pybloom_live import ScalableBloomFilter

# ScalableBloomFilter grows automatically as elements are added.
bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)

url1 = 'http://www.baidu.com'
url2 = 'http://qq.com'

bloom.add(url1)
print(url1 in bloom)
print(url2 in bloom)

# BloomFilter has a fixed capacity.
from pybloom_live import BloomFilter

url1 = 'http://www.baidu.com'
url2 = 'http://qq.com'

bf = BloomFilter(capacity=1000)
bf.add(url1)
print(url1 in bf)
print(url2 in bf)
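# A minimal sketch (not from the original snippet) contrasting the two types at
# capacity: a fixed-size BloomFilter raises IndexError once it is full, while a
# ScalableBloomFilter keeps accepting items by allocating additional internal
# filters.
from pybloom_live import BloomFilter, ScalableBloomFilter

small = BloomFilter(capacity=10, error_rate=0.001)
try:
    for i in range(20):            # deliberately exceed the capacity of 10
        small.add('item-%d' % i)
except IndexError as exc:
    print('fixed filter refused the add:', exc)

growing = ScalableBloomFilter(initial_capacity=10, error_rate=0.001)
for i in range(1000):              # far beyond the initial capacity
    growing.add('item-%d' % i)
print(len(growing))                # approximately 1000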
from os.path import abspath, dirname
import sqlite3

from flask import Flask, request
from pybloom_live import BloomFilter

here = dirname(abspath(__file__))
db_file = '{}/events.db'.format(here)

app = Flask(__name__)
id_cache = BloomFilter(1000000)
cursor = None


def db_connect():
    schema = '''
        CREATE TABLE IF NOT EXISTS events (
            id CHAR(32) PRIMARY KEY
        );
    '''
    conn = sqlite3.connect(db_file)
    conn.executescript(schema)
    return conn.cursor()


def event_in_db(event_id):
    cursor.execute('SELECT * FROM events WHERE id = ?', (event_id, ))
    return cursor.fetchone() is not None


def insert_new_event(event_id):
class FractionalBloomCola(WriteOptimizedDS): def __init__(self, disk_filepath, block_size, n_blocks, n_input_data, growth_factor=2, pointer_density=0.1): super(FractionalBloomCola, self).__init__(disk_filepath, block_size, n_blocks, n_input_data) self.g = int(growth_factor) self.p = float(pointer_density) self.bloom_filter = BloomFilter(capacity=self.n_input_data, error_rate=ERROR_RATE) # compute the number of levels needed to store all input data self.n_levels = 1 n_elements = 1 while n_elements < self.n_input_data: level_size = 2 * (self.g - 1) * self.g**(self.n_levels - 1) level_n_lookahead = int( math.floor(2 * self.p * (self.g - 1) * self.g**(self.n_levels - 1))) n_elements += (level_size - level_n_lookahead) self.n_levels += 1 self.n_levels += 1 # compute the number of lookahead pointers self.level_sizes = [1] + [(2 * (self.g - 1) * self.g**(i - 1)) for i in range(1, self.n_levels)] self.level_n_lookaheads = [0] + [ int(math.floor(2 * self.p * (self.g - 1) * self.g**(i - 1))) for i in range(1, self.n_levels) ] self.level_n_items = np.zeros(self.n_levels, dtype=int) self.disk_size = np.sum(self.level_sizes) self.level_start_idxs = np.zeros(self.n_levels, dtype=int) for i in range(1, self.n_levels ): # preform prefix sum to get start idxs for the level self.level_start_idxs[i] = self.level_start_idxs[ i - 1] + self.level_sizes[i - 1] # create storage file. if os.path.exists(disk_filepath): os.remove(disk_filepath) else: dirname = os.path.dirname(disk_filepath) if not os.path.exists(dirname): os.makedirs(dirname) disk = h5py.File(self.disk_filepath, 'w') disk.create_dataset('dataset', shape=(self.disk_size, ), dtype=np.int) disk.create_dataset('is_lookaheads', shape=(self.disk_size, ), dtype=np.bool) disk.create_dataset('references', shape=(self.disk_size, ), dtype=np.int) disk.close() self.disk = h5py.File(self.disk_filepath, 'r+') self.data = self.disk['dataset'] self.is_lookaheads = self.disk['is_lookaheads'] self.references = self.disk['references'] self.n_items = 0 self.final_insert_level = 0 def insert(self, item): insert_data = [item] self.n_items += 1 n_inserts = 1 next_level_data = None self.bloom_filter.add(item, skip_check=True) # perform the downward merge last_insert_level = 0 for i in range(self.n_levels): level_start_idx = self.level_start_idxs[i] level_n_items = self.level_n_items[i] level_size = self.level_sizes[i] level_end_idx = level_start_idx + level_n_items level_data = self.data[level_start_idx:level_end_idx] level_is_lookaheads = self.is_lookaheads[ level_start_idx:level_end_idx] level_references = self.references[level_start_idx:level_end_idx] merge_size = n_inserts + level_n_items merged_data = np.zeros(shape=merge_size, dtype=int) merged_is_lookaheads = np.zeros(shape=merge_size, dtype=bool) merged_references = np.zeros(shape=merge_size, dtype=int) # perform the merge here, we merge to the front of the merge array. merged_i, insert_i, level_i = 0, 0, 0 leftmost_lookahead_idx = INVALID_IDX while level_i < level_n_items and insert_i < n_inserts: if level_data[level_i] <= insert_data[ insert_i]: # insert level items merged_data[merged_i] = level_data[level_i] merged_is_lookaheads[merged_i] = level_is_lookaheads[ level_i] # if is lookahead pointer, then if merged_is_lookaheads[merged_i]: merged_references[merged_i] = level_references[level_i] leftmost_lookahead_idx = merged_i else: # not lookahead, so point to the nearest lookahead. 
merged_references[merged_i] = leftmost_lookahead_idx level_i += 1 else: merged_data[merged_i] = insert_data[insert_i] merged_is_lookaheads[merged_i] = False merged_references[merged_i] = leftmost_lookahead_idx insert_i += 1 merged_i += 1 if insert_i < n_inserts: assert level_i == level_n_items merged_data[merged_i:] = insert_data[insert_i:] merged_is_lookaheads[merged_i:] = np.zeros_like( insert_data[insert_i:], dtype=bool) merged_references[merged_i:] = np.ones_like( insert_data[insert_i:], dtype=int) * leftmost_lookahead_idx elif level_i < level_n_items: assert insert_i == n_inserts merged_data[merged_i:] = level_data[level_i:] merged_is_lookaheads[merged_i:] = level_is_lookaheads[level_i:] for j, is_lookahead in enumerate( level_is_lookaheads[level_i:]): if is_lookahead: merged_references[merged_i + j] = level_references[level_i + j] leftmost_lookahead_idx = level_i + j else: merged_references[merged_i + j] = leftmost_lookahead_idx if level_n_items + n_inserts > level_size: # it will be full, grab all non-pointers self.level_n_items[i] = 0 data_idxs = np.argwhere( np.bitwise_not(merged_is_lookaheads)).reshape(-1) insert_data = merged_data[data_idxs] n_inserts = len(insert_data) else: self.level_n_items[i] = merge_size level_end_idx = level_start_idx + merge_size # perfrom writes here. self.data[level_start_idx:level_end_idx] = merged_data self.is_lookaheads[ level_start_idx:level_end_idx] = merged_is_lookaheads self.references[ level_start_idx:level_end_idx] = merged_references # update for searches self.final_insert_level = max(self.final_insert_level, i) # update for the upward insertion of lookahead pointers last_insert_level = i next_level_data = merged_data break # perform the upward insertion of lookahead pointers, note that all upper levels were merged # and should not have any items, so we can simply override them. for i in reversed(range(last_insert_level)): level_n_lookahead = self.level_n_lookaheads[i] if level_n_lookahead == 0: break # no more lookaheads next_level_size = self.level_sizes[i + 1] next_level_n_items = self.level_n_items[i + 1] assert len(next_level_data) == next_level_n_items lookahead_stride = next_level_size // level_n_lookahead lookahead_references = [ ref for ref in range(lookahead_stride - 1, next_level_n_items, lookahead_stride) ] n_lookahead = len(lookahead_references) if n_lookahead == 0: break # no more lookahead pointers to insert. 
lookahead_data = next_level_data[lookahead_references] # update n_items self.level_n_items[i] = n_lookahead level_start_idx = self.level_start_idxs[i] level_end_idx = level_start_idx + n_lookahead # write to disk self.data[level_start_idx:level_end_idx] = lookahead_data self.is_lookaheads[level_start_idx:level_end_idx] = np.ones( shape=n_lookahead, dtype=bool) self.references[ level_start_idx:level_end_idx] = lookahead_references # update for next iteration next_level_data = lookahead_data def query(self, item): idx = self._search(item) return idx > INVALID_IDX def _search(self, item): if item not in self.bloom_filter: return INVALID_IDX n_search_levels = self.final_insert_level + 1 search_start = INVALID_IDX search_end = INVALID_IDX for i in range(n_search_levels): if search_start == INVALID_IDX: search_start = 0 level_n_item = self.level_n_items[i] if search_end == INVALID_IDX: search_end = level_n_item assert search_start <= search_end if search_end - search_start == 0: search_start = INVALID_IDX search_end = INVALID_IDX continue level_start_idx = self.level_start_idxs[i] start_idx = level_start_idx + search_start end_idx = level_start_idx + search_end search_arr = self.data[start_idx:end_idx] l, r = self.binary_search(search_arr, item) is_found = (l == r) and (l != INVALID_IDX) if is_found: loc = start_idx + l is_lookahead = self.is_lookaheads[loc] if is_lookahead: reference = self.references[loc] search_start = reference search_end = reference + 1 else: return loc else: if l == INVALID_IDX: search_start = INVALID_IDX else: loc = start_idx + l is_lookahead = self.is_lookaheads[loc] reference = self.references[loc] if is_lookahead: search_start = reference else: if reference == INVALID_IDX: search_start = INVALID_IDX else: loc = level_start_idx + reference search_start = self.references[loc] if r == INVALID_IDX: search_end = INVALID_IDX else: loc = start_idx + r is_lookahead = self.is_lookaheads[loc] reference = self.references[loc] if is_lookahead: search_end = reference else: search_end = INVALID_IDX is_lookaheads = self.is_lookaheads[level_start_idx + r + 1:level_start_idx + level_n_item] for j, is_lookahead in enumerate(is_lookaheads): if is_lookahead: reference = self.references[level_start_idx + r + 1 + j] search_end = reference return -1 @staticmethod def binary_search(search_arr, item): # boundary conditions search_arr = np.array(search_arr, dtype=int) last_idx = len(search_arr) - 1 if item == search_arr[0]: # if item is found at the startign idx return 0, 0 if item == search_arr[-1]: # if item is found at the last idx return last_idx, last_idx if item > search_arr[-1]: # if item is bigger than all items return last_idx, INVALID_IDX if item < search_arr[0]: # if item is smaller than all items return INVALID_IDX, 0 l = 0 h = last_idx while (l + 1) < h: # terminate when l + 1 = h mid = (l + h) // 2 if item == search_arr[mid]: return mid, mid elif item < search_arr[mid]: h = mid else: # item > search_arr[mid] l = mid return l, h
from spider.io.pojo import User, Title, TitleDetail, Img
from spider.io.DB import session
from spider.io.DB import engine
from spider.io.DB import Base

Base.metadata.create_all(engine)

# redis
r = redis.Redis(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'])


def processUserURL(url):
    return url.split('&ie=')[0]


user_bloomFilter = BloomFilter(capacity=2 << 25, error_rate=0.01)
img_bloomFilter = BloomFilter(capacity=2 << 15, error_rate=0.01)


class TiebaPipeline(object):
    def process_item(self, item, spider):
        user_url = processUserURL(item['user_url'])
        t = Title(url=item['url'], title=item['title'], user_url=user_url)
        if 'home/main?un' in user_url and user_url not in user_bloomFilter:
            user_bloomFilter.add(user_url)
            r.lpush('tieba:user_urls', user_url)
        else:
            print('unsupported user_url:%s' % user_url)
        r.lpush('tieba:title_urls', item['url'])
        session.add(t)  # @UndefinedVariable
        session.commit()  # @UndefinedVariable
from pybloom_live import BloomFilter

# Create a BloomFilter with capacity 1000 and error rate 0.001.
f = BloomFilter(capacity=1000, error_rate=0.001)

# Add the numbers 0-4 to the filter and print the return values
# (False means the element was newly added).
res = [f.add(x) for x in range(5)]
print(res)

# Add 3 again; add() returns True because it is already present.
print(f.add(3))

# Check whether 10 and 5 are in the filter and print the results.
print(10 in f)
print(5 in f)
class ProproHelper: def __init__(self, filepath="../src_tgt.txt", vocabfile="../miniparapair/fastText/vocab.txt", wordVecfile="../miniparapair/fastText/wordVec.txt", sentencesfile="../miniparapair/fastText/sentenceSet.txt", maxlen=25, capacity=250000000): self.filepath = filepath self.vocabfile = vocabfile self.wordVecfile = wordVecfile self.sentencesfile = sentencesfile self.maxlen = maxlen self.bf = BloomFilter(capacity=capacity) def extractVocabsAndSentences(self): ''' 这里考虑到词典需要100%准确,所以词典采用集合的方式去重 这里句子 采用布隆过滤器 进行去重 随时一点精度 :param vocabfile: 保存单词 :param sentencesfile: 保存所有的句子 :return: ''' vocabSet = set() sentencesTokenSet = [] #这是存储所有已经分好词句子 里面没有重复的 num = 0 try: with open(self.filepath, mode="r", encoding="utf-8") as fr: for line in fr: try: num += 1 if num % 100000 == 0: print("数据正在提取单词,数据正在去重,,,", num / 233864191) line = line.strip() if line != "": sen1, sen2 = line.split("---xhm---") if len(sen1) > self.maxlen or len( sen2) > self.maxlen: # 长度太大的不需要 continue words_1 = list(jieba.cut(sen1)) words_2 = list(jieba.cut(sen2)) # 将单词添加到单词集合中 for word in words_1: if word not in vocabSet: vocabSet.add(word) for word in words_2: if word not in vocabSet: vocabSet.add(word) #将句子添加到句子集合中 if sen1 not in self.bf: sentencesTokenSet.append(" ".join(words_1)) self.bf.add(sen1) if sen2 not in self.bf: sentencesTokenSet.append(" ".join(words_2)) self.bf.add(sen2) except Exception: print("这是出错的行", line) except Exception: print("内部错误") with open(self.vocabfile, mode="w", encoding="utf-8") as fw: fw.write("\n".join(vocabSet)) with open(self.sentencesfile, mode="w", encoding="utf-8") as fw: fw.write("\n".join(sentencesTokenSet)) def computeAndSaveWord2vec(self): fr = open(self.vocabfile, mode="r", encoding="utf-8") fw = open(self.wordVecfile, mode="w", encoding="utf-8") for line in fr: line = line.strip() if line != "": vec = fastTextHelper.getWordVec(line) vec = [str(num) for num in vec] fw.write(line + " " + " ".join(vec) + "\n") fr.close() fw.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : 1.python布隆过滤器.py

# Install with: pip install pybloom-live
from pybloom_live import BloomFilter

# capacity is the capacity, error_rate is the tolerated false-positive rate.
f = BloomFilter(capacity=1000, error_rate=0.001)

# add() returns False if the element was definitely absent,
# and True if it may already be present.
state = f.add('你好')

# ScalableBloomFilter: grows automatically.
from pybloom_live import ScalableBloomFilter

# SMALL_SET_GROWTH is the growth mode.
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
sbf.add('你好')  # used the same way as BloomFilter
import os

from pybloom_live import BloomFilter

# coon = pymysql.connect(host='127.0.0.1', user='******', passwd='qwer', db='haining')
# cur = coon.cursor()
# cur.execute("SELECT room_id from haining_room")
# room_urls = cur.fetchall()

ls = ["1049be49dc584707"]

os.chdir(r'E:\Myproject\Scan\chizhou\chizhou\spiders')
is_exist = os.path.exists('chizhou.blm')  # check whether the bloom file exists

# If it exists, load the filter from the file.
if is_exist:
    bf = BloomFilter.fromfile(open('chizhou.blm', 'rb'))
# Otherwise create a new BloomFilter object and save it to file at the end.
else:
    bf = BloomFilter(1000000, 0.0000001)

i = 1
for room_url in ls:
    if room_url in bf:
        print('pass')
        pass
    else:
        # Add it to the bloom filter.
        bf.add(room_url)
        print('added %s items' % i)
        i += 1

# Create and write the bloom file (single write).
def trial(fd): params = search_params() for blk_size in BLK_SIZE: for fpr_r in FPR_RECEIVER: for fraction in FRACTION: # True_positives is the number of txns in the blk the receiver has true_positives = int(blk_size * fraction) true_false_positives = blk_size - true_positives mempool_size = true_false_positives + true_positives assert mempool_size == blk_size print( 'Running %d trials for parameter combination: extra txns in mempool %d blk size %d CB bound %f fraction %f' % (NUM_TRIAL, true_false_positives, blk_size, bound, fraction)) # Size of Compact block (inv + getdata) getdata = true_false_positives * TXN_SHORT_BYTES_CB inv = blk_size * TXN_SHORT_BYTES_CB compact = inv + getdata for i in range(NUM_TRIAL): blk, receiver_mempool = create_mempools( mempool_size, fraction, blk_size, true_false_positives) # Sender creates BF of blk a, fpr_sender, iblt_rows_first = params.CB_solve_a( mempool_size, blk_size, blk_size, 0, bound) bloom_sender = BloomFilter(blk_size, fpr_sender) tmp = blk_size + 0.5 exponent = (-bloom_sender.num_slices * tmp) / (bloom_sender.num_bits - 1) real_fpr_sender = (1 - exp(exponent))**bloom_sender.num_slices #exponent = (-bloom_sender.num_slices*blk_size) / bloom_sender.num_bits #tmp = (1-exp(exponent)) ** bloom_sender.num_slices #real_fpr_sender = max(tmp, fpr_sender) #assert real_fpr_sender >= fpr_sender # Sender creates IBLT of blk iblt_sender_first = PYBLT(a, TXN_SHORT_BYTES) # Add to BF and IBLT for txn in blk: bloom_sender.add(txn) iblt_sender_first.insert(txn, 0x0) # Receiver computes how many items pass through BF of sender and creates IBLT iblt_receiver_first = PYBLT(a, TXN_SHORT_BYTES) Z = [] for txn in receiver_mempool: if txn in bloom_sender: Z.append(txn) iblt_receiver_first.insert(txn, 0x0) #(id and content) z = len(Z) observed_false_positives = z - true_positives # Eppstein subtraction T = iblt_receiver_first.subtract(iblt_sender_first) boolean, result = T.list_entries() #assert boolean == False # Check whether decoding successful if boolean == True: flag, in_blk = decode_blk(result, Z, blk) # Each component of graphene blk size first_IBLT = (iblt_rows_first * TAU) first_BF = (bloom_sender.num_bits / 8.0) extra = (len(in_blk) * TXN_SHORT_BYTES) # Compute size of Graphene block graphene = first_IBLT + first_BF + extra fd.write( str(true_false_positives) + '\t' + str(blk_size) + '\t' + str(bound) + '\t' + str(fraction) + '\t' + str(mempool_size) + '\t' + str(fpr_sender) + '\t' + str(real_fpr_sender) + '\t' + str(0) + '\t' + str(a) + '\t' + str(0) + '\t' + str(0) + '\t' + str(z) + '\t' + str(0) + '\t' + str(observed_false_positives) + '\t' + str(boolean and flag) + '\t' + str(False) + '\t' + str(graphene) + '\t' + str(first_IBLT) + '\t' + str(first_BF) + '\t' + str(0) + '\t' + str(0) + '\t' + str(extra) + '\t' + str(iblt_rows_first) + '\t' + str(0) + '\t' + str(compact) + '\t' + str(0) + '\t' + str(0) + '\n') else: fpr_receiver = fpr_r bloom_receiver = BloomFilter(z, fpr_receiver) for txn in Z: bloom_receiver.add(txn) # Sender determines IBLT size from_sender = [] for txn in blk: if txn not in bloom_receiver: from_sender.append(txn) T.insert(txn, 0x0) h = len( from_sender) # sender sends these over to receiver #z is the count of txns that pass through bloom filter S x_star = params.search_x_star(z=blk_size - h, mempool_size=blk_size, fpr=fpr_receiver, bound, blk_size) temp = (blk_size - x_star) * fpr_receiver y_star = params.CB_bound(temp, fpr_receiver, bound) y_star = ceil(y_star) b, fpr_sender_second, iblt_rows_second = params.solve_a( m=blk_size, 
n=x_star, x=x_star, y=y_star) bloom_sender_second = BloomFilter( blk_size - h, fpr_sender_second) iblt_sender_second = PYBLT(b + y_star, TXN_SHORT_BYTES) for txn in blk: iblt_sender_second.insert(txn, 0x0) if txn not in from_sender: bloom_sender_second.add(txn) # Receiver determines IBLT size count = 0 for txn in Z: if txn in bloom_sender_second: from_sender.append(txn) T.insert(txn, 0x0) count = count + 1 iblt_receiver_second = PYBLT(b + y_star, TXN_SHORT_BYTES) # Size of IBLT # if b+(blk_size-h-x_star)-1 >= len(params.params): # difference too much # tmp = b+(blk_size-h-x_star) * 1.362549 # rows = ceil(tmp) # iblt_rows_second = rows * 12 # else: # rows = params.params[b+(blk_size-h-x_star)-1][3] # iblt_rows_second = rows * 12 for txn in from_sender: iblt_receiver_second.insert(txn, 0x0) # Eppstein subtraction T_second = iblt_receiver_second.subtract( iblt_sender_second) boolean, result = T_second.list_entries() #print(boolean) #print('Z', z) # Check whether blk was reconstructed properly flag, in_blk = decode_blk(result, from_sender, blk) final = False if boolean == False or flag == False: final, in_blk, not_in_blk = try_ping_pong( T, T_second, set(), set()) #print('Ping pong result', final) if final == True: possibly_in_blk = set(from_sender) possibly_in_blk.difference_update(not_in_blk) reconstructed_blk = list( in_blk.union(possibly_in_blk)) assert set(reconstructed_blk) == set(blk) # Each component of graphene blk size first_IBLT = (iblt_rows_first * TAU) first_BF = (bloom_sender.num_bits / 8.0) second_IBLT = (iblt_rows_second * TAU) second_BF = (bloom_receiver.num_bits / 8.0) third_BF = (bloom_sender_second.num_bits / 8.0) extra = (len(in_blk) * TXN_SHORT_BYTES) # Compute size of Graphene block graphene = first_IBLT + first_BF + second_IBLT + second_BF + third_BF + extra fd.write( str(true_false_positives) + '\t' + str(blk_size) + '\t' + str(bound) + '\t' + str(fraction) + '\t' + str(mempool_size) + '\t' + str(fpr_sender) + '\t' + str(real_fpr_sender) + '\t' + str(fpr_receiver) + '\t' + str(a) + '\t' + str(b) + '\t' + str(x_star) + '\t' + str(z) + '\t' + str(count) + '\t' + str(observed_false_positives) + '\t' + str(boolean and flag) + '\t' + str(final) + '\t' + str(graphene) + '\t' + str(first_IBLT) + '\t' + str(first_BF) + '\t' + str(second_IBLT) + '\t' + str(second_BF) + '\t' + str(extra) + '\t' + str(iblt_rows_first) + '\t' + str(iblt_rows_second) + '\t' + str(compact) + '\t' + str(third_BF) + '\t' + str(fpr_sender_second) + '\n') fd.flush()
import os

from pybloom_live import BloomFilter
# You can avoid downloading the huge VC runtime by getting the wheel from
# https://www.lfd.uci.edu/~gohlke/pythonlibs/
'''
animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
           'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
           'chicken', 'dolphin', 'donkey', 'crow', 'crocodile', 'testadd']
'''

is_exist = os.path.exists('test.blm')  # check whether the bloom file exists

# If it exists, load the filter from the file.
if is_exist:
    bf = BloomFilter.fromfile(open('test.blm', 'rb'))
# Otherwise create a new BloomFilter object and save it to file at the end.
else:
    bf = BloomFilter(20000, 0.001)

for i in range(10):
    if i in bf:
        print('pass')
        pass
    else:
        print('add %s' % i)
        bf.add(i)

n = open('test.blm', 'wb')
bf.tofile(n)
n.close()

for i in range(20):
    if i in bf:
        print("written")
class BasicBloomCola(WriteOptimizedDS): """ this augments the basic cola data structure with bloom filters at each subarray level and a larger bloom filter that checks for existence across all levels""" def __init__(self, disk_filepath, block_size, n_blocks, n_input_data, growth_factor=2, pointer_density=0.1): super(BasicBloomCola, self).__init__(disk_filepath, block_size, n_blocks, n_input_data) self.g = int(growth_factor) self.bloom_filter = BloomFilter(capacity=self.n_input_data, error_rate=ERROR_RATE) # compute the number of levels needed to store all input data self.n_levels = math.ceil(math.log(self.n_input_data, self.g)) + 1 self.level_sizes = np.array([self.g**i for i in range(self.n_levels)], dtype=int) self.level_n_items = np.zeros(self.n_levels, dtype=int) self.disk_size = np.sum(self.level_sizes) + self.block_size self.level_start_idxs = np.zeros(self.n_levels, dtype=int) for i in range(1, self.n_levels ): # preform prefix sum to get start idxs for the level self.level_start_idxs[i] = self.level_start_idxs[ i - 1] + self.level_sizes[i - 1] # create storage file. if os.path.exists(disk_filepath): os.remove(disk_filepath) else: dirname = os.path.dirname(disk_filepath) if not os.path.exists(dirname): os.makedirs(dirname) disk = h5py.File(self.disk_filepath, 'w') disk.create_dataset('dataset', shape=(self.disk_size, ), dtype=int) disk.close() self.disk = h5py.File(self.disk_filepath, 'r+') self.data = self.disk['dataset'] self.n_items = 0 self.final_insert_level = 0 def insert(self, item): insert_data = [item] self.n_items += 1 n_inserts = 1 self.bloom_filter.add(item) # perform the downward merge for i in range(self.n_levels): level_start_idx = self.level_start_idxs[i] level_n_items = self.level_n_items[i] level_size = self.level_sizes[i] level_end_idx = level_start_idx + level_n_items level_data = self.data[level_start_idx:level_end_idx] merge_size = n_inserts + level_n_items merged_data = np.zeros(shape=merge_size, dtype=int) # perform the merge here. merged_i, insert_i, level_i = 0, 0, 0 while level_i < level_n_items and insert_i < n_inserts: if level_data[level_i] <= insert_data[ insert_i]: # insert level items merged_data[merged_i] = level_data[level_i] level_i += 1 else: merged_data[merged_i] = insert_data[insert_i] insert_i += 1 merged_i += 1 if insert_i < n_inserts: assert level_i == level_n_items merged_data[merged_i:] = insert_data[insert_i:] elif level_i < level_n_items: merged_data[merged_i:] = level_data[level_i:] if merge_size > level_size: # it will be full self.level_n_items[i] = 0 insert_data = copy.deepcopy(merged_data) n_inserts = len(insert_data) else: self.level_n_items[i] = merge_size level_end_idx = level_start_idx + merge_size self.data[level_start_idx:level_end_idx] = merged_data # update for queries self.final_insert_level = max(self.final_insert_level, i) break def query(self, item): idx = self._search(item) return idx > INVAlID_SEARCH_IDX def _search(self, item): if item not in self.bloom_filter: # check bloom filter first. return INVAlID_SEARCH_IDX n_search_levels = self.final_insert_level + 1 for i in range(n_search_levels): level_n_item = self.level_n_items[i] if level_n_item == 0: continue # no items to search level_start_idx = self.level_start_idxs[i] level_end_idx = level_start_idx + level_n_item search_data = self.data[level_start_idx:level_end_idx] idx = bs.search(search_data, item) if idx < len(search_data) and search_data[idx] == item: return level_start_idx + idx return INVAlID_SEARCH_IDX
class A360Spider(scrapy.Spider): name = '360' allowed_domains = ['zhushou.360.cn'] start_urls = [ 'http://zhushou.360.cn/list/index/cid/1/', 'http://zhushou.360.cn/list/index/cid/2/' ] base_url = 'http://zhushou.360.cn' custom_settings = { "CONCURRENT_REQUESTS": 3 } def __init__(self, checkpoint=None, *a, **kw): super(A360Spider, self).__init__(*a, **kw) self.bf = BloomFilter(capacity=10000000) self.apkbf = BloomFilter(capacity=100000000) self.checkpoint = checkpoint if not checkpoint == None: fd = open(checkpoint, 'r') while (True): line = fd.readline() if not line: break line = line.strip() self.apkbf.add(line) fd.close() def start_requests(self): for url in self.start_urls: self.bf.add(url) yield Request( url=url, headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, dont_filter=True) def parse(self, response): soup = bs4.BeautifulSoup(response.text, "html.parser") categorypattern = re.compile(ur'cid/[0-9]+/$') for aitem in soup.find_all('a'): if not aitem.has_attr('href'): continue href = self.base_url + aitem['href'] if categorypattern.search(href) == None: continue if href in self.bf: continue self.bf.add(href) yield Request( url=href, headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta={ "category_url": href, 'category': aitem.get_text() }, callback=self.parse_category) def parse_category(self, response): soup = bs4.BeautifulSoup(response.text, 'html.parser') appinfo = soup.select('div .icon_box')[0] pagepattern = re.compile(ur'pageCount\s*=\s*[0-9]+') numpattern = re.compile(ur'[0-9]+') for aitem in appinfo.find_all('a'): if not aitem.has_attr('href'): continue href = self.base_url + aitem['href'] if href.find('detail') == -1: continue if href in self.bf: continue self.bf.add(href) yield Request( url=href, headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta={'category': response.meta['category']}, callback=self.parse_detail) pageinfo = soup.select('script')[7] pagenum = numpattern.search(pagepattern.search( pageinfo.text).group()).group() print(response.url) print(pagenum) for np in range(2, int(pagenum)): yield Request( url=response.meta['category_url'] + '?page=%d' % np, headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta={ "category_url": response.meta['category_url'], 'category': response.meta['category'] }, callback=self.parse_category) def parse_detail(self, response): urlpattern = re.compile(ur'url=.*') apkidpattern = re.compile(ur'soft_id/[0-9]+') numpattern = re.compile(ur'[0-9]+') packagenamepattern = re.compile(ur'/[^/]*\.apk') soup = bs4.BeautifulSoup(response.text, 'html.parser') print(response.url) commonname = soup.select('#app-name')[0].get_text() size = soup.select('.s-3')[1].get_text() urllink = urlpattern.search( soup.select('.js-downLog.dbtn')[0]['href']).group()[4:] packagename = packagenamepattern.search(urllink).group()[1:-4] apkid = numpattern.search(apkidpattern.search( response.url).group()).group() metainfo = soup.select('.base-info')[0] metainfo = metainfo.select('td') developer = metainfo[0].get_text() developer = developer[developer.find(u':') + 1:] version = metainfo[2].get_text() version = version[version.find(u':') + 1:] updatetime = metainfo[1].get_text() updatetime = updatetime[updatetime.find(u':') + 1:] permissionlist = list() permission = soup.select('#authority-panel')[0].select( 'p')[0].get_text().split('\n') category 
= response.meta['category'] for perm in permission: if perm.strip().startswith(u'-'): permissionlist.append(perm.strip()) if apkid in self.apkbf: return self.apkbf.add(apkid) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('commonname', commonname) item.add_value('apkplaform', self.name) item.add_value('apkid_specifiedbyplaform', apkid) item.add_value('category', category) item.add_value('developer', developer) item.add_value('packagename', packagename) item.add_value('updatetime', updatetime) item.add_value('size', size) item.add_value('version', version) item.add_value('permission', permissionlist) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def __init__(self, key_queue, save_queue):
    super(Spider, self).__init__()
    self.key_queue = key_queue
    self.save_queue = save_queue
    # Filter that prevents re-crawling duplicate keywords.
    self.boom = BloomFilter(capacity=1e7, error_rate=0.001)
class MeizuSpider(scrapy.Spider): name = 'meizu' allowed_domains = ['app.meizu.com', 'app.flyme.cn'] start_urls = [ 'http://app.meizu.com/', 'http://app.flyme.cn/games/public/index' ] custom_settings = { "CONCURRENT_REQUESTS": 3 } download_url = 'http://app.flyme.cn/%s/public/download.json?app_id=%d' def __init__(self, checkpoint=None, *a, **kw): super(MeizuSpider, self).__init__(*a, **kw) self.bf = BloomFilter(capacity=10000000) self.checkpoint = checkpoint self.apkbf = BloomFilter(capacity=100000000) if not checkpoint == None: fd = open(checkpoint, 'r') while True: line = fd.readline() if not line: break line = line.strip() self.apkbf.add(line) fd.close() def start_requests(self): for url in self.start_urls: metainfo = { 'type': 'apps' } if not url.find('games') == -1: metainfo = { 'type': 'games' } yield Request( url, headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta=metainfo, dont_filter=True) def parse(self, response): soup = bs4.BeautifulSoup(response.text, "html.parser") category_url = 'http://app.flyme.cn/%s/public/category/%d/all/feed/index/0/18' categorylist = soup.select("#categoryList") if not len(categorylist) == 1: return categorylist = categorylist[0] dataparam = categorylist.select("li") for dp in dataparam: if dp.has_attr('data-param'): yield Request( url=category_url % (response.meta['type'], int(dp['data-param'])), headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta=response.meta, callback=self.parse_category) def parse_category(self, response): soup = bs4.BeautifulSoup(response.text, "html.parser") applist = soup.select('#app') base_url = 'http://app.flyme.cn' if len(applist) == 0: return applist = applist[0].find_all('a') for app in applist: if not app.has_attr('href'): continue if base_url + app['href'] in self.bf: continue self.bf.add(base_url + app['href']) yield Request( url=base_url + app['href'], headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta=response.meta, callback=self.parse_detail) def parse_detail(self, response): soup = bs4.BeautifulSoup(response.text, "html.parser") metedata = soup.select('div.left.inside_left')[0] platform = self.name category = metedata.select('.current')[0]['title'] packagenamepattern = re.compile(ur'package_name=.*') packagename = packagenamepattern.search(response.url).group()[13:] urllink = response.url app_titles = metedata.find_all("span", class_="app_title") app_content = metedata.find_all('div', class_='app_content') size = app_content[5].get_text().strip() version = app_content[3].get_text().strip() updatetime = app_content[6].get_text().strip() developer = app_content[2].get_text().strip() commonname = soup.find_all('div', class_='app_detail')[0] commonname = commonname.find_all('div', class_='detail_top')[0] commonname = commonname.find_all('h3')[0].get_text() apkid = soup.select('.price_bg.downloading')[0]['data-appid'] yield Request( url=self.download_url % (response.meta['type'], int(apkid)), headers={ "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0" }, meta={ 'commonname': commonname, 'platform': platform, 'category': category, 'developer': developer, 'packagename': packagename, 'updatetime': updatetime, 'size': size, 'version': version, 'urllink': urllink }, callback=self.parse_download) def parse_download(self, response): json_response = json.loads(response.body_as_unicode()) if not 
json_response['code'] == 200: return urllink = json_response['value']['downloadUrl'] apkid = response.meta['packagename'] if apkid in self.apkbf: return self.apkbf.add(apkid) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('apkid_specifiedbyplaform', apkid) item.add_value('commonname', response.meta['commonname']) item.add_value('apkplaform', response.meta['platform']) item.add_value('category', response.meta['category']) item.add_value('developer', response.meta['developer']) item.add_value('packagename', response.meta['packagename']) item.add_value('updatetime', response.meta['updatetime']) item.add_value('size', response.meta['size']) item.add_value('version', response.meta['version']) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def make_filter():
    return BloomFilter(
        capacity=settings["MAX_POSTS"],
        error_rate=0.001
    )
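# A minimal usage sketch (not from the original source; the post ids below are
# hypothetical) showing how such a factory-made filter can deduplicate posts:
seen_posts = make_filter()


def is_new_post(post_id):
    """Return True the first time a post id is seen, False afterwards."""
    if post_id in seen_posts:
        return False
    seen_posts.add(post_id)
    return True


print(is_new_post("post-123"))  # True on first sight
print(is_new_post("post-123"))  # False on repeat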
class QqSpider(scrapy.Spider): name = 'qq' allowed_domains = ['sj.qq.com']; start_urls = ['https://sj.qq.com/myapp/index.htm']; custom_settings = {"CONCURRENT_REQUESTS": 3}; base_cate_url = "https://sj.qq.com/myapp/cate/appList.htm?orgame=%d&categoryId=%d&pageSize=20&pageContext=%d"; def __init__(self, checkpoint=None, *a, **kw): super(QqSpider,self).__init__(*a,**kw); self.step = 20; self.begin_step = 0; self.categorybf = BloomFilter(capacity = 100000000); self.checkpoint = checkpoint; self.apkbf = BloomFilter(capacity=100000000); if not checkpoint == None: fd = open(checkpoint,'r'); while(True): line = fd.readline(); if not line: break; line = line.strip(); self.apkbf.add(line); fd.close(); def start_requests(self): for url in self.start_urls: yield Request(url, headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"}, dont_filter = True ); def parse(self, response): soup = bs4.BeautifulSoup(response.text, "html.parser"); pattern = re.compile(ur'categoryId=-?[0-9]+'); idpattern = re.compile(ur'-?[0-9]+'); orgamepattern = re.compile(ur'orgame=[0-9]+'); orgameidpattern = re.compile(ur'[0-9]+'); for aitem in soup.find_all('a'): href = aitem['href']; if not href.find('categoryId') == -1 and href not in self.categorybf: self.categorybf.add(href); categoryid = pattern.search(href).group(); categoryid = idpattern.search(categoryid).group(); orgname = orgameidpattern.search(orgamepattern.search(href).group()).group(); url = self.base_cate_url%(int(orgname),int(categoryid),self.begin_step); #print(url); yield Request( url, headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"}, meta={'orgname':orgname}, callback=self.parse_json ); def parse_json(self, response): categorypattern = re.compile(ur'categoryId=-?[0-9]+'); pagecontext = re.compile(ur'pageContext=-?[0-9]+'); idpattern = re.compile(ur'-?[0-9]+'); catestring = categorypattern.search(response.url).group(); pagestring = pagecontext.search(response.url).group(); cateid = idpattern.search(catestring).group(); pageid = idpattern.search(pagestring).group(); json_response = json.loads(response.body_as_unicode()); count = 0; if json_response.has_key('count'): count = int(json_response['count']); else: return; print(response.url); print(count); if count <= 0: return; objs = ""; if json_response.has_key('obj'): objs = json_response['obj']; else: return; apkplaform = 'qq'; for obj in objs: if obj['apkUrl'] in self.categorybf: continue; if obj['appId'] in self.apkbf: continue; self.apkbf.add(obj['appId']); self.categorybf.add(obj['apkUrl']); print(obj); item = ItemLoader(item=ApkspiderItem(), response=response); item.add_value("commonname",obj['appName']); item.add_value('apkplaform',apkplaform); item.add_value('apkid_specifiedbyplaform',str(obj['appId'])); item.add_value('category',obj['categoryName']); item.add_value('developer',obj['authorName']); item.add_value('packagename',obj['pkgName']); item.add_value('updatetime',obj['apkPublishTime']); item.add_value('version',obj['versionName']); item.add_value('urllink',obj['apkUrl']); item.add_value('file_urls',obj['apkUrl']); item.add_value('checkpoint',self.checkpoint); yield item.load_item(); url = self.base_cate_url%(int(response.meta['orgname']),int(cateid),int(pageid)+self.step); yield Request( url, headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"}, meta={'orgname':response.meta['orgname']}, callback=self.parse_json );
def mapper_init(self):
    with open(os.path.join(basedir, 'resources/hot_displayname.bf'), 'rb') as f:
        self.filter = BloomFilter.fromfile(f)
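# A minimal sketch (assumptions: the job receives the same XML <row> lines that
# were used to train the filter, and a row_to_dict() helper like the one in the
# training script is available) of how the loaded filter might be used. A Bloom
# filter never gives false negatives, so a displayname reported as absent is
# guaranteed not to be "hot" and can be emitted safely.
def mapper(self, _, line):
    user = row_to_dict(line)
    if user['displayname'] not in self.filter:
        yield user['displayname'], 1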
def str_filesize(size):
    d = [(1024 - 1, 'K'), (1024**2 - 1, 'M'), (1024**3 - 1, 'G'),
         (1024**4 - 1, 'T')]
    s = [x[0] for x in d]
    index = bisect.bisect_left(s, size) - 1
    if index == -1:
        return str(size)
    else:
        b, u = d[index]
        return str(size / (b + 1)) + u


# Fixed-capacity filter.
bf = BloomFilter(capacity=1000)
# Adding beyond the capacity raises an error.
for i in range(0, 101):
    try:
        bf.add("zjl-{}".format(i))
    except Exception as e:
        print(i, e)

# address, size, endianness, unused, allocated
# Some internals of the Bloom filter's bit array.
address, size, endianness, unused, allocated = bf.bitarray.buffer_info()
print(address, size, endianness, unused, allocated, str_filesize(size))
def _open_bloom(infile):
    nb = open(infile, "rb")
    return BloomFilter.fromfile(nb)
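# A small round-trip sketch (the file name is illustrative) pairing _open_bloom
# with BloomFilter.tofile: the filter gives the same membership answers after
# being reloaded from disk.
from pybloom_live import BloomFilter

bf = BloomFilter(capacity=1000, error_rate=0.001)
bf.add("example-key")
with open("example.blm", "wb") as out:
    bf.tofile(out)

restored = _open_bloom("example.blm")
print("example-key" in restored)   # True
print("missing-key" in restored)   # almost certainly False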
def trial(fd): params = search_params() for blk_size in BLK_SIZE: true_false_positives = blk_size for bound in B: for fraction in FRACTION: # True_positives is the number of txns in the blk the receiver has true_positives = int(blk_size * fraction) mempool_size = true_false_positives + true_positives print( 'Running %d trials for parameter combination: extra txns in mempool %d blk size %d fraction %f' % (NUM_TRIAL, true_false_positives, blk_size, fraction)) # Size of Compact block (inv + getdata) getdata = (1 - fraction) * blk_size * TXN_SHORT_BYTES_CB inv = blk_size * TXN_SHORT_BYTES_CB compact = inv + getdata for i in range(NUM_TRIAL): blk, receiver_mempool = create_mempools( mempool_size, fraction, blk_size, true_false_positives) # Sender creates BF of blk a, fpr_sender, iblt_rows_first = params.solve_a( mempool_size, blk_size, blk_size, 0) bloom_sender = BloomFilter(blk_size, fpr_sender) tmp = blk_size + 0.5 exponent = (-bloom_sender.num_slices * tmp) / (bloom_sender.num_bits - 1) real_fpr_sender = (1 - exp(exponent))**bloom_sender.num_slices #exponent = (-bloom_sender.num_slices*blk_size) / bloom_sender.num_bits #tmp = (1-exp(exponent)) ** bloom_sender.num_slices #real_fpr_sender = max(tmp, fpr_sender) #assert real_fpr_sender >= fpr_sender # Sender creates IBLT of blk iblt_sender_first = PYBLT(a, TXN_SHORT_BYTES) # Add to BF and IBLT for txn in blk: bloom_sender.add(txn) iblt_sender_first.insert(txn, 0x0) # Receiver computes how many items pass through BF of sender and creates IBLT iblt_receiver_first = PYBLT(a, TXN_SHORT_BYTES) Z = [] for txn in receiver_mempool: if txn in bloom_sender: Z.append(txn) iblt_receiver_first.insert(txn, 0x0) #(id and content) z = len(Z) observed_false_positives = z - true_positives # Eppstein subtraction T = iblt_receiver_first.subtract(iblt_sender_first) boolean, result = T.list_entries() #assert boolean == False # Check whether decoding successful if boolean == True: flag, in_blk = decode_blk(result, Z, blk) # Each component of graphene blk size first_IBLT = (iblt_rows_first * TAU) first_BF = (bloom_sender.num_bits / 8.0) extra = (len(in_blk) * TXN_SHORT_BYTES) # Compute size of Graphene block graphene = first_IBLT + first_BF + extra fd.write( str(true_false_positives) + '\t' + str(blk_size) + '\t' + str(bound) + '\t' + str(fraction) + '\t' + str(mempool_size) + '\t' + str(fpr_sender) + '\t' + str(real_fpr_sender) + '\t' + str(0) + '\t' + str(a) + '\t' + str(0) + '\t' + str(0) + '\t' + str(0) + '\t' + str(z) + '\t' + str(0) + '\t' + str(observed_false_positives) + '\t' + str(boolean and flag) + '\t' + str(False) + '\t' + str(graphene) + '\t' + str(first_IBLT) + '\t' + str(first_BF) + '\t' + str(0) + '\t' + str(0) + '\t' + str(extra) + '\t' + str(iblt_rows_first) + '\t' + str(0) + '\t' + str(compact) + '\n') else: # Receiver creates BF of txns that passed through sender's BF # print('z', z) # print('bound', bound) x_star = params.search_x_star(z, mempool_size, real_fpr_sender, bound, blk_size) temp = (mempool_size - x_star) * real_fpr_sender y_star = params.CB_bound(temp, real_fpr_sender, bound) #print('y_star', y_star) y_star = ceil(y_star) b, fpr_receiver, iblt_rows_second = params.solve_a( blk_size, z, x_star, y_star) bloom_receiver = BloomFilter(z, fpr_receiver) for txn in Z: bloom_receiver.add(txn) # Receiver determines IBLT size iblt_sender_second = PYBLT(b + y_star, TXN_SHORT_BYTES) # Sender creates IBLT of blk again and sends txns that do not pass through BF of receiver count = 0 for txn in blk: iblt_sender_second.insert(txn, 0x0) if txn not in 
bloom_receiver: T.insert( txn, 0x0 ) # add txns just received to subtracted IBLT Z = Z + [txn] # sends the txn to the receiver count = count + 1 iblt_receiver_second = PYBLT(b + y_star, TXN_SHORT_BYTES) for txn in Z: iblt_receiver_second.insert(txn, 0x0) # Eppstein subtraction T_second = iblt_receiver_second.subtract( iblt_sender_second) boolean, result = T_second.list_entries() #print(boolean) #print('Z', z) # Check whether blk was reconstructed properly flag, in_blk = decode_blk(result, Z, blk) final = False if boolean == False or flag == False: final, in_blk, not_in_blk = try_ping_pong( T, T_second, set(), set()) #print('Ping pong result', final) if final == True: possibly_in_blk = set(Z) possibly_in_blk.difference_update(not_in_blk) reconstructed_blk = list( in_blk.union(possibly_in_blk)) assert set(reconstructed_blk) == set(blk) # Each component of graphene blk size first_IBLT = (iblt_rows_first * TAU) first_BF = (bloom_sender.num_bits / 8.0) second_IBLT = (iblt_rows_second * TAU) second_BF = (bloom_receiver.num_bits / 8.0) extra = (len(in_blk) * TXN_SHORT_BYTES) # Compute size of Graphene block graphene = first_IBLT + first_BF + second_IBLT + second_BF + extra fd.write( str(true_false_positives) + '\t' + str(blk_size) + '\t' + str(bound) + '\t' + str(fraction) + '\t' + str(mempool_size) + '\t' + str(fpr_sender) + '\t' + str(real_fpr_sender) + '\t' + str(fpr_receiver) + '\t' + str(a) + '\t' + str(b) + '\t' + str(x_star) + '\t' + str(y_star) + '\t' + str(z) + '\t' + str(count) + '\t' + str(observed_false_positives) + '\t' + str(boolean and flag) + '\t' + str(final) + '\t' + str(graphene) + '\t' + str(first_IBLT) + '\t' + str(first_BF) + '\t' + str(second_IBLT) + '\t' + str(second_BF) + '\t' + str(extra) + '\t' + str(iblt_rows_first) + '\t' + str(iblt_rows_second) + '\t' + str(compact) + '\n') fd.flush()
if __name__ == '__main__':
    # Build one Bloom filter per key file of the new cluster.
    logging.info(
        '===================================begin new compare task==================================='
    )
    logging.info('base new cluster key files begin build filter')
    buildFilterThreads = []
    filterKeys = []
    filterLens = []
    for filename in os.listdir(newClusterPath):
        if not filename.endswith('csv'):
            continue
        filterKey = BloomFilter(capacity=1000 * 10000, error_rate=0.00001)
        filterLen = BloomFilter(capacity=1000 * 10000, error_rate=0.00001)
        filterKeys.append(filterKey)
        filterLens.append(filterLen)
        logging.info('base %s build BloomFilter cost memory %dM' %
                     (filename, len(filterKey.bitarray * 2) / 8 / 1024 / 1024))
        t = threading.Thread(target=buildFilter,
                             args=(filterKeys[-1], filterLens[-1], filename,
                                   newClusterPath))
        buildFilterThreads.append(t)
        t.start()
        # break
    for a in buildFilterThreads:
        a.join()
    logging.info('base new cluster key files end build filter')
# Assumed imports for this crawler module (import paths are assumptions where the
# original file did not show them); the Bloom filter must support tofile()/fromfile(),
# which pybloom_live's BloomFilter provides.
import configparser
import json
import logging
import re
import time
from urllib import parse
from urllib.parse import urljoin

import cchardet
import jsonpath
import lxml.etree
import pymongo
import redis
import requests
from pybloom_live import BloomFilter


class Main():
    def __init__(self):
        self.taskCode = ""
        # Read the config file
        configPath = "config.ini"
        WebConfig = configparser.ConfigParser()
        WebConfig.read(configPath, encoding='utf-8-sig')
        self.redisHost = WebConfig.get("redis", "host")
        self.redisPort = WebConfig.get("redis", "port")
        self.redisPassword = WebConfig.get("redis", "password")
        self.redisDb = WebConfig.get("redis", "database")
        self.redis_platform_address = WebConfig.get("redis", "redis_platform_address")
        self.url_key_name = self.redis_platform_address + ":url:" + self.taskCode
        self.redis = redis.Redis(host=self.redisHost, port=self.redisPort,
                                 decode_responses=True, password=self.redisPassword,
                                 db=self.redisDb)
        mongoHost = WebConfig.get("mongodb", "host")
        mongoPort = WebConfig.get("mongodb", "port")
        mongoUser = WebConfig.get("mongodb", "user")
        mongoPassword = WebConfig.get("mongodb", "password")
        mongourl = "mongodb://" + mongoUser + ":" + mongoPassword + "@" + mongoHost + ":" + mongoPort
        conn = pymongo.MongoClient(mongourl)
        mongoDatabase = WebConfig.get("mongodb", "database")  # Mongo database name
        self.myMongo = conn[mongoDatabase]  # database handle
        self.bloom = None
        self.webType = ""
        self.executionType = ""
        # Pagination settings
        self.start_url = ""
        self.second_page_value = ""
        self.page_interval = ""
        self.end_page_value = ""
        self.url_type = ""
        self.lineListXpath = ""
        self.json_page_re = ""
        self.page_xpath = ""  # extra fields to extract from the listing page, if any
        # Page element XPaths
        self.titleXpath = ""
        self.contentXpath = ""
        self.proxy = None
        self.proxy_url = None
        self.headers = {
            'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                           'Windows NT 6.1; Win64; x64; Trident/5.0)'),
        }  # default request headers
        self.timeout = 10
        self.timeInterval = 0  # delay between requests (seconds)
        self.post_data = ""
        self.page_num_str = ""

    # Load the Bloom filter from the database
    def bloom_readfrom_db(self):
        tempFile = open("tempFile", "wb")
        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})
        if bloom_dict:  # a saved Bloom filter exists, load it
            bloomData = bloom_dict["bloom_data"]
            tempFile.write(bloomData)
            tempFile.close()
            bloomFile = open("tempFile", "rb")
            self.bloom = BloomFilter.fromfile(bloomFile)
        else:
            self.bloom = BloomFilter(capacity=1000000, error_rate=0.00001)

    def get_proxy(self):
        ps = requests.get(self.proxy_url).text
        return ps

    # Persist the Bloom filter to the database
    def bloom_writeto_db(self):
        bloomDbKeyName = self.redis_platform_address + ":bloom:" + self.taskCode
        tempFile_del = open("tempFile", "wb")
        self.bloom.tofile(tempFile_del)  # dump the Bloom filter to a file
        tempFile_del.close()
        bloomFile = open("tempFile", "rb")  # reopen the dump for reading
        bloomData = bloomFile.read()
        insert_data = {"_id": self.taskCode, "bloom_data": bloomData}
        bloom_dict = self.myMongo["bloom"].find_one({"_id": self.taskCode})
        if bloom_dict:  # update the existing Bloom filter record
            self.myMongo["bloom"].update_one({"_id": self.taskCode},
                                             {"$set": {"bloom_data": bloomData}})
        else:
            self.myMongo["bloom"].insert_one(insert_data)
        bloomFile.close()
        logging.info("Bloom filter saved to database " + bloomDbKeyName)

    # Build all listing-page URLs
    def get_PageUrlList(self):
        """Build the pagination URLs."""
        urlList = []
        for i in range(int(self.second_page_value), int(self.end_page_value)):
            page_num = str(i)
            page_url = self.url_type.replace("%d", page_num)
            urlList.append(page_url)
        urlList.append(self.start_url)
        return urlList

    # Download a page by URL
    def download(self, url):
        try:
            if self.proxy:
                proxy = self.get_proxy().strip()
                proxies = {'https': proxy}  # fetch a proxy
                response = requests.get(url, proxies=proxies, timeout=self.timeout,
                                        headers=self.headers, verify=False)
                logging.info(url)
                logging.info("using proxy")
            else:
                response = requests.get(url, timeout=self.timeout,
                                        headers=self.headers, verify=False)
            statusCode = response.status_code
            codeStyle = cchardet.detect(response.content)["encoding"]
            if not codeStyle:
                codeStyle = "utf-8"
            webData = response.content.decode(codeStyle, errors="ignore")
            return (webData, statusCode)
        except Exception as e:
            print(e)
            return (0, 0)

    def change_outqueue_num(self):
        keyName = self.redis_platform_address + ":status:" + self.taskCode  # task status key
        status_data = self.redis.get(keyName)  # fetch the status data
        print("-------------------------", self.taskCode)
        taskData = json.loads(status_data)
        taskData["outQueue"] = 1  # update the JSON data
        keyname_data = json.dumps(taskData)  # serialize back to a string
        self.redis.set(keyName, keyname_data)  # write back to redis

    # Refresh all attributes needed for the current task
    def update_attr(self):
        keyName = self.redis_platform_address + ":status:" + self.taskCode  # task status key
        status_data = self.redis.get(keyName)  # fetch the status data
        print("-------------------------", self.taskCode)
        taskData = json.loads(status_data)
        self.executionType = int(taskData["executionType"])
        self.taskCode = taskData["taskCode"]
        self.timeInterval = taskData["timeInterval"]
        self.url_key_name = self.redis_platform_address + ":url:" + self.taskCode
        # Download settings
        if "proxy" in taskData:
            self.proxy = taskData["proxy"]
        else:
            self.proxy = ""
        if "proxyProductValue" in taskData:
            self.proxy_url = taskData["proxyProductValue"]
        else:
            self.proxy_url = ""
        if "timeout" in taskData:
            self.timeout = taskData["timeout"]
        else:
            self.timeout = 10
        temp_data = json.loads(taskData["templateInfo"])  # template data
        print(temp_data)
        try:
            self.webType = temp_data["web_type"]
        except KeyError:
            self.webType = temp_data["webType"]
        # Pagination settings
        self.start_url = temp_data["start_url"]
        self.second_page_value = int(temp_data["second_page_value"])
        if "page_interval" in temp_data:
            self.page_interval = int(temp_data["page_interval"])
        else:
            self.page_interval = 1
        self.end_page_value = int(temp_data["end_page_value"])
        self.url_type = temp_data["url_type"]
        try:
            self.lineListXpath = temp_data["line_list_xpath"]
        except KeyError:
            self.lineListXpath = temp_data["lineListXpath"]
        if "headers" in temp_data:
            self.headers = temp_data["headers"]
        else:
            self.headers = {
                'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                               'Windows NT 6.1; Win64; x64; Trident/5.0)'),
            }  # default request headers
        if "json_page_re" in temp_data:
            self.json_page_re = temp_data["json_page_re"]
        else:
            self.json_page_re = ""
        if "post" in temp_data:
            self.post_data = temp_data["post"]
        else:
            self.post_data = None
        if "page_num_str" in temp_data:
            self.page_num_str = temp_data["page_num_str"]
        else:
            self.page_num_str = ""
        if "page_xpath" in temp_data:
            self.page_xpath = temp_data["page_xpath"]
        else:
            self.page_xpath = ""

    # Process one row of an HTML listing page
    def deal_html_page_data(self, base_url, line, swtich=False):
        if self.page_xpath:
            one_data_dict = {}
            for key, keyxpath in self.page_xpath.items():
                if key == "url_xpath" or key == "url":
                    content_url = line.xpath(keyxpath)
                    if content_url:
                        endUrl = urljoin(base_url, content_url[0])
                        one_data_dict["url"] = endUrl
                        continue
                    else:  # no URL extracted from this row
                        swtich = True
                keystr = line.xpath(keyxpath)
                keystr = "".join(keystr)
                if keystr == "images" or keystr == "images_xpath":  # resolve image links
                    keystr = urljoin(base_url, keystr)
                one_data_dict[key] = keystr
            end_data = json.dumps(one_data_dict)  # serialize the dict to a string
        else:
            end_data = urljoin(base_url, line)
        return end_data, swtich

    # Process one row of a JSON listing page
    def deal_json_page_data(self, base_url, line, swtich=False):
        if self.page_xpath:
            one_data_dict = {}
            swtich = False
            for key, keyxpath in self.page_xpath.items():
                if key == "url_xpath" or key == "url":
                    content_url = jsonpath.jsonpath(line, keyxpath)
                    if content_url:
                        endUrl = urljoin(base_url, content_url[0])
                        one_data_dict["url"] = endUrl
                        continue
                    else:  # no URL extracted from this row
                        swtich = True
                keystr = jsonpath.jsonpath(line, keyxpath)
                keystr = " ".join(keystr)
                one_data_dict[key] = keystr
            end_data = json.dumps(one_data_dict)  # serialize the dict to a string
        else:
            end_data = urljoin(base_url, line)
        return end_data, swtich

    def judge_url_in_bloom(self, judge_data):
        """Check whether the URL (or the URL inside the JSON dict) is already in the
        Bloom filter; if not, add it and push the data onto the redis queue."""
        if judge_data.startswith("{"):
            judge_data_json = json.loads(judge_data)
            insert_url = judge_data_json["url"]
            if insert_url in self.bloom:
                return True
            else:
                self.bloom.add(insert_url)
                print(judge_data)
                self.redis.lpush(self.url_key_name, judge_data)
                return False
        else:
            if judge_data in self.bloom:
                return True
            else:
                self.bloom.add(judge_data)
                print(judge_data)
                self.redis.lpush(self.url_key_name, judge_data)
                return False

    # Get all article links (or link dicts) from the page at `url`
    def get_content_url_list(self, url):
        """Fetch a static (HTML) listing page."""
        endUrlList = []
        response = self.download(url)
        if response[1] == 200:
            ps = response[0]
            mytree = lxml.etree.HTML(ps)
            linelist = mytree.xpath(self.lineListXpath)
            for line in linelist:
                dealed_page_data, swtich = self.deal_html_page_data(url, line)
                if dealed_page_data and not swtich:
                    # swtich flags rows on the listing page that yielded no link
                    endUrlList.append(dealed_page_data)
        return endUrlList

    # Get all links (and other fields) from a JSON listing page
    def get_json_content_url_list(self, url):
        """Fetch a dynamic (JSON) listing page."""
        end_data_list = []
        response = self.download(url)
        if response[1] == 200:
            ps = response[0]
            ps = ps.replace("\n", "")
            if self.json_page_re:
                ps = re.compile(self.json_page_re).findall(ps)
                if ps:
                    ps = ps[0]
                else:
                    logging.info(url + "--------- json_page_re matched nothing for this url")
                    return end_data_list  # return an empty list instead of None
            myjson = json.loads(ps)
            linelist = jsonpath.jsonpath(myjson, self.lineListXpath)
            for line in linelist:
                one_data_dict, swtich = self.deal_json_page_data(url, line)
                if swtich:
                    continue
                end_data_list.append(one_data_dict)
        return end_data_list

    # POST-related helpers
    # Download a page via POST with the given form data
    def post_download(self, url, data):
        try:
            if self.proxy == "1":
                proxy = self.get_proxy().strip()
                proxies = {'https': proxy}  # fetch a proxy
                response = requests.post(url, proxies=proxies, timeout=self.timeout,
                                         headers=self.headers, data=data)
                logging.info(url)
                logging.info("using proxy")
            else:
                response = requests.post(url, timeout=self.timeout,
                                         headers=self.headers, data=data)
            statusCode = response.status_code
            codeStyle = cchardet.detect(response.content)["encoding"]
            if not codeStyle:
                codeStyle = "utf-8"
            webData = response.content.decode(codeStyle, errors="ignore")
            print(webData)
            return (webData, statusCode)
        except Exception as e:
            print(e)
            return (0, 0)

    def get_post_data_list(self):
        data_list = []
        for i in range(int(self.second_page_value), int(self.end_page_value),
                       int(self.page_interval)):
            current_page_data = self.post_data.copy()
            current_page_data[self.page_num_str] = str(i)
            data_list.append(current_page_data)
        return data_list

    def post_html(self, post_data_list):
        switch = False
        for post_data in post_data_list:
            time.sleep(self.timeInterval)
            response = self.post_download(self.start_url, post_data)
            if response[1] == 200:
                ps = response[0]
                mytree = lxml.etree.HTML(ps)
                linelist = mytree.xpath(self.lineListXpath)
                for line in linelist:
                    one_data_dict, swtich_url = self.deal_html_page_data(self.start_url, line)
                    if swtich_url:
                        continue
                    judge_answer = self.judge_url_in_bloom(one_data_dict)
                    if self.executionType != 1 and judge_answer:  # incremental crawl
                        switch = True
            if switch:  # the Bloom filter flagged a duplicate
                break

    def post_json(self, post_data_list):
        for post_data in post_data_list:
            swtich = False  # whether this page hit the Bloom filter
            time.sleep(self.timeInterval)
            response = self.post_download(self.start_url, post_data)
            if response[1] == 200:
                ps = response[0]
                myjson = json.loads(ps)
                linelist = jsonpath.jsonpath(myjson, self.lineListXpath)
                for line in linelist:  # per-row handling
                    one_data_dict, swtich_url = self.deal_json_page_data(self.start_url, line)
                    if swtich_url:  # this row has no URL, skip it
                        continue
                    judge_answer = self.judge_url_in_bloom(one_data_dict)
                    if self.executionType != 1 and judge_answer:  # incremental crawl
                        swtich = True
            if swtich:
                break

    def get_post_url_list(self):
        """For web_type == 4: the POST URL changes but the POST data stays fixed, e.g.
        http://www.nhsa.gov.cn/module/web/jpage/dataproxy.jsp?startrecord=%d&endrecord=%p&perpage=15
        """
        end_url_list = []
        for first_num in range(int(self.second_page_value), int(self.end_page_value),
                               int(self.page_interval)):
            second_num = first_num + int(self.page_interval) - 1
            if second_num > int(self.end_page_value):
                second_num = int(self.end_page_value)
            post_url = self.start_url.replace("%d", str(first_num)).replace("%p", str(second_num))
            end_url_list.append(post_url)
        return end_url_list

    def post_url_change(self):
        if self.page_xpath:
            switch = False
            url_list = self.get_post_url_list()
            for url in url_list:
                time.sleep(self.timeInterval)
                response = self.post_download(url, self.post_data)
                if response[1] == 200:
                    ps = response[0]
                    mytree = lxml.etree.HTML(ps)
                    linelist = mytree.xpath(self.lineListXpath)
                    for line in linelist:
                        one_data_dict = {}
                        swtich_url = False
                        for key, keyxpath in self.page_xpath.items():
                            if key == "url_xpath" or key == "url":
                                content_url = line.xpath(keyxpath)
                                if content_url:
                                    content_url = content_url[0]
                                    content_url = parse.unquote(content_url)
                                    endUrl = urljoin(self.start_url, content_url)
                                    one_data_dict["url"] = endUrl
                                    continue
                                else:  # no URL extracted from this row
                                    swtich_url = True
                            keystr = line.xpath(keyxpath)
                            keystr = "".join(keystr)
                            if keystr == "images" or keystr == "images_xpath":  # resolve image links
                                keystr = urljoin(self.start_url, keystr)
                            one_data_dict[key] = keystr
                        if swtich_url:
                            continue
                        bloom_url = one_data_dict["url"]
                        if self.executionType != 1:  # incremental crawl
                            if bloom_url in self.bloom:
                                logging.info(self.taskCode + " URL already in Bloom filter")
                                switch = True
                            else:
                                self.bloom.add(bloom_url)
                                one_data_dict = json.dumps(one_data_dict)  # serialize the dict
                                print(one_data_dict)
                                self.redis.lpush(self.url_key_name, one_data_dict)
                        else:
                            one_data_dict = json.dumps(one_data_dict)  # serialize the dict
                            print(one_data_dict)
                            self.redis.lpush(self.url_key_name, one_data_dict)
                if switch:  # the Bloom filter flagged a duplicate
                    break
        else:
            swtich = False
            url_list = self.get_post_url_list()
            for url in url_list:
                time.sleep(self.timeInterval)
                response = self.post_download(url, self.post_data)
                if response[1] == 200:
                    ps = response[0]
                    mytree = lxml.etree.HTML(ps)
                    linelist = mytree.xpath(self.lineListXpath)
                    for ii in linelist:
                        content_url = parse.unquote(ii)
                        endUrl = urljoin(self.start_url, content_url)
                        if self.executionType != 1:  # incremental crawl
                            if endUrl in self.bloom:
                                logging.info(self.taskCode + " URL already in Bloom filter")
                                swtich = True
                            else:
                                self.bloom.add(endUrl)
                                print(endUrl)
                                self.redis.lpush(self.url_key_name, endUrl)
                        else:
                            print(endUrl)
                            self.redis.lpush(self.url_key_name, endUrl)
                if swtich:
                    break
            # Leftover re-download pass; the fetched page is not used afterwards
            url_list = self.get_post_url_list()
            for url in url_list:
                response = self.post_download(url, self.post_data)
                if response[1] == 200:
                    ps = response[0]

    def post_start(self):
        """Dispatch POST crawls using post_data and page_num_str."""
        if self.webType == 2:  # POST + HTML
            post_data_list = self.get_post_data_list()  # build the POST payloads
            self.post_html(post_data_list)
        elif self.webType == 3:  # POST + JSON
            post_data_list = self.get_post_data_list()  # build the POST payloads
            self.post_json(post_data_list)
        else:  # web_type == 4: the URL changes but the POST data stays fixed
            self.post_url_change()

    # GET handling for both HTML and JSON listing pages
    def get_start(self):
        # Full (stock) crawl
        if self.executionType == 1:
            pageList = self.get_PageUrlList()  # pagination URLs
            for url in pageList:
                time.sleep(self.timeInterval)
                if self.webType == 0:
                    urlList = self.get_content_url_list(url)
                else:
                    urlList = self.get_json_content_url_list(url)
                time.sleep(self.timeInterval)
                for content_data in urlList:
                    print(content_data)
                    self.redis.lpush(self.url_key_name, content_data)
        # Incremental crawl
        else:
            switch = False
            if self.webType == 0:
                start_data_urlList = self.get_content_url_list(self.start_url)
            else:
                start_data_urlList = self.get_json_content_url_list(self.start_url)
            time.sleep(self.timeInterval)
            # Case 1: the listing page yields bare URLs only
            if not self.page_xpath:
                for start_data in start_data_urlList:  # check the first page
                    if start_data in self.bloom:
                        logging.info(self.taskCode + " URL already in Bloom filter")
                        switch = True  # a previously crawled URL showed up on page one; stop paging
                    else:
                        self.bloom.add(start_data)
                        print(start_data)
                        self.redis.lpush(self.url_key_name, start_data)
                if not switch:
                    # Check page two onward
                    for pageIndex in range(int(self.second_page_value), int(self.end_page_value)):
                        swtich2 = False
                        theUrl = self.url_type.replace("%d", str(pageIndex))
                        if self.webType == 0:
                            second_content_urlList = self.get_content_url_list(theUrl)  # article links on this page
                        else:
                            second_content_urlList = self.get_json_content_url_list(theUrl)  # article links on this JSON page
                        for second_content_url in second_content_urlList:
                            if second_content_url in self.bloom:
                                logging.info(self.taskCode + " URL already in Bloom filter")
                                swtich2 = True
                            else:
                                self.bloom.add(second_content_url)
                                self.redis.lpush(self.url_key_name, second_content_url)
                                print(second_content_url)
                        if swtich2:
                            break
            # Case 2: each article link comes wrapped in a dict, e.g.
            # {"url": "http://www.nea.gov.cn/2015-01/16/c_133924732.htm", "statement_time_xpath": "2015-01-16"}
            else:
                for start_data in start_data_urlList:  # check the first page
                    start_data_json = json.loads(start_data)
                    current_url = start_data_json["url"]
                    if current_url in self.bloom:
                        logging.info(self.taskCode + " URL already in Bloom filter")
                        switch = True  # a previously crawled URL showed up on page one; stop paging
                    else:
                        self.bloom.add(current_url)
                        self.redis.lpush(self.url_key_name, start_data)
                        print(start_data)
                if not switch:
                    # Check page two onward
                    for pageIndex in range(int(self.second_page_value), int(self.end_page_value)):
                        swtich2 = False
                        theUrl = self.url_type % pageIndex  # build the URL from page two onward
                        if self.webType == 0:
                            second_content_urlList = self.get_content_url_list(theUrl)  # article links on this page
                        else:
                            second_content_urlList = self.get_json_content_url_list(theUrl)  # article links on this JSON page
                        for second_content_data in second_content_urlList:
                            second_content_data_json = json.loads(second_content_data)
                            current_url = second_content_data_json["url"]
                            if current_url in self.bloom:
                                logging.info(self.taskCode + " URL already in Bloom filter")
                                swtich2 = True
                            else:
                                self.bloom.add(current_url)
                                print(current_url)
                                self.redis.lpush(self.url_key_name, second_content_data)
                                print(second_content_data)
                        if swtich2:
                            break

    def judge_status(self, task_data):
        """Handle periodic tasks: check the task status and deal with paused/stopped states."""
        task_data_json = json.loads(task_data)
        task_code = task_data_json["taskCode"]
        task_key_name = self.redis_platform_address + ":task"  # task queue key
        status_key_name = self.redis_platform_address + ":status:" + task_code  # status key
        status_data = self.redis.get(status_key_name)
        print("status_key_name", status_key_name)
        print("status_data", status_data)
        status_data = json.loads(status_data)
        status = status_data["status"]
        if status == "1" or status == "2":
            print("status: running", task_data)
            self.redis.lrem(task_key_name, 0, task_data)
            print("task removed from queue", task_data)
            return True
        if status == "3":
            print("status: paused", task_data)
            time.sleep(1)
            return False
        if status == "4":
            print("status: stopped", task_data)
            time.sleep(1)
            self.redis.lrem(task_key_name, 0, task_data)
            print("task removed from queue", task_data)
            return False

    def start(self):
        while True:
            task_key_name = self.redis_platform_address + ":task"
            task_data_list = self.redis.lrange(task_key_name, 0, 100)
            print(task_data_list)
            time.sleep(5)
            for task_data in task_data_list:
                swtich = self.judge_status(task_data)
                # Update self.taskCode and run the task
                if swtich:
                    print(self.taskCode)
                    self.taskCode = json.loads(task_data)["taskCode"]
                    self.change_outqueue_num()  # set outQueue to 1
                    self.update_attr()  # refresh attributes
                    if self.executionType != 1:  # incremental crawl: load the Bloom filter
                        self.bloom_readfrom_db()
                    if self.post_data or type(self.post_data) == dict:
                        self.post_start()  # handle POST crawls
                    else:
                        self.get_start()  # handle GET crawls (HTML and JSON)
                    if self.executionType != 1:
                        self.bloom_writeto_db()  # persist the Bloom filter
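# A minimal usage sketch (not part of the original file), assuming the config.ini
# layout implied by Main.__init__; section and option names come from the code,
# the values below are placeholders:
#
#   [redis]
#   host = 127.0.0.1
#   port = 6379
#   password = secret
#   database = 0
#   redis_platform_address = crawler
#
#   [mongodb]
#   host = 127.0.0.1
#   port = 27017
#   user = crawler
#   password = secret
#   database = crawler
#
# Entry point: poll the redis task queue forever.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    Main().start()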
# Fragment of a per-iteration experiment loop. It assumes the enclosing code has
# already defined seq1/seq2, kmers1/kmers2, ksize, h, prime, p, n1, i_size, it,
# the result arrays true_jaccards/estimate_jaccards, the MinHash module MH, and
# a BloomFilter constructed with (capacity, error_rate).

    # ... tail of the loop that collects k-mers from seq2 ...
    kmers2.add(seq2[i:i + ksize])

true_jaccard = len(kmers1.intersection(kmers2)) / float(len(kmers1.union(kmers2)))
true_jaccards[it] = true_jaccard

# Calculate sourmash estimate of Jaccard index
E1 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
E2 = MH.CountEstimator(n=h, max_prime=prime, ksize=ksize, save_kmers='y')
E1.add_sequence(seq1)
E2.add_sequence(seq2)
estimate_jaccard = E1.jaccard(E2)
estimate_jaccards[it] = estimate_jaccard

# Containment version.
# Bloom filter of the k-mers of seq1
f = BloomFilter(capacity=i_size + n1, error_rate=p)
len_kmers_1 = 0
for val in kmers1:
    if val not in f:
        len_kmers_1 += 1
        f.add(val)
# len_kmers_1 *= (1 - p)  # adjust for the false positive rate; shouldn't be needed since we're just adding elements

# Count how many of E2's sampled k-mers land in the Bloom filter
int_est = 0
for val in E2._kmers:
    # if val in f:  # in python2, no distinguishing between byte and utf-8 string
    if val != '':  # equality check; the original `is not ''` relied on string interning
        if val.decode("utf-8") in f:
            int_est += 1
int_est -= p * h  # adjust for the false positive rate
containment_est = int_est / float(h)
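# A minimal sketch of the usual follow-up step (an assumption, not shown in the
# original fragment): convert the containment estimate C ≈ |K1 ∩ K2| / |K1| into
# a Jaccard estimate via J = C*|K1| / (|K1| + |K2| - C*|K1|), using len_kmers_1
# and the exact size of kmers2. `containment_jaccards` is a hypothetical result
# array mirroring true_jaccards/estimate_jaccards above.
intersection_size_est = containment_est * len_kmers_1
containment_jaccard_est = intersection_size_est / float(
    len_kmers_1 + len(kmers2) - intersection_size_est)
containment_jaccards[it] = containment_jaccard_est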
# Assumed imports for this spider; the ApkspiderItem import path is
# project-specific and shown here as a relative import.
import re

import bs4
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from pybloom_live import BloomFilter  # assumed Bloom filter implementation

from ..items import ApkspiderItem  # assumed project-relative import


class AppchinaSpider(scrapy.Spider):
    name = 'appchina'
    allowed_domains = ['appchina.com']
    start_urls = ['http://www.appchina.com/']
    base_url = 'http://www.appchina.com'

    def __init__(self, checkpoint=None, *a, **kw):
        super(AppchinaSpider, self).__init__(*a, **kw)
        self.bf = BloomFilter(capacity=10000000)
        self.apkbf = BloomFilter(capacity=10000000)
        self.checkpoint = checkpoint
        if checkpoint is not None:
            # Seed the apk Bloom filter with package names already crawled
            fd = open(checkpoint, 'r')
            while True:
                line = fd.readline()
                if not line:
                    break
                line = line.strip()
                self.apkbf.add(line)
            fd.close()

    def start_requests(self):
        for url in self.start_urls:
            self.bf.add(url)
            yield Request(
                url=url,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                dont_filter=True)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for aitem in soup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            if href.find('category') == -1:
                continue
            yield Request(
                url=href,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)

    def parse_category(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        pagesoup = soup.select('.discuss_fangye')[0]
        appsoup = soup.select('.app-list')[0]
        # Follow pagination links
        for aitem in pagesoup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_category)
        # Follow app detail links
        for aitem in appsoup.find_all('a'):
            if not aitem.has_attr('href'):
                continue
            href = self.base_url + aitem['href']
            if href in self.bf:
                continue
            self.bf.add(href)
            yield Request(
                url=href,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"
                },
                callback=self.parse_detail)

    def parse_detail(self, response):
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        print(response.url)
        urllinkpattern = re.compile(r'\'.*\'')
        urllink = soup.select('.download_app')[0]
        if not urllink.has_attr('onclick') or urllink['onclick'] == 'return false;':
            return
        urllink = urllink['onclick']
        urllink = urllinkpattern.search(urllink).group()[1:-1]
        commonname = soup.select('.app-name')[0].get_text()
        detaillist = soup.select('.art-content')
        size = detaillist[2].get_text()
        size = size[size.find(u':') + 1:]
        version = detaillist[3].get_text()
        version = version[version.find(u':') + 1:]
        category = detaillist[6].get_text()
        category = category[category.find(u':') + 1:]
        packagename = response.url[response.url.rfind('/') + 1:]
        permissionlist = list()
        permissions = soup.select('.permissions-list')[0].find_all('li')
        for perm in permissions:
            permissionlist.append(perm.get_text())
        if packagename in self.apkbf:
            return
        self.apkbf.add(packagename)
        item = ItemLoader(item=ApkspiderItem(), response=response)
        item.add_value('apkid_specifiedbyplaform', packagename)
        item.add_value('commonname', commonname)
        item.add_value('apkplaform', self.name)
        item.add_value('category', category)
        item.add_value('packagename', packagename)
        item.add_value('size', size)
        item.add_value('version', version)
        item.add_value('permission', permissionlist)
        item.add_value('urllink', urllink)
        item.add_value('file_urls', urllink)
        item.add_value('checkpoint', self.checkpoint)
        yield item.load_item()
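# A minimal sketch (not part of the original project) of launching the spider with
# a checkpoint file from a script instead of `scrapy crawl appchina -a checkpoint=...`.
# The file name crawled_ids.txt is hypothetical; it should contain one previously
# crawled package name per line, matching what __init__ feeds into apkbf.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(AppchinaSpider, checkpoint="crawled_ids.txt")
process.start()  # blocks until the crawl finishes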