def all_surface(
    uri_pagetitle_file: Path,
    surfaceformscore_file: Path,
    redirect_file: Path = None,
):
    """Collect all surfaceforms, print as (URI, surfaceform-json) TSV

    Args:
        uri_pagetitle_file: TSV of (uri, page title) pairs
        surfaceformscore_file: surfaceFormsScore file
        redirect_file: Redirect file
    """
    import sys, tqdm, json

    assert uri_pagetitle_file.exists()
    assert surfaceformscore_file.exists()

    def load_synonyms(lines: typing.Collection[str]):
        import urllib.parse as ul

        for line in lines:
            line = ul.unquote_plus(line.strip())
            try:
                a, b = line.split("\t", 1)
                a, b = a.strip(), b.strip()
                if a and b and (a != b):
                    yield b, a
                    yield a, b
            except:
                pass

    syn = {}
    check_syn: typing.Container = ()
    if redirect_file:
        print("Loading synonyms from", redirect_file, file=sys.stderr)
        t = get_file_lines(redirect_file)
        syn = dict(load_synonyms(tqdm.tqdm(redirect_file.open(), total=t)))
        try:
            from pybloomfilter import BloomFilter  # type: ignore

            print(f"Making Bloom filter", file=sys.stderr)
            bf = BloomFilter(len(syn), 0.1, "/tmp/filter.bloom")
            bf.update(syn)
            check_syn = bf
        except:
            check_syn = syn
        print(f"Using {len(syn)} synonyms", file=sys.stderr)

    def get_synonyms(s, path=()):
        if s and (s not in path):
            yield s
            if s in check_syn:
                yield from get_synonyms(syn.get(s), path + (s,))

    ent_surface_scores: typing.Dict = {}
    if surfaceformscore_file:
        import urllib.parse as ul

        t = get_file_lines(surfaceformscore_file)
        print(f"Loading surface forms from {surfaceformscore_file}", file=sys.stderr)
        with Path(surfaceformscore_file).open() as fo:
            for line in tqdm.tqdm(fo, total=t):
                try:
                    line = ul.unquote_plus(line.strip())
                    ent, surface, score = line.split("\t")
                    score = float(score)
                    if "\\u" in surface:
                        surface = surface.encode("utf8").decode("unicode-escape")
                    for val in get_synonyms(surface):
                        ss = ent_surface_scores.setdefault(ent, {})
                        ss[val] = max(ss.get(val, 0), score)
                except Exception as e:
                    log.error(e)

    t = get_file_lines(uri_pagetitle_file)
    for line in tqdm.tqdm(open(uri_pagetitle_file), total=t):
        try:
            uri, pagetitle = line.strip().split(None, 1)
            import urllib.parse as ul

            pagetitle = ul.unquote_plus(pagetitle)
            surface_score = ent_surface_scores.get(pagetitle)
            if surface_score:
                top = max(surface_score.values())
                surface_score = {
                    sur: round((score / top) if score != 1.0 else 1.0, 5)
                    for sur, score in surface_score.items()
                }
                print(uri, json.dumps(surface_score), sep="\t")
        except Exception as e:
            log.error(e)
def load_bf(self, filename, capacity, error_rate):
    bf = BloomFilter(capacity=capacity, error_rate=error_rate)
    with open(filename) as f:
        for line in f:
            bf.add(line.split('\t')[0].strip())
    return bf
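A minimal standalone sketch of the same pattern for reference; the file name 'keys.tsv' and the sample key are hypothetical.

# Hedged sketch: build a filter from the first column of a tab-separated file,
# then test membership with `in`.
from pybloomfilter import BloomFilter

bf = BloomFilter(1000000, 0.001)
with open('keys.tsv') as f:
    for line in f:
        bf.add(line.split('\t')[0].strip())

# Membership tests may return false positives but never false negatives.
print('example-key' in bf)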
from pybloomfilter import BloomFilter
import sys, signal
from time import time, sleep
import os
from worker_filter import Filter

st = time()
done_sites_fname = 'done_sites.bin'
if os.path.isfile(done_sites_fname):
    bfdone = BloomFilter.open(done_sites_fname)
else:
    print "no file"
    bfdone = BloomFilter(2**27, 10**(-5), done_sites_fname)  # 8M

start = 0
filter = Filter()

f = open('done_urls20160601.txt').read().strip().split('\n')
for url in f:
    bfdone.add(url)
print len(f)

cnt = 0
for url in f:
    if url in bfdone:
        cnt += 1
print cnt

inc = 0
print time() - st
_, dim = T_des.shape

# In[4]:

LSH_random_vectors_set = []
#powers_of_two = 1 << np.arange(LSH_dim-1, -1, -1)

# creating the multiple LSH random vectors
for i in range(L_buckets):
    np.random.seed(i)
    LSH_random_vectors_set.append(np.random.randn(dim, LSH_dim))

# creating the multiple Bloom Filters
BF_set = []
for i in range(L_buckets):
    BF_set.append(BloomFilter(2**(2 * LSH_dim), 0.01, None))

# In[5]:

t0 = time.process_time()
Q_kp, Q_des = detector.detectAndCompute(query_img, None)
t1 = time.process_time()

# We now add each LSH hash result to their dedicated Bloom Filter
for i in range(L_buckets):
    Q_reflections = Q_des.dot(LSH_random_vectors_set[i]) >= 0
    for q in np.array(Q_reflections, dtype=int):
        BF_set[i].add(q.tostring(None))
#!/usr/bin/env python
# coding:utf-8
# manning 2015-1-27
import time
import os
import urlparse
import hashlib
import sys
#sys.path.append("..")
#from config.config import *
#reload(sys)
#sys.setdefaultencoding("utf-8")
from pybloomfilter import BloomFilter

bf = BloomFilter(100000, 0.01)


def format(url):
    '''
    The strategy is to build a 3-tuple:
    the first element is the url's netloc;
    the second is the length of each segment split from the path;
    the third is the name of each query parameter (parameters are sorted
    alphabetically to avoid duplicates caused only by parameter ordering).
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'

    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]
1. Domestic - Province - Destination: gets all cities in that region
2. City - Scenic spots: gets all scenic spots in that city
3. City - Community - Travel notes: gets all travel notes for that city
-- BloomFilter
"""
import os
import requests
import re
from pybloomfilter import BloomFilter

dir_name = 'notes/'
bf = BloomFilter(1024 * 1024 * 16, 0.01)


def find_all_city_pages_url():
    req = requests.get('http://www.mafengwo.cn/mdd/')
    city_pages = re.findall('/travel-scenic-spot/mafengwo/\d{5}.html', req.text)
    return city_pages


def get_city_number(url):
    return url[29:34]


def save_html(file_name, html):
    with open(file_name, 'wb+') as f:
        f.write(html.encode())
def setup(database: dict,
          password: str,
          bloomfilter_file=None,
          bf_false_positive_rate=BLOOMFILTER_DEFAULT_FALSE_POSITIVE_RATE,
          paralleled=False,
          num_processes=None) -> tuple:
    """
    Setup method of OXT for a database

    :param database: database with id -> list of words
    :param password: password to create keys
    :param bloomfilter_file: file to read/write bloomfilter
    :param bf_false_positive_rate: bloomfilter false positive rate
    :param bool paralleled: should we parallel the process or not
    :param num_processes: number of process used if parallel
    :return: (key, encrypted database)
    """
    global var_dict

    # TODO: generate keys from password
    K_P = random_secure(1)  # key to XOR index
    K_S = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for e
    iv = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # IV for AES encryption
    K_X = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for xtag
    K_I = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for index
    K_Z = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for Z
    K_T = random_secure(CMAC_AES128_KEY_LENGTH_IN_BYTES)  # Key for keyword

    pairing = PairingGroup('SS512')
    g = pairing.random(GT)
    assert g.initPP(), "ERROR: Failed to init pre-computation table for g."

    total_pairs = 0
    inverted_index_all_pairs = defaultdict(list)  # word -> list of ids containing this word
    if paralleled:  # parallel processing
        logger.info('Parallel gen_inverted_index')
        pool = multiprocessing.Pool()
        num_docs = len(database)
        inverted_tuples = pool.starmap(
            gen_inverted_index_paralleled,
            list(zip(database.items(), [K_P] * num_docs)))
        for inverted_list in inverted_tuples:
            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(rind)
                total_pairs += 1
    else:  # sequential processing
        logger.info('Seq inverted_index_all_pairs')
        for (ind, words) in database.items():
            inverted_list = gen_inverted_index(ind, words, K_P)
            for word, rind in inverted_list:
                inverted_index_all_pairs[word].append(rind)  # rind is now bytes
                total_pairs += 1

    # generate xtags. Each xtag is for a pair (word, index)
    xtags = set()
    if paralleled:
        logger.info('Parallel xtags')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_xtags_parallel,
                                  initargs=(K_X, pairing, K_I, g)) as pool:
            xtags_lists = pool.map(gen_xtags_parallel,
                                   inverted_index_all_pairs.items())
            for xtags_list in xtags_lists:
                xtags.update(xtags_list)
        var_dict = {}
    else:
        logger.info('Seq xtags')
        for word, indices in inverted_index_all_pairs.items():
            xtags.update(gen_xtags(word, indices, K_X, pairing, K_I, g))

    # Create a Bloom filter and bitarray
    if bloomfilter_file is not None:
        bf = BloomFilter(total_pairs, bf_false_positive_rate, bloomfilter_file)
    else:
        bf = BloomFilter(total_pairs, bf_false_positive_rate)
    num_bits = bf.num_bits
    bits = bitarray(num_bits)
    bits.setall(False)

    # compute the positions of each xtag and set them; we need the separate bits
    # array because the library doesn't expose its bits, e.g. to check whether a
    # bit is set or not
    xtag: str
    for xtag in xtags:
        bf.add(xtag)

        # mimic set in bits array
        for hash_seed in bf.hash_seeds:
            pos = bloomfilter_hash(xtag, hash_seed) % num_bits
            bits[pos] = True

    # generate encrypted database
    edb1 = dict()
    if paralleled:
        logger.info('Parallel edb1')
        # parallel processing
        with multiprocessing.Pool(processes=num_processes,
                                  initializer=init_gen_t_set_parallel,
                                  initargs=(K_S, K_I, K_Z, K_T, iv, pairing)) as pool:
            t_set_dict_lists = pool.map(gen_t_set_parallel,
                                        inverted_index_all_pairs.items())
            for t_set_dict in t_set_dict_lists:
                edb1.update(t_set_dict)
        var_dict = {}
    else:
        logger.info('Seq edb1')
        for word, indices in inverted_index_all_pairs.items():
            edb1.update(gen_t_set(word, indices, K_S, K_I, K_Z, K_T, iv, pairing))

    key = (K_P, K_S, K_X, K_I, K_Z, K_T)
    g_serialized = pairing.serialize(g)

    return key, iv, g_serialized, edb1, bf, bits
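A small hedged consistency check of the mirroring logic above: an element is reported present by the mirrored bits exactly when every hashed position is set, which should agree with querying the filter directly. The helper below is a sketch only; `some_xtag` is a made-up value, while `bloomfilter_hash`, `bf`, `bits`, and `num_bits` are the names already defined in the setup code.

# Hedged sketch: query the mirrored bit array the same way setup() populates it.
def xtag_in_bits(some_xtag: str) -> bool:
    return all(bits[bloomfilter_hash(some_xtag, seed) % num_bits]
               for seed in bf.hash_seeds)

# For any xtag that was added, both views should agree:
# assert xtag_in_bits(xtag) == (xtag in bf)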
# In[3]:

T_kp, T_des = detector.detectAndCompute(train_img, None)

# In[4]:

_, dim = T_des.shape
LSH_dim = 16
np.random.seed(0)
LSH_random_vectors = np.random.randn(dim, LSH_dim)
powers_of_two = 1 << np.arange(LSH_dim - 1, -1, -1)

# In[5]:

bf = BloomFilter(10**(LSH_dim / 4), 0.01, None)

# We maximize the efficiency by utilizing matrix operations
# for the crude LSH implementation
t0 = time.process_time()
Q_kp, Q_des = detector.detectAndCompute(query_img, None)
t1 = time.process_time()

Q_reflections = Q_des.dot(LSH_random_vectors) >= 0
#Q_bin = Q_reflections.dot(powers_of_two)

# And we remove duplicates to ensure uniqueness of features
for q in np.array(Q_reflections, dtype=int):
def __init__(self):
    self.filename = ROOTDIR + '/factory/cfg/filter.bloom'
    self.bf = self.__getbloomfilter()
    if self.bf == -1:
        self.bf = BloomFilter(100000, 0.001, self.filename)
    pass
def __init__(self):
    try:
        self.bf = BloomFilter.open('tuniu.filter')
    except:
        logging.info("new filter.bloom")
        self.bf = BloomFilter(100000000, 0.05, 'tuniu.filter')
def forwards(self, orm):
    for poll in orm.Poll.objects.all():
        poll.seen_ips = BloomFilter(1000, 0.01, '/tmp/test.bloom').to_base64()
        poll.save()
def analyse_fpr(matrix, df, i, j, correl_data_struct, target_fpr, block_size):
    num_blocks = math.floor(len(matrix) / block_size)
    print("num blocks:", num_blocks)
    many_many_elements = set(correl_data_struct.exception_list_0)
    one_many_elements = set(correl_data_struct.exception_list_not_one)
    size_correl = 0.0
    size_normal = 0.0
    block_bloom_list_0_normal = []
    block_bloom_list_0_correl = []
    block_bloom_list_1 = []
    block_set_0 = []
    block_set_1 = []
    for t in range(0, num_blocks):
        block_set_0.append(set([]))
        block_set_1.append(set([]))
    for t in range(0, int(block_size * num_blocks)):
        ind = math.floor(t / block_size)
        block_set_0[ind].add(matrix[t][i])
        block_set_1[ind].add(matrix[t][j])
    for t in range(0, num_blocks):
        count_to_add = 0
        for item in block_set_0[t]:
            if item in one_many_elements:
                count_to_add += 1
        block_bloom_list_0_correl.append(BloomFilter(count_to_add, target_fpr))
        block_bloom_list_0_normal.append(BloomFilter(len(block_set_0[t]), target_fpr))
        block_bloom_list_1.append(BloomFilter(len(block_set_1[t]), target_fpr))
        for item in block_set_0[t]:
            block_bloom_list_0_normal[-1].add(item)
            if item in one_many_elements:
                block_bloom_list_0_correl[-1].add(item)
        # print("percentage used:", count_to_add * 1.00 / len(block_set_0[t]))
        for item in block_set_1[t]:
            block_bloom_list_1[-1].add(item)
        size_normal += 1.44 * math.log(1.00 / target_fpr, 2) * len(block_set_0[t])
        size_correl += 1.44 * math.log(1.00 / target_fpr, 2) * count_to_add
    print("Size Ratio:", size_correl * 1.00 / size_normal)

    # correl_bf = BloomFilter(len(correl_data_struct.exception_list_0), 0.01)
    # for item in correl_data_struct.exception_list_0:
    #     correl_bf.add(item)
    #     # print(item)
    # correl_bf_not_one = BloomFilter(len(correl_data_struct.exception_list_not_one), 0.01)
    # for item in correl_data_struct.exception_list_not_one:
    #     correl_bf_not_one.add(item)
    # size_correl = size_normal
    # size_correl += 1.44*math.log(1.00/0.01, 2)*len(correl_data_struct.exception_list_0)
    # size_correl += 1.44*math.log(1.00/0.01, 2)*len(correl_data_struct.exception_list_not_one)

    num_queries_per_block = 1000
    total_negatives = 0
    total_false_positives_normal = 0
    total_false_positives_correl = 0
    for curr_block in tqdm(range(0, num_blocks)):
        rand_list = np.random.uniform(0, 1.0, num_queries_per_block)
        for t in range(0, num_queries_per_block):
            ind = math.floor(rand_list[t] * num_blocks * block_size)
            # If true positive, continue
            if matrix[ind][i] in block_set_0[curr_block]:
                if matrix[ind][i] not in many_many_elements:
                    val = math.floor(matrix[ind][i] / correl_data_struct.factor_0_to_1)
                    # This will give an error if the factor is too small
                    if val not in block_bloom_list_1[curr_block] or val not in block_set_1[curr_block]:
                        print("ERROR", val, matrix[ind][i], matrix[ind][j])
                        sys.exit(1)
                continue
            total_negatives += 1
            if matrix[ind][i] in block_bloom_list_0_normal[curr_block]:
                total_false_positives_normal += 1
            if matrix[ind][i] in many_many_elements:
                if matrix[ind][i] in block_bloom_list_0_correl[curr_block]:
                    total_false_positives_correl += 1
            else:
                val = math.floor(matrix[ind][i] / correl_data_struct.factor_0_to_1)
                if matrix[ind][i] in one_many_elements:
                    if matrix[ind][i] in block_bloom_list_0_correl[curr_block] and val in block_bloom_list_1[curr_block]:
                        total_false_positives_correl += 1
                else:
                    if val in block_bloom_list_1[curr_block]:
                        total_false_positives_correl += 1
    fpr_correl = total_false_positives_correl * 1.00 / total_negatives
    fpr_normal = total_false_positives_normal * 1.00 / total_negatives
    print("Normal False positive rate:", fpr_normal)
    print("Correl False positive rate:", fpr_correl)
    print("\n\n")
    return fpr_correl, size_correl, fpr_normal, size_normal
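The size accounting above uses the standard Bloom filter estimate of roughly 1.44 * log2(1/p) bits per stored element; a quick arithmetic check of that constant (the target_fpr value below is just an example):

# Sanity check of the bits-per-element estimate used for size_normal / size_correl.
import math

target_fpr = 0.01  # example value
bits_per_element = 1.44 * math.log(1.00 / target_fpr, 2)
print(round(bits_per_element, 2))  # ~9.57 bits per element at a 1% false positive rate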
# In[3]:

T_kp, T_des = detector.detectAndCompute(train_img, None)

# In[4]:

_, dim = T_des.shape
LSH_dim = 16
np.random.seed(0)
LSH_random_vectors = np.random.randn(dim, LSH_dim)
powers_of_two = 1 << np.arange(LSH_dim - 1, -1, -1)

# In[5]:

bf = BloomFilter(2**(LSH_dim + 1), 0.01, None)

# We maximize the efficiency by utilizing matrix operations
# for the crude LSH implementation
t1 = time.process_time()
Q_kp, Q_des = detector.detectAndCompute(query_img, None)
Q_reflections = Q_des.dot(LSH_random_vectors) >= 0
Q_bin = Q_reflections.dot(powers_of_two)

# And we remove duplicates to ensure uniqueness of features
for q in list(set(Q_bin)):
    # needs to insert here a method for re-hashing or
    # transforming the array list of descriptors to a bit array
def R_2_A(index_url, url_tail, site_name, level, is_sege):
    if not is_sege:
        if level == 0:
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 1:
            if site_name == "qq_copyright":
                temp = []
                # Here url_tail holds all the album ids; deduplicate them here,
                # then visit every album's info page
                bloomname = "albummid_filter"
                isexists = os.path.exists(bloomname + ".bloom")
                if isexists:
                    # The file exists, so open it
                    bf = BloomFilter.open(bloomname + ".bloom")
                else:
                    # Otherwise create it
                    bf = BloomFilter(10000000, 0.001, bloomname + ".bloom")
                for token in url_tail:
                    if not bf.add(token):
                        temp.append(token)
                    #url_tail.remove(token)
                    #print "duplicate id, discarded", token
                res_urls = []
                map(
                    lambda i: res_urls.append(
                        "https://c.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid={aid}&g_tk=5381&format=jsonp"
                        .format(aid=i)), temp)
                return res_urls
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 2:
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 3:
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 4:
            return general_func.Relative_to_Absolute(index_url, url_tail)
    else:
        if level == 0:
            if site_name == "qq_music":
                res_urls = []
                map(
                    lambda i: res_urls.append(
                        "https://y.qq.com/portal/singer/{aid}.html".format(aid=i)),
                    url_tail)
                return res_urls
            if site_name == "qq_copyright":
                # Here we take the singer id and directly request the all-songs interface
                # below; each song carries an albummid that we need to deduplicate
                res_urls = []
                map(
                    lambda i: res_urls.append(
                        "https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg?g_tk=5381&format=jsonp&singermid={aid}&begin=0&num=900"
                        .format(aid=i)), url_tail)
                return res_urls
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 1:
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 2:
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 3:
            return general_func.Relative_to_Absolute(index_url, url_tail)
        elif level == 4:
            return general_func.Relative_to_Absolute(index_url, url_tail)
"(install with `pip install pydeep`)") try: import magic except ImportError: raise MaliceDependencyError("Unable to import magic " "(install with `pip install magic`)") try: from pybloomfilter import BloomFilter except ImportError: raise MaliceDependencyError("Unable to import pybloomfilter " "(install with `pip install pybloomfilter`)") if os.path.isfile('filter.bloom'): bf = BloomFilter.open('filter.bloom') else: bf = BloomFilter(10000000, 0.01, 'filter.bloom') # csrf = CsrfProtect(app) sm = ScanManager() github = 'https://github.com/blacktop/malice' #current_app.config['GITHUB'] # open connection before each request @malice.before_request def before_request(): try: g.rdb_conn = r.connect(host='localhost', port=28015, db='file') g.rdb_sess_conn = r.connect(host='localhost', port=28015, db='session') g.rdb_sample_conn = r.connect(host='localhost',
#!/usr/bin/env python

from pybloomfilter import BloomFilter
import os.path
import sys

# cat <values_file> | ./ingest.py <bloom_file> <max_items> <error_rate>

bloomFilePath = sys.argv[1]

if os.path.isfile(bloomFilePath):
    bf = BloomFilter.open(bloomFilePath)
else:
    maxItems = int(sys.argv[2])
    errorRate = float(sys.argv[3])
    bf = BloomFilter(maxItems, errorRate, bloomFilePath)

valuesBuffer = []
for line in iter(sys.stdin.readline, ''):
    valuesBuffer.append(unicode(line.rstrip('\n')))
    if len(valuesBuffer) > 100000:
        bf.update(valuesBuffer)
        valuesBuffer = []

bf.update(valuesBuffer)
bf.sync()
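A hypothetical companion script (not part of the original) showing how the filter built by ingest.py could be queried later: it reads values from stdin and reports which ones the filter probably contains.

#!/usr/bin/env python
# Hypothetical query counterpart: cat <values_file> | ./query.py <bloom_file>
from pybloomfilter import BloomFilter
import sys

bf = BloomFilter.open(sys.argv[1])
for line in iter(sys.stdin.readline, ''):
    value = line.rstrip('\n')
    if value in bf:  # may rarely be a false positive, never a false negative
        print(value)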
train_data = add_padding_idx(train_data)
test_data = add_padding_idx(test_data)

# Note that, no matter how many node types are here, make sure the
# hyperedge (N1,N2,N3,...) has id, N1 < N2 < N3...
compress = True

if not dynamic_dict:
    test_dict = build_hash(dict_data, compress=compress, max_size=max_size,
                           min_size=min_size, fname="test")
    train_dict = test_dict
    # train_dict = build_hash(train_data, compress=compress, max_size=max_size, min_size=min_size, fname="test")
else:
    train_dict = [BloomFilter(1e8, 1e-3) for i in range(max_size + 1)]
    test_dict = [BloomFilter(1e8, 1e-3) for i in range(max_size + 1)]

print("dict_size", len(train_dict), len(test_dict))
print("after weight filter", train_data.shape, test_data.shape, dict_data.shape)
print(train_weight, np.min(train_weight), np.max(train_weight))
train_weight_mean = np.mean(train_weight)
train_weight = train_weight / train_weight_mean * neg_num
test_weight = test_weight / train_weight_mean * neg_num
dict_weight = dict_weight / train_weight_mean * neg_num
print("train data amount", len(train_data))

if args.feature == 'walk':
sys.setdefaultencoding('utf-8')  # set system output encoding to utf-8
sys.setrecursionlimit(1000000)   # set recursion depth limit

urlTest = 'http://www.my089.com/ConsumerInfo1.aspx?uid=0C7C8143B7536149'
urlStart = urlTest
filedirectory = getConfig()
#test()
if login():
    print('Login success!')
    #test()
    strtime = str(time.strftime('%Y%m%d%H%M', time.localtime(time.time())))
    createFolder('log')
    bf = BloomFilter(100000000, 0.001, 'log/' + strtime + 'filter' + '.bloom')
    print "num_bits: " + str(bf.num_bits)
    print "num_hashes: " + str(bf.num_hashes)
    #bf.clear_all()
    #orderCount = 0
    #allCount = 0
    logf1 = open('log/' + strtime + 'log1' + '.log', 'wb')   # log of processed pages
    logf2 = open('log/' + strtime + 'log2' + '.log', 'wb')   # log of processed pages
    logAll = open('log/' + strtime + 'all' + '.log', 'wb')   # log of all links found
    aList.append(urlDefault)
    aList.append(urlSucceed)
    allCount += len(aList)
    for item in aList:
#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
from JobCrawler import JobCrawler
from pybloomfilter import BloomFilter
from time import time

company_bf = BloomFilter(1024 * 1024 * 16, 0.01)
total_page = 1


def get_company_info(url, page=1):
    if page > total_page:
        return
    wbdata = requests.get(url).content
    soup = BeautifulSoup(wbdata, 'lxml')
    # print soup.prettify()
    company_list = soup.select('div.el > span.t2')
    # print type(company_list), '\ncompany_list :', company_list
    for index, company in enumerate(company_list):
        if index != 0:
            company_result = company.find_all(name='a')
            company_link = company_result[0].attrs['href']
            company_name = company_result[0].attrs['title']
            print company_name, ' - ', company_link
class MostFollowTopicsSpider(scrapy.spiders.Spider):
    name = "MostFollowTopicsSpider"
    allowed_domains = ["zhihu.com"]
    start_urls = ['https://www.zhihu.com/topic/19776749/organize/entire']
    topic_bloom_filter = BloomFilter(500000, 0.001, 'topic.bloom')
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    }

    def parse(self, response):
        url = response.request.url
        if response.request.method == 'GET':
            index = len('https://www.zhihu.com/topic/')
            try:
                topic_id = int(response.url[index:index + 8])
            except:
                raw_input('wrong...wait!')
                yield scrapy.Request(url=response.url, callback=self.parse)
            topic_name = response.css(
                'h1.zm-editable-content::text').extract_first()
            followers = int(
                response.css('div.zm-topic-side-followers-info strong::text').
                extract_first()) or 0
            self.topic_bloom_filter.add(topic_id)
            yield {
                'topic_id': topic_id,
                'topic_name': topic_name,
                'followers': followers,
            }
            print('[%s] %s: %s' % (topic_id, topic_name, followers))
            self.xsrf = response.css(
                'input[name=_xsrf]::attr(value)').extract_first()
            yield scrapy.FormRequest(response.url,
                                     formdata={'_xsrf': self.xsrf},
                                     callback=self.parse)
        elif response.request.method == 'POST':
            js = json.loads(response.text)
            for topic_object_list in js['msg'][1]:
                topic_object = topic_object_list[0]
                if topic_object[0] == 'topic':
                    if topic_object[2] not in self.topic_bloom_filter:
                        yield scrapy.Request(
                            'https://www.zhihu.com/topic/%s/organize/entire' %
                            topic_object[2],
                            callback=self.parse)
                    else:
                        print('repeat')
                elif topic_object[0] == 'load':
                    print('more!!')
                    url = urlparse.urlparse(
                        response.url).path + '?child=%s&parent=%s' % (
                            topic_object[2], topic_object[3])
                    url = response.urljoin(url)
                    yield scrapy.FormRequest(url,
                                             formdata={'_xsrf': self.xsrf},
                                             callback=self.parse)

    def start_requests(self):
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [
            scrapy.Request(url=captcha_url,
                           headers=self.header,
                           callback=self.parser_captcha)
        ]

    def parser_captcha(self, response):
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
            f.close()
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
        captcha = raw_input("please input the captcha\n>")
        return scrapy.FormRequest(url='https://www.zhihu.com/#signin',
                                  headers=self.header,
                                  callback=self.login,
                                  meta={'captcha': captcha})

    def login(self, response):
        xsrf = response.xpath("//input[@name='_xsrf']/@value").extract_first()
        if xsrf is None:
            return ''
        post_url = 'https://www.zhihu.com/login/phone_num'
        post_data = {
            "_xsrf": xsrf,
            "phone_num": '13987654321',
            "password": '******',
            "captcha": response.meta['captcha']
        }
        return [
            scrapy.FormRequest(url=post_url,
                               formdata=post_data,
                               headers=self.header,
                               callback=self.check_login)
        ]

    # check whether the login response indicates success
    def check_login(self, response):
        js = json.loads(response.text)
        if 'msg' in js and js['msg'] == u'登录成功':
            for url in self.start_urls:
                yield scrapy.Request(url=url,
                                     headers=self.header,
                                     dont_filter=True)
        else:
            print('login failed')
            print(js['msg'])
import requests
import re
import json
from redis import Redis
from rq import Queue
from bs4 import BeautifulSoup
from pybloomfilter import BloomFilter
from utils import get_html, get_proxy, delete_proxy, get_content
from urllib.parse import urlencode

low = Queue('low', connection=Redis(host='localhost', port=6379))
bloom_f = BloomFilter(capacity=100000, error_rate=0.01)


def spider_movie_comment(movie_id):
    # Get Pages
    url = "https://movie.douban.com/subject/" + movie_id + "/reviews?start="
    head = get_html(url + str(0))
    html = BeautifulSoup(head.content, "lxml")
    temp_html = html.select("#content > h1")
    print(temp_html)
    # f = open("index.html","w")
    # f.write(html.prettify())
    # f.close()
    text = temp_html[0].text
    page = int(re.sub(r"\D*", "", text))
    data = []
    for page_num in range(page // 20 + 1):
class CrawlBSF:
    request_headers = {
        'host': "www.mafengwo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    cur_level = 0
    max_level = 5
    dir_name = 'iterate/'
    iter_width = 50
    downloaded_urls = []

    du_md5_file_name = dir_name + 'download.txt'
    du_url_file_name = dir_name + 'urls.txt'

    download_bf = BloomFilter(1024 * 1024 * 16, 0.01)

    cur_queue = deque()
    child_queue = deque()

    def __init__(self, url):
        self.root_url = url
        self.cur_queue.append(url)
        self.du_file = open(self.du_url_file_name, 'a+')
        try:
            self.dumd5_file = open(self.du_md5_file_name, 'r')
            self.downloaded_urls = self.dumd5_file.readlines()
            self.dumd5_file.close()
            for urlmd5 in self.downloaded_urls:
                self.download_bf.add(urlmd5[:-2])
        except IOError:
            print "File not found"
        finally:
            self.dumd5_file = open(self.du_md5_file_name, 'a+')

    def enqueueUrl(self, url):
        self.child_queue.append(url)

    def dequeuUrl(self):
        try:
            url = self.cur_queue.popleft()
            return url
        except IndexError:
            self.cur_level += 1
            if self.cur_level == self.max_level:
                return None
            if len(self.child_queue) == 0:
                return None
            self.cur_queue = self.child_queue
            self.child_queue = deque()
            return self.dequeuUrl()

    def getpagecontent(self, cur_url):
        print "downloading %s at level %d" % (cur_url, self.cur_level)
        try:
            req = urllib2.Request(cur_url, headers=self.request_headers)
            response = urllib2.urlopen(req)
            html_page = response.read()
            filename = cur_url[7:].replace('/', '_')
            fo = open("%s%s.html" % (self.dir_name, filename), 'wb+')
            fo.write(html_page)
            fo.close()
        except urllib2.HTTPError, Arguments:
            print Arguments
            return
        except httplib.BadStatusLine:
            print 'BadStatusLine'
            return
#!/usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import csv
import sys
import datetime
from time import time
from pybloomfilter import BloomFilter

reload(sys)
sys.setdefaultencoding('utf-8')

download_bf = BloomFilter(1024 * 1024 * 16, 0.01)


def request(url, isFirstPage):
    if url not in download_bf:
        download_bf.add(url)
    else:
        return
    res = requests.get(url).text
    soup = BeautifulSoup(res, 'html.parser')
    # print soup.prettify()
    keylist = soup.select('div.key-list > div.item-mod')
    for index, house in enumerate(keylist):
        # if index == 2:
        #     print house
            break
        continue

    PST._set_p_hash_kind("md5")

    # Assign the correct redis connection
    r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]

    # Creating the bloom filter name: bloomyyyymm
    filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year + PST.p_date.month)

    if os.path.exists(filebloompath):
        bloom = BloomFilter.open(filebloompath)
    else:
        bloom = BloomFilter(100000000, 0.01, filebloompath)
    bloop_path_set.add(filebloompath)

    # UNIQUE INDEX HASHS TABLE
    r_serv0 = dico_redis["201300"]
    r_serv0.incr("current_index")
    index = r_serv0.get("current_index") + str(PST.p_date)

    # HASHTABLES PER MONTH (because of r_serv1 changing db)
    r_serv1.set(index, PST.p_path)
    r_serv1.sadd("INDEX", index)

    # For each bloom filter
    opened_bloom = []
    for bloo in bloop_path_set:
        # Opening blooms
        opened_bloom.append(BloomFilter.open(bloo))
def __init__(self, capacity, error_rate):
    super().__init__()
    self.bloom_filter_1 = BloomFilter(capacity, error_rate)
    self.bloom_filter_2 = BloomFilter(capacity, error_rate)
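The class above only shows the two filters being created. One common reason to keep a pair of filters is to distinguish "seen once" from "seen at least twice"; the sketch below illustrates that pattern on its own and is not taken from the original class.

# Hedged sketch of a two-filter seen-once / seen-twice pattern (illustrative only).
from pybloomfilter import BloomFilter

seen_once = BloomFilter(1000000, 0.01)
seen_twice = BloomFilter(1000000, 0.01)

def record(item):
    # add() returns True when the item was (probably) already present
    if seen_once.add(item):
        seen_twice.add(item)

record('a')
record('a')
print('a' in seen_twice)  # True: 'a' was recorded at least twice (modulo false positives)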
class Crawling:
    request_headers = {
        'host': dest_url[7:],
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    level = 0
    max_level = 3
    dir_name = "download"
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    curr_queue = deque()
    next_queue = deque()

    urls_bloom = BloomFilter(1024 * 1024 * 16, 0.01)
    down_bloom = BloomFilter(1024 * 1024 * 16, 0.01)

    def __init__(self, url):
        self.curr_queue.append(url)
        # Only records what has been downloaded; no other use at the moment
        self.down_file = open(self.dir_name + "/history.txt", 'a+')
        # Records the hash of every downloaded url
        self.hash_file = open(self.dir_name + "/history_md5.txt", "a+")
        # Load into the bloom filter so already-downloaded pages are not fetched again
        for md5 in self.hash_file.readlines():
            self.down_bloom.add(md5[:-2])

    def enqueue(self, url):
        '''
        Only the hash is recorded on disk, to keep the history file small;
        the url itself could also be recorded -- would the original approach
        be more accurate?
        '''
        hash = hashlib.md5(url.encode('utf8')).hexdigest()
        if url not in self.urls_bloom and hash not in self.down_bloom:
            self.urls_bloom.add(url)
            self.next_queue.append(url)
            print("enqueue: [{}]".format(url))

    def dequeue(self):
        try:
            url = self.curr_queue.popleft()
            return url
        except IndexError:
            return None

    def complete(self, url):
        hash = hashlib.md5(url.encode('utf8')).hexdigest()
        self.down_bloom.add(hash)
        self.hash_file.write(hash + "\r\n")
        self.down_file.write(url + "\r\n")
        #self.down_file.flush()

    def next_level(self):
        self.curr_queue = self.next_queue
        self.next_queue = deque()

    def close(self):
        self.down_file.close()
def __init__(self):
    self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
    self.f_write = open('visitedsites', 'w')
    self.si = SearchIndex()
    self.si.SearchInit()
TIMEOUT = 20
GITHUB_IMG = '//img[contains(@class, "avatar width-full height-full")]'
GITHUB_REPO_TITLE = '//a[@itemprop="name codeRepository"]'
GITHUB_LANGUAGE = '//span[@itemprop="programmingLanguage"]'
GITHUB_USERNAME_TITLE = '//span[@class="link-gray pl-1"]'
PATH_PREFIX_DEFAULT = 'data/users_order_'

options = webdriver.ChromeOptions()
options.add_argument('--incognito')
browser = webdriver.Chrome(
    executable_path=r'/home/kevin/Downloads/chromedriver', options=options)

seen_usernames = BloomFilter(10000, .03)


def make_url(user, tab):
    return f'https://github.com/{user}?tab={tab}'


def is_loaded(url):
    browser.get(url)
    try:
        WebDriverWait(browser, TIMEOUT).until(
            EC.visibility_of_element_located((By.XPATH, GITHUB_IMG)))
    except TimeoutException:
        print(f'Timed out waiting for {url} to load')
        return False
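One plausible way the seen_usernames filter above could gate repeat visits; the helper below is a sketch, not the original code, and reuses make_url, is_loaded, and seen_usernames from above.

# Hedged sketch: skip users whose profile was already visited.
def visit_user(user):
    if user in seen_usernames:
        return False  # probably visited already (Bloom filters may rarely false-positive)
    seen_usernames.add(user)
    return is_loaded(make_url(user, 'repositories'))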
def __init__(self, path=None):
    self.file = None
    self.fingerprints = BloomFilter(3000000, 0.00001, 'bloomTemp')
# Date 13-Nov-2017: In this version, we are just using plain features
# and adding them to the Bloom. Specifically, we just convert each
# 128-element array of SIFT features to its byte representation
# and add it to the Bloom; thus, each

from pybloomfilter import BloomFilter
import numpy as np
import cv2
import sys

# In[2]:

detector = cv2.xfeatures2d.SIFT_create()
bf = BloomFilter(10000000, 0.01, None)
train_img = cv2.imread('train.jpg', 0)
query_img = cv2.imread('raw.png', 0)

# In[3]:

T_kp, T_des = detector.detectAndCompute(train_img, None)
Q_kp, Q_des = detector.detectAndCompute(query_img, None)

_, dim = Q_des.shape
LSH_dim = 64
np.random.seed(0)
LSH_random_vectors = np.random.randn(dim, LSH_dim)
powers_of_two = 1 << np.arange(LSH_dim - 1, -1, -1)
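A hedged sketch of the "plain features" step described in the header comment: add each training descriptor's byte representation to the filter, then test the query descriptors against it. It assumes the filter accepts bytes keys (as the related snippets do with tostring()) and that casting the float descriptors to uint8 is an acceptable byte representation; variable names are the ones defined above.

# Hedged matching sketch (assumptions: bytes keys accepted, uint8 cast is the intended byte form).
for t in np.array(T_des, dtype=np.uint8):
    bf.add(t.tobytes())

matches = sum(1 for q in np.array(Q_des, dtype=np.uint8) if q.tobytes() in bf)
print("query descriptors (probably) also in the training image:", matches)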