def run(fname):
    stopws = stopwords.words('english')
    bf = pybloom_live.BloomFilter(capacity=100000, error_rate=0.001)
    with open("data/" + fname, "r") as f:
        start = time.time()
        print("Starting {0}.".format(fname))
        for i, l in enumerate(f):
            reviews_dict = defaultdict(int)
            scores_dict = defaultdict(float)
            count_dict = defaultdict(int)
            revs = json.loads(l)["allReviews"]
            for rev in revs:
                score = rev[0]
                wlist = break_rev(rev[1], stopws, bf)
                for w in wlist:
                    count = r.hget("#counts", w)
                    if count is None:
                        continue
                    if int(count) < 1000 or int(count) > 7000:
                        continue
                    reviews_dict[w] = reviews_dict[w] + 1
                    scores_dict[w] = scores_dict[w] + score
                    count_dict[w] = count_dict[w] + 1
            for key in reviews_dict:
                reviews_dict[key] = reviews_dict[key] / int(r.hget("#counts", key))
            results = [(x[0], x[1], (scores_dict[x[0]] / count_dict[x[0]]), count_dict[x[0]])
                       for x in reviews_dict.items()]
            results.sort(key=lambda x: x[1])
            if 'title' in l:
                print(json.loads(l)["title"])
            print(list(map(lambda x: x[0], results[::-1][:10])))
            if i % 100 == 0:
                print("{2} {0} products, refreshing bf, took {1} seconds since last print"
                      .format(i, time.time() - start, fname))
                bf = pybloom_live.BloomFilter(capacity=100000, error_rate=0.001)
                start = time.time()
            if i == 10:
                break
def initialize_bloom_filter(item, bloom_error_rate):
    md5, tokens = item
    bloom_filter = pybloom_live.BloomFilter(capacity=len(set(tokens)),
                                            error_rate=bloom_error_rate)
    for token in tokens:
        bloom_filter.add(token)
    return (md5, bloom_filter)
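A minimal usage sketch for the helper above. The `(md5, tokens)` pair and its values are made up for illustration; the function sizes the filter to the number of distinct tokens and returns the hash together with the populated filter.

# Hypothetical input: a document hash paired with the tokens extracted from it.
item = ("9e107d9d372bb6826bd81d3542a419d6", ["bloom", "filter", "tokens"])
md5, bf = initialize_bloom_filter(item, bloom_error_rate=0.001)

print("bloom" in bf)    # True: a Bloom filter never reports false negatives
print("missing" in bf)  # False, except with probability ~bloom_error_rate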
def queue_arrange():
    m = 0
    url_total = list_open.url_list_read()
    url_next_num.put(url_total[len(url_total) - 1])
    b = pybloom_live.BloomFilter(capacity=90000000, error_rate=0.01)
    with open('urllist.txt', 'r') as file_object:
        for line in file_object:
            b.add(line.rstrip())
    while 1:
        try:
            url_current = []
            if not url_current_num.empty():
                # Deduplicate the incoming url_current_num queue, then push the
                # unseen entries onto the master list and the url_next_num queue.
                while not url_current_num.empty():
                    url_current.append(url_current_num.get())
                for num1 in url_current:
                    if num1 not in b:
                        # url_total.append(num1)
                        url_next_num.put(num1)
                        b.add(str(num1))
                m = m + 1
                print('URL manager process has run %d times' % m)
                print('The total list has %d elements' % len(url_total))
                # b.sync()
                # time.sleep(2)
        except Exception:
            continue
def run(fname):
    out_file = "wordset/" + fname + "-wordset"
    stopws = set(stopwords.words('english'))
    bf = pybloom_live.BloomFilter(capacity=1000000, error_rate=0.001)
    with open("data/" + fname, "r") as f:
        with open(out_file, "w") as out_f:
            # Timer
            start = time.time()
            print("Starting {0}.".format(fname))
            tokenized_words = set()
            for i, l in enumerate(f):
                d = json.loads(l)
                # Break reviews
                revs = d["allReviews"]
                for rev in revs:
                    for w in break_string(rev[1], stopws, bf):
                        tokenized_words.add(w)
                # Break title
                if "title" in d:
                    for w in break_string(d["title"], stopws, bf):
                        tokenized_words.add(w)
                # Break desc
                if "description" in d:
                    for w in break_string(d["description"], stopws, bf):
                        tokenized_words.add(w)
                # Timer
                if (i + 1) % 100 == 3:
                    bf = pybloom_live.BloomFilter(capacity=1000000, error_rate=0.001)
                    print("{2} {0} products, refreshing bf, took {1} seconds since last print"
                          .format(i, time.time() - start, fname))
                    start = time.time()
            for w in tokenized_words:
                out_f.write(w)
                out_f.write("\n")
def load_bfobj(self):
    """Load the Bloom filter object from file."""
    files_list = os.walk(defaults.BLOOM_FILE_PATH)
    files_list_str = str(list(files_list))
    # Bloom file does not exist --> first run
    if defaults.BLOOM_FILE_NAME not in files_list_str:
        print('init')
        return pybloom_live.BloomFilter(capacity=self.capacity,
                                        error_rate=self.error_rate)
    # Bloom file exists
    with open(defaults.BLOOM_FILE_PATH + defaults.BLOOM_FILE_NAME, 'rb') as fp:
        print('load_bfobj', defaults.BLOOM_FILE_PATH + defaults.BLOOM_FILE_NAME)
        return pybloom_live.BloomFilter.fromfile(fp)
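The snippet above only shows the load side. A minimal sketch of a matching save step, assuming the same `defaults.BLOOM_FILE_PATH` and `defaults.BLOOM_FILE_NAME` settings; `tofile()` is the pybloom_live counterpart of the `fromfile()` call used above, and the method name `save_bfobj` is hypothetical.

def save_bfobj(self, bloom_filter):
    """Persist the Bloom filter so load_bfobj() can restore it on the next run."""
    # Hypothetical counterpart to load_bfobj(); the path settings are assumed.
    with open(defaults.BLOOM_FILE_PATH + defaults.BLOOM_FILE_NAME, 'wb') as fp:
        bloom_filter.tofile(fp)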
def __init__(self, url, deep=4, thread_num=100, url_beyond=100, suspend=5,
             timeout=5, capacity=1000000, error_rate=0.001):
    # Start URL for the crawler
    self._url = url
    if url.endswith('/'):
        self._url = url[:-1]
    # Top-level domain
    self.top_domain = tldextract.extract(url).domain
    # All crawled URLs
    self.__urls_dict = {}
    # Temporary URL storage: {'url': [URLs extracted from that page]}
    self._urls_dict_tmp = {}
    self.urls_dict_tmp = url
    # Crawl depth
    self.deep = deep
    self.bloom = pybloom_live.BloomFilter(capacity=capacity, error_rate=error_rate)
    self._req_tds = []
    # Pause when the pending URL count exceeds this threshold
    self.url_beyond = url_beyond
    # Pause duration (seconds)
    self.suspend = suspend
    # Number of threads
    self.thread_num = thread_num
    # Request timeout (seconds)
    self.timeout = timeout
class Vieclam24hQlSpider(scrapy.Spider):
    name = 'vieclam24h_QL'
    start_urls = [
        'https://vieclam24h.vn/tim-kiem-viec-lam-nhanh/?hdn_nganh_nghe_cap1=&hdn_dia_diem=&hdn_tu_khoa=&hdn_hinh_thuc=&hdn_cap_bac=',
    ]
    ur = pybloom_live.BloomFilter(capacity=2097152, error_rate=0.005)
    collection_name = 'News'
    client = pymongo.MongoClient(settings.MONGO_URI)
    db = client[settings.MONGO_DATABASE]
    collection = db[collection_name]
    Y_in_db = list(
        collection.find({}, {
            "title": 1,
            "company": 1,
            "address": 1,
            "_id": 0
        }))
    no_duplicate_items = 0

    def parse(self, response):
        for tn in response.xpath('//div[@class="list-items "]/div/div/span'):
            src = tn.xpath('a/@href').extract_first()
            src = response.urljoin(src)
            add_url = self.ur.add(src)
            if add_url is False:
                yield scrapy.Request(src, callback=self.parse_src)
        next_pages = response.xpath('//li[@class="next"]/a/@href').extract()
        next_page = next_pages[len(next_pages) - 1]
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_src(self, response):
        self.item = JobItem()
        self.item["url"] = response.request.url
        title = response.xpath(
            '//div[@class="col-xs-12"]/h1[@class="text_blue font28 mb_10 mt_20 fws title_big"]/text()'
        ).extract()
        x_title = title[0]
        self.item["title"] = x_title
        # Company
        company = response.xpath(
            '//p[@class="font16"]//a[@class="text_grey3"]/text()').extract()
        x_company = company[0]
        self.item['company'] = x_company
        # Work location
        addresses = response.xpath(
            '//span[@class="pl_28"]//a[@class="job_value text_pink"]/text()'
        ).extract()
        address = ', '.join([
            address.replace("Việc làm", "").replace("TP.HCM", "Hồ Chí Minh").strip()
            for address in addresses
        ])
        self.item['address'] = address
        # Check for duplicates
        data_need_check = DataReduction(
            3, [[job['title'], job['company'], job['address']]
                for job in self.Y_in_db])
        if data_need_check.is_match([x_title, x_company, address]):
            self.no_duplicate_items += 1
            print(self.no_duplicate_items)
            return
        # Salary
        salary = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Mức lương")]/span/text()'
        ).extract()
        if len(salary) > 0:
            salary_str = " ".join(salary)
            salary_need_normalize = Normalize_salary()
            salary_normalized = salary_need_normalize.normalize_salary(salary_str)
            self.item["salary"] = salary_normalized
        else:
            self.item["salary"] = 'Thỏa thuận'
        # Experience
        experience = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Kinh nghiệm")]/span/text()'
        ).extract()
        if len(experience) > 0:
            self.item["experience"] = experience[0]
        else:
            self.item["experience"] = 'Không yêu cầu'
        # Degree requirement
        diploma = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Yêu cầu bằng cấp")]/span/text()'
        ).extract()
        if len(diploma) > 0:
            self.item["diploma"] = diploma[0]
        else:
            self.item['diploma'] = 'Không yêu cầu'
        # Number of openings
        amount = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Số lượng cần tuyển")]/span/text()'
        ).extract()
        if len(amount) > 0:
            self.item["amount"] = amount[0]
        else:
            self.item['amount'] = 'Không yêu cầu'
        # Career field
        career = response.xpath(
            '//div[@class="line-icon mb_12"]//h2[contains(text(),"Ngành nghề")]//a/text()'
        ).extract()
        career_need_nomarlize = Normalize_careers()
        career_normalized = career_need_nomarlize.normalize_careers(career)
        self.item["career"] = career_normalized
        # Position
        position = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Chức vụ")]/span/text()'
        ).extract()
        if len(position) > 0:
            self.item["position"] = position[0]
        else:
            self.item['position'] = 'Không yêu cầu'
        # Employment type (full-time/part-time)
        category = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Hình thức làm việc")]/span/text()'
        ).extract()
        if len(category) > 0:
            self.item["category"] = category[0]
        else:
            self.item['category'] = 'Không yêu cầu'
        # Probation period
        trial_time = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Thời gian thử việc")]/span/text()'
        ).extract()
        if len(trial_time) > 0:
            self.item["trial_time"] = trial_time[0]
        else:
            self.item['trial_time'] = 'Không yêu cầu'
        # Gender requirement
        sex = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Yêu cầu giới tính")]/span/text()'
        ).extract()
        if len(sex) > 0:
            self.item["sex"] = sex[0]
        else:
            self.item['sex'] = 'Không yêu cầu'
        # Age requirement
        age = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Yêu cầu độ tuổi")]/span/text()'
        ).extract()
        if len(age) > 0:
            self.item["age"] = age[0]
        else:
            self.item['age'] = 'Không yêu cầu'
        # Job description
        description = response.xpath(
            '(//div[@id="ttd_detail"]//div[@class="item row"])[1]//p[@class="col-md-9 pr_0 mb_0 word_break"]/text()'
        ).extract()
        self.item["description"] = " ".join(
            [des.strip() for des in description])
        # Benefits
        benefits = response.xpath(
            '(//div[@id="ttd_detail"]//div[@class="item row"])[2]//p[@class="col-md-9 pr_0 mb_0 word_break"]/text()'
        ).extract()
        self.item["benefits"] = " ".join(
            [benefit.strip() for benefit in benefits])
        # Other requirements
        require_skills = response.xpath(
            '(//div[@id="ttd_detail"]//div[@class="item row"])[3]//p[@class="col-md-9 pr_0 mb_0 word_break"]/text()'
        ).extract()
        self.item["require_skill"] = " ".join(
            [require_skill.strip() for require_skill in require_skills])
        # Contact information
        per_contact = response.xpath(
            '(//div[@class="job_description bg_white pl_24 pr_24 mt_16 pb_18 box_shadow"]//div[@class="item row pt_14 pb_14"])[1]//p[@class="col-md-9 pr_0 mb_0"]/text()'
        ).extract()
        add_contact = response.xpath(
            '(//div[@class="job_description bg_white pl_24 pr_24 mt_16 pb_18 box_shadow"]//div[@class="item row pt_14 pb_14"])[2]//p[@class="col-md-9 pr_0 mb_0"]/text()'
        ).extract()
        contact = (u"Người liên hệ: " + per_contact[0].strip() +
                   u" Địa chỉ liên hệ: " + add_contact[0].strip())
        self.item["contact"] = contact
        # Application deadline
        expired = response.xpath(
            '(//span[@class="text_pink"])[1]/text()').extract()
        if len(expired) > 0:
            self.item["expired"] = expired[0]
        # Posting date
        created = response.xpath(
            '(//p[@class="text_grey2 font12 mt8 mb12"]//span)[3]/text()'
        ).extract()
        if len(created) > 0:
            created_at = created[0][14:]
            self.item["created"] = created_at
        if self.item["title"] != "":
            yield self.item
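Both spiders in this listing rely on the return value of `BloomFilter.add()`: it returns True when the element was (probably) already present and False when it was newly inserted, so `if add_url is False:` means "this URL has not been seen yet, schedule it". A standalone illustration:

import pybloom_live

seen = pybloom_live.BloomFilter(capacity=1000, error_rate=0.005)

for url in ["https://example.com/a", "https://example.com/b", "https://example.com/a"]:
    already_seen = seen.add(url)  # True if (probably) seen before, False if newly added
    if already_seen is False:
        print("schedule crawl:", url)  # only the first /a and /b are scheduled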
class CareerbuilderSpider(scrapy.Spider):
    name = 'careerbuilder'
    start_urls = ['https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html/']
    ur = pybloom_live.BloomFilter(capacity=2097152, error_rate=0.005)
    collection_name = 'News'
    # Y_in_db = []
    client = pymongo.MongoClient(settings.MONGO_URI)
    db = client[settings.MONGO_DATABASE]
    collection = db[collection_name]
    Y_in_db = list(collection.find({}, {"title": 1, "company": 1, "address": 1, "_id": 0}))
    no_duplicate_items = 0

    def parse(self, response):
        for tn in response.xpath('//h3[@class="job"]'):
            src = tn.xpath('a/@href').extract_first()
            src = response.urljoin(src)
            add_url = self.ur.add(src)
            if add_url is False:
                yield scrapy.Request(src, callback=self.parse_src)
        next_pages = response.xpath('//a[@class="right"]/@href').extract()
        next_page = next_pages[len(next_pages) - 1]
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_src(self, response):
        self.item = JobItem()
        self.item["url"] = response.request.url
        # Career field
        career = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Ngành nghề')]/following-sibling::b/a/text()").extract()
        print(career)
        career_need_nomarlize = Normalize_careers()
        career_normalized = career_need_nomarlize.normalize_careers(career)
        self.item["career"] = career_normalized
        # Title
        title = response.xpath('//div[@class="top-job-info"]/h1/text()').extract()
        x_title = title[0]
        self.item["title"] = x_title
        # Company
        company = response.xpath('//div[@class="tit_company"]/text()').extract()
        x_company = company[0]
        self.item["company"] = x_company
        # Work location
        address = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Nơi làm việc')]/following-sibling::b/a/text()").extract()
        add = ", ".join(address)
        self.item["address"] = add
        # Check for duplicates
        data_need_check = DataReduction(
            3, [[job['title'], job['company'], job['address']] for job in self.Y_in_db])
        if data_need_check.is_match([x_title, x_company, add]):
            self.no_duplicate_items += 1
            print(self.no_duplicate_items)
            return
        # Salary
        salary = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Lương')]/following-sibling::label/text()").extract()
        if len(salary) > 0:
            salary_str = " ".join(salary)
            salary_need_normalize = Normalize_salary()
            salary_normalized = salary_need_normalize.normalize_salary(salary_str)
            self.item["salary"] = salary_normalized
        else:
            self.item["salary"] = 'Thỏa thuận'
        # Experience
        experience = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Kinh nghiệm')]/../text()").extract()
        if len(experience) > 0:
            self.item["experience"] = experience[0].strip()
        else:
            self.item["experience"] = "Không yêu cầu"
        # Job description
        descriptions = response.xpath("//div[@class='MarBot20']/h4[contains(text(),'Mô tả Công việc')]/following-sibling::div[@class='content_fck']//text()").extract()
        self.item["description"] = ' '.join(
            [description.replace('-', '').strip() for description in descriptions])
        # Other information: degree, age, and employment type
        info_others = response.xpath("//div[@class='MarBot20']/h4[contains(text(),'Thông tin khác')]/following-sibling::div[@class='content_fck']/ul/li//text()").extract()
        # Degree requirement
        diploma = [info_other.strip() for info_other in info_others
                   if "bằng cấp" in info_other.lower() or "tốt nghiệp" in info_other.lower()] + \
                  [description.strip() for description in descriptions
                   if "bằng cấp" in description.lower() or "tốt nghiệp" in description.lower()]
        if len(diploma) > 0:
            self.item['diploma'] = diploma[0].split(':')[-1].strip()
        else:
            self.item['diploma'] = 'Không yêu cầu'
        # Number of openings (not provided on this site)
        amount = ""
        self.item["amount"] = amount
        # Position
        position = response.xpath('(//p[@class="fl_right"])[1]//label/text()').extract()
        if len(position) > 0:
            self.item["position"] = position[0]
        # Employment type (full-time/part-time)
        category = [info_other.strip() for info_other in info_others if "hình thức" in info_other.lower()] + \
                   [description.strip() for description in descriptions if "hình thức" in description.lower()]
        if len(category) > 0:
            self.item['category'] = category[0].split(':')[-1].strip()
        else:
            self.item['category'] = 'Không yêu cầu'
        # Probation period
        trial_time = [info_other.strip() for info_other in info_others if "thời gian thử việc" in info_other.lower()]
        if len(trial_time) > 0:
            self.item["trial_time"] = trial_time
        else:
            self.item['trial_time'] = 'Không yêu cầu'
        # Gender requirement
        sex_male = "Nam" if len([info_other for info_other in info_others if "nam" in info_other.lower()]) > 0 else ""
        sex_female = "Nữ" if len([info_other for info_other in info_others if "nữ" in info_other.lower()]) > 0 else ""
        sex = sex_male + ("", "/")[sex_male != "" and sex_female != ""] + sex_female
        if sex == "":
            self.item['sex'] = "Không yêu cầu"
        else:
            self.item['sex'] = sex.strip()
        # Age requirement
        ages = [other.strip() for other in info_others if "tuổi" in other] + \
               [description.strip() for description in descriptions if "tuổi" in description]
        if len(ages) > 0:
            self.item["age"] = ages[0].split(":")[-1].strip()
        else:
            self.item["age"] = 'Không yêu cầu'
        # Benefits
        benefits = response.xpath("//div[@class='MarBot20 benefits-template']/h4[contains(text(),'Phúc lợi')]/following-sibling::ul/li/text()").extract()
        self.item["benefits"] = ', '.join(benefits).strip()
        # Other requirements
        require_skills = response.xpath("//div[@class='MarBot20']/h4[contains(text(),'Yêu Cầu Công Việc')]/following-sibling::div[@class='content_fck']//text()").extract()
        self.item["require_skill"] = " ".join([skill.replace('-', '').strip() for skill in require_skills])
        # Contact information
        per_contact = response.xpath('(//p[@class="TitleDetailNew"]//label)[3]//strong/text()').extract()
        add_contact = response.xpath('(//p[@class="TitleDetailNew"]//label)[2]/text()').extract()
        if len(per_contact) > 0:
            pers_contact = re.sub(r'<.*?>', ' ', per_contact[0])
            pers_contact = re.sub(r'\n', ' ', pers_contact)
            pers_contact = re.sub(r'\r', ' ', pers_contact)
        else:
            pers_contact = ""
        if len(add_contact) > 0:
            addr_contact = re.sub(r'<.*?>', ' ', add_contact[0])
            addr_contact = re.sub(r'\n', ' ', addr_contact)
            addr_contact = re.sub(r'\r', ' ', addr_contact)
        else:
            addr_contact = ""
        # contact = pers_contact + "\n" + addr_contact
        contact = (u"Người liên hệ: " + pers_contact.strip() +
                   u" Địa chỉ liên hệ: " + addr_contact.strip())
        self.item["contact"] = contact
        # Application deadline
        expired = response.xpath('(//p[@class="fl_right"])[3]/text()').extract()
        if len(expired) > 0:
            self.item["expired"] = expired[0]
        # Posting date
        created = response.xpath('//div[@class="datepost"]//span/text()').extract()
        if len(created) > 0:
            self.item["created"] = created[0]
        if self.item["title"] is not None:
            yield self.item
import pybloom_live
import requests

if __name__ == '__main__':
    texts = requests.get('http://www.gutenberg.org/files/2852/2852-0.txt')
    bloom = pybloom_live.BloomFilter(capacity=10000, error_rate=0.1)
    for each in texts.text.split():
        bloom.add(each)
    print(len(bloom))
    print('the' in bloom)
    print('luo_wei_' not in bloom)
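A fixed-capacity `pybloom_live.BloomFilter` raises `IndexError` once more unique elements than its capacity have been added, so `capacity=10000` only works here if the text contains at most that many distinct whitespace-separated tokens. When the element count is not known up front, the library's `ScalableBloomFilter` grows automatically; a minimal sketch:

import pybloom_live

# ScalableBloomFilter adds internal filters as needed, so the total number of
# elements does not have to be known in advance.
sbf = pybloom_live.ScalableBloomFilter(
    initial_capacity=1000,
    error_rate=0.1,
    mode=pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH)

for word in ["the", "quick", "brown", "fox"] * 500:
    sbf.add(word)

print(len(sbf))        # number of distinct elements added (4 here)
print("the" in sbf)    # True
print("zebra" in sbf)  # False, except for the configured false-positive rate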
request_headers = {
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
}

city_home_pages = []
city_ids = []
dirname = 'mafengwo_notes/'

# Create the Bloom filter
download_bf = pybloom_live.BloomFilter(1024 * 1024 * 16, 0.01)


def download_city_notes(id):
    # Walk every listing page of travel notes for this city and download each one.
    for i in range(1, 999):
        url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i)
        if url in download_bf:
            continue
        print("open url %s" % url)
        download_bf.add(url)
        req = urllib.request.Request(url, headers=request_headers)
        response = urllib.request.urlopen(req)
        htmlcontent = response.read().decode("utf-8")
        city_notes = re.findall(r'href="/i/\d{7}.html', htmlcontent)
        # If this listing page contains zero notes, the 1-0-xxx.html pages for
        # this city have been exhausted; stop processing this city.