def parse_item(self, response):
    """Parse a 51job detail page into a Job51Item.

    Fixes:
    - requirement text was accumulated on ``self.str``, so text from every
      previously parsed response leaked into each later item; a local
      accumulator is used instead;
    - ``eval(field)`` executed field names as code to resolve locals; a
      plain ``locals()`` lookup does the same without eval.
    """
    item = Job51Item()
    city_name = self.city
    jobname = response.xpath('//div[@class="cn"]/h1/@title')[0].extract()
    # Header line looks like "position | experience | education | ... | date".
    header = response.xpath('//div[@class="cn"]/p/@title')[0].extract()
    parts = header.split('|')
    position = parts[0].strip('\xa0\xa0')
    workingExp = parts[1].strip('\xa0\xa0')
    eduLevel = parts[2].strip('\xa0\xa0')
    salary = response.xpath('//div[@class="cn"]/strong/text()')[0].extract()
    company_name = response.xpath('//div[@class="cn"]/p/a/@title')[0].extract()
    update_time = parts[4].strip('\xa0\xa0')
    require = response.xpath(
        '//div[@class="bmsg job_msg inbox"]/p/text()').extract()
    # Local accumulator: no cross-request state kept on the spider.
    # (Each paragraph's first two characters are dropped, as before.)
    job_require = ''.join(i[2:] for i in require)
    # Stable id derived from company name + update time.
    sha1 = hashlib.sha1()
    sha1.update((company_name + '' + update_time).encode('utf8'))
    hash_id = sha1.hexdigest()
    # Fill every declared item field from the local variable of the same name.
    values = locals()
    for field in item.fields.keys():
        item[field] = values[field]
    yield item
def parse(self, response):
    """Parse one 51job search-result page and follow pagination.

    Fix: ``time.sleep(3)`` inside a callback blocks Scrapy's event loop
    (stalling every concurrent request, not just this spider's), so it was
    removed.  Configure throttling with the DOWNLOAD_DELAY setting instead.
    """
    nodes = response.xpath('//div[@class="dw_table"]/div[@class="el"]')
    for node in nodes:
        item = Job51Item()
        item['jobname'] = node.xpath('./p//a/@title').extract_first()
        item['company'] = node.xpath(
            './span[@class="t2"]/a/text()').extract_first()
        item['location'] = node.xpath('./span[@class="t3"]/text()').extract_first()
        item['salary'] = node.xpath('./span[@class="t4"]/text()').extract_first()
        yield item
    # The last "bk" link is the "next page" anchor.
    next_url = response.xpath('//li[@class="bk"]/a/@href').extract()
    self.page += 1
    print("51job page:" + str(self.page))
    if next_url:
        url = response.urljoin(next_url[-1])
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
    else:
        print("退出")
def detailParse(self, response):
    """Extract one job's detail fields; any missing field becomes "暂无"."""
    item = Job51Item()
    print("可以获取详情了")
    base = '/html/body/div[3]/div[2]'
    # Fields read from a single node via extract_first().
    single_fields = {
        'company': base + '/div[2]/div/div[1]/p[1]/a[1]/@title',
        'workname': base + '/div[2]/div/div[1]/h1/@title',
        'salary': base + '/div[2]/div/div[1]/strong/text()',
        'add': base + '/div[3]/div[2]/div/p/text()',
    }
    # Fields built by joining all matching text nodes.
    joined_fields = {
        'workdetail': base + '/div[3]/div[1]/div/p/text()',
        'require': base + '/div[2]/div/div[1]/p[2]/text()',
    }
    for key, xp in single_fields.items():
        value = response.xpath(xp).extract_first()
        item[key] = value if value else "暂无"
    for key, xp in joined_fields.items():
        value = "".join(response.xpath(xp).extract())
        item[key] = value if value else "暂无"
    yield item
def parse_item(self, response):
    """Scrape a company profile page: url, name, address and description."""
    item = Job51Item()
    item['company_url'] = response.url
    # The header div carries class "in " or "in img_on" depending on the logo.
    name_nodes = response.xpath(
        '//div[@class="in "]/h1/@title | //div[@class="in img_on"]/h1/@title'
    )
    item['company_name'] = name_nodes.extract()[0]
    address_parts = response.xpath('//p[@class="fp"]/text()').extract()
    item['company_address'] = "".join(address_parts).strip().replace(" ", "")
    info_parts = response.xpath('//div[@class="in"]/p/text()').extract()
    # Collapse whitespace and non-breaking spaces; remove this
    # post-processing if line breaks are needed for page display.
    info_text = "".join(info_parts).strip().replace(" ", "").replace('\xa0', '')
    item['company_info'] = info_text
    yield item
def joblist(self, response):
    """Yield a detail-page request for each job row on a result page.

    Fix: both bare ``except:`` clauses silently swallowed every exception
    (including KeyboardInterrupt/SystemExit); they now catch only
    IndexError, which is what ``extract()[0]`` raises when the selector
    matches nothing.
    """
    jobs = response.xpath("//div[@id='resultList']/div")
    for job in jobs:
        try:
            item = Job51Item()
            item['jobname'] = job.xpath(
                './p[@class="t1 "]//a/@title').extract()[0]
            item['joblink'] = job.xpath(
                './p[@class="t1 "]//a/@href').extract()[0]
            item['company'] = job.xpath(
                './span[@class="t2"]//a/@title').extract()[0]
            item['place'] = job.xpath(
                './span[@class="t3"]/text()').extract()[0]
            try:
                item['salary'] = job.xpath(
                    './span[@class="t4"]/text()').extract()[0]
            except IndexError:
                # Salary column can be empty: negotiable.
                item['salary'] = '面谈'
            yield scrapy.Request(item['joblink'],
                                 callback=self.jobinfo,
                                 meta={'item': item})
        except IndexError:
            # Header/ad rows lack these nodes; log and skip them.
            print('joblist error', response.url)
def parse_item(self, response):
    """Populate a Job51Item from a job detail page via ItemLoader."""
    loader = ItemLoader(item=Job51Item(), response=response)
    loader.add_value('Job_url', response.url)
    # Field -> source xpath; values are post-processed in the item pipeline.
    field_xpaths = [
        ('Job_name', '//div[@class="cn"]/h1/@title'),
        ('Job_location', '//span[@class="lname"]/text()'),
        ('Job_salary', '//div[@class="cn"]/strong/text()'),
        ('Company_name', '//p[@class="cname"]/a/@title'),
        ('Company_type', '//p[@class="msg ltype"]/text()'),
        ('Require_exp', '//div[@class="jtag inbox"]//span[@class="sp4"]/text()'),
        ('Location', '//p[@class="fp"]/text()'),
    ]
    for field, xp in field_xpaths:
        loader.add_xpath(field, xp)
    return loader.load_item()
def parse(self, response):
    """Follow each job link on a result page, then queue the remaining pages.

    Fix: a single Job51Item was created before the loop and mutated on
    every iteration while also being passed by reference in ``meta`` — all
    pending requests shared (and overwrote) one item.  A fresh item is now
    created per link.
    """
    for data in response.xpath('//div[@class="el"]/p'):
        item = Job51Item()
        item['url'] = data.xpath('.//a/@href').extract_first()
        yield scrapy.Request(url=item['url'],
                             meta={'item': item},
                             callback=self.get_info)
    text = response.xpath(
        '//div[@class="p_in"]//span[@class="td"]/text()').extract_first()
    # e.g. "共53页" -> 53 pages total.
    maxnum = int(re.findall(r'(\d+)', text)[0])
    for i in range(2, maxnum + 1):
        # NOTE(review): ``key`` resolves globally here — presumably the
        # module-level search keyword; verify it is defined.
        next_url = self.base_url.format(key=key, page=i)
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Parse a result page: yield detail-page requests and follow page links.

    Fix: ``extract_first()`` returns None for rows without a detail link,
    and ``scrapy.Request(url=None)`` raises; such rows are now skipped.
    """
    node_list = response.xpath('//*[@id="resultList"]/div[@class="el"]')
    next_page = response.xpath('//div[@class="p_in"]/ul/li/a/@href').extract()
    for node in node_list:
        detail_link = node.xpath('./p/span/a/@href').extract_first()
        if not detail_link:
            continue  # header/ad row without a job link
        item = Job51Item()
        item['position_name'] = node.xpath('./p/span/a/@title').extract_first()
        item['company'] = node.xpath('./span[1]/a/@title').extract_first()
        item['work_address'] = node.xpath('./span[2]/text()').extract_first()
        item['salary'] = node.xpath('./span[3]/text()').extract_first()
        item['publishtime'] = node.xpath('./span[4]/text()').extract_first()
        yield scrapy.Request(url=detail_link,
                             callback=self.parse_detail,
                             meta={'item': item})
    for url in next_page:
        yield scrapy.Request(url=url, callback=self.parse)
def parseContent(self, response):
    """Parse a job detail page with BeautifulSoup into a Job51Item.

    NOTE(review): this is Python-2 style code — ``encode('utf-8')`` on
    extracted text followed by str operations like ``fbsj_temp.find('发布')``
    and ``str + bytes`` concatenation would fail on Python 3; confirm the
    target interpreter before reusing.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    # Header block of the posting (title / place / salary / company).
    content = soup.find('div', class_='tHjob').find('div', class_='cn')
    item = Job51Item()
    item['zwmc'] = content.find('h1')['title'].encode('utf-8')  # job title
    item['gzdd'] = content.find('span', class_='lname').get_text().encode('utf-8')  # work place
    item['gzxz'] = content.find('strong').get_text().encode('utf-8')  # salary
    item['gsmc'] = content.find(
        'p', class_='cname').find('a')['title'].encode('utf-8')  # company name
    item['gslx'] = content.find(
        'p', class_='ltype').get_text().encode('utf-8').split('|')[0]  # company type
    # Requirement tags; the <em> class (i1..i4) marks which kind each span is.
    content1 = soup.find('div', class_='tCompany_main').find(
        'div', class_='jtag').find('div', class_='t1').find_all('span')
    # NOTE(review): each loop below resets its field to '' on every
    # non-matching span until a match breaks out, so the field ends up ''
    # only when nothing matches — but it stays entirely unset when
    # content1 is empty; verify downstream tolerates a missing key.
    for c in content1:  # i1: work experience
        if c.find('em', class_='i1') != None:
            item['gzjy'] = c.get_text().encode('utf-8')
            break
        item['gzjy'] = ''
    for c in content1:  # i2: minimum education
        if c.find('em', class_='i2') != None:
            item['zdxl'] = c.get_text().encode('utf-8')
            break
        item['zdxl'] = ''
    for c in content1:  # i3: headcount to hire
        if c.find('em', class_='i3') != None:
            item['zprs'] = c.get_text()
            break
        item['zprs'] = ''
    for c in content1:  # i4: publish date, trailing "发布" stripped off
        if c.find('em', class_='i4') != None:
            fbsj_temp = c.get_text().encode('utf-8')
            item['fbsj'] = fbsj_temp[:fbsj_temp.find('发布')].encode('utf-8')
            break
        item['fbsj'] = ''
    # Welfare tags, joined with "," ('' when the block is absent).
    content2 = soup.find('div', class_='tCompany_main').find('p', class_='t2')
    fldy = ''
    if content2 != None:
        spans = content2.find_all('span')
        for c in spans:
            fldy += (c.get_text() + ','.encode('utf-8'))
    item['fldy'] = fldy
    return item
def process_item(self, item, spider):
    """Append one job record as a tab-separated line to today's title file.

    Tab is the field separator so the record can be split in Hadoop;
    because dict field order is not fixed, the fields are listed
    explicitly in the required order.
    """
    data = Job51Item(item)
    columns = [
        data["title"],
        data["salary"],
        data["place"],
        data["experience"],
        data["education"],
        data["need_persons"],
        data["publish_date"],
        # Collapse runs of whitespace in the skill text so it stays on
        # one line and contains no stray tabs.
        re.sub(r"\s+", " ", data["need_skill"]).strip(),
    ]
    output_data = "\t".join(columns) + "\n"
    # One file per job title per day.
    now_day = str(datetime.datetime.now().date())
    file_path = ("D:/Code/GraduationProject/files/" + now_day + "_" +
                 str(data["title"]) + ".txt")
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(output_data)
def parse(self, response):
    """Parse the JSON search-result blob 51job embeds in the page script."""
    matches = re.findall(r"window.__SEARCH_RESULT__ = ([\s\S]*?)</script",
                         response.text, re.S)
    if len(matches) <= 0:
        return
    result = json.loads(matches[0])
    # item field -> key in each search-result record
    field_keys = [
        ('job_name', 'job_name'),
        ('salary', 'providesalary_text'),
        ('update_date', 'updatedate'),
        ('company_name', 'company_name'),
        ('company_type', 'companyind_text'),
        ('work_address', 'workarea_text'),
        ('company_size', 'companytype_text'),
        ('job_href', 'job_href'),
    ]
    for each_job in result['engine_search_result']:
        item = Job51Item()
        for field, key in field_keys:
            item[field] = each_job[key]
        item['welfare'] = ";".join(each_job['jobwelf_list'])
        yield Request(url=each_job['job_href'],
                      callback=self.parse_detail,
                      meta={'item': item})
def real_data(self, response):
    """Extract a full job record from a detail page.

    Fix: ``extract_first().strip()`` raised AttributeError whenever the
    selector matched nothing (extract_first returns None); a default of
    '' is now supplied before stripping.
    """
    item = Job51Item()
    item['url'] = response.url
    item['title'] = response.xpath('//h1/@title').extract_first()
    item['location'] = response.xpath(
        '//div[@class="cn"]/p[2]/text()[1]').extract_first(default='').strip()
    item['company_name'] = response.xpath(
        '//div[@class="cn"]/p/a[1]/text()').extract_first(default='').strip()
    item['salary'] = response.xpath(
        '//div[@class="cn"]/strong/text()').extract_first()
    item['company_info'] = response.xpath(
        '//div[@class="com_tag"]/p/text()').extract()
    item['experience'] = response.xpath(
        '//div[@class="cn"]/p[2]/text()[2]').extract_first(default='').strip()
    job_info = response.xpath(
        '//div[@class="bmsg job_msg inbox"]/p/text()|//div[@class="bmsg job_msg inbox"]/text()').extract()
    item['job_info'] = "".join(job_info).strip()
    address = response.xpath(
        '//div[@class="bmsg inbox"]/p[@class="fp"]/text()').extract()
    # Flatten the address onto a single line.
    item['address'] = "".join(address).replace('\r', '').replace('\n', '').replace('\t', '')
    yield item
def parse_job_info(self, response):
    """Yield one item per job row on a result page.

    Fix: a single Job51Item was created once and then mutated/yielded in
    the loop, so every yielded reference pointed at the same object and
    later rows overwrote earlier ones downstream.  One item per row now.
    """
    # Skip the first rows, which are the table header / ads.
    for echo in response.xpath('//div[@class="el"]')[4:]:
        item = Job51Item()
        item["position_name"] = echo.xpath(
            './p/span/a/@title').extract_first()
        item["company"] = echo.xpath(
            './span[@class="t2"]/a/text()').extract_first()
        item["address"] = echo.xpath(
            './span[@class="t3"]/text()').extract_first()
        item["salary"] = echo.xpath(
            './span[@class="t4"]/text()').extract_first()
        item["time"] = echo.xpath(
            './span[@class="t5"]/text()').extract_first()
        yield item
    # Re-queue the same URL through self.parse (dont_filter bypasses the
    # dupe filter).  NOTE(review): the URL is unchanged — confirm
    # self.parse advances pagination, otherwise this re-crawls forever.
    yield scrapy.Request(url=response.url, callback=self.parse,
                         meta={}, dont_filter=True)
def parse_url(self, responses):
    """Parse one job detail page into a Job51Item.

    Fixes:
    - the xpath ``contains(class, msg)`` compared two *element* node
      tests, never the class attribute, so the requirements line was
      never matched; it is now ``contains(@class, "msg")``;
    - after the ``except`` branch the function fell through and crashed
      with NameError on the variables the failed block never assigned;
      it now returns early instead;
    - the error log used ``response.response.url`` (a Selector has no
      ``response`` attribute); it now uses ``responses.url``.
    """
    response = Selector(responses)
    head = response.xpath(r'.//div[@class="cn"]')
    no = Parse_ele(head)
    # The title is replaced with the search keyword from the request meta.
    title = responses.meta["search_key"]
    salary = no.xpath_no(r'./strong/text()')
    salary = self.changeSalary(salary)
    # "place | experience | education | headcount | publish-date" line.
    need = no.xpath_no(r'./p[contains(@class, "msg")]/@title')
    needs = str(need).split('|')
    try:
        place = needs[0].split('-')[0].strip()
        education = "缺失"
        experience = "缺失"
        need_persons = "缺失"
        publish_date = "缺失"
        for n in needs[1:]:
            if "经验" in n:
                experience = n.strip()
            elif "人" in n:
                need_persons = n.strip()
            elif "发布" in n:
                publish_date = n.strip()
            else:
                education = n.strip()
        need_skill = response.xpath(
            r'.//div[@class="bmsg job_msg inbox"]//text()').extract()
    except Exception as e:
        print("信息获取有误!", e, responses.url, sep=",")
        return
    needs_skill = "".join(x for x in need_skill if x.strip() != '')
    item = Job51Item(title=title, salary=salary, place=place,
                     experience=experience, education=education,
                     need_persons=need_persons, publish_date=publish_date,
                     need_skill=needs_skill)
    yield item
def parse(self, response):
    """Regex-scrape the result list, then enrich each row with details."""
    # The page is GBK-encoded; decode the raw body ourselves.
    htmlstr = response.body.decode('GBK')
    # Groups: title, link, company, place, salary, publish date.
    row_pattern = re.compile(
        r'class="t1.*?title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
        re.S)
    cleanup = re.compile(r'<.*?>| |\t|\n|\r|\s;')
    for title, link, company, place, salary, _ in row_pattern.findall(htmlstr):
        item = Job51Item()
        item['jobname'] = title
        item['companyname'] = company
        item['jobadd'] = place
        item['jobsalary'] = salary
        # Detail rows: (experience, education, headcount, body html).
        for detail in self.getjobcontent(link):
            content = cleanup.sub('', detail[3])
            item['jobcontent'] = content.replace(r"\t", "").replace(r"\r", "")
            item['jobexperience'] = detail[0]
            item['education'] = detail[1]
            item['peoplepnumber'] = detail[2]
            yield item
def parse(self, response):
    """Parse the mobile list page, follow detail pages and pagination.

    Fixes:
    - a single item was mutated across the loop while also passed by
      reference in ``meta``, so all pending detail requests shared one
      object; a fresh item is created per row;
    - ``next_url.find(...)`` raised AttributeError when no "next" link
      exists (extract_first returns None); the check is now None-safe and
      the double-negative ``not ... >= 0`` is written as ``not in``.
    """
    # Start with an empty item so the trailing ``yield item`` below keeps
    # its original behavior even when the page has no job rows.
    item = Job51Item()
    job_list = response.xpath(
        "//div[@id='pageContent']/div[@class='items']/a")
    for job_one in job_list:
        item = Job51Item()
        item["jobHref"] = job_one.xpath("./@href").extract_first()  # detail url
        item["jobErea"] = job_one.xpath("./i/text()").extract_first()  # work place
        item["jobCompany"] = job_one.xpath(
            "./aside/text()").extract_first()  # company name
        item["jobSalary"] = job_one.xpath("./em/text()").extract_first()  # salary
        yield scrapy.Request(
            item["jobHref"],
            callback=self.parse_detail,
            meta={"item": item})
    # Follow the next page unless the link is a javascript:void(0) stub.
    next_url = response.xpath(
        "//div[@id='pageContent']/form[@id='turnpage']/div[@class='paging']/a[@class='next']/@href"
    ).extract_first()
    if next_url and "javascript" not in next_url:
        yield scrapy.Request(next_url, callback=self.parse)
    # NOTE(review): this re-yields the last row's item (already sent via
    # meta above); confirm the duplicate yield is intended.
    yield item
def detail_parse(self, response):
    """Parse a 51job detail page into a Job51Item.

    NOTE(review): this is Python-2 code (``'经验'.decode('utf8')`` fails on
    Python 3); confirm the target interpreter.

    Fixes:
    - ``posi_info.remove(i)`` mutated the list while iterating it, which
      skips the element following every removal; filtering now builds a
      new list;
    - ``i.strip()`` discarded its return value (str.strip is not
      in-place), so nothing was ever stripped; the stripped value is now
      kept.
    """
    # Return the first match or '' when the selector finds nothing.
    ifexists = lambda x: x[0] if x else ''
    job = Job51Item()
    # Job title
    job['name'] = response.xpath(
        '//div[@class="tHeader tHjob"]//h1//text()').extract()[0]
    # Company name
    job['co_name'] = response.xpath(
        '//p[@class="cname"]/a//text()').extract()[0]
    # Area
    job['area'] = response.xpath(
        '//div[@class="tHeader tHjob"]//span/text()').extract()[0]
    # Salary
    job['salary'] = ifexists(
        response.xpath(
            '//div[@class="tHeader tHjob"]//strong/text()').extract())
    # Requirement tags: classify each into experience / education /
    # headcount / publish time; everything else accumulates into "other".
    otherq = ''
    all_require = response.xpath(
        '//div[@class="tBorderTop_box bt"]//div[@class="t1"]/span/text()'
    ).extract()
    for require in all_require:
        if '经验'.decode('utf8') in require:
            job['exp'] = require
        elif require in self.edu_type:
            job['edu'] = require
        elif '人'.decode('utf8') in require:
            job['num'] = require
        elif '发布'.decode('utf8') in require:
            job['time'] = require
        else:
            otherq = otherq + require + ' '
    job['otherq'] = otherq
    # Welfare tags
    welfare = ' '
    fuli = response.xpath(
        '//div[@class="tBorderTop_box bt"]//p[@class="t2"]/span/text()'
    ).extract()
    for f in fuli:
        welfare = welfare + f + ' '
    job['welfare'] = welfare
    # Job description: drop boilerplate lines, strip the rest.
    posi_info = response.xpath(
        '//div[@class="tBorderTop_box"][1]//div[@class="bmsg job_msg inbox"]//text()'
    ).extract()
    posi_info = [i.strip() for i in posi_info if i not in self.unrequire]
    job['info'] = ' '.join(posi_info)
    # Work address
    job['local'] = ifexists(
        response.xpath(
            '//div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]//p/text()[2]'
        ).extract())
    # Company url
    job['co_url'] = response.xpath(
        '//div[@class="tHeader tHjob"]//p[@class="cname"]/a/@href'
    ).extract()[0]
    # Company type: normalize "a | b | c" into "a|b|c|".
    str1 = response.xpath(
        '//div[@class="tHeader tHjob"]//p[@class="msg ltype"]/text()'
    ).extract()[0]
    strtotal = ''
    for s in str1.split('|'):
        strtotal = strtotal + s.strip() + '|'
    job['co_type'] = strtotal
    yield job