def parse_position_page(self, response):
    """Parse a job-detail page and yield one populated LiepinItem.

    Page-scraped fields: position title, requirement text, benefit tags
    and (optionally) the company link.  The remaining fields are carried
    over from ``response.meta`` set by the listing-page callback.
    """
    position = response.xpath(
        "//div[contains(@class,'title-info')]/h1/text()").extract()[0]
    requirement = '\n'.join(
        response.xpath(
            "//div[contains(@class,'main-message')][1]/div/text()").extract())
    benefit = ','.join(
        response.xpath(
            "//div[contains(@class,'tag-list')]/span/text()").extract())
    # The company link is absent on some layouts; default to ''.
    # (Was a bare `except:`; only an empty extract() can fail here, so
    # catch IndexError specifically instead of swallowing everything.)
    try:
        companylink = response.xpath(
            "//div[@class='company-infor']/h4/a[1]/@href").extract()[0]
    except IndexError:
        companylink = ''
    item = LiepinItem()
    item['requirement'] = requirement
    item['companylink'] = companylink
    item['company'] = response.meta['company']
    item['position'] = position
    item['city'] = self.city_dict[response.meta['city']]
    item['salary'] = response.meta['salary']
    item['benefit'] = benefit
    item['pid'] = response.meta['pid']
    item['catagory'] = response.meta['key']
    item['rank'] = response.meta['rank']
    yield item
def parse_item(self, response):
    """Build a LiepinItem, delegating every field to its extractor method."""
    item = LiepinItem()
    # (field name, extractor) pairs, applied in the original call order.
    field_getters = (
        ('position', self.get_position),
        ('welfare', self.get_welfare),
        ('annual', self.get_annual),
        ('worklocation', self.get_worklocation),
        ('education', self.get_education),
        ('condition', self.get_condition),
        ('company', self.get_company),
        ('companylocation', self.get_companylocation),
        ('type', self.get_type),
        ('scale', self.get_scale),
    )
    for field, getter in field_getters:
        item[field] = getter(response)
    yield item
def parse(self, response):
    """Parse a search-result list page and yield one LiepinItem per job name.

    Fixes vs. the original:
    - the inner XPaths are now relative ('.//'); the original used absolute
      '//' paths, which match the WHOLE document for every <li>, so each
      iteration re-extracted all rows instead of its own;
    - a fresh LiepinItem is created per yield; the original mutated and
      re-yielded one shared instance.
    """
    print(response)
    title = response.css('title::text').extract_first()
    print(title)
    positions = response.xpath(
        '//div[@class="job-content"]/div[@class="sojob-result "]/ul[@class="sojob-list"]'
    )
    for p in positions.xpath('.//li'):
        print(p)
        job_names = p.xpath(
            './/div[@class="job-info"]/h3/a/text()').extract()
        for j in job_names:
            item = LiepinItem()
            item['job_name'] = j
            yield item
def parse(self, response):
    # Scrape a listing page by regex-matching the flattened raw HTML.
    # This is deliberately brittle: any markup change breaks the pattern.
    # NOTE(review): response.body is bytes on Python 3, so str-pattern
    # replace/findall implies this spider targets Python 2 -- confirm.
    body = response.body.replace('\n', '').replace('\t', '').replace('\r', '')
    # Match groups (0-based): 0=h3 title, 1=detail href, 2=promid,
    # 3=onclick, 4=link text, 5=condition title, 6=text-warning span,
    # 7=area href, 8=area, 9=edu, 10=span after edu, 11=time title,
    # 12=time text, 13=last span title, 14=last span text.
    info = re.findall(
        '<div class="job-info"><h3 title="(.*?)"><a href="(.*?)" data-promid="(.*?)" target="_blank"onclick="(.*?)">(.*?) </a></h3><p class="condition clearfix"title="(.*?)"><span class="text-warning">(.*?)</span><a href="(.*?)"data-selector="data-url" class="area">(.*?)</a><span class="edu">(.*?)</span><span>(.*?)</span></p><p class="time-info clearfix"><time title="(.*?)">(.*?)</time><span title="(.*?)">(.*?)</span></p></div>',
        body)
    for item in info:
        items = LiepinItem()
        items['title'] = item[0]      # h3 title attribute
        items['info_url'] = item[1]   # detail-page url
        items['workinfo'] = item[4]   # anchor text of the job link
        items['pay'] = item[6]        # text-warning span (presumably salary)
        items['worktime'] = item[10]  # span following edu -- presumably experience; confirm
        # Follow the detail page, carrying the partially-filled item along.
        yield Request(
            url=items['info_url'],
            callback=self.detail,
            meta={'item': items})
def parse_item(self, response):
    """Scrape one job-detail page into a LiepinItem.

    Each value is computed into a local variable whose name matches an
    item field name, then all declared fields are copied in bulk at the
    end.  The original used ``eval(field)`` for that copy, which executes
    arbitrary field names as code; a plain ``locals()`` lookup is
    equivalent for legal identifiers and safe.
    """
    item = LiepinItem()
    city_name = self.city
    # The title div's class is sometimes rendered with a trailing space
    # ("title-info "); try the clean variant first, then fall back.
    jobname1 = response.xpath('//div[@class="title-info"]/h1/text()')
    if jobname1 == []:
        jobname = response.xpath(
            '//div[@class="title-info "]/h1/text()')[0].extract()
    else:
        jobname = jobname1[0].extract()
    # The location is a link on most pages, plain text on others.
    span = response.xpath(
        '//p[@class="basic-infor"]/span/a/text()').extract()
    if span == []:
        position = response.xpath(
            '//p[@class="basic-infor"]/span/text()')[0].extract()
    else:
        position = span[0]
    workingExp = response.xpath(
        '//div[@class="job-qualifications"]/span/text()')[1].extract()
    eduLevel = response.xpath(
        '//div[@class="job-qualifications"]/span/text()')[0].extract()
    salary = response.xpath(
        '//div[@class="job-title-left"]/p/text()')[0].extract().rstrip('\r\n ')
    company_name = response.xpath(
        '//div[@class="title-info"]/h3/a/text()')[0].extract()
    update_time = response.xpath(
        '//p[@class="basic-infor"]/time/@title')[0].extract()
    job_require = response.xpath(
        '//div[@class="content content-word"]/text()').extract()
    # Stable record id: sha1 over company name + posting timestamp.
    sha1 = hashlib.sha1()
    string = (company_name + '' + update_time)
    stri = string.encode('utf8')
    sha1.update(stri)
    hash_id = sha1.hexdigest()
    # Copy every declared field from the matching local variable
    # (raises KeyError -- rather than the original's NameError -- if a
    # declared field has no matching local).
    local_values = locals()
    for field in item.fields.keys():
        item[field] = local_values[field]
    yield item
def parse(self, response):
    """Yield one LiepinItem per <li> row of the search-result list.

    Fixes vs. the original: every per-row XPath is now relative ('.//').
    The original used absolute '//' paths inside the loop, which match
    the ENTIRE document on each iteration, so every yielded item carried
    the values of all rows instead of its own.  The unused
    ``job_shijian`` extraction was dropped: its XPath ended in
    ``@title/text()`` (text() of an attribute node, which selects
    nothing) and the value was never stored on the item.
    """
    rows = response.xpath('//ul[@class="sojob-list"]/li')
    for a in rows:
        job_xueli = a.xpath(
            './/div[contains(@class,"job-info")]/p/span[@class="edu"]/text()'
        ).extract()
        job_jingyan = a.xpath(
            './/div[contains(@class,"job-info")]/p/span[@class="edu"]/following-sibling::span/text()'
        ).extract()
        job_xinshui = a.xpath(
            './/div[contains(@class,"job-info")]/p/span[@class="text-warning"]/text()'
        ).extract()
        job_zhicheng = [
            x.strip()
            for x in a.xpath(
                './/div[contains(@class,"job-info")]/h3/a/text()').extract()
        ]
        job_company_name = a.xpath(
            './/div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a/text()'
        ).extract()
        job_url = a.xpath(
            './/div[contains(@class,"job-info")]/h3/a/@href').extract()
        job_company_url = a.xpath(
            './/div[contains(@class,"sojob-item-main")]//p[@class="company-name"]/a/@href'
        ).extract()
        item = LiepinItem()
        # Key names (liepin_* vs job_*) kept as-is: downstream pipelines
        # depend on them.
        item["liepin_jingyan"] = job_jingyan
        item["liepin_xueli"] = job_xueli
        item["job_xinshui"] = job_xinshui
        item["job_zhicheng"] = job_zhicheng
        item["job_company_name"] = job_company_name
        item["job_url"] = job_url
        item["job_company_url"] = job_company_url
        yield item
def parse(self, response):
    """Extract name/url/salary/time for each job row, then follow pagination.

    Fix vs. the original: the Python-2 ``print url`` statements (a
    SyntaxError under Python 3) are now function calls, which behave the
    same for a single argument on both interpreters.
    """
    sel = Selector(response)
    # Job url, name, salary and publish time for each result row.
    jobs = sel.xpath('//div[@class="job-info"]')
    for job in jobs:
        article_name = job.xpath('h3/a/span/text()').extract()
        article_url = job.xpath('h3/a/@href').extract()
        # string(.) flattens the node's mixed text content.
        article_xinzi = job.xpath('p[@class="condition clearfix"]/span'
                                  ).xpath('string(.)').extract()
        article_time = job.xpath('p[@class="time-info clearfix"]').xpath(
            'string(.)').extract()
        item = LiepinItem()
        item['article_name'] = article_name
        item['article_url'] = article_url
        item['article_xinzi'] = article_xinzi
        item['article_time'] = article_time
        yield item
    # Follow every pager link back into this callback.
    urls = sel.xpath('//div[@class="pagerbar"]/a/@href').extract()
    for url in urls:
        print(url)
        url = "http://bj.liepin.com" + url
        print(url)
        yield Request(url, callback=self.parse)
def detail_parse(self, response):
    # Scrape a job-detail page into a LiepinItem.  Liepin serves three
    # detail-page layouts, distinguished by the url path: '/a/', '/job/'
    # and (fallback branch) '/cjob/'.  Each branch fills the same fields
    # with layout-specific XPaths; fields a layout does not expose are
    # set to the literal 'unknown'.
    # NOTE(review): str.decode in the '/job/' branch is Python-2-only;
    # confirm the target interpreter.
    panduan = lambda x: x[0] if x else 'unknown'  # first element or 'unknown'
    job = LiepinItem()
    # '/a/'-type page
    if '/a/' in response.url:
        # position name (class sometimes has a trailing space; union copes)
        job['name'] = response.xpath(
            '//div[@class="title-info"]/h1/text() | //div[@class="title-info "]/h1/text()'
        ).extract()[0]
        # company name
        job['co_name'] = response.xpath(
            '//div[@class="title-info"]/h3/text() | //div[@class="title-info "]/h3/text()'
        ).extract()[0].strip()
        # area
        job['area'] = response.xpath(
            '//div[@class="title"]//p[@class="basic-infor"]/span/text()'
        ).extract()[0]
        # salary
        job['salary'] = response.xpath(
            '//div[@class="title"]//p[@class="job-main-title"]/text()'
        ).extract()[0].strip()
        # experience
        job['exp'] = response.xpath(
            '//div[@class="resume clearfix"]/span[2]/text()').extract()[0]
        # education
        job['edu'] = response.xpath(
            '//div[@class="resume clearfix"]/span[1]/text()').extract()[0]
        # headcount (not published on this layout)
        job['num'] = 'unknown'
        # publish time
        job['time'] = response.xpath(
            '//div[@class="job-title-left"]/p/time/text()').extract(
            )[0].strip()
        # other requirements (every qualification span past the first two)
        otherqlist = response.xpath(
            '//div[@class="resume clearfix"]/span[position()>2]/text()'
        ).extract()
        job['otherq'] = ','.join(otherqlist)
        # welfare: join each "<label>:<value>" pair with commas
        fulis = []
        fuliList = response.xpath(
            '//div[@class="job-main main-message"][3]//ul/li')
        for fuli in fuliList:
            fulis.append(
                fuli.xpath('./span/text()').extract()[0] + ':' +
                fuli.xpath('./text()').extract()[0])
        job['welfare'] = ','.join(fulis)
        # job description
        infolist = response.xpath(
            '//div[@class="job-main main-message"][1]/div[@class="content content-word"]/text()'
        ).extract()
        job['info'] = ' '.join(infolist)
        # work address (not published on this layout)
        job['local'] = 'unknown'
        # company website (not published on this layout)
        job['co_url'] = 'unknown'
        # company type
        job['co_type'] = response.xpath(
            '//div[@class="job-main main-message"][2]//ul/li[5]/text()'
        ).extract()[0]
    # '/job/'-type page
    elif '/job/' in response.url:
        # position name
        job['name'] = response.xpath(
            '//div[@class="title-info"]/h1/text()').extract()[0]
        # company name
        job['co_name'] = response.xpath(
            '//div[@class="title-info"]/h3/a/text()').extract()[0].strip()
        # area
        job['area'] = response.xpath(
            '//div[@class="job-item"]//p[@class="basic-infor"]/span/a/text()'
        ).extract()[0]
        # salary
        job['salary'] = response.xpath(
            '//div[@class="job-item"]//p[@class="job-item-title"]//text()'
        ).extract()[0].strip()
        # experience
        job['exp'] = response.xpath(
            '//div[@class="job-qualifications"]/span[2]/text()').extract(
            )[0]
        # education
        job['edu'] = response.xpath(
            '//div[@class="job-qualifications"]/span[1]/text()').extract(
            )[0]
        # headcount (not published on this layout)
        job['num'] = 'unknown'
        # publish time
        job['time'] = response.xpath(
            '//div[@class="job-title-left"]/p/time/text()').extract(
            )[0].strip()
        # other requirements
        otherqlist = response.xpath(
            '//div[@class="job-qualifications"]/span[position()>2]/text()'
        ).extract()
        job['otherq'] = ','.join(otherqlist)
        # welfare tags
        welist = response.xpath(
            '//div[@class="tag-list"]/span/text()').extract()
        job['welfare'] = ','.join(welist)
        # job description
        infolist = response.xpath(
            '//div[@class="content content-word"]//text()').extract()
        job['info'] = ' '.join(infolist)
        # work address: text after the colon of the 3rd company-info <li>
        job['local'] = response.xpath(
            '//div[@class="company-infor"]//ul[@class="new-compintro"]//li[3]//text()'
        ).extract()[0].split(':'.decode('utf8')).pop()
        # company website
        job['co_url'] = response.xpath(
            '//div[@class="company-infor"]//div[@class="company-logo"]//p/a/@href'
        ).extract()[0]
        # company type: linked on some pages, plain text on others
        if response.xpath(
                '//ul[@class="new-compintro"]/li[1]/a/text()').extract():
            job['co_type'] = response.xpath(
                '//ul[@class="new-compintro"]/li[1]/a/text()').extract()[0]
        else:
            job['co_type'] = response.xpath(
                '//ul[@class="new-compintro"]/li[1]/text()').extract()[0]
    # '/cjob/'-type page (fallback)
    else:
        # position name
        job['name'] = response.xpath(
            '//div[@class="job-title"]/h1/text()').extract()[0]
        # company name
        job['co_name'] = response.xpath(
            '//div[@class="job-title"]/h2/text()').extract()[0]
        # area
        job['area'] = response.xpath(
            '//div[@class="job-main"]/p[@class="job-main-tip"]/span[1]/text()[2]'
        ).extract()[0]
        # salary
        job['salary'] = response.xpath(
            '//div[@class="job-main"]/div[@class="job-main-title"]/strong/text()'
        ).extract()[0]
        # experience ('unknown' when absent)
        job['exp'] = panduan(
            response.xpath(
                '//div[@class="job-main"]/p[@class="job-qualifications"]/span[2]/text()'
            ).extract())
        # education ('unknown' when absent)
        job['edu'] = panduan(
            response.xpath(
                '//div[@class="job-main"]/p[@class="job-qualifications"]/span[1]/text()'
            ).extract())
        # headcount (not published on this layout)
        job['num'] = 'unknown'
        # publish time
        job['time'] = response.xpath(
            '//p[@class="job-main-tip"]/span[2]/text()').extract(
            )[0].strip()
        # other requirements (not published on this layout)
        job['otherq'] = 'unknown'
        # welfare labels; panduan() collapses missing labels to 'unknown'
        wellist = panduan(
            response.xpath(
                '//p[@class="job-labels"]/span/text()').extract())
        job['welfare'] = ','.join(wellist)
        # job description
        job['info'] = response.xpath(
            '//div[@class="job-info"]//div[@class="job-info-content"]/text()'
        ).extract()[0].strip()
        # work address (from the map side box)
        job['local'] = response.xpath(
            '//div[@class="side-box right-post-map"]/div[@class="side-content"]/p/text()'
        ).extract()[0]
        # company website (not published on this layout)
        job['co_url'] = 'unknown'
        # company type (not published on this layout)
        job['co_type'] = 'unknown'
    yield job
def parse(self, response, **kwargs):
    """Collect the @title attribute of every job heading into one item."""
    titles = response.xpath(
        '//div[@class=\'job-info\']/h3/@title').extract()
    item = LiepinItem()
    item['title'] = titles
    return item
def parse(self, response):
    # Single callback handling three page shapes:
    #   1) a search-result list        -> enqueue every job-detail url;
    #   2) an enterprise job-detail page (#job-view-enterprise) -> yield item;
    #   3) a head-hunter job-detail page (#job-hunter)          -> yield item.
    jobs_xpath = '/html/body[@id="sojob"]//div[@class="sojob-item-main clearfix"]'
    for sel in response.xpath(jobs_xpath):
        job = sel.xpath('div[@class="job-info"]')
        # 'Null' sentinel marks rows without a link.
        job_detail_url = job.xpath('h3/a/@href').extract_first(
            default='Null').strip()
        if job_detail_url != 'Null':
            if job_detail_url.startswith('http'):
                yield Request(job_detail_url, headers=self.headers)
            elif job_detail_url.startswith('/'):
                # Site-relative link: prepend the host.
                yield Request('%s%s' % ('https://www.liepin.com',
                                        job_detail_url),
                              headers=self.headers)
        # (Links that are neither absolute nor site-relative are dropped.)
    # Produce the next-page address.
    next_xpath = '//div[@class="pagerbar"]/a[@class="current"]/following-sibling::a[1]'
    # If the anchor after the current-page marker is not the disabled
    # "next page" text, there are more pages to crawl.
    # NOTE(review): extract_first() may return None on the last page,
    # making .strip() raise AttributeError -- confirm against the site.
    if response.xpath(next_xpath +
                      '/text()').extract_first().strip() != u'下一页':
        yield Request('https://www.liepin.com' +
                      response.xpath(next_xpath +
                                     '/@href').extract_first().strip(),
                      headers=self.headers)
    # Enterprise-posted job-detail page.
    one_job = "/html/body/div[@id='job-view-enterprise']"
    if response.xpath(one_job):
        item = LiepinItem()
        list_link = response.xpath(
            "//link[@rel='canonical']/@href").extract()
        item['link'] = "".join(list_link).strip()
        item['job_name'] = response.xpath(
            "/html//div[@id='job-view-enterprise']//div[@class='title-info']/h1/text()"
        ).extract_first().strip()
        item['salary'] = response.xpath(
            "/html//p[@class='job-item-title']/text()").extract_first(
            ).strip()
        item['address'] = response.xpath(
            "/html//div[@class='job-title-left']/p[@class='basic-infor']/span[1]/a/text()"
        ).extract_first().strip()
        item['education'] = response.xpath(
            "/html//div[@class='job-qualifications']/span[1]/text()"
        ).extract_first().strip()
        item['experience'] = response.xpath(
            "/html//div[@class='job-qualifications']/span[2]/text()"
        ).extract_first().strip()
        # Raw HTML of the description block (node, not text()).
        item['job_require'] = response.xpath(
            "/html//div[@class='about-position']/div[@class='job-item main-message job-description'][1]/div[@class='content content-word']"
        ).extract_first().strip()
        item['company_size'] = response.xpath(
            "//ul[@class='new-compintro']/li[2]/text()").extract_first(
            ).strip()
        item['company_name'] = response.xpath(
            "/html//div[@class='title-info']/h3/a/text()").extract_first(
            ).strip()
        yield item
    # Head-hunter-posted job-detail page.
    one_hunter_job = "/html/body/div[@id='job-hunter']"
    if response.xpath(one_hunter_job):
        item = LiepinItem()
        # The div classes sometimes carry a trailing space
        # ('main ' / 'title-info '); try one variant, fall back to the other.
        job_name_raw = response.xpath(
            "/html/body/div[@id='job-hunter']/div[@class='wrap clearfix']/div[@class='clearfix content']/div[@class='main ']/div[@class='about-position']/div[@class='title']/div[@class='title-info']/h1/text()"
        ).extract_first()
        if job_name_raw is None:
            job_name_raw = response.xpath(
                "/html/body/div[@id='job-hunter']/div[@class='wrap clearfix']/div[@class='clearfix content']/div[@class='main']/div[@class='about-position']/div[@class='title']/div[@class='title-info ']/h1/text()"
            ).extract_first()
        item['job_name'] = job_name_raw.strip()
        item['salary'] = response.xpath(
            "/html/body/div[@id='job-hunter']//"
            "p[@class='job-main-title']/text()").extract_first().strip()
        item['education'] = response.xpath(
            "/html/body/div[@id='job-hunter']//div[@class='resume clearfix']/span[1]/text()"
        ).extract_first().strip()
        item['experience'] = response.xpath(
            "/html/body/div[@id='job-hunter']//div[@class='resume clearfix']/span[2]/text()"
        ).extract_first().strip()
        # Raw HTML of the description block (node, not text()).
        item['job_require'] = response.xpath(
            "/html/body/div[@id='job-hunter']//div[@class='content content-word']"
        ).extract_first().strip()
        yield item
def parse(self, response):
    """Route a crawled page: scrape detail pages into items, and enqueue
    every list/detail link found on the page back into this callback.

    Fixes vs. the original:
    - Python-2 ``print x, y`` statements (SyntaxError on py3) are now
      function calls;
    - ``len(...) is 0`` replaced with ``== 0`` -- identity comparison on
      ints only works by CPython small-int caching;
    - the keyword flag loop is expressed with ``any()``;
    - dead commented-out extraction code removed.
    """
    sel = Selector(response)
    current_url = response.url
    # Detail-page analysis.
    for detail_link in CONFIG['detail_link_rule']:
        if is_match(current_url, detail_link):
            print('match detail_link page!', current_url)
            item = LiepinItem()
            # items receive lists where possible to avoid index-out-of-range
            div_about_position = sel.xpath(
                '//div[@class="about-position"]')
            item[u'hire_website'] = 'NULL'
            item[u'company_name'] = div_about_position.xpath(
                '//h3/a/text()').extract()[0]
            item[u'job_name'] = div_about_position.xpath(
                '//h1/text()').extract()[0]
            # Skip postings whose title contains none of the data-related
            # keywords (ends the whole generator, as the original did).
            if not any(term in item[u'job_name']
                       for term in data_related_terms):
                return
            company_industry = sel.xpath(
                '//div[@class="company-infor"]/ul/li/a/text()').extract()
            if company_industry is None or len(company_industry) == 0:
                item[u'job_category'] = sel.xpath(
                    '//div[@class="company-infor"]/ul/li/text()').extract(
                    )[0].strip()
            else:
                item[u'job_category'] = company_industry[0].strip()
            # "Other information" block: [0]=department, [1]=major, ...
            ul_other_info_text = div_about_position.xpath(
                u'//h3[text()="其他信息:"]/following::div[1]/ul/li/label/text()'
            ).extract()
            item[u'department'] = ul_other_info_text[0]
            item[u'location'] = div_about_position.xpath(
                '//p[@class="basic-infor"]/span[1]/a/text()').extract(
                )[-1].strip()
            item[u'job_nature'] = u'全职'
            # qualifications: [0]=education, [1]=experience
            requirement = div_about_position.xpath(
                '//div[@class="job-qualifications"]/span/text()').extract()
            item[u'experience'] = requirement[1]
            item[u'education'] = requirement[0]
            item[u'salary'] = div_about_position.xpath(
                '//p[@class="job-item-title"]/text()').extract()[0].strip()
            item[u'salary'] = process_salary(item[u'salary'])
            item[u'major'] = ul_other_info_text[1]
            item[u'hire_num'] = 'NULL'
            description = div_about_position.xpath(
                u'//h3[text()="职位描述:"]/following::div[1]/text()').extract()
            item[u'temptation'] = div_about_position.xpath(
                u'//div[@class="tag-list"]/span[@class="tag"]/text()'
            ).extract()
            item[u'temptation'] = ','.join(item[u'temptation'])
            item[u'description'] = "\n".join(description)
            ul_company_detail_text = sel.xpath(
                '//div[@class="company-infor"]/ul/li/text()').extract()
            item[u'industry'] = sel.xpath(
                '//div[@class="company-infor"]/ul/li[1]/a/text()').extract(
                )[0]
            item[u'company_nature'] = ul_company_detail_text[-1].strip()
            item[u'finance'] = 'NULL'
            # Staff count / financing round recognized by marker characters.
            for li_text in ul_company_detail_text:
                if u'人' in li_text:
                    item[u'staff_num'] = li_text.strip()
                elif u'轮' in li_text:
                    item[u'finance'] = li_text.strip()
            item[u'company_website'] = 'NULL'
            item[u'publish_date'] = div_about_position.xpath(
                '//p[@class="basic-infor"]/span[2]/text()').extract(
                )[-1].strip()
            item[u'publish_date'] = get_publish_time(item[u'publish_date'])
            item[u'publish_website'] = 'NULL'
            item[u'original_url'] = current_url
            print(item.__dict__)
            yield item
    # Filter out all list and detail pages and recurse via callbacks.
    for url in sel.xpath('//a/@href').extract():
        url = urljoin(current_url, url)
        for list_link in CONFIG['list_link_rule']:
            if is_match(url, list_link):
                yield Request(url, callback=self.parse)
        for detail_link in CONFIG['detail_link_rule']:
            if is_match(url, detail_link):
                print('match detail_link page!', url)
                yield Request(url, callback=self.parse)