def parse(self, response):
    json_body = json.loads(response.body.decode())
    results = json_body['content']['positionResult']['result']
    items = []
    for result in results:
        item = LagouItem()
        item['job'] = result['positionName']
        item['address'] = result['city']
        item['money'] = result['salary']
        item['req'] = result['education'] + '/' + result['workYear']
        item['company'] = result['companyFullName']
        item['qua'] = result['industryField'] + '/' + result['financeStage']
        item['des'] = ','.join(result['companyLabelList'])
        items.append(item)
    return items

def parse(self, response): print("我执行了!!!!!!!!!!!!!!!!") self.i += 1 print(self.i) item = LagouItem() if response.status != 200: print(response.status) self.status = response.status item['url'] = response.meta['url'] item['code'] = self.status item['keyword'] = self.keyword return item html = pq(response.text) self.status = response.status item['code'] = self.status item['keyword'] = self.keyword item['id'] = response.meta['id'] try: address_list = html(".work_addr").children() address = "" for temp in address_list: address += temp.text item['address'] = address item['url'] = response.meta['url'] item['advantage'] = html(".job-advantage p").text() item['company'] = html(".company").text() description_list = html(".job_bt p") description = str(description_list).replace("<p>", " ").replace( "</p>", " ").replace("<br/>", " ") item['description'] = description job_info = html(".job_request p span") item['salary'] = job_info[0].text item['location'] = job_info[1].text.replace("/", "") item['experience'] = job_info[2].text.replace("/", "") item['education'] = job_info[3].text item['type'] = job_info[4].text item['label'] = html(".position-label li").text() item['name'] = html(".job-name").attr('title') except Exception as e: print(item['url'], e) return item
def parse(self, response):
    item = LagouItem()
    tag_url = response.xpath('//li[@class="wrapper"]//div/ul/li/a/@href')
    next_page = response.xpath('//div[@class="pager_container"]/span[last()]/@class')
    for each in response.xpath('//li[@class="company-item"]'):
        lg_company_url = each.xpath('div/p[1]/a/@href').extract_first()
        tag = each.xpath('div/p[@class="indus-stage wordCut"]/text()').extract_first()
        item['tag'] = tag
        item['lg_company_url'] = lg_company_url
        # Pagination was left disabled in the original:
        # for i in range(1, 21):
        #     print(item, next_page)
        # for url in tag_url:
        #     yield scrapy.Request(next_page, callback=self.parse)
        # print(lg_company_url)
        yield scrapy.Request(url='https://www.lagou.com/gongsi/10483.html',
                             headers=self.headers,
                             callback=self.company_page,
                             meta={'item': item})

def parseDetail(self, response):
    print(response.status)
    if response.status == 200:
        item = LagouItem()
        print("-" * 88)
        # Company name
        item["companyName"] = response.xpath("/html/body/div[2]/div/div[1]/div/div[1]/text()").extract()
        # Position name
        item["positionName"] = response.xpath("/html/body/div[2]/div/div[1]/div/span/text()").extract()
        # Position summary
        item["positionIntro"] = response.xpath("/html/body/div[2]/div/div[1]/dd/p[1]/span/text()").extract()
        # Position tags
        item["positionLabel"] = response.xpath("/html/body/div[2]/div/div[1]/dd/ul/li/text()").extract()
        # Responsibilities
        item["workResponsibility"] = response.xpath("//*[@id='job_detail']/dd[2]/div/p/text()").extract()
        # Benefits
        item["positionWelfare"] = response.xpath("//*[@id='job_detail']/dd[1]/p/text()").extract()
        # Work address
        item["workAddress"] = response.xpath("//*[@id='job_detail']/dd[3]/div[1]/a/text()").extract()
        yield item

def parse(self, response):
    jdict = json.loads(response.body)
    jcontent = jdict['content']
    jposresult = jcontent["positionResult"]
    jresult = jposresult["result"]
    # 15 results per page; integer division needs `//` under Python 3
    self.totalPageCount = jposresult['totalCount'] // 15 + 1
    print(self.totalPageCount)
    for each in jresult:
        # Create a fresh item per result instead of mutating one shared instance
        item = LagouItem()
        item['city'] = each['city']
        item['companyFullName'] = each['companyFullName']
        item['companySize'] = each['companySize']
        item['positionName'] = each['positionName']
        item['firstType'] = each['firstType']
        item['salary'] = each['salary']
        yield item
    if self.curpage <= self.totalPageCount:
        self.curpage += 1
        yield scrapy.http.FormRequest(self.reqUrl,
                                      formdata={'pn': str(self.curpage), 'kd': self.kd},
                                      callback=self.parse)

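# The `totalPageCount` arithmetic above overshoots by one page whenever
# totalCount is an exact multiple of 15 (the page size). A minimal sketch of
# the exact computation, assuming the same 15-results-per-page layout;
# `page_count` is a hypothetical helper, not from the source:
import math

def page_count(total_count: int, page_size: int = 15) -> int:
    # ceil(total/size) avoids the off-by-one of `total // size + 1`
    return math.ceil(total_count / page_size)

# e.g. page_count(45) == 3, while 45 // 15 + 1 == 4
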
def parse(self, response):
    print("request -> " + response.url)
    html = json.loads(response.text)
    if html.get("success"):
        if html.get('content').get('positionResult').get('resultSize') != 0:
            results = html.get('content').get('positionResult').get('result')
            print('lagou Nums:' + str(len(results)))
            for result in results:
                item = LagouItem()
                item['salary'] = result.get('salary').replace("k", "K")
                item['positionName'] = result.get('positionName')
                item['positionLables'] = result.get('positionLables')
                item['companyFullName'] = result.get('companyFullName')
                item['companyLabelList'] = result.get('companyLabelList')
                item['companySize'] = result.get('companySize')
                item['city'] = result.get('city')
                item['district'] = result.get('district')
                item['education'] = result.get('education')
                item['firstType'] = result.get('firstType')
                item['industryField'] = result.get('industryField')
                item['jobNature'] = result.get('jobNature')
                item['workYear'] = result.get('workYear')
                yield item
            totalPage = math.floor(
                int(html.get('content').get('positionResult').get("totalCount"))
                / int(html.get('content').get("pageSize")))
            self.curPage = self.curPage + 1
            if self.curPage <= totalPage:
                yield self.next_request(response)
    else:
        # Request was rejected: back off for a minute, then retry
        time.sleep(60)
        yield self.next_request(response)

def parse_item(self, response):
    item = LagouItem()
    # Position name
    item['position_name'] = response.xpath('//div[@class="job-name"]//span[@class="name"]/text()').extract()[0]
    # Company name
    item['company'] = response.xpath('//div[@class="company"]/text()').extract()[0]
    # Salary
    item['salary'] = response.xpath('//dd[@class="job_request"]//span[1]/text()').extract()[0]
    # Experience
    item['experience'] = response.xpath('//dd[@class="job_request"]//span[3]/text()').extract()[0]
    # Education
    item['education'] = response.xpath('//dd[@class="job_request"]//span[4]/text()').extract()[0]
    # Work location
    item['location'] = response.xpath('//input[@name="positionAddress"]/@value').extract()[0]
    yield item

def parse_item(self, response):
    i = LagouItem()
    url = response.url
    print(url)
    # The original had a bare `i["job_id"]` here, a no-op that raises KeyError;
    # deriving the id from the job URL (/jobs/<id>.html) is an assumption.
    i["job_id"] = url.split('/')[-1].split('.')[0]
    i["title"] = response.xpath('//div[@class="position-content"]/div/div[@class="job-name"]/@title').get()
    i["url"] = response.url
    # "sapn" typo fixed to "span"
    i["salary"] = response.xpath('//dd[@class="job_request"]/span[@class="salary"]/text()').get()
    spans = response.xpath('//dd[@class="job_request"]/span/text()').getall()
    i["job_city"] = spans[1]
    i["work_years"] = spans[2]
    i["degree_need"] = spans[3]
    i["job_type"] = spans[4]
    i["publish_time"] = response.xpath('//dd[@class="job_request"]/p[@class="publish_time"]/text()').get()
    i["job_advantage"] = response.xpath('//dd[@class="job-advantage"]/p/text()').get()
    i["job_desc"] = response.xpath('//dd[@class="job_bt"]/div/p/text()').getall()
    addr_parts = response.xpath('//dd[@class="job-address clearfix"]/div[@class="work_addr"]/a/text()').getall()
    i["job_addr"] = addr_parts[0] + addr_parts[1] + addr_parts[2]  # city + district + road
    # "d1"/"job_compony" typos fixed to "dl"/"job_company"
    i["company_url"] = response.xpath('//div[@class="content_r"]/dl[@class="job_company"]/dt/a/@href').get()
    i["company_name"] = response.xpath('//div[@class="content_r"]/dl[@class="job_company"]/dt/img/@alt').get()
    return i

def parse(self, response):
    response_body = json.loads(response.text)
    for company in response_body["content"]["positionResult"]["result"]:
        lagouitem = LagouItem()
        lagouitem["company_id"] = company["companyId"]
        lagouitem["position_name"] = company["positionName"]
        lagouitem["work_year"] = company["workYear"]
        lagouitem["education"] = company["education"]
        lagouitem["position_id"] = company["positionId"]
        lagouitem["salary"] = company["salary"]
        # Field name kept as defined in items.py (original typo preserved)
        lagouitem["cmpany_full_name"] = company["companyFullName"]
        print(lagouitem)
        yield lagouitem

def parse(self, response):
    pagecode = json.loads(response.body)['content']['positionResult']['result']
    for job in pagecode:
        # Create a fresh item per result instead of mutating one shared instance
        item = LagouItem()
        item['jobname'] = job['positionName']
        item['releasetime'] = job['createTime']
        item['salary'] = job['salary']
        item['companyname'] = job['companyFullName']
        item['experience'] = job['workYear']
        item['Education'] = job['education']
        yield item
    # Request the next pages (the dupefilter drops pages already fetched)
    for i in range(30):
        post_data = {
            'first': 'true',
            'kd': 'python',
            'pn': '{}'.format(i),
            'city': u'上海',  # Shanghai
        }
        yield FormRequest(url=self.joburl, formdata=post_data, callback=self.parse)

def parse(self, response):
    divs = response.xpath('//*[@id="s_position_list"]/ul/li/div[1]')
    for div in divs:
        # Create a fresh item per result instead of mutating one shared instance
        item = LagouItem()
        title = div.xpath('./div[1]/div[1]/a/h3/text()').extract()
        address = div.xpath('./div[1]/div[1]/a/span/em/text()').extract()
        money = div.xpath('./div[1]/div[2]/div/span/text()').extract()
        company = div.xpath('./div[2]/div[1]/a/text()').extract()
        fintance = div.xpath('./div[2]/div[2]/text()').extract()
        # Fall back to "无数据" ("no data") when a field is missing
        job_title = title[0] if len(title) > 0 else '无数据'
        job_address = address[0] if len(address) > 0 else '无数据'
        job_money = money[0] if len(money) > 0 else '无数据'
        job_company = company[0] if len(company) > 0 else '无数据'
        job_fintance = fintance[0] if len(fintance) > 0 else '无数据'
        item['title'] = job_title.strip()
        item['address'] = job_address.strip()
        item['money'] = job_money.strip()
        item['company'] = job_company.strip()
        item['fintance'] = job_fintance.strip()
        yield item

def parse_url(self, response):
    jobclass = response.meta["jobClass"]
    for sel in response.xpath("//ul[@class='item_con_list']/li"):
        # Initialize the item
        item = LagouItem()
        jobname = sel.xpath("div/div/div/a/h3/text()").extract()
        jobmoney = sel.xpath("div/div/div/div/span/text()").extract()
        # Positional indices below depend on the listing page's markup
        jobneed = sel.xpath("div/div/div/div/text()").extract()
        jobneed = jobneed[2].strip()
        jobcompany = sel.xpath("div/div/div/a/text()").extract()
        jobcompany = jobcompany[3].strip()
        jobplace = sel.xpath("div/div/div/a/span/em/text()").extract()
        jobtype = sel.xpath("div/div/div/text()").extract()
        jobtype = jobtype[7].strip()
        jobspesk = sel.xpath("div[@class='list_item_bot']/div/text()").extract()
        jobspesk = jobspesk[-1].strip()
        item['jobClass'] = jobclass
        item['jobName'] = jobname
        item['jobMoney'] = jobmoney
        item['jobNeed'] = jobneed
        item['jobCompany'] = jobcompany
        item['jobPlace'] = jobplace
        item['jobType'] = jobtype
        item['jobSpesk'] = jobspesk
        yield item

def parse(self, response):
    content = json.loads(response.text)['content']
    page = content['pageNo']
    self.success_pages.append(page)
    # Parse the payload and pull out the fields we need
    results = content['positionResult']['result']
    for result in results:
        job = LagouItem()
        job['job_name'] = result['positionName']
        job['job_addr'] = result['district']
        job['job_time'] = result['createTime']
        job['job_limit'] = result['education'] + '、' + result['workYear']
        job['job_salary'] = result['salary']
        job['job_company'] = result['companyFullName']
        job['job_company_type'] = result['financeStage'] + '、' + result['industryField']
        job['job_vip'] = result['positionAdvantage']
        job['page'] = page
        yield job

def parse_item(self, response):
    print(response.url)
    item = LagouItem()
    job_url = response.url  # job page URL
    # Company name
    job_comp = response.css('#job_company dt a img::attr(alt)').extract()[0]
    # Job title
    job_name = response.xpath('//div[@class="job-name"]/@title').extract()[0]
    # Education requirement
    job_degree = response.xpath('//dd[@class="job_request"]/p/span[4]/text()').extract()[0].rstrip(' /')
    money = response.xpath('//dd[@class="job_request"]/p/span[@class="salary"]/text()').extract()[0]
    if '-' in money:
        job_smoney = money.lower().replace('k', '').split('-')[0]  # salary lower bound
        job_emoney = money.lower().replace('k', '').split('-')[1]  # salary upper bound
    else:
        job_smoney = 0
        job_emoney = 0
    # Work location
    job_address = response.xpath('//dd[@class="job_request"]/p/span[2]/text()').extract()[0].lstrip('/').rstrip(' /')
    # Company type
    job_comp_type = response.xpath('//ul[@class="c_feature"]/li[2]/text()').extract()[1].replace(r'\n ', '')
    # Company business field
    job_business = response.xpath('//ul[@class="c_feature"]/li[1]').extract()[0].split(
        r'</i>')[1].split('<span class="hovertips">')[0].replace(r'\n', '')
    # Publish date
    job_date_pub = response.xpath('//p[@class="publish_time"]/text()').extract()[0].split(' ')[0]
    job_num = response.xpath('//ul[@class="c_feature"]/li[3]/text()').extract()[1]
    if '-' in job_num:
        job_comp_snum = job_num.split('-')[0]  # company headcount lower bound
        job_comp_enum = job_num.split('-')[1].replace('人', '')  # company headcount upper bound
    elif '以上' in job_num:  # "X人以上" means "X or more people"
        job_comp_snum = job_num.replace('人以上', '')
        job_comp_enum = job_comp_snum
    else:
        job_comp_snum = 0
        job_comp_enum = 0
    job_year = response.xpath('//dd[@class="job_request"]/p/span[3]/text()').extract()[0]
    if '-' in job_year:
        job_syear = job_year.split('-')[0].replace('经验', '')  # min years of experience
        job_eyear = job_year.split('-')[1].replace('年 /', '')  # max years of experience
    elif '以上' in job_year:
        job_syear = job_year.replace('年以上', '').lstrip('经验')
        job_eyear = job_syear
    else:
        job_syear = 0
        job_eyear = 0
    job_datetime = datetime.datetime.now().strftime('%Y-%m-%d')  # crawl date
    # Benefits
    job_welfafe = response.xpath('//dd[@class="job-advantage"]/p/text()').extract()[0]
    job_people = ''  # number of openings (not on the page)
    # Job description
    job_desc = response.xpath('//dd[@class="job_bt"]//p/text()').extract()
    job_desc = str(job_desc).lstrip('[').rstrip(']').replace(r'\xa0', '')
    job_request = ''  # job requirements (not on the page)
    # Position tags
    job_tag = response.xpath('//li[@class="labels"]/text()').extract()
    job_tag = ','.join(job_tag)
    item['job_name'] = job_name
    item['job_degree'] = job_degree
    item['job_smoney'] = job_smoney
    item['job_emoney'] = job_emoney
    item['job_address'] = job_address
    item['job_comp_snum'] = job_comp_snum
    item['job_comp_enum'] = job_comp_enum
    item['job_syear'] = job_syear
    item['job_eyear'] = job_eyear
    item['job_datetime'] = job_datetime
    item['job_welfafe'] = job_welfafe  # field name kept as defined in items.py
    item['job_people'] = job_people
    item['job_desc'] = job_desc
    item['job_request'] = job_request
    item['job_tag'] = job_tag
    item['job_url'] = job_url
    item['job_comp'] = job_comp
    item['job_comp_type'] = job_comp_type
    item['job_business'] = job_business
    item['job_date_pub'] = job_date_pub
    yield item

def parse(self, response):
    keyword = response.url.split('kd=')[1]  # extract the keyword from the URL
    pageNo = 0
    try:
        json_data = json.loads(response.body.decode('utf-8'))
        pageNo = json_data['content']['pageNo']  # current page number
        if pageNo > 0:
            print('\n start page %s' % str(pageNo))
            results = json_data['content']['positionResult']['result']
            # Copy the JSON fields into items
            for result in results:
                item = LagouItem()
                item['spider'] = self.name
                item['keyword'] = keyword
                item['jobId'] = str(result['positionId'])
                item['jobName'] = result['positionName']
                item['jobPlace'] = result['city']
                item['jobSalary'] = result['salary']
                item['jobAdvantage'] = result['positionAdvantage']  # perks / benefits
                item['releaseTime'] = result['createTime']  # publish time
                item['educationRequire'] = result['education']  # minimum education
                item['experienceRequire'] = result['workYear']  # experience required
                item['jobNature'] = result['jobNature']  # full-time / part-time / internship
                lt = result['positionLables']
                lt.append(keyword)
                item['jobLabels'] = lt  # position labels, e.g. team lead, data analysis
                item['compId'] = str(result['companyId'])
                item['compName'] = result['companyFullName']
                item['compSize'] = result['companySize']  # company size
                item['compIndustry'] = result['industryField']  # company industry
                item['compLabels'] = result['companyLabelList']  # company labels
                item['longitude'] = result['longitude']
                item['latitude'] = result['latitude']
                item['businessZones'] = result['businessZones']  # finer-grained location
                item['compLogo'] = result['companyLogo']
                item['financeStage'] = result['financeStage']  # financing stage
                job_desc_url = 'https://www.lagou.com' + '/jobs/' + item['jobId'] + '.html'
                comp_desc_url = 'https://www.lagou.com' + '/gongsi/' + item['compId'] + '.html'
                item['jobLink'] = job_desc_url
                item['compLink'] = comp_desc_url
                yield scrapy.Request(url=job_desc_url,
                                     headers=self.headers,
                                     callback=self.job_desc_parse,
                                     meta={'item': item})
    except Exception as e:
        if pageNo == 0:
            print('page out of range')
        else:
            print('error_page:', pageNo)
            print('error_info:', e)

def parse(self, response: Response):
    """Parse the listing response and populate items."""
    res_json = response.text
    extract_status = Compose(json.loads, SelectJmes("status"))
    status = extract_status(res_json)
    if status is None:  # no "status" key means the request succeeded
        extract_result = Compose(json.loads,
                                 SelectJmes("content"),
                                 SelectJmes("positionResult"),
                                 SelectJmes("result"))
        result_list = extract_result(response.text)
        for res in result_list:
            loader = LagouItemLoader(item=LagouItem())
            loader.add_value("post_time", res)
            loader.add_value("job_name", res)
            loader.add_value("salary", res)
            loader.add_value("place", res)
            loader.add_value("job_nature", res)
            loader.add_value("experience", res)
            loader.add_value("education", res)
            loader.add_value("job_kind", res)
            loader.add_value("advantage", res)
            loader.add_value("company_name", res)
            loader.add_value("company_size", res)
            loader.add_value("company_industry", res)
            loader.add_value("id", res)
            loader.add_value("link", self.job_detail_url.format(id=loader.get_output_value("id")))
            this_item = loader.load_item()
            yield this_item
            # Optionally follow the detail page:
            # yield Request(url=this_item.get("link"), headers=self.header_dict,
            #               meta={"cookiejar": uuid.uuid4(), "item": this_item},
            #               callback=self.parse_other, priority=5)
    else:
        # The request failed: fetch the home page for fresh cookies,
        # then retry this listing request with them.
        key = uuid.uuid4()
        yield Request(url=self.get_cookies_url,
                      callback=self.empty,
                      meta={"cookiejar": key},
                      headers=self.header_dict,
                      priority=5,
                      dont_filter=True)
        yield FormRequest(url=response.url,
                          formdata={"first": "true", "pn": str(response.meta['page']), 'kd': ""},
                          callback=self.parse,
                          meta={"cookiejar": key, "page": response.meta['page']},
                          method="POST",
                          headers=self.header_dict,
                          priority=4,
                          dont_filter=True)

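# The loader above leans on input processors to pick each field out of the raw
# result dict passed to add_value(). A minimal sketch of how LagouItemLoader
# could be declared for that to work; the class body and the field-to-JSON-key
# mapping are assumptions, not from the source:
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, SelectJmes, TakeFirst

class LagouItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # Each input processor extracts one key from the raw result dict
    job_name_in = MapCompose(SelectJmes("positionName"))
    salary_in = MapCompose(SelectJmes("salary"))
    id_in = MapCompose(SelectJmes("positionId"), str)
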
def parse(self, response):
    json_data = response.text
    if '<html>' not in json_data:
        data = json.loads(json_data, object_pairs_hook=OrderedDict)
        if data['success'] is True:
            # Page number reported by the JSON payload
            self.page_no = data['content']['pageNo']
            if self.page_no != 0:  # the original `while` here would never terminate
                result = data['content']['positionResult']['result']
                for item in result:
                    lagou_data = LagouItem()
                    # Row id (MongoDB-style, though MongoDB is not used here)
                    lagou_data['_id'] = string_to_md5(string=str(item['companyId']) + str(item['positionId']))
                    lagou_data['from_website'] = "拉勾"
                    # Salary range (min/max)
                    try:
                        salary = item.get('salary').split('-')
                        lagou_data['min_salary'] = salary[0]
                        lagou_data['max_salary'] = salary[1]
                    except IndexError:
                        # "Nk以上" means "Nk and above": no upper bound.
                        # The original stored the whole list here; index [0] fixes that.
                        salary = item.get('salary').split('以上')
                        lagou_data['min_salary'] = salary[0]
                        lagou_data['max_salary'] = '不限'
                    # Work location
                    try:
                        lagou_data['location'] = item['city'] + item['district']
                    except TypeError:
                        lagou_data['location'] = item['city']
                    # Publish time
                    lagou_data['publish_date'] = int(time_to_timestamp(time_str=item['createTime']))
                    lagou_data['work_type'] = item['jobNature']  # job type
                    lagou_data['work_experience'] = item['workYear']  # years of experience
                    lagou_data['limit_degree'] = item['education']  # education level
                    lagou_data['people_count'] = 0  # number of openings
                    lagou_data['work_name'] = item['positionName']  # position name
                    lagou_data['work_duty'] = ""  # responsibilities
                    lagou_data['work_need'] = ""  # requirements
                    lagou_data['business_name'] = item['companyFullName']  # company name
                    lagou_data['business_type'] = item['financeStage']  # financing stage
                    lagou_data['business_count'] = item['companySize']  # company headcount
                    lagou_data['business_industry'] = item['industryField']  # industry
                    # Company and job detail page URLs
                    company_url = 'https://www.lagou.com/gongsi/%s.html' % item['companyId']
                    job_url = 'https://www.lagou.com/jobs/%s.html' % item['positionId']
                    lagou_data['work_info_url'] = job_url
                    yield scrapy.Request(url=job_url,
                                         method="GET",
                                         cookies=ALL_COOKIES,
                                         callback=self.parse_job_info,
                                         headers=HEADERS,
                                         meta={"lagou_data": lagou_data, "company_url": company_url},
                                         dont_filter=False)
                # Next page
                url = self._url.format(self._search_name, self.page_no + 1)
                yield scrapy.Request(url=url, method="GET", callback=self.parse)
        else:
            raise CloseSpider(reason="End of Page num!")

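# The min/max salary split above recurs across these spiders (see also the
# parse_position pagination further down). A standalone sketch of the same
# rule, assuming Lagou's "10k-20k" and "15k以上" ("15k and above") formats;
# `split_salary` is a hypothetical helper, not from the source:
def split_salary(salary: str):
    """Return (min, max); max is None when the posting has no upper bound."""
    s = salary.lower().replace('k', '')
    if '-' in s:
        lo, hi = s.split('-', 1)
        return lo, hi
    if '以上' in s:  # "and above"
        return s.replace('以上', ''), None
    return s, s

# e.g. split_salary("10k-20k") -> ("10", "20"); split_salary("15k以上") -> ("15", None)
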
def parse(self, response):
    infojson = json.loads(response.body)
    result = infojson['content']['positionResult']['result']
    if result:
        for i in result:
            # Create a fresh item per result instead of mutating one shared instance
            item = LagouItem()
            item['companySize'] = i['companySize']
            item['firstType'] = i['firstType']
            item['appShow'] = i['appShow']
            item['pcShow'] = i['pcShow']
            item['positionName'] = i['positionName']
            item['education'] = i['education']
            item['financeStage'] = i['financeStage']
            item['city'] = i['city']
            item['companyLogo'] = i['companyLogo']
            item['district'] = i['district']
            item['companyId'] = i['companyId']
            item['explain'] = i['explain']
            item['industryField'] = i['industryField']
            item['createTime'] = i['createTime']
            item['positionLables'] = i['positionLables']
            item['score'] = i['score']
            item['adWord'] = i['adWord']
            item['formatCreateTime'] = i['formatCreateTime']
            item['salary'] = i['salary']
            item['workYear'] = i['workYear']
            item['lastLogin'] = i['lastLogin']
            item['jobNature'] = i['jobNature']
            item['deliver'] = i['deliver']
            item['gradeDescription'] = i['gradeDescription']
            item['imState'] = i['imState']
            item['companyFullName'] = i['companyFullName']
            item['companyLabelList'] = i['companyLabelList']
            item['positionId'] = i['positionId']
            item['companyShortName'] = i['companyShortName']
            item['approve'] = i['approve']
            item['businessZones'] = i['businessZones']
            item['plus'] = i['plus']
            item['secondType'] = i['secondType']
            item['positionAdvantage'] = i['positionAdvantage']
            item['publisherId'] = i['publisherId']
            item['promotionScoreExplain'] = i['promotionScoreExplain']
            yield item
        # Request the next page once; the original `while 1:` loop would yield
        # the same page forever and only worked because of the dupefilter.
        self.pn += 1
        first = 'true' if self.pn == 1 else 'false'
        yield FormRequest(url=self.urls,
                          headers=self.headers,
                          formdata={'first': first, 'pn': str(self.pn)},
                          callback=self.parse)
    else:
        print('-' * 47)
        print('haoxiangmeiyoule')  # pinyin for 好像没有了: "looks like there are no more"
        print('-' * 47)
        return

def parse_item(self, response):
    self.logger.info('Hi, this is an item page! %s', response.url)
    item = LagouItem()
    # Company name
    co_name = response.xpath("//div[@class='company']/div[@class='company_name']/a/text()").get()
    # Position name
    name = response.xpath("//li[@class='con_list_item default_list']//div[@class='p_top']//h3/text()").get()
    # Salary
    salary = response.xpath("//div[@class='postion']//div[@class='p_bot']//span[@class='money']/text()").get()
    # Area
    area = response.xpath("//div[@class='postion']//div[@class='p_bot']//span[@class='add']/em/text()").get()
    # Years of experience
    exp = response.xpath("//div[@class='postion']//div[@class='p_bot']//div[@class='li_b_l']/text()").get()
    # Education (same node as experience on this page)
    edu = response.xpath("//div[@class='postion']//div[@class='p_bot']//div[@class='li_b_l']/text()").get()
    # Publish time
    time = response.xpath("//div[@class='p_top']//span[@class='format-time']/text()").get()
    time = self.getVal(time)
    # Job description
    info = response.xpath(
        "//li[@data-positionname=$val]//div[@class='list_item_bot']/div[@class='li_b_l']/span/text()",
        val=name).getall()
    info = ";".join(info)
    # Work location
    local = response.xpath("//div[@class='postion']//div[@class='p_bot']//span[@class='add']//em/text()").get()
    # Benefits
    welfare = response.xpath("//div[@class='list_item_bot']//div[@class='li_b_r']/text()").get()
    co_url = ""  # company website (not on the page)
    num = '0'  # number of openings (not on the page)
    # Company category
    co_type = response.xpath('//dl[@class="industry"]/text()').get()
    item['name'] = name
    item['co_name'] = co_name
    item['area'] = area
    item['salary'] = salary
    item['exp'] = exp
    item['edu'] = edu
    item['num'] = num
    item['time'] = time
    item['welfare'] = welfare
    item['info'] = str(info)
    item['local'] = local
    item['co_url'] = co_url
    item['co_type'] = co_type
    yield item

def parse(self, response): """ json.loads()将json格式转为python格式,按照html响应的json文件格式进行字典引用 增加time.sleep()是防止出现KeyError: "content"错误, 应该是爬取页面太快, ajax未成功加载出 """ #time.sleep(10) jdict = json.loads(response.body) #time.sleep(10) try: jcontent = jdict["content"]["positionResult"] except Exception, e: print repr(e) + "?" * 30 jdict = json.loads(response.body) jcontent = jdict["content"]["positionResult"] finally: jresult = jcontent["result"] for each in jresult: item = LagouItem() item['city'] = each["city"].encode('utf-8') item['company'] = each["companyFullName"].encode('utf-8') item['size'] = each["companySize"].encode('utf-8') item['zone'] = each["district"].encode('utf-8') item['createtime'] = each["createTime"].encode('utf-8') item['labels'] = each["positionLables"].encode('utf-8') item['positionName'] = each["positionName"].encode('utf-8') item['salary'] = each["salary"].encode('utf-8') item['education'] = each["education"].encode('utf-8') item['workyear'] = each["workYear"].encode('utf-8') print "-" * 30 yield item
def parse_position(self, response):
    post_headers = self.headers.copy()
    post_headers.update({
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_'
    })
    try:
        jdict = json.loads(response.body)
        print(jdict)
    except json.decoder.JSONDecodeError:
        yield scrapy.http.FormRequest(
            url='{post_url}&city={city}'.format(post_url=self.post_url, city=self.city),
            headers=post_headers,
            formdata={'pn': str(self.curpage), 'first': 'true', 'kd': self.key},
            dont_filter=True,
            callback=self.parse_position)
        # `jdict` is unbound past this point; the original fell through and crashed
        return
    if not jdict.get('success', None):
        print('Spiders are identified')
        yield scrapy.http.FormRequest(
            url='{post_url}&city={city}'.format(post_url=self.post_url, city=self.city),
            headers=post_headers,
            formdata={'pn': str(self.curpage), 'first': 'true', 'kd': self.key},
            dont_filter=True,
            callback=self.parse_position)
    else:
        jcontent = jdict['content']
        jposresult = jcontent["positionResult"]
        positions = jposresult["result"]
        for _position in positions:
            # Create a fresh item per position instead of mutating one shared instance
            item = LagouItem()
            item['city'] = _position['city']  # city
            item['position_name'] = _position['positionName']  # position name
            item['business_zones'] = _position['businessZones'] or ['']  # work zones
            item['company_full_name'] = _position['companyFullName']  # full company name
            item['company_short_name'] = _position['companyShortName']  # short company name
            item['company_lable_list'] = _position['companyLabelList'] or ['']  # company perks
            item['company_size'] = _position['companySize']  # company size
            item['education'] = _position['education']  # education requirement
            item['finance_stage'] = _position['financeStage'] or None  # financing stage
            item['first_type'] = self.key_list[self.direction_index]['first_type']  # first-level category
            item['industry_field'] = _position['industryField'] or None  # company industry
            item['job_nature'] = _position['jobNature'] or None  # job nature
            item['position_lables'] = _position['positionLables'] or ['']  # position labels
            item['salary'] = _position['salary']  # salary range, e.g. "10k-20k"
            temp_list = _position['salary'].split('-')
            if len(temp_list) == 1:
                item['salary_max'] = int(temp_list[0][:temp_list[0].find('k')])
            else:
                item['salary_max'] = int(temp_list[1][:temp_list[1].find('k')])
            item['salary_min'] = int(temp_list[0][:temp_list[0].find('k')])
            item['salary_avg'] = (item['salary_max'] + item['salary_min']) / 2
            item['second_type'] = self.key
            item['work_year'] = _position['workYear']
            yield item
        if not self.total_page_count:
            self.total_page_count = jposresult['totalCount'] // 15 + 1
            print(self.total_page_count)
            # self.total_page_count = 30 if self.total_page_count > 30 else self.total_page_count
        if self.curpage < self.total_page_count - 1:
            # Next page for the current keyword and city
            self.curpage += 1
            print('Turn pages to ', self.curpage)
            yield scrapy.http.FormRequest(
                url='{post_url}&city={city}'.format(post_url=self.post_url, city=self.city),
                headers=post_headers,
                formdata={'pn': str(self.curpage), 'first': 'true', 'kd': self.key},
                callback=self.parse_position)
        elif self.city_index < len(self.city_list) - 1:
            # All pages done: move to the next city for the current keyword
            self.city_index += 1
            self.curpage = 1
            self.total_page_count = None
            self.key = self.key_words[self.key_index]
            self.city = self.city_list[self.city_index]
            print('Change the city to ', self.city)
            yield scrapy.http.FormRequest(
                url=self.post_url + '&city={}'.format(self.city),
                headers=post_headers,
                formdata={'pn': str(self.curpage), 'first': 'true', 'kd': self.key},
                callback=self.parse_position)
        elif self.key_index < len(self.key_words) - 1:
            # All cities done: move to the next keyword, starting from the first city
            self.curpage = 1
            self.key_index += 1
            self.city_index = 0
            self.total_page_count = None
            self.city = self.city_list[self.city_index]
            self.key = self.key_words[self.key_index]
            print('Change the keyword to ', self.key)
            yield scrapy.http.FormRequest(
                url='{post_url}&city={city}'.format(post_url=self.post_url, city=self.city),
                headers=post_headers,
                formdata={'pn': str(self.curpage), 'first': 'true', 'kd': self.key},
                callback=self.parse_position)
        else:
            # All keywords done: move to the next direction (first-level category)
            self.direction_index += 1
            self.curpage = 1
            self.city_index = 0
            self.key_index = 0
            self.total_page_count = None
            self.key_words = self.key_list[self.direction_index]['second_types']
            self.city = self.city_list[self.city_index]
            self.key = self.key_words[self.key_index]
            yield scrapy.http.FormRequest(
                url='{post_url}&city={city}'.format(post_url=self.post_url, city=self.city),
                headers=post_headers,
                formdata={'pn': str(self.curpage), 'first': 'true', 'kd': self.key},
                callback=self.parse_position)