def parse(self, response):
    """Extract one job posting from its detail page into a ZhaopinItem."""
    item = ZhaopinItem()
    item['url'] = response.url

    def grab(xpath_expr, key):
        # Store the first node matched by *xpath_expr* under *key*.
        item[key] = response.xpath(xpath_expr).extract_first()

    # Header: position title and salary.
    top = "//div[@class='top-txt']"
    grab(f"{top}/span[contains(@class, 'title')]/text()", 'position')
    grab(f"{top}/span[@class='salary']/text()", 'salary')

    # Summary list: area / education / experience / head count.
    summary = "//ul[@class='list-left left']"
    grab(f"{summary}//span[@class='area']/../text()", 'area')
    grab(f"{summary}//span[@class='xueli']/../text()", 'education')
    grab(f"{summary}//span[@class='minge']/../text()", 'experience')
    grab(f"{summary}//span[@class='num']/../text()", 'number')

    # Abstract box: job type, publish date, validity window, duty text.
    abstract = "//div[@class='abs']"
    grab(f"{abstract}/p[contains(text(), '职位类型')]/text()", 'type_')
    grab(f"{abstract}/p[contains(text(), '发布时间')]/text()", 'pubdate')
    grab(f"{abstract}/p[contains(text(), '有效日期')]/text()", 'validate')
    grab(f"{abstract}/p[@class='duty duty-box']/text()", 'duty')

    # Company details block.
    details = "//div[@class='details']"
    grab(f"{details}//div[@class='right txt']/p/text()", 'company')
    grab(f"{details}/p[contains(text(), '性质')]/text()", 'co_property')
    grab(f"{details}/p[contains(text(), '规模')]/text()", 'co_scale')
    grab(f"{details}/p[contains(text(), '类型')]/text()", 'co_type')
    grab(f"{details}/p[contains(text(), '工作地点')]/text()", 'address')
    yield item
def parse_detail(self, response):
    """Collect one recruitment announcement's fields into a ZhaopinItem."""
    # Item key -> xpath for the first matching text node.
    selectors = {
        'title': '//div[@class="title"]/text()',              # subject
        'starttime': '//div[@class="starttime"]/text()',      # opening date
        'endtime': '//div[@class="endtime"]/text()',          # closing date
        'cityname': '//div[@class="cityname"]/text()',        # hiring city
        'address': '//div[@class="address"]/text()',          # street address
        'content': '//div[@class="middleLeft"]/p[1]/text()',  # body text
    }
    item = ZhaopinItem()
    item['url'] = response.url  # link back to the source page
    for key, xpath in selectors.items():
        item[key] = response.xpath(xpath).extract_first()
    yield item
def processingThisPage(self, response):
    """Parse one search-result page: yield an item per listing table plus a
    follow-up request for that listing's detail page."""
    print(response.url)
    print(response.xpath("//title/text()"))
    tables = response.xpath(
        '//div[@id="newlist_list_content_table"]/table')
    # TODO: consider pruning unwanted elements up front.
    # Item key -> xpath relative to the listing's <table>.
    field_paths = (
        ('jobName', ".//tr[1]/td[1]/div/a[1]//text()"),                  # job title
        ('company', ".//tr[1]/td[3]//text()"),                           # employer
        ('salary', ".//tr[1]/td[4]//text()"),                            # pay range
        ('location', ".//tr[1]/td[5]//text()"),                          # work city
        ('enterprise', ".//tr[2]/td/div/div/ul/li[1]/span[2]//text()"),  # company nature
        ('scale', ".//tr[2]/td/div/div/ul/li[1]/span[3]//text()"),       # head count
        ('experience', ".//tr[2]/td/div/div/ul/li[1]/span[4]//text()"),  # required experience
        ('backGroup', ".//tr[2]/td/div/div/ul/li[1]/span[5]//text()"),   # required education
        ('require', ".//tr[2]/td/div/div/ul/li[2]//text()"),             # job requirements
    )
    for table in tables:
        items = ZhaopinItem()
        for key, path in field_paths:
            items[key] = ''.join(table.xpath(path).extract())
        # Follow the listing's detail page for the full description.
        detail_url = table.xpath(".//tr[1]/td[1]/div/a[1]/@href").extract()
        if not detail_url:
            # NOTE(review): a row without a link is dropped entirely — its
            # partially filled item is never yielded. Confirm intended.
            continue
        yield scrapy.Request(detail_url[0], callback=self.loadDetailPage)
        yield items
def parse(self, response):
    """Yield one ZhaopinItem per row of the search-result tables.

    Bug fix: the original created a single ZhaopinItem before the loop and
    mutated it on every iteration, so every yielded reference aliased the
    same object and later rows clobbered earlier ones before the pipeline
    processed them. A fresh item is now built per row.
    """
    # [1:] skips the first 'newlist' table — presumably the header row,
    # not a job listing; TODO confirm against the page markup.
    jobs = response.xpath('//table[@class="newlist"]')[1:]
    for job in jobs:
        item = ZhaopinItem()
        item['jobname'] = job.xpath(
            './/td[@class="zwmc"]//div/a[1]/text()').extract_first()
        item['companyname'] = job.xpath(
            './/td[@class="gsmc"]/a[1]/text()').extract_first()
        item['salary'] = job.xpath(
            './/td[@class="zwyx"]/text()').extract_first()
        item['workingplace'] = job.xpath(
            './/td[@class="gzdd"]/text()').extract_first()
        item['posttime'] = job.xpath(
            './/td[@class="gxsj"]/span/text()').extract_first()
        yield item
def parse_item(self, response):
    """Scrape a tablelist-style job detail page into a ZhaopinItem.

    Fixes in this revision:
    - ``print item['name']`` was Python-2 syntax, a SyntaxError under
      Python 3 (this file already uses f-strings elsewhere).
    - Every field evaluated the same xpath twice (once for the emptiness
      check, once for the value); each xpath now runs a single time.
    """
    table = '//table[@class="tablelist textl"]'

    def first_or_all(xpath):
        # Original semantics: the raw (empty) extract() list when nothing
        # matched, otherwise the first matched string.
        values = response.xpath(xpath).extract()
        return values[0] if values else values

    item = ZhaopinItem()
    item['name'] = first_or_all(table + '//tr[1]/td/text()')
    print(item['name'])
    item['location'] = first_or_all(table + '//tr[2]/td[1]/text()')
    item['type'] = first_or_all(table + '//tr[2]/td[2]/text()')
    # Head count: the trailing character is dropped — presumably a unit
    # suffix (e.g. '人'); TODO confirm against live markup.
    num = response.xpath(table + '//tr[2]/td[3]/text()').extract()
    item['num'] = num[0][:-1] if num else num
    item['zhize'] = first_or_all(table + '//tr[3]//li/text()')   # duties
    item['yaoqiu'] = first_or_all(table + '//tr[4]//li/text()')  # requirements
    item['url'] = response.url
    yield item
def processJobDetail(self, response):
    """Build a ZhaopinItem from a single job-detail page."""
    items = ZhaopinItem()
    # Anchor node; the relative "ul/li[...]" selectors below hang off it,
    # while the "/html/body/..." ones are absolute regardless.
    x = response.xpath("/html/body/div[6]/div[1]")

    def joined(path):
        # All text under *path*, concatenated into one string.
        return ''.join(x.xpath(path).extract())

    items['jobName'] = joined("/html/body/div[5]/div[1]/div[1]/h1//text()")
    items['company'] = joined("/html/body/div[5]/div[1]/div[1]/h2//text()")
    items['salary'] = joined("ul/li[1]//text()")
    items['location'] = joined("ul/li[2]//text()")
    items['enterprise'] = joined(
        "/html/body/div[6]/div[2]/div[1]/ul/li[2]//text()")
    items['scale'] = joined(
        "/html/body/div[6]/div[2]/div[1]/ul/li[1]//text()")
    items['experience'] = joined("ul/li[5]//text()")
    items['backGroup'] = joined("ul/li[6]//text()")
    # Full description, with whitespace noise stripped out.
    detail = joined("/html/body/div[6]/div[1]/div[1]/div/div[1]//text()")
    items['detail'] = detail.replace(' ', '').replace('\r\n', '')
    # Original url of this detail page.
    items['linkUrl'] = response.url
    yield items
def parse(self, response):
    """Yield one item per job card on a liepin.com listing page, then
    follow the pager to the next page.

    Fixes: the pager-link list was bound to ``next`` (shadowing the
    builtin) and indexed unconditionally with ``[7]``, which raises
    IndexError on pages whose pager has fewer links (e.g. the last page).
    The index is now guarded so the spider stops cleanly instead.
    """
    for card in response.css('.sojob-list li'):
        item = ZhaopinItem()
        item['zhiwu'] = card.css('.job-info h3 a::text').extract_first()    # position
        item['danwei'] = card.css('.company-name a::text').extract_first()  # employer
        item['gongzi'] = card.css('.text-warning::text').extract_first()    # salary
        item['chengshi'] = card.css('.area::text').extract_first()          # city
        item['link'] = card.css('.job-info h3 a::attr(href)').extract_first()
        yield item
    pager_links = response.css('.pagerbar a::attr(href)').extract()
    # The 8th pager link is assumed to be "next page" — TODO confirm.
    if len(pager_links) > 7:
        next_url = 'https://www.liepin.com' + pager_links[7]
        yield scrapy.Request(url=next_url,
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Scrape a zhaopin.com city listing: one item per posting, fetching
    each posting's page synchronously for salary and location, then queue
    pages 2-100.

    Fixes:
    - A single ZhaopinItem was mutated and re-yielded on every iteration,
      so all yielded items aliased one object; a fresh item is now built
      per posting.
    - ``money[0]`` / ``location[0]`` raised IndexError whenever a detail
      page did not match the regex; empty matches now fall back to ''.
    - The two regexes are compiled once, outside the loop.

    NOTE(review): urllib blocks Scrapy's reactor on every detail page;
    consider yielding a scrapy.Request per link instead.
    """
    names = response.xpath("//span[@class='post']/a/text()").extract()
    links = response.xpath("//span[@class='post']/a/@href").extract()
    companies = response.xpath(
        "//span[@class='company_name']/a/text()").extract()
    salary_pat = re.compile('<li><span>职位月薪:</span><strong>(.*?) <a', re.S)
    location_pat = re.compile(
        '<li><span>工作地点:</span><strong><a target="_blank" href=".*?">(.*?)</a>',
        re.S)
    for name, link, company in zip(names, links, companies):
        data = urllib.request.urlopen(link).read().decode('utf-8', 'ignore')
        money = salary_pat.findall(data)
        location = location_pat.findall(data)
        item = ZhaopinItem()
        item['money'] = money[0] if money else ''
        item['location'] = location[0] if location else ''
        item['name'] = name
        item['link'] = link
        item['company'] = company
        yield item
    # Queue the remaining listing pages for this city.
    for page in range(2, 101):
        url = 'http://jobs.zhaopin.com/guangzhou/p' + str(page)
        print('正在爬取第' + str(page) + '页')
        yield Request(url=url, callback=self.parse, headers=self.headers)
def parse_job(self, response):
    """Parse a zhaopin.com "terminalpage" job page into a ZhaopinItem.

    Fix: ``extract_first().strip()`` raised AttributeError whenever the
    node was absent (extract_first() returns None); the stripped fields
    now fall back to '' first. Fields that never stripped keep their
    original None-when-missing behaviour.
    """
    left = "//div[contains(@class,'terminalpage-left')]/ul"
    item = ZhaopinItem()
    item['job_url'] = response.url
    item['job_title'] = response.xpath(
        "//div[contains(@class,'inner-left fl')]/h1/text()").extract_first()
    item['company'] = (response.xpath(
        "//div[contains(@class,'inner-left fl')]/h2//text()"
    ).extract_first() or '').strip()
    item['salary'] = (response.xpath(
        left + "/li[1]/strong/text()").extract_first() or '').strip()
    # Location may span several text nodes (links + plain text) — join them.
    item['location'] = "".join(response.xpath(
        left + "/li[2]/strong//text()").extract())
    item['post_time'] = response.xpath(
        left + "/li[3]/strong//text()").extract_first()
    item['type_of_empl'] = response.xpath(
        left + "/li[4]/strong/text()").extract_first()
    item['work_exp'] = response.xpath(
        left + "/li[5]/strong/text()").extract_first()
    item['min_edu_qual'] = response.xpath(
        left + "/li[6]/strong/text()").extract_first()
    item['num_of_ppl'] = (response.xpath(
        left + "/li[7]/strong/text()").extract_first() or '').strip()
    item['occup_type'] = response.xpath(
        left + "/li[8]/strong//text()").extract_first()
    # Free-text sections are delegated to sibling helpers.
    item['job_desc'] = self.parse_job_desc(response)
    item['co_profile'] = self.parse_co_profile(response)
    yield item
def parse(self, response):
    """Parse a Lagou search-API JSON response: yield one detail-page
    request per recruiter entry, or log a diagnostic when the call failed.

    Fixes: the response body was re-parsed with ``json.loads`` up to four
    times (now parsed once); ``isSuccess == True`` compared against a
    literal; ``msg = print(...)`` pointlessly bound print's None return.
    """
    pn = response.meta['pn']
    referer = response.meta['referer']
    payload = json.loads(response.text)
    if payload['success']:
        if payload['content']['pageNo'] != 0:
            print(f'第{pn}页页面信息获取成功!')
        else:
            # pageNo == 0: the API answered but served no page — warn loudly.
            print(f'\033[1;31m ***Warning:第{pn}页页面信息获取失败!*** \033[0m')
            print(f'\033[1;31m {response.text} \033[0m')
            print(
                '{0}可能的错误:\n1.district对应的bizArea不一致。\n2.没有更多页面!{1}'.format(
                    '\033[1;31m', '\033[0m'))
            print('\033[1;31m {0} \033[0m \n'.format(30 * '*'))
        kd = response.meta['kd']
        for zhaopinId, comInfo in payload['content']['hrInfoMap'].items():
            item = ZhaopinItem()
            item['keyWord'] = kd
            item['zhaopinId'] = zhaopinId
            item['userId'] = comInfo['userId']
            item['phone'] = comInfo['phone']
            if comInfo['positionName'] not in ('', None):
                # NOTE(review): this replace is a no-op as written —
                # presumably meant to decode an HTML entity; confirm.
                item['positionName'] = comInfo['positionName'].replace(
                    '&', '&').strip()
            else:
                item['positionName'] = None
            item['receiveEmail'] = comInfo['receiveEmail']
            item['realName'] = comInfo['realName']
            if comInfo['portrait'] is not None:
                item['portrait'] = (
                    'https://www.lgstatic.com/thumbnail_300x300/'
                    + comInfo['portrait'])
            else:
                item['portrait'] = None
            item['userLevel'] = comInfo['userLevel']
            item['canTalk'] = comInfo['canTalk']
            pageUrl = f'https://www.lagou.com/jobs/{zhaopinId}.html'
            hdsd = {
                'Host': 'www.lagou.com',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Language':
                'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                'Referer': referer,
                'Connection': 'keep-alive'
            }
            yield Request(url=pageUrl,
                          callback=self.detail_parse,
                          cookies=self.cookie,
                          dont_filter=True,
                          headers=hdsd,
                          meta={'item': item, 'pn': pn})
    else:
        print("\033[1;31m{0}\n+访问失败:{1}!+\n{0}\033[0m".format(
            32 * '+', payload['msg']))
        return
def parse(self, response):
    """Parse one listing page; on 404 (past the last page) advance to the
    next city in ``self.location_list`` and restart from page 1.

    Fix: the four ``detail_items`` fields (experience, education, company
    size, company nature) were indexed with ``[0]`` instead of ``[i]``, so
    every item on a page carried the values of the page's first listing.
    """
    if response.status != 404:
        containers = response.xpath(
            "//div[@class='details_container bg_container ']")
        for i, container in enumerate(containers):
            # NOTE(review): these xpaths are absolute ("//..."), so they
            # search the whole document and rely on [i] to pick the i-th
            # listing's node rather than being relative to *container*.
            def nth(xpath, idx=i):
                return container.xpath(xpath)[idx].extract()

            item = ZhaopinItem()
            item['zhiwei'] = nth("//span[@class='post']/a/text()")
            item['zhiwei_url'] = nth("//span[@class='post']//a/@href")
            item['gongsi'] = nth("//span[@class='company_name']/a/text()")
            item['gongsi_url'] = nth("//span[@class='company_name']/a/@href")
            item['yuexin'] = nth("//span[@class='salary']/text()")
            item['gongzuodidian'] = nth("//span[@class='address']/text()")
            item['faburiqi'] = nth("//span[@class='release_time']/text()")
            detail = "//div[@class='fleft detail_items']"
            # Each span reads "label:value"; keep only the value part.
            item['gongzuojingyan'] = nth(
                detail + "/span[1]/text()").split(":")[1]
            item['xueli'] = nth(detail + "/span[2]/text()").split(":")[1]
            item['gongsiguimo'] = nth(detail + "/span[3]/text()").split(":")[1]
            item['gongsixingzhi'] = nth(
                detail + "/span[4]/text()").split(":")[1]
            item['location'] = self.location_list[self.cnt]
            yield item
        # Next page of the current city.
        self.num += 1
        url = self.base_url.format(num=self.num,
                                   location=self.location_list[self.cnt])
        yield scrapy.Request(url, callback=self.parse)
    else:
        print(
            "==============================================================================="
        )
        # 404 means this city is exhausted; move on to the next one.
        if self.cnt < len(self.location_list) - 1:
            self.num = 1
            self.cnt += 1
            url = self.base_url.format(
                num=self.num, location=self.location_list[self.cnt])
            yield scrapy.Request(url, callback=self.parse)