def parse(self, response):
    """Parse one listing page: yield a TencentItem per job row, then request the next page."""
    for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
        item = TencentItem()
        try:
            # Job title and detail link live in the first cell's anchor.
            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
            item['positionLink'] = 'https://hr.tencent.com/' + each.xpath("./td[1]/a/@href").extract()[0]
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            item['positionNum'] = each.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        except IndexError:
            # A malformed row skips only itself; the original try wrapped the
            # whole loop, so one bad row silently dropped the rest of the page.
            continue
        yield item

    if self.pageNum >= 1680:
        # Last page reached: just stop. The original `raise("检索完成")` raised
        # a TypeError (strings are not exceptions).
        return
    self.pageNum += 10
    # Re-enter this parser with the next page's request.
    yield scrapy.Request(self.url + str(self.pageNum), callback=self.parse)
def parse(self, response):
    """Extract job rows from the listing table and follow the "next" pager link."""
    node_list = response.xpath("//*[@class='even']|//*[@class='odd']")
    if not node_list:
        return
    for node in node_list:
        # One fresh item per row: the original reused a single instance, so
        # every reference handed to the pipeline pointed at the last row.
        items = TencentItem()
        items["name"] = node.xpath("./td[1]/a/text()").extract_first()  # extract_first returns one value
        items["detailLink"] = node.xpath("./td[1]/a/@href").extract_first()
        items["positionInfo"] = node.xpath("./td[2]/text()").extract_first()
        items["peopleNumber"] = node.xpath("./td[3]/text()").extract_first()
        items["workLocation"] = node.xpath("./td[4]/text()").extract_first()
        items["publishTime"] = node.xpath("./td[5]/text()").extract_first()
        yield items

    url = response.xpath("//a[@id='next']/@href").extract_first()
    print("-" * 50, url)
    # On the last page the link is absent or a javascript stub; guard so the
    # string concatenation below cannot raise TypeError on None.
    if url and not url.startswith("javascript"):
        yield scrapy.Request("https://hr.tencent.com/" + url, callback=self.parse)
def parse(self, response):
    """Yield one TencentItem per listing row, then page forward by offset."""
    position_lists = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for position in position_lists:
        item = TencentItem()
        # Use .get() for every field: it returns None instead of raising when
        # a cell is empty, matching how the other fields on this row are read
        # (the original mixed .extract()[0] and .get()).
        item['position_name'] = position.xpath("./td[1]/a/text()").get()
        item['position_link'] = position.xpath("./td[1]/a/@href").get()
        item['position_type'] = position.xpath("./td[2]/text()").get()
        item['people_num'] = position.xpath("./td[3]/text()").get()
        item['work_address'] = position.xpath("./td[4]/text()").get()
        item['publish_time'] = position.xpath("./td[5]/text()").get()
        yield item

    # Total page count shown in the pager footer; stop cleanly when missing.
    total_page = response.xpath('//div[@class="left"]/span/text()').get()
    if total_page and self.offset < int(total_page):
        self.offset += 10
        new_url = "https://hr.tencent.com/position.php?&start=" + str(self.offset)
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Collect the job postings on the current page and queue the next one."""
    rows = response.xpath("//tr[@class='even'] |//tr[@class='odd']")
    for row in rows:
        item = TencentItem()
        item["positionname"] = row.xpath("./td[1]/a/text()").extract_first()  # job title
        item["positionlink"] = row.xpath("./td[1]/a/@href").extract_first()   # detail URL
        item["positionType"] = row.xpath("./td[2]/text()").extract_first()    # category
        item["peopleNum"] = row.xpath("./td[3]/text()").extract_first()       # headcount
        item["worlLocation"] = row.xpath("./td[4]/text()").extract_first()    # work location
        item["publishTime"] = row.xpath("./td[5]/text()").extract_first()     # publish date
        yield item
    # After a full page, bump the offset by 10 and send the next page's
    # request back through the scheduler/downloader to this same parser.
    if self.offset < 60:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse the JSON job-list API response: yield items, then request the remaining pages.

    :param response: response routed downloader -> engine -> spider
    :return: items and follow-up page requests
    """
    response_dict = json.loads(response.text)
    response_data = response_dict["Data"]
    info_count = response_data["Count"]  # total number of postings
    for info in response_data["Posts"]:
        item = TencentItem()
        item["Name"] = info["RecruitPostName"]
        item["Location"] = info["CountryName"] + info["LocationName"]
        item["BG"] = info["BGName"]
        item["Category"] = info["CategoryName"]
        item["Responsibility"] = info["Responsibility"]
        item["Time"] = info["LastUpdateTime"]
        item["URL"] = info["PostURL"]
        yield item

    # Pages run 1..ceil(count/20). range()'s stop is exclusive, so the bound
    # must be num + 1 — the original range(2, num) silently skipped the last page.
    num = ceil(int(info_count) / 20)
    for page in range(2, num + 1):
        next_url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1566111052462&pageSize=20&language=zh-cn&area=cn&pageIndex={}".format(
            page)
        # No callback given: Scrapy routes the response back to parse().
        yield scrapy.Request(url=next_url)
def parse_detail(self, response):
    """Parse an article detail page into a TencentItem.

    :param response: detail-page response
    :return: populated item, or None when title/content is missing
    """
    tencent_item = TencentItem()
    soup = BeautifulSoup(response.body.decode("utf-8"), "html.parser")
    url = response.url
    headings = soup.select("h1")
    if not headings:
        # No <h1> at all: log and bail out instead of raising IndexError.
        logging.log(logging.ERROR, url)
        return None
    title = headings[0].text
    create_date = get_now_time()
    url_object_id = get_md5(url)

    content = ""
    # NOTE(review): nesting reconstructed from flattened source — presumably
    # body text is extracted only when no video player is present; confirm.
    if len(soup.select(".videoPlayer")) == 0:
        if len(soup.select("p")) > 0:
            for element in soup.select("p"):
                content = content + remove_special_label(element.text)
        for element in soup.select(".text"):
            content = content + remove_special_label(element.text)

    tencent_item["title"] = title
    tencent_item["create_date"] = create_date
    tencent_item["url"] = url
    tencent_item["url_object_id"] = url_object_id
    tencent_item['content'] = content
    # Boolean `and`, not bitwise `&`, for the validity check.
    if len(title) > 0 and len(content) > 0:
        return tencent_item
    logging.log(logging.ERROR, url)
    return None
def parse(self, response):
    """Parse the job-list JSON API: yield each item, then request its detail page.

    :param response: response routed downloader -> engine -> spider
    :return: items and detail-page requests
    """
    payload = json.loads(response.text)
    data = payload["Data"]
    info_count = data["Count"]  # total number of postings shown
    for info in data["Posts"]:
        item = TencentItem()
        item["Name"] = info["RecruitPostName"]
        item["Location"] = info["CountryName"] + info["LocationName"]
        item["BG"] = info["BGName"]
        item["Category"] = info["CategoryName"]
        # item["Responsibility"] = info["Responsibility"].replace("\n", "").replace("\r", "").strip()
        item["Time"] = info["LastUpdateTime"]
        yield item  # hand the data object to the engine

        # The detail endpoint needs its own callback — parse() cannot
        # interpret that response shape.
        detail_info = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?postId='
        post_id = info["PostId"]
        yield scrapy.Request(
            url=detail_info + post_id,
            callback=self.parse_detail,
        )
        break  # debug/testing: only the first posting per page is processed
def parse(self, response):
    """Yield a TencentItem per job row; advance the offset by one page of rows."""
    rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for row in rows:
        item = TencentItem()
        item['positionName'] = row.xpath("./td[1]/a/text()").extract()[0]
        item['positionLink'] = 'http://hr.tencent.com/' + row.xpath("./td[1]/a/@href").extract()[0]
        # The category cell is sometimes empty; fall back to "".
        category_sel = row.xpath("./td[2]/text()")
        item['positionType'] = category_sel.extract()[0] if len(category_sel) else ''
        item['peopleNumber'] = row.xpath("./td[3]/text()").extract()[0]
        item['workLocation'] = row.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]
        yield item

    # Total posting count from the pager footer bounds the pagination.
    total = int(response.xpath("//span[@class='lightblue total']/text()").extract()[0])
    if self.offset < total:
        self.offset += len(rows)
        yield Request(url=self.baseUrl + str(self.offset), callback=self.parse)
def parse(self, response):
    """Scrape job rows from the listing table and queue the next page."""
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        # Job title
        item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
        # Detail link
        item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
        # The category cell may be empty; catch only the IndexError an empty
        # extract() list raises — the original bare except also hid real bugs.
        try:
            item["positionType"] = each.xpath("./td[2]/text()").extract()[0]
        except IndexError:
            item["positionType"] = '空'
        # Headcount
        item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
        # Work location
        item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
        # Publish date
        item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
        # Hand the item to the pipeline.
        yield item
    # Queue the next page while more offsets remain.
    if self.offset < 2620:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Build a TencentItem from each listing row, then request the next page."""
    for row in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
        item = TencentItem()
        item['positionname'] = row.xpath('./td[1]/a/text()').extract()[0]  # job title
        item['positionlink'] = row.xpath('./td[1]/a/@href').extract()[0]   # detail link
        # The category cell may be missing; keep the original placeholder text.
        category = row.xpath('./td[2]/text()').extract()
        item['positiontype'] = category[0] if category else '职位类别'
        item['peoplenumber'] = row.xpath('./td[3]/text()').extract()[0]    # headcount
        item['worklocatiom'] = row.xpath('./td[4]/text()').extract()[0]    # location
        item['publishtime'] = row.xpath('./td[5]/text()').extract()[0]     # publish date
        yield item
    # After the page is fully processed, request the next offset.
    if self.offset < 2140:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Turn each listing row into a TencentItem, then schedule the next page."""
    rows = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
    for row in rows:
        item = TencentItem()
        item['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]  # job title
        item['positionLink'] = row.xpath('./td[1]/a/@href').extract()[0]   # detail link
        item['positionType'] = row.xpath('./td[2]/text()').extract()[0]    # category
        item['positionNum'] = row.xpath('./td[3]/text()').extract()[0]     # headcount
        item['workLocation'] = row.xpath('./td[4]/text()').extract()[0]    # location
        item['publishTime'] = row.xpath('./td[5]/text()').extract()[0]     # publish date
        yield item
    # After finishing a page, grow the offset by 10, splice it into the URL,
    # and route the new request back to this parser.
    if self.offset < 3920:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Extract postings from the careers-site card layout and page forward."""
    for each in response.xpath('//div[@class="recruit-list"]'):
        item = TencentItem()
        # Job title
        item['position_name'] = each.xpath('./a/h4/text()').extract()[0]
        # The remaining fields are spans under the same anchor. The paths must
        # be relative ("./a/..."): the original bare "/a/..." searched from the
        # document root and matched nothing, so extract()[0] always raised.
        item['position_type'] = each.xpath('./a/p/span[3]/text()').extract()[0]
        item['location'] = each.xpath('./a/p/span[2]/text()').extract()[0]
        item['pub_time'] = each.xpath('./a/p/span[4]/text()').extract()[0]
        # Python 3 print() calls (the originals were Python 2 statements).
        print(item['position_name'])
        print(item['position_type'])
        print(item['location'])
        print(item['pub_time'])
        yield item
    if self.offset < 10:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield one TencentItem per listing row, then schedule the next page."""
    for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
        # Fresh model object per row.
        item = TencentItem()
        # Job title
        item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
        # Detail link
        item['positionLink'] = each.xpath('./td[1]/a/@href').extract()[0]
        # Category: extract_first() returns the string itself (or None when
        # the cell is empty) — the original stored the whole extract() list.
        item['positionType'] = each.xpath("./td[2]/text()").extract_first()
        # Headcount
        item['peopleNum'] = each.xpath('./td[3]/text()').extract()[0]
        # Work location
        item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]
        # Publish date
        item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        # yield bridges the spider and the item pipeline, row by row.
        yield item
    if self.offset < 2623:
        self.offset += 10
        # Re-request with the new offset; parse() handles that response too.
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Scrape the listing table into items and queue the following page."""
    rows = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for row in rows:
        item = TencentItem()
        item["sitionname"] = row.xpath("./td[1]/a/text()").extract()[0]   # job title
        item["positionlink"] = row.xpath("./td[1]/a/@href").extract()[0]  # detail link
        item["positiontype"] = row.xpath("./td[2]/text()").extract()[0]   # category
        item["perpleNum"] = row.xpath("./td[3]/text()").extract()[0]      # headcount
        item["workLocation"] = row.xpath("./td[4]/text()").extract()[0]   # location
        item["publishTime"] = row.xpath("./td[5]/text()").extract()[0]    # publish date
        # Hand the data to the pipeline.
        yield item
    # When the page is done, re-enqueue the next offset through the
    # scheduler so the downloader fetches the following page.
    if self.offset < 3320:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Extract job rows, then rewrite the page offset in the URL to paginate."""
    for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
        item = TencentItem()
        name = each.xpath('./td[1]/a/text()').extract()[0]
        detailLink = each.xpath('./td[1]/a/@href').extract()[0]
        positionInfo = each.xpath('./td[2]/text()').extract()[0]
        peopleNumber = each.xpath('./td[3]/text()').extract()[0]
        workLocation = each.xpath('./td[4]/text()').extract()[0]
        publishTime = each.xpath('./td[5]/text()').extract()[0]
        print(name, detailLink, positionInfo, peopleNumber, workLocation, publishTime)
        item['name'] = name
        item['detailLink'] = "http://hr.tencent.com/" + detailLink
        item['positionInfo'] = positionInfo
        item['peopleNumber'] = peopleNumber
        item['workLocation'] = workLocation
        item['publishTime'] = publishTime
        # Raw strings for the regexes: '\d' in a plain literal is an invalid
        # escape sequence (DeprecationWarning, later a SyntaxError).
        # NOTE(review): there is no upper bound — pagination only stops when
        # the dupefilter or the site runs out; confirm this is intended.
        curpage = re.search(r'(\d+)', response.url).group(1)
        page = int(curpage) + 10
        url = re.sub(r'\d+', str(page), response.url)
        yield scrapy.Request(url, callback=self.parse)
        yield item
def parse(self, response):
    """Scrape job rows from the listing page and queue the next offset."""
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        # Job title
        item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
        # Detail link
        item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
        # Category cell may be empty; catch only the IndexError from the empty
        # extract() list — the original bare except also swallowed real bugs.
        try:
            item["positionType"] = each.xpath("./td[2]/text()").extract()[0]
        except IndexError:
            item["positionType"] = '空'
        # Headcount
        item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
        # Work location
        item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
        # Publish date
        item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
        # Hand the data to the pipeline.
        yield item
    # Queue the next page while offsets remain.
    if self.offset < 50:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield an item per listing row; paginate until the footer's total count."""
    all_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for postion in all_list:
        # Instantiate inside the loop: the original reused one TencentItem,
        # so every yielded reference ended up holding the last row's data.
        item = TencentItem()
        item["position_name"] = postion.xpath("./td[1]/a/text()").get()
        item["position_link"] = postion.xpath("./td[1]/a/@href").get()
        item["position_type"] = postion.xpath("./td[2]/text()").get()
        item["people_num"] = postion.xpath("./td[3]/text()").get()
        item["work_address"] = postion.xpath("./td[4]/text()").get()
        item["publish_time"] = postion.xpath("./td[5]/text()").get()
        yield item

    # The pager footer carries the total number of postings.
    if self.page_nums < int(
            response.xpath('//tr[@class="f"]/td/div/span/text()').get()):
        self.page_nums += 10
        url = self.url + str(self.page_nums) + '#a'
        yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Yield detail-page requests that carry each row's item; page by offset."""
    positions = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
    for position in positions:
        item = TencentItem()
        item['name'] = position.xpath('./td[1]/a/text()').extract()[0]
        print(item['name'])
        item['link'] = position.xpath('./td[1]/a/@href').extract()[0]
        # Some category cells are empty. Store the text when present and ""
        # otherwise — the original assigned the raw SelectorList to the item
        # on empty cells, and shadowed the builtin `type` besides.
        category = position.xpath('./td[2]/text()')
        print(category)
        item['type'] = category.extract()[0] if len(category) > 0 else ''
        item['num'] = position.xpath('./td[3]/text()').extract()[0]
        item['location'] = position.xpath('./td[4]/text()').extract()[0]
        item['time'] = position.xpath('./td[5]/text()').extract()[0]
        # Follow the detail page; the partially-filled item rides in meta.
        request = scrapy.Request("https://hr.tencent.com/" + item['link'],
                                 callback=self.getInfo)
        request.meta['item'] = item
        yield request
    if self.offset < 1000:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse listing rows via regex + lxml, then page forward by offset."""
    # Capture each <tr class="even|odd">...</tr> block as raw HTML.
    pattern1 = re.compile(r'<tr class="(even|odd)">(.*?)</tr>', re.S)
    positions = re.findall(pattern1, response.text)
    for position in positions:
        # Fresh item per row — the original shared one instance across yields,
        # making every pipeline entry reference the final row.
        item = TencentItem()
        try:
            c = etree.HTML(position[1])
            item['name'] = c.xpath("//td/a/text()")[0]
            # Non-anchor cell texts line up as: type, headcount, address, date.
            item['positiontype'] = c.xpath("//td/text()")[0]
            item['num'] = c.xpath("//td/text()")[1]
            item['address'] = c.xpath("//td/text()")[2]
            item['positiontime'] = c.xpath("//td/text()")[3]
            yield item
        except Exception as e:
            # A malformed row is logged and skipped (the stray blank print and
            # the dangling ''' from the original are dropped).
            print(e)
    if self.pageNum < 2840:
        self.pageNum += 10
        yield scrapy.Request(self.url + str(self.pageNum), callback=self.parse)
def parse(self, response):
    """Yield list-page items, follow each detail page, then the next list page."""
    # Table rows holding one posting each.
    node_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    # Href of the pager's "next" button (missing or a javascript stub on the
    # last page).
    next_page = response.xpath('//a[@id="next"]/@href').extract_first()

    for node in node_list:
        item = TencentItem()
        item['position_name'] = node.xpath('./td[1]/a/text()').extract_first()
        item['position_link'] = node.xpath('./td[1]/a/@href').extract_first()
        item['position_type'] = node.xpath('./td[2]/text()').extract_first()
        item['wanted_number'] = node.xpath('./td[3]/text()').extract_first()
        item['work_location'] = node.xpath('./td[4]/text()').extract_first()
        item['publish_time'] = node.xpath('./td[5]/text()').extract_first()
        yield item
        # Visit the detail page for the remaining fields.
        yield scrapy.Request(url=self.base_url + item['position_link'],
                             callback=self.details)

    # Follow the next page only while a real link exists: concatenating None
    # raised TypeError on the final page in the original.
    if next_page and 'javascript' not in next_page:
        yield scrapy.Request(url=self.base_url + next_page, callback=self.parse)
def parse(self, response):
    """Collect every job row on the page, then request the next offset."""
    zhiwei_list = response.xpath("//tr[@class='even']|//tr[@class='odd']")
    for zhiwei in zhiwei_list:
        item = TencentItem()
        item['name'] = zhiwei.xpath("./td[1]/a/text()").extract()[0]   # job title
        item['link'] = zhiwei.xpath("./td[1]/a/@href").extract()[0]    # detail link
        # The category cell can be empty; keep the original fallback text.
        category_cells = zhiwei.xpath("./td[2]/text()")
        item['leibie'] = category_cells.extract()[0] if len(category_cells) > 0 else "暂无"
        item['num'] = zhiwei.xpath("./td[3]/text()").extract()[0]      # headcount
        item['addr'] = zhiwei.xpath("./td[4]/text()").extract()[0]     # location
        item['date'] = zhiwei.xpath("./td[5]/text()").extract()[0]     # publish date
        yield item
    # No explicit callback: Scrapy defaults back to this parse() method.
    if self.start_num < 2260:
        self.start_num += 10
        yield scrapy.Request("http://hr.tencent.com/position.php?&start=" +
                             str(self.start_num))
def parse(self, response):
    """Scrape job rows into items and queue the next listing page."""
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        # Job title
        item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
        # Detail link
        item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
        # The category cell may be empty; catch only the IndexError from the
        # empty extract() list — a bare except would also hide real bugs.
        try:
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
        except IndexError:
            item['positionType'] = '空'
        # Headcount
        item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
        # Work location
        item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
        # Publish date
        item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        yield item
    if self.offset < 180:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse one page of the job-query API; follow detail pages and the next page."""
    text = response.text
    payload = json.loads(text)
    data_list = payload['Data']['Posts']
    for data_dict in data_list:
        item = TencentItem()
        item['Position_Name'] = data_dict['RecruitPostName']
        item['Position_Location'] = data_dict['CountryName'] + data_dict['LocationName']
        item['Responsibility'] = data_dict['Responsibility']
        item['Last_Update_Time'] = data_dict['LastUpdateTime']
        item['Position_Src'] = data_dict['PostURL']
        # Request the full posting detail; the item rides along in meta.
        url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?postId=" + data_dict['PostId']
        yield scrapy.Request(url,
                             callback=self.parse_detail,
                             meta={"item": item},
                             dont_filter=False)

    # Track the page number per-response via meta rather than the original
    # module-level `global page` counter, which is unreliable once several
    # responses are in flight concurrently.
    page = response.meta.get('page', 1)
    print("已抓取完第{}页".format(page))
    page = page + 1
    if (page * 10 - 9) <= int(payload['Data']['Count']):
        next_url = "https://careers.tencent.com/tencentcareer/api/post/Query?countryId=1&parentCategoryId=40001&pageIndex={}&pageSize=10".format(
            page)
        yield scrapy.Request(next_url,
                             callback=self.parse,
                             meta={'page': page},
                             dont_filter=False)
def parse(self, response):
    """Yield EVERY listing row as an item, then rewrite the URL offset to paginate."""
    items = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for each in items:
        item = TencentItem()
        try:
            name = each.xpath('./td[1]/a/text()').extract()
            link = each.xpath('./td[1]/a/@href').extract()
            style = each.xpath('./td[2]/text()').extract()
            num = each.xpath('./td[3]/text()').extract()
            location = each.xpath('./td[4]/text()').extract()
            time = each.xpath('./td[5]/text()').extract()
            item['positionName'] = name[0].encode('utf-8')
            item['positionLink'] = link[0].encode('utf-8')
            item['positionStyle'] = style[0].encode('utf-8')
            item['positionNumber'] = num[0].encode('utf-8')
            item['positionLocation'] = location[0].encode('utf-8')
            item['positionTime'] = time[0].encode('utf-8')
        except IndexError:
            # A row missing a cell is skipped; don't swallow other errors.
            continue
        # Yield inside the loop: the original yielded once after the loop,
        # emitting only the last row (and crashed if the page had no rows).
        yield item

    # Bump the numeric offset embedded in the URL by one page (10 rows).
    # Raw strings for the regexes ('\d' is an invalid escape otherwise).
    count = re.search(r'\d+', response.url).group(0)
    page = int(count) + 10
    url = re.sub(r'\d+', str(page), response.url)
    yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Extract each posting from the response, then follow the pager's next link."""
    node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for node in node_list:
        item = TencentItem()
        # Per-posting fields.
        item['position_name'] = node.xpath("./td[1]/a/text()").extract()[0]
        item['position_link'] = node.xpath("./td[1]/a/@href").extract()[0]
        # Category may be absent; keep the original fallback text.
        category_sel = node.xpath("./td[2]/text()")
        item['position_type'] = category_sel.extract()[0] if len(category_sel) else '不定'
        item['people_num'] = node.xpath("./td[3]/text()").extract()[0]
        item['work_location'] = node.xpath("./td[4]/text()").extract()[0]
        item['publish_time'] = node.xpath("./td[5]/text()").extract()[0]
        # yield returns the item and then resumes right here.
        yield item

    # Alternative pagination (URL arithmetic) kept for reference:
    # if self.offset < 3330:
    #     self.offset += 10
    #     url = self.base_url + str(self.offset)
    #     yield scrapy.Request(url, callback=self.parse)

    # Preferred pagination: read the "next" link straight off the page and
    # follow it until the button is disabled (class 'noactive').
    if not len(response.xpath("//a[@class='noactive' and @id='next']")):
        url = response.xpath("//a[@id='next']/@href")[0].extract()
        yield scrapy.Request("http://hr.tencent.com/" + url, callback=self.parse)
def parse(self, response):
    """Harvest job rows, then rewrite the URL's offset to fetch the next page."""
    rows = response.xpath('//*[contains(@class,"odd") or contains(@class,"even")]')
    for row in rows:
        item = TencentItem()
        # NOTE(review): 'detailLink' holds the category text and
        # 'positionInfo' the URL here — field names look swapped relative to
        # the other spiders, but the keys are item fields, so they are kept.
        item['name'] = row.xpath('./td[1]/a/text()').extract_first()
        item['detailLink'] = row.xpath('./td[2]/text()').extract_first()
        item['positionInfo'] = "https://hr.tencent.com/" + row.xpath('./td[1]/a/@href').extract_first()
        item['peopleNumber'] = row.xpath('./td[3]/text()').extract_first()
        item['workLocation'] = row.xpath('./td[4]/text()').extract_first()
        item['publishDate'] = row.xpath('./td[5]/text()').extract_first()
        yield item

    # The current offset is the first number in the URL; bump it by one page.
    now_page = int(re.search(r'\d+', response.url).group())
    if now_page < 10:
        next_url = re.sub(r'\d+', str(now_page + 10), response.url)
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Read each posting row into an item, then re-request with the next offset."""
    rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for row in rows:
        item = TencentItem()
        item['positionname'] = row.xpath("./td[1]/a/text()").extract()[0]  # job title
        item['positionlink'] = row.xpath("./td[1]/a/@href").extract()[0]   # detail link
        item['positionType'] = row.xpath("./td[2]/text()").extract()[0]    # category
        item['peopleNum'] = row.xpath("./td[3]/text()").extract()[0]       # headcount
        item['workLocation'] = row.xpath("./td[4]/text()").extract()[0]    # location
        item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]     # publish date
        yield item
    # Once a page is processed, grow the offset by 10, splice it into the
    # URL, and send the new request back through this parser.
    if self.offset < 1680:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Pull every posting node off the page, then follow the "next" pager link."""
    # Nodes for the posting rows.
    node_list = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
    for node in node_list:
        item = TencentItem()
        item['name'] = node.xpath('./td[1]/a/text()').extract()[0]
        item['detail_link'] = 'https://hr.tencent.com/' + node.xpath('./td[1]/a/@href').extract()[0]
        # extract_first() gives the first match or None when the cell is empty.
        item['category'] = node.xpath('./td[2]/text()').extract_first()
        item['number'] = node.xpath('./td[3]/text()').extract()[0]
        item['address'] = node.xpath('./td[4]/text()').extract()[0]
        item['pub_date'] = node.xpath('./td[5]/text()').extract()[0]
        # Hand the item back to the engine.
        yield item

    # Build the next-page URL from the pager's "next" anchor.
    next_url = 'https://hr.tencent.com/' + response.xpath('//*[@id="next"]/@href').extract()[0]
    # On the final page the anchor's href is a javascript stub — stop there.
    if 'javascript:;' not in next_url:
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Process one listing response: yield items, then request the next page."""
    for eachJob in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
        item = TencentItem()
        # One page's worth of fields per row.
        item['jobName'] = eachJob.xpath('./td[1]/a/text()').extract()[0]
        item['jobDetail'] = eachJob.xpath('./td[1]/a/@href').extract()[0]
        item['jobType'] = eachJob.xpath('./td[2]/text()').extract_first()
        item['jobNeed'] = eachJob.xpath('./td[3]/text()').extract()[0]
        item['jobLocation'] = eachJob.xpath('./td[4]/text()').extract()[0]
        item['jobTime'] = eachJob.xpath('./td[5]/text()').extract()[0]
        # Hand the collected data to the pipeline.
        yield item

    # Queue the next page (10 postings per page). The original yielded a
    # request unconditionally, so at the cap it re-requested the SAME URL
    # and relied on the dupefilter to terminate; stop explicitly instead.
    if self.offset < 3300:
        self.offset += 10
        next_page = self.url + str(self.offset)
        # Back through the scheduler queue to the downloader.
        yield scrapy.Request(url=next_page, callback=self.parse)
    else:
        print('the work finished')
def parse(self, response):
    """Convert each listing row into a TencentItem, then queue the next page."""
    # The xpath returns a list of row selectors.
    for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['positionName'] = row.xpath("./td[1]/a/text()").extract()[0]  # job title
        item['positionLink'] = row.xpath("./td[1]/a/@href").extract()[0]   # detail link
        item['positionType'] = row.xpath("./td[2]/text()").extract()[0]    # category
        item['peopleNum'] = row.xpath("./td[3]/text()").extract()[0]       # headcount
        item['workLocation'] = row.xpath("./td[4]/text()").extract()[0]    # location
        item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]     # publish date
        yield item
    # After each page, grow the offset by 10, splice it into the URL, and
    # route the new request's response back to this same parser.
    if self.offset < 1680:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)