def parse(self, response):
    """Parse a job-listing page: yield one TencentItem per class="even"
    row, and schedule the next results page.

    NOTE(review): only class="even" rows are matched, so alternating
    "odd" rows (scraped by sibling spiders in this file) are skipped --
    confirm this is intended.
    """
    for each in response.xpath('//*[@class="even"]'):
        item = TencentItem()
        # extract()[0] raises IndexError if a cell is empty.
        name = each.xpath('./td[1]/a/text()').extract()[0]
        detailLink = each.xpath('./td[1]/a/@href').extract()[0]
        positionInfo = each.xpath('./td[2]/text()').extract()[0]
        peopleNumber = each.xpath('./td[3]/text()').extract()[0]
        workLocation = each.xpath('./td[4]/text()').extract()[0]
        publishTime = each.xpath('./td[5]/text()').extract()[0]

        item['name'] = name.encode('utf-8')
        item['detailLink'] = detailLink.encode('utf-8')
        item['positionInfo'] = positionInfo.encode('utf-8')
        item['peopleNumber'] = peopleNumber.encode('utf-8')
        item['workLocation'] = workLocation.encode('utf-8')
        item['publishTime'] = publishTime.encode('utf-8')

        # Pagination: bump the page offset embedded in the URL.
        curpage = re.search(r'(\d+)', response.url).group(1)
        page = int(curpage) + 10
        # FIX: count=1 restricts the substitution to the first digit
        # run; the original re.sub replaced *every* digit sequence in
        # the URL (corrupting any other number in the path/query).
        url = re.sub(r'\d+', str(page), response.url, count=1)
        # Enqueue the next page, calling back into self.parse.
        # (Runs once per row; Scrapy's dupefilter collapses duplicates.)
        yield scrapy.Request(url, callback=self.parse)
        # Hand the scraped data to the pipeline.
        yield item
def parse(self, response):
    """Yield a TencentItem per even/odd table row, then follow the
    "next" pagination link until it is disabled.

    FIX: dropped the dead "items = []" accumulator (it was never
    appended to or returned) and renamed the opaque selector "abc".
    """
    node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for node in node_list:
        item = TencentItem()
        item['name'] = node.xpath('./td[1]/a/text()').extract()[0]
        item['detailLink'] = node.xpath("./td[1]/a/@href").extract()[0]
        # extract_first(default="") is exactly equivalent to the
        # original "if len(...)" guard for the sometimes-empty cell.
        item['positionInfo'] = node.xpath(
            "./td[2]/text()").extract_first(default="")
        item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
        item['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
        yield item

    # The "next" anchor carries href="javascript:;" on the last page.
    next_anchor = response.xpath("//div[@class='pagenav']/a[@id='next']")
    if next_anchor.xpath("./@href").extract()[0] != "javascript:;":
        self.offset += 10
        url = self.base_url + str(self.offset)
        yield scrapy.Request(url, callback=self.parse)
    else:
        print("The crawl is ending")
def parse(self, response):
    """Scrape every even/odd row of the listing table into a
    TencentItem, follow each position's detail page, and yield the item.
    """
    cell_paths = (
        ('position_name', './td[1]/a/text()'),
        ('position_type', './td[2]/text()'),
        ('people_number', './td[3]/text()'),
        ('work_location', './td[4]/text()'),
        ('publish_times', './td[5]/text()'),
    )
    for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        for field, path in cell_paths:
            item[field] = row.xpath(path).extract_first()
        item['position_link'] = u"https://hr.tencent.com/" + row.xpath(
            "./td[1]/a/@href").extract_first()

        # Variant 1 (same item, same file): pass the item via meta --
        # yield scrapy.Request(item['position_link'],
        #                      meta={'tencent_item': item},
        #                      callback=self.parse_position)

        # Variant 2 (separate files/items/pipelines, with a type check
        # in the pipeline): follow the detail page independently.
        yield scrapy.Request(item['position_link'],
                             callback=self.parse_position)
        yield item
def parseContent(self, response):
    """Yield one TencentItem per class="even" table row, with every
    field UTF-8 encoded.
    """
    field_xpaths = (
        ('name', './td[1]/a/text()'),
        ('detailLink', './td[1]/a/@href'),
        ('positionInfo', './td[2]/text()'),
        ('peopleNumber', './td[3]/text()'),
        ('workLocation', './td[4]/text()'),
        ('publishTime', './td[5]/text()'),
    )
    for row in response.xpath('//*[@class="even"]'):
        item = TencentItem()
        for field, path in field_xpaths:
            # extract()[0] raises IndexError on an empty cell, exactly
            # as the original per-field extraction did.
            item[field] = row.xpath(path).extract()[0].encode('utf-8')
        yield item
def parse(self, response):
    """Scrape title/position/date from the job table and follow the
    "next" link until it is disabled.
    """
    # FIX: the original XPath "//table[@class='tablelist'/tr]" is not
    # valid XPath (the /tr step sits inside the predicate) and raises a
    # ValueError at runtime; the row step belongs after the predicate.
    # Slice off the header row and the trailing pagination row.
    tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
    for tr in tr_list:
        # Scrapy recommends Item objects (mySpider.items) over dicts.
        item = TencentItem()
        # td[1] etc. are relative to the current tr node.
        item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
        item["position"] = tr.xpath("./td[2]/text()").extract_first()
        item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
        yield item

    # The "next" anchor holds "javascript:;" on the last page.
    next_url = response.xpath("//a[@id='next']/@href").extract_first()
    if next_url and next_url != 'javascript:;':
        next_url = 'http://hr.tencent.com/' + next_url
        # Hand a Request back to the engine; parse is the extraction
        # callback for the next page as well.
        yield scrapy.Request(
            next_url,
            callback=self.parse
        )
def parseContent(self, response):
    """Yield a UTF-8-encoded TencentItem for every even/odd table row.

    The category cell (td[2]) may be empty; it defaults to ''.
    """
    for each in response.xpath('//tr[@class="even"]') + response.xpath(
            '//tr[@class="odd"]'):
        item = TencentItem()
        item['name'] = each.xpath('./td[1]/a/text()').extract()[0].encode(
            'utf-8')
        item['detailLink'] = each.xpath(
            './td[1]/a/@href').extract()[0].encode('utf-8')
        # FIX: narrowed the bare "except:" to IndexError -- the only
        # expected failure here is extract() returning an empty list for
        # a missing category cell; a bare except also swallowed
        # SystemExit/KeyboardInterrupt and hid real bugs.
        try:
            item['category'] = each.xpath(
                './td[2]/text()').extract()[0].encode('utf-8')
        except IndexError:
            item['category'] = ''
        item['peopleNumber'] = each.xpath(
            './td[3]/text()').extract()[0].encode('utf-8')
        item['workLocation'] = each.xpath(
            './td[4]/text()').extract()[0].encode('utf-8')
        item['publishTime'] = each.xpath(
            './td[5]/text()').extract()[0].encode('utf-8')
        yield item
def parse(self, response):
    """Build one TencentItem per even/odd row of the listing table and
    yield it to the pipeline.
    """
    cells = (
        ('position_name', './td[1]/a/text()'),
        ('position_type', './td[2]/text()'),
        ('people_number', './td[3]/text()'),
        ('work_location', './td[4]/text()'),
        ('publish_times', './td[5]/text()'),
    )
    for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        for field, path in cells:
            # extract_first() returns the cell text (unicode) or None.
            item[field] = row.xpath(path).extract_first()
        item['position_link'] = u"https://hr.tencent.com/" + row.xpath(
            "./td[1]/a/@href").extract_first()
        yield item
def parse(self, response):
    """Parse one page of the Tencent careers JSON API.

    Yields the first post of the page as a TencentItem and schedules
    the next page by bumping the page number embedded in the URL; when
    Data.Posts is null (past the last page) the crawl stops.
    """
    # NOTE(review): time.sleep blocks Scrapy's event loop; prefer the
    # DOWNLOAD_DELAY setting. Kept here to preserve existing pacing.
    time.sleep(2)
    jsonData = json.loads(response.text)['Data']
    PostsListData = jsonData['Posts']
    item = TencentItem()
    if PostsListData is not None:
        # Next-page URL: replace the first digit run (the page index).
        curpage = re.search(r'(\d+)', response.url).group()
        page = int(curpage) + 1
        url = re.sub(r'\d+', str(page), response.url, 1)

        # NOTE(review): only the first post of each page is scraped --
        # confirm whether every entry of PostsListData should be yielded.
        PostsDictData = PostsListData[0]
        item['name'] = PostsDictData['RecruitPostName']
        item['detailLink'] = PostsDictData['PostURL']
        item['positionInfo'] = PostsDictData['Responsibility']
        # NOTE(review): headcount is hard-coded; the payload fields read
        # here do not include it -- verify against the API.
        item['peopleNumber'] = '2'
        item['workLocation'] = (PostsDictData['CountryName'] + ',' +
                                PostsDictData['LocationName'])
        item['publishTime'] = PostsDictData['LastUpdateTime']

        yield scrapy.Request(url, callback=self.parse)
        # FIX: yield the item only when a post was actually scraped; the
        # original's trailing unconditional yield also emitted one empty
        # TencentItem after the final (empty) page.
        yield item
    else:
        print(re.search(r'(\d+)', response.url).group(), response.url)
        print('所有跑完----')