def parse(self, response): for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"): item = TencentspiderItem() # 职位名称 positionName = each.xpath("td[1]/a/text()").extract()[0] # 职位链接 positionHref = each.xpath("td[1]/a/@href").extract()[0] positionInfo = each.xpath('./td[2]/text()').extract()[0] # 招聘人数 pepleNum = each.xpath("td[3]/text()").extract()[0] # 工作地点 location = each.xpath("td[4]/text()").extract()[0] # 发布日期 publicTime = each.xpath("td[5]/text()").extract()[0] # 存储数据 item['positionName'] = positionName item['positionHref'] = positionHref item['positionInfo'] = positionInfo item['pepleNum'] = pepleNum item['location'] = location item['publicTime'] = publicTime # 每页去玩取下一页 if self.offset < 2710: self.offset += 10 # 发给调度器 yield scrapy.Request(self.url + str(self.offset), callback=self.parse) yield item
def parse(self, response): for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"): item = TencentspiderItem() item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0] item['positionLink'] = each.xpath('./td[1]/a/@href').extract()[0] item['positionType'] = each.xpath('./td[2]/text()').extract()[0] yield item #返回给pipelines process_item
def parsetencent(self, response):
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentspiderItem()
        item['position'] = each.xpath("./td[1]/a/text()").extract()[0]
        item['position_link'] = each.xpath("./td[1]/a/@href").extract()[0]
        item['position_type'] = each.xpath("./td[2]/text()").extract()[0]
        item['people_num'] = each.xpath("./td[3]/text()").extract()[0]
        item['work_location'] = each.xpath("./td[4]/text()").extract()[0]
        item['publish_time'] = each.xpath("./td[5]/text()").extract()[0]
        yield item
def parse(self, response):
    for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
        # create a fresh item per row; reusing one instance across
        # iterations would hand the same mutated object to the pipeline
        item = TencentspiderItem()
        item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
        item['positionLink'] = "https://hr.tencent.com/" + each.xpath(
            "./td[1]/a/@href").extract()[0]
        item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
        item['positionNum'] = each.xpath("./td[3]/text()").extract()[0]
        item['positionAddr'] = each.xpath("./td[4]/text()").extract()[0]
        item['positionTime'] = each.xpath("./td[5]/text()").extract()[0]
        yield item
def parse(self, response): for each in response.xpath("//tr[@class='even']|//tr[@class='odd']" ): #这里的//tr表示选取所有的tr元素@class表示选取所有的class的属性 item = TencentspiderItem() item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0] item['positionLink'] = 'https://hr.tencent.com/' + each.xpath( './td[1]/a/@href').extract()[0] item['positionType'] = each.xpath('./td[2]/text()').extract()[0] item['positionNum'] = each.xpath('./td[3]/text()').extract()[0] item['positionSite'] = each.xpath('./td[4]/text()').extract()[0] item['positionTime'] = each.xpath('./td[5]/text()').extract()[0] yield item #返回数据给上层,并最终传递给piplines
def parse_detail(self, response):
    item = TencentspiderItem()
    # There are two uls whose class equals "squareli" in the job's detail page.
    uls = response.xpath('//ul[@class="squareli"]')
    # Extract the text of every li in each ul; getall() returns a list.
    # Note: using string() to grab all the text would raise an exception here.
    item['jobDuties'] = uls[0].xpath('.//text()').getall()
    item['jobRequirement'] = uls[1].xpath('.//text()').getall()
    yield item
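parse_detail above is only ever reached as a callback, and this variant does not show who schedules it. A minimal sketch of a listing parse that could feed it, borrowing the even/odd row selector the other variants use (an assumption for this particular spider):

def parse(self, response):
    # follow each job row's detail link into parse_detail;
    # the row/link selectors are assumed from the other variants
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        href = each.xpath('./td[1]/a/@href').get()
        if href:
            yield response.follow(href, callback=self.parse_detail)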
def parse(self, response): for each in response.xpath('//tr[@class="event"] | //tr[@class="odd"]'): item = TencentspiderItem() item['positionName'] = each.xpath('./td/a/text()').extract()[0] item['positionLink'] = self.base_url + each.xpath('./td/a/@href').extract()[0] item['positionType'] = each.xpath('./td[2]/text()').extract()[0] item['positionNum'] = each.xpath('./td[3]/text()').extract()[0] yield scrapy.Request(item['positionLink'], meta = {'item': item}, callback=self.parse_desc) if self.offset <= 60: self.offset += 10 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"): item = TencentspiderItem() item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0] item['positionLink'] = "https://hr.tencent.com/" + each.xpath( './td[1]/a/@href').extract()[0] item['positionType'] = each.xpath('./td[2]/text()').extract()[0] yield item #返回给pipelines process_item # 翻页的第二种方法 #这里最好的做法还是从response中把下一页的url取出来 if self.offset < 540: self.offset += 10 nextPageUrl = self.url + str(self.offset) + "#a" else: return # 对下一页发起request请求,指定一个回调方法 yield scrapy.Request(nextPageUrl, callback=self.parse)
def parse_item(self, response):
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentspiderItem()
        item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
        item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
        item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
        item['positionNum'] = each.xpath("./td[3]/text()").extract()[0]
        item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        yield item
def parse(self, response):
    for each in response.xpath(
            '//tr[@class="even"] | //tr[@class="odd"]'):
        item = TencentspiderItem()
        item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0]
        item['positionLink'] = each.xpath('./td[1]/a/@href').extract()[0]
        item['positionType'] = each.xpath('./td[2]/text()').extract()[0]
        item['positionNum'] = each.xpath('./td[3]/text()').extract()[0]
        item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]
        item['publishTime'] = each.xpath('./td[5]/text()').extract()[0]
        yield item
    if self.offset < 2810:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse5(self, response):
    # needs `import json` at module level
    meta = response.meta
    text_content = response.body.decode("utf-8")
    content_dict = json.loads(text_content)
    oriCommList = content_dict['data']['oriCommList']
    cursor = content_dict['data']['last']
    for one_content in oriCommList:
        item = TencentspiderItem()  # fresh item per comment
        item['content'] = one_content['content']
        item['playCount'] = meta['playCount']
        item['name'] = meta['name']
        self.total_comment_number += 1
        yield item
    if cursor in ("False", False):
        cursor = "6270478955235058308"
    if self.total_comment_number <= max_comment_number:
        complete_url = ("https://video.coral.qq.com/varticle/" + str(meta['id'])
                        + "/comment/v2?orinum=30&oriorder=o&pageflag=1&cursor="
                        + str(cursor))
        yield scrapy.Request(url=complete_url, callback=self.parse5, meta=meta)
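parse5 expects 'id', 'playCount' and 'name' to already be present in response.meta, so some earlier request must seed them. A minimal sketch of such a seed; the helper name and all concrete values are placeholders, and cursor=0 is only a guess at a valid first-page cursor:

def start_comment_requests(self):
    # hypothetical values; a real spider would carry these over
    # from the video-listing parse
    meta = {'id': 0, 'playCount': 0, 'name': 'placeholder'}
    first_url = ("https://video.coral.qq.com/varticle/" + str(meta['id'])
                 + "/comment/v2?orinum=30&oriorder=o&pageflag=1&cursor=0")
    yield scrapy.Request(url=first_url, callback=self.parse5, meta=meta)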
def parse_item(self, response):
    table = response.xpath('//table[@class="tablelist"]')
    if not table:
        self.logger.error(
            "###### The table which wraps the jobs' list is not found. ######"
        )
        return
    for tr in table.xpath('.//tr[@class="even"] | .//tr[@class="odd"]'):
        item = TencentspiderItem()
        item['positionName'] = tr.xpath('./td[1]/a/text()').get()
        item['positionLink'] = 'https://hr.tencent.com/' + tr.xpath(
            './td[1]/a/@href').get()
        item['positionType'] = tr.xpath('./td[2]/text()').get()
        item['peopleNumber'] = tr.xpath('./td[3]/text()').get()
        item['workLocation'] = tr.xpath('./td[4]/text()').get()
        item['publishTime'] = tr.xpath('./td[5]/text()').get()
        yield item
def parse(self, response):
    for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
        item = TencentspiderItem()
        item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0]
        item['positionLink'] = 'https://hr.tencent.com/' + each.xpath(
            './td[1]/a/@href').extract()[0]
        item['positionType'] = each.xpath('./td[2]/text()').extract()[0]
        item['positionCount'] = each.xpath('./td[3]/text()').extract()[0]
        item['positionLocation'] = each.xpath(
            './td[4]/text()').extract()[0]
        item['positionPublishTime'] = each.xpath(
            './td[5]/text()').extract()[0]
        yield item  # handed to the pipeline's process_item
    # follow the pager's "next" link only while it has no class attribute
    nextpagelink = response.xpath('//*[@id="next"]')
    if not nextpagelink.xpath('./@class'):
        nextpageurl = 'https://hr.tencent.com/' + nextpagelink.xpath(
            './@href').extract()[0]
    else:
        return
    yield scrapy.Request(nextpageurl, callback=self.parse)
def parse3(self, response):
    # needs `import json` at module level
    meta = response.meta
    # peel off the JSONP wrapper "jsonp3( ... )"; chained strip() calls
    # remove character sets, not prefixes, so they are unsafe here
    text_content = response.body.decode("utf-8").strip()
    if text_content.startswith("jsonp3(") and text_content.endswith(")"):
        text_content = text_content[len("jsonp3("):-1]
    content_dict = json.loads(text_content)
    hot_comment = content_dict['data']['hotcommentid']
    comments = content_dict['data']['commentid']
    parentinfo = content_dict['data']['parentinfo']
    cursor = content_dict['data']['last']
    for comment in hot_comment:
        item = TencentspiderItem()  # fresh item per comment
        item['content'] = comment['content']
        item['playCount'] = meta['playCount']
        item['name'] = meta['name']
        self.total_comment_number += 1
        yield item
    for comment in comments:
        item = TencentspiderItem()
        item['content'] = comment['content']
        item['playCount'] = meta['playCount']
        item['name'] = meta['name']
        self.total_comment_number += 1
        yield item
    try:
        for key, comment in parentinfo.items():
            item = TencentspiderItem()
            item['content'] = comment['content']
            item['playCount'] = meta['playCount']
            item['name'] = meta['name']
            self.total_comment_number += 1
            yield item
    except (AttributeError, KeyError):  # parentinfo's shape is not consistent
        pass
    if cursor in ("False", False):
        cursor = "6266224144146843373"
    print("======================")
    print(cursor)
    if self.total_comment_number <= max_comment_number:
        complete_url = ("https://video.coral.qq.com/varticle/" + str(meta['id'])
                        + "/comment/v2?orinum=30&oriorder=o&pageflag=1&cursor="
                        + str(cursor))
        yield scrapy.Request(url=complete_url, callback=self.parse5, meta=meta)
def parse(self, response):
    # process the listing page
    dataList = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for each in dataList:
        item = TencentspiderItem()
        item['positionTitle'] = each.xpath('./td[1]/a/text()').extract()[0]
        item['positionNum'] = each.xpath('./td[3]/text()').extract()[0]
        item['positionAddr'] = each.xpath('./td[4]/text()').extract()[0]
        item['positionTime'] = each.xpath('./td[5]/text()').extract()[0]
        if len(each.xpath('./td[2]/text()').extract()) > 0:
            item['positionType'] = each.xpath(
                './td[2]/text()').extract()[0]
        else:
            item['positionType'] = 'Nothing'
        yield item
    # walk the remaining pages through the same callback
    if self.offset < 2170:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse_item(self, response):
    table = response.xpath('//table[@class="tablelist"]')
    if not table:
        self.logger.error(
            "###### The table which wraps the jobs' list is not found. ######"
        )
        return
    for tr in table.xpath('.//tr[@class="even"] | .//tr[@class="odd"]'):
        item = TencentspiderItem()
        item['positionName'] = tr.xpath('./td[1]/a/text()').get()
        item['positionLink'] = 'https://hr.tencent.com/' + tr.xpath(
            './td[1]/a/@href').get()
        item['positionType'] = tr.xpath('./td[2]/text()').get()
        item['peopleNumber'] = tr.xpath('./td[3]/text()').get()
        item['workLocation'] = tr.xpath('./td[4]/text()').get()
        item['publishTime'] = tr.xpath('./td[5]/text()').get()
        yield scrapy.Request(url=item['positionLink'],
                             meta={'item': item},
                             callback=self.parse_detail,
                             dont_filter=True)
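This variant defers to a parse_detail callback that it does not show. A minimal sketch of one, recovering the half-filled item from meta and completing it with the two ul.squareli blocks that the earlier detail-page variant extracts (assuming the item also declares jobDuties and jobRequirement fields):

def parse_detail(self, response):
    # recover the partially filled item passed through meta
    item = response.meta['item']
    uls = response.xpath('//ul[@class="squareli"]')
    # the two squareli lists hold duties and requirements, as in the
    # earlier detail-page variant
    item['jobDuties'] = uls[0].xpath('.//text()').getall()
    item['jobRequirement'] = uls[1].xpath('.//text()').getall()
    yield item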