Exemple #1
0
    def parse(self, response):
        """Parse one Tencent job-listing page: yield one item per table row,
        then request the next page exactly once."""
        # Each job posting is a <tr> with class 'even' or 'odd'.
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            # Position title.
            item['positionName'] = each.xpath("td[1]/a/text()").extract()[0]
            # Position detail link.
            item['positionHref'] = each.xpath("td[1]/a/@href").extract()[0]
            item['positionInfo'] = each.xpath('./td[2]/text()').extract()[0]
            # Headcount. NOTE: 'pepleNum' (sic) is the field name declared on
            # the Item class, so the key must stay as-is.
            item['pepleNum'] = each.xpath("td[3]/text()").extract()[0]
            # Work location.
            item['location'] = each.xpath("td[4]/text()").extract()[0]
            # Publish date.
            item['publicTime'] = each.xpath("td[5]/text()").extract()[0]
            yield item

        # BUG FIX: this pagination block used to live INSIDE the row loop,
        # which bumped the offset once per row (skipping pages) and yielded a
        # request even when the offset was no longer incremented. Request the
        # next page once per parsed page, and only while below the cap.
        if self.offset < 2710:
            self.offset += 10
            # Hand the next-page request back to the scheduler.
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Exemple #2
0
 def parse(self, response):
     """Extract the name, link and category of every job row on the page."""
     rows = response.xpath("//tr[@class='even']|//tr[@class='odd']")
     for row in rows:
         job = TencentspiderItem()
         job['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]
         job['positionLink'] = row.xpath('./td[1]/a/@href').extract()[0]
         job['positionType'] = row.xpath('./td[2]/text()').extract()[0]
         # Hand the populated item to the pipelines (process_item).
         yield job
Exemple #3
0
    def parsetencent(self, response):
        """Yield one TencentspiderItem per job row in the listing table."""
        rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for row in rows:
            item = TencentspiderItem()
            # Map each table cell onto its item field, left to right.
            item['position'] = row.xpath("./td[1]/a/text()").extract()[0]
            item['position_link'] = row.xpath("./td[1]/a/@href").extract()[0]
            item['position_type'] = row.xpath("./td[2]/text()").extract()[0]
            item['people_num'] = row.xpath("./td[3]/text()").extract()[0]
            item['work_location'] = row.xpath("./td[4]/text()").extract()[0]
            item['publish_time'] = row.xpath("./td[5]/text()").extract()[0]
            yield item
Exemple #4
0
 def parse(self, response):
     """Yield a fully-populated item for every job row on the page."""
     for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
         # BUG FIX: the item used to be constructed once, outside the loop,
         # so every yielded reference pointed at the same object being
         # mutated row after row; build a fresh item per row instead.
         item = TencentspiderItem()
         item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
         # Listing hrefs are relative; prefix the site root.
         item['positionLink'] = "https://hr.tencent.com/" + each.xpath(
             "./td[1]/a/@href").extract()[0]
         item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
         item['positionNum'] = each.xpath("./td[3]/text()").extract()[0]
         item['positionAddr'] = each.xpath("./td[4]/text()").extract()[0]
         item['positionTime'] = each.xpath("./td[5]/text()").extract()[0]
         yield item
 def parse(self, response):
     """Walk every even/odd row of the listing table and emit an item.

     //tr selects all tr elements; @class filters on the class attribute.
     Yielded items flow onward to the pipelines.
     """
     row_selector = "//tr[@class='even']|//tr[@class='odd']"
     for row in response.xpath(row_selector):
         item = TencentspiderItem()
         item['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]
         # Hrefs in the listing are relative; prepend the site root.
         item['positionLink'] = 'https://hr.tencent.com/' + row.xpath(
             './td[1]/a/@href').extract()[0]
         item['positionType'] = row.xpath('./td[2]/text()').extract()[0]
         item['positionNum'] = row.xpath('./td[3]/text()').extract()[0]
         item['positionSite'] = row.xpath('./td[4]/text()').extract()[0]
         item['positionTime'] = row.xpath('./td[5]/text()').extract()[0]
         yield item
    def parse_detail(self, response):
        """Scrape job duties and requirements from a posting's detail page."""
        detail = TencentspiderItem()

        # The detail page carries two <ul class="squareli"> lists:
        # duties first, requirements second.
        square_lists = response.xpath('//ul[@class="squareli"]')
        duties, requirements = square_lists[0], square_lists[1]

        # getall() collects every descendant text node into a list;
        # using string() here raises an exception instead.
        detail['jobDuties'] = duties.xpath('.//text()').getall()
        detail['jobRequirement'] = requirements.xpath('.//text()').getall()

        yield detail
Exemple #7
0
    def parse(self, response):
        """Parse a listing page, follow each job's detail link, then page on."""
        # BUG FIX: the row class is 'even' (as in every other handler in this
        # file), not 'event' -- the old selector matched only 'odd' rows.
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            item = TencentspiderItem()
            item['positionName'] = each.xpath('./td/a/text()').extract()[0]
            item['positionLink'] = self.base_url + each.xpath('./td/a/@href').extract()[0]
            item['positionType'] = each.xpath('./td[2]/text()').extract()[0]
            item['positionNum'] = each.xpath('./td[3]/text()').extract()[0]
            # Fetch the detail page, carrying the half-filled item in meta
            # for parse_desc to complete.
            yield scrapy.Request(item['positionLink'], meta={'item': item}, callback=self.parse_desc)

        # Advance to the next listing page until the offset cap is reached.
        if self.offset <= 60:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Exemple #8
0
    def parse(self, response):
        """Emit an item per job row, then schedule the next listing page."""
        for row in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
            item = TencentspiderItem()
            item['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]
            item['positionLink'] = "https://hr.tencent.com/" + row.xpath(
                './td[1]/a/@href').extract()[0]
            item['positionType'] = row.xpath('./td[2]/text()').extract()[0]
            # Handed to the pipelines' process_item.
            yield item

        # Second pagination approach -- ideally the next-page URL would be
        # read out of the response itself instead of computed here.
        if self.offset >= 540:
            return
        self.offset += 10
        nextPageUrl = self.url + str(self.offset) + "#a"
        # Request the next page with this same method as the callback.
        yield scrapy.Request(nextPageUrl, callback=self.parse)
Exemple #9
0
    def parse_item(self, response):
        """Turn every even/odd row of the listing into a populated item."""
        # Field name -> table-cell XPath, all relative to the row.
        cell_paths = [
            ('positionName', './td[1]/a/text()'),
            ('positionLink', './td[1]/a/@href'),
            ('positionType', './td[2]/text()'),
            ('positionNum', './td[3]/text()'),
            ('workLocation', './td[4]/text()'),
            ('publishTime', './td[5]/text()'),
        ]
        for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract()[0]
            yield item
Exemple #10
0
    def parse(self, response):
        """Yield an item per job row, then advance to the next page offset."""
        # BUG FIX: row class is 'even', not 'event' -- the old selector
        # silently dropped half the rows.
        for each in response.xpath(
                '//tr[@class="even"] | //tr[@class="odd"]'):
            item = TencentspiderItem()
            item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0]
            item['positionLink'] = each.xpath('./td[1]/a/@href').extract()[0]
            item['positionType'] = each.xpath('./td[2]/text()').extract()[0]
            item['positionNum'] = each.xpath('./td[3]/text()').extract()[0]
            item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]
            item['publishTime'] = each.xpath('./td[5]/text()').extract()[0]

            yield item

        # BUG FIX: the follow-up request used to sit outside this guard, so
        # once the cap was reached the spider re-requested the final offset
        # on every parse. Only page on while below the cap.
        if self.offset < 2810:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Exemple #11
0
	def parse5(self, response):
		"""Parse one JSON page of video comments and page on via the cursor."""
		meta = response.meta
		text_content = response.body.decode("utf-8")
		content_dict = json.loads(text_content)
		oriCommList = content_dict[u'data'][u'oriCommList']
		# Pagination cursor for the follow-up request.
		cursor = content_dict[u'data'][u'last']
		for one_content in oriCommList:
			# BUG FIX: previously a single item built outside the loop was
			# mutated and re-yielded, so buffered pipelines would see only
			# the last comment; construct a fresh item per comment.
			item = TencentspiderItem()
			item['content'] = one_content[u'content']
			item['playCount'] = meta['playCount']
			item['name'] = meta['name']
			self.total_comment_number += 1
			yield item

		# The API signals the end of paging with False (string or bool);
		# fall back to a hard-coded cursor in that case -- presumably to
		# restart the walk (TODO confirm against the API).
		if cursor == "False" or cursor == False:
			cursor = "6270478955235058308"
		if self.total_comment_number <= max_comment_number:
			complete_url = "https://video.coral.qq.com/varticle/" + str(meta['id']) + "/comment/v2?orinum=30&oriorder=o&pageflag=1&cursor=" + str(cursor)
			yield scrapy.Request(url=complete_url, callback=self.parse5, meta=meta)
    def parse_item(self, response):
        """Extract every job row from the listing table and yield items.

        Uses .get(), which returns the first match or None instead of
        raising on an empty selection.
        """
        table = response.xpath('//table[@class="tablelist"]')

        if not table:
            # BUG FIX: corrected "warps" -> "wraps" in the log message, to
            # match the identical check elsewhere in this file.
            self.logger.error(
                "###### The table which wraps the jobs' list is not found. ######"
            )

        for tr in table.xpath('.//tr[@class="even"] | .//tr[@class="odd"]'):
            item = TencentspiderItem()
            item['positionName'] = tr.xpath('./td[1]/a/text()').get()
            # Listing hrefs are relative to the site root.
            item['positionLink'] = 'https://hr.tencent.com/' + tr.xpath(
                './td[1]/a/@href').get()
            item['positionType'] = tr.xpath('./td[2]/text()').get()
            item['peopleNumber'] = tr.xpath('./td[3]/text()').get()
            item['workLocation'] = tr.xpath('./td[4]/text()').get()
            item['publishTime'] = tr.xpath('./td[5]/text()').get()

            yield item
Exemple #13
0
    def parse(self, response):
        """Yield all job items on this page, then follow the 'next' link."""
        for row in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
            item = TencentspiderItem()
            item['positionName'] = row.xpath('./td[1]/a/text()').extract()[0]
            item['positionLink'] = 'https://hr.tencent.com/' + row.xpath(
                './td[1]/a/@href').extract()[0]
            item['positionType'] = row.xpath('./td[2]/text()').extract()[0]
            item['positionCount'] = row.xpath('./td[3]/text()').extract()[0]
            item['positionLocation'] = row.xpath(
                './td[4]/text()').extract()[0]
            item['positionPublishTime'] = row.xpath(
                './td[5]/text()').extract()[0]
            # Handed to the pipelines' process_item.
            yield item

        # Stop paging once the 'next' anchor carries a class attribute
        # (the code treats its presence as the end-of-listing marker).
        next_anchor = response.xpath('//*[@id="next"]')
        if next_anchor.xpath('./@class'):
            return None
        next_url = 'https://hr.tencent.com/' + next_anchor.xpath(
            './@href').extract()[0]
        yield scrapy.Request(next_url, callback=self.parse)
Exemple #14
0
	def parse3(self, response):
		"""Parse one JSONP page of comments (hot, regular, and parent replies)
		and schedule the next page via the returned cursor.

		NOTE(review): this is Python 2 code (print statements below).
		"""
		# NOTE(review): one item instance is reused for every yield below;
		# fine if pipelines consume immediately, risky if items are buffered.
		item = TencentspiderItem()
		meta = response.meta
		# Strip the JSONP wrapper "jsonp3(...)" down to raw JSON text.
		text_content = response.body.decode("utf-8").strip("\n").strip("jsonp3(").strip(")")
		content_dict = json.loads(text_content)
		hot_comment = content_dict[u'data'][u'hotcommentid']
		comments = content_dict[u'data'][u'commentid']
		parentinfo = content_dict[u'data'][u'parentinfo']
		# Pagination cursor for the follow-up request.
		cursor = content_dict[u'data'][u'last']
		
		# Hot (highlighted) comments.
		for comment in hot_comment:
			item['content'] = comment[u'content']
			item['playCount'] = meta['playCount']
			item['name'] = meta['name']
			self.total_comment_number += 1
			yield item
		# Regular comments.
		for comment in comments:
			item['content'] = comment[u'content']
			item['playCount'] = meta['playCount']
			item['name'] = meta['name']
			self.total_comment_number += 1
			yield item
		# Parent (replied-to) comments; parentinfo is iterated as a dict and
		# any failure is swallowed.
		try:
			for key, comment in parentinfo.items():
				item['content'] = comment[u'content']
				item['playCount'] = meta['playCount']
				item['name'] = meta['name']
				self.total_comment_number += 1
				yield item
		except:
			# NOTE(review): bare except hides real bugs -- narrow it.
			pass
		
		# The API reports the end of paging as False (string or bool);
		# fall back to a hard-coded cursor -- presumably a restart point,
		# TODO confirm against the API.
		if cursor == "False" or cursor == False:
			cursor = "6266224144146843373"
			print "======================"
		print cursor
		if self.total_comment_number <= max_comment_number:
			complete_url = "https://video.coral.qq.com/varticle/" + str(meta['id']) + "/comment/v2?orinum=30&oriorder=o&pageflag=1&cursor=" + str(cursor)
			# NOTE(review): callback is parse5, not parse3 -- confirm this
			# hand-off is intentional and not a copy-paste slip.
			yield scrapy.Request(url=complete_url, callback=self.parse5, meta=meta)
Exemple #15
0
    def parse(self, response):
        """Extract job rows from the current page, then request the next
        offset page via this same method."""
        # Job rows are the table rows with class 'even' or 'odd'.
        dataList = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for each in dataList:
            item = TencentspiderItem()
            item['positionTitle'] = each.xpath('./td[1]/a/text()').extract()[0]
            item['positionNum'] = each.xpath('./td[3]/text()').extract()[0]
            item['positionAddr'] = each.xpath('./td[4]/text()').extract()[0]
            item['positionTime'] = each.xpath('./td[5]/text()').extract()[0]
            # The type cell can be empty: evaluate the XPath once (it was
            # evaluated twice before) and use list truthiness instead of
            # len(...) > 0, falling back to a placeholder.
            type_texts = each.xpath('./td[2]/text()').extract()
            item['positionType'] = type_texts[0] if type_texts else 'Nothing'

            yield item

        # Page on by bumping the offset until the cap is reached.
        if self.offset < 2170:
            yield scrapy.Request(self.url + str(self.offset + 10),
                                 callback=self.parse) if False else None
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
    def parse_item(self, response):
        """Parse the job table and chase each row's detail page.

        Each partially-filled item rides along in request meta and is
        completed by parse_detail.
        """
        table = response.xpath('//table[@class="tablelist"]')

        if not table:
            self.logger.error(
                "###### The table which wraps the jobs' list is not found. ######"
            )

        # Item field -> cell XPath, relative to the row selector.
        field_to_xpath = {
            'positionName': './td[1]/a/text()',
            'positionType': './td[2]/text()',
            'peopleNumber': './td[3]/text()',
            'workLocation': './td[4]/text()',
            'publishTime': './td[5]/text()',
        }
        for row in table.xpath('.//tr[@class="even"] | .//tr[@class="odd"]'):
            item = TencentspiderItem()
            for field, xp in field_to_xpath.items():
                # .get() yields the first match or None, never raising.
                item[field] = row.xpath(xp).get()
            item['positionLink'] = 'https://hr.tencent.com/' + row.xpath(
                './td[1]/a/@href').get()

            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield scrapy.Request(url=item['positionLink'],
                                 meta={'item': item},
                                 callback=self.parse_detail,
                                 dont_filter=True)