def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        # items = []
        for node in node_list:
            item = TencentItem()

            positionName = node.xpath("./td[1]/a/text()").extract()
            positionLink = node.xpath("./td[1]/a/@href").extract()
            positionType = node.xpath("./td[2]/text()").extract()
            peopleNumber = node.xpath("./td[3]/text()").extract()
            workLocation = node.xpath("./td[4]/text()").extract()
            publishTime = node.xpath("./td[5]/text()").extract()

            item['positionName'] = positionName[0]
            item['positionLink'] = positionLink[0]
            if len(positionType):
                item['positionType'] = positionType[0]
            else:
                item['positionType'] = ''
            item['peopleNumber'] = peopleNumber[0]
            item['workLocation'] = workLocation[0]
            item['publishTime'] = publishTime[0]

            yield item

        # if self.offset < 3070:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        if len(response.xpath("//a[@class='noactive' and @id='next']")) == 0:
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request("https://hr.tencent.com/"+url,callback=self.parse)
Example #2
    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            # initialize the item object
            item = TencentItem()

            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            if len(each.xpath("./td[2]/text()").extract()) > 0:
                item['positionType'] = each.xpath(
                    './td[2]/text()').extract()[0]
            else:
                item['positionType'] = "None"
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        if self.offset < 2000:
            self.offset += 10

            # after each page is processed, send the request for the next page:
            # self.offset grows by 10, is appended to the url, and self.parse
            # is called back to handle the Response
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse,
                                 dont_filter=True)
Example #3
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for each in node_list:
            # initialize the item
            item = TencentItem()

            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]

            item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]

            if len(each.xpath("./td[2]/text()")) != 0:
                item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            else:
                item['positionType'] = ""

            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]

            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]

            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        # if self.offset < 3830:
        #     self.offset += 10
        #     url = self.base_url + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        if len(response.xpath("//a[@id='next' and @class='noactive']")) == 0:
            next_url = "https://hr.tencent.com/" + response.xpath("//a[@id='next']/@href").extract_first()
            yield scrapy.Request(next_url, callback=self.parse)
Example #4
 def parse(self, response):
     print(response.text)
     print('finished printing response.text')
     # for each in response.xpath("//tr[@class='event']|tr[@class='odd']"):
     print('printing the nodes the xpath matched')
     # loop over the movie entries
     for each in response.xpath(
             "//div[@class='article']//ol[@class='grid_view']/li"):
         print(each)
         # the item class is imported from the items file
         douban_item = TencentItem()
         # detailed xpaths that parse out each field
         douban_item['serial_number'] = each.xpath(
             ".//div[@class='item']//em/text()").extract_first()
         douban_item['movie_name'] = each.xpath(
             ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
         ).extract_first()
         print(douban_item)
         # yield the data to the pipeline; otherwise pipelines receive nothing
         yield douban_item
     # next-page rule: grab the next page's href
     next_link = response.xpath(
         "//span[@class='next']/link/@href").extract()
     if next_link:
         next_link = next_link[0]
         yield scrapy.Request("https://movie.douban.com/top250" + next_link,
                              callback=self.parse)
Example #5
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            item['positionName'] = node.xpath(
                "./td[1]/a/text()").extract()[0].encode("utf-8")
            item['positionLink'] = node.xpath(
                "./td[1]/a/@href").extract()[0].encode("utf-8")

            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath(
                    "./td[2]/text()").extract()[0].encode("utf-8")
            else:
                item['positionType'] = "NULL"

            item['peopleNumber'] = node.xpath(
                "./td[3]/text()").extract()[0].encode("utf-8")
            item['workLocation'] = node.xpath(
                "./td[4]/text()").extract()[0].encode("utf-8")
            item['publishTime'] = node.xpath(
                "./td[5]/text()").extract()[0].encode("utf-8")

            yield item

        if len(response.xpath("//a[@id='next' and @class='noactive']")) == 0:
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request('https://hr.tencent.com/' + url,
                                 callback=self.parse)
Example #6
0
    def parse(self, response):

        node_list = response.xpath(
            "//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()

            # extract each position's fields
            item["position_name"] = node.xpath("./td[1]/a/text()").extract()[0]
            if len(node.xpath("./td[2]/text()")):
                item["position_type"] = node.xpath(
                    "./td[2]/text()").extract()[0]
            else:
                item["position_type"] = ""
            item["people_number"] = node.xpath("./td[3]/text()").extract()[0]
            item["position_address"] = node.xpath(
                "./td[4]/text()").extract()[0]
            item["release_time"] = node.xpath("./td[5]/text()").extract()[0]

            yield item

        # Method 1
        # if self.offset < 210:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     # callback: the function that parses the response
        #     yield scrapy.Request(url, callback=self.parse)

        # Method 2: grab the next-page link and hand it to the callback
        if len(response.xpath("//a[@id='next' and @class='noactive']")) == 0:
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request("https://hr.tencent.com/" + url,
                                 callback=self.parse)
Example #7
    def parse(self, response):

        position_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')

        for position in position_list:
            item = TencentItem()
            item['positionName'] = position.xpath('td[1]/a/text()').extract_first()
            item['positionLink'] = "http://hr.tencent.com/" + position.xpath('td[1]/a/@href').extract_first()
            positionType = position.xpath('td[2]/text()').extract()
            if len(positionType):
                item['positionType'] = positionType[0]
            else:
                item['positionType'] = ""
            item['peopleNumber'] = position.xpath('td[3]/text()').extract_first()
            item['workLocation'] = position.xpath('td[4]/text()').extract_first()
            item['publishTime'] = position.xpath('td[5]/text()').extract_first()

            yield item

        self.offset += 10
        print('offset', self.offset)
        next_url = self.base_url.format(self.offset)
        print('next_url', next_url)
        if position_list.extract_first() is not None:
            yield scrapy.Request(next_url, callback=self.parse)
Example #8
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            item['positionName'] = node.xpath(
                "./td[1]/a/text()").extract()[0].encode("utf-8")
            item['positionLink'] = node.xpath(
                "./td[1]/a/@href").extract()[0].encode("utf-8")
            # the field may be empty on the page; guard to avoid an IndexError
            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath(
                    "./td[2]/text()").extract()[0].encode("utf-8")
            else:
                item['positionType'] = ""

            item['positionNum'] = node.xpath(
                "./td[3]/text()").extract()[0].encode("utf-8")
            item['positionAddress'] = node.xpath(
                "./td[4]/text()").extract()[0].encode("utf-8")
            item['positionTime'] = node.xpath(
                "./td[5]/text()").extract()[0].encode("utf-8")
            # like return, but yield hands the item to the pipeline
            yield item

        if self.temp < 3000:
            self.temp += 10
            # build the next page's url
            newurl = self.base_url + str(self.temp)
            # build and send the request: url plus callback
            yield scrapy.Request(newurl, callback=self.parse)
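
The comment above says yield hands the item to the pipeline, but no example shows a pipeline. A minimal sketch of what one might look like (the class name, filename, and JSON-lines output are assumptions; it would be enabled through ITEM_PIPELINES in settings.py):

import json

class TencentPipeline(object):
    # assumed pipeline: writes one JSON object per line to tencent.json
    def open_spider(self, spider):
        self.f = open("tencent.json", "w")

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()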
Example #9
    def parse(self, response):

        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            # extract each position's fields and encode the Unicode strings as UTF-8
            item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
            item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0].encode("utf-8")

            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath("./td[2]/text()").extract()[0].encode("utf-8")
            else:
                item['positionType'] = ""

            item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0].encode("utf-8")
            item['workLocation'] = node.xpath("./td[4]/text()").extract()[0].encode("utf-8")
            item['publishTime'] = node.xpath("./td[5]/text()").extract()[0].encode("utf-8")

            yield item

        if self.offset < 3070:
            self.offset += 10
            url = self.baseURL + str(self.offset)
            yield scrapy.Request(url, callback=self.parse)
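
Snippets like this one lean on class attributes (self.offset, self.baseURL) that the excerpts never define. A plausible class header, assuming the usual hr.tencent.com listing URL (the attribute values are guesses, not taken from the source):

import scrapy

class TencentSpider(scrapy.Spider):
    # assumed spider header for the offset-based examples
    name = "tencent"
    allowed_domains = ["hr.tencent.com"]
    baseURL = "https://hr.tencent.com/position.php?&start="
    offset = 0
    start_urls = [baseURL + str(offset)]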
Example #10
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            items = TencentItem()
            # extract the fields
            items['positionName'] = node.xpath("./td/a/text()").extract()[
                0]  # .encode('utf-8')
            items['positionLink'] = node.xpath("./td/a/@href").extract()[0]
            if node.xpath("./td[2]/text()"):
                items['positionType'] = node.xpath(
                    "./td[2]/text()").extract()[0]
            else:
                items['positionType'] = None

            items['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
            items['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
            items['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
            # print(items)
            yield items
        if self.offset < 3300 + 10:
            self.offset += 10
            urls = self.baseUrl + str(self.offset)
            request = scrapy.Request(urls, callback=self.parse)
            yield request


# //div[@class='left wcont_b box']//table[@class='tablelist']//tr/td[@class='l square']
Example #11
    def parse(self, response):
        
        node_list = response.xpath("//tr[@class='odd'] | //tr[@class='even']")

        for node in node_list:
            item = TencentItem()
            # position name
            item['positionName'] = node.xpath('./td[1]/a/text()').extract()[0]
            # position link
            item['positionLink'] = "https://hr.tencent.com/" + node.xpath('./td[1]/a/@href').extract()[0]
            # position category
            if len(node.xpath('./td[2]/text()')):
                item['positionType'] = node.xpath('./td[2]/text()').extract()[0]
            else:
                item['positionType'] = ""
            # number of hires
            item['peopleNumber'] = node.xpath('./td[3]/text()').extract()[0]
            # work location
            item['workLocation'] = node.xpath('./td[4]/text()').extract()[0]
            # publish time
            item['publishTime'] = node.xpath('./td[5]/text()').extract()[0]

            yield item
        # Method 1 builds the url by concatenation; useful when reading a JSON
        # API, where there is no page link and the url has to be constructed.
        # The catch: if the number of positions changes, the hard-coded cap
        # (3600 here) has to be updated every time.

        # if self.offset < 50:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback = self.parse)

        # Method 2 follows the next-page link, so it works no matter how many positions there are
        if len(response.xpath('//a[@class="noactive" and @id="next"]')) == 0:
            url = response.xpath('//a[@id="next"]/@href').extract()[0]
            yield scrapy.Request('https://hr.tencent.com/' + url, callback=self.parse)
Example #12
    def parse(self, response):
        # print(response.text)
        # print(response.body.decode("utf-8"))
        # print(response.headers)
        nodelist = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        # the guards below avoid: IndexError: list index out of range

        for node in nodelist:
            item = TencentItem()
            item['position_url']  = node.xpath('./td[1]/a/@href').extract()[0]
            item['position_name'] = node.xpath('./td[1]/a/text()').extract()[0]
            # position_type = node.xpath('./td[2]/text()').extract()[0]  # IndexError when the field is empty
            position_types_list = node.xpath('./td[2]/text()').extract()
            item['position_type'] = position_types_list[0] if position_types_list else None
            item['position_num'] = node.xpath('./td[3]/text()').extract()[0]
            item['position_address'] = node.xpath('./td[4]/text()').extract()[0]
            # position_time = node.xpath('./td[5]/text()').extract()[0]  # IndexError when the field is empty
            times = node.xpath('./td[5]/text()').extract()
            item['position_time'] = times[0] if times else None
            yield item

        if self.offset < 3040:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
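
The `xs[0] if xs else None` pattern above is exactly what Scrapy's own extract_first() (alias .get()) does: it returns None, or a supplied default, when nothing matches. A standalone demonstration (the HTML snippet is illustrative):

from scrapy.selector import Selector

# an empty second cell, like the position-type column sometimes is
row = Selector(text="<table><tr><td><a href='x'>Dev</a></td><td></td></tr></table>")
print(row.xpath("//tr/td[1]/a/text()").extract_first())    # 'Dev'
print(row.xpath("//tr/td[2]/text()").extract_first(""))    # '' instead of an IndexError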
Example #13
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            item['position_name'] = node.xpath(
                "./td[1]/a/text()").extract_first()
            item['position_link'] = "https://hr.tencent.com/" + node.xpath(
                "./td[1]/a/@href").extract_first()
            item['position_type'] = node.xpath(
                "./td[2]/text()").extract_first()
            item['people_number'] = node.xpath(
                "./td[3]/text()").extract_first()
            item['work_location'] = node.xpath(
                "./td[4]/text()").extract_first()
            item['publish_times'] = node.xpath(
                "./td[5]/text()").extract_first()
            yield item
        """
        # 判断当前页面是否到最后一页,如果没到最后一页,就继续发送下一页的请求
        if not response.xpath("//a[@class='noactive' and @id='next']").extract_first():
            next_link = "https://hr.tencent.com/" + response.xpath("//a[@id='next']/@href").extract_first()
            yield scrapy.Request(next_link, callback = self.parse)
        """
        """
Example #14
    def parse(self, response):

        contents = response.xpath("//div[@class ='recruit-list']/a")
        for each in contents:
            item = TencentItem()
            positionName = each.xpath("/h4/text()").extract()
            organization = each.xpath("/p/span[1]/text()").extract()
            positionLocation = each.xpath("/p/span[2]/text()").extract()
            positionType = each.xpath("/p/span[3]/text()").extract()
            releaseTime = each.xpath("/p/span[4]/text()").extract()
            positionBrief = each.xpath("/p[2]/text()").extract()

            item['positionName'] = positionName[0]
            item['organization'] = organization[0]
            item['positionLocation'] = positionLocation[0]
            item['positionType'] = positionType[0]
            item['releaseTime'] = releaseTime[0]
            item['positionBrief'] = positionBrief[0]
            yield item
        # Method 1: build the url by concatenation; used when the page has no clickable link, so the response can only be reached through a constructed url
        if self.offset < 380:
            self.offset += 1
            url = self.baseURL + str(self.offset)
            # yield the next-page request
            yield scrapy.Request(url, callback=self.parse)
Example #15
    def parse(self, response):
        # the node list containing every position row
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            # iterate over the nodes, saving the data into an item object; each item is one position
            item = TencentItem()

            item['position_name'] = node.xpath(
                "./td[1]/a/text()").extract_first()
            item['position_link'] = "http://hr.tencent.com/" + node.xpath(
                "./td[1]/a/@href").extract_first()
            item['position_type'] = node.xpath(
                "./td[2]/text()").extract_first()
            item['people_number'] = node.xpath(
                "./td[3]/text()").extract_first()
            item['work_location'] = node.xpath(
                "./td[4]/text()").extract_first()
            item['publish_times'] = node.xpath(
                "./td[5]/text()").extract_first()

            # send a request for each position's detail page, with a callback to handle the response
            # yield scrapy.Request(url=item["position_link"], callback=self.parse_position)

            # meta takes a dict that is attached to the response and passed to the
            # callback, where the data can be read back through response.meta
            yield scrapy.Request(url=item["position_link"],
                                 meta={"item": item},
                                 callback=self.parse_position)
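
The dict passed through meta comes back as response.meta in the callback. A sketch of the parse_position callback this example refers to (the detail-page xpath and the position_duty field are assumptions; the real callback isn't shown in the source):

    def parse_position(self, response):
        # recover the partially-filled item from the listing page
        item = response.meta["item"]
        # hypothetical detail field -- the actual xpath is not in the source
        item["position_duty"] = "".join(
            response.xpath("//ul[@class='squareli']/li/text()").extract())
        yield item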
Example #16
    def parse(self, response):
        # the list of extracted row tags
        node_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')

        for node in node_list:
            item = TencentItem()
            # extract each position's fields
            item['name'] = node.xpath('./td[1]/a/text()').extract_first()
            item['pos_link'] = 'https://hr.tencent.com/' + node.xpath('./td[1]/a/@href').extract_first()
            pos_Type = node.xpath('./td[2]/text()').extract_first()
            if not pos_Type:
                item['pos_Type'] = ''
            else:
                item['pos_Type'] = pos_Type
            item['pos_nums'] = node.xpath('./td[3]/text()').extract_first()
            item['pos_location'] = node.xpath('./td[4]/text()').extract_first()
            item['pos_time'] = node.xpath('./td[5]/text()').extract_first()
            yield item

        # Method 1: concatenate the url; the page exposes no link, so the url can only be built by hand
        """if self.offset < 2190:
            self.offset += 10
            url = self.base_url + str(self.offset)
            yield scrapy.Request(url, callback = self.parse)
        """
        # Method 2: pull the next-page link straight from the response
        url = response.xpath('//a[@class="noactive" and @id="next"]')
        if not len(url):
            next_url ='https://hr.tencent.com/' + response.xpath('//a[@id="next"]/@href').extract_first()
            # the follow-up request's domain was being filtered out; fixes:
            # 1) dont_filter=True ignores the allowed_domains filter
            # 2) switch allowed_domains to the matching parent domain
            yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
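
The last comment describes Scrapy's offsite filtering: requests whose host isn't covered by allowed_domains are dropped, dont_filter=True exempts a request from that check, and the other fix is to widen the whitelist. A sketch of the second fix (values are illustrative, not from the source):

    # a parent domain whitelists every *.tencent.com host,
    # so follow-up requests are no longer filtered
    allowed_domains = ["tencent.com"]   # instead of ["hr.tencent.com"]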
Example #17
    def parse(self, response):
        cot_list = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')

        for cot in cot_list:
            item = TencentItem()

            cot_name = cot.xpath('./td[1]/a/text()').extract()[0]
            cot_link = cot.xpath('./td[1]/a/@href').extract()[0]
            cot_type = cot.xpath('./td[2]/text()').get()
            cot_num = cot.xpath('./td[3]/text()').extract()[0]
            cot_add = cot.xpath('./td[4]/text()').extract()[0]
            cot_time = cot.xpath('./td[5]/text()').extract()[0]

            item["cot_name"] = cot_name
            item["cot_link"] = cot_link
            item["cot_type"] = cot_type
            item["cot_num"] = cot_num
            item["cot_add"] = cot_add
            item["cot_time"] = cot_time

            yield item

        # page = response.xpath('//div[@class="left"]/span/text()').get()
        # if self.st < int(page):
        #     self.st += 10

        if self.st < 100:
            self.st += 10

            # request url for the next page
            new_url = self.url + str(self.st)

            yield scrapy.Request(new_url, callback=self.parse)
Example #18
    def parse(self, response):
        alibaba_position = response.xpath("//div/div/table//tr[@data-type='11']")
        item = TencentItem()
        i = 1
        for each in alibaba_position:
            position = each.xpath('./th/a/text()').extract()
            # the specific job post information
            information = each.xpath('./td/text()').extract()

            # techlogy =each.xpath('./td[@data-spm-anchor-id="0.0.0.i%s.1672725fHqSMuq"]/text()'%i).extract()
            # i += 1
            techlogy = each.xpath('./td/text()').extract()
            # work city
            work_city = each.xpath('./td[@class="work-city"]/text()').extract()
            # graduation time
            # generation = each.xpath('./td[@data-spm-anchor-id="0.0.0.i%s.1672725fHqSMuq"]/text()'%i).extract()
            # i += 1
            # the position's detail text
            detail = each.xpath('./td[@class="position-detail"]/a/text()').extract()

            item['position'] = position[0]
            item['techlogy'] = techlogy[0]
            item['work_city'] = work_city[0]
            item['generation'] = techlogy[1]
            item['detail'] = detail[0]

            yield item
Example #19
    def parse(self, response):
        # note: this xpath matches only the class="even" rows; the other
        # examples pick up the class="odd" rows as well
        for each in response.xpath('//*[@class="even"]'):

            item = TencentItem()
            name = each.xpath('./td[1]/a/text()').extract()[0]
            detail_link = each.xpath('./td[1]/a/@href').extract()[0]
            job_info = each.xpath('./td[2]/text()').extract()[0]
            people_number = each.xpath('./td[3]/text()').extract()[0]
            work_city = each.xpath('./td[4]/text()').extract()[0]
            publish_date = each.xpath('./td[5]/text()').extract()[0]

            item['name'] = name
            item['detail_link'] = detail_link
            item['job_info'] = job_info
            item['people_number'] = people_number
            item['work_city'] = work_city
            item['publish_date'] = publish_date

            # hand the scraped data to the pipeline
            yield item

        # pagination: rewrite the numeric offset in the current url, once per
        # page (requires import re at module level)
        curpage = re.search(r'(\d+)', response.url).group(1)
        page = int(curpage) + 10
        url = re.sub(r'\d+', str(page), response.url)

        # enqueue the new url and call back into self.parse
        yield scrapy.Request(url, callback=self.parse)
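
The regex pagination above rewrites the numeric offset embedded in the current URL instead of tracking a counter on the spider. A standalone demonstration of the same two calls (the sample URL is illustrative):

import re

url = "https://hr.tencent.com/position.php?&start=20"
curpage = re.search(r"(\d+)", url).group(1)       # first digit run: '20'
next_url = re.sub(r"\d+", str(int(curpage) + 10), url)
print(next_url)                                   # ...position.php?&start=30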
Example #20
    def parse(self, response):
        node_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
 
        for node in node_list:
            item = TencentItem() 
            # xpath indices start at 1
            print(len(node_list))
            print("fetching the page......")
            item['positionName'] = node.xpath('./td[1]/a/text()').extract()[0].encode('utf-8')
            item['positionLink'] = node.xpath('./td[1]/a/@href').extract()[0].encode('utf-8')
            if len(node.xpath('./td[2]/text()')):
                item['positionType'] = node.xpath('./td[2]/text()').extract()[0].encode('utf-8')
            else:
                item['positionType'] = "".encode('utf-8')

            item['peopleNumber'] = node.xpath('./td[3]/text()').extract()[0].encode('utf-8')
            item['workLocation'] = node.xpath('./td[4]/text()').extract()[0].encode('utf-8')
            item['publishTime'] = node.xpath('./td[5]/text()').extract()[0].encode('utf-8')
            print(node.xpath('./td[5]/text()').extract()[0].encode('utf-8'))
            yield item

        #self.offset = 10
        '''
        if self.offset < 200:
            self.offset += 10
            url = self.baseUrl + str(self.offset)
            yield scrapy.Request(url, callback = self.parse)
        '''
        # alternative: keep extracting the next-page link
        if len(response.xpath('//a[@class="noactive" and @id="next"]')) == 0:
            url = response.xpath('//a[@id="next"]/@href').extract()[0]
            yield scrapy.Request("http://hr.tencent.com/" + url,callback = self.parse)
Example #21
    def parse(self, response):
        node_list = response.xpath(
            "//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            item['positionName'] = node.xpath("./td[1]//text()").extract()[0]
            item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0]
            try:
                item['positionType'] = node.xpath(
                    "./td[2]//text()").extract()[0]
            except IndexError:
                item['positionType'] = ''
            item['peopleNumber'] = node.xpath("./td[3]//text()").extract()[0]
            item['workLocation'] = node.xpath("./td[4]//text()").extract()[0]
            item['publicTime'] = node.xpath("./td[5]//text()").extract()[0]
            yield item

        # if self.offset < 3110:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        if not len(response.xpath("//a[@class='noactive' and @id='next']")):
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request("https://hr.tencent.com/" + url,
                                 callback=self.parse)
Example #22
 def parse(self, response):
     print("#" * 80)
     node_list = response.xpath(
         "//tr[@class='even'] | //tr[@class='odd']")
     for node in node_list:
         item = TencentItem()
         item["positionName"] = node.xpath(
             "./td[1]/a/text()").extract()[0].encode("utf-8")
         item["positionLink"] = node.xpath(
             "./td[1]/a/@href").extract()[0].encode("utf-8")
         if node.xpath("./td[2]/text()"):
             item["positionType"] = node.xpath(
                 "./td[2]/text()").extract()[0].encode("utf-8")
         else:
             item["positionType"] = ''
         item["peopleNumber"] = node.xpath(
             "./td[3]/text()").extract()[0].encode("utf-8")
         item["workLocation"] = node.xpath(
             "./td[4]/text()").extract()[0].encode("utf-8")
         item["publishTime"] = node.xpath(
             "./td[5]/text()").extract()[0].encode("utf-8")
         yield item
     # print(response.xpath("//a[@class='noactive' and @id='next']"))
     if not response.xpath("//a[@class='noactive' and @id='next']"):
         url = "https://hr.tencent.com/" + response.xpath(
             "//a[@id='next']/@href").extract()[0]
         print(url)
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              dont_filter=True)
Example #23
    def parse(self, response):
        print(response.body)
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            position_name = node.xpath("./td[1]/a/text()").extract_first()
            position_link = node.xpath("./td[1]/a/@href").extract_first()
            if len(node.xpath("./td[2]/text()")):
                position_type = node.xpath("./td[2]/text()").extract_first()
            else:
                position_type = ""
            position_number = node.xpath("./td[3]/text()").extract_first()
            work_location = node.xpath("./td[4]/text()").extract_first()
            publish_time = node.xpath("./td[5]/text()").extract_first()

            item['position_name'] = position_name
            item['position_link'] = position_link
            item['position_type'] = position_type
            item['position_number'] = position_number
            item['work_location'] = work_location
            item['publish_time'] = publish_time

            yield item

        if not len(response.xpath("//a[@class='noactive' and @id='next']")):
            next_url = response.xpath("//a[@id='next']/@href").extract_first()
            url = 'https://hr.tencent.com/' + next_url
            yield scrapy.Request(url, callback=self.parse)  # note the self. prefix on parse
Example #24
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            # each item represents one position
            item = TencentItem()
            item["position_name"] = node.xpath(".//a/text()").extract_first()
            item["position_link"] = node.xpath(".//a/@href").extract_first()
            item["position_type"] = node.xpath(
                "./td[2]/text()").extract_first()
            item["people_number"] = node.xpath(
                "./td[3]/text()").extract_first()
            item["work_location"] = node.xpath(
                "./td[4]/text()").extract_first()
            item["publish_times"] = node.xpath(
                "./td[5]/text()").extract_first()

            yield item

        # works against a predetermined page range, looping and incrementing
        # upside: simple to write; downside: it doesn't use scrapy's concurrency
        if self.offset <= 2690:
            self.offset += 10
            # callback names the callback function
            # once the request goes out, the response is parsed by the method callback points to
            yield scrapy.Request(url=self.base_url + str(self.offset),
                                 callback=self.parse)
Example #25
    def parse(self, response):

        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            # print(node.xpath("td[1]/a/text()").extract())
            item['name'] = node.xpath("./td[1]/a/text()").extract()[0].encode(
                "utf-8")

            if len(node.xpath("./td[2]/text()")) == 0:
                item['type'] = ""
            else:
                item['type'] = node.xpath(
                    "./td[2]/text()").extract()[0].encode("utf-8")

            item['number'] = node.xpath("./td[3]/text()").extract()[0].encode(
                "utf-8")
            item['position'] = node.xpath(
                "./td[4]/text()").extract()[0].encode("utf-8")
            item['time'] = node.xpath("./td[5]/text()").extract()[0].encode(
                "utf-8")

            # print(dict(item))
            yield item

        if len(response.xpath("//a[@id='next' and @class = 'noactive']")) == 0:
            # position.php? & start = 10  # a
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            print(url)
            yield scrapy.Request('http://hr.tencent.com/' + url,
                                 dont_filter=True,
                                 callback=self.parse)
Example #26
    def parse(self, response):
        # print(response.text)
        for each in response.xpath(
                "//tr[@class = 'even'] | //tr[@class = 'odd']"):
            # initialize the item object
            item = TencentItem()

            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            item['positionType'] = each.xpath("./td[2]/text()").extract_first("")  # guard: the category cell can be empty
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        # Method 1: concatenate the url
        # if self.offset < 100:
        #     self.offset += 10
        #     # re-send the request to the scheduler: enqueued, dequeued, handed to the downloader
        #     # build the new url and call back into parse to handle the response
        #     yield scrapy.Request(url, callback = self.parse)
        # yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

        # Method 2: read the next-page button
        if len(response.xpath("//a[@class='noactive' and @id='next']")) == 0:
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request('http://careers.tencent.com/' + url,
                                 callback=self.parse)
Example #27
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even']|//tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            # extract the fields and encode the strings
            item['JobTitle'] = node.xpath("./td[1]/a/text()"
                                          ).extract()[0].encode("utf-8")
            item['JobLink'] = node.xpath(
                "./td[1]/a/@href").extract()[0].encode("utf-8")
            if len(node.xpath("./td[2]/text()")):
                item['JobType'] = node.xpath(
                    "./td[2]/text()").extract()[0].encode("utf-8")
            else:
                item['JobType'] = ""
            item['Numbers'] = node.xpath("./td[3]/text()").extract()[0].encode(
                "utf-8")
            item['WorkPlace'] = node.xpath(
                "./td[4]/text()").extract()[0].encode("utf-8")
            item['ReleaseTime'] = node.xpath(
                "./td[5]/text()").extract()[0].encode('utf-8')
            # yield the item, then execution resumes here
            yield item

        # keep requesting offset-built urls, with parse as the callback, until the cap is reached
        if self.NextNumber < 2190:
            self.NextNumber += 10
            Url = self.baseurl + str(self.NextNumber)
            yield scrapy.Request(Url, callback=self.parse)
Example #28
    def parse(self, response):
        position_list = response.xpath(
            '//tr[@class="even"] | //tr[@class="odd"]')
        for position in position_list:
            item = TencentItem()
            position_name = position.xpath("./td[1]/a/text()").get()
            position_link = position.xpath("./td[1]/a/@href").get()
            position_type = position.xpath("./td[2]/text()").get()  # .get() returns None when the cell is empty
            people_num = position.xpath("./td[3]/text()").extract()[0]
            work_address = position.xpath("./td[4]/text()").get()
            publish_time = position.xpath("./td[5]/text()").get()
            item["position_name"] = position_name
            item["position_link"] = position_link
            item["position_type"] = position_type
            item["people_num"] = people_num
            item["work_address"] = work_address
            item["publish_time"] = publish_time

            yield item
        total = response.xpath('//tr[@class="f"]//span/text()').extract()[0]
        if self.offset < int(total):
            self.offset += 10
            new_url = 'https://hr.tencent.com/position.php?&start=' + str(
                self.offset)
            yield scrapy.Request(new_url, callback=self.parse)
Example #29
    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            # initialize the item object
            try:
                item = TencentItem()
                # position name
                item['positionname'] = each.xpath(
                    "./td[1]/a/text()").extract()[0]
                # detail link
                item['positionlink'] = each.xpath(
                    "./td[1]/a/@href").extract()[0]
                # position category
                item['positionType'] = each.xpath(
                    "./td[2]/text()").extract()[0]
                # number of hires
                item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
                # work location
                item['workLocation'] = each.xpath(
                    "./td[4]/text()").extract()[0]
                # publish time
                item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
            except IndexError:
                print(
                    "================================================================"
                )

            yield item

        if self.offset < 3951:
            self.offset += 10

            # after each page is processed, send the request for the next page:
            # self.offset grows by 10, is appended into the new url, and
            # self.parse is called back to handle the Response
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #30
    def parse(self, response):
        node_list = response.xpath('//tr[@class="odd"] | //tr[@class="even"]')

        for node in node_list:
            item = TencentItem()
            item['position_name'] = node.xpath(
                "./td[1]/a/text()").extract_first()
            item['position_link'] = u"https://hr.tencent.com/" + node.xpath(
                "./td[1]/a/@href").extract_first()
            item['position_type'] = node.xpath(
                "./td[2]/text()").extract_first()
            item['people_number'] = node.xpath(
                "./td[3]/text()").extract_first()
            item['work_location'] = node.xpath(
                "./td[4]/text()").extract_first()
            item['publish_time'] = node.xpath("./td[5]/text()").extract_first()

            yield item
            # send the detail-page request

            yield scrapy.Request(item['position_link'],
                                 callback=self.parse_detail)

            # if self.offset < 1390:
            #     self.offset += 10
            #     next_url = self.base_url + str(self.offset)
            #     # callback: the returned response is handed to the named callback, parse
            #     yield scrapy.Request(next_url, callback=self.parse)

        if not response.xpath("//a[@class='noactive' and @id='next']"):
            next_url = u"https://hr.tencent.com/" + response.xpath(
                ".//a[@id='next']/@href").extract_first()

            yield scrapy.Request(next_url, callback=self.parse)