def parse(self, response):
    # Each <li> under the job list holds one job posting
    for each in response.xpath('//div[@class="job-list"]/ul//li'):
        item = ZhipinspiderItem()
        item['title'] = each.xpath(
            './div/div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()'
        ).extract()[0]
        item['salary'] = each.xpath(
            './div/div[@class="info-primary"]/h3/a/span/text()').extract()[0]
        item['company'] = each.xpath(
            './div/div[@class="info-company"]/div/h3/a/text()').extract()[0]
        item['url'] = each.xpath(
            './div/div[@class="info-primary"]/h3/a/@href').extract()[0]
        item['work_addr'] = each.xpath(
            './div/div[@class="info-primary"]/p/text()').extract()[0]
        # The company <p> holds several text nodes: the industry comes
        # first and the company size comes last
        item['industry'] = each.xpath(
            './div/div[@class="info-company"]/div/p/text()').extract()[0]
        item['company_size'] = each.xpath(
            './div/div[@class="info-company"]/div/p/text()').extract()[-1]
        item['recruiter'] = each.xpath(
            './div/div[@class="info-publis"]/h3/text()').extract()[0]
        yield item
    # Follow the "next page" link, if there is one
    next_pages = response.xpath(
        '//div[@class="page"]/a[@class="next"]/@href').extract()
    if next_pages:
        yield scrapy.Request("https://www.zhipin.com" + next_pages[0],
                             callback=self.parse)
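All of these parse() implementations fill a ZhipinspiderItem. The item class itself is not shown; a minimal sketch, assuming plain scrapy.Field declarations for the fields assigned in the examples (each example only uses a subset):

import scrapy

class ZhipinspiderItem(scrapy.Item):
    title = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
    url = scrapy.Field()
    work_addr = scrapy.Field()
    industry = scrapy.Field()
    company_size = scrapy.Field()
    recruiter = scrapy.Field()
    publish_date = scrapy.Field()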
Example #2
    def parse(self, response):
        # Iterate over every //div[@class="job-primary"] node on the page
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()
            # Match the /div[@class="info-primary"] node under the
            # //div[@class="job-primary"] node
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            item['title'] = info_primary.xpath(
                './div[@class="primary-wrapper"]/div[@class="primary-box"]/div[@class="job-title"]/span[@class="job-name"]/a/text()'
            ).extract_first()
            item['salary'] = info_primary.xpath(
                './div[@class="primary-wrapper"]/div[@class="primary-box"]/div[@class="job-limit clearfix"]/span[@class="red"]/text()'
            ).extract_first()
            item['work_addr'] = info_primary.xpath(
                './div[@class="primary-wrapper"]/div[@class="primary-box"]/div[@class="job-title"]/span[@class="job-area-wrapper"]/span[@class="job-area"]/text()'
            ).extract_first()
            item['url'] = info_primary.xpath(
                './div[@class="primary-wrapper"]/div[@class="primary-box"]/div[@class="job-title"]/span[@class="job-name"]/a/@href'
            ).extract_first()

            company_text = job_primary.xpath('./div[@class="info-company"]' +
                                             '/div[@class="company-text"]')
            item['company'] = company_text.xpath(
                './h3/a/text()').extract_first()
            company_info = company_text.xpath('./p/text()').extract()
            if company_info:
                item['industry'] = company_info[0]
            if len(company_info) > 2:
                item['company_size'] = company_info[2]
            info_publish = job_primary.xpath('./div[@class="info-publish"]')
            item['recruiter'] = info_publish.xpath(
                './h3/text()').extract_first()
            item['publish_date'] = info_publish.xpath(
                './p/text()').extract_first()
            yield item
Example #3
    def parse(self, response):

        # Grab every job posting node on the current page
        node_list = response.xpath('//div[@id="main"]/div/div[2]/ul/li')

        # Parse each posting into an item
        for node in node_list:
            item = ZhipinspiderItem()
            item['name'] = node.xpath('.//div[@class="job-title"]/span[1]/a/text()').extract()[0]
            item['salary'] = node.xpath('.//div[@class="job-limit clearfix"]/span/text()').extract()[0]
            item['address'] = node.xpath('.//div[@class="job-title"]/span[2]/span/text()').extract()[0]
            item['years'] = node.xpath('.//div[@class="job-limit clearfix"]/p/text()').extract()[0]
            item['education'] = node.xpath('.//div[@class="job-limit clearfix"]/p/text()').extract()[1]
            item['company_name'] = node.xpath('.//div[@class="company-text"]/h3/a/text()').extract()[0]
            item['company_type'] = node.xpath('.//div[@class="company-text"]/p/a/text()').extract()[0]
            company = node.xpath('.//div[@class="company-text"]/p/text()').extract()
            item['company_scale'] = company[0]
            item['company_finance'] = company[1]
            yield item

        self.current_page += 1

        # Crawl at most two pages
        if self.current_page > 1:
            print("Finished crawling all pages!")
        else:
            time.sleep(5)
            # Hand the request back to the Scrapy engine with yield. return
            # would end the whole method and stop the crawl, whereas yield
            # keeps this generator alive.
            yield self.next_request()
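The next_request() helper is not shown in this example. A plausible sketch, assuming current_page starts at 0 and the site paginates through a page query parameter (the URL template and city code are assumptions, not from the source):

    def next_request(self):
        # Hypothetical helper: request the following result page;
        # the query-parameter scheme is assumed, not from the source
        url = "https://www.zhipin.com/c101280100/?page={}".format(
            self.current_page + 1)
        return scrapy.Request(url, callback=self.parse)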
Example #4
def parse(self, response):
    for job_primary in response.xpath(
            r'//div[@id="main"]/div/div[2]/ul/li/div'):
        item = ZhipinspiderItem()
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        # extract_first() pulls the title text out as a string
        item['title'] = info_primary.xpath(
            './h3/a/div[@class="job-title"]/text()').extract_first()
        yield item
Example #5
def parse(self, response):
    for job_primary in response.xpath('//div[@class="job-primary"]'):
        item = ZhipinspiderItem()
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        item['title'] = info_primary.xpath('./h3/a/div[@class="job-title"]/text()').extract_first()
        item['salary'] = info_primary.xpath('./h3/a/span[@class="red"]/text()').extract_first()
        item['work_addr'] = info_primary.xpath('./p/text()').extract_first()
        item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
        yield item
Example #6
    def parse(self, response):

        # Extract the "next page" link
        new_links = response.xpath(
            '//div[@class="page"]/a[@class="next"]/@href').extract()
        if new_links:
            new_link = new_links[0]
            # Request the next page and parse it with this same method
            yield scrapy.Request("https://www.zhipin.com" + new_link,
                                 callback=self.parse)

        # Iterate over every //div[@class="job-primary"] node on the page
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            # Create a fresh item for each posting so postings don't
            # share state
            item = ZhipinspiderItem()

            # Match the /div[@class="info-primary"] node under the
            # //div[@class="job-primary"] node, i.e. the <div.../> element
            # holding the job details
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            item['title'] = info_primary.xpath(
                './h3/a/div[@class="job-title"]/text()').extract_first()
            item['salary'] = info_primary.xpath(
                './h3/a/span[@class="red"]/text()').extract_first()
            item['work_addr'] = info_primary.xpath(
                './p/text()').extract_first()
            item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
            # Match the /div[@class="company-text"] node under
            # ./div[@class="info-company"], i.e. the <div.../> element
            # holding the company details
            company_text = job_primary.xpath('./div[@class="info-company"]' +
                                             '/div[@class="company-text"]')
            item['company'] = company_text.xpath(
                './h3/a/text()').extract_first()
            company_info = company_text.xpath('./p/text()').extract()
            if company_info:
                item['industry'] = company_info[0]
            if len(company_info) > 2:
                item['company_size'] = company_info[2]
            # Match the ./div[@class="info-publis"] node, i.e. the
            # <div.../> element holding the recruiter details
            info_publis = job_primary.xpath('./div[@class="info-publis"]')
            item['recruiter'] = info_publis.xpath(
                './h3/text()').extract_first()
            item['publish_date'] = info_publis.xpath(
                './p/text()').extract_first()

            yield item
Example #7
    def parse(self, response):
        # Iterate over every //div[@class="job-primary"] node on the page
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()

            # Match the /div[@class="info-primary"] node under the
            # //div[@class="job-primary"] node, i.e. the <div.../> element
            # holding the job details
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            item['title'] = info_primary.xpath(
                './h3/a/div[@class="job-title"]/text()').extract_first()
            item['salary'] = info_primary.xpath(
                './h3/a/span[@class="red"]/text()').extract_first()
            item['work_addr'] = info_primary.xpath(
                './p/text()').extract_first()
            item['url'] = info_primary.xpath('./h3/a/@href').extract_first()

            # Match the /div[@class="company-text"] node under
            # ./div[@class="info-company"], i.e. the <div.../> element
            # holding the company details
            company_text = job_primary.xpath(
                './div[@class="info-company"]/div[@class="company-text"]')
            item['company'] = company_text.xpath(
                './h3/a/text()').extract_first()
            company_info = company_text.xpath('./p/text()').extract()
            if company_info:
                item['industry'] = company_info[0]
            if len(company_info) > 2:
                item['company_size'] = company_info[2]

            # Match the /div[@class="info-publis"] node, i.e. the
            # <div.../> element holding the recruiter details
            info_publis = job_primary.xpath('./div[@class="info-publis"]')
            item['recruiter'] = info_publis.xpath(
                './h3/text()').extract_first()
            yield item

        # Extract the "next page" link
        new_links = response.xpath(
            '//div[@class="page"]/a[@class="next"]/@href').extract()
        if new_links:
            new_link = new_links[0]
            # Request the next page and parse it with this same method
            yield scrapy.Request("https://www.zhipin.com" + new_link,
                                 callback=self.parse)
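Hard-coding the host works while the extracted links are absolute paths, but Scrapy's response.urljoin() resolves whatever the page returns against the current URL; a sketch of the same pagination step written that way:

        new_link = response.xpath(
            '//div[@class="page"]/a[@class="next"]/@href').extract_first()
        if new_link:
            # urljoin resolves relative paths against response.url
            yield scrapy.Request(response.urljoin(new_link),
                                 callback=self.parse)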
Example #8
    def parse(self, response):
        # Iterate over every //div[@class="job-primary"] node on the page.
        # Each node holds one job posting; build a ZhipinspiderItem for
        # each node and copy the fields this project cares about into it.
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()

            # Match the /div[@class="info-primary"] node under the
            # //div[@class="job-primary"] node, i.e. the <div.../> element
            # holding the job details
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            item['title'] = info_primary.xpath(
                './h3/a/div[@class="job-title"]/text()').extract_first()
            item['salary'] = info_primary.xpath(
                './h3/a/span[@class="red"]/text()').extract_first()
            item['work_addr'] = info_primary.xpath(
                './p/text()').extract_first()
            item['url'] = info_primary.xpath('./h3/a/@href').extract_first()

            # Match the /div[@class="company-text"] node under
            # ./div[@class="info-company"], i.e. the <div.../> element
            # holding the company details
            company_text = job_primary.xpath(
                './div[@class="info-company"]/div[@class="company-text"]')
            item['company'] = company_text.xpath(
                './h3/a/text()').extract_first()
            company_info = company_text.xpath('./p/text()').extract()

            if company_info:
                item['industry'] = company_info[0]
            if len(company_info) > 2:
                item['company_size'] = company_info[2]

            # Match the ./div[@class="info-publis"] node, i.e. the
            # <div.../> element holding the recruiter details
            info_publis = job_primary.xpath('./div[@class="info-publis"]')
            item['recruiter'] = info_publis.xpath(
                './h3/text()').extract_first()
            item['publish_date'] = info_publis.xpath(
                './p/text()').extract_first()

            # Hand the item back to the Scrapy engine with yield. return
            # would end the whole method and stop the loop, whereas yield
            # turns this method into a generator.
            yield item

        # Extract the "next page" link; this sits outside the loop so the
        # next-page request is issued only once per page
        new_links = response.xpath(
            '//div[@class="page"]/a[@class="next"]/@href').extract()
        if new_links:
            new_link = new_links[0]
            # Request the next page and parse it with this same method
            yield scrapy.Request("https://www.zhipin.com" + new_link,
                                 callback=self.parse)
Example #9
    def parse(self, response):
        # response is the page fetched by the Scrapy downloader; it is
        # exactly the same response object you work with in a scrapy
        # shell session

        # Each job_box <li> holds one job posting
        for job_box in response.xpath(
                '//div[@class="common-tab-box job-tab-box"]/ul/li'):
            # Create one item per posting
            item = ZhipinspiderItem()

            # The <div class="sub-li"> holds the posting's details
            job_info = job_box.xpath('./div[@class="sub-li"]')
            item['title'] = job_info.xpath('./a/p/text()').extract()[0]
            item['work_spot'] = job_info.xpath('./a/p/text()').extract()[1]
            item['experience'] = job_info.xpath('./a/p/text()').extract()[2]
            item['edu'] = job_info.xpath('./a/p/text()').extract()[3]
            item['company'] = job_info.xpath('./a/p/text()').extract()[4]

            item['salary'] = job_info.xpath('./a/p/span/text()').extract()[0]
            item['recruiter'] = job_info.xpath(
                './a/p/span/text()').extract()[1]

            yield item
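As the comment at the top of this example notes, response here is the same object you get in a scrapy shell session, so the XPaths above can be prototyped interactively before committing them to the spider (the URL below is an assumption):

$ scrapy shell "https://www.zhipin.com/c101280100/"
>>> response.xpath('//div[@class="common-tab-box job-tab-box"]/ul/li')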