예제 #1
0
    def parse(self, response):
        """Parse one job-listing page.

        For every job <li>, build an item with the job title and fire a
        manual request at the job's detail page (the item rides along in
        ``meta`` so ``parse_detail`` can finish filling it).  Afterwards,
        schedule the next listing page while ``page_num`` is at most 3.
        """
        print(response)
        for job_li in response.xpath('//*[@id="main"]/div/div[3]/ul/li'):
            item = BossproItem()
            # ./div/div[1]/div[1]/div/div[1]/span[1]/a
            job_name = job_li.xpath(
                './div/div[1]/div[1]/div//div[1]/span[1]/a/@title'
            ).extract_first()
            detail_url = 'https://www.zhipin.com' + job_li.xpath(
                './div/div[1]/div[1]/div//div[1]/span[1]/a/@href'
            ).extract_first()
            item['job_name'] = job_name
            print(job_name, detail_url)
            # Manually send a request for the detail page; meta={} hands the
            # dict to the request's callback.
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # Pagination: keep crawling listing pages up to page 3.
        if self.page_num <= 3:
            next_page_url = self.url % self.page_num
            print(next_page_url)
            self.page_num += 1
            yield scrapy.Request(url=next_page_url, callback=self.parse)
예제 #2
0
    def parse(self, response):
        """Parse one job-listing page.

        Builds an item per job <li>, requests each job's detail page with the
        item forwarded via ``meta``, then schedules the next listing page
        while ``page_num`` is at most 3.
        """
        count = 0
        li_list = response.xpath('//div[@class="job-list"]/ul/li')

        for li in li_list:
            item = BossproItem()
            # extract_first() yields the single text node as a str
            # (extract() would wrap it in a list).
            job_name = li.xpath(
                './div/div[1]/div[1]/div/div[1]/span[1]/text()').extract_first()
            item['job_name'] = job_name
            count += 1
            print(count)
            # print(job_name)
            # BUG FIX: the original used extract(), which returns a list —
            # 'str' + list raises TypeError.  extract_first() gives the href
            # string so the concatenation works.
            detail_url = 'https://www.zhipin.com' + li.xpath(
                './div/div[1]/div[1]/div/div[1]/span[1]/a/@href').extract_first()

            # Request the detail page; meta={} passes the dict to the callback.
            # BUG FIX: the meta key was 'item: ' (with colon and space), so the
            # callback's response.meta['item'] lookup would raise KeyError.
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})
        # pagination
        if self.page_num <= 3:
            new_url = format(self.url % self.page_num)
            self.page_num += 1

            yield scrapy.Request(new_url, callback=self.parse)
예제 #3
0
    def parse(self, response):
        """Collect each job's name, request its detail page, then page on.

        The partially-filled item travels to ``parse_detail`` through the
        request's ``meta`` dict; pagination stops after page 3.
        """
        for entry in response.xpath('//*[@id="main"]/div/div[3]/ul/li'):
            item = BossproItem()
            name = entry.xpath(
                './/span[@class="job-name"]/a/text()').extract_first()
            item['job_name'] = name
            print(name)
            href = entry.xpath(
                './/span[@class="job-name"]/a/@href').extract_first()
            detail_url = 'https://www.zhipin.com' + href

            # Manually send a request for the detail page's HTML; meta={}
            # forwards the item to the request's callback.
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # pagination
        if self.page_num <= 3:
            next_url = format(self.url % self.page_num)
            # print(next_url)
            self.page_num += 1

            yield scrapy.Request(next_url, callback=self.parse)
예제 #4
0
파일: boss.py 프로젝트: yangxiaoddddd/123
    def parse(self, response):
        """Yield one item per job posting, then follow the next listing page.

        Scrapes job name, salary and company from each <li>, wraps them in a
        BossproItem for the pipelines, and manually requests the next page
        while ``page`` is at most 3.
        """
        for entry in response.xpath('//div[@class="job-list"]/ul/li'):
            name = entry.xpath(
                './/div[@class="info-primary"]/h3/a/div/text()').extract_first()
            pay = entry.xpath(
                './/div[@class="info-primary"]/h3/a/span/text()').extract_first()
            firm = entry.xpath(
                './/div[@class="company-text"]/h3/a/text()').extract_first()

            # Pack all scraped fields into an item object ...
            item = BossproItem()
            item['job_name'] = name
            item['salary'] = pay
            item['company'] = firm

            # ... and hand the item to the pipelines.
            yield item

        if self.page <= 3:
            print('if 执行!!!')
            self.page += 1
            next_url = format(self.url % self.page)
            print(next_url)
            # manual request for the next listing page
            yield scrapy.Request(url=next_url, callback=self.parse)
예제 #5
0
    def parse(self, response):
        """Request every job's detail page, then schedule the next listing page.

        Each request carries its item in ``meta`` so ``parse_detail`` can
        complete it; pagination runs while ``page_num`` is at most 3.
        """
        for row in response.xpath('/html/body/div/div[3]/div/div[2]/ul/li'):
            item = BossproItem()
            item['job_name'] = row.xpath(
                './/div[@class="info-primary"]/h3/a/div[1]/text()'
            ).extract_first()
            detail_url = 'https://www.zhipin.com' + row.xpath(
                './/div[@class="info-primary"]/h3/a/@href'
            ).extract_first()
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num <= 3:
            next_url = format(self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(next_url, callback=self.parse)
예제 #6
0
 def parse(self, response):
     """Parse the listing page and request each job's detail page.

     NOTE(review): unlike the sibling spiders, this version has no
     pagination clause — only the first listing page is crawled.
     """
     lis = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
     for li in lis:
         item = BossproItem()
         job_name = li.xpath('.//div[@class="job-title"]/span/text()').get()
         # BUG FIX: '//span[@class="red"]' was an absolute XPath, so every
         # row matched the first salary on the whole page; './/' restricts
         # the search to this <li>.
         price = li.xpath('.//span[@class="red"]/text()').get()
         adders = li.xpath('.//span[@class="job-area"]/text()').get()
         # BUG FIX: @href lives on the <a> inside the span, not on the span
         # itself — the original xpath yielded None and 'str' + None raises
         # TypeError.
         detail_url = 'https://www.zhipin.com' + li.xpath(
             './/span[@class="job-name"]/a/@href').get()
         print(job_name, price, adders)
         print('123456')
         item['job_name'] = job_name
         # request the detail page, forwarding the item via meta
         yield scrapy.Request(detail_url,
                              callback=self.parse_deatil,
                              meta={'item': item})
예제 #7
0
    def parse(self, response):
        """Build an item from each job row and push it to the pipelines."""
        for row in response.xpath('//div[@class="job-list"]/ul/li'):
            job_title = row.xpath(
                './/div[@class="info-primary"]/h3[@class="name"]/a/div/text()'
            ).extract_first()
            job_salary = row.xpath(
                './/div[@class="info-primary"]/h3[@class="name"]/a/span/text()'
            ).extract_first()
            job_company = row.xpath(
                './/div[@class="company-text"]/h3/a/text()'
            ).extract_first()

            # Store the parsed values on an item object so they can be
            # handed off as a unit.
            item = BossproItem()
            item['title'] = job_title
            item['salary'] = job_salary
            item['company'] = job_company

            # Submit the item to the pipelines for persistence.
            yield item
예제 #8
0
    def parse(self, response):
        """Request every job's detail page, then follow the next listing page.

        The item is forwarded to ``parse_detail`` via ``meta``; pagination
        runs while ``page_num`` is at most 3.
        """
        # print(TextResponse.text)
        li_list = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
        print(li_list)
        for li in li_list:
            item = BossproItem()
            primary_box = li.xpath('./div/div[1]/div[1]/div')
            job_name = primary_box.xpath(
                './div[1]/span[1]/a/text()').extract_first()
            item['jobname'] = job_name
            # BUG FIX: the base URL had a trailing slash while the scraped
            # @href is root-relative (it already starts with '/'), producing
            # 'https://www.zhipin.com//...' — every other spider for this
            # site concatenates without the slash.
            detail_url = 'https://www.zhipin.com' + primary_box.xpath(
                './div[1]/span/a/@href').extract_first()
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num <= 3:
            new_url = format(self.url % self.page_num)
            self.page_num += 1

            yield scrapy.Request(new_url, callback=self.parse)
예제 #9
0
파일: boss.py 프로젝트: spx88/Spider
    def parse(self, response):
        """Yield a detail-page request per job, then schedule the next page.

        Each request carries its item in ``meta`` for ``parse_detail``.
        The page parameter advances in steps of 10 while it is <= 100.
        """
        for li in response.xpath('//*[@id="container"]/div[1]/ul/li'):
            name_parts = li.xpath('./div/h3/a/text()').extract()
            job_name = ''.join(name_parts)
            print(job_name)

            item = BossproItem()
            item['job_name'] = job_name
            detail_url = li.xpath('./div/h3/a/@href').extract_first()

            # Request the detail page's HTML; meta={} carries the item into
            # the callback.
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # pagination (offset-style: the parameter steps by 10 per request)
        if self.page_num <= 100:
            next_url = format(self.url % self.page_num)
            self.page_num += 10

            yield scrapy.Request(next_url, callback=self.parse)
예제 #10
0
    def parse(self, response):
        """Scrape name/area per job and request each detail page, then —
        after a delay and with fresh headers — follow the next listing page.

        Pagination here runs while ``page_num`` is strictly less than 2.
        """
        for li in response.xpath('//*[@id="main"]/div/div[2]/ul/li'):
            item = BossproItem()
            name = li.xpath(
                './/span[@class="job-name"]/a/text()').extract_first()
            area = li.xpath(
                './/span[@class="job-area"]/text()').extract_first()
            detail_url = 'https://www.zhipin.com' + li.xpath(
                './/span[@class="job-name"]/a/@href').extract_first()

            # print(name, area)
            item['job_name'] = name
            item['job_area'] = area

            # Manually request the detail page; meta={} forwards the item to
            # the request's callback.
            headers = self.get_header()
            yield scrapy.Request(url=detail_url,
                                 headers=headers,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # pagination
        if self.page_num < 2:
            self.page_num += 1
            next_url = format(self.url % (self.page_num, self.page_num))
            print('正在爬取:%s' % next_url)
            # Throttle before hitting the next listing page.
            time.sleep(5)

            headers = self.get_header()
            yield scrapy.Request(url=next_url,
                                 headers=headers,
                                 callback=self.parse)