Example 1
    def parse(self, response):
        quotes = response.css('.quote')  # class selector
        for quote in quotes:
            item = QuoteItem()
            # each quote here is an individual div block
            # class selector; grab the text inside the first matching tag
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            # descendant selector under the class selector; grab the text of every matching tag (returns a list)
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item
        # select the child tag under the class selector and grab its attribute
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        # next_page is a relative URL; urljoin() builds the full URL
        url = response.urljoin(next_page)
        # issue a new request recursively, with parse() as the callback, to loop through the pages
        yield scrapy.Request(url=url, callback=self.parse)
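Every example on this page assumes a `QuoteItem` with `text`, `author`, and `tags` fields. A minimal sketch of that item class; the field names come straight from the usage above, while the `items.py` placement is an assumption based on the usual Scrapy project layout:

    # items.py -- hypothetical module; field names are taken from the examples above
    import scrapy

    class QuoteItem(scrapy.Item):
        text = scrapy.Field()    # the quote body
        author = scrapy.Field()  # the author's display name
        tags = scrapy.Field()    # list of tag strings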
Example 2
    def parse(self, response):
        """
        Spider的一个方法。默认调用start_urls的链接构成的请求完成下载,返回的响应会作为唯一的参数(response)传递给
        这个函数。该方法负责解析返回的响应、提取数据或者进一步生成要处理的请求。
        :param response:
        :return:
        """
        quotes = response.css(".quote")
        for quote in quotes:
            item = QuoteItem()
            item['text'] = quote.css(".text::text").extract_first()     # first element of the result list
            item['author'] = quote.css(".author::text").extract_first() # first element of the result list
            item['tags'] = quote.css(".tags .tag::text").extract()      # all matching tags, as a list
            yield item  # without this, the items built above would never reach the pipeline

        # get the URL of the next page
        next_url = response.css('.pager .next a::attr(href)').extract_first()
        # use urljoin() to turn the relative URL into an absolute one
        url = response.urljoin(next_url)
        # build a new request from the url and a callback; the callback is parse() itself
        yield scrapy.Request(url=url, callback=self.parse)
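For context, each `parse` method on this page lives inside a spider class along the lines of the following sketch; the class name, spider name, and start URL are assumptions inferred from the quotes.toscrape.com selectors used throughout:

    import scrapy
    from myproject.items import QuoteItem  # hypothetical project module

    class QuotesSpider(scrapy.Spider):
        name = 'quotes'  # assumed spider name
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        # any of the parse() methods shown on this page goes here; Scrapy
        # invokes it with the response of each start_urls request by default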
Example 3
    def parse(self, response):
        self.logger.info('Hello spider')
        quotes = response.css('div.quote')

        for quote in quotes:
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            loader.add_css(field_name='quote_content', css='.text::text')
            loader.add_css(field_name='tags', css='.tag::text')
            quote_item = loader.load_item()

            author_url = quote.css('.author + a::attr(href)').get()
            self.logger.info('Get author page url')
            # Go to author page
            yield response.follow(author_url,
                                  callback=self.parse_author,
                                  meta={'quote_item': quote_item})

        # Go to next page
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse)
Example 4
    def parse(self, response):
        self.logger.info('Parse function called on {}'.format(response.url))
        # quotes = response.xpath("//div[@class='quote']")
        quotes = response.css('div.quote')

        for quote in quotes:
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            # note the leading dot in .// -- it makes the XPath relative to the current quote
            # loader.add_xpath('quote_content', ".//span[@class='text']/text()")
            loader.add_css('quote_content', '.text::text')
            # loader.add_xpath('author', './/small//text()')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()
            author_url = quote.css('.author + a::attr(href)').get()
            # go to the author page and pass the current collected quote info
            yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item})

        # go to Next page
        for a in response.css('li.next a'):
            yield response.follow(a, self.parse)
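Examples 3 and 4 hand the partially filled item to a `parse_author` callback through `meta`, but that callback is not shown. A plausible sketch, assuming the author-page markup of quotes.toscrape.com and that `QuoteItem` also declares the two author fields used below (both are assumptions):

    # assumes: from scrapy.loader import ItemLoader
    def parse_author(self, response):
        quote_item = response.meta['quote_item']
        # continue filling the same item with data from the author page
        loader = ItemLoader(item=quote_item, response=response)
        loader.add_css('author_name', '.author-title::text')          # hypothetical field
        loader.add_css('author_birthday', '.author-born-date::text')  # hypothetical field
        yield loader.load_item()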
Example 5
    def parse(self, response):
        '''
        Called by default once the requests built from the start_urls links finish downloading;
        the resulting response is passed to this function as its only argument.
        The method is responsible for parsing the response, extracting data, and/or generating
        further requests to process.
        :param response:
        :return:
        '''
        quotes = response.css('.quote') # select all quote blocks
        for quote in quotes:
            item = QuoteItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()

            yield item

        next_page = response.css('.pager .next a::attr(href)').extract_first() # extract the next-page link from the response
        url = response.urljoin(next_page) # urljoin() turns a relative URL into an absolute one
        yield scrapy.Request(url=url, callback=self.parse) # build a new request; its response goes through parse() again, generating the next page, looping until the last page
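A side note: `extract_first()` and `extract()` still work, but recent Scrapy/parsel releases document `get()` and `getall()` as the preferred spellings. The loop above could equivalently be written as:

    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').get()
        item['author'] = quote.css('.author::text').get()
        item['tags'] = quote.css('.tags .tag::text').getall()
        yield item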
Example 6
    def parse(self, response):
        # print(response.text)
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            # tags has multiple values, so use extract() to take all of them
            tags = quote.css('.tags .tag::text').extract()

            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        # get the next page's URL
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        # turn the relative URL into an absolute one: if next_page is /page/2,
        # urljoin() produces http://quotes.toscrape.com/page/2
        # (on the last page extract_first() returns None; urljoin(None) falls back to the
        # current page URL, and Scrapy's duplicate filter drops the repeated request)
        url = response.urljoin(next_page)
        # rebuild the request with scrapy.Request(), which takes two arguments here: a url and a callback
        yield scrapy.Request(url=url, callback=self.parse)
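The `urljoin()` behavior the comment describes can be checked outside Scrapy, since `Response.urljoin()` essentially delegates to `urllib.parse.urljoin()` with the response's base URL:

    from urllib.parse import urljoin

    base = 'http://quotes.toscrape.com/page/1/'
    print(urljoin(base, '/page/2/'))  # http://quotes.toscrape.com/page/2/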
Example 7
    def quote_parse(self, response):
        div_list = response.xpath('//div[@class="quote"]')

        # extract the fields of each quote
        for div in div_list:
            quote_item = QuoteItem()
            quote_item['content'] = div.xpath(
                'span[@class="text"]/text()').get()
            # use .// so the XPath stays relative to this div; a bare // would
            # always match the first author/meta element on the whole page
            quote_item['author'] = div.xpath(
                './/small[@class="author"]/text()').get()
            quote_item['tags'] = div.xpath('.//meta/@content').get()
            print(quote_item)
            yield quote_item

            # extract each author's detail-page link and issue a request for it
            author_url = self.base_url + div.xpath('span//a/@href').get()
            yield scrapy.Request(url=author_url, callback=self.author_parse)

        next_page = response.css('.next a')
        if next_page:
            url = self.base_url + next_page.attrib['href']
            yield scrapy.Request(url=url, callback=self.quote_parse)
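This example references a `self.base_url` attribute and an `author_parse` callback that are not shown. A minimal sketch of both; the output field names are assumptions for illustration:

    base_url = 'http://quotes.toscrape.com'  # assumed class attribute

    def author_parse(self, response):
        # selectors assume the author-page markup of quotes.toscrape.com
        yield {
            'name': response.xpath('//h3[@class="author-title"]/text()').get(),
            'born': response.xpath('//span[@class="author-born-date"]/text()').get(),
        }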
Example 8
    def parse(self, response):
        # A spider method: by default it is called once the requests built from the start_urls
        # links finish downloading. The response is passed to this function, which then parses
        # it, extracts data, or issues follow-up requests.
        quotes = response.css(".quote")  # all quote blocks
        for quote in quotes:
            item = QuoteItem()  # build the item object
            text = quote.css(".text::text").extract_first()
            author = quote.css(".author::text").extract_first()
            tags = quote.css(".tags .tag::text").extract()
            item["text"] = text
            item["author"] = author
            item["tags"] = tags
            yield item

        next_page = response.css('.pager .next a::attr(href)').extract_first()

        new_url = response.urljoin(next_page)
        print("-" * 30)
        print(next_page)
        print(new_url)
        print("-" * 30)

        yield scrapy.Request(url=new_url, callback=self.parse)
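Since Scrapy 1.4, `response.follow()` accepts relative URLs directly (and even selectors, as Examples 3 and 4 show), so the urljoin-plus-Request pattern that closes most of these examples can be shortened. A sketch, with a guard for the last page:

    next_href = response.css('.pager .next a::attr(href)').get()
    if next_href is not None:
        yield response.follow(next_href, callback=self.parse)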