Example No. 1
    def parse_author(self, response, text, author, tags):
        author_details = response.css('.container .author-details')

        item = QuoteItem()
        item['text'] = text
        item['author'] = author
        item['tags'] = ','.join(tags)

        # split the "Month day, year" string into date and year
        born_dates = author_details.css(
            '.author-born-date::text').extract_first()
        item['born_date'], item['born_year'] = [
            s.strip() for s in born_dates.split(',')]
        '''
        strip the leading 'in ' from the location,
        then split it into city and country;
        sometimes there is no city, in which case it is set to None
        '''
        born_location = author_details.css(
            '.author-born-location::text').extract_first()
        born_location = born_location.replace('in ', '')
        item['born_country'] = born_location.split(',')[-1].strip()
        if len(born_location.split(',')) > 1:
            item['born_city'] = born_location.split(',')[0].strip()
        else:
            item['born_city'] = None

        item['description'] = author_details.css(
            '.author-description::text').extract_first()

        yield item
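
Every example on this page fills a QuoteItem, but none of the snippets shows its definition. The field names imply an items.py along these lines (a minimal sketch: the three core fields cover most examples, and the author fields are the ones Example No. 1's parse_author assumes):

import scrapy

class QuoteItem(scrapy.Item):
    # core fields used by every parse() below
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
    # extra fields assumed only by Example No. 1
    born_date = scrapy.Field()
    born_year = scrapy.Field()
    born_country = scrapy.Field()
    born_city = scrapy.Field()
    description = scrapy.Field()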
Example No. 2
    def parse(self, response):
        # print(response.text)
        #grab the quote blocks
        quotes = response.css(".quote")
        for quote in quotes:
            # get the text inside the class=text tag, first match only
            text = quote.css(".text::text").extract_first()
            author = quote.css(".author::text").extract_first()
            #get the text of every class=tag element under class=tags
            tags = quote.css(".tags .tag::text").extract()

            item = QuoteItem()
            #surprisingly, fields here cannot be set with attribute (dot) access; dict-style access is required
            item["text"] = text
            item["author"] = author
            item["tags"] = tags
            #once yielded, Scrapy processes this item by default
            #only Item and Request objects may be yielded here
            #scrapy crawl quotes -o quotes.json saves the items to a JSON file
            #scrapy crawl quotes -o quotes.csv saves them as CSV
            #scrapy crawl quotes -o quotes.xml saves them as XML
            #the target file can also live on FTP, e.g. ftp://user:[email protected]/path/quotes.xml
            yield item
        #get the URL behind the next-page button
        next = response.css(".pager .next a::attr(href)").extract_first()
        #turn the relative url /page/2/ stored in next into an absolute url
        url = response.urljoin(next)
        #issue a request for the next page, recursively using parse as the callback
        yield scrapy.Request(url=url, callback=self.parse)
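
The feed-export commands in the comments above assume a spider named quotes. For context, each parse() on this page would live in a spider shaped roughly like this (a sketch; the module path and class name are illustrative):

import scrapy
from tutorial.items import QuoteItem  # hypothetical project module

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # any of the parse() bodies shown on this page goes here
        ...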
Example No. 3
 def parse(self, response):
     quotes = response.css('.quote')
     for quote in quotes:
         item = QuoteItem()
         text = quote.css('.text::text').extract_first()
         author = quote.css('.author::text').extract_first()
         tags = quote.css('.tags .tag::text').extract()
         item['text'] = text
         item['author'] = author
         item['tags'] = tags
         yield item
     next = response.css('.pager .next a::attr(href)').extract_first()
     url = response.urljoin(next)
     yield scrapy.Request(url=url, callback=self.parse)
Example No. 4
    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            item = QuoteItem()
            text = quote.css(".text::text").extract_first()
            author = quote.css(".author::text").extract_first()
            tags = quote.css(".tags .tag::text").extract()
            item["text"] = text
            item["author"] = author
            item["tags"] = tags
            yield item

        next = response.css(".pager .next a::attr(href)").extract_first()
        url = response.urljoin(next)
        yield scrapy.Request(url=url, callback=self.parse)
Example No. 5
    def parse(self, response):
        quotes = response.css(".quote")
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()     #use extract_first() when there is only one match
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()     #use extract() when there are multiple matches
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        next = response.css('.pager .next a::attr(href)').extract_first()
        url = response.urljoin(next)        #response.urljoin() resolves its argument against the current page's url
        yield scrapy.Request(url=url, callback=self.parse)      #the first argument is the URL to request; callback points back at this method
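
The extract_first()/extract() distinction these comments describe is easy to check interactively (a sketch of a scrapy shell session against http://quotes.toscrape.com; the literal values come from the first quote on the site and may change):

# scrapy shell http://quotes.toscrape.com
quote = response.css('.quote')[0]
quote.css('.text::text').extract_first()  # a single string: '“The world as we have created it ...”'
quote.css('.tags .tag::text').extract()   # a list of strings: ['change', 'deep-thoughts', ...]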
Example No. 6
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()

            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        next_page_url = response.css("li.next > a::attr(href)").extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
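
Example No. 6 is the only version that checks whether a next link exists. On the last page extract_first() returns None; response.urljoin(None) then yields the current page's own URL, and Scrapy's duplicate filter quietly drops the repeat request, so the unguarded versions still terminate, just less explicitly. On Scrapy 1.4+ the guard and the join can also be combined with response.follow, which accepts relative URLs directly (a sketch):

next_page_url = response.css('li.next > a::attr(href)').extract_first()
if next_page_url is not None:
    # response.follow resolves the relative URL itself
    yield response.follow(next_page_url, callback=self.parse)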
Example No. 7
 def parse(self, response):
     print('after adding the middleware, response.status =', response.status)
     quotes = response.css('.quote')
     for quote in quotes:
         item = QuoteItem()
         text = quote.css('.text::text').extract_first()
         #'.text::text' yields the tag's text content; extract_first picks the first match
         author = quote.css('.author::text').extract_first()
         tags = quote.css('.tags .tag::text').extract()
         item['text'] = text
         item['author'] = author
         item['tags'] = tags
         yield item
     next = response.css('.pager .next a::attr(href)').extract_first()
     url = response.urljoin(next)
     yield scrapy.Request(url=url, callback=self.parse)
Example No. 8
 def parse(self, response):
     quotes = response.css('.quote') #grab the quote blocks with a css selector
     for quote in quotes:
         item = QuoteItem()
         text = quote.css('.text::text').extract_first()  #first matching text via the css selector
         author = quote.css('.author::text').extract_first()
         tags = quote.css('.tags .tag::text').extract() #grab all matches
         # fill in the item
         item['text'] = text
         item['author'] = author
         item['tags'] = tags
         yield item
     # pagination
     next = response.css('.pager .next a::attr(href)').extract_first()
     url = response.urljoin(next)
     # callback=self.parse makes this method call itself recursively
     yield scrapy.Request(url=url, callback=self.parse)
Example No. 9
    def parse(self, response):
        # print(response.text)
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()  #unlike the fields above there are several tags, so extract() collects them all
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        next = response.css('.pager .next a::attr(href)').extract_first()
        url = response.urljoin(next)  #complete the full url
        yield scrapy.Request(url=url, callback=self.parse)
Example No. 10
    def parse(self, response):
        # print(response.text)
        # pass
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        # locate the next-page href with css, join it into a full url, and have callback re-enter parse to crawl page after page
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        url = response.urljoin(next_page)
        yield scrapy.Request(url=url, callback=self.parse)
Example No. 11
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            #use CSS selectors to pick out the desired information
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            #fill the data structure defined in items.py
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        #pagination
        next = response.css('.pager .next a::attr(href)').extract_first()
        url = response.urljoin(next)
        yield scrapy.Request(url=url, callback=self.parse)  #the callback loops back into this method
Example No. 12
    def parse(self, response):
        # pass
        # print(response.text)
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            # ::text selects the node's text; extract pulls it out
            text = quote.css('.text::text').extract_first()  # take the first result
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()  # take all results
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        next = response.css('.pager .next a::attr(href)').extract_first()
        url = response.urljoin(next)
        yield scrapy.Request(url=url, callback=self.parse)  # recurse to loop through each next page
Example No. 13
    def parse(self, response):
        quotes = response.css(".quote")
        ##grab the class=quote blocks and iterate over them
        for quote in quotes:
            item = QuoteItem()
            text = quote.css(".text::text").extract_first(
            )  ##css选择器,::scrapy 特有语法结构,获取class=test里的文本内容,extract_first方法拿到内容;
            author = quote.css(".author::text").extract_first()
            tags = quote.css(".tags .tag::text").extract(
            )  ##tags是多级的,css级联;extract()提取全部内容;
            item["text"] = text
            item["author"] = author
            item["tags"] = tags
            yield item

        next = response.css(
            ".pager .next a::attr(href)").extract_first()  ##link extraction: attr(attribute name)
        url = response.urljoin(next)  ##the urljoin method builds the absolute link
        yield scrapy.Request(url=url, callback=self.parse)  #call back into itself, completing the recursion
Example No. 14
 def parse(self, response):
     quotes = response.css('.quote')
     for quote in quotes:
         item = QuoteItem()
         text = quote.css('.text::text').extract_first()  # '.text::text' grabs the text of the .text class
         author = quote.css('.author::text').extract_first()  # extract_first returns a single value
         tags = quote.css('.tags .tag::text').extract()  # extract returns several values; '.tags .tag::text' grabs the text of .tag nodes under .tags
         item['text'] = text
         item['author'] = author
         item['tags'] = tags
         yield item
     #pagination: first grab the next-page link
     next = response.css('.pager .next a::attr(href)').extract_first()
     #join the URL
     url = response.urljoin(next)
     #keep requesting the next URL, over and over
     yield scrapy.Request(url=url, callback=self.parse)
Example No. 15
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            # ::text --> extracts the inner text (Scrapy-specific selector syntax)
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        # grab the next page's url
        next = response.css('.pager .next a::attr(href)').extract_first()
        # join it into a new url
        url = response.urljoin(next)
        # callback names who handles the response once this url is fetched
        # the recursive call implements the page-by-page loop
        yield scrapy.Request(url=url, callback=self.parse)
Example No. 16
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()

            item = QuoteItem()
            # item['text'] = text
            # item['author'] = author
            # item['tags'] = tags
            for field in item.fields:
                try:
                    item[field] = eval(field)
                except NameError:
                    self.logger.debug('Field is not defined: ' + field)
            yield item

        next_page = response.css('.next a::attr(href)').extract_first()
        url = response.urljoin(next_page)
        yield scrapy.Request(url=url, callback=self.parse)
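
The eval(field) loop above saves retyping the three assignments, but it evaluates field names as arbitrary Python expressions. The same effect can be had without eval by looking the values up in an explicit dict (a sketch):

values = {'text': text, 'author': author, 'tags': tags}
for field in item.fields:
    if field in values:
        item[field] = values[field]
    else:
        self.logger.debug('Field is not defined: ' + field)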
Example No. 17
    def parse(self, response):  #parse a single page
        #pass
        #print(response.text)
        quotes = response.css('.quote')
        for quote in quotes:  # avoid shadowing the quotes list with the loop variable
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()  #pass in a CSS selector; use extract_first() when there is a single element
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()  #use extract() for multiple elements; the results come back as a list
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item  #yields the dict-like item

        #pagination loop
        next = response.css('.pager .next a::attr(href)').extract_first()
        url = response.urljoin(next)  #urljoin builds the absolute url
        yield scrapy.Request(
            url=url,
            callback=self.parse)  #equivalent to issuing a fresh request; the callback recursively invokes parse, the index-page handler
Example No. 18
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()

            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()

            item['text'] = text
            item['author'] = author
            item['tags'] = tags

            yield item

        # pagination
        next = response.css(
            '.next a::attr(href)').extract_first()  # /page/2/
        url = response.urljoin(
            next)  # join the relative path into an absolute one: http://quotes.toscrape.com/page/2/
        yield scrapy.Request(
            url=url,
            callback=self.parse)  # request the next page; the response goes to the callback for further handling, recursing into parse
Example No. 19
    def parse(self, response):
        quotes = response.css('.quote')

        for quote in quotes:
            # adjust the pipelines to produce two csv files
            # first csv
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = ','.join(tags)
            yield item

            #second csv
            #follow the (about) link into the author page
            about = quote.css('a::attr(href)').extract_first()
            about_url = response.urljoin(about)
            yield scrapy.Request(url=about_url, callback=self.parse_author)
        #next page
        next = response.css('.pager .next a::attr(href)').extract_first()
        next_url = response.urljoin(next)
        yield scrapy.Request(url=next_url, callback=self.parse)
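
Example No. 19's parse_author callback is the one shown as Example No. 1, whose signature takes text, author and tags in addition to response. A bare scrapy.Request does not supply those extra arguments; on Scrapy 1.7+ they could be passed through cb_kwargs (a sketch; request.meta is the older equivalent):

yield scrapy.Request(
    url=about_url,
    callback=self.parse_author,
    # keyword arguments forwarded to parse_author
    cb_kwargs={'text': text, 'author': author, 'tags': tags},
)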
Example No. 20
 def parse(self, response):
     # pass
     # pass: the default callback method stub
     # print("response.text")
     quotes = response.css('.quote')
     for quote in quotes:
         item = QuoteItem()
         #defined in items.py
         text = quote.css('.text::text').extract_first()
         #narrow the selection down within the quote
         #:: is Scrapy-specific syntax; ::text outputs the text inside .text
         #extract_first takes the first result
         author = quote.css('.author::text').extract_first()
         tags = quote.css('.tags .tag::text').extract()
         #there are several tags; extract grabs them all, much like find()/findall()
         item['text'] = text
         item['author'] = author
         item['tags'] = tags
         yield item
     #extract the next-page link
     next = response.css('.pager .next a::attr(href)').extract_first()
     url = response.urljoin(next)
     #urljoin() builds the site's full url
     yield scrapy.Request(url=url, callback=self.parse)