def parse(self, response):
        """Parse one listing page and schedule an author-page request per quote.

        ``self.state['items_count']`` is incremented once per call. For every
        quote node on the page a ``QuotesItem`` is filled with ``quote``,
        ``author`` and ``tags`` and a ``scrapy.Request`` for the author page
        is yielded; the item travels in ``meta['item']`` to
        ``self.parse_author_detail_page``. Request failures are routed to
        ``self.handle_error``.

        A page without quote nodes yields nothing. A quote node without an
        author link is skipped with a warning instead of aborting the page.
        """
        self.state['items_count'] = self.state.get('items_count', 0) + 1

        for quote in response.xpath('/html/body/div/div[2]/div[1]/div'):
            author_url = quote.xpath('.//span[2]/a/@href').extract_first()
            if author_url is None:
                # Without the link there is no detail page to request.
                self.logger.warning(
                    'Skipping quote without author link on %s', response.url)
                continue

            item = QuotesItem()
            # extract_first() returns None for a missing node instead of
            # raising IndexError like ``[0].extract()`` did.
            item['quote'] = quote.xpath('.//span[1]/text()').extract_first()
            item['author'] = quote.xpath(
                './/span[2]/small/text()').extract_first()
            # Tags are stored as the string form of the extracted list.
            item['tags'] = str(quote.xpath('.//div/a/text()').extract())

            # dont_filter=True lets the same author URL be requested again
            # for every quote that links to it.
            yield scrapy.Request(
                self.clean_url(author_url),
                callback=self.parse_author_detail_page,
                meta={'item': item},
                dont_filter=True,
                errback=self.handle_error)
    def parse_author(self, response):
        """Yield one item per quote block found on an author page.

        Every yielded item is a copy of ``response.meta['item']`` extended
        with ``image_urls``, ``author_nationality`` and ``quote_text``.
        """
        nationality_xpath = (
            '//div[contains(@class, "bqLn") and contains(text(), "ationality")]/a/text()'
        )
        quote_blocks_xpath = '//div[contains(@id, "quotesList")]//div[contains(@class,"bqQt")]'
        quote_text_xpath = './/span[contains(@class, "uote") and contains(@class, "ink")]/a[contains(@href, "quotes")]/text()'

        # The nationality is looked up once on the whole page and reused for
        # every quote block.
        nationality = response.xpath(nationality_xpath).extract_first()

        for block in response.xpath(quote_blocks_xpath):
            image_src = block.xpath('./a/img/@src').extract_first()

            # Copy the item handed over by the previous callback.
            item = QuotesItem(response.meta['item'])
            # Image paths are resolved against the page URL; the list stays
            # empty when the block has no image.
            item['image_urls'] = (
                [response.urljoin(image_src)] if image_src else []
            )
            item['author_nationality'] = nationality
            item['quote_text'] = block.xpath(quote_text_xpath).extract_first()

            yield item
Exemple #3
0
 def parse(self, response):
     """Yield one ``QuotesItem`` (``author`` + ``content``) per quote on the page.

     Authors and quote texts are extracted as two parallel lists and paired
     positionally with ``zip``. Each pair gets its own item instance.
     """
     import ast

     print("爬取页面 "+ response.url)
     content_list = response.xpath('//div[@class="quote"]/span[@class="text"]/text()').extract()
     auth_list = response.xpath('//div[@class="quote"]//small[@class="author"]/text()').extract()
     for author, raw_content in zip(auth_list, content_list):
         # SECURITY: this used to be eval(raw_content), i.e. arbitrary code
         # execution on text downloaded from the remote site. literal_eval
         # only accepts Python literals; anything else is kept as raw text.
         try:
             content = ast.literal_eval(raw_content)
         except (ValueError, SyntaxError, TypeError, MemoryError,
                 RecursionError):
             content = raw_content

         # A fresh item per quote so already-yielded items are never
         # overwritten by later iterations.
         item = QuotesItem()
         item['author'] = author
         item['content'] = content
         yield item
Exemple #4
0
 def parse(self, response):
     """Yield a ``QuotesItem`` with ``quote`` and ``author`` for each quote block."""
     for quote_sel in response.xpath("//div[@class='quote']"):
         # A fresh item per quote: re-yielding one shared instance lets
         # later iterations overwrite items that were already handed on.
         item = QuotesItem()
         # extract_first() gives None for a missing node instead of the
         # IndexError raised by ``extract()[0]``.
         item["quote"] = quote_sel.xpath(
             "./span[@class='text']/text()").extract_first()
         # The author is read from the second <span> child of the block.
         item["author"] = quote_sel.xpath(
             "./span[2]/small[@class='author']/text()").extract_first()
         yield item
Exemple #5
0
    def parse(self, response):
        """Yield one ``QuotesItem`` per ``.quote`` element, then the next page.

        Each item carries ``quote``, ``author``, ``author_url`` and ``tags``.
        ``author_url`` is ``None`` when no author link is found. A request
        for the following page is yielded only when a ``.next`` link exists.
        """
        for quote in response.css('.quote'):
            author_href = quote.css('.author a::attr(href)').get()
            yield QuotesItem(
                quote=quote.css('.text::text').get(),
                author=quote.css('.author::text').get(),
                # Join only when a link was found: urljoin(None) would
                # silently return the page's own URL.
                author_url=(response.urljoin(author_href)
                            if author_href else None),
                tags=quote.css('.tag *::text').getall())

        # On the last page there is no next link; stop instead of
        # re-requesting the current page.
        next_href = response.css('.next a::attr(href)').get()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href))
    def parse(self, response):
        """Load one ``QuotesItem`` per ``div.quote`` element via ``ItemLoader``."""
        # (item field, CSS query) pairs, applied to every quote in this order.
        field_queries = (
            ("author", ".author::text"),
            ("quote", ".text::text"),
            ("tags", ".tag::text"),
        )

        for quote_sel in response.css("div.quote"):
            loader = ItemLoader(item=QuotesItem(), selector=quote_sel)
            for field_name, css_query in field_queries:
                loader.add_css(field_name, css_query)

            yield loader.load_item()
Exemple #7
0
    def parse(self, response):
        """Yield an item per ``div.quote`` and follow the ``li.next`` link.

        Items carry ``content``, ``author`` and ``tags`` (tag texts joined
        with commas). The next-page request uses the default callback.
        """
        for quote in response.css('div.quote'):
            # A fresh item per quote: a single shared instance would be
            # overwritten after it has already been yielded.
            item = QuotesItem()
            item['content'] = quote.css('span.text::text').get()
            item['author'] = quote.css('small.author::text').get()
            item['tags'] = ','.join(quote.css('div.tags a.tag::text').getall())
            yield item

        next_page_url = response.css('li.next > a::attr(href)').get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
Exemple #8
0
    def parse_item(self, response):
        """Log the page URL and yield a ``QuotesItem`` per ``.quote`` element.

        ``text`` and ``author`` are picked positionally (indices 1 and 4)
        from the element's text nodes and stripped of surrounding whitespace.
        """
        self.log('Scraping: ' + response.url)

        for article in response.css('.quote'):
            item = QuotesItem()
            # Collect the text nodes once and read both fields from the list.
            text_nodes = article.css('::text').extract()
            item['text'] = text_nodes[1].strip()
            item['author'] = text_nodes[4].strip()

            yield item
Exemple #9
0
 def parse_author(self, response):
     """Build the final item from an author page plus the data carried in meta.

     ``description`` is the stripped text of the ``author-description``
     block, or an empty string when the page has none. ``content``,
     ``author``, ``tags`` and ``author_url`` are copied from
     ``response.meta``.
     """
     # default='' keeps .strip() from failing on None when the block is
     # missing from the page.
     description = response.xpath(
         '//div[@class="author-description"]/text()').extract_first(
             default='').strip()

     item = QuotesItem()
     item['description'] = description
     for field in ('content', 'author', 'tags', 'author_url'):
         item[field] = response.meta[field]
     yield item
    def parse(self, response):
        """Yield an item per ``.quote`` element, then request the next page.

        Items carry ``text``, ``author`` and ``tags``. The crawl stops when
        the pager has no ``.next`` link.
        """
        for quote in response.css('.quote'):
            q_item = QuotesItem()
            q_item['text'] = quote.css('.text::text').extract_first()
            q_item['author'] = quote.css('.author::text').extract_first()
            q_item['tags'] = quote.css('.tags .tag::text').extract()
            yield q_item

        # Named next_href so the builtin ``next`` is not shadowed.
        next_href = response.css('.pager .next a::attr("href")').extract_first()
        # Without this guard urljoin(None) yields the current URL and the
        # last page is requested again.
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse)
    def extractData(self, res):
        """Fill a ``QuotesItem`` from each ``div.quote`` and pass it to ``writeTxt``.

        One item instance is reused: its ``quote``, ``author`` and ``tags``
        fields are overwritten for every quote before ``self.writeTxt`` is
        called with it.
        """
        record = QuotesItem()
        non_ascii_pattern = r'[^\x00-\x7f]'

        for quote_sel in res.css('div.quote'):
            raw_text = quote_sel.css('span.text::text').extract_first()
            # Drop every non-ASCII character, then wrap the rest in plain
            # double quotes.
            record['quote'] = '"' + re.sub(non_ascii_pattern, r'', raw_text) + '"'
            record['author'] = quote_sel.css('small.author::text').extract_first()
            tag_names = quote_sel.css('div.tags > a.tag::text').extract()
            record['tags'] = ' '.join(map(str, tag_names))

            self.writeTxt(record)
Exemple #12
0
    def parse(self, response):
        """Yield an item per ``.quote`` element and follow the ``.next`` link.

        Items carry ``text``, ``author`` and ``tags``. No request is yielded
        once the page has no next link.
        """
        for quote in response.css(".quote"):
            item = QuotesItem()
            item['text'] = quote.css(".text::text").extract_first()
            item['author'] = quote.css(".author::text").extract_first()
            item['tags'] = quote.css(".tags .tag::text").extract()
            yield item

        # next_href rather than ``next`` keeps the builtin usable.
        next_href = response.css(".next a::attr(href)").extract_first()
        if next_href:
            # Only join a real link; joining None would point back at the
            # page that was just parsed.
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse)
Exemple #13
0
 def parse_quotes(self, response):
     """Yield ranked quote items, optionally filtered by author, then paginate.

     ``rank`` is the quote's position among all ``.quote`` elements of the
     page, counted before filtering. When ``self.author`` is truthy only
     quotes by exactly that author are yielded. The next page is followed
     with this same callback when a pager link exists.
     """
     for rank, quote_sel in enumerate(response.css('.quote')):
         quote_text = quote_sel.css('.text::text').get()
         quote_author = quote_sel.css('.author::text').get()

         # A falsy self.author disables the filter altogether.
         if self.author and quote_author != self.author:
             continue

         yield QuotesItem(
             text=quote_text,
             author=quote_author,
             url=response.url,
             rank=rank,
             scrape_date=datetime.now().isoformat(),
         )

     next_href = response.css('.pager .next a::attr(href)').get()
     if not next_href:
         return
     yield response.follow(next_href, callback=self.parse_quotes)
    def parse(self, response):
        """Request the author page of the first listed popular author.

        A ``QuotesItem`` holding ``author_name`` is passed to
        ``self.parse_author`` through ``meta['item']``. Only the first
        matching author is processed; the loop ends right after its request.
        """
        authors_xpath = '//div/h2[contains(text(), "opular") and contains(text(), "thors")]/..//div[contains(@class, "bqLn")]'

        for author_sel in response.xpath(authors_xpath):
            # Item that accompanies the request to the author page.
            item = QuotesItem()
            item['author_name'] = author_sel.xpath('./a/text()').extract_first()

            author_href = author_sel.xpath('./a/@href').extract_first()
            yield scrapy.Request(response.urljoin(author_href),
                                 callback=self.parse_author,
                                 meta={'item': item})
            # Stop after the first author, as the original loop did.
            return
Exemple #15
0
    def parse(self, response):
        """Yield an item per quote block and follow the pager's "next" link.

        Items carry ``quote``, ``author`` and ``tags``. The crawl ends
        cleanly on the last page, where no "next" link exists.
        """
        for quote in response.xpath("//div[@class='quote']"):
            item = QuotesItem()
            item['quote'] = quote.xpath(
                "span[@class='text']/text()").extract_first()
            item['author'] = quote.xpath(
                "span/small[@class='author']/text()").extract_first()
            item['tags'] = quote.xpath(
                "div[@class='tags']/a[@class='tag']/text()").extract()
            yield item

        next_page = response.xpath(
            "//ul[@class='pager']/li[@class='next']/a/@href").extract_first()
        # extract_first() returns None on the last page; concatenating that
        # to the host string used to raise TypeError.
        if next_page:
            # urljoin resolves the link against the page that was fetched
            # instead of relying on a hard-coded host.
            yield scrapy.http.Request(response.urljoin(next_page),
                                      callback=self.parse)
Exemple #16
0
    def parse(self, response):
        """Yield a ranked ``QuotesItem`` per ``.quote`` element, then paginate.

        ``rank`` is the element's position on the page and ``scrapy_date``
        the ISO timestamp taken when the item is built. The pager link, when
        present, is followed with the default callback.
        """
        for position, quote_sel in enumerate(response.css('.quote')):
            yield QuotesItem(
                text=quote_sel.css('span.text::text').get(),
                author=quote_sel.css('.author::text').get(),
                url=response.url,
                rank=position,
                scrapy_date=datetime.now().isoformat(),
            )

        next_href = response.css('.pager .next a::attr(href)').get()
        if next_href is None:
            return
        yield response.follow(next_href)
 def parse(self, response):
     """Yield an item per quote block and follow the "next" pager link.

     Items carry ``quote``, ``author`` and ``tag``; ``tag`` falls back to
     ``['no_tag']`` when the quote has no tags.
     """
     for quote in response.xpath('//div[@class= "quote"]'):
         # A fresh item per quote: one shared instance would be mutated
         # after it has already been yielded.
         item = QuotesItem()
         item['quote'] = quote.xpath(
             './/span[@class = "text"]/text()').get()
         item['author'] = quote.xpath(
             './/span/small[@class = "author"]/text()').get()
         # getall() returns an empty list for untagged quotes.
         item['tag'] = quote.xpath(
             './/div[@class = "tags"]/a/text()').getall() or ['no_tag']
         yield item

     next_page = response.xpath('//li[@class="next"]/a/@href').get()
     if next_page is not None:
         yield response.follow(next_page, callback=self.parse)
Exemple #18
0
    def parse(self, response):
        """Yield an item per ``.quote`` element and request the following page.

        Items carry ``text``, ``author`` and ``tags``. When the pager has no
        next link nothing further is requested.
        """
        for quote in response.css('.quote'):
            item = QuotesItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        # next_href avoids shadowing the builtin ``next``.
        next_href = response.css('.pager .next a::attr(href)').extract_first()
        # urljoin(None) resolves to the current page, so the request is
        # only built when a link was actually found.
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse)
Exemple #19
0
 def parse(self, response):
     """Yield a ``QuotesItem`` (``quote``, ``author``) per quote block, then paginate.

     The next page is requested with the default callback when the pager's
     "next" link is present.
     """
     for quote_sel in response.xpath('//div[@class="quote"]'):
         quote_text = quote_sel.xpath(
             'span[@class="text"]/text()').extract_first()
         author_name = quote_sel.xpath(
             'span/small[@class="author"]/text()').extract_first()
         yield QuotesItem(quote=quote_text, author=author_name)

     next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
     # The link is resolved before the check, as in the original code.
     absolute_next = response.urljoin(next_href)
     if not next_href:
         return
     yield Request(url=absolute_next)
Exemple #20
0
    def parse(self, response):
        """Yield an item per ``div.quote`` and follow at most a few pages.

        Items carry ``quotes`` (text with single quotes backslash-escaped),
        ``author``, ``author_link`` (``None`` when the block has no link)
        and ``tags`` (joined with ``;``). ``self.page`` is incremented on
        every call and the next page is requested only while it is below 3.
        """
        for quote in response.css('div.quote'):
            # A fresh item per quote so yielded items are not overwritten
            # by later iterations.
            item = QuotesItem()
            # default='' keeps .replace() from failing on a missing text.
            text = quote.css('span.text::text').extract_first(default='')
            item['quotes'] = text.replace("'", "\\'")
            item['author'] = quote.css('small.author::text').extract_first()
            link = quote.css('a::attr(href)').extract_first()
            # urljoin(None) would return the page URL, which is not an
            # author link.
            item['author_link'] = response.urljoin(link) if link else None
            item['tags'] = ';'.join(
                quote.css('div.tags a.tag::text').extract())
            yield item

        self.page += 1
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page and self.page < 3:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
Exemple #21
0
 def parse_product(self, response):
     """Yield one ``QuotesItem`` per (link, caption) pair on the page.

     ``hrefs`` is the link target of each ``qti-listm`` block and ``lines``
     the part of the image ``alt`` text before the first comma. Nothing is
     yielded when the page has no ``quotesList`` container.

     The original looped over every character of the container's extracted
     HTML and yielded the full set of items once per character; each pair
     is now yielded exactly once.
     """
     if not response.xpath('//div[contains(@id, "quotesList")]'):
         return

     hrefs = response.xpath(
         '//div[contains(@class, "qti-listm")]/a/@href').extract()
     alt_texts = response.xpath(
         "//div[@class='qti-listm']//a/img/@alt").extract()

     # Links and captions are paired positionally.
     for href, alt_text in zip(hrefs, alt_texts):
         new_item = QuotesItem()
         new_item['hrefs'] = href
         new_item['lines'] = alt_text.split(",")[0]
         yield new_item
Exemple #22
0
 def parse(self, response):
     """Parse one listing page: schedule a detail request per row, then paginate.

     For every ``onclick`` attribute found in the ``.tablelist1 td`` cells a
     request to the detail page is yielded with ``self.detail_parse2`` as
     callback; a new item, the assembled area string and the detail URL
     travel in ``meta``. Afterwards a request for another listing page is
     yielded unless the current page number reads ``'10'``.

     NOTE(review): ``time.sleep`` blocks the calling thread; Scrapy callbacks
     presumably run on the reactor thread, so these calls may stall other
     downloads -- confirm, and consider the ``DOWNLOAD_DELAY`` setting.
     """
     # Parse the data
     Source = response.css(".tablelist1 td")
     # NOTE(review): `message` is only used by the commented-out title code.
     message = Source.css("::text").extract()
     message2 = Source.css("::attr(onclick)").extract()
     # Text nodes of '#content .default'; presumably the region breadcrumb
     # -- verify against the page.
     message3 = response.css('#content .default::text').extract()
     # Iterate over a copy so entries can be removed from the list itself.
     temp = message3.copy()
     for i in temp:
         if '--' in i:
             message3.remove(i)
     # Drop the first two characters of the second entry when it contains
     # '杭州'. Raises IndexError if fewer than two entries remain.
     if '杭州' in message3[1]:
         message3[1] = message3[1][2:]
     # Append '市 ' to the first entry, put '浙江省 ' in front, and join
     # everything into one area string.
     message3[0] = message3[0] + '市 '
     message3.insert(0, '浙江省 ')
     message3 = ''.join(message3)
     time.sleep(0.1)
     # 1 Get the date and title
     # 1 _title = []
     # 1 for i in message[0::3]:
     # 1   _title.append(i)
     # Visit the second-level (detail) pages
     for index, value in enumerate(message2):
         item = QuotesItem()
         # 1 item['title'] = _title[index]
         # item['aera'] = message3
         # item['_url'] = 'http://www.zjzxts.gov.cn/wsdt/wsdtHtml/xfjxq.jsp?id='+ value[6:-2]
         # The id is the onclick value without its first 6 and last 2
         # characters.
         url = 'http://www.zjzxts.gov.cn/wsdt/wsdtHtml/xfjxq.jsp?id=' + value[
             6:-2]
         time.sleep(0.1)
         yield scrapy.Request(url,
                              meta={
                                  'item': item,
                                  'Aera': message3,
                                  'Url': url
                              },
                              callback=self.detail_parse2)
     # Read the current page number; int() then str() normalizes the text.
     current_page = response.css('#content .paginList #cp::text').extract()
     current_page = str(int(current_page[0]))
     if (current_page != '10'):
         one_url = response.css(
             '#content .paginList a::attr(href)').extract_first()
         # Replace the last five characters of the first pagination link
         # with the current page number followed by '&bt='.
         # NOTE(review): presumably this addresses the following page --
         # verify against the site's paging parameters.
         next_url = 'http://www.zjzxts.gov.cn' + one_url[:
                                                         -5] + current_page + '&bt='
         time.sleep(1)
         yield scrapy.Request(url=next_url, callback=self.parse)
    def parse(self, response):
        """Yield a ``QuotesItem`` (``title``, ``author``) per quote, then paginate.

        The request for the following page is yielded only when the pager
        contains a "next" entry.
        """
        for quote in response.xpath('//div[@class="quote"]'):
            item = QuotesItem()
            item['title'] = quote.xpath('span[@class="text"]/text()').extract_first()
            item['author'] = quote.xpath('span/small[@class="author"]/text()').extract_first()
            yield item

        # Select the "next" entry explicitly. The previous absolute path
        # took the first <li> of the pager, which is the "previous" link as
        # soon as one exists, so the crawl walked backwards and stalled.
        # NOTE(review): assumes the pager marks the entry with
        # class="next" -- confirm against the target site.
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
 def parse(self, response):
     """Yield an item per ``quoteText`` block and follow the ``next_page`` link.

     ``quote`` is the block's first text node, stripped, without its first
     and last character; ``author`` is the link text up to the first comma;
     ``source`` is the spider name. Blocks lacking either text are skipped.
     """
     # get div with quote and author
     for quote in response.xpath('//div[@class="quoteText"]'):
         raw_quote = quote.xpath('text()').extract_first()
         raw_author = quote.xpath('a/text()').extract_first()
         if raw_quote is None or raw_author is None:
             # Incomplete block. This used to be hidden by a bare
             # ``except: pass`` that also wrapped the ``yield`` and so
             # swallowed unrelated errors, GeneratorExit included.
             continue

         item = QuotesItem()
         item['quote'] = raw_quote.strip()[1:-1]
         item['author'] = raw_author.strip().split(',')[0]
         item['source'] = self.name
         yield item

     # find the next page link and go
     next_page = response.xpath(
         '//a[@class="next_page"]/@href').extract_first()
     # No link on the last page: stop instead of re-requesting this page.
     if next_page:
         yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Exemple #25
0
    def detail_parse2(self, response):
        """Fill the item received via ``meta`` with the detail-page fields.

        ``url`` and ``aera`` come from ``response.meta`` (``'Url'`` and
        ``'Aera'``); the remaining fields are read by position from the
        ``.blue1`` text nodes. A field whose position is missing on the page
        is left unset, while every field that is present is still stored.
        Previously the first missing position silently dropped all later
        fields, including ones that were available.

        Returns the populated item.
        """
        # NOTE(review): time.sleep blocks the calling thread; kept to
        # preserve the existing pacing -- consider DOWNLOAD_DELAY instead.
        time.sleep(2)
        # Use the item handed over by the listing callback instead of
        # discarding it and building a second one.
        item = response.meta['item']
        source = response.css('.blue1::text').extract()

        meta_values = {
            'url': response.meta['Url'],
            'aera': response.meta['Aera'],
        }
        # (field, position among the '.blue1' text nodes); None marks a
        # value taken from meta. The order matches the original assignments
        # so the item's field order is unchanged.
        layout = (
            ('url', None),
            ('date', 1),
            ('aera', None),
            ('source', 2),
            ('_content', 0),
            ('department', 3),
            ('ans_date', 4),
            ('ans_content', 5),
        )
        for field, position in layout:
            if position is None:
                item[field] = meta_values[field]
            elif position < len(source):
                item[field] = source[position]

        return item
    def parse(self, response):
        """Yield a loaded ``QuotesItem`` per ``.quoteDetails`` block, then paginate.

        ``<br>`` tags are stripped from the body before selecting. The next
        page is followed once per response, after all quotes were yielded.
        Previously the follow-up request sat inside the quote loop, so it
        was re-issued for every quote and never issued for a page without
        quotes.
        """
        # Remove the line breaks on the html
        response = response.replace(body=response.body.replace(b"<br>", b""))

        for quote in response.css(".quoteDetails"):
            # Create an item loader with the quote data and add it as a new quote_item
            self.logger.info("Creating quote_item")
            loader = ItemLoader(item=QuotesItem(), selector=quote)
            loader.add_css("quote_content", ".quoteText::text")
            loader.add_css("author_name", ".quoteText .authorOrTitle::text")
            loader.add_css("author_image", ".leftAlignedImage img::attr(src)")
            loader.add_css("tags", ".greyText.smallText.left a::text")

            yield loader.load_item()

        # Scrape the next page
        next_page = response.css("a.next_page::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
# Third-party imports
from scrapy import Spider
from scrapy.loader import ItemLoader

# Local application imports
from quotes.items import QuotesItem

# Module-level QuotesItem instance.
# NOTE(review): not referenced in the visible part of the spider below, whose
# parse() builds its own QuotesItem per quote -- confirm it is still needed.
quote_item = QuotesItem()


class QuotesSpider(Spider):
    """ Web Scraping Spider for Goodreads website. """

    # Class attributes
    name = "quotes"
    start_urls = ["https://www.goodreads.com/quotes?page=1"]

    def parse(self, response):
        """ Parsing function for the spider's requests. """

        # Remove the line breaks on the html
        response = response.replace(body=response.body.replace(b"<br>", b""))

        for quote in response.css(".quoteDetails"):
            # Create an item loader with the quote data and add it as a new quote_item

            self.logger.info("Creating quote_item")
            loader = ItemLoader(item=QuotesItem(), selector=quote)
            loader.add_css("quote_content", ".quoteText::text")
            loader.add_css("author_name", ".quoteText .authorOrTitle::text")
            loader.add_css("author_image", ".leftAlignedImage img::attr(src)")