Example #1
0
    def scrape_home(self, response):
        """Yield one item per quote on the page, then follow pagination.

        Bugs fixed versus the original:
        - A single ItemLoader was created outside the loop, so every
          yielded item accumulated the text/author/tags of ALL previous
          quotes; a fresh loader is now built per quote.
        - The next-page request was built unconditionally, so on the last
          page ``response.urljoin(None)`` raised; it is now guarded.
        """
        # open_in_browser(response)  # debugging helper; enable when needed
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@itemprop="author"]/text()').extract_first()
            tags = quote.xpath(
                './/*[@itemprop="keywords"]/@content').extract_first()

            # One loader per quote so each item carries only its own values.
            loader = ItemLoader(item=QuotesSpiderItem(), selector=quote)
            loader.add_value('text', text)
            loader.add_value('author', author)
            loader.add_value('tags', tags)
            yield loader.load_item()

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_page_url:  # link is absent on the last page
            yield Request(response.urljoin(next_page_url))
Example #2
0
    def parse(self, response):
        """Yield one item per quote, then follow the pagination link once.

        Bugs fixed versus the original:
        - The next-page request was yielded INSIDE the quote loop, emitting
          one duplicate request per quote on every page; it is now emitted
          once, after the loop, and only when a next page exists.
        - A single ItemLoader was shared across the loop, so each yielded
          item accumulated the values of all previous quotes; a fresh
          loader is now built per quote.
        """
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@itemprop="author"]/text()').extract_first()
            tags = quote.xpath(
                './/*[@itemprop="keywords"]/@content').extract_first()

            l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
            l.add_value('author', author)
            l.add_value('tags', tags)
            l.add_value('text', text)
            yield l.load_item()

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_page_url:  # absent on the final page
            yield scrapy.Request(response.urljoin(next_page_url))
Example #3
0
    def parse(self, response):
        """Accumulate every quote on the page into one item and emit it.

        Bugs fixed versus the original:
        - ``return l.load_item()`` after a ``yield`` makes this a
          generator, so the loaded item was silently discarded (it only
          set StopIteration.value and never reached the pipeline); the
          item is now yielded.
        - The debug ``print(..., author.upper(), ...)`` raised
          AttributeError whenever the author node was missing; removed.
        - The next-page request is only yielded when a next page exists,
          so ``urljoin(None)`` can no longer raise on the last page.
        """
        l = ItemLoader(item=QuotesSpiderItem(), response=response)

        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@class="author"]/text()').extract_first()
            tags = quote.xpath('.//*[@class="tag"]/text()').extract()
            l.add_value('text', text)
            l.add_value('author', author)
            l.add_value('tags', tags)

        # Emit the accumulated item; a `return` here would drop it.
        yield l.load_item()

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))
Example #4
0
    def parse(self, response):
        """Load the page heading and the sidebar tags into a single item.

        Bug fixed: the tag selector used class ``"tags-item"``; every
        other spider in this project selects class ``"tag-item"``, and the
        misspelled class matched nothing, so ``tags`` was always empty.
        """
        item_loader = ItemLoader(item=QuotesSpiderItem(), response=response)
        h1_tag = response.xpath('//h1/a/text()').extract_first()
        tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
        item_loader.add_value('h1_tag', h1_tag)
        item_loader.add_value('tags', tags)

        return item_loader.load_item()
Example #5
0
    def scrape_home_page(self, response):
        """Load the page heading and sidebar tags into a single item.

        Bug fixed: the original body mixed tab and space indentation
        (spaces on the ``h1_tag`` line, tabs elsewhere), which raises
        TabError under Python 3. Indentation is normalized to four spaces.
        """
        # open_in_browser(response)  # debugging helper
        l = ItemLoader(item=QuotesSpiderItem(), response=response)
        h1_tag = response.xpath('//h1/a/text()').extract_first()
        tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
        l.add_value('h1_tag', h1_tag)
        l.add_value('tags', tags)
        return l.load_item()
Example #6
0
    def parse(self, response):
        """Extract the page heading and first link into a QuotesSpiderItem."""
        loader = ItemLoader(item=QuotesSpiderItem(), response=response)

        # First <h1> text and first anchor href on the page.
        page_heading = response.xpath("//h1/text()").extract_first()
        first_link = response.xpath("//a/@href").extract_first()

        loader.add_value('heading', page_heading)
        loader.add_value('link', first_link)
        return loader.load_item()
Example #7
0
    def parse(self, response):
        """Emit one QuotesSpiderItem per quote, then recurse into the next page."""
        for quote_sel in response.css('div.quote'):
            entry = QuotesSpiderItem()
            entry['text'] = quote_sel.css('span.text::text').extract_first()
            entry['author'] = quote_sel.xpath('span/small/text()').extract_first()
            yield entry

        # Follow pagination with the same callback until no next link exists.
        next_href = response.css('li.next a::attr("href")').extract_first()
        if next_href is not None:
            yield response.follow(next_href, self.parse)
Example #8
0
    def scrape_home_page(self, response):
        """Open the fetched page locally, then load heading + tags into an item.

        Bug fixed: the original mixed tab and space indentation (tabs on
        the first body lines, spaces afterwards), which raises TabError
        under Python 3. Indentation is normalized to four spaces.
        """
        # For debugging only: renders the downloaded response in a browser.
        open_in_browser(response)

        l = ItemLoader(item=QuotesSpiderItem(), response=response)

        h1_tag = response.xpath('//h1/a/text()').extract_first()
        tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()

        l.add_value('h1_tag', h1_tag)
        l.add_value('tags', tags)

        return l.load_item()


# Commenting all out.
    # def parse(self, response):


    # 	l = ItemLoader(item=QuotesSpiderItem(), response=response)
    # 	# Commenting this out. Just for understanding.
    #     h1_tag = response.xpath('//h1/a/text()').extract_first()
    #     tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()

    #     # yield {'H1 Tag': h1_tag, 'Tags': tags}
    #     l.add_value('h1_tag', h1_tag)
    #     l.add_value('tags', tags)

    #     return l.load_item()


    #     # quotes = response.xpath('//*[@class="quote"]')
    #     # for quote in quotes:
    #     # 	text = quote.xpath('.//*[@class="text"]/text()').extract_first()
    #     # 	author = quote.xpath('.//*[@itemprop="author"]/text()').extract_first()
    #     # 	tags = quote.xpath('.//*[@itemprop="keywords"]/@content').extract_first()
    #     # 	# tags = quote.xpath('.//*[@class="tag"]/text()').extract()


    #     # 	# If you want to print the data.
    #     # 	# print('\n')
    #     # 	# print(text)
    #     # 	# print(author)
    #     # 	# print(tags)
    #     # 	# print('\n')

    #     # 	yield{'Text':text,
    #     # 		  'Author':author,
    #     # 		  'Tags':tags}

    #     # next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
    #     # absolute_next_page_url = response.urljoin(next_page_url)

    #     # yield scrapy.Request(absolute_next_page_url)
Example #9
0
    def parse(self, response):
        """Scrape the homepage title and the ten sidebar tags into one item."""
        loader = ItemLoader(item=QuotesSpiderItem(), response=response)

        # Title of the page (the <h1> link text).
        page_title = response.xpath('//h1/a/text()').extract_first()
        # The ten tag links shown on the left side of the page.
        sidebar_tags = response.xpath(
            '//*[@class="tag-item"]/a/text()').extract()

        loader.add_value('h1_tag', page_title)
        loader.add_value('tags', sidebar_tags)

        return loader.load_item()
Example #10
0
    def parse(self, response):
        """Load the page heading and sidebar tags into a single item.

        Bug fixed: the ItemLoader was constructed with the misspelled
        keyword ``reponser=response``, which raises TypeError (unexpected
        keyword argument) on every call; it is now ``response=response``.
        The stray tab in that line's indentation is also normalized.
        """
        l = ItemLoader(item=QuotesSpiderItem(), response=response)

        h1_tag = response.xpath('.//h1/a/text()').extract_first()
        tags = response.xpath('.//*[@class="tag-item"]/a/text()').extract()

        l.add_value('h1_tag', h1_tag)
        l.add_value('tags', tags)

        return l.load_item()
Example #11
0
    def home_page_advanced(self, response):
        """
        Scrape homepage with help of pipelines.py & items.py
        """
        # open_in_browser(response)  # debugging helper
        loader = ItemLoader(item=QuotesSpiderItem(), response=response)

        heading = response.xpath('//h1/a/text()').extract_first()
        tag_names = response.xpath(
            '//*[@class="tag-item"]/a/text()').extract()

        loader.add_value('h1_tag', heading)
        loader.add_value('tags', tag_names)

        return loader.load_item()
Example #12
0
    def parse(self, response):
        """Collect every quote on the page into one accumulated item.

        NOTE(review): a single ItemLoader is shared across the loop, so the
        yielded item's 'text'/'author'/'tags' fields each hold the values of
        ALL quotes on the page, not one item per quote — presumably
        intentional here, but confirm against the item pipeline.
        """

        l = ItemLoader(item=QuotesSpiderItem(), response=response)

        quotes = response.xpath("//div[@class='quote']")
        for quote in quotes:
            # Per-quote fields, extracted relative to the quote node.
            text = quote.xpath(".//*[@class='text']/text()").extract_first()
            author = quote.xpath(
                ".//*[@itemprop='author']/text()").extract_first()
            tags = quote.xpath(".//*[@itemprop='keywords']/@content").extract()

            # add_value appends, so each field grows one entry per quote.
            l.add_value('text', text)
            l.add_value('author', author)
            l.add_value('tags', tags)

        # Single item for the whole page; no pagination is followed here.
        yield l.load_item()
        """
    def parse(self, response):
        l = ItemLoader(item=QuotesSpiderItem(), response=response)
        # h1_tag = response.xpath('//h1/a/text()').extract_first()
        # tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()

        # # yield to print to scrappy output
        # yield {'H1 Tag': h1_tag, 'Tags': tags}
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            # quote.xpath('.//*[@itemprop="text"]/text()').extract_first()
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@itemprop="author"]/text()').extract_first()

            # string or unicode
            # quote.xpath('.//*[@itemprop="keywords"]/@content').extract_first()
            # list
            # quote.xpath('.//*[@class="tag"]/text()').extract()
            tags = quote.xpath(
                './/*[@itemprop="keywords"]/@content').extract_first()

            # l.add_value('text', text)
            # l.add_value('author', author)
            # l.add_value('tags', tags)

            # print("\n")
            # print(text)
            # print(author)
            # print(tags)
            # print("\n")

            yield {'Text': text, 'Author': author, 'Tags': tags}

            # return l.load_item()

        # next_page = response.css('li.next a::attr(href)').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
Example #14
0
    def scrape_home_page(self, response):  # does not need to be the default 'parse' callback
        """Load the homepage heading and sidebar tags into a single item."""
        open_in_browser(response)  # opens the downloaded page in a browser (debugging)
        l = ItemLoader(item=QuotesSpiderItem(), response=response)

        h1_tag = response.xpath('//h1/a/text()').extract_first()
        tags = response.xpath('//*[@class = "tag-item"]/a/text()').extract()

        l.add_value('h1_tag', h1_tag)
        l.add_value('tags', tags)

        # quotes =response.xpath('//*[@class="quote"]')
        # for quote in quotes:
        #     text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        #     author = quote.xpath('.//*[@class = "author"]/text()').extract_first()
        #     tags_q = quote.xpath('.//*[@itemprop = "keywords"]/@content').extract_first()
        #
        #     l.add_value('text', text)
        #     l.add_value('author', author)
        #     l.add_value('tags_q', tags_q)
        return l.load_item()
Example #15
0
    def parse(self, response):
        """Yield one item per quote (fresh loader each time), then paginate.

        Bug fixed: the next-page request was built unconditionally, so on
        the last page ``response.urljoin(None)`` raised; the request is now
        yielded only when a next-page link exists. The misspelled local
        ``absoulte_next_page_url`` is also corrected.
        """
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            # Loader bound to the quote selector so each item is independent.
            l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@itemprop="author"]/text()').extract_first()
            tags = quote.xpath(
                './/*[@itemprop="keywords"]/@content').extract_first()

            l.add_value('text', text)
            l.add_value('author', author)
            l.add_value('tags', tags)
            yield l.load_item()

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_page_url:  # absent on the final page
            yield scrapy.Request(response.urljoin(next_page_url))
Example #16
0
    def parse(self, response):
        """Yield an item per quote (tag names + links), then follow pagination.

        Bugs fixed versus the original:
        - ``print abs_next_pg_url`` is Python 2 statement syntax and a
          SyntaxError under Python 3; replaced with the print() function.
        - The next-page request is now only yielded when a link exists, so
          ``urljoin(None)`` cannot raise on the last page.
        """
        quotes = response.xpath('//*[@class="quote"]')

        for quote in quotes:
            # A fresh loader per quote keeps each yielded item independent.
            l = ItemLoader(item=QuotesSpiderItem(), response=response)
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@class="author"]/text()').extract_first()

            # Tag names and their link targets are loaded together as a dict.
            tags = quote.xpath('.//*[@class="tag"]/text()').extract()
            tag_links = quote.xpath('.//*[@class="tag"]/@href').extract()

            l.add_value('text', text)
            l.add_value('author', author)
            l.add_value('tags', {'tag_names': tags, 'tag_links': tag_links})

            yield l.load_item()

        next_pg_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_pg_url:
            abs_next_pg_url = response.urljoin(next_pg_url)
            print(abs_next_pg_url)  # debug trace of the URL being followed
            yield scrapy.http.Request(abs_next_pg_url)
Example #17
0
    def parse(self, response):
        """Yield one loaded item per quote, then follow the next page.

        Bug fixed: the pagination request was built unconditionally, so on
        the last page ``response.urljoin(None)`` raised; it is now guarded.
        The stray debug prints of author/tags are removed.
        """
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@itemprop="author"]/text()').extract_first()
            tags = quote.xpath(
                './/*[@itemprop="keywords"]/@content').extract_first()

            # Fresh loader per quote; values go straight into the item.
            l = ItemLoader(item=QuotesSpiderItem(), response=response)
            l.add_value('text', text)
            l.add_value('author', author)
            l.add_value('tags', tags)

            yield l.load_item()

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_page_url:  # absent on the last page
            yield Request(response.urljoin(next_page_url))
Example #18
0
    def parse(self, response):
        """Yield a plain dict per quote, then follow the next-page link.

        Bugs fixed versus the original:
        - The post-loop ``l.add_value`` calls referenced the loop variables
          text/author/tags, which raises NameError on a page with no quotes
          and otherwise stored only the LAST quote; the ``l.load_item()``
          result was also discarded (never yielded). That dead, broken
          loader code is removed — the dict yields are the real output.
        - The pagination request is only yielded when a next page exists,
          so ``urljoin(None)`` cannot raise on the last page.
        """
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            text = quote.xpath('.//*[@class="text"]/text()').extract_first()
            author = quote.xpath(
                './/*[@itemprop="author"]/text()').extract_first()
            tags = quote.xpath('.//*[@itemprop="keywords"]/@content').extract()

            yield {'Text': text, 'Author': author, 'Tags': tags}

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_page_url:
            yield Request(response.urljoin(next_page_url))