Example No. 1
    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            summary = quote.xpath(
                './span[@class="text"]/text()').extract_first()
            author_name = quote.xpath(
                './/small[@class="author"]/text()').extract_first()
            author_homepage_url = response.urljoin(
                quote.xpath('.//a[1]/@href').extract_first())

            # print("@@@ " + author_name + " Done.")
            item = QuotesbotItem()
            item["summary"] = summary
            item["author_name"] = author_name
            item["author_homepage_url"] = author_homepage_url

            yield item

            # Equivalent: yield a plain dict instead of an Item
            # yield {
            #     'author_name': author_name,
            #     'author_homepage_url': author_homepage_url,
            #     'summary': summary
            # }

        # Follow the pagination link, if any
        next_page_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
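Every snippet on this page assigns to a QuotesbotItem imported from the project's items.py. A minimal sketch of what that declaration might look like for this first example (an assumption, not the project's actual items.py; the later examples would declare their own fields the same way):

    import scrapy

    class QuotesbotItem(scrapy.Item):
        # One Field per attribute the spider assigns (assumed declaration)
        summary = scrapy.Field()
        author_name = scrapy.Field()
        author_homepage_url = scrapy.Field()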
Example No. 2
    def parse(self, response):
        # Alternative: collect items in a list and return it instead of yielding
        # items = []
        for quote in response.css("div.quote"):
            item = QuotesbotItem()
            item['text'] = quote.css("span.text::text").extract_first()
            item['author'] = quote.css("small.author::text").extract_first()
            item['tags'] = quote.css("div.tags > a.tag::text").extract()
            yield item
        #     items.append(item)
        # return items

        # Follow the pagination link, if any
        next_page_url = response.css("li.next > a::attr(href)").extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
Example No. 3
    def parse(self, response):
        for quote in response.css("div.quote"):
            q = QuotesLoader(item=QuotesbotItem(), selector=quote)
            q.add_xpath('text', './span[@class="text"]/text()')
            q.add_xpath('author', './/small[@class="author"]/text()')
            q.add_xpath('tags', './/div[@class="tags"]/a[@class="tag"]/text()')
            yield q.load_item()

        next_page_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            # The href looks like "/page/2/", so the page number is at index 2
            pagenum = int(next_page_url.split("/")[2])
            logger.info(f"Next Request is page {pagenum}!")
            if pagenum < 5:  # stop following pagination after page 4
                yield scrapy.Request(response.urljoin(next_page_url))
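The QuotesLoader and logger used above live at module level but are not shown. A minimal sketch of what they might look like, assuming an ItemLoader subclass with TakeFirst output for single-valued fields (the repo's actual loader may differ):

    import logging

    from itemloaders.processors import Identity, MapCompose, TakeFirst
    from scrapy.loader import ItemLoader

    logger = logging.getLogger(__name__)

    class QuotesLoader(ItemLoader):
        # Strip whitespace on the way in; collapse single-valued fields to
        # their first match, but keep 'tags' as a list (assumed behavior)
        default_input_processor = MapCompose(str.strip)
        default_output_processor = TakeFirst()
        tags_out = Identity()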
Example No. 4
    def parse(self, response):
        for quote in response.css("div.quote"):
            author_page_url = '/author/' + quote.css("small.author::text").get()

            # Build the item with the fields available on the listing page
            item = QuotesbotItem()
            item['text'] = quote.css("span.text::text").get().strip()
            item['tags'] = quote.css("div.tags > a.tag::text").getall()

            # Hand the partial item to a callback that scrapes the author page
            yield scrapy.Request(
                urllib.parse.unquote(response.urljoin(author_page_url)),
                meta={'item': item},
                callback=self.parse_author)

        # .get() returns None on the last page, so calling .strip() on it
        # would raise; check for None before building the request
        next_page_url = response.css("li.next > a::attr(href)").get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
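The parse_author callback is not shown in this snippet. A minimal sketch of what it might do, assuming the author's name sits in an h3.author-title element (as on quotes.toscrape.com) and that the item declares an 'author' field:

    def parse_author(self, response):
        # Finish the item that traveled here via the request meta
        item = response.meta['item']
        # The selector and 'author' field are assumptions about this project
        item['author'] = response.css(
            "h3.author-title::text").get(default="").strip()
        yield item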
Example No. 5
    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            # Create a fresh item per quote; reusing one instance across
            # yields would hand the pipeline references to the same object
            item = QuotesbotItem()
            item['text'] = quote.xpath(
                './span[@class="text"]/text()').extract_first()
            item['author'] = quote.xpath(
                './/small[@class="author"]/text()').extract_first()
            item['authorlink'] = quote.xpath(
                './span[2]/a/@href').extract_first()
            # extract() keeps every tag instead of only the first
            item['tags'] = quote.xpath(
                './/div[@class="tags"]/a[@class="tag"]/text()').extract()
            yield item

        next_page_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
Example No. 6
    def thirdParse(self, response):
        for each in response.xpath('//div[@id="doc"]'):
            # Relative paths (.//) keep each query scoped to this node
            flag = each.xpath(
                './/div[@class="title-and-desc"]/a/div/text()').extract_first()
            if flag is None:
                continue
            # Create a fresh item per node rather than reusing one instance
            item = QuotesbotItem()
            item['highestCat'] = each.xpath(
                './section/div[2]/a[1]/text()').extract_first()
            # [a-zA-Z], not [a-zA-z]: the latter also matches punctuation
            item['subCat'] = re.findall(
                r"\s[a-zA-Z]+\s",
                each.xpath('normalize-space(./section/div[2])').
                extract_first())[0]
            # item['thirdCat'] = each.xpath('./section/div[2]/a[2]/text()').extract_first()
            item['websiteName'] = flag
            item['websiteUrl'] = each.xpath(
                './/div[@class="title-and-desc"]/a/@href').extract_first()
            yield scrapy.Request(item['websiteUrl'],
                                 callback=self.UrlParse,
                                 meta={"item": item})
Example No. 7
    def parse(self, response):
        item = QuotesbotItem()
        # extract_first() returns a plain string instead of a SelectorList
        item['urlname'] = response.xpath('//title/text()').extract_first()
        print('haha', item['urlname'])  # Python 3 print function
        yield item