Example 1
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        quote: scrapy.selector.unified.Selector
        for quote in response.selector.xpath("//div[@class='quote']"):
            loader = ItemLoader(item=QuoteItem(),
                                selector=quote,
                                response=response)
            loader.add_xpath('text', ".//div[@class='quoteText']/text()")
            loader.add_xpath('author', ".//span[@class='authorOrTitle']")
            loader.add_xpath('tags',
                             ".//div[@class='greyText smallText left']/a")
            yield loader.load_item()
            # yield {
            #     'text':
            #     quote.xpath(".//div[@class='quoteText']/text()[1]"
            #                 ).extract_first().strip(),
            #     'author':
            #     quote.xpath(".//span[@class='authorOrTitle']/text()").
            #     extract_first().strip(),
            #     'tags':
            #     quote.xpath(".//div[@class='greyText smallText left']/a/text()"
            #                 ).extract()
            # }

        next_page = response.selector.xpath(
            '//a[@class="next_page"]/@href').extract_first()
        if next_page:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
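
The loader calls above assume a QuoteItem that knows how to clean each field; that class is not part of the snippet. A minimal sketch of what it could look like, using the processors from the itemloaders package that ships with Scrapy (the field names come from the add_xpath calls, everything else is an assumption):

import scrapy
from itemloaders.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags


class QuoteItem(scrapy.Item):
    # The author/tags XPaths above select whole elements, so strip markup first,
    # then whitespace; TakeFirst() collapses single-valued fields to a scalar.
    text = scrapy.Field(input_processor=MapCompose(str.strip),
                        output_processor=TakeFirst())
    author = scrapy.Field(input_processor=MapCompose(remove_tags, str.strip),
                          output_processor=TakeFirst())
    tags = scrapy.Field(input_processor=MapCompose(remove_tags, str.strip))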
Example 2
    def parse(self, response: scrapy.http.response.html.HtmlResponse) -> typing.Iterator[dict]:
        # FIXME: when xpath has <1 match, .get() cheerfully returns None.
        # FIXME: when xpath has >1 match, .get() cheerfully returns the first.
        # How do I get exceptions for both cases?
        for quote_etree in response.xpath('//*[@itemscope]'):
            yield {
                'author': quote_etree.xpath('.//*[@itemprop="author"]/text()').get(),
                'text': quote_etree.xpath('.//*[@itemprop="text"]/text()').get(),
                'tags': quote_etree.xpath('.//*[@class="tag"]/text()').getall()}

        # Follow the "next page" link and recurse back into self.parse.
        for next_url in response.xpath('//li[@class="next"]/a/@href').getall():
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse)
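
Regarding the FIXME above: .get() never raises, so strictness has to be added by hand. A small hypothetical helper, not part of the original spider, that fails when an XPath matches zero or more than one node could look like this:

from scrapy.selector import Selector


def xpath_one(selector: Selector, query: str) -> str:
    """Return the single string matched by query; raise for 0 or >1 matches."""
    matches = selector.xpath(query).getall()
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one match for {query!r}, got {len(matches)}")
    return matches[0]

Swapping quote_etree.xpath('.//*[@itemprop="author"]/text()').get() for xpath_one(quote_etree, './/*[@itemprop="author"]/text()') then raises instead of silently yielding None or the first of several matches.
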
    def get_crawl_list(
            self, response: scrapy.http.response.html.HtmlResponse) -> List:
        """
        DOMの内容から企業情報が載っているURlを取得する.

        Args:
            response (scrapy.http.response.html.HtmlResponse): オブジェクト

        Returns:
            List: 企業の情報が入ったListを返す.
        """
        company_list = []

        company_list_box = response.css(".entryList01")
        company_list_tag = company_list_box.css("li")

        for company in company_list_tag:
            company_path = company.css("a::attr(href)").extract_first()
            company_url = response.urljoin(company_path)

            company_list.append({"url": company_url})

        return company_list
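
The list built above still has to be turned into requests somewhere; that caller is not shown in the snippet. A plausible sketch, with parse_company as a hypothetical callback name:

    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        # Request every company detail page collected from the index page.
        for company in self.get_crawl_list(response):
            yield scrapy.Request(company["url"], callback=self.parse_company)

    def parse_company(self, response: scrapy.http.response.html.HtmlResponse):
        # Hypothetical detail-page callback; the real fields depend on the target site.
        yield {"url": response.url}
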
    def next_page_link(
            self, response: scrapy.http.response.html.HtmlResponse
    ) -> scrapy.Request:
        """
        次のクローリング先のURLを生成し、scray.Requestオブジェクトを生成する.

        Args:
            response (scrapy.http.response.html.HtmlResponse): オブジェクト.

        Returns:
            scrapy.Request: scrapy.Requestオブジェクトを返す.
        """

        self.page_count += 1

        # Build the index file name; the first page has no "index_<n>.html" suffix.
        index_path = "index_" + str(self.page_count) + ".html"
        index_path = index_path if self.page_count != 1 else ""

        # Resolve the (possibly relative) index path against the current URL.
        older_post_link = response.urljoin(index_path)

        # Return the request for the next index page.
        return scrapy.Request(older_post_link, callback=self.parse)
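
Since next_page_link bumps self.page_count before building the file name, the counter has to be initialized on the spider; starting it at 1 makes the first generated link index_2.html instead of re-requesting the start page. A skeleton under those assumptions (the spider name and start URL are placeholders):

class CompanySpider(scrapy.Spider):
    name = "company"                       # placeholder
    start_urls = ["https://example.com/"]  # placeholder
    page_count = 1  # page 1 has no "index_<n>.html" suffix

    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        # Detail pages first (see the parse sketch above), then the next index page.
        for company in self.get_crawl_list(response):
            yield scrapy.Request(company["url"], callback=self.parse_company)
        yield self.next_page_link(response)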