def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Parse a quotes listing page.

    Yields one loaded ``QuoteItem`` per ``div.quote`` block on the page,
    then follows the "next page" link (if present) back into this
    callback for pagination.

    Args:
        response: The downloaded listing page.

    Yields:
        Loaded ``QuoteItem`` instances, then a ``scrapy.Request`` for the
        next page when one exists.
    """
    quote: scrapy.selector.unified.Selector
    for quote in response.selector.xpath("//div[@class='quote']"):
        loader = ItemLoader(item=QuoteItem(), selector=quote, response=response)
        loader.add_xpath('text', ".//div[@class='quoteText']/text()")
        loader.add_xpath('author', ".//span[@class='authorOrTitle']")
        loader.add_xpath('tags', ".//div[@class='greyText smallText left']/a")
        yield loader.load_item()

    # Pagination: follow the "next page" link, re-entering this callback.
    next_page = response.selector.xpath(
        '//a[@class="next_page"]/@href').extract_first()
    if next_page:
        next_page_url = response.urljoin(next_page)
        yield scrapy.Request(url=next_page_url, callback=self.parse)
def parse(self, response: scrapy.http.response.html.HtmlResponse) -> typing.Iterator[dict]:
    """Extract every quote on the page, then schedule the next page.

    NOTE(review): ``.get()`` silently returns ``None`` on zero matches
    and the first match when there are several — no exception is raised
    in either case.
    """
    for node in response.xpath('//*[@itemscope]'):
        item = {
            'author': node.xpath('.//*[@itemprop="author"]/text()').get(),
            'text': node.xpath('.//*[@itemprop="text"]/text()').get(),
            'tags': node.xpath('.//*[@class="tag"]/text()').getall(),
        }
        yield item

    # Chase the "next page" link(s) recursively through this callback.
    for href in response.xpath('//li[@class="next"]/a/@href').getall():
        yield scrapy.Request(response.urljoin(href), callback=self.parse)
def get_crawl_list(
        self, response: "scrapy.http.response.html.HtmlResponse") -> "List[dict]":
    """Collect the company-detail URLs linked from the listing DOM.

    Args:
        response: The listing page whose ``.entryList01`` block contains
            one ``<li><a href=...>`` entry per company.

    Returns:
        A list of ``{"url": <absolute URL>}`` dicts, one per company
        (empty when the page has no entries).
    """
    # urljoin() resolves relative hrefs against the page's own URL.
    return [
        {"url": response.urljoin(company.css("a::attr(href)").extract_first())}
        for company in response.css(".entryList01").css("li")
    ]
def next_page_link(
        self, response: scrapy.http.response.html.HtmlResponse
) -> scrapy.Request:
    """Build the request for the next page to crawl.

    Page 1 lives at the site root; page N (N > 1) lives at
    ``index_N.html``.

    Args:
        response: The page currently being parsed; used to resolve the
            relative index path into an absolute URL.

    Returns:
        A ``scrapy.Request`` for the next page, dispatched to
        ``self.parse``.
    """
    self.page_count += 1
    # Page 1 has no explicit index file name.
    index_path = "" if self.page_count == 1 else f"index_{self.page_count}.html"
    # Convert the relative path into an absolute URL.
    older_post_link = response.urljoin(index_path)
    # Issue the request for the next page.
    return scrapy.Request(older_post_link, callback=self.parse)