def parse(self, response):
    """Parse one quotes listing page.

    Yields a ``QuoteItem`` for every ``.quote`` block on the page, then
    follows the pagination link back into this same callback until the
    last page (which has no "next" link).

    :param response: downloaded listing page
    :return: yields QuoteItem instances and one follow-up Request
    """
    # Each ``.quote`` div is one quote card (text, author, tag list).
    for quote in response.css('.quote'):
        item = QuoteItem()
        # ``::text`` selects node text; extract_first() returns the first
        # match or None when the element is absent.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        # A quote may carry several tags, so collect all matches as a list.
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item

    # Pagination link is relative (e.g. "/page/2"). Renamed from ``next``
    # to avoid shadowing the builtin.
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    # On the last page there is no next link; urljoin(None) would raise
    # TypeError, so stop the recursion instead of crashing.
    if next_page is not None:
        # urljoin() turns the relative href into an absolute URL, then the
        # new page is parsed by this method again — looping page by page.
        yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
def parse(self, response):
    """Default Spider callback.

    Requests built from ``start_urls`` are downloaded and each response
    is passed to this method as its only argument. It parses the
    response, extracts item data, and generates the follow-up request
    for the next page.

    :param response: downloaded listing page
    :return: yields QuoteItem instances and one follow-up Request
    """
    for quote in response.css(".quote"):
        item = QuoteItem()
        item['text'] = quote.css(".text::text").extract_first()    # first match or None
        item['author'] = quote.css(".author::text").extract_first()  # first match or None
        item['tags'] = quote.css(".tags .tag::text").extract()     # all tag texts as a list

    # Relative URL of the next page, e.g. "/page/2".
    next_url = response.css('.pager .next a::attr(href)').extract_first()
    # The last page has no next link; extract_first() then returns None and
    # urljoin(None) would raise TypeError — guard so the crawl ends cleanly.
    if next_url is not None:
        # urljoin() builds an absolute URL from the relative href; the new
        # request is handled by this same parse() callback.
        url = response.urljoin(next_url)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    """Parse a quotes listing page using an ItemLoader.

    For each quote: load its text and tags into a ``QuoteItem``, then
    follow the author's "(about)" link so ``parse_author`` can complete
    the item (passed along via ``meta``). Finally follow the "next"
    pagination link back into this callback.

    :param response: downloaded listing page
    """
    self.logger.info('Hello spider')
    for quote in response.css('div.quote'):
        # ``selector=quote`` scopes every add_css() to this quote block.
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css(field_name='quote_content', css='.text::text')
        loader.add_css(field_name='tags', css='.tag::text')
        quote_item = loader.load_item()

        # Adjacent-sibling selector: the <a> immediately after .author.
        author_url = quote.css('.author + a::attr(href)').get()
        # get() returns None when the link is missing; response.follow(None)
        # raises ValueError, so only follow a real link.
        if author_url is not None:
            self.logger.info('Get author page url')
            # Go to the author page, carrying the partly-built item along.
            yield response.follow(author_url, callback=self.parse_author,
                                  meta={'quote_item': quote_item})

    # Go to next page (the loop simply does nothing on the last page).
    for a in response.css('li.next a'):
        yield response.follow(a, callback=self.parse)
def parse(self, response):
    """Parse a quotes listing page.

    Loads each quote's text and tags into a ``QuoteItem`` via an
    ItemLoader scoped to the quote's selector, follows the author link
    so ``parse_author`` can finish the item (handed over in ``meta``),
    and follows the "next" pagination link back into this callback.

    :param response: downloaded listing page
    """
    self.logger.info('Parse function called on {}'.format(response.url))
    for quote in response.css('div.quote'):
        # ``selector=quote`` makes the CSS expressions relative to this quote.
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css('quote_content', '.text::text')
        loader.add_css('tags', '.tag::text')
        quote_item = loader.load_item()

        # The <a> immediately following the .author element ("(about)").
        author_url = quote.css('.author + a::attr(href)').get()
        # Missing link -> get() returns None and response.follow(None)
        # raises ValueError; skip instead of crashing the whole page.
        if author_url is not None:
            # Go to the author page and pass the collected quote info along.
            yield response.follow(author_url, self.parse_author,
                                  meta={'quote_item': quote_item})

    # Go to the next page (no-op on the last page: selector matches nothing).
    for a in response.css('li.next a'):
        yield response.follow(a, self.parse)
def parse(self, response):
    """Default Spider callback.

    Requests built from ``start_urls`` are downloaded and the resulting
    response is passed here as the only argument. The method parses the
    response, extracts item data, and generates the request for the
    next page.

    :param response: downloaded listing page
    :return: yields QuoteItem instances and one follow-up Request
    """
    # Select every quote block on the page.
    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item

    # Relative link to the next page; renamed from ``next`` so the
    # builtin is not shadowed.
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    # The last page has no next link (None); urljoin(None) raises
    # TypeError, so end the recursion cleanly instead.
    if next_page is not None:
        # urljoin() builds the absolute URL; the new response goes through
        # parse() again, looping until the final page.
        url = response.urljoin(next_page)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    """Parse one quotes listing page and queue the next one.

    :param response: downloaded listing page
    :return: yields QuoteItem instances and one follow-up Request
    """
    for quote in response.css('.quote'):
        item = QuoteItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        # A quote has multiple tags, so extract() collects all of them.
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item

    # Relative URL of the next page (renamed from ``next`` to avoid
    # shadowing the builtin).
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    # On the last page extract_first() yields None and urljoin(None)
    # would raise TypeError — stop paginating instead of crashing.
    if next_page is not None:
        # urljoin() turns e.g. "/page/2" into
        # "http://quotes.toscrape.com/page/2".
        url = response.urljoin(next_page)
        # Re-request with scrapy.Request(): a URL plus this same callback.
        yield scrapy.Request(url=url, callback=self.parse)
def quote_parse(self, response):
    """Extract every quote on the page, follow author pages, paginate.

    :param response: downloaded listing page
    :return: yields QuoteItem instances, author-page Requests, and a
        next-page Request
    """
    # Extract the concrete quote info from each quote <div>.
    for div in response.xpath('//div[@class="quote"]'):
        quote_item = QuoteItem()
        quote_item['content'] = div.xpath(
            'span[@class="text"]/text()').get()
        # BUG FIX: the original used absolute paths ('//small[...]',
        # '//meta/@content'), which search from the DOCUMENT root and
        # therefore returned the first author/tags on the page for every
        # quote. './/' keeps the search relative to this quote's <div>.
        quote_item['author'] = div.xpath(
            './/small[@class="author"]/text()').get()
        quote_item['tags'] = div.xpath('.//meta/@content').get()
        print(quote_item)
        yield quote_item

        # Follow each author's detail-page link.
        author_href = div.xpath('span//a/@href').get()
        # get() returns None when the link is missing; concatenating
        # None to base_url would raise TypeError, so guard it.
        if author_href is not None:
            author_url = self.base_url + author_href
            yield scrapy.Request(url=author_url, callback=self.author_parse)

    # Paginate: the last page has no ".next" element, ending the crawl.
    next_page = response.css('.next a')
    if next_page:
        url = self.base_url + next_page.attrib['href']
        yield Request(url=url, callback=self.quote_parse)
def parse(self, response):
    """Default Spider callback.

    Called with the response of each request built from ``start_urls``
    (and with each follow-up request yielded below). Parses the quote
    blocks into items and chains a request for the next page.

    :param response: downloaded listing page
    :return: yields QuoteItem instances and one follow-up Request
    """
    # Every ".quote" div is one record on the page.
    for quote in response.css(".quote"):
        item = QuoteItem()  # build one item object per quote
        item["text"] = quote.css(".text::text").extract_first()
        item["author"] = quote.css(".author::text").extract_first()
        item["tags"] = quote.css(".tags .tag::text").extract()
        yield item

    # Relative next-page link; renamed from ``next`` so the builtin is
    # not shadowed.
    next_page = response.css('.pager .next a::attr("href")').extract_first()
    # Last page -> no link -> None; urljoin(None) raises TypeError, so
    # stop here instead of crashing the spider.
    if next_page is not None:
        new_url = response.urljoin(next_page)
        # Debug output goes through the spider logger rather than print().
        self.logger.debug("next page: %s -> %s", next_page, new_url)
        yield scrapy.Request(url=new_url, callback=self.parse)