def parse(self, response):
    """Scrape one blog article page and follow the link to the next article.

    Yields a ``CsdnblogItem`` holding the article title text node(s) and the
    page URL. If the page contains a "next_article" link, also yields a
    ``Request`` for it that is handled by this same callback.
    """
    sel = Selector(response)

    # Article URL and title.
    item = CsdnblogItem()
    article_url = str(response.url)
    article_name = sel.xpath(
        '//div[@id="article_details"]/div/h1/span/a/text()').extract()
    # NOTE(review): both fields are stored UTF-8 encoded; confirm the
    # downstream pipelines expect encoded values rather than text.
    item['article_name'] = [n.encode('utf-8') for n in article_name]
    item['article_url'] = article_url.encode('utf-8')
    yield item

    # URL of the next article.
    urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
    if not urls:
        # The page has no next-article link; indexing urls[0] would raise
        # IndexError, so stop following here.
        return

    # Debug output, written as a single-argument print() call so that it is
    # valid on both Python 2 and Python 3 with identical output.
    print('-------------------------------xxy-- %s' % urls[0])
    # The extracted href is prefixed with the site host to build the
    # absolute URL that is requested next.
    url = "http://blog.csdn.net" + urls[0]
    print(url)
    yield Request(url, callback=self.parse)
def parse(self, response):
    """Emit the article's title/URL item, then request every next-article link.

    Yields one ``CsdnblogItem`` for the current page followed by one
    ``Request`` (with this method as callback) per href found under the
    "next_article" list entry.
    """
    selector = Selector(response)

    # Populate the item with the UTF-8 encoded title text node(s) and URL.
    item = CsdnblogItem()
    page_url = str(response.url)
    title_nodes = selector.xpath(
        '//div[@class="article_title"]/h1/span/a/text()').extract()
    item['article_name'] = [title.encode('utf-8') for title in title_nodes]
    item['article_url'] = page_url.encode('utf-8')
    yield item

    # Each extracted href is prefixed with the site host before requesting.
    next_hrefs = selector.xpath(
        '//li[@class="next_article"]/a/@href').extract()
    for href in next_hrefs:
        yield Request("http://blog.csdn.net" + href, callback=self.parse)
def parse(self, response):
    """Scrape one article page and follow its link to the next article.

    Yields a ``CsdnblogItem`` with the article title and the page URL. When
    the second-to-last entry of the page's "tool-box" list carries a link,
    also yields a ``Request`` for it that is handled by this same callback.
    """
    # Article URL and title.
    item = CsdnblogItem()
    item['article_name'] = response.xpath(
        '//h1[@class="title-article"]/text()').extract_first()
    item['article_url'] = str(response.url)
    yield item

    # URL of the next article; nothing is followed when no href is found.
    url = response.xpath(
        '//div[@class="tool-box"]/ul/li[last()-1]/a/@href').extract_first()
    if url is not None:
        # urljoin() leaves absolute URLs as they are and resolves relative
        # hrefs against the page URL, so Request never gets a scheme-less URL.
        yield Request(response.urljoin(url), callback=self.parse)
def parse(self, response):
    """Scrape one article page and follow its "nav-previous" link(s).

    Yields a ``CsdnblogItem`` holding the text node(s) of every ``<h1>`` on
    the page and the page URL, then one ``Request`` (with this method as
    callback) per href found under the "nav-previous" element.
    """
    sel = Selector(response)

    # Article URL and title.
    item = CsdnblogItem()
    article_url = str(response.url)
    article_name = sel.xpath('//h1/text()').extract()
    # NOTE(review): both fields are stored UTF-8 encoded; confirm the
    # downstream pipelines expect encoded values rather than text.
    item['article_name'] = [n.encode('utf-8') for n in article_name]
    item['article_url'] = article_url.encode('utf-8')
    yield item

    # URL(s) of the article to crawl next; used exactly as extracted.
    urls = sel.xpath('//div[@class="nav-previous"]/a/@href').extract()
    for url in urls:
        # Debug output, written as a call so it parses on Python 2 and 3.
        print(url)
        yield Request(url, callback=self.parse)