def parse(self, response): sel = Selector(response) sites = sel.xpath('//ol[@class="grid_view"]/li/div/div[@class="pic"]') items = [] for site in sites: item = top100Item() item["movie_rank"] = site.xpath("em/text()").extract() item["movie_name"] = site.xpath("a/img/@alt").extract() item["movie_url"] = site.xpath("a/@href").extract() items.append(item) yield item # 获得下一篇top250的url urls = sel.xpath('//div[@class="paginator"]/a/@href').extract() urls[:-1] urls.reverse() for url in urls: if url != "?start=0&filter=": # print url url = "http://movie.douban.com/top250" + url # print url yield Request(url, callback=self.parse) """i=0
def parse(self, response): sel = Selector(response) sites=sel.xpath('//ol[@class="grid_view"]/li/div/div[@class="pic"]') items = [] for site in sites: item=top100Item() item['movie_rank']=site.xpath('em/text()').extract() item['movie_name']=site.xpath('a/img/@alt').extract() item['movie_url']=site.xpath('a/@href').extract() items.append(item) return items