def parse_book(self, response):
    item = DoubanItem()
    item['publish'] = response.xpath(
        u'//span[contains(./text(), "出版社:")]/following::text()[1]'
    ).extract()[0]
    item['publish_time'] = response.xpath(
        u'//span[contains(./text(), "出版年:")]/following::text()[1]'
    ).extract()[0]
    item['decorate'] = response.xpath(
        u'//span[contains(./text(), "装帧:")]/following::text()[1]'
    ).extract()[0]
    item['ISBN'] = response.xpath(
        u'//span[contains(./text(), "ISBN:")]/following::text()[1]'
    ).extract()[0]
    item['price'] = response.xpath(
        u'//span[contains(./text(), "定价:")]/following::text()[1]'
    ).extract()[0]
    item['page_num'] = response.xpath(
        u'//span[contains(./text(), "页数:")]/following::text()[1]'
    ).extract()[0]
    writer_list = response.xpath(
        '//div[@id="info"]/span/a/text()').extract()
    item['writer'] = writer_list[0]
    try:
        item['translator'] = writer_list[1]
    except IndexError:  # catch only the missing-translator case, not everything
        item['translator'] = u'无'  # "none": the book has no translator
    yield item
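# The block above repeats the same "label span -> first following text node"
# XPath for every field. A minimal sketch that folds the repetition into a
# label map; the field names match the snippet above, and extract_first()
# supplies a default instead of raising IndexError on a missing field:

BOOK_FIELDS = {
    'publish': u'出版社:',
    'publish_time': u'出版年:',
    'decorate': u'装帧:',
    'ISBN': u'ISBN:',
    'price': u'定价:',
    'page_num': u'页数:',
}

def extract_book_fields(response, item):
    # each label lives in a <span>; the value is the first text node after it
    for field, label in BOOK_FIELDS.items():
        item[field] = response.xpath(
            u'//span[contains(./text(), "%s")]/following::text()[1]' % label
        ).extract_first(u'')
    return item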
def parse(self, response):
    # DoubanItem behaves like a dict and validates that its fields exist
    # To debug, dump the response body:
    # with open('douban.html', 'w') as f:
    #     f.write(response.body)
    item = DoubanItem()
    # Three ways to extract data: XPath, CSS selectors, and regex.
    # XPath: response.xpath(...) returns a SelectorList; .extract() turns it
    # into a list of strings, so [0] is the first match. Alternatively,
    # extract_first('no data') returns the first match, or the given default
    # instead of raising IndexError when nothing matched.
    # CSS: response.css('.text a::text') -- '.' selects a class, '#' an id,
    # a bare name selects a tag; '::text' gets the text node and
    # '::attr(href)' gets an attribute. CSS and XPath calls can be chained.
    # Regex: .re('...') / .re_first('...') must follow an xpath()/css() call,
    # otherwise it errors.
    item['name'] = response.xpath(
        '//*[@id="anony-time"]/div/div[3]/ul/li[1]/a[2]/text()').extract()[0]
    item['column'] = response.xpath(
        '//*[@id="anony-time"]/div/div[3]/ul/li[1]/span/text()').extract()[0]
    # URL of the detail page (unused in this version; a later variant follows
    # it with a second request)
    detail_url = response.xpath(
        '//*[@id="statuses"]/div[2]/div[1]/div/div/div[2]/div[1]/div[2]/div/a/@href'
    )
    print(item)
    # yield hands the item to the engine, which passes it to the pipelines
    yield item
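# The comments above name three extraction styles (XPath, CSS, regex). A
# self-contained demonstration against an inline HTML fragment -- the markup
# here is invented purely for illustration, no network request needed:

from scrapy import Selector

sel = Selector(text='<div class="text"><a href="/p/1">hello</a></div>')
print(sel.xpath('//a/text()').extract())                 # ['hello']
print(sel.xpath('//b/text()').extract_first('no data'))  # default, no IndexError
print(sel.css('.text a::text').get())                    # 'hello'
print(sel.css('.text a::attr(href)').get())              # '/p/1'
print(sel.css('.text a::attr(href)').re_first(r'/p/(\d+)'))  # '1' (regex after css/xpath)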
def parse(self, response):
    print("response.url===", response.url)
    node_list = response.xpath('//div[@class="item"]')
    for node in node_list:
        title = node.xpath('.//span[@class="title"][1]/text()').extract()
        score = node.xpath(
            './/div[@class="star"]/span[2]/text()').extract()
        info = node.xpath('.//div[@class="info"]//p/span/text()').extract()
        if title:
            title = " ".join(title)
        if score:
            score = " ".join(score)
        if info:
            info = " ".join(info)
        item = DoubanItem()
        item["title"] = title
        item["score"] = score
        item["info"] = info
        yield item
    # Next page: build the URL by incrementing the start offset
    if self.offset < 225:
        self.offset += 25
        new_next_url = self.url + str(self.offset) + "&filter="
        yield scrapy.Request(new_next_url, callback=self.parse)
def parse_item(self, response):
    print("response.url================================", response.url)
    all_node = response.xpath('//div[@class="info"]')
    for node in all_node:
        item = DoubanItem()
        print("--" * 100)
        # Movie title (the matching DoubanItem field is assumed to be "title")
        title = node.xpath(
            './/span[@class="title"][1]/text()').extract()[0]
        # Movie details
        content = node.xpath('.//div[@class="bd"]/p/text()').extract()[0]
        # Movie rating
        score = node.xpath(
            './/div[@class="star"]/span[2]/text()').extract()[0]
        # One-line tagline; may be missing for some movies
        info = node.xpath('.//p[@class="quote"]/span/text()').extract()
        if len(info) > 0:
            info = info[0]
        item["title"] = title
        item["content"] = content
        item["score"] = score
        item["info"] = info
        # print(item)
        yield item
def parse_item(self, response):
    # print(response.url, '----------')
    # Get all movie nodes
    node_list = response.xpath(
        '//*[@id="content"]/div/div[1]/ol/li/div/div[2]')
    # print(len(node_list))
    # Walk the node list
    for node in node_list:
        # Create the item object
        item = DoubanItem()
        # Pull the data out of the node and store it on the item
        # Movie name
        item['name'] = node.xpath(
            './div[1]/a/span[1]/text()').extract_first()
        # Rating
        item['score'] = node.xpath(
            './div[2]/div/span[2]/text()').extract_first()
        # Info line
        item['info'] = ''.join([
            i.strip() for i in node.xpath('./div[2]/p[1]/text()').extract()
        ]).replace('\xa0', '')
        # Tagline
        item['desc'] = node.xpath(
            './div[2]/p[2]/span/text()').extract_first()
        # print(item)
        # Hand the data back to the engine
        yield item
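# The callbacks in this collection assume a DoubanItem with fields such as
# name/score/info/desc. The project's items.py is not shown; a minimal sketch
# of what it would have to declare for this snippet (an assumption, not the
# actual file):

import scrapy

class DoubanItem(scrapy.Item):
    name = scrapy.Field()   # movie title
    score = scrapy.Field()  # rating
    info = scrapy.Field()   # director / cast / year line
    desc = scrapy.Field()   # one-line tagline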
def parse(self, response):
    for info in response.xpath('//div[@class="item"]'):
        item = DoubanItem()
        # strip surrounding whitespace and stray single quotes
        item['rank'] = info.xpath(
            'div[@class="pic"]/em/text()').extract()[0].strip().replace("'", "")
        item['title'] = info.xpath(
            'div[@class="pic"]/a/img/@alt').extract()[0].strip().replace("'", "")
        item['link'] = info.xpath(
            'div[@class="pic"]/a/@href').extract()[0].strip().replace("'", "")
        item['star'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[1]/text()'
        ).extract()[0].strip().replace("'", "")
        list_quote = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
        ).extract()
        item['quote'] = self.sumChildStr(list_quote).replace("'", "")
        list_rate = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
        ).extract()
        item['rate'] = self.sumChildStr(list_rate).replace("'", "")
        yield item
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page and self.page < self.MAX_PAGE:
        url = response.urljoin(next_page[0].extract())
        self.page += 1
        yield scrapy.Request(url, self.parse)
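# sumChildStr() above is a helper on the spider that is not shown. A minimal
# version consistent with how it is called (collapsing a list of extracted
# text fragments into one cleaned string) might be -- an assumption, not the
# actual definition:

def sumChildStr(self, fragments):
    # drop empty fragments, trim the rest, and join with single spaces
    return " ".join(s.strip() for s in fragments if s.strip())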
def parse_item(self, response):
    print("response.url===", response.url)
    node_list = response.xpath('//div[@class="item"]')
    for node in node_list:
        title = node.xpath('.//span[@class="title"][1]/text()').extract()
        score = node.xpath(
            './/div[@class="star"]/span[2]/text()').extract()
        info = node.xpath('.//div[@class="info"]//p/span/text()').extract()
        if title:
            title = " ".join(title)
        if score:
            score = " ".join(score)
        if info:
            info = " ".join(info)
        item = DoubanItem()
        item["title"] = title
        item["score"] = score
        item["info"] = info
        yield item
def parse(self, response):
    item = DoubanItem()
    author = response.xpath(
        "//div[@id='info']/span[contains(./text(), '作者:')]/following-sibling::a[1]/text()"
    ).extract()[0]
    item["author"] = re.sub(r"\s", "", string=author)
    item["publish_house"] = response.xpath(
        "//div[@id='info']/span[contains(./text(), '出版社:')]/following::text()[1]"
    ).extract()[0]
    item["publish_date"] = response.xpath(
        "//div[@id='info']/span[contains(./text(), '出版年:')]/following::text()[1]"
    ).extract()[0]
    item["page_num"] = response.xpath(
        "//div[@id='info']/span[contains(./text(), '页数:')]/following::text()[1]"
    ).extract()[0]
    item["package"] = response.xpath(
        "//div[@id='info']/span[contains(./text(), '装帧:')]/following::text()[1]"
    ).extract()[0]
    item["ISBN"] = response.xpath(
        "//div[@id='info']/span[contains(./text(), 'ISBN:')]/following::text()[1]"
    ).extract()[0]
    item["price"] = response.xpath(
        "//div[@id='info']/span[contains(./text(), '定价:')]/following::text()[1]"
    ).extract()[0]
    item["remark"] = response.xpath(
        "//strong[@class='ll rating_num ']/text()").extract()[0]
    item["tags"] = response.xpath("//a[@class=' tag']/text()").extract()
    yield item
    # Follow every linked book page and parse it with this same callback
    url_list = response.xpath(
        "//div[@class='content clearfix']/dl/dd/a/@href").extract()
    for url in url_list:
        yield scrapy.Request(url=url, callback=self.parse)
def parse_movie(self, response):
    loader = ItemLoader(item=DoubanItem(), response=response)
    # Field -> XPath pairs come from the INFO_XPATH setting
    for attr, xpath in self.settings.getdict('INFO_XPATH').items():
        loader.add_xpath(attr, xpath)
    # Fields that need a regex are matched against the raw info block
    s = response.xpath('//div[@id="info"]').extract_first()
    for attr, regex in self.settings.getdict('RE').items():
        loader.add_value(attr, re.findall(regex, s))
    loader.add_value('rate', self.parse_rate(response))
    loader.add_value('url', response.url)
    if self.settings.getbool('ALLOW_COVER'):
        image_urls = self._get_urls(
            self.image_base_url,
            urljoin,
            response.xpath('//div[@id="mainpic"]/a/img/@src').extract(),
            lambda s: s.split('/')[-1],
        )
        loader.add_value('image_urls', image_urls)
    return loader.load_item()
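# INFO_XPATH and RE above are project settings that are not shown. A sketch
# of plausible entries, assuming the corresponding fields exist on DoubanItem
# (illustrative values, not the project's actual settings):

INFO_XPATH = {
    'name': '//h1/span[@property="v:itemreviewed"]/text()',
    'year': '//h1/span[@class="year"]/text()',
}
RE = {
    # regexes run against the raw HTML of //div[@id="info"]
    'imdb': r'IMDb:</span>\s*([^<]+)',
}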
def parse(self, response):
    node_list = response.xpath("//ol/li")
    # node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for node in node_list:
        # Build an item object to hold the data
        item = DoubanItem()
        # Extract each entry's fields
        # print(node)
        # print("*" * 40)
        item['title'] = node.xpath(".//span[1]/text()").extract()[0]
        item['director'] = node.xpath(
            ".//p/text()").extract()[0].split()[1]
        span = node.xpath(".//p[@class='quote']/span")
        # xpath() returns a SelectorList, never None, so test for emptiness
        if not span:
            item['introduce'] = '########'
        else:
            item['introduce'] = span.xpath(".//text()").extract()[0]
        item['link'] = node.xpath(".//a[1]/@href").extract()[0]
        # the point of yield: after handing back the item, execution
        # resumes right here
        yield item
    # Approach 1: build the next-page URL by hand. Suited to pages with no
    # clickable next link, where the URL must be assembled manually (see the
    # sketch after this function for the alternative).
    if self.off_set < 250:
        self.off_set += 25
        url = self.base_url + str(self.off_set)
        yield scrapy.Request(url, callback=self.parse)
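# Approach 2, used by several other spiders in this collection: read the
# page's own "next" link and let response.urljoin() resolve the relative
# href. A sketch of the pagination tail rewritten that way:

def parse_following_next_link(self, response):
    # ... item extraction as above ...
    next_page = response.xpath(
        '//span[@class="next"]/a/@href').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page),
                             callback=self.parse)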
def parse(self, response):
    movies = response.xpath('//ol[@class="grid_view"]/li')
    for movie in movies:
        # one item per movie, rather than a single shared instance
        item = DoubanItem()
        item["排名"] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        item["名字"] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        item["导演"] = movie.xpath(
            './/p[@class=""]/text()[1]'
        ).extract_first().strip().split(' ')[1]
        # item["主演"] = movie.xpath('.//p[@class=""]/text()[1]').extract_first().strip().split(' ')[1]
        item["年份"] = movie.xpath(
            './/p[@class=""]/text()[2]'
        ).extract_first().split('/')[0].strip()
        item["国家"] = movie.xpath(
            './/p[@class=""]/text()[2]'
        ).extract_first().split('/')[1].strip()
        item["类型"] = movie.xpath(
            './/p[@class=""]/text()[2]'
        ).extract_first().split('/')[2].strip()
        item["评分"] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item["评价"] = movie.xpath(
            './/p[@class="quote"]/span/text()').extract()[0]
        yield item
    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url[0]
        yield Request(next_url, headers=self.headers)
def parse(self, response):
    # DoubanItem behaves like a dict and validates that its fields exist
    # To debug, dump the response body:
    # with open('douban.html', 'w') as f:
    #     f.write(response.body)
    item = DoubanItem()
    # xpath() + extract() yields a list; [0] takes the first match
    item['name'] = response.xpath(
        '//*[@id="anony-time"]/div/div[3]/ul/li[1]/a[2]/text()').extract()[0]
    item['column'] = response.xpath(
        '//*[@id="anony-time"]/div/div[3]/ul/li[1]/span/text()').extract()[0]
    # URL of the detail page; extract_first() gives the href string
    # (Request() needs a str, not a SelectorList)
    detail_url = response.xpath(
        '//*[@id="statuses"]/div[2]/div[1]/div/div/div[2]/div[1]/div[2]/div/a/@href'
    ).extract_first()
    print(item)
    # The item still needs the detail-page data, so it is not yielded yet
    # yield item
    # Pass it along through meta so parse_detail can finish filling it and
    # hand it to the shared pipeline
    yield scrapy.Request(detail_url,
                         callback=self.parse_detail,
                         meta={'item': item})
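# The parse_detail callback named above is not shown. A minimal sketch of the
# receiving end of the meta hand-off; the 'detail' field and its XPath are
# hypothetical:

def parse_detail(self, response):
    # pick the partially filled item back up from the listing callback
    item = response.meta['item']
    item['detail'] = response.xpath(
        'normalize-space(string(//div[@id="link-report"]))').extract_first()
    yield item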
import json
import urllib.parse  # Python 3 home of urlencode (urllib.urlencode on Python 2)

import scrapy

from ..items import DoubanItem  # assumed project layout


class Douban_can_video(scrapy.Spider):
    name = 'Douban_can_video'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/j/search_subjects?']
    headers = {
        'Referer': 'https://movie.douban.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/55.0.2883.87 Safari/537.36'
    }
    values = {
        'type': 'movie',
        'tag': '可播放',
        'sort': 'recommend',
        'page_limit': '20',
        'page_start': '0',
    }

    def start_requests(self):
        # yield scrapy.Request(self.start_urls[0], headers=self.headers, callback=self.parse)
        # the "hot" tag has ~309 entries, "latest" ~500
        data = urllib.parse.urlencode(self.values)
        url = self.start_urls[0] + data
        yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        subjects = json.loads(response.body)['subjects']
        for subject in subjects:
            # one item per movie, passed along via meta; a single spider-level
            # item would be overwritten by concurrent responses
            item = DoubanItem()
            item['path'] = 'Image/可播放'
            item['url'] = subject['url']
            yield scrapy.Request(subject['url'], self.parse_info,
                                 headers=self.headers, meta={'item': item})

    def parse_info(self, response):
        item = response.meta['item']
        item['title'] = response.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()[0]
        item['year'] = response.xpath(
            '//h1/span[@class="year"]/text()').extract()[0]
        item['image_urls'] = response.xpath(
            '//div[@id="mainpic"][@class=""]/a/img/@src').extract()
        # string() flattens the multi-line text inside //div[@id="info"]
        item['info'] = response.xpath(
            'string(//div[@id="info"])').extract()[0]
        # normalize-space() collapses runs of whitespace to single spaces
        item['content'] = response.xpath(
            'normalize-space(string(//div[@class="related-info"]))').extract()[0]
        yield item
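# For reference, the search_subjects endpoint answers with JSON shaped
# roughly like this -- the keys are exactly the ones the spiders here read,
# the values are invented placeholders:

sample_response = {
    "subjects": [
        {
            "title": "placeholder title",
            "rate": "8.0",
            "cover": "https://example.org/cover.jpg",
            "url": "https://movie.douban.com/subject/0000000/",
        },
    ],
}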
def parse_item(self, response):
    movie_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for movie in movie_list:
        item = DoubanItem()
        item['name'] = movie.xpath(
            './div/div[2]/div[1]/a/span[1]/text()').extract_first()
        item['score'] = movie.xpath(
            './div/div[2]/div[2]/div/span[2]/text()').extract_first()
        item['info'] = movie.xpath(
            './div/div[2]/div[2]/p[1]/text()').extract_first().strip()
        item['desc'] = movie.xpath(
            './div/div[2]/div[2]/p[2]/span/text()').extract_first()
        yield item
def parse_item(self, response):
    sel = Selector(response)
    item = DoubanItem()
    # The Chinese text scraped below had encoding problems and needed
    # conversion; the selectors also need .extract() to yield strings
    item['name'] = sel.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract()
    item['year'] = sel.xpath(
        '//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
    item['score'] = sel.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
    item['director'] = sel.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    item['classification'] = sel.xpath(
        '//span[@property="v:genre"]/text()').extract()
    # The raw text here is mostly '/' separators and still needs cleaning
    item['actor'] = sel.xpath(
        '//*[@id="info"]/span[3]/span[2]/text()').extract()
    return item
def parse_next(self, response):
    for item in response.xpath('//tr[@class="item"]'):
        book = DoubanItem()
        book['title'] = item.xpath('td[2]/div[1]/a/@title').extract_first()
        book['info'] = item.xpath('td[2]/p/text()').extract_first()
        book['rank'] = item.xpath(
            'td[2]/div[2]/span[2]/text()').extract_first()
        book['intro'] = item.xpath(
            'td[2]/p/span[1]/text()').extract_first()
        print(book)
        # yield so the item actually reaches the pipelines instead of only
        # being printed
        yield book
def parse_item(self, response):
    # print(response.url)
    node_list = response.xpath('//div[@class="info"]')
    for node in node_list:
        item = DoubanItem()
        item['name'] = node.xpath(
            './div[1]/a/span[1]/text()').extract_first()
        item['score'] = node.xpath(
            './div[2]/div/span[2]/text()').extract_first()
        item['info'] = node.xpath(
            './div[2]/p[1]/text()').extract_first().replace('\xa0', ' ').strip()
        # item['info'] = node.xpath('./div[2]/p[1]/text()').extract_first().replace('\xa0', '').strip()
        item['desc'] = node.xpath(
            './div[2]/p[2]/span/text()').extract_first()
        # print(item)
        yield item
def parse(self, response):
    elements = response.css("#subject_list ul li")
    for ele in elements:
        item = DoubanItem()
        item['title'] = ele.css(".info h2 a::text").get().strip()
        pub_all = ele.css(".info .pub::text").get().strip().split(' / ')
        item['author'] = '/'.join(pub_all[:-3])
        item['pub'] = ''.join(pub_all[-3:-2])
        item['date'] = ''.join(pub_all[-2:-1])
        item['price'] = ''.join(pub_all[-1:])
        item['rating_nums'] = ele.css(
            ".info .star span.rating_nums::text").get()
        item['comment_nums'] = ele.css(
            ".info .star span.pl::text").get().strip().strip('()')
        item['introduction'] = ele.css(".info p::text").get()
        yield item
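# Why the pub string above is sliced from the end: the ".pub" line is " / "
# separated as author(s) / publisher / date / price, and the author part may
# itself contain slashes (e.g. translators). A quick check with a made-up
# value:

pub_all = u'[美] Mark Lutz / 秦鹏 / 机械工业出版社 / 2011-1 / 119.00元'.split(' / ')
print('/'.join(pub_all[:-3]))   # author(s): '[美] Mark Lutz/秦鹏'
print(''.join(pub_all[-3:-2]))  # publisher: '机械工业出版社'
print(''.join(pub_all[-2:-1]))  # date: '2011-1'
print(''.join(pub_all[-1:]))    # price: '119.00元'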
def parse(self, response):
    el_list = json.loads(response.body.decode())["subjects"]
    for el in el_list:
        item = DoubanItem()
        item["name"] = el["title"]
        item["img_link"] = el["cover"]
        item["score"] = el["rate"]
        details_url = el["url"]
        yield scrapy.Request(url=details_url,
                             callback=self.parse_details,
                             meta={"item": item})
    # Page the AJAX endpoint by bumping the trailing start parameter by 20;
    # stop once a page comes back empty, otherwise this would loop forever
    if el_list:
        ajax_url = response.request.url.rsplit("=", 1)
        ajax_url = "=".join([ajax_url[0], str(int(ajax_url[-1]) + 20)])
        # time.sleep blocks Scrapy's reactor; DOWNLOAD_DELAY is the usual knob
        time.sleep(random.random() * 2)
        yield scrapy.Request(url=ajax_url, callback=self.parse)
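# rsplit("=", 1) above assumes the start parameter is the last one in the
# query string. A more robust sketch using the standard library to bump it
# wherever it appears (the parameter name "page_start" follows the Douban
# AJAX endpoint used elsewhere in this collection):

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def next_ajax_url(url, step=20):
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    start = int(query.get('page_start', ['0'])[0]) + step
    query['page_start'] = [str(start)]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))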
def parse(self, response):
    node_list = response.xpath("//div[@class='pic']//img")
    for node in node_list:
        item = DoubanItem()
        item['imagename'] = node.xpath("./@alt")[0].extract()
        item['imagelink'] = node.xpath("./@src")[0].extract()
        yield item
    # if self.offset < 225:
    #     self.offset += 25
    #     newurl = self.baseurl + str(self.offset)
    #     yield scrapy.Request(newurl, callback=self.parse)
    if response.xpath("//span[@class = 'next']/link/@href"):
        offset = response.xpath(
            "//span[@class = 'next']/link/@href").extract()[0]
        newurl = self.baseurl + offset
        yield scrapy.Request(newurl, callback=self.parse)
def parse_item(self, response):
    node_list = response.xpath('//div[@class="info"]')
    # print(len(node_list))
    for node in node_list:
        item = DoubanItem()
        item['name'] = node.xpath(
            './div[1]/a/span[1]/text()').extract_first()
        item['score'] = node.xpath(
            './div[2]/div/span[2]/text()').extract_first()
        item['info'] = ''.join([
            data.strip()
            for data in node.xpath('./div[2]/p[1]/text()').extract()
        ])
        item['desc'] = node.xpath(
            './div[2]/p[2]/span/text()').extract_first()
        yield item
def parse_item(self, response):
    # print(response.url, '------')
    # Get the list of movie nodes
    node_list = response.xpath('//div[@class="info"]')
    # print(len(node_list))
    # Walk the node list
    for node in node_list:
        # Build the item instance
        item = DoubanItem()
        # Extract the data
        item['name'] = node.xpath(
            './div[1]/a/span[1]/text()').extract_first()
        item['score'] = node.xpath(
            './div[2]/div/span[2]/text()').extract_first()
        item['info'] = ''.join([
            i.strip() for i in node.xpath('./div[2]/p[1]/text()').extract()
        ]).replace('\xa0', '')
        item['desc'] = node.xpath(
            './div[2]/p[2]/span/text()').extract_first()
        # print(item)
        # Hand the data back to the engine
        yield item
def parse(self, response):
    # print(response.body)
    selector = scrapy.Selector(response)
    for movie in selector.xpath('//tr[@class="item"]'):
        item = DoubanItem()
        title = movie.xpath('./td[1]/a/@title').extract_first()
        href = movie.xpath('./td[1]/a/@href').extract_first()
        item['title'] = title
        item['href'] = href
        # print(title)
        # print(href)
        yield item
    # startnum = response.url
    # startnum = startnum[40:-7]
    # num = int(startnum) + 20
    # next_page_url = 'https://movie.douban.com/tag/2015?start=' + str(num) + '&type=O'
    next_page_url = response.xpath(
        '//span[@class="next"]/a/@href').extract_first()
    if next_page_url:
        yield scrapy.Request(response.urljoin(next_page_url))
def parse_article(self, response):
    hxs = Selector(response)
    movie_name = hxs.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract()
    comment_link = hxs.xpath(
        '//div[@id="comments-section"]/div/h2/span/a/@href').extract()[0]
    item = DoubanItem()
    item['movie_name'] = movie_name
    item['comment_link'] = comment_link
    yield Request(comment_link,
                  meta={'item': item},
                  callback=self.parse_item,
                  cookies=[
                      {
                          'name': 'COOKIE_NAME',
                          'value': 'VALUE',
                          'domain': '.douban.com',
                          'path': '/'
                      },
                  ])
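# Scrapy's cookies argument also accepts a plain dict; the list-of-dicts form
# above is only needed to pin domain/path explicitly. The same hand-off in
# the shorter form (COOKIE_NAME/VALUE are placeholders, as above):

def parse_article_dict_cookies(self, response):
    comment_link = response.xpath(
        '//div[@id="comments-section"]/div/h2/span/a/@href').extract_first()
    item = DoubanItem()
    item['comment_link'] = comment_link
    yield Request(comment_link, meta={'item': item},
                  callback=self.parse_item,
                  cookies={'COOKIE_NAME': 'VALUE'})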
def parse_detail(self, response):
    # print(2222222222222222222222, response.request.headers['User-Agent'])
    item = DoubanItem()
    print('+++++++++++++++', response.url)
    item['movie_name'] = response.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract_first()
    item['movie_url'] = response.url
    item['director'] = response.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract_first()
    item['scripter'] = ",".join(
        response.xpath(
            '//*[@id="info"]/span[2]/span[2]/a/text()').extract())
    item['actor'] = ",".join(
        response.xpath(
            '//*[@id="info"]/span[3]/span[2]/span/a/text()').extract())
    item['style'] = ','.join(
        response.xpath(
            '//*[@id="info"]/span[@property="v:genre"]/text()').extract())
    item['create_country'] = response.xpath(
        '//*[@id="info"]/text()[8]').extract_first()
    item['language'] = response.xpath(
        '//*[@id="info"]/text()[10]').extract_first()
    item['show_date'] = ','.join(
        response.xpath(
            '//*[@id="info"]/span[@property="v:initialReleaseDate"]/text()'
        ).extract())
    item['longer'] = response.xpath(
        '//*[@id="info"]/span[@property="v:runtime"]/text()'
    ).extract_first()
    item['other_name'] = response.xpath(
        '//*[@id="info"]/text()[17]').extract_first()
    item['desc'] = ''.join(
        response.xpath(
            '//*[@id="link-report"]/span[@class="all hidden"]/text()'
        ).extract())
    yield item
def parse(self, response):
    print("response.url================================", response.url)
    all_node = response.xpath('//div[@class="info"]')
    for node in all_node:
        item = DoubanItem()
        print("--" * 100)
        # Movie title (the matching DoubanItem field is assumed to be "title")
        title = node.xpath(
            './/span[@class="title"][1]/text()').extract()[0]
        # Movie details
        content = node.xpath('.//div[@class="bd"]/p/text()').extract()[0]
        # Movie rating
        score = node.xpath(
            './/div[@class="star"]/span[2]/text()').extract()[0]
        # One-line tagline; may be missing for some movies
        info = node.xpath('.//p[@class="quote"]/span/text()').extract()
        if len(info) > 0:
            info = info[0]
        item["title"] = title
        item["content"] = content
        item["score"] = score
        item["info"] = info
        # print(item)
        yield item
    # Next page: advance the offset and stop after 225; wrapping back to 0
    # with dont_filter=True would recrawl the list forever
    if self.offset < 225:
        self.offset += 25
        next_url = self.url + str(self.offset)
        print("next_url=====", next_url)
        yield scrapy.Request(next_url,
                             callback=self.parse,
                             headers=self.headers,
                             dont_filter=True)
def parse(self, response):
    movies = response.xpath("//div[@class='info']")
    print("-----")
    for each in movies:
        # build a fresh item per movie rather than reusing one instance
        item = DoubanItem()
        item['title'] = each.xpath(
            ".//span[@class='title'][1]/text()").extract()[0]
        bd = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
        item['star'] = each.xpath(
            ".//span[@class='rating_num']/text()").extract()[0]
        quote = each.xpath(".//p[@class='quote']/span/text()").extract()
        item['bd'] = bd.strip()
        # guard on the extracted list itself, not the string literal 'quote'
        if len(quote) != 0:
            item['quote'] = quote[0]
        yield item
    print("----")
    if self.offset < 225:
        self.offset += 25
        yield scrapy.Request(self.url + str(self.offset),
                             callback=self.parse)
def parse_item(self, response):
    # Get the list of movie nodes
    movie_list = response.xpath('//div[@class="item"]')
    for movie in movie_list:
        # Create the item instance
        item = DoubanItem()
        item['name'] = movie.xpath(
            './div[2]/div[1]/a/span[1]/text()').extract()[0]
        item['image'] = movie.xpath('./div[1]/a/img/@src').extract()[0]
        item['score'] = movie.xpath(
            './div[2]/div[2]/div/span[2]/text()').extract()[0]
        item['info'] = movie.xpath(
            './div[2]/div[2]/p[2]/span/text()').extract()[0]
        item['ower'] = ''.join([
            i.strip()
            for i in movie.xpath('./div[2]/div[2]/p[1]/text()').extract()
        ]).replace('\xa0', '')
        url = movie.xpath('./div[1]/a/@href').extract()[0]
        print(url, '++++++++++++++++++++++')
        yield scrapy.Request(url,
                             callback=self.parse_detail,
                             meta={'mymeta': item})
def parse(self, response):
    # print(response.request.headers['User-Agent'])
    movie_list = response.xpath(
        '//*[@id="content"]/div/div[1]/ol/li/div/div[2]')
    print(len(movie_list))
    for movie in movie_list:
        item = DoubanItem()
        item['name'] = movie.xpath(
            './div[1]/a/span[1]/text()').extract_first()
        print(item)
        yield item
    next_url = response.xpath(
        '//*[@id="content"]/div/div[1]/div[2]/span[3]/a/@href'
    ).extract_first()
    if next_url is not None:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(url=next_url)
def parse(self, response):
    for info in response.xpath('//div[@class="item"]'):
        item = DoubanItem()
        item['rank'] = info.xpath('div[@class="pic"]/em/text()').extract()
        item['title'] = info.xpath(
            'div[@class="pic"]/a/img/@alt').extract()
        item['link'] = info.xpath('div[@class="pic"]/a/@href').extract()
        item['star'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[1]/text()'
        ).extract()[0].strip()
        item['rate'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
        ).extract()
        item['quote'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
        ).extract()
        yield item
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse)