def parse(self, response):
    """Parse one page of the Douban movie JSON API and schedule the next page.

    Yields one DoubanmovieItem per movie record; when the API returns no
    more records, yields a single empty item so the pipeline knows it can
    flush its buffered results (could also be done in pipelines.close_spider()).
    """
    payload = json.loads(response.body_as_unicode())
    subjects = payload["subjects"]
    if subjects:
        for record in subjects:
            item = DoubanmovieItem()
            # Copy each field of interest straight from the JSON record.
            for field in ("cover", "id", "is_new", "playable", "rate", "title", "url"):
                item[field] = record[field]
            yield item
        # Advance to the next page; the API offset is pages * 20.
        self.pg += 1
        next_url = self.next_url.format(self.pg * 20)
        yield scrapy.Request(next_url, callback=self.parse)
    else:
        # Empty sentinel item signalling end-of-crawl to the pipeline.
        item = DoubanmovieItem()
        yield item
        print("爬蟲結束.........,pg=%d" % (int(self.pg) * 19))
def parse(self, response):
    """Extract movie entries from a listing page and follow pagination.

    Fixes: the original instantiated a single DoubanmovieItem before the
    loop, so every yielded item was the same mutable object; title parts
    were concatenated with `+=` in a loop instead of str.join.
    """
    selector = Selector(response)
    for each_movie in selector.xpath('//div[@class="info"]'):
        # Fresh item per movie.
        item = DoubanmovieItem()
        # A title may be split across several <span> tags; join them.
        title_parts = each_movie.xpath('div[@class="hd"]/a/span/text()').extract()
        item['title'] = ''.join(title_parts)
        movie_info = each_movie.xpath('div[@class="bd"]/p/text()').extract()
        item['movieInfo'] = ';'.join(movie_info)
        item['star'] = each_movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        quote = each_movie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        # The quote may be absent; fall back to an empty string.
        item['quote'] = quote[0] if quote else ''
        yield item
    next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
    # The last page has no "next" link, which ends the crawl.
    if next_link:
        print(next_link[0])
        yield Request(self.url + next_link[0], callback=self.parse)
def parse_a_movie(self, response):
    """Parse a single movie detail page into a DoubanmovieItem.

    Extracts title, score, URL and the synopsis; the synopsis is
    hard-wrapped to lines of at most 75 characters.

    Fixes: the original wrap loop inserted '\\n' starting at offset 0
    (leading newline) and used the pre-insertion string length, so the
    wrap points drifted; the else-branch re-wrapped an empty string
    (dead code); `jieshao[0]` was indexed without checking for an empty
    extract() result.
    """
    sel = Selector(response)
    item = DoubanmovieItem()
    item['movie'] = sel.xpath('//h1/span[1]/text()').extract()[0]
    item['score'] = sel.xpath(
        '//div[@class="rating_self clearfix"]/strong/text()').extract()[0]
    item['url'] = response.url
    item['intro'] = ''
    intro_parts = sel.xpath('//div[@id="link-report"]/span[1]/text()').extract()
    if intro_parts and intro_parts[0].strip() != '':
        # One stripped paragraph per line.
        item['intro'] = ''.join(part.strip() + '\n' for part in intro_parts)
    # else: the full synopsis is hidden behind a "show more" link and would
    # require a browser driver (Selenium) to expand; the intro stays empty.
    # Hard-wrap: break the text into 75-character slices joined by newlines.
    item['intro'] = '\n'.join(
        item['intro'][i:i + 75] for i in range(0, len(item['intro']), 75))
    return item
def parse(self, response):
    """Collect rank/title/poster/quote for every movie on the page, then
    follow the pagination link until the final page."""
    for entry in response.xpath('//div[@class="item"]'):
        movie = DoubanmovieItem()
        movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
        movie['title'] = entry.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
        ).extract()
        movie['pic_url'] = entry.xpath('div[@class="pic"]/a/img/@src').extract()
        movie['inq'] = entry.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
        ).extract()
        yield movie
    # The last page has no "next" anchor, so the crawl terminates here.
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse)
def parse(self, response):
    """Parse a listing page; yield one item per movie and follow pagination.

    Fixes: `print nextLink` was a Python 2 print statement (syntax error
    under Python 3); a single DoubanmovieItem was created before the loop,
    so every yielded item shared the same mutable object.
    """
    selector = Selector(response)
    for each_movie in selector.xpath('//div[@class="info"]'):
        # Fresh item per movie.
        item = DoubanmovieItem()
        title = each_movie.xpath('div[@class="hd"]/a/span/text()').extract()
        item['title'] = ''.join(title)
        item['movieInfo'] = ';'.join(
            each_movie.xpath('div[@class="bd"]/p/text()').extract())
        item['star'] = each_movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span/em/text()').extract()[0]
        quote = each_movie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        # quote may be empty, so guard before indexing
        item['quote'] = quote[0] if quote else ''
        yield item
    next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
    # Page 10 is the last page and has no next link.
    if next_link:
        print(next_link[0])
        yield Request(self.url + next_link[0], callback=self.parse)
def parse_page_item(self, response):
    """Parse a movie detail page: name, score, actors, directors, genres
    and rating count; yield the populated item to the pipelines."""
    item = DoubanmovieItem()
    # First whitespace-separated token of the <h1> title.
    item['name'] = response.xpath(
        "//div[@id='content']/h1/span[1]/text()").extract()[0].strip().split()[0]
    item['score'] = response.xpath(
        "//strong[@class='ll rating_num']/text()").extract()[0].strip()
    # Credits block shared by the actor/director/genre lookups.
    info_tree = response.xpath("//div[@id='info']")
    # Each list is truncated to its configured maximum length.
    item['actors'] = self.retain_element_by_num(
        info_tree.xpath("./span[3]/span[2]/a/text()").extract(), RATAIN_ACTORS)
    # NOTE(review): this director xpath also starts at span[3], the same
    # span as the actors — looks suspicious; confirm against a live page.
    item['directors'] = self.retain_element_by_num(
        info_tree.xpath("./span[3]//a/text()").extract(), RATAIN_DIRECTORS)
    item['types'] = self.retain_element_by_num(
        info_tree.xpath(".//span[@property='v:genre']/text()").extract(),
        RATAIN_TYPE)
    item['people_number'] = response.xpath(
        "//a[@class='rating_people']/span/text()").extract()[0]
    yield item
def parse(self, response):
    """Scrape one listing page, then interactively ask on the console
    whether to continue to the next page.

    Fixes: the original's else-branch called `os.path.exists(0)` and
    discarded the result — dead code (probably a typo for exit(0)); removed.
    """
    for entry in response.xpath('//div[@class = "item"]'):
        movie = DoubanmovieItem()
        movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
        movie['title'] = entry.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
        ).extract()
        movie["quote"] = entry.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
        ).extract()
        movie['pic'] = entry.xpath('div[@class="pic"]/a/img/@src').extract()
        yield movie
    # Depth crawl: pull the next-page link from the current page.
    next_page = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_page:
        # Blocking the crawler on console input is unusual but intentional here.
        choice = input("是否需要抓取下一页:(y/n)?n\n")
        if choice.lower() == 'y':
            next_url = "https://movie.douban.com/top250" + next_page[0]
            yield scrapy.Request(next_url, self.parse)
def parse(self, response):
    """Parse a listing page; yield one item per movie and follow pagination.

    Fixes: a single DoubanmovieItem was created before the loop, so every
    yielded item was the same mutable object being overwritten each pass.
    """
    selector = Selector(response)
    for each_movie in selector.xpath('//div[@class="info"]'):
        # Fresh item per movie.
        item = DoubanmovieItem()
        # The title is spread over multiple <span> tags; join seamlessly.
        title = each_movie.xpath('div[@class="hd"]/a/span/text()').extract()
        item['title'] = "".join(title)
        movie_info = each_movie.xpath('div[@class="bd"]/p/text()').extract()
        item['movieInfo'] = ';'.join(movie_info)
        item['star'] = each_movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
        quote = each_movie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        # quote may be empty, so guard before indexing
        item['quote'] = quote[0] if quote else ''
        yield item
    # Page 10 is the last page and has no next link.
    next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
    if next_link:
        yield Request(urljoin(response.url, next_link[0]), callback=self.parse)
def parse(self, response):
    """Parse a Douban comments page into items with author, comment text,
    and a 1-5 star rating mapped from the Chinese rating label.

    Fixes: iterate the tags directly instead of `range(len(...))`; build
    the label-to-star mapping once instead of once per comment.
    """
    # Map rating label text to a numeric star value (built once, not per row).
    star_values = dict(很差=1, 较差=2, 还行=3, 推荐=4, 力荐=5)
    items = []
    soup = BeautifulSoup(response.text, 'html.parser')
    for comment_div in soup.find_all('div', attrs={'class': 'comment'}):
        item = DoubanmovieItem()
        info = comment_div.find('span', attrs={'class': 'comment-info'})
        # Author name is the anchor text inside the comment-info span.
        item['author'] = info.find('a').text
        item['comments'] = comment_div.find('span', attrs={'class': 'short'}).text
        # The second <span> carries the rating label in its title attribute.
        item['star'] = star_values[info.find_all('span')[1]['title']]
        items.append(item)
    return items
def parse(self, response):
    """Extract rank, name, director, leading actors and quote for every
    movie on the page, then follow pagination.

    Fixes: a single DoubanmovieItem was shared by all yields (created
    before the loop); the bare `except:` is narrowed to IndexError; the
    credits line was extracted and split twice.
    """
    for movie_info in response.xpath('//div[@class="item"]'):
        # Fresh item per movie.
        item = DoubanmovieItem()
        item["count"] = movie_info.xpath('div[1]/em/text()').extract()
        item["name"] = movie_info.xpath(
            'div[2]/div[1]/a[1]/span[1]/text()').extract()
        # The first <p> holds "导演: X   主演: Y", separated by no-break spaces.
        credits = movie_info.xpath(
            'div[2]//div[2]/p/text()').extract()[0].split('\xa0\xa0\xa0')
        item["director"] = credits[0].strip().replace('导演: ', '')
        try:
            item["stra"] = credits[1].strip().replace('主演: ', '').replace('...', '')
        except IndexError:
            # Some entries omit the actor segment entirely.
            item["stra"] = '空数据'
        item["quote"] = movie_info.xpath(
            'div[2]//div[2]/p[2]/span/text()').extract()
        yield item
    # Follow the next page if one exists (the last page has none).
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        yield scrapy.Request(response.urljoin(next_page[0].extract()), self.parse)
def parse(self, response):
    """Yield rank/title/poster for each movie, then follow the next-page
    link.

    Fixes: the original instantiated one DoubanmovieItem before the loop,
    so every yielded item was the same mutable object being overwritten
    on each iteration; the item is now created per movie.
    """
    for entry in response.xpath('//div[@class = "item"]'):
        # Fresh item for every movie.
        movie = DoubanmovieItem()
        movie['rank'] = entry.xpath('div[@class = "pic"]/em/text()').extract()
        movie['title'] = entry.xpath(
            'div[@class = "info"]/div[@class = "hd"]/a/span[@class = "title"][1]/text()'
        ).extract()
        movie['poster'] = entry.xpath('div[@class = "pic"]/a/img/@src').extract()
        yield movie
    # Re-request parse() for the next page until none remains.
    next_page = response.xpath('//span[@class = "next"]/a/@href')
    if next_page:
        url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(url, self.parse)
def parse(self, response):
    """Yield rank/title/link/star/rate/quote for every movie on the page.

    Fixes: the previous live code used absolute `li[1]` XPaths, so it only
    ever captured the first movie, and its 'rate' XPath was relative to
    the response root and matched nothing. Iterating the item divs (as the
    commented-out draft in the original intended) fixes both.
    """
    for info in response.xpath('//div[@class="item"]'):
        item = DoubanmovieItem()
        item['rank'] = info.xpath('div[@class="pic"]/em/text()').extract()
        # Title via the poster image's alt text.
        item['title'] = info.xpath('div[@class="pic"]/a/img/@alt').extract()
        item['link'] = info.xpath('div[@class="pic"]/a/@href').extract()
        item['star'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/em/text()'
        ).extract()
        item['rate'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
        ).extract()
        item['quote'] = info.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
        ).extract()
        yield item
def parse(self, response):
    """Yield num/title/star/quote for each movie; follow pagination.

    Fixes: one DoubanmovieItem was shared across every yield (created
    before the loop); the local variable `next` shadowed the builtin.
    """
    selector = scrapy.Selector(response)
    for movie in selector.xpath('//div[@class="item"]'):
        # Fresh item per movie.
        item = DoubanmovieItem()
        item['num'] = movie.xpath('div[@class="pic"]/em/text()').extract()[0]
        item['title'] = movie.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()'
        ).extract()[0]
        item['star'] = movie.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        quote = movie.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
        ).extract_first()
        # Keep the original single-space fallback for movies without a quote.
        item['quote'] = quote if quote is not None else ' '
        yield item
    next_page = selector.xpath('//span[@class="next"]/link/@href').extract_first()
    if next_page:
        next_url = response.urljoin(next_page)
        print(next_url)
        yield scrapy.http.Request(next_url, callback=self.parse)
def parse(self, response):
    """Yield name/url/rating for each movie row (<tr class="item">)."""
    for row in response.xpath('//tr[@class="item"]'):
        item = DoubanmovieItem()
        # Name comes from the poster image's alt attribute.
        item['name'] = row.xpath('.//a[@class="nbg"]/img/@alt').extract_first()
        item['url'] = row.xpath('.//a/@href').extract_first()
        item['rating'] = row.xpath(
            './/span[@class="rating_nums"]/text()').extract_first()
        yield item
def parse_item(self,response):
    # Parse a Douban movie detail page into a DoubanmovieItem.
    # NOTE: Python 2 code (print statements, str.decode on literals).
    print "=============START"
    print response.body
    print "=============END"
    hxs = HtmlXPathSelector(response)
    # Basic fields scraped directly via XPath.
    movie_name = hxs.select('//*[@id="content"]/h1/span[1]/text()').extract()
    movie_director = hxs.select('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    movie_writer = hxs.select('//*[@id="info"]/span[2]/span[2]/a/text()').extract()
    movie_description = hxs.select('//*[@id="link-report"]//*[@property="v:summary"]/text()').extract()
    movie_roles = hxs.select('//*[@id="info"]/span[3]/span[2]//*[@rel="v:starring"]/text()').extract()
    movie_type = hxs.select('//*[@id="info"]//*[@property="v:genre"]/text()').extract()
    # Grab the whole #info block as one string for regex-based extraction.
    movie_detail = hxs.select('//*[@id="info"]').extract()
    movie_detail_str = ''.join(movie_detail).strip()
    # Regex patterns for language / release date / runtime / country,
    # decoded to unicode so they match the unicode detail string.
    movie_language_str = '.*语言:</span> (.+?)<br>'.decode("utf8")
    movie_date_str = '.*上映日期:</span> <span property="v:initialReleaseDate" content="(\S+?)">(\S+?)</span>.*'.decode("utf8")
    movie_long_str = '.*片长:</span> <span property="v:runtime" content="(\d+).*'.decode("utf8")
    movie_country_str = '.*制片国家/地区:</span> (.+?)<br>'.decode("utf8")
    pattern_language =re.compile(movie_language_str,re.S)
    pattern_date = re.compile(movie_date_str,re.S)
    pattern_long = re.compile(movie_long_str,re.S)
    pattern_country = re.compile(movie_country_str,re.S)
    movie_language = re.search(pattern_language,movie_detail_str)
    movie_date = re.search(pattern_date,movie_detail_str)
    movie_long = re.search(pattern_long,movie_detail_str)
    movie_country = re.search(pattern_country,movie_detail_str)
    # Save the cleaned-up data into the item.
    item = DoubanmovieItem()
    item['movie_name'] = self._string_deal(''.join(movie_name))
    item['movie_director'] = self._string_deal(' '.join(movie_director))
    # Description may be missing entirely; default to an empty string.
    item['movie_description'] = self._string_deal(''.join(movie_description[0])) if len(movie_description) else ''
    item['movie_writer'] = self._string_deal(' '.join(movie_writer))
    item['movie_roles'] = self._string_deal(' '.join(movie_roles))
    item['movie_type'] = self._string_deal(' '.join(movie_type))
    # Regex-derived fields default to "" when the pattern did not match.
    item['movie_language'] = ""
    if movie_language:
        item['movie_language'] = self._string_deal(movie_language.group(1))
    item['movie_date'] = ""
    if movie_date:
        item['movie_date'] = self._string_deal(movie_date.group(1))
    item['movie_long'] = ""
    if movie_long:
        item['movie_long'] = self._string_deal(movie_long.group(1))
    item['movie_country'] = ""
    if movie_country:
        item['movie_country'] = self._string_deal(movie_country.group(1))
    yield item
def parse(self, response):
    """For each movie header div, build an item holding title and link,
    then request the detail page with the item forwarded via meta."""
    # Parse the page with BeautifulSoup's built-in html.parser backend.
    page = bs(response.text, 'html.parser')
    for header in page.find_all('div', attrs={'class': 'hd'}):
        item = DoubanmovieItem()
        anchor = header.find('a')
        item["title"] = anchor.find('span').text
        item["link"] = anchor.get('href')
        # parse2 continues filling the same item on the detail page.
        yield scrapy.Request(url=item["link"], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Emit the i-th pre-scraped record plus the detail text from this
    response, then request the next detail page.

    Relies on module-level sequences rank/title/star/rate/quote/url and
    the counter self.i — presumably populated elsewhere; TODO confirm.
    """
    idx = self.i
    item = DoubanmovieItem()
    item['rank'] = rank[idx]
    item['title'] = title[idx]
    item['star'] = star[idx]
    item['rate'] = rate[idx]
    item['quote'] = quote[idx]
    item['detail'] = response.xpath('//*[@id="link-report"]').extract()[0]
    self.i = idx + 1
    yield item
    # Stop once all 250 entries have been visited.
    if self.i < 250:
        yield scrapy.Request(url[self.i], self.parse)
def parse(self, response):
    """Extract (title, link) from every movie header and request each
    detail page, carrying the partially-filled item in meta."""
    soup = BeautifulSoup(response.text, 'html.parser')
    headers = soup.find_all('div', attrs={'class': 'hd'})
    # Iterate the tags directly — no index-based loop needed.
    for header in headers:
        item = DoubanmovieItem()
        anchor = header.find('a')
        item['title'] = anchor.find('span').text
        item['link'] = anchor.get('href')
        yield scrapy.Request(url=item['link'], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Listing-page callback: for each movie header, capture title and
    link, then hand the item to parse2 via the detail-page request."""
    document = BeautifulSoup(response.text, 'html.parser')
    for movie_header in document.find_all('div', attrs={'class': 'hd'}):
        link_tag = movie_header.find('a')
        item = DoubanmovieItem()
        item['title'] = link_tag.find('span').text
        item['link'] = link_tag.get('href')
        # The item travels to the detail-page callback through meta.
        yield scrapy.Request(url=item['link'], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Yield title (from poster alt text) and rating for each movie, then
    follow the pagination link."""
    for entry in response.xpath('//div[@class="item"]'):
        item = DoubanmovieItem()
        item['title'] = entry.xpath('div[@class="pic"]/a/img/@alt').extract()
        item['star'] = entry.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        yield item
    # No "next" anchor on the last page ends the crawl.
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        yield scrapy.Request(response.urljoin(next_page[0].extract()), self.parse)
def parse(self, response):
    """Yield title/rating/link for each movie info block.

    Fixes: the original also appended every item to a local `items` list
    that was never used — dead accumulator removed.
    """
    sel = Selector(response)
    for site in sel.xpath('//div[@class="item"]/div[@class="info"]'):
        item = DoubanmovieItem()
        item['title'] = site.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()')[0].extract()
        item['rating_num'] = site.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        item['link'] = site.xpath('div[@class="hd"]/a/@href').extract()
        yield item
def parse(self, response):
    """Yield rank, name and poster URL for every movie on the page."""
    for entry in response.xpath('//div[@class="item"]'):
        # One fresh movie object per entry.
        movie = DoubanmovieItem()
        movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
        movie['name'] = entry.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
        ).extract()
        movie['pic'] = entry.xpath('div[@class="pic"]/a/img/@src').extract()
        yield movie
def parse(self, response):
    """Return a list of items holding the title and link of every movie
    header on the page.

    Fixes: iterate the result tags directly instead of indexing through
    `range(len(title_list))`.
    """
    items = []
    soup = BeautifulSoup(response.text, 'html.parser')
    for header in soup.find_all('div', attrs={'class': 'hd'}):
        item = DoubanmovieItem()
        anchor = header.find('a')
        item['title'] = anchor.find('span').text
        item['link'] = anchor.get('href')
        items.append(item)
    return items
def parse_subject(self, response):
    """Build an item from a movie subject page, extracting each field as a
    whitespace-normalized string."""
    def _normalized(xpath_expr):
        # Whitespace-normalized string value(s) of the matched node(s).
        return response.xpath(xpath_expr).xpath(
            'normalize-space(string(.))').extract()
    item = DoubanmovieItem()
    item['movie_name'] = _normalized('//*[@id="content"]/h1/span[1]')[0]
    item['intro'] = _normalized('//*[@id="link-report"]/span')[0]
    item['actors'] = _normalized('//*[@id="info"]/span[3]/span[2]')
    # NOTE(review): span[11] is position-dependent; presumably the release
    # date span on this layout — confirm against a live page.
    item['date'] = _normalized('//*[@id="info"]/span[11]')[0]
    item['director'] = _normalized('//*[@id="info"]/span[1]/span[2]/a')[0]
    return item
def deail_parse(self, response):
    """Parse a movie detail page into title/year/star/comment/image_url."""
    self.logger.info('Parse function called on %s', response.url)
    result = DoubanmovieItem()
    result['image_url'] = response.xpath('//*[@id="mainpic"]/a/img/@src').get()
    result['title'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').get()
    result['year'] = response.xpath('//*[@id="content"]/h1/span[2]/text()').get()
    result['star'] = response.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').get()
    # .get() returns only the first hot comment.
    result['comment'] = response.xpath(
        '//*[@id="hot-comments"]//div/div/p/span/text()').get()
    yield result
def parse(self, response):
    """Yield rank and title for each movie item on the page."""
    for entry in response.xpath('//div[@class = "item"]'):
        # A fresh collection object for every movie.
        movie = DoubanmovieItem()
        movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
        movie['title'] = entry.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
        ).extract()
        yield movie
def parse_item(self, response):
    """Extract name/year/score/director/genres/actors from a detail page.

    Fixes: the actor XPath began with '////', which is not valid XPath
    syntax ('//' must be followed by a step); corrected to '//'.
    """
    item = DoubanmovieItem()
    item['name'] = response.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract()
    # Pull the 4-digit year out of the "(YYYY)" span.
    item['year'] = response.xpath(
        '//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
    item['score'] = response.xpath(
        '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
    item['director'] = response.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    item['classification'] = response.xpath(
        '//span[@property="v:genre"]/text()').extract()
    item['actor'] = response.xpath(
        '//*[@id="info"]/span[3]/span[2]/a/text()').extract()
    return item
def parse(self, response):
    """Parse a Douban chart page (div.pl2 entries) and follow pagination.

    Fixes: `print nextLink` was a Python 2 print statement (syntax error
    under Python 3); a single DoubanmovieItem was created before the loop,
    so every yielded item shared one mutable object; dead commented-out
    quote-extraction code removed.
    """
    selector = Selector(response)
    for each in selector.xpath('//div[@class="pl2"]'):
        # Fresh item per movie.
        item = DoubanmovieItem()
        url = each.xpath('a/@href').extract()
        # The title is split between the <a> text and a nested <span>.
        title_parts = each.xpath('a/text()').extract()
        title_parts += each.xpath('a/span/text()').extract()
        item['title'] = ''.join(title_parts)
        item['movieInfo'] = ';'.join(each.xpath('p[@class="pl"]/text()').extract())
        item['star'] = each.xpath(
            'div[@class="star clearfix"]/span[@class="rating_nums"]/text()'
        ).extract()
        item['full_URL'] = url
        yield item
    next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
    if next_link:
        print(next_link[0])
        yield Request(next_link[0], callback=self.parse)
def parse(self, response):
    """Yield name/actor/rating/info per movie (stored UTF-8 encoded, as
    before) and walk the listing pages in steps of 25 up to offset 225."""
    for block in response.xpath('//div[@class="info"]'):
        item = DoubanmovieItem()
        item['name'] = block.xpath(
            './div[@class="hd"]/a/span[1]/text()').extract()[0].encode('utf-8')
        item['actor'] = block.xpath(
            './div[@class="bd"]/p[1]/text()').extract()[0].strip().encode('utf-8')
        item['rating'] = block.xpath(
            './div//div/span[@class="rating_num"]/text()').extract()[0].encode('utf-8')
        quote = block.xpath(
            './div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        # A missing quote falls back to a single space.
        item['info'] = quote[0].strip().encode('utf-8') if quote else ' '
        yield item
    # Manual pagination: offsets 25, 50, ..., 225 (ten pages in total).
    if self.page < 225:
        self.page += 25
        yield scrapy.Request(self.url + str(self.page), callback=self.parse)
def parse(self, response):
    """Yield moviename/info/star (plus quote when present) for each movie
    and paginate in 25-entry offsets up to 225."""
    for movie in response.xpath('//div[@class="info"]'):
        item = DoubanmovieItem()
        item['moviename'] = movie.xpath(
            './div[@class="hd"]/a/span[1]/text()').extract()[0]
        item['info'] = movie.xpath('./div[@class="bd"]/p/text()').extract()[0]
        item['star'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()').extract()[0]
        quote = movie.xpath('.//p[@class="quote"]/span/text()').extract()
        # Unlike the other fields, 'quote' is only set when one exists.
        if quote:
            item["quote"] = quote[0]
        yield item
    if self.offset < 225:
        self.offset += 25
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)