def parse(self, response):
    """Parse a Top250 list page: yield one DoubanItem per movie, then follow
    the next-page link.

    Fix: the original constructed a throwaway DoubanItem before the loop that
    was never used (immediately shadowed inside the loop); removed.
    """
    selector = Selector(response)
    movie_list = selector.xpath('//ol[@class="grid_view"]/li')
    print(movie_list)
    for movie in movie_list:
        item = DoubanItem()
        title = movie.xpath('div/div[@class="info"]/div[1]/a/span/text()').extract()
        item['movieInfo'] = movie.xpath('div/div[@class="info"]/div[2]/p[1]/text()').extract()[0]
        item['star'] = movie.xpath('div/div[@class="info"]/div[2]/div/span[@class="rating_num"]/text()').extract()[0]
        quote = movie.xpath('div/div[@class="info"]/div[2]/p[2]/span/text()').extract()
        # The quote span is absent for some movies; default to an empty string.
        item['quote'] = quote[0] if quote else ''
        # A movie may carry several title spans (original + alternate); join them.
        item['title'] = ''.join(title)
        yield item
    # Follow pagination; the last page has no "next" link.
    next_page = selector.xpath('//span[@class="next"]/link/@href').extract()
    if next_page:
        print(self.url + next_page[0])
        yield Request(self.url + next_page[0], callback=self.parse)
def parse_rev(getid, ty, response):
    """Parse a review detail page: yield one 'review_profile' item, then one
    'review_comments' item per comment.

    Fixes: the HTML is now parsed once instead of on every xpath call, regex
    patterns are raw strings, and a fresh item is built for every comment —
    the original mutated and re-yielded a single item, so all yielded comment
    items aliased the last comment.
    """
    if ty != "rev":
        return
    page = response.body.decode('UTF-8')
    doc = etree.HTML(page)

    item = DoubanItem()
    item['_i'] = 'review_profile'
    item['review_id'] = getid
    item['review_name'] = doc.xpath('//div[@class="article"]/h1/span/text()')[0]
    tip = doc.xpath('//div[@class="main-bd"]/p[@class="main-title-tip"]/text()')
    item['review_spoil'] = tip[0] if tip else None
    rating_class = doc.xpath('//header[@class="main-hd"]/span/@class')[0]
    # The span class carries the star count (first digit of the first number).
    item['review_rating'] = re.findall(r"\d+", rating_class)[0][0]
    item['review_time'] = doc.xpath('//header[@class="main-hd"]/span[@class="main-meta"]/text()')[0]
    paragraphs = doc.xpath('//div[@id="link-report"]//p/text()')
    if not paragraphs:
        # Some reviews nest the body in a div instead of paragraphs.
        paragraphs = doc.xpath('//div[@id="link-report"]/div/text()')
    item['review'] = "".join(paragraphs)
    yield item

    for node in doc.xpath('//div[@class="comment-item"]'):
        # Fresh item per comment so earlier yielded items are never mutated.
        item = DoubanItem()
        item['_i'] = 'review_comments'
        item['review_id'] = getid
        item['c_id'] = node.attrib['data-cid']
        item['commenter_id'] = re.split(r'[/?]\s*', node.attrib['data-user_url'])[4]
        item['ref_cid'] = node.attrib['data-ref_cid']
        item['c_time'] = node.find('div//div[@class="header"]/span').text
        texts = node.findall('div//p[@class="comment-text"]')
        item['comment'] = "".join(n.text for n in texts)
        yield item
def errback_httpbin(self, failure):
    """On HttpError, re-fetch the failed URL with Selenium and scrape the
    movie detail page into a DoubanItem.

    Fixes: `return item` now sits inside the HttpError branch (in the original
    it was reachable with `item` unbound for other failure types, raising
    NameError); regex patterns are raw strings.
    """
    if failure.check(HttpError):
        # Non-200 responses arrive via the HttpError spider middleware.
        response = failure.value.response
        self.logger.error('HttpError on %s', response.url)
        self.driver.get(response.url)
        temp_html = BeautifulSoup(self.driver.page_source, 'lxml')
        item = DoubanItem()
        # Ranking text looks like "No.12" — keep the part after the dot.
        item['rank'] = temp_html.find('span', class_='top250-no').get_text().split('.')[1]
        info = temp_html.find(id='info')
        item['name'] = temp_html.find('h1').find('span').get_text().split()[0]
        item['year'] = re.findall(r'\d+', temp_html.find('span', class_='year').get_text())[0]
        item['director'] = ''
        item['script'] = ''
        item['actor'] = ''
        # Map the Chinese role labels (director / script / actor) to item fields.
        role_dict = {u'\u5bfc\u6f14': 'director', u'\u7f16\u5267': 'script', u'\u4e3b\u6f14': 'actor'}
        for span in info.find_all('span', class_='pl'):
            role = role_dict.get(span.get_text())
            if role is not None:
                item[role] = span.find_next('span').get_text()
        item['classification'] = '/'.join(x.get_text() for x in info.find_all('span', property='v:genre'))
        item['score'] = temp_html.find(id='interest_sectl').find('strong', class_='ll rating_num').get_text()
        item['story'] = re.sub(r'(\s)|(/n)', '', temp_html.find('span', property='v:summary').get_text())
        return item
def parse(self, response):
    """Parse a chart page and yield one item per (name, url, score) triple.

    Fixes: Python-2 `print` statements replaced with the print() function;
    the manual append loop became a comprehension; commented-out dead code
    removed.
    """
    sel = Selector(response)
    movie_name = sel.xpath("//div[@class='pl2']/a/text()").extract()
    print('******')
    # Every other text node is a title; the alternates in between are skipped.
    names = [name.strip().replace('\\', "") for name in movie_name[::2]]
    print('******')
    movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
    movie_score = sel.xpath(
        "//div[@class='pl2']/div/span[@class='rating_nums']/text()").extract()
    for name, url, score in zip(names, movie_url, movie_score):
        item = DoubanItem()
        item['movie_name'] = name
        item['movie_score'] = score
        item['movie_url'] = url
        yield item
def parse(self, response):
    """Parse a Top250 page with BeautifulSoup; yield name/score/quote items
    and recurse into the next page.

    Fix: the original did `tep.insert(0, titles[0])` unconditionally, which
    raises IndexError on a malformed entry without a title span; such entries
    are now skipped.
    """
    soup = BeautifulSoup(response.body.decode('utf-8', 'ignore'), 'lxml')
    ol = soup.find('ol', attrs={'class': 'grid_view'})
    for li in ol.findAll('li'):
        tep = []
        titles = []
        for span in li.findAll('span'):
            if not span.has_attr('class'):
                continue
            cls = span.attrs['class'][0]
            # Normalise full-width commas so downstream CSV-style output is safe.
            if cls == 'title':
                titles.append(span.string.strip().replace(',', ','))
            elif cls in ('rating_num', 'inq'):
                tep.append(span.string.strip().replace(',', ','))
        if not titles:
            # Malformed entry with no title span — skip instead of crashing.
            continue
        tep.insert(0, titles[0])
        # Pad to exactly three fields: name, score, quote.
        while len(tep) < 3:
            tep.append("-")
        tep = tep[:3]
        item = DoubanItem()
        item['name'] = tep[0]
        item['fen'] = tep[1]
        item['words'] = tep[2]
        yield item
    a = soup.find('a', text=re.compile("^后页"))
    if a:
        yield scrapy.Request("http://movie.douban.com/top250" + a.attrs['href'],
                             callback=self.parse)
def parse(self, response):
    """Extract ranked movie entries from a Top250 page and follow pagination."""
    for entry in response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li'):
        douban_item = DoubanItem()
        douban_item['serial_number'] = entry.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = entry.xpath(
            './/div[@class="hd"]//a/span[1]/text()').extract_first()
        # Collapse whitespace per intro line; like the original, the last line wins.
        for line in entry.xpath('.//div[@class="bd"]//p[1]/text()').extract():
            douban_item['introduce'] = "".join(line.split())
        douban_item['star'] = entry.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()').extract_first()
        douban_item['evaluate'] = entry.xpath(
            './/div[@class="star"]/span[4]/text()').extract_first()
        douban_item['describe'] = entry.xpath(
            './/p[@class="quote"]/span/text()').extract_first()
        print(douban_item)
        yield douban_item
    next_link = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_link:
        yield scrapy.Request("https://movie.douban.com/top250" + next_link[0],
                             callback=self.parse)
def parse(self, response):
    """Yield title/bd/star/quote per movie; paginate by offset up to 225.

    Fixes: the original created one item outside the loop and re-yielded the
    same mutable instance; worse, when a movie had no quote span the previous
    movie's quote silently leaked into the new item. A fresh item is now built
    per movie and a missing quote becomes an explicit empty string.
    """
    for each in response.xpath("//div[@class='info']"):
        item = DoubanItem()
        # Title
        item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
        # Info block
        item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
        # Rating
        item['star'] = each.xpath(
            ".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
        # Quote (optional on the page)
        quote = each.xpath(".//p[@class='quote']/span/text()").extract()
        item['quote'] = quote[0] if quote else ''
        yield item
    # 250 entries, 25 per page.
    if self.offset < 225:
        self.offset += 25
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def get_items(self, response):
    """Dispatch a JSON API response to the right store: movie quotes (MySQL),
    people profiles (Mongo), or movie subjects (yielded items).

    Fix: the people branch bound the class itself (`s = Sql`) instead of an
    instance (`s = Sql()`), so `s.save_to_mongo2(item)` called the method
    unbound with the item as `self`.
    """
    result = json.loads(response.text)
    res = result.get('res')
    if res.get('subjects') == []:
        # Empty subject list marks a movie-quote payload.
        text = res.get('payload').get('text')
        title = res.get('subject').get('title')
        s = Sql()
        s.save_to_mysql(title, text)
    elif res.get('subjects') is None:
        # No subjects key at all: actor/director (people) payload.
        itemslist = res.get('people')
        print(response.url)
        print(itemslist)
        item = DoubanProfession()
        item['category'] = res.get('payload').get('title')
        s = Sql()  # was `s = Sql` — missing call parentheses
        for items in itemslist:
            for field in item.fields:
                if field in items.keys():
                    item[field] = items.get(field)
            print(item)
            s.save_to_mongo2(item)
    else:
        # Otherwise the payload carries movie subjects.
        itemslist = res.get('subjects')
        item = DoubanItem()
        item['category'] = res.get('payload').get('title')
        for items in itemslist:
            for field in item.fields:
                if field in items.keys():
                    item[field] = items.get(field)
            yield item
def parse(self, response):
    """Collect first-level review metadata (20 per page) and schedule follow-up
    requests for the full review text and its sub-comments."""
    titles = response.css(".main-bd > h2 > a::text").extract()
    comment_ids = response.css("header > a.name::text").extract()
    data_cids = response.css("div::attr(data-cid)").extract()
    dates = response.css(".main-meta::attr(content)").extract()
    for idx, comment_id in enumerate(comment_ids):
        item = DoubanItem()
        item['parent'] = -1  # top-level review, no parent
        item['commentId'] = comment_id
        item['commentDate'] = dates[idx]
        item['data_cid'] = data_cids[idx]
        item['title'] = '悲惨世界'
        item['type'] = '图书'
        item['level'] = 1
        # Full review text lives behind a JSON endpoint.
        rev_json = 'https://book.douban.com/j/review/%s/full' % data_cids[idx]
        yield scrapy.Request(
            rev_json,
            meta={'item': item, 'title': titles[idx]},
            callback=self.get_json_commentFull)
        # The review page itself carries the sub-comments.
        sub_comment_url = 'https://book.douban.com/review/%s/' % data_cids[idx]
        yield scrapy.Request(
            sub_comment_url,
            meta={'sub_comment_url': sub_comment_url, 'data_cid': data_cids[idx]},
            callback=self.parse_sub_review)
def parse2(self, response):
    """Parse rank and name from each Top250 entry, then follow the next page."""
    entries = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
    for entry in entries:
        # One DoubanItem (the structure declared in items.py) per entry.
        douban_item = DoubanItem()
        douban_item['serial_number'] = entry.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = entry.xpath(
            ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
        # Hand the item to the pipelines.
        yield douban_item
    print('*' * 20, response.request.headers['User-Agent'])
    # Pagination: the "next" span's link holds the relative href.
    next_link = response.xpath("//span[@class='next']/link/@href").extract()
    if next_link:
        yield scrapy.Request('http://movie.douban.com/top250' + next_link[0],
                             callback=self.parse)
def parse_item(self, response):
    """Parse a book-list page: one item per book with category (lb), title (sm),
    author (zz), translator (yz), rating (pf) and blurb (jj).

    Fixes: a fresh item is built per book (the original mutated and re-yielded
    one instance, aliasing every yielded item), and the stray trailing
    `return i` — which raised NameError on an empty page and was dead code in
    a generator — is removed.
    """
    x = Selector(response)
    category = x.xpath('//div[@class="hd"]/h1/text()').extract()[0]
    # Raw author/translator HTML fragments, one per book, matched by regex.
    author_blocks = x.re('作者</span.*?</a></span></span></p>')
    info = x.xpath('//div[@class="info"]')
    names = info.xpath('./div[@class="title"]/a/text()').extract()
    briefs = info.xpath('./div[@class="article-desc-brief"]/text()').extract()
    for i in range(len(names)):
        items = DoubanItem()
        items['lb'] = category
        items['sm'] = names[i]
        author_part = re.findall('作者</span.*?</a></span></span>', author_blocks[i])
        items['zz'] = re.findall('([〕〔\u4e00-\u9fa5·\s]{2,})', author_part[0])[1:]
        translator_part = re.findall('译者</span.*?</a></span></span>', author_blocks[i])
        # Not every book has a translator.
        if translator_part:
            items['yz'] = re.findall('([〕〔\u4e00-\u9fa5·]+)', translator_part[0])[1:]
        else:
            items['yz'] = None
        score = info[i].xpath('./div/span[@class="rating-average"]/text()').extract()
        items['pf'] = score[0] if score else None
        items['jj'] = briefs[i]
        yield items
def parse(self, response):
    """Parse a JSON chart response; the movie type comes from the request URL's
    `type` query parameter. The first 25% of entries are flagged as test data
    (property='1').

    Fix: `params.get('type')` returns None when the parameter is absent, so
    the original `len(type_num) == 0` raised TypeError; `if not type_num`
    covers both None and empty list.
    """
    args = urllib.parse.urlparse(response.url)
    params = urllib.parse.parse_qs(args.query, True)
    type_num = params.get('type')
    if not type_num:
        return
    movie_type = TYPE_SETTINGS.get(type_num[0])
    element_list = json.loads(response.body)
    # 25% of the corpus is reserved as test data.
    test_num = len(element_list) * 0.25
    for element in element_list:
        item = DoubanItem()
        item['title'] = element.get('title')
        item['type_num'] = type_num[0]
        item['movie_type'] = movie_type
        # The API sometimes returns lists, sometimes plain strings.
        item['types'] = '|'.join(element.get('types')) if isinstance(
            element.get('types'), list) else element.get('types')
        item['regions'] = '|'.join(element.get('regions')) if isinstance(
            element.get('regions'), list) else element.get('regions')
        item['url'] = element.get('url')
        item['score'] = element.get('score')
        item['vote_count'] = str(
            element.get('vote_count')) if element.get('vote_count') else ''
        item['mid'] = element.get('id')
        item['release_date'] = element.get('release_date')
        item['rank'] = str(element.get('rank')) if element.get('rank') else ''
        item['property'] = '1' if test_num >= 0 else '0'
        test_num -= 1
        yield item
def parse(self, response, **kwargs):
    """Parse the JSON search API; yield one item per movie and request the
    next page.

    Fix: the original created a single DoubanItem before the loop and
    re-yielded the same mutable instance, so every yielded item aliased the
    last movie. A fresh item is now built per movie.
    """
    movie_dict = json.loads(response.text)
    # Stop paginating once the API returns an empty result set.
    if len(movie_dict["data"]) == 0:
        return
    for one_movie in movie_dict["data"]:
        item = DoubanItem()
        item["title"] = one_movie["title"]
        item["directors"] = one_movie["directors"]
        item["casts"] = one_movie["casts"]
        item["rate"] = one_movie["rate"]
        item["cover"] = one_movie["cover"]
        yield item
    # Fetch the next 20 results.
    url_next = 'https://movie.douban.com/j/new_search_subjects?tags=电影&start=%d&countries=中国大陆' % (self.currentPage * 20)
    self.currentPage += 1
    yield Request(url_next, headers=self.headers)
def parse(self, response):
    """Parse a book Top250 page (25 entries per page) and paginate by offset.

    Fixes: a fresh item is built per book (the original re-yielded one shared
    instance), the `pl` metadata line is split once instead of three times,
    and regex patterns are raw strings.
    """
    for i in range(1, 26):
        info = response.xpath(f"//div[@class='article']/div[@class='indent']/table[{i}]")
        item = DoubanItem()
        item['book'] = info.xpath(".//div[@class='pl2']/a/@title")[0].extract()
        # "author / ... / year / price" — split once and index.
        meta = info.xpath(".//p[@class='pl']/text()")[0].extract().split(' / ')
        item['author'] = meta[0]
        item['time'] = meta[-2]
        price = meta[-1]
        if len(price) == 1:
            # Single-character price: normalise to a float string first.
            price = str(float(price))
        item['price'] = float(re.findall(r"\d+.*?\d+", price)[0])
        item['star'] = float(info.xpath(
            ".//div[@class='star clearfix']/span[@class='rating_nums']/text()").extract()[0])
        tmp = info.xpath(
            ".//div[@class='star clearfix']/span[@class='pl']/text()")[0].extract()
        item['mark_num'] = int(re.findall(r"\d+", tmp)[0])
        yield item
    # 250 books, 25 per page.
    if self.index < 225:
        self.index += 25
        self.url = f'https://book.douban.com/top250?start={self.index}'
        yield Request(self.url, callback=self.parse)
def parse_item(self, response):
    """Scrape name and score from a subject detail page.

    Fix: removed the dead `item = {}` that was immediately shadowed by the
    DoubanItem construction.
    """
    item = DoubanItem()
    item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    item['score'] = response.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
    item['link'] = response.url
    yield item
def parse_item(self, response):
    """Scrape ranking, name, score, summary, director and poster URL from a
    movie detail page.

    Fix: removed the commented-out scaffolding (template `i = {}` block and
    the disabled actors extraction) that obscured the live code.
    """
    item = DoubanItem()
    num = response.xpath(
        '//div[@class="top250"]/span[@class="top250-no"]/text()').extract()[0]
    # Ranking text is e.g. "No.12" — keep only the digits.
    item['ranking'] = int(re.sub(r'\D', '', num))
    item['movie_name'] = response.xpath(
        '//div[@id="content"]/h1/span/text()').extract()[0]
    item['score'] = response.xpath(
        '//div[@class="rating_self clearfix"]/strong/text()').extract()[0]
    # The synopsis may be split over several spans; join them.
    item['info'] = "".join(
        response.xpath('//div[@id="link-report"]/span/text()').extract())
    item['doctor'] = response.xpath(
        '//span[@class="attrs"]/a/text()').extract()[0]
    item['img'] = response.xpath(
        '//div[@id="mainpic"]/a/img/@src').extract()[0]
    yield item
def parse(self, response): item = DoubanItem() movies = response.xpath('//div[@class="info"]') for each in movies: item['title'] = each.xpath( './/a/span[@class="title"][1]/text()').extract()[0] bd = each.xpath('.//div[@class="bd"]/p/text()').extract()[0] item['bd'] = bd.replace('\n', "").replace(' ', '') quote = each.xpath('.//p[@class="quote"]/span/text()').extract() if len(quote) != 0: item['quote'] = quote[0] else: pass item['star'] = each.xpath( './/div[@class="star"]//span[2]/text()').extract()[0] yield item if self.offset < 225: self.offset += 25 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse_sub_review(self, response):
    """Extract the embedded 'comments' JSON from the review page script and
    yield one level-2 item per comment, then request each comment's replies.

    Fix: the JSON text is taken from the regex capture group. The original
    used `d.group().strip("'comments':")`, which strips the *character set*
    of "'comments':" from both ends of the whole match — fragile and only
    correct by accident.
    """
    parent_id = response.meta['data_cid']
    match = re.search(re.compile("'comments': (.*)"), response.text)
    if not match:
        return
    json_text = match.group(1).strip(",")
    if not json_text:
        # Page matched but carried no comment payload — log for inspection.
        logging.info(response.meta['sub_comment_url'])
        return
    info = json.loads(json_text)
    for review in info:
        item = DoubanItem()
        item['parent'] = parent_id
        item['commentDate'] = review['create_time']
        item['commentId'] = review['author']['name']
        item['commentContent'] = review['text']
        item['data_cid'] = review['id']
        item['title'] = '悲惨世界'
        item['type'] = '图书'
        item['level'] = 2
        yield item
        # Third level: replies to this comment.
        sub_sub_url = f"https://book.douban.com/j/review/comment/{review['id']}/replies?start=0&count=500"
        yield scrapy.Request(sub_sub_url,
                             meta={'parent_id': review['id']},
                             callback=self.parse_sub_sub_review)
def parse(self, response): movie_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li') for i in movie_list: douban_item = DoubanItem() douban_item['number'] = i.xpath( './div/div[1]/em/text()').extract_first() douban_item['name'] = i.xpath( './div/div[2]/div[1]/a/span[1]/text()').extract_first() content = i.xpath( './div/div[2]/div[2]/p[1]/text()').extract()[1].strip() douban_item['introduce'] = "".join(content.split()) douban_item['star'] = i.xpath( './div/div[2]/div[2]/div/span[2]/text()').extract_first() douban_item['evaluate'] = i.xpath( './div/div[2]/div[2]/div/span[4]/text()').extract_first() douban_item['describe'] = i.xpath( './div/div[2]/div[2]/p[2]/span/text()').extract_first() yield douban_item next_link = response.xpath( '//*[@id="content"]/div/div[1]/div[2]/span[3]/a/@href').extract() if next_link: print(next_link) next_link = next_link[0] yield scrapy.Request('https://movie.douban.com/top250' + next_link, callback=self.parse)
class DouBanSpider(Spider):
    """Crawl short comments of one movie, 20 per page, following pagination
    until the site stops returning a comments block.

    Fix: the original kept ONE DoubanItem as a class attribute and mutated /
    re-yielded it for every comment, so all yielded items aliased the last
    comment. A fresh item is now created per comment.
    """
    name = 'dou_ban'
    allowed_domains = ['movie.douban.com']
    # URL template; {} is filled with the paging offset.
    start_urls = 'https://movie.douban.com/subject/27113517/comments?start={}&limit=20&sort=new_score&status=P'
    num = 0

    def start_requests(self):
        yield Request(self.start_urls.format(self.num), self.parse_response)

    def parse_response(self, response):
        """Parse one comments page and schedule the next one."""
        soup = BeautifulSoup(response.text, 'lxml')
        selects = soup.find('div', id='comments').find_all('div')
        # The last div is the pagination footer, not a comment.
        for user in selects[:-1]:
            item = DoubanItem()
            item['user_name'] = user.find('span', class_='comment-info').a.get_text()
            span_time = user.find('span', class_='comment-time')
            item['time'] = span_time['title']
            item['comment'] = user.p.span.get_text()
            yield item
        self.num += 20
        yield Request(self.start_urls.format(self.num), self.parse_response)
def parse(self, response):
    """Walk every Top250 entry on the page, yield a populated item for each,
    then queue the next page for parsing."""
    entries = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
    for entry in entries:
        douban_item = DoubanItem()
        douban_item['serial_number'] = entry.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = entry.xpath(
            ".//div[@class='info']//div[@class='hd']/a/span[1]/text()").extract_first()
        # Collapse whitespace per line; as in the original, the final line wins.
        for line in entry.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract():
            douban_item['introduce'] = "".join(line.split())
        douban_item['star'] = entry.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        douban_item['evaluate'] = entry.xpath(
            ".//div[@class='star']//span[4]/text()").extract_first()
        douban_item['describe'] = entry.xpath(
            ".//p[@class='quote']/span/text()").extract_first()
        # Hand the item to the pipelines for cleaning/storage.
        yield douban_item
    # Pagination: submit the next page back to the scheduler.
    next_link = response.xpath("//span[@class='next']/link/@href").extract()
    if next_link:
        yield scrapy.Request("https://movie.douban.com/top250" + next_link[0],
                             callback=self.parse)
def parse(self, response):
    """Yield rank/title/actor_info/movie_info/star/quote per movie; paginate
    by offset up to 225.

    Fixes: a fresh item per movie (the original re-yielded one shared mutable
    instance), and the bd paragraph is extracted once instead of twice.
    """
    for each in response.xpath("//div[@class='item']"):
        item = DoubanItem()
        # Rank
        item['rank'] = each.xpath("./div[@class='pic']/em/text()").extract()[0]
        info = each.xpath("./div[@class='info']")
        # Title
        item['title'] = info.xpath(".//span[@class='title'][1]/text()").extract()[0]
        # Basic info: first line = cast, second line = year/region/genre.
        bd_lines = info.xpath("./div[@class='bd']/p/text()").extract()
        item['actor_info'] = bd_lines[0].replace('\n', '').replace(' ', '')
        item['movie_info'] = bd_lines[1].replace('\n', '').replace(' ', '')
        # Rating
        item['star'] = info.xpath(
            "./div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()").extract()[0]
        # Quote (optional)
        quote = info.xpath("./div[@class='bd']/p[@class='quote']/span/text()").extract()
        item['quote'] = quote[0] if quote else ''
        yield item
    if self.offset < 225:
        self.offset += 25
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse each movie's title, info lines, rating and quote, then follow
    pagination.

    Fix: a fresh item is built per movie — the original created one item
    before the loop and re-yielded the same mutable instance.
    """
    selector = Selector(response)
    for each_movie in selector.xpath('//div[@class="info"]'):
        item = DoubanItem()
        # A movie may have several title spans; concatenate them.
        title_parts = each_movie.xpath('div[@class="hd"]/a/span/text()').extract()
        item['title'] = ''.join(title_parts)
        movie_info = each_movie.xpath(
            'div[@class="bd"]/p[@class=""]/text()').extract()
        item['movieInfo'] = ';'.join(movie_info)
        item['star'] = each_movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        # The quote may be absent; guard before indexing.
        quote = each_movie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item['quote'] = quote[0] if quote else ''
        yield item
    # The tenth page is the last and has no next link.
    next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
    if next_link:
        yield Request(self.url + next_link[0], callback=self.parse)
def parse(self, response):
    """Yield number/name/star/evaluate/describe/introduce per Top250 entry,
    then follow the next-page link."""
    for entry in response.xpath("//div[@class='article']//ol[@class='grid_view']/li"):
        douban_item = DoubanItem()
        # extract_first() takes the first hit; extract() returns every hit.
        douban_item['number'] = entry.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['name'] = entry.xpath(
            ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
        douban_item['star'] = entry.xpath(
            ".//div[@class='info']/div[@class='bd']//span[@class='rating_num']/text()"
        ).extract_first()
        douban_item['evaluate'] = entry.xpath(
            ".//div[@class='info']//div[@class='star']/span[4]/text()").extract_first()
        douban_item['describe'] = entry.xpath(
            ".//div[@class='info']/div[@class='bd']//span[@class='inq']/text()"
        ).extract_first()
        # Collapse whitespace per intro line; the final line is what remains.
        for line in entry.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract():
            douban_item['introduce'] = "".join(line.split())
        # Hand the item to the pipelines.
        yield douban_item
    # Pagination: re-enter this method for the next page.
    next_link = response.xpath("//span[@class='next']/link/@href").extract()
    if next_link:
        yield scrapy.Request("http://movie.douban.com/top250" + next_link[0],
                             callback=self.parse)
def content_parse(self, response):
    """Extract commenter name, rating, raw comment markup, date and profile
    link from each comment block on the page."""
    for block in response.xpath('//div[@class="comment"]'):
        item = DoubanItem()
        item['name'] = block.xpath(
            './h3/span[@class="comment-info"]/a/text()').extract()[0]
        score = block.xpath(
            './h3/span[@class="comment-info"]/span[2]').attrib.get('title')
        # An unrated comment puts the timestamp in the second span; its long
        # title text (>= 5 chars) distinguishes it from a short rating word.
        item['score'] = score if len(score) < 5 else '--'
        # Keep the span markup as-is: line breaks inside the span would corrupt
        # the written output, so stripping happens downstream.
        item['comment'] = block.xpath('./p/span[@class="short"]').extract()[0]
        item['date'] = block.xpath(
            './h3/span[@class="comment-info"]/span[@class="comment-time "]/text()'
        ).extract()[0].strip()
        # Profile link, later used to fetch the user's home city.
        item['href'] = block.xpath(
            './h3/span[@class="comment-info"]/a/@href').extract()[0]
        yield item
def parse(self, response):
    """Parse a Top250 page with lxml.etree and recurse into the next page.

    Fixes: the original stored the image URL under 'title' and the title text
    under 'img' — the two assignments are now swapped back; the bare `except`
    is narrowed to IndexError (the only expected failure: the last page has
    no next-page href).
    """
    html = etree.HTML(response.text)
    for li in html.xpath("//ol[@class='grid_view']/li"):
        item = DoubanItem()
        # xpath always returns a list; take the single expected hit.
        item['em'] = li.xpath(".//em/text()")[0]
        item['title'] = li.xpath(".//span[@class='title']/text()")[0]
        item['img'] = li.xpath(".//img/@src")[0]
        item['comment'] = li.xpath(".//div[@class='star']/span/text()")[-1]
        yield item
    try:
        # The last page has no next href — indexing [0] raises IndexError.
        next_page = html.xpath("//span[@class='next']/a/@href")[0]
        print("一页循环完毕,进入下一页" + "-" * 100)
        # callback is the function object itself — no parentheses.
        yield scrapy.Request(url='https://movie.douban.com/top250' + next_page,
                             callback=self.parse)
    except IndexError:
        print("下载完毕!")
def parse(self, response):
    """Parse the JSON chart API, build a partial item per movie, and request
    the detail page with the item attached in meta.

    Fixes: `url` is initialised before the try-block so the error log can
    never hit an unbound name when `result['url']` itself is missing; the
    unused `as e` binding is dropped.
    """
    json_obj = json.loads(response.text)
    if 'data' not in response.text:
        logging.error('No data in json api:' + response.url)
    result_list = json_obj['data']
    for result in result_list:
        url = ''
        try:
            item = DoubanItem()
            url = utils.get_real_url(result['url'])
            item['url'] = url
            item['dType'] = 'META_VIDEO_S1'
            item['type'] = response.meta['type']
            item['vTitle'] = result['title']
            item['vScore'] = result['rate']
            item['vCoverUrl'] = result['cover']
            # First director, or empty string when the list is empty/missing.
            item['vDirector'] = result['directors'] and result['directors'][0] or ''
            item['vStars'] = '/'.join(result['casts'])
            # Fetch the detail page; the partial item rides along in meta.
            request = scrapy.Request(
                response.urljoin(url),
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.parse_video,
                errback=self.err_back)
            request.meta['item'] = item
            yield request
        except Exception:
            logging.error('Item in json api error:' + response.url +
                          ' item url:' + url)
            continue
def parse(self, response):
    """Yield title/content/score/info per movie; walk pages in steps of 25.

    Fix: a fresh item per movie — the original instantiated one DoubanItem
    and re-yielded the same mutable instance for every movie.
    """
    for each in response.xpath("//div[@class='info']"):
        item = DoubanItem()
        title = each.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()').extract()
        content = each.xpath('div[@class="bd"]/p/text()').extract()
        score = each.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        info = each.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item['title'] = title[0]
        item['content'] = content[0]
        item['score'] = score[0]
        # NOTE(review): unlike the other fields, 'info' stores the whole list
        # (possibly empty) — preserved as-is; confirm downstream expects a list.
        item['info'] = info
        yield item
    # 250 entries total, 25 per page.
    if self.start <= 225:
        self.start += 25
        yield scrapy.Request(self.url + str(self.start) + self.end,
                             callback=self.parse)
def parse(self, response): movie_list = response.xpath( "//div[@class='article']//ol[@class='grid_view']/li") for i_item in movie_list: douban_item = DoubanItem() douban_item['serial_number'] = i_item.xpath( ".//div[@class='item']//em/text()").extract_first() douban_item['movie_name'] = i_item.xpath( ".//div[@class='info']/div[@class='hd']/a/span[1]/text()" ).extract_first() content = i_item.xpath( ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract( ) for i_content in content: content_s = "".join(i_content.split()) douban_item['introduce'] = content_s douban_item['star'] = i_item.xpath( ".//span[@class='rating_num']/text()").extract_first() douban_item['evaluate'] = i_item.xpath( ".//div[@class='star']//span[4]/text()").extract_first() douban_item['describe'] = i_item.xpath( ".//p[@class='quote']/span/text()").extract_first() yield douban_item next_link = response.xpath( "//span[@class='next']/link/@href").extract() if next_link: next_link = next_link[0] yield scrapy.Request("https://movie.douban.com/top250" + next_link, callback=self.parse)
def parse(self, response):
    """Parse each Top250 entry into an item and follow pagination.

    Fix: removed the dead duplicate lookups (`contents` and `next_links`)
    that re-ran the same XPath queries and were never used.
    """
    movie_list = response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li')
    for i_item in movie_list:
        douban_item = DoubanItem()
        douban_item['seria_number'] = i_item.xpath(
            './/div[@class="pic"]/em/text()').extract_first()
        douban_item['movie_name'] = i_item.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract_first()
        # Several intro lines may exist; whitespace-collapse each (last wins).
        content = i_item.xpath('.//div[@class="bd"]/p[1]/text()').extract()
        for i_content in content:
            douban_item['introduce'] = "".join(i_content.split())
        douban_item['star'] = i_item.xpath(
            './/div[@class="bd"]/div[@class="star"]/span[2]/text()').extract_first()
        douban_item['evaluate'] = i_item.xpath(
            './/div[@class="bd"]/div[@class="star"]/span[4]/text()').extract_first()
        douban_item['discribtion'] = i_item.xpath(
            './/div[@class="bd"]/p[@class="quote"]/span[1]/text()').extract_first()
        # extract() pulls text out of the Selector; yield to the pipelines.
        yield douban_item
    next_link = response.xpath('//span[@class="next"]/link/@href').extract()
    if next_link:
        yield scrapy.Request('https://movie.douban.com/top250' + next_link[0],
                             callback=self.parse)