def parse(self, response):
    """Parse one Douban Top250 list page and yield a DoubanmovieItem per movie.

    After emitting every movie on the page, follows the "next page" link
    (if present) back into this same callback.
    """
    movie_list = response.xpath(
        "//div[@class='article']//ol[@class='grid_view']/li")
    for i_item in movie_list:
        douban_item = DoubanmovieItem()
        douban_item['serial_number'] = i_item.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = i_item.xpath(
            ".//div[@class='hd']/a/span[1]/text()").extract_first()
        content = i_item.xpath(
            ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
        # BUG FIX: the original loop assigned 'introduce' on every iteration,
        # so only the LAST text fragment survived. Clean each fragment
        # (collapse all whitespace) and keep them all, joined with ';'.
        douban_item['introduce'] = ";".join(
            "".join(fragment.split()) for fragment in content)
        douban_item['star'] = i_item.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        douban_item['evaluate'] = i_item.xpath(
            ".//div[@class='star']//span[4]/text()").extract_first()
        douban_item['describe'] = i_item.xpath(
            ".//p[@class='quote']/span/text()").extract_first()
        yield douban_item
    # The last page has no next-link element, which ends the recursion.
    next_link = response.xpath(
        "//span[@class='next']/link/@href").extract()
    if next_link:
        next_link = next_link[0]
        yield scrapy.Request("https://movie.douban.com/top250" + next_link,
                             callback=self.parse)
def parse(self, response):
    """Scrape every movie entry on the current Top250 page, then paginate.

    ``response`` is the downloaded page; each ``<div class="item">`` node
    describes one movie. Yields one DoubanmovieItem per entry, then a
    recursive Request for the next page while one exists.
    """
    # Field name -> relative XPath (relative to one <div class="item"> node).
    # Every extractor returns a list, matching the item's original contract.
    field_xpaths = {
        'rank': 'div[@class="pic"]/em/text()',
        'title': 'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()',
        'link': 'div[@class="pic"]/a/@href',
        'rating': 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()',
        'participants': 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()',
        'quote': 'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()',
        'pic': 'div[@class="pic"]/a/img/@src',
    }
    for entry in response.xpath('//div[@class="item"]'):
        movie = DoubanmovieItem()
        for field, xp in field_xpaths.items():
            movie[field] = entry.xpath(xp).extract()
        yield movie
    # The final page carries no "next" link, which terminates the recursion.
    next_sel = response.xpath("//span[@class='next']/a/@href")
    if next_sel:
        next_url = response.urljoin(next_sel[0].extract())
        yield scrapy.Request(next_url, self.parse)
def parse(self, response):
    """Parse one Top250 page and follow pagination by computed offset.

    Yields an item per movie, then derives the next page's ``start``
    offset from the rank of the first movie on the current page.
    """
    item = DoubanmovieItem()
    selector = Selector(response)
    articles = selector.xpath('// *[ @ id = "content"] / div / div[1]/ol/li')
    print(articles)
    for article in articles:
        name = article.xpath(
            ' div / div[2] / div[1] / a / span[1] / text()').extract()
        rate = article.xpath(
            ' div / div[2] / div[2] / div / span[2]/ text()').extract()
        commentNUM = article.xpath(
            'div / div[2] / div[2] / div / span[4]/ text()').extract()
        comment = article.xpath(
            ' div / div[2] / div[2] / p[2] / span/ text()').extract()
        item['name'] = name
        item['rate'] = rate
        item['commentNUM'] = commentNUM
        item['comment'] = comment
        yield item
    # Rank of the first movie on this page, zero-based: equals this page's
    # 'start' query offset (0, 25, ..., 225).
    n = int(
        selector.xpath(
            '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[1]/em/text()').
        extract()[0]) - 1
    # BUG FIX: the original condition `n <= 250` also fired on the last page
    # (n == 225), requesting start=250 — an empty page whose parse would then
    # crash on extract()[0]. Valid offsets run 0..225, so only continue
    # while there is a further page to fetch.
    if n < 225:
        n = n + 25
        link = 'https://movie.douban.com/top250?start=' + str(n) + '&filter='
        yield Request(link, callback=self.parse)
def parse(self, response):
    """Collect poster URLs and movie names from one list page.

    Emits a single item whose 'url' and 'name' fields are parallel lists
    covering every <li> entry on the page.
    """
    item = DoubanmovieItem()
    entries = Selector(response).xpath('//*[@id="content"]/div/div[1]/ol/li')
    urls = []
    names = []
    for entry in entries:
        # Poster <img>: src holds the image URL, alt holds the movie name.
        urls.append(entry.xpath('div/div[1]/a/img/@src').extract_first())
        names.append(entry.xpath('div/div[1]/a/img/@alt').extract_first())
    item['url'] = urls
    item['name'] = names
    yield item
def parse_item(self, response):
    """Scrape a movie detail page, then request its recommended posters.

    The partially filled item travels to ``parse_poster_url`` via
    ``meta['item']`` for completion there.
    """
    log.info('parsing: {}'.format(response.url))
    sel = Selector(response)
    item = DoubanmovieItem()
    item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    # Year appears parenthesized in the second heading span, e.g. "(1994)".
    item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
    item['score'] = sel.xpath('//strong[@class="ll rating_num"]/text()').extract()
    # Subject id is the last path segment of the detail URL.
    movieid = re.match('https://.*/.*/(.*)/.*', response.url).group(1)
    item['movieid'] = movieid
    item['director'] = sel.xpath('//span[@class="attrs"]/a[@rel="v:directedBy"]/text()').extract()
    item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
    item['actor'] = sel.xpath('//span[@class="attrs"]/a[@rel="v:starring"]/text()').extract()
    # Hand off to the poster-list page (type=R: recommended posters).
    list_poster_url = 'https://movie.douban.com/subject/{}/photos?type=R'.format(movieid)
    yield scrapy.Request(list_poster_url, callback=self.parse_poster_url,
                         meta={'item': item})
def parse(self, response):
    """Gather poster image URLs and their alt-text names from one page.

    Yields a single item with two parallel lists: 'url' (image sources)
    and 'img_name' (alt attributes), one entry per movie on the page.
    """
    item = DoubanmovieItem()
    entries = Selector(response).xpath('//*[@id="content"]/div/div[1]/ol/li')
    # Build both lists in one pass each over the same <li> selectors.
    item['url'] = [e.xpath('div/div[1]/a/img/@src').extract_first()
                   for e in entries]
    item['img_name'] = [e.xpath('div/div[1]/a/img/@alt').extract_first()
                        for e in entries]
    yield item
def parse(self, response):
    """Parse every movie card on the page, then follow the next-page link.

    Yields one DoubanmovieItem per <div class="item"> node.
    """
    for movie in response.xpath('//div[@class="item"]'):
        item = DoubanmovieItem()
        item['rank'] = movie.xpath('./div[@class="pic"]/em/text()').extract_first()
        item['title'] = movie.xpath('./div[@class="pic"]/a/img/@alt').extract_first()
        item['playable'] = movie.xpath('.//span[@class="playable"]/text()').extract_first()
        item['link'] = movie.xpath('./div[@class="pic"]/a/@href').extract_first()
        item['star'] = movie.xpath('.//div[@class="star"]/span[1]/@class').extract_first()
        item['rate'] = movie.xpath('.//span[@class="rating_num"]/text()').extract_first()
        # BUG FIX: extract_first() returns None when the node is missing, and
        # the original unconditionally sliced it ([:-3] trims the "人评价"
        # suffix), raising TypeError. Guard before slicing.
        pl_text = movie.xpath('.//div[@class="star"]/span[4]/text()').extract_first()
        item['pl'] = pl_text[:-3] if pl_text is not None else None
        item['quote'] = movie.xpath('.//p[@class="quote"]/span/text()').extract_first()
        # BUG FIX: the original indexed extract()[1] blindly; guard against
        # entries whose description has fewer than two text nodes.
        bd_lines = movie.xpath('.//div[@class="bd"]/p/text()').extract()
        item['type'] = bd_lines[1].strip() if len(bd_lines) > 1 else None
        yield item
    # Absent on the last page, which stops the recursion.
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, self.parse)
def parse_start_url(self, response):
    """Parse the start URL's movie grid (ol.grid_view) into items."""
    # One <li> per movie inside the grid list.
    for movie in response.css("ol.grid_view li"):
        item = DoubanmovieItem()
        item['title'] = movie.css(".hd span::text").extract_first()
        # Strip regular and non-breaking spaces from each description
        # fragment before gluing them back together.
        cleaned = [re.sub(r' |\xa0', '', part)
                   for part in movie.css(".bd p::text").extract()]
        item['score'] = movie.css(".star span.rating_num::text").extract_first()
        item['content'] = ''.join(cleaned).strip('\n')
        item['info'] = movie.css(".quote span::text").extract_first()
        yield item
def parse(self, response):
    """Walk every movie card on the page, yield items, then paginate.

    Each <div class="item"> becomes one DoubanmovieItem; when a next-page
    link exists, a recursive Request is emitted for it.
    """
    for entry in response.xpath('//div[@class="item"]'):
        print(type(entry))  # debug output preserved from the original
        movie = DoubanmovieItem()
        # Rank number shown inside the poster column.
        movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
        # First title span only ([1]).
        movie['title'] = entry.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
        ).extract()
        # Rating value plus the voter-count text node.
        movie['star'] = entry.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
        ).extract()
        # Detail-page link and poster image URL.
        movie['link'] = entry.xpath('div[@class="pic"]/a/@href').extract()
        movie['img_url'] = entry.xpath('div[@class="pic"]/a/img/@src').extract()
        yield movie
    # Follow the next-page link; the last page has none.
    nextPageURL = response.xpath('//span[@class="next"]/a/@href').extract()
    if nextPageURL:
        url = response.urljoin(nextPageURL[-1])
        yield scrapy.Request(url, self.parse, dont_filter=False)
    else:
        print("退出")
        logger.warning('日志信息')
def parse(self, response):
    """Scrape a full movie detail page into one richly populated item.

    Extracts title/year, the info table, rating statistics, activity
    counters and the list of streaming providers. For Youku entries it
    additionally fetches up/down votes and view counts via helper methods
    (``youkuPageParse`` / ``youkuInfoParse`` / ``youkuIndexParse``).
    """
    sel = Selector(response)
    content = response.xpath('//div[@id="content"]')
    # Numeric subject id taken from the URL path.
    movieId = re.findall(r'.*/(\d*)/', response.url)[0]
    name = content.xpath('h1/span/text()').extract()[0]
    year = content.xpath('h1/span/text()').extract()[1]
    imageurl = content.xpath('.//div[@id="mainpic"]/a/@href').extract()[0]
    # Strip all HTML tags from the info block, then split "key: value"
    # lines into a dict (only lines actually containing a colon).
    info = content.xpath('.//div[@id="info"]').extract()[0]
    tags = re.compile('<[^>]+>')
    info = tags.sub("", info)
    info_dict = dict([line.strip().split(":", 1)
                      for line in info.strip().split("\n")
                      if line.strip().find(":") > 0])
    content_right = content.xpath('.//div[@class="rating_wrap clearbox"]')
    rating = content_right.xpath('.//strong/text()').extract()[0]
    rating_people = content_right.xpath(
        './/a[@class="rating_people"]/span/text()').extract()[0]
    rating_on_weight = content.xpath('.//div[@class="ratings-on-weight"]')
    rating_per = rating_on_weight.xpath(
        './/span[@class="rating_per"]/text()').extract()
    # actor_num = content.xpath('.//div[@id="celebrities"]/h2/span/a/text()').extract()[0][3:]
    # The [3:]/[:-2]/[:-3] slices trim the Chinese label prefixes/suffixes
    # around the embedded counts (e.g. "全部 N 条").
    commentary_num = content.xpath(
        './/div[@class="mod-hd"]/h2/span/a/text()').extract()[0][3:][:-2]
    review_num = content.xpath(
        './/section[@class="reviews mod movie-content"]/header/h2/span/a/text()'
    ).extract()[0][3:][:-2]
    watched_num = content.xpath(
        './/div[@class="subject-others-interests-ft"]/a/text()').extract()[0][:-3]
    wanted_num = content.xpath(
        './/div[@class="subject-others-interests-ft"]/a/text()').extract()[1][:-3]
    youku_up, youku_down, youku_vv = 0, 0, 0
    # One flag slot per provider: [腾讯, 爱奇艺, 芒果, 乐视, 优酷].
    video_flag = [0, 0, 0, 0, 0]
    video_source = []
    links = content.xpath('.//div[@class="gray_ad"]/ul/li')
    videoId = ""
    for link in links:
        source = link.xpath('.//a/@data-cn').extract()[0]
        src = link.xpath('.//a/@href').extract()[0]
        price = link.xpath('.//span/span/text()').extract()[0].strip()
        if source == "腾讯视频":
            video_flag[0] = 1
        elif source == "爱奇艺视频":
            video_flag[1] = 1
        elif source == "芒果 TV":
            video_flag[2] = 1
        elif source == "乐视视频":
            video_flag[3] = 1
        elif source == "优酷视频":
            video_flag[4] = 1
            try:
                videoId, youkuIndexUrl = self.youkuPageParse(src)
                # BUG FIX: the original literal contained "¶m%5B%5D=" —
                # an HTML-entity corruption of "&param%5B%5D=" ("&para"
                # rendered as the pilcrow sign). Restored the real query
                # parameter so the Youku API call is well-formed.
                infoUrl = ("http://v.youku.com/action/getVideoPlayInfo?vid="
                           + videoId
                           + "&param%5B%5D=updown&callback=tuijsonp5")
                youku_up, youku_down, youku_vv = self.youkuInfoParse(infoUrl)
                self.youkuIndexParse(youkuIndexUrl, movieId)
            except TypeError as e:
                # youkuPageParse may return None (unparseable page);
                # the unpack then raises TypeError — skip Youku stats.
                pass
        video_source.append(dict(source=source, src=src, price=price))
    item = DoubanmovieItem()
    item['movieId'] = movieId
    item["url"] = response.url
    item["name"] = name
    item["year"] = year
    item["image"] = imageurl
    item["director"] = info_dict.get("导演", "").strip()
    item["writer"] = info_dict.get("编剧", "").strip()
    item["stars"] = info_dict.get("主演", "").strip()
    item["movietype"] = info_dict.get("类型", "").strip()
    item["country"] = info_dict.get("制片国家/地区", "").strip()
    item["language"] = info_dict.get("语言", "").strip()
    # Films list a release date; TV series list a premiere ("首播") and
    # per-episode length instead of a film runtime.
    item["ontime"] = info_dict.get("上映日期", "").strip() if "上映日期" in info_dict else info_dict.get("首播", "")
    item["seasons"] = info_dict.get("季数", "").strip()
    item["clips"] = info_dict.get("集数", "").strip()
    item["time"] = info_dict.get("片长", "").strip() if "片长" in info_dict else info_dict.get("单集片长", "")
    item["nickname"] = info_dict.get("又名", "").strip()
    item["imdblink"] = "http://www.imdb.com/title/" + info_dict.get("IMDb链接", "").strip()
    item["rating"] = rating
    item["rating_people"] = rating_people
    item["rating_per"] = rating_per
    item["commentary_num"] = commentary_num
    item["review_num"] = review_num
    item["watched_num"] = watched_num
    item["wanted_num"] = wanted_num
    item["video_flag"] = video_flag
    item["video_source"] = video_source
    item["youku_up"] = youku_up
    item["youku_down"] = youku_down
    item["youku_vv"] = youku_vv
    item["youku_movie_id"] = videoId
    yield item