Esempio n. 1
    def parse(self, response):
        """Parse one Douban Top-250 listing page.

        Yields one DoubanmovieItem per movie <li> on the page, then
        follows the "next page" link (when present) back into this
        same callback.
        """
        movie_list = response.xpath(
            "//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanmovieItem()
            douban_item['serial_number'] = i_item.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = i_item.xpath(
                ".//div[@class='hd']/a/span[1]/text()").extract_first()
            content = i_item.xpath(
                ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
            # BUG FIX: the original loop reassigned 'introduce' on every
            # iteration, so only the LAST text node survived.  Keep every
            # fragment: normalize whitespace per fragment, then join.
            douban_item['introduce'] = ";".join(
                "".join(fragment.split()) for fragment in content)
            douban_item['star'] = i_item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = i_item.xpath(
                ".//div[@class='star']//span[4]/text()").extract_first()
            douban_item['describe'] = i_item.xpath(
                ".//p[@class='quote']/span/text()").extract_first()
            yield douban_item

        # The "next" link is relative (e.g. '?start=25&filter='); absent
        # on the last page, so extract() returns an empty list there.
        next_link = response.xpath(
            "//span[@class='next']/link/@href").extract()
        if next_link:
            yield scrapy.Request(
                "https://movie.douban.com/top250" + next_link[0],
                callback=self.parse)
Esempio n. 2
    def parse(self, response):
        """Parse a Douban Top-250 listing page.

        ``response`` holds the fetched page HTML.  One DoubanmovieItem
        is yielded per ``<div class="item">`` block; afterwards a
        request for the next page (when one exists) is scheduled back
        into this same callback.
        """
        # Relative XPath per item field.  Every value is stored as the
        # raw list that .extract() returns.  XPath indexes are 1-based.
        field_xpaths = {
            'rank': 'div[@class="pic"]/em/text()',
            'title': 'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()',
            'link': 'div[@class="pic"]/a/@href',
            'rating': 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()',
            'participants': 'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()',
            'quote': 'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()',
            'pic': 'div[@class="pic"]/a/img/@src',
        }

        # One selector per movie block on the current page.
        for movie_sel in response.xpath('//div[@class="item"]'):
            movie = DoubanmovieItem()
            for field, xp in field_xpaths.items():
                movie[field] = movie_sel.xpath(xp).extract()
            yield movie

        # The last page has no "next" link, which ends the recursion.
        next_page = response.xpath("//span[@class='next']/a/@href")
        if next_page:
            next_url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(next_url, self.parse)
Esempio n. 3
    def parse(self, response):
        """Parse one Top-250 listing page and request the next one.

        Yields a fresh DoubanmovieItem per movie; the next page offset
        is derived from the rank of the first movie on this page.
        """
        selector = Selector(response)
        articles = selector.xpath('//*[@id="content"]/div/div[1]/ol/li')

        for article in articles:
            # BUG FIX: the item must be created INSIDE the loop.  The
            # original built one item before the loop and yielded it
            # repeatedly, so every yielded item aliased the same object
            # and later iterations clobbered earlier yields.
            item = DoubanmovieItem()
            item['name'] = article.xpath(
                'div/div[2]/div[1]/a/span[1]/text()').extract()
            item['rate'] = article.xpath(
                'div/div[2]/div[2]/div/span[2]/text()').extract()
            item['commentNUM'] = article.xpath(
                'div/div[2]/div[2]/div/span[4]/text()').extract()
            item['comment'] = article.xpath(
                'div/div[2]/div[2]/p[2]/span/text()').extract()
            yield item

        # Rank of the first movie (1-based); minus one gives the current
        # 'start' offset, plus 25 gives the next page's offset.
        first_ranks = selector.xpath(
            '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[1]/em/text()'
        ).extract()
        if first_ranks:  # guard: a page past the end has no movies
            n = int(first_ranks[0]) - 1 + 25
            # BUG FIX: the original `n <= 250` requested start=250, an
            # empty page whose parse then crashed on extract()[0].
            # Valid offsets are 0..225.
            if n < 250:
                link = ('https://movie.douban.com/top250?start='
                        + str(n) + '&filter=')
                yield Request(link, callback=self.parse)
 def parse(self, response):
     """Collect poster URL and alt-text name for every movie on the page.

     Accumulates all urls/names into a single item and yields it once.
     """
     item = DoubanmovieItem()
     sel = Selector(response)
     imgs = sel.xpath('//*[@id="content"]/div/div[1]/ol/li')
     item['url'] = []
     item['name'] = []
     for img in imgs:
         site = img.xpath('div/div[1]/a/img/@src').extract_first()
         img_name = img.xpath('div/div[1]/a/img/@alt').extract_first()
         item['url'].append(site)
         item['name'].append(img_name)
     # BUG FIX: the original yielded INSIDE the loop, emitting the same
     # still-growing item once per movie (duplicate partial yields).
     # Yield the completed item exactly once, after the loop.
     yield item
Esempio n. 5
    def parse_item(self, response):
        """Scrape one movie detail page into a DoubanmovieItem.

        Fills in the basic metadata, then chains a request to the
        movie's recommended-poster listing, carrying the partially
        filled item along in the request meta.
        """
        selector = Selector(response)
        log.info('parsing: {}'.format(response.url))

        # The movie id is the last path segment before the trailing
        # slash, e.g. https://movie.douban.com/subject/<id>/
        movieid = re.match('https://.*/.*/(.*)/.*', response.url).group(1)

        item = DoubanmovieItem()
        item['movieid'] = movieid
        item['name'] = selector.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract()
        # Year is shown in parentheses; keep only the digits.
        item['year'] = selector.xpath(
            '//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = selector.xpath(
            '//strong[@class="ll rating_num"]/text()').extract()
        item['director'] = selector.xpath(
            '//span[@class="attrs"]/a[@rel="v:directedBy"]/text()').extract()
        item['classification'] = selector.xpath(
            '//span[@property="v:genre"]/text()').extract()
        item['actor'] = selector.xpath(
            '//span[@class="attrs"]/a[@rel="v:starring"]/text()').extract()

        # Get the first recommended poster for this movie.
        poster_list_url = (
            'https://movie.douban.com/subject/{}/photos?type=R'.format(movieid))
        yield scrapy.Request(poster_list_url,
                             callback=self.parse_poster_url,
                             meta={'item': item})
    def parse(self, response):
        """Gather every movie's poster URL and display name from one
        listing page into a single item, yielded once at the end.
        """
        sel = Selector(response)
        item = DoubanmovieItem()
        item['url'] = []
        item['img_name'] = []

        # One <li> per movie; pull the poster <img> src and alt text.
        for li in sel.xpath('//*[@id="content"]/div/div[1]/ol/li'):
            item['url'].append(
                li.xpath('div/div[1]/a/img/@src').extract_first())
            item['img_name'].append(
                li.xpath('div/div[1]/a/img/@alt').extract_first())

        yield item
Esempio n. 7
    def parse(self, response):
        """Parse a Top-250 listing page: yield one item per movie,
        then follow the next-page link back into this callback.
        """
        for movie in response.xpath('//div[@class="item"]'):
            item = DoubanmovieItem()
            item['rank'] = movie.xpath(
                './div[@class="pic"]/em/text()').extract_first()
            item['title'] = movie.xpath(
                './div[@class="pic"]/a/img/@alt').extract_first()
            item['playable'] = movie.xpath(
                './/span[@class="playable"]/text()').extract_first()
            item['link'] = movie.xpath(
                './div[@class="pic"]/a/@href').extract_first()
            item['star'] = movie.xpath(
                './/div[@class="star"]/span[1]/@class').extract_first()
            item['rate'] = movie.xpath(
                './/span[@class="rating_num"]/text()').extract_first()
            # BUG FIX: extract_first() returns None when the node is
            # missing; slicing None raised TypeError.  Strip the 3-char
            # trailing label (presumably '人评价' — confirm against the
            # live page) only when a value exists.
            pl = movie.xpath(
                './/div[@class="star"]/span[4]/text()').extract_first()
            item['pl'] = pl[:-3] if pl is not None else None
            item['quote'] = movie.xpath(
                './/p[@class="quote"]/span/text()').extract_first()
            # BUG FIX: indexing [1] raised IndexError when the info
            # block had fewer than two text nodes.
            bd_lines = movie.xpath('.//div[@class="bd"]/p/text()').extract()
            item['type'] = bd_lines[1].strip() if len(bd_lines) > 1 else None
            yield item

        # Absent on the last page, which ends the recursion.
        next_page = response.xpath('//span[@class="next"]/a/@href')
        if next_page:
            url = response.urljoin(next_page.extract_first())
            yield scrapy.Request(url, self.parse)
Esempio n. 8
    def parse_start_url(self, response):
        """Extract title, score, description text and tagline for each
        movie entry on a Top-250 listing page.
        """
        # One <li> per movie inside the ordered grid list.
        for movie in response.css("ol.grid_view li"):
            item = DoubanmovieItem()
            item['title'] = movie.css(".hd span::text").extract_first()
            item['score'] = movie.css(
                ".star span.rating_num::text").extract_first()

            # Normalize the description: remove spaces and non-breaking
            # spaces from every text fragment, then glue the fragments
            # together and trim surrounding newlines.
            fragments = [re.sub(r' |\xa0', '', text)
                         for text in movie.css(".bd p::text").extract()]
            item['content'] = ''.join(fragments).strip('\n')

            item['info'] = movie.css(".quote span::text").extract_first()
            yield item
Esempio n. 9
    def parse(self, response):
        """Parse one listing page into DoubanmovieItem objects, then
        follow the next-page link until the last page is reached.

        Cleanup: removed leftover debug residue from the original —
        ``print(type(item))``, a Chinese debug print, a placeholder
        ``logger.warning`` and three stray ``pass`` statements.
        """
        # One <div class="item"> per movie on the current page.
        for movie_sel in response.xpath('//div[@class="item"]'):
            movie = DoubanmovieItem()
            # XPath child indexes are 1-based.
            movie['rank'] = movie_sel.xpath(
                'div[@class="pic"]/em/text()').extract()
            movie['title'] = movie_sel.xpath(
                'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
            ).extract()
            # Rating value plus the number-of-raters text nodes.
            movie['star'] = movie_sel.xpath(
                'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
            ).extract()
            # Detail-page link.
            movie['link'] = movie_sel.xpath(
                'div[@class="pic"]/a/@href').extract()
            # Poster image URL.
            movie['img_url'] = movie_sel.xpath(
                'div[@class="pic"]/a/img/@src').extract()
            yield movie

        # Follow the next page; the link is absent on the last page.
        nextPageURL = response.xpath('//span[@class="next"]/a/@href').extract()
        if nextPageURL:
            url = response.urljoin(nextPageURL[-1])
            yield scrapy.Request(url, self.parse, dont_filter=False)
        else:
            logger.info('last page reached, stopping pagination')
Esempio n. 10
	def parse(self, response):
		"""Scrape one Douban movie detail page into a DoubanmovieItem.

		Collects basic metadata (name, year, poster), the key/value
		"info" table, rating statistics, engagement counters and the
		list of paid streaming providers.  When Youku is among the
		providers, additionally fetches Youku up/down votes and view
		count via helper methods, then yields the completed item.
		"""
		sel = Selector(response)
		content = response.xpath('//div[@id="content"]')
		# Movie id is the numeric path segment of the detail-page URL.
		movieId = re.findall(r'.*/(\d*)/', response.url)[0]
		# h1 holds two spans: title text first, year second.
		name = content.xpath('h1/span/text()').extract()[0]
		year = content.xpath('h1/span/text()').extract()[1]
		imageurl = content.xpath('.//div[@id="mainpic"]/a/@href').extract()[0]

		# Strip all HTML tags from the raw info block, then split the
		# remaining plain text into "key: value" lines to build a dict
		# (only lines containing ':' past position 0 are kept).
		info = content.xpath('.//div[@id="info"]').extract()[0]
		tags = re.compile('<[^>]+>')
		info = tags.sub("", info)
		info_dict = dict([line.strip().split(":", 1) for line in info.strip().split("\n") if line.strip().find(":") > 0])

		# Aggregate rating and total number of raters.
		content_right = content.xpath('.//div[@class="rating_wrap clearbox"]')
		rating = content_right.xpath('.//strong/text()').extract()[0]
		rating_people = content_right.xpath('.//a[@class="rating_people"]/span/text()').extract()[0]

		# Per-star percentage breakdown (list of percentage strings).
		rating_on_weight = content.xpath('.//div[@class="ratings-on-weight"]')
		rating_per = rating_on_weight.xpath('.//span[@class="rating_per"]/text()').extract()

		# Engagement counters.  The [3:] / [:-2] / [:-3] slices strip
		# fixed label characters around the number in the link text —
		# NOTE(review): slice widths assume the page's Chinese labels
		# are unchanged; confirm against the live markup.
		# actor_num = content.xpath('.//div[@id="celebrities"]/h2/span/a/text()').extract()[0][3:]
		commentary_num = content.xpath('.//div[@class="mod-hd"]/h2/span/a/text()').extract()[0][3:][:-2]
		review_num = content.xpath('.//section[@class="reviews mod movie-content"]/header/h2/span/a/text()').extract()[0][3:][:-2]
		watched_num = content.xpath('.//div[@class="subject-others-interests-ft"]/a/text()').extract()[0][:-3]
		wanted_num = content.xpath('.//div[@class="subject-others-interests-ft"]/a/text()').extract()[1][:-3]

		# Streaming providers.  video_flag is a 5-slot presence bitmap:
		# [Tencent, iQiyi, Mango TV, LeTV, Youku] (mapping fixed by the
		# elif chain below); video_source collects name/url/price per
		# provider entry.
		youku_up, youku_down, youku_vv = 0, 0, 0
		video_flag = [0, 0, 0, 0, 0]
		video_source = []
		links = content.xpath('.//div[@class="gray_ad"]/ul/li')
		videoId = ""
		for link in links:
			source = link.xpath('.//a/@data-cn').extract()[0]
			src = link.xpath('.//a/@href').extract()[0]
			price = link.xpath('.//span/span/text()').extract()[0].strip()
			if source=="腾讯视频":
				video_flag[0] = 1
			elif source=="爱奇艺视频":
				video_flag[1] = 1
			elif source=="芒果 TV":
				video_flag[2] = 1
			elif source=="乐视视频":
				video_flag[3] = 1
			elif source=="优酷视频":
				video_flag[4] = 1
				# Youku only: resolve the video id, query its up/down
				# vote counts, and kick off the index-page parse.
				# Best-effort — a TypeError from the helpers (e.g. a
				# failed match) leaves the zero defaults in place.
				try:
					videoId, youkuIndexUrl= self.youkuPageParse(src)
					infoUrl = "http://v.youku.com/action/getVideoPlayInfo?vid="+videoId+"&param%5B%5D=updown&callback=tuijsonp5"
					youku_up, youku_down, youku_vv = self.youkuInfoParse(infoUrl)
					self.youkuIndexParse(youkuIndexUrl, movieId)
				except TypeError as e:
					pass
			video_source.append(dict(source=source,src=src,price=price))

		# Assemble the item.  info_dict lookups use .get with "" so a
		# missing table row yields an empty string, never a KeyError.
		item = DoubanmovieItem()
		item['movieId'] = movieId
		item["url"] = response.url

		item["name"] = name
		item["year"] = year
		item["image"] = imageurl

		item["director"] = info_dict.get("导演", "").strip()
		item["writer"] = info_dict.get("编剧", "").strip()
		item["stars"] = info_dict.get("主演", "").strip()

		item["movietype"] = info_dict.get("类型", "").strip()
		item["country"] = info_dict.get("制片国家/地区", "").strip()
		item["language"] = info_dict.get("语言", "").strip()

		# TV shows use "首播" (first aired) / "单集片长" (episode length)
		# instead of the movie keys, hence the fallbacks.
		item["ontime"] = info_dict.get("上映日期", "").strip() if "上映日期" in info_dict else info_dict.get("首播", "")
		item["seasons"] = info_dict.get("季数", "").strip()
		item["clips"] = info_dict.get("集数", "").strip()
		item["time"] = info_dict.get("片长", "").strip() if "片长" in info_dict else info_dict.get("单集片长", "")

		item["nickname"] = info_dict.get("又名", "").strip()
		item["imdblink"] = "http://www.imdb.com/title/"+info_dict.get("IMDb链接", "").strip()

		item["rating"] = rating
		item["rating_people"] = rating_people
		item["rating_per"] = rating_per

		item["commentary_num"] = commentary_num
		item["review_num"] = review_num
		item["watched_num"] = watched_num
		item["wanted_num"] = wanted_num

		item["video_flag"] = video_flag
		item["video_source"] = video_source

		item["youku_up"] = youku_up
		item["youku_down"] = youku_down
		item["youku_vv"] = youku_vv
		item["youku_movie_id"] = videoId
		yield item