Beispiel #1
0
    def parse_post(self, response):
        """Parse a post detail page.

        Builds a ``PostItem`` from the page, then yields a request to the
        video API (carrying the item in ``meta['post']``) followed by a
        request for the first page of the post's comments.

        Raises:
            ValueError: if the page source does not contain exactly one
                match for the ``vid``, ``modeServerAppKey`` or
                ``article_id`` pattern.
        """
        # A missing counter node yields '' instead of None, so stripping the
        # thousands separators below cannot raise AttributeError.
        play_counts = response.css('.play-counts::text').extract_first(
            default='')
        like_counts = response.css('.like-counts::text').extract_first(
            default='')
        post = PostItem(
            title=response.css('.title::text').extract_first(),
            category='-'.join(response.css('.cate a::text').extract()).replace(
                '\t', '').replace('\n', ''),
            update_time=response.css('.update-time i::text').extract_first(),
            play_counts=play_counts.replace(',', ''),
            like_counts=like_counts.replace(',', ''),
            tags=', '.join(response.css('.tag-wrapper a::text').extract()),
            description=response.css(
                '.filmplay-info-desc p::text').extract_first())

        # Video information: vid and app key are pulled out of the page
        # source; the one-element unpacking fails unless there is exactly
        # one match.
        vid, = re.findall(r'var vid = "(\w+)";?', response.text)
        post['vid'] = vid
        app_key, = re.findall(r'var modeServerAppKey = "(\w+)";?',
                              response.text)
        url_format = 'http://openapi-vtom.vmovier.com/v3' \
                     '/video/%s?expand=resource&usage=xpc_web&appKey=%s'
        # e.g. https://openapi-vtom.vmovier.com/v3/video/5F0C33F784BB4?expand=resource&usage=xpc_web&appKey=61a2f329348b3bf77
        url = url_format % (vid, app_key)
        request = Request(url=url, callback=self.parse_video)
        # Hand the partially filled item to parse_video.
        request.meta['post'] = post
        yield request
        # Comment information
        # e.g. https://app.xinpianchang.com/comments?resource_id=10847731&type=article
        resource_id, = re.findall(r"article_id: '(\d+)'", response.text)
        url_format = 'http://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1'
        url = url_format % resource_id
        request = Request(url, callback=self.parse_comment)
        yield request
Beispiel #2
0
    def parse_post(self, response):
        """Scrape a post detail page.

        Yields the filled-in ``PostItem`` first and then a request for page 1
        of the post's comments, tagged with the post id and the page number.
        """

        def first(query):
            # First result of an XPath query on the page (None when empty).
            return response.xpath(query).get()

        post = PostItem()
        post['pid'] = response.meta['pid']
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = first('//div[@class="title-wrap"]/h3/text()')
        post['preview'] = first('//div[@class="filmplay"]//img/@src')
        post['video'] = first('//video[@id="xpc_video"]/@src')
        post['video_format'] = ''  # no format is extracted from the page
        post['category'] = first('//span[@class="cate v-center"]/text()')
        post['created_at'] = first(
            '//span[contains(@class,"update-time")]/i/text()')
        post['play_counts'] = first(
            '//i[contains(@class,"play-counts")]/@data-curplaycounts')
        post['like_counts'] = first(
            '//span[contains(@class,"like-counts")]/@data-counts')
        post['description'] = first('//p[contains(@class, "desc")]/text()')
        yield post

        # Comments: start at the first page and let parse_comment know which
        # post and which page it is looking at.
        first_page = 1
        pid = post['pid']
        yield Request(comment_api % (pid, first_page),
                      callback=self.parse_comment,
                      meta={'pid': pid, 'cur_page': first_page})
Beispiel #3
0
    def parse_post(self, response):
        """Parse a video detail page.

        Yields, in order: a video-API request carrying the ``PostItem`` in
        ``meta['post']``; a request for the first page of comments; and, for
        every creator listed on the page, a profile request followed by a
        ``CopyrightItem`` linking that creator to this post.
        """

        def first(query):
            # First result of an XPath query on the page (None when empty).
            return response.xpath(query).get()

        # Values handed over by the previous callback.
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = first('//div[@class="title-wrap"]/h3/text()')
        # Category names, joined with '-'.
        category_names = response.xpath(
            '//span[contains(@class, "cate")]/a/text()').getall()
        post['category'] = '-'.join(name.strip() for name in category_names)
        # Publication time, play count, like count.
        post['created_at'] = first(
            '//span[contains(@class, "update-time")]/i/text()')
        post['play_counts'] = first(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts')
        post['like_counts'] = first(
            '//span[contains(@class, "like-counts")]/@data-counts')
        post['description'] = strip(
            first('//p[contains(@class, "desc")]/text()'))

        # The vid is the key parameter of the video-source API; exactly one
        # match is required, otherwise the unpacking raises ValueError.
        [vid] = re.findall(r'vid: \"(\w+)\",', response.text)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
        yield Request(video_url % vid, callback=self.parse_video,
                      meta={'post': post})

        # Comment API: ajax=1 returns HTML, ajax=0 (or omitted) returns JSON.
        comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        yield Request(comment_url % pid, callback=self.parse_comment)

        # Creator profile pages, one per creator node on the page.
        composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        creator_nodes = response.xpath(
            '//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for node in creator_nodes:
            cid = node.xpath('./a/@data-userid').get()
            yield Request(composer_url % cid, callback=self.parse_composer,
                          meta={'cid': cid})

            # Creator <-> post relation; cid and pid together form the key.
            # The role is stored as well because it differs per post.
            link = CopyrightItem()
            link['pcid'] = '%s_%s' % (cid, pid)
            link['cid'] = cid
            link['pid'] = pid
            link['roles'] = node.xpath(
                './/span[contains(@class, "roles")]/text()').get()
            yield link
    def parse_post(self, response):
        """Parse a video detail page.

        Yields, in order: a video-API request carrying the ``PostItem`` in
        ``meta['post']``; a request for the first page of comments; and, for
        every creator listed on the page, a profile request followed by a
        ``CopyrightItem`` linking that creator to this post.
        """

        def first(query):
            # First result of an XPath query on the page (None when empty).
            return response.xpath(query).get()

        post = PostItem()
        pid = response.meta['pid']
        post['pid'] = pid
        post['title'] = first('//h3[contains(@class, "title")]/text()')
        post['thumbnail'] = response.meta['thumbnail']

        # Category: every text node under the category span, cleaned with
        # strip() and concatenated without a separator.
        category_texts = response.xpath(
            '//span[contains(@class, "cate")]//text()').getall()
        post['category'] = ''.join(strip(text) for text in category_texts)
        # Creation time, play count, like count, description.
        post['created_at'] = first(
            '//span[contains(@class, "update-time")]/i/text()')
        post['play_counts'] = first(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts')
        post['like_counts'] = first(
            '//span[contains(@class, "like-counts")]/@data-counts')
        post['description'] = strip(
            first('//p[contains(@class, "desc")]/text()'))

        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'
        # Exactly one vid must be present, otherwise the unpacking raises.
        [vid] = re.findall(r'vid: \"(\w+)\"', response.text)
        post['vid'] = vid
        yield Request(video_url % vid, callback=self.parse_video,
                      meta={'post': post})

        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1' % pid
        yield Request(comment_url, callback=self.parse_comments)

        # Creator profile pages.
        creator_nodes = response.xpath(
            '//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for node in creator_nodes:
            cid = node.xpath('./a/@data-userid').get()
            profile_href = node.xpath('./a/@href').get()
            # Cookies from profile pages are not merged, to avoid piling up
            # cookies such as visit_userid_10043764.
            yield response.follow(profile_href,
                                  callback=self.parse_composer,
                                  meta={'dont_merge_cookies': True})

            # Relation between the post and this creator.
            link = CopyrightItem()
            link['pcid'] = '%s-%s' % (pid, cid)
            link['pid'] = pid
            link['cid'] = cid
            link['roles'] = node.xpath(
                './/span[contains(@class, "roles")]/text()').get()
            yield link
Beispiel #5
0
    def parse_post(self, response):
        """Parse a post detail page.

        Yields, in order: the ``PostItem``; for each creator listed on the
        page a profile request followed by a ``CopyrightItem``; and finally a
        request for the first page of comments.

        Raises:
            ValueError: if ``meta['duration']`` does not split into at least
                two pieces on ``'`` or the pieces are not integers.
        """
        post = PostItem()
        pid = response.meta['pid']
        post['pid'] = pid
        post['thumbnail'] = response.meta['thumbnail']
        # The duration text is split on "'"; the first two pieces are read as
        # minutes and seconds and stored as a total number of seconds.
        minutes, seconds, *_ = response.meta['duration'].split("'")
        post['duration'] = int(minutes) * 60 + int(seconds)
        post['video'] = response.xpath('//video[@id="xpc_video"]/@src').get()
        # Preview image
        post['preview'] = response.xpath(
            '//div[@class="filmplay"]//img/@src').extract_first()
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').get()
        # Categories (possibly several), joined with '-'
        cates = response.xpath(
            '//span[contains(@class, "cate")]/a/text()').extract()
        post['category'] = '-'.join([strip(cate) for cate in cates])
        post['created_at'] = response.xpath(
            '//span[contains(@class, "update-time")]/i/text()').get()
        post['play_counts'] = response.xpath(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
        post['like_counts'] = response.xpath(
            '//span[contains(@class, "like-counts")]/@data-counts').get()
        post['description'] = strip(
            response.xpath('//p[contains(@class, "desc")]/text()').get())
        yield post

        creator_list = response.xpath(
            '//div[contains(@class, "filmplay-creator")]/ul/li')

        # Creator profile page template
        url = 'http://www.xinpianchang.com/u%s?from=articleList'
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            request = Request(url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request

            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (cid, pid)
            cr['cid'] = cid
            cr['pid'] = pid
            # The XPath must be relative ('.//'): an absolute '//' query is
            # evaluated against the whole document and would give every
            # creator the roles of the first one on the page.
            cr['roles'] = creator.xpath(
                './/span[contains(@class, "roles")]/text()').get()
            yield cr

        # Comment page
        comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        request = Request(comment_url % pid, callback=self.parse_comment)
        request.meta['pid'] = pid
        yield request
Beispiel #6
0
    def parse_detail(self, response):
        """Parse a video detail page.

        Yields, in order: the ``PostItem``; one profile request per creator
        link found on the page; and a request for the first page of comments.
        """
        postitem = PostItem()
        # Post id, title and small thumbnail come from the previous callback.
        postitem['pid'] = response.meta['post_id']
        postitem['title'] = response.meta['title']
        postitem['thumbnail'] = response.meta['thumbnail']
        # Large preview image
        postitem['preview'] = response.xpath(
            '//div[@class="filmplay"]//img/@src').extract_first()
        # Video address: the first two characters of the href are dropped
        # (presumably the '//' of a protocol-relative URL - TODO confirm).
        video_url = response.xpath('//a[@id="player"]/@href').get()
        if video_url:
            video_url = video_url[2:]
        postitem['video'] = video_url
        # Video format: the text after the last '.' of the URL. Without a URL
        # the format is left empty rather than the literal string 'None' that
        # str(None) would produce.
        video_format = video_url.split('.')[-1] if video_url else ''
        postitem['video_format'] = video_format
        # Category
        category = response.xpath(
            '//div[contains(@class,"filmplay-intro")]/span/text()').get()
        postitem['category'] = category
        # Creation time
        created_at = response.xpath(
            '//span[contains(@class,"update-time")]/i/text()').get()
        postitem['created_at'] = created_at
        # Play count
        play_counts = response.xpath(
            '//i[contains(@class,"play-counts")]/text()').get()
        postitem['play_counts'] = num_to_int(play_counts)
        # Like count
        like_counts = response.xpath(
            '//span[contains(@class,"like-counts")]/text()').get()
        postitem['like_counts'] = num_to_int(like_counts)
        # Description
        description = response.xpath(
            '//p[contains(@class,"desc")]/text()').get()
        postitem['description'] = description
        yield postitem
        # Follow every creator link to the creator's profile page.
        composers = response.xpath('//a[@class="head-wrap"]')
        for one in composers:
            composer_url = one.xpath('./@href').get()
            crequest = scrapy.Request('http://www.xinpianchang.com' +
                                      composer_url,
                                      callback=self.parse_composer)
            crequest.meta['composer_id'] = one.xpath('./@data-userid').get()
            yield crequest

        # Request the first page of this post's comments.
        request = scrapy.Request(comment_api % (1, postitem['pid']),
                                 callback=self.parse_comment)
        request.meta['pid'] = postitem['pid']
        request.meta['page'] = 1
        yield request
Beispiel #7
0
    def parse_post(self, response):
        """Parse a post detail page.

        Yields, in order: the ``PostItem``; for each creator listed on the
        page a profile request followed by a ``CopyrightItem``; and finally a
        request for the post's comments.
        """
        post = PostItem()
        post['pid'] = response.meta['pid']
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').get()
        post['video'] = response.xpath('//video[@id="xpc_video"]/@src').get()
        post['video_format'] = ''
        # Preview image
        post['preview'] = response.xpath(
            '//div[@class="filmplay"]//img/@src').get()
        # Category
        post['category'] = response.xpath(
            '//span[@class="cate v-center"]/text()').get()
        post['created_at'] = response.xpath(
            '//span[contains(@class,"update-time")]/i/text()').get()
        post['play_counts'] = ci(
            response.xpath(
                '//i[contains(@class,"play-counts")]/@data-curplaycounts').get(
                ))
        # The class name must be a quoted string literal. Unquoted, XPath
        # reads like-counts as a child-element test that converts to '', and
        # contains(@class, '') is true for every span.
        post['like_counts'] = ci(
            response.xpath(
                '//span[contains(@class,"like-counts")]/@data-counts').get())
        post['description'] = response.xpath(
            '//p[contains(@class,"desc")]/text()').get()

        yield post

        creator_list = response.xpath(
            '//div[contains(@class,"filmplay-creator")]/ul[@class="creator-list"]/li'
        )
        for creator in creator_list:
            user_page = creator.xpath('./a/@href').get()
            user_id = creator.xpath('./a/@data-userid').get()
            request = Request('%s%s' % (self.root_url, user_page),
                              callback=self.parse_composer)
            request.meta['cid'] = user_id
            yield request

            # Relation between this post and the creator.
            cr = CopyrightItem()
            cr['pid'] = response.meta['pid']
            cr['cid'] = user_id
            cr['pcid'] = '%s_%s' % (cr['pid'], cr['cid'])
            cr['roles'] = creator.xpath(
                './/span[contains(@class,"roles")]/text()').get()
            yield cr

        # Fill the post id into comment_api and let parse_comment handle the
        # response.
        request = Request(comment_api % post['pid'],
                          callback=self.parse_comment)
        request.meta['pid'] = post['pid']
        yield request
Beispiel #8
0
    def parse_post(self, response):
        """Parse a post detail page.

        Yields, in order: the ``PostItem``; for each composer listed on the
        page a ``CopyrightItem`` followed by a profile request; and finally a
        single request for the first page of comments.
        """
        post = PostItem()
        post['preview'] = response.xpath(
            '//div[@class="filmplay"]//img/@src').extract_first()
        post['pid'] = response.meta['pid']
        post['thumbnail'] = response.meta['thumbnail']  # thumbnail image
        post['video'] = response.xpath(
            '//video[@id="xpc_video"]/@src').get()  # video link
        post['title'] = response.xpath(
            '//*[@class="title-wrap"]/h3/text()').get()  # title
        post['category'] = response.xpath(
            '//*[@class="cate v-center"]/text()').get()
        vf = response.xpath('//*[@class="video-format v-center"]/text()').get()
        post['video_format'] = vf.strip() if vf else ""
        post['created_at'] = response.xpath(
            '//*[@class="update-time v-center"]//text()').get()
        # Counters are rendered with thousands separators. A missing node
        # becomes '' (as for the description) instead of raising
        # AttributeError on None.
        post['play_counts'] = (response.xpath(
            '//i[contains(@class,"play-counts")]/text()').get()
                               or '').replace(',', '')
        post['like_counts'] = (response.xpath(
            '//span[contains(@class,"like-counts")]/text()').get()
                               or '').replace(',', '')
        post['description'] = response.xpath(
            '//p[contains(@class,"desc")]/text()').get() or ''
        yield post
        self.logger.info('scraped post(%s): %s', post['pid'], post['title'])

        # One post has many composers (one-to-many relation).
        compose_url = "http://www.xinpianchang.com/u%s"
        composer_list = response.xpath(
            '//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for composer in composer_list:
            cid = composer.xpath('./a/@data-userid').get()
            relation = {
                'pcid': '%s_%s' % (post['pid'], cid),
                'pid': post['pid'],
                'cid': cid,
                'roles': composer.xpath(
                    './/span[contains(@class,"roles")]/text()').get(),
            }
            yield CopyrightItem(relation)
            request = Request(compose_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request

        # Comments are requested once per post, outside the composer loop, so
        # the request is also issued when the page lists no composers.
        comment_api = "http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&page=1"
        yield response.follow(comment_api % post['pid'],
                              callback=self.parse_comment)
Beispiel #9
0
    def parse_post(self, response):
        """Parse a post detail page.

        Yields, in order: a video-API request carrying the ``PostItem`` in
        ``meta['post']``; a request for the first page of comments; and, for
        each creator block on the page, a profile request followed by a
        ``CopyrightItem``.

        Raises:
            ValueError: if the ``vid`` pattern, a creator link, or the user
                id inside a creator link does not match exactly once.
        """
        pid = response.meta['pid']
        post = PostItem(pid=pid)
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').get()  # first match or None
        # Raw string: '\w' is not a valid escape in a normal string literal.
        vid, = re.findall(r'vid: "(\w+)",', response.text)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'
        post['category'] = ''.join([
            _.strip() for _ in response.xpath(
                '//span[contains(@class,"cate")]//text()').extract()
        ])
        post['created_at'] = response.xpath(
            '//span[contains(@class,"update-time")]/i/text()').get()
        post['play_counts'] = convert_int(
            response.xpath('//i[contains(@class,"play-count")]/text()').get())
        post['like_counts'] = convert_int(
            response.xpath(
                '//span[contains(@class,"like-counts")]/text()').get())
        post['description'] = strip(
            response.xpath('//p[contains(@class, "desc")]/text()').get())

        # The item is not yielded here: it travels in meta to parse_video,
        # which receives the video-API response for this vid.
        # e.g. <GET https://openapi-vtom.vmovier.com/v3/video/5EDE53C61C155?expand=resource&usage=xpc_web>
        request = Request(video_url % vid, callback=self.parse_video)
        request.meta['post'] = post
        yield request

        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24'
        request = Request(comment_url % pid, callback=self.parse_comment)
        request.meta['pid'] = pid
        yield request

        creator_list = response.xpath('//div[@class="creator-info"]')
        for creator in creator_list:
            c_url, = creator.xpath('./a/@href').extract()
            # The user id is taken from the profile link ('/u<digits>?').
            cid, = re.findall(r'/u(\d+)\?', c_url)
            request = response.follow('https://www.xinpianchang.com' + c_url,
                                      self.parse_composer)
            request.meta['cid'] = cid
            request.meta['dont_merge_cookies'] = True
            yield request

            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (pid, cid)
            cr['pid'] = pid
            # Store the creator id too; without it the relation row only
            # carries it inside the composite key.
            # NOTE(review): assumes CopyrightItem declares a 'cid' field.
            cr['cid'] = cid
            cr['roles'] = creator.xpath(
                './a/following-sibling::span[1]/text()').get()
            yield cr
Beispiel #10
0
    def parse_post(self, response):
        """Parse a video detail page.

        Yields, in order: the ``PostItem``; for each creator listed on the
        page a profile request followed by a ``CopyrightItem``; and finally a
        request for the post's comments.
        """

        def first(query):
            # First result of an XPath query on the page (None when empty).
            return response.xpath(query).get()

        post = PostItem()
        post['pid'] = response.meta['pid']  # video id set by the caller
        post['title'] = first('//div[@class="title-wrap"]/h3/text()')
        post['thumbnail'] = response.meta['thumbnail']
        post['video_format'] = ''
        post['preview'] = first('//div[@class="filmplay"]//img/@src')

        # Category and video address
        post['category'] = first('//span[@class="cate v-center"]/text()')
        post['video'] = first('//video[@id="xpc_video"]/@src')
        # Play and like counters are passed through convert_int
        raw_plays = first(
            '//i[contains(@class,"play-counts")]/@data-curplaycounts')
        post['play_counts'] = convert_int(raw_plays)
        raw_likes = first(
            '//span[contains(@class,"like-counts")]/@data-counts')
        post['like_counts'] = convert_int(raw_likes)
        # Description and creation time
        post['description'] = first('//p[contains(@class,"desc")]/text()')
        post['created_at'] = first(
            '//span[contains(@class,"update-time")]/i/text()')
        yield post

        # Creator list
        creator_nodes = response.xpath(
            '//div[contains(@class,"filmplay-creator")]/ul[@class="creator-list"]/li'
        )
        for node in creator_nodes:
            profile_path = node.xpath('./a/@href').get()  # profile address
            creator_id = node.xpath('./a/@data-userid').get()  # creator id
            # Root URL + profile path gives the creator's home page.
            yield Request('%s%s' % (self.root_url, profile_path),
                          callback=self.parse_composer,
                          meta={'cid': creator_id})

            # Associates the creator id with the video id.
            link = CopyrightItem()
            link['pid'] = response.meta['pid']
            link['cid'] = creator_id
            link['pcid'] = '%s_%s' % (link['pid'], link['cid'])
            link['roles'] = node.xpath(
                './div[@class="creator-info"]/span/text()').get()
            yield link

        pid = post['pid']
        yield Request(comment_api % pid, callback=self.parse_comment,
                      meta={'pid': pid})
Beispiel #11
0
    def parse_post(self, response):
        """Parse a video detail page.

        Yields, in order: a video-API request carrying the ``PostItem`` in
        ``meta['post']``; a request for the first page of comments; and, for
        every composer listed on the page, a profile request followed by a
        ``CopyrightItem``.

        Raises:
            ValueError: if the ``vid`` pattern does not match exactly once.
        """
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = response.xpath(
            '//div[@class="title-wrap"]/h3/text()').extract_first()
        cates = response.xpath(
            '//span[contains(@class, "cate")]/a/text()').extract()
        post['category'] = '-'.join([cate.strip() for cate in cates])
        post['created_at'] = response.xpath(
            '//span[contains(@class, "update-time")]/i/text()').get()
        post['play_counts'] = response.xpath(
            '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
        post['like_counts'] = response.xpath(
            '//span[contains(@class, "like-counts")]/@data-counts').get()
        post['description'] = strip(response.xpath(
            '//p[contains(@class, "desc")]/text()').get())

        # The vid found in the page source is filled into the video API URL.
        vid, = re.findall(r'vid: \"(\w+)\",', response.text)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
        request = Request(video_url % vid, callback=self.parse_video)
        request.meta['post'] = post
        yield request

        # Comments
        comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        request = Request(comment_url % pid, callback=self.parse_comment)
        yield request

        # Composer profile pages. The template must be bound to the name the
        # loop uses; it was previously assigned to comment_url, leaving
        # composer_url undefined.
        composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        composer_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for composer in composer_list:
            cid = composer.xpath('./a/@data-userid').get()
            request = Request(composer_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request

            # Relation between the composer and this post.
            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (cid, pid)
            cr['cid'] = cid
            cr['pid'] = pid
            cr['roles'] = composer.xpath('.//span[contains(@class,"roles")]/text()').get()
            yield cr
Beispiel #12
0
    def parse_post(self, response):
        """Parse a video detail page.

        Yields, in order: a video-API request carrying the ``PostItem`` in
        ``meta['post']``; a request for the first page of comments; and, for
        every creator listed on the page, a profile request followed by a
        ``CopyrightItem``.

        Raises:
            ValueError: if the ``vid`` pattern does not match exactly once.
        """
        pid = response.meta['pid']
        post = PostItem(pid=pid)
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # The video address is loaded dynamically, so the vid is taken from
        # the page source. re.findall returns a list; the trailing comma
        # unpacks its single element. Raw string: '\w' is not a valid escape
        # in a normal string literal.
        vid, = re.findall(r'vid: "(\w+)",', response.text)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'
        # .get() stores the normalized category text as a string, like the
        # description below, instead of a one-element list.
        post['category'] = response.xpath('normalize-space(string(//span[@class="cate v-center"]))').get()
        post['created_at'] = response.xpath('//span[@class="update-time v-center"]/i/text()').get()
        post['play_counts'] = convert_int(response.xpath('//i[contains(@class,"play-counts")]/@data-curplaycounts').get())
        post['like_counts'] = convert_int(response.xpath('//span[contains(@class,"like-counts")]/@data-counts').get())
        post['description'] = response.xpath('normalize-space(string(//p[contains(@class,"desc")]))').get()
        # This requests the video API URL, not the video file itself; the
        # item travels in meta so parse_video can read it.
        request = Request(video_url % vid, callback=self.parse_video)
        request.meta['post'] = post
        yield request

        # Comment section of the page
        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24'
        request = Request(comment_url % pid, callback=self.parse_comment)
        request.meta['pid'] = pid
        yield request

        # Creator information
        creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul/li')
        composer_url = 'https://www.xinpianchang.com/u%s?from=articleList'
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            request = response.follow(composer_url % cid, self.parse_composer)
            request.meta['cid'] = cid
            request.meta['dont_merge_cookies'] = True
            yield request

            # Relation between this post and the creator.
            cr = CopyrightItem()
            cr['pcid'] = '%s_%s' % (pid, cid)
            cr['pid'] = pid
            cr['cid'] = cid
            cr['roles'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            yield cr
Beispiel #13
0
    def parse_post(self, response):
        """Parse a post detail page.

        Yields, in order: the ``PostItem``; a request for the first page of
        comments; and, for every creator listed on the page, a profile
        request followed by a ``CopyrightItem``.
        """
        # Post information
        post = PostItem()
        post['pid'] = response.meta['pid']
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').extract_first()
        post['thumbnail'] = response.meta['thumbnail']
        post['preview'] = strip(response.xpath('//div[@class="filmplay"]//img/@src').extract_first())
        # Fall back to the second player markup when the first finds nothing.
        video = response.xpath('//video[@id="xpc_video"]/@src') or response.xpath('//div[@class="td-player"]//video/@src')
        post['video'] = video.extract_first()
        post['video_format'] = strip(response.xpath('//span[contains(@class, "video-format")]/text()').extract_first())
        duration = response.meta['duration']
        if duration:
            # Convert the duration text to seconds: quotes are removed and the
            # rest is split on a space into minutes and seconds
            # (e.g. "01' 51''" -> 111).
            duration = [int(i) for i in duration.replace("'", "").split(' ')]
            post['duration'] = duration[0] * 60 + duration[1]
        post['category'] = response.xpath('//span[@class="cate v-center"]/text()').extract_first()
        post['created_at'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').extract_first()
        post['play_counts'] = ci(response.xpath('//i[contains(@class,"play-counts")]/@data-curplaycounts').extract_first())
        # The like counter's @data-counts is read from a <span>; only the
        # play counter is looked up on an <i> element.
        post['like_counts'] = ci(response.xpath('//span[contains(@class,"like-counts")]/@data-counts').extract_first())
        post['description'] = strip(response.xpath('//p[contains(@class,"desc")]/text()').extract_first(default=''))
        yield post

        # Comments
        comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi/id-%s/page-1'
        request = Request(comment_api % (post['pid']), callback=self.parse_comment)
        yield request

        composer_url = 'http://www.xinpianchang.com/u%s'
        for elem in response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li'):
            cid = elem.xpath('.//a[@class="head-wrap"]/@data-userid').extract_first()
            # Creator profile
            request = Request(composer_url % cid, callback=self.parse_composer)
            request.meta['cid'] = cid
            yield request

            # Copyright (post <-> creator) information
            cr = CopyrightItem()
            cr['pid'] = post['pid']
            cr['cid'] = cid
            cr['pcid'] = '%s_%s' % (cr['pid'], cid)
            cr['roles'] = elem.xpath('.//span[contains(@class, "roles")]/text()').extract_first()
            yield cr
Beispiel #14
0
    def parse_post(self, response):
        """Handle a video detail page.

        Yields, in order: the ``PostItem``; for every composer listed on the
        page a ``CopyrightItem`` followed by a profile request; and finally a
        request for the first page of comments.
        """

        def first(query):
            # First result of an XPath query on the page (None when empty).
            return response.xpath(query).get()

        post = PostItem()
        # Video id set by the previous page's callback.
        pid = response.meta['pid']
        post['pid'] = pid
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = first('//div[@class="title-wrap"]/h3/text()')
        # Preview image, i.e. the picture shown when the page is opened.
        post['preview'] = first('//div[@class="filmplay"]//img/@src')
        # Video URL
        post['video'] = first('//a[@id="player"]/@href')
        # Category: all text nodes, stripped and concatenated.
        category_texts = response.xpath(
            '//span[contains(@class,"cate")]//text()').getall()
        post['category'] = ''.join(text.strip() for text in category_texts)
        # Publication time, play count, like count.
        post['created_at'] = first(
            '//span[contains(@class,"update-time")]/i/text()')
        post['play_counts'] = first(
            '//i[contains(@class,"play-counts")]/@data-curplaycounts')
        post['like_counts'] = first(
            '//span[contains(@class,"like-counts")]/@data-counts')
        # Duration; the raw format looks like: 01' 51''
        raw_duration = response.meta['duration']
        if raw_duration:
            mins, secs, *_rest = raw_duration.split("'")
            post['duration'] = int(mins) * 60 + int(secs)
        post['description'] = first('//p[contains(@class, "desc")]/text()')
        post['video_format'] = '1080p'
        yield post

        # Profile page template and the composer nodes of this video.
        composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
        composer_nodes = response.xpath(
            '//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for node in composer_nodes:
            link = CopyrightItem()
            cid = node.xpath('./a/@data-userid').get()
            link['pcid'] = '%s_%s' % (cid, pid)
            link['cid'] = cid
            link['pid'] = pid
            # A composer's role differs from video to video.
            link['roles'] = node.xpath(
                './/span[contains(@class, "roles")]/text()').get()
            yield link
            # Request the composer's profile page.
            yield Request(composer_url % cid, callback=self.parse_composer,
                          meta={'cid': cid})

        # Comment API request.
        comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
        yield Request(comment_url % pid, callback=self.parse_comment)