def parse_post(self, response):
    """Parse a video detail page.

    Yields, in order: a request to the video-info API whose
    ``meta['post']`` carries the partially filled ``PostItem``; a request
    for page 1 of the comment API; and, for each creator node, a
    composer-page request followed by a ``CopyrightItem`` linking that
    creator to this post.
    """
    # Values passed along by the callback that scheduled this request.
    pid = response.meta['pid']
    item = PostItem()
    item['pid'] = pid
    item['thumbnail'] = response.meta['thumbnail']

    title_sel = response.xpath('//div[@class="title-wrap"]/h3/text()')
    item['title'] = title_sel.extract_first()

    # Category labels: strip each one, then join them with "-".
    labels = response.xpath(
        '//span[contains(@class, "cate")]/a/text()').extract()
    item['category'] = '-'.join(label.strip() for label in labels)

    published_sel = response.xpath(
        '//span[contains(@class, "update-time")]/i/text()')
    item['created_at'] = published_sel.get()

    plays_sel = response.xpath(
        '//i[contains(@class, "play-counts")]/@data-curplaycounts')
    item['play_counts'] = plays_sel.get()

    likes_sel = response.xpath(
        '//span[contains(@class, "like-counts")]/@data-counts')
    item['like_counts'] = likes_sel.get()

    desc_text = response.xpath('//p[contains(@class, "desc")]/text()').get()
    item['description'] = strip(desc_text)

    # ``vid`` is the key parameter for the video-source API. The
    # one-element unpacking raises ValueError unless the page source
    # contains exactly one match.
    (vid,) = re.findall(r'vid: \"(\w+)\",', response.text)
    video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
    yield Request(video_url % vid, callback=self.parse_video,
                  meta={'post': item})

    # Comment API. Original author's note: ajax=1 returns HTML, while
    # ajax=0 (or omitting it) returns JSON.
    comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
    yield Request(comment_url % pid, callback=self.parse_comment)

    # One <li> per creator credited on this post.
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
    creator_nodes = response.xpath(
        '//div[@class="user-team"]//ul[@class="creator-list"]/li')
    for node in creator_nodes:
        cid = node.xpath('./a/@data-userid').get()
        yield Request(composer_url % cid, callback=self.parse_composer,
                      meta={'cid': cid})

        # Post/creator relation; cid and pid together form the key. The
        # role is stored because it differs per creator and per post.
        roles = node.xpath(
            './/span[contains(@class, "roles")]/text()').get()
        yield CopyrightItem(
            pcid='%s_%s' % (cid, pid),
            cid=cid,
            pid=pid,
            roles=roles,
        )
def parse_post(self, response):
    """Parse a video detail page into a post, its comments and its creators.

    Yields a video-API request (with the ``PostItem`` in ``meta['post']``),
    a comment-API request, and then for each creator a followed profile
    request plus a ``CopyrightItem``.
    """
    item = PostItem()
    pid = response.meta['pid']
    item['pid'] = pid

    # Video title.
    title_sel = response.xpath('//h3[contains(@class, "title")]/text()')
    item['title'] = title_sel.extract_first()

    # Thumbnail handed over by the previous callback.
    item['thumbnail'] = response.meta['thumbnail']

    # Category: every text node under the "cate" span, cleaned with the
    # project's ``strip`` helper and concatenated without a separator.
    fragments = response.xpath(
        '//span[contains(@class, "cate")]//text()').extract()
    item['category'] = ''.join(strip(fragment) for fragment in fragments)

    # Creation time.
    created_sel = response.xpath(
        '//span[contains(@class, "update-time")]/i/text()')
    item['created_at'] = created_sel.get()

    # Play count.
    plays_sel = response.xpath(
        '//i[contains(@class, "play-counts")]/@data-curplaycounts')
    item['play_counts'] = plays_sel.get()

    # Like count.
    likes_sel = response.xpath(
        '//span[contains(@class, "like-counts")]/@data-counts')
    item['like_counts'] = likes_sel.get()

    # Description.
    desc_text = response.xpath('//p[contains(@class, "desc")]/text()').get()
    item['description'] = strip(desc_text)

    video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'
    # Exactly one ``vid`` must be present in the page source; otherwise
    # the one-element unpacking raises ValueError.
    (vid,) = re.findall(r'vid: \"(\w+)\"', response.text)
    item['vid'] = vid
    yield Request(video_url % vid, callback=self.parse_video,
                  meta={'post': item})

    comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1' % pid
    yield Request(comment_url, callback=self.parse_comments)

    # Crawl the creator profile pages.
    creator_nodes = response.xpath(
        '//div[@class="user-team"]//ul[@class="creator-list"]/li')
    for node in creator_nodes:
        cid = node.xpath('./a/@data-userid').get()
        profile_href = node.xpath('./a/@href').get()
        # Do not merge cookies for this page, to stop cookies such as
        # visit_userid_10043764 from piling up.
        yield response.follow(profile_href, callback=self.parse_composer,
                              meta={'dont_merge_cookies': True})

        # Relation between the post and this creator.
        roles = node.xpath(
            './/span[contains(@class, "roles")]/text()').get()
        yield CopyrightItem(
            pcid='%s-%s' % (pid, cid),
            pid=pid,
            cid=cid,
            roles=roles,
        )
def parse_post(self, response):
    """Parse a video detail page.

    Yields the filled ``PostItem``, then for every creator node a
    composer-page request followed by a ``CopyrightItem``, and finally a
    request for page 1 of the comment API.

    Reads ``pid``, ``thumbnail`` and ``duration`` from ``response.meta``.
    """
    post = PostItem()
    pid = response.meta['pid']
    post['pid'] = pid
    post['thumbnail'] = response.meta['thumbnail']

    # The duration is split on "'" and the first two pieces are used as
    # minutes and seconds; any further pieces are ignored.
    minutes, seconds, *_ = response.meta['duration'].split("'")
    post['duration'] = int(minutes) * 60 + int(seconds)

    post['video'] = response.xpath('//video[@id="xpc_video"]/@src').get()
    # Preview image.
    post['preview'] = response.xpath(
        '//div[@class="filmplay"]//img/@src').extract_first()
    post['title'] = response.xpath(
        '//div[@class="title-wrap"]/h3/text()').get()

    # A post can belong to several categories; join them with "-".
    cates = response.xpath(
        '//span[contains(@class, "cate")]/a/text()').extract()
    post['category'] = '-'.join([strip(cate) for cate in cates])

    post['created_at'] = response.xpath(
        '//span[contains(@class, "update-time")]/i/text()').get()
    post['play_counts'] = response.xpath(
        '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
    post['like_counts'] = response.xpath(
        '//span[contains(@class, "like-counts")]/@data-counts').get()
    post['description'] = strip(
        response.xpath('//p[contains(@class, "desc")]/text()').get())
    yield post

    creator_list = response.xpath(
        '//div[contains(@class, "filmplay-creator")]/ul/li')
    # Creator detail page template.
    url = 'http://www.xinpianchang.com/u%s?from=articleList'
    for creator in creator_list:
        cid = creator.xpath('./a/@data-userid').get()
        request = Request(url % cid, callback=self.parse_composer)
        request.meta['cid'] = cid
        yield request

        cr = CopyrightItem()
        cr['pcid'] = '%s_%s' % (cid, pid)
        cr['cid'] = cid
        cr['pid'] = pid
        # The leading "." keeps the query relative to this creator's <li>.
        # A bare "//span" would search the whole document and hand every
        # creator the first roles span on the page.
        cr['roles'] = creator.xpath(
            './/span[contains(@class, "roles")]/text()').get()
        yield cr

    # Comment API, first page.
    comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
    request = Request(comment_url % pid, callback=self.parse_comment)
    request.meta['pid'] = pid
    yield request
def parse_post(self, response):
    """Parse a video detail page.

    Yields the filled ``PostItem``, then for every creator a profile-page
    request followed by a ``CopyrightItem``, and finally a request for the
    comment API.

    Relies on names defined outside this block: ``ci`` (applied to the
    raw count strings), ``comment_api`` (formatted with the post id) and
    ``self.root_url`` (prefixed to each creator's href).
    """
    post = PostItem()
    pid = response.meta['pid']
    post['pid'] = pid
    post['thumbnail'] = response.meta['thumbnail']
    post['title'] = response.xpath(
        '//div[@class="title-wrap"]/h3/text()').get()
    post['video'] = response.xpath('//video[@id="xpc_video"]/@src').get()
    post['video_format'] = ''
    # Preview image.
    post['preview'] = response.xpath(
        '//div[@class="filmplay"]//img/@src').get()
    # Category.
    post['category'] = response.xpath(
        '//span[@class="cate v-center"]/text()').get()
    post['created_at'] = response.xpath(
        '//span[contains(@class,"update-time")]/i/text()').get()
    post['play_counts'] = ci(
        response.xpath(
            '//i[contains(@class,"play-counts")]/@data-curplaycounts').get())
    # The class name must be a quoted string literal. Unquoted,
    # ``like-counts`` is an element node test that yields an empty string,
    # and contains(@class, '') is true for every span.
    post['like_counts'] = ci(
        response.xpath(
            '//span[contains(@class,"like-counts")]/@data-counts').get())
    post['description'] = response.xpath(
        '//p[contains(@class,"desc")]/text()').get()
    yield post

    creator_list = response.xpath(
        '//div[contains(@class,"filmplay-creator")]/ul[@class="creator-list"]/li'
    )
    for creator in creator_list:
        user_page = creator.xpath('./a/@href').get()
        user_id = creator.xpath('./a/@data-userid').get()
        request = Request('%s%s' % (self.root_url, user_page),
                          callback=self.parse_composer)
        request.meta['cid'] = user_id
        yield request

        cr = CopyrightItem()
        cr['pid'] = pid
        cr['cid'] = user_id
        cr['pcid'] = '%s_%s' % (cr['pid'], cr['cid'])
        cr['roles'] = creator.xpath(
            './/span[contains(@class,"roles")]/text()').get()
        yield cr

    # Format the post id into ``comment_api`` and let parse_comment
    # handle the response; the id travels along in meta.
    request = Request(comment_api % pid, callback=self.parse_comment)
    request.meta['pid'] = pid
    yield request
def parse_post(self, response):
    """Parse a video detail page.

    Yields the filled ``PostItem``; then, for every creator node, a
    ``CopyrightItem`` followed by a composer-page request; and finally a
    followed request for the comment API.

    ``play_counts`` and ``like_counts`` are the node text with thousands
    separators removed, or ``None`` when the node is absent.
    """
    post = PostItem()
    post['preview'] = response.xpath(
        '//div[@class="filmplay"]//img/@src').extract_first()
    post['pid'] = response.meta['pid']
    post['thumbnail'] = response.meta['thumbnail']  # image
    post['video'] = response.xpath(
        '//video[@id="xpc_video"]/@src').get()  # video link
    post['title'] = response.xpath(
        '//*[@class="title-wrap"]/h3/text()').get()  # title
    post['category'] = response.xpath(
        '//*[@class="cate v-center"]/text()').get()

    vf = response.xpath('//*[@class="video-format v-center"]/text()').get()
    post['video_format'] = vf.strip() if vf else ""
    post['created_at'] = response.xpath(
        '//*[@class="update-time v-center"]//text()').get()

    # ``.get()`` returns None when the node is missing, so guard before
    # calling ``.replace`` instead of failing with AttributeError.
    play_text = response.xpath(
        '//i[contains(@class,"play-counts")]/text()').get()
    post['play_counts'] = (
        play_text.replace(',', '') if play_text is not None else None)
    like_text = response.xpath(
        '//span[contains(@class,"like-counts")]/text()').get()
    post['like_counts'] = (
        like_text.replace(',', '') if like_text is not None else None)

    post['description'] = response.xpath(
        '//p[contains(@class,"desc")]/text()').get() or ''
    yield post
    # Lazy %-args: the message is only formatted if the record is emitted.
    self.logger.info('scraped post(%s): %s', post['pid'], post['title'])

    # One post has many creators (one-to-many).
    compose_url = "http://www.xinpianchang.com/u%s"
    composer_list = response.xpath(
        '//div[@class="user-team"]//ul[@class="creator-list"]/li')
    for composer in composer_list:
        cid = composer.xpath('./a/@data-userid').get()
        # Named ``relation`` rather than ``copyright`` so the builtin of
        # that name is not shadowed.
        relation = {
            'pcid': '%s_%s' % (post['pid'], cid),
            'pid': post['pid'],
            'cid': cid,
            'roles': composer.xpath(
                './/span[contains(@class,"roles")]/text()').get(),
        }
        yield CopyrightItem(relation)

        request = Request(compose_url % cid, callback=self.parse_composer)
        request.meta['cid'] = cid
        yield request

    comment_api = "http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&page=1"
    yield response.follow(comment_api % post['pid'],
                          callback=self.parse_comment)
def parse_post(self, response):
    """Parse a video detail page.

    Yields a video-API request (with the partially filled ``PostItem`` in
    ``meta['post']``), a comment-API request (with ``meta['pid']``), and
    then for every ``creator-info`` node a followed composer-page request
    plus a ``CopyrightItem``.

    Raises:
        ValueError: if the page source does not contain exactly one
            ``vid``, or a creator node does not yield exactly one href
            or exactly one numeric user id.
    """
    pid = response.meta['pid']
    post = PostItem(pid=pid)
    post['thumbnail'] = response.meta['thumbnail']
    post['title'] = response.xpath(
        '//div[@class="title-wrap"]/h3/text()').get()

    # Raw string avoids invalid-escape warnings; it matches the same text
    # as before: ``vid: "<word chars>",``.
    vid, = re.findall(r'vid: "(\w+)",', response.text)
    video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'

    post['category'] = ''.join([
        text.strip() for text in response.xpath(
            '//span[contains(@class,"cate")]//text()').extract()
    ])
    post['created_at'] = response.xpath(
        '//span[contains(@class,"update-time")]/i/text()').get()
    post['play_counts'] = convert_int(
        response.xpath('//i[contains(@class,"play-count")]/text()').get())
    post['like_counts'] = convert_int(
        response.xpath(
            '//span[contains(@class,"like-counts")]/text()').get())
    post['description'] = strip(
        response.xpath('//p[contains(@class, "desc")]/text()').get())

    # The video address needs one more request; the post travels in meta
    # so that parse_video receives it.
    request = Request(video_url % vid, callback=self.parse_video)
    request.meta['post'] = post
    yield request

    comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24'
    request = Request(comment_url % pid, callback=self.parse_comment)
    request.meta['pid'] = pid
    yield request

    creator_list = response.xpath('//div[@class="creator-info"]')
    for creator in creator_list:
        c_url, = creator.xpath('./a/@href').extract()
        # The user id is the digit run between "/u" and "?" in the href.
        cid, = re.findall(r'/u(\d+)\?', c_url)
        request = response.follow('https://www.xinpianchang.com' + c_url,
                                  self.parse_composer)
        request.meta['cid'] = cid
        request.meta['dont_merge_cookies'] = True
        yield request

        cr = CopyrightItem()
        cr['pcid'] = '%s_%s' % (pid, cid)
        cr['pid'] = pid
        # Store the creator id itself, not only the composite key.
        cr['cid'] = cid
        cr['roles'] = creator.xpath(
            './a/following-sibling::span[1]/text()').get()
        yield cr
def parse_post(self, response):
    """Parse a video detail page.

    Yields the filled ``PostItem``, then for each creator a profile-page
    request followed by a ``CopyrightItem``, and finally a comment-API
    request carrying the post id in ``meta['pid']``.

    ``convert_int``, ``comment_api`` and ``self.root_url`` are defined
    outside this block.
    """

    def first(query):
        """Return the first match of *query* on the page, or None."""
        return response.xpath(query).get()

    # Video id supplied by the request that led here.
    pid = response.meta['pid']

    item = PostItem()
    item['pid'] = pid
    item['title'] = first('//div[@class="title-wrap"]/h3/text()')
    item['thumbnail'] = response.meta['thumbnail']
    item['video_format'] = ''
    item['preview'] = first('//div[@class="filmplay"]//img/@src')
    # Category of the video.
    item['category'] = first('//span[@class="cate v-center"]/text()')
    # Address of the video.
    item['video'] = first('//video[@id="xpc_video"]/@src')
    # Play and like counts are passed through ``convert_int``.
    raw_plays = first(
        '//i[contains(@class,"play-counts")]/@data-curplaycounts')
    item['play_counts'] = convert_int(raw_plays)
    raw_likes = first(
        '//span[contains(@class,"like-counts")]/@data-counts')
    item['like_counts'] = convert_int(raw_likes)
    # Description of the video.
    item['description'] = first('//p[contains(@class,"desc")]/text()')
    # Creation time of the video.
    item['created_at'] = first(
        '//span[contains(@class,"update-time")]/i/text()')
    yield item

    # List of creator nodes.
    creator_nodes = response.xpath(
        '//div[contains(@class,"filmplay-creator")]/ul[@class="creator-list"]/li'
    )
    for node in creator_nodes:
        profile_path = node.xpath('./a/@href').get()
        creator_id = node.xpath('./a/@data-userid').get()

        # Prefix the root URL to reach the creator's home page.
        profile_url = '%s%s' % (self.root_url, profile_path)
        yield Request(profile_url, callback=self.parse_composer,
                      meta={'cid': creator_id})

        # Tie the creator id to the video id; also record the role.
        relation = CopyrightItem()
        relation['pid'] = pid
        relation['cid'] = creator_id
        relation['pcid'] = '%s_%s' % (pid, creator_id)
        relation['roles'] = node.xpath(
            './div[@class="creator-info"]/span/text()').get()
        yield relation

    yield Request(comment_api % pid, callback=self.parse_comment,
                  meta={'pid': pid})
def parse_post(self, response):
    """Parse a video detail page.

    Yields a video-API request (with the partially filled ``PostItem`` in
    ``meta['post']``), a comment-API request, and then for every creator
    node a composer-page request followed by a ``CopyrightItem``.

    Raises:
        ValueError: if the page source does not contain exactly one
            ``vid`` match.
    """
    pid = response.meta['pid']
    post = PostItem()
    post['pid'] = pid
    post['thumbnail'] = response.meta['thumbnail']
    post['title'] = response.xpath(
        '//div[@class="title-wrap"]/h3/text()').extract_first()

    # Category labels are stripped and joined with "-".
    cates = response.xpath(
        '//span[contains(@class, "cate")]/a/text()').extract()
    post['category'] = '-'.join([cate.strip() for cate in cates])

    post['created_at'] = response.xpath(
        '//span[contains(@class, "update-time")]/i/text()').get()
    post['play_counts'] = response.xpath(
        '//i[contains(@class, "play-counts")]/@data-curplaycounts').get()
    post['like_counts'] = response.xpath(
        '//span[contains(@class, "like-counts")]/@data-counts').get()
    post['description'] = strip(response.xpath(
        '//p[contains(@class, "desc")]/text()').get())

    vid, = re.findall(r'vid: \"(\w+)\",', response.text)
    video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource,resource_origin?'
    request = Request(video_url % vid, callback=self.parse_video)
    request.meta['post'] = post
    yield request

    # Comment information.
    comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
    request = Request(comment_url % pid, callback=self.parse_comment)
    yield request

    # The creator-page template needs its own name: the loop below
    # formats ``composer_url``, which was otherwise never assigned here.
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
    composer_list = response.xpath(
        '//div[@class="user-team"]//ul[@class="creator-list"]/li')
    for composer in composer_list:
        cid = composer.xpath('./a/@data-userid').get()
        request = Request(composer_url % cid, callback=self.parse_composer)
        request.meta['cid'] = cid
        yield request

        cr = CopyrightItem()
        cr['pcid'] = '%s_%s' % (cid, pid)
        cr['cid'] = cid
        cr['pid'] = pid
        cr['roles'] = composer.xpath(
            './/span[contains(@class,"roles")]/text()').get()
        yield cr
def parse_post(self, response):
    """Parse a video detail page.

    Yields a video-API request (with the partially filled ``PostItem`` in
    ``meta['post']``), a comment-API request (with ``meta['pid']``), and
    then for every creator node a followed composer-page request plus a
    ``CopyrightItem``.

    Raises:
        ValueError: if the page source does not contain exactly one
            ``vid`` match.
    """
    pid = response.meta['pid']
    post = PostItem(pid=pid)
    post['thumbnail'] = response.meta['thumbnail']
    post['title'] = response.xpath(
        '//div[@class="title-wrap"]/h3/text()').get()

    # The video address is loaded dynamically, so only the ``vid`` is
    # taken from the page. ``re.findall`` returns a list; the trailing
    # comma unpacks its single element. A raw string keeps ``\w`` from
    # being an invalid string escape.
    vid, = re.findall(r'vid: "(\w+)",', response.text)
    video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web'

    # ``.get()`` stores the normalized text as a single string, the same
    # way ``description`` is stored below, instead of a one-element list.
    post['category'] = response.xpath(
        'normalize-space(string(//span[@class="cate v-center"]))').get()
    post['created_at'] = response.xpath(
        '//span[@class="update-time v-center"]/i/text()').get()
    post['play_counts'] = convert_int(response.xpath(
        '//i[contains(@class,"play-counts")]/@data-curplaycounts').get())
    post['like_counts'] = convert_int(response.xpath(
        '//span[contains(@class,"like-counts")]/@data-counts').get())
    post['description'] = response.xpath(
        'normalize-space(string(//p[contains(@class,"desc")]))').get()

    # This requests the video-info endpoint, not the video file itself;
    # the post is handed to parse_video through meta.
    request = Request(video_url % vid, callback=self.parse_video)
    request.meta['post'] = post
    yield request

    # Comment section of the page.
    comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24'
    request = Request(comment_url % pid, callback=self.parse_comment)
    request.meta['pid'] = pid
    yield request

    # Creator information.
    creator_list = response.xpath(
        '//div[@class="filmplay-creator right-section"]/ul/li')
    composer_url = 'https://www.xinpianchang.com/u%s?from=articleList'
    for creator in creator_list:
        cid = creator.xpath('./a/@data-userid').get()
        request = response.follow(composer_url % cid, self.parse_composer)
        request.meta['cid'] = cid
        request.meta['dont_merge_cookies'] = True
        yield request

        cr = CopyrightItem()
        cr['pcid'] = '%s_%s' % (pid, cid)
        cr['pid'] = pid
        cr['cid'] = cid
        cr['roles'] = creator.xpath(
            './div[@class="creator-info"]/span/text()').get()
        yield cr
def parse_post(self, response):
    """Parse a video detail page.

    Yields the filled ``PostItem``, a request for page 1 of the comment
    API, and then for every creator node a composer-page request
    followed by a ``CopyrightItem``.

    ``strip`` and ``ci`` are helpers defined outside this block.
    """
    # Extract the post information.
    post = PostItem()
    post['pid'] = response.meta['pid']
    post['title'] = response.xpath(
        '//div[@class="title-wrap"]/h3/text()').extract_first()
    post['thumbnail'] = response.meta['thumbnail']
    post['preview'] = strip(response.xpath(
        '//div[@class="filmplay"]//img/@src').extract_first())

    # Fall back to the second selector when the first matches nothing.
    video = (response.xpath('//video[@id="xpc_video"]/@src')
             or response.xpath('//div[@class="td-player"]//video/@src'))
    post['video'] = video.extract_first()
    post['video_format'] = strip(response.xpath(
        '//span[contains(@class, "video-format")]/text()').extract_first())

    duration = response.meta['duration']
    if duration:
        # Convert the duration text to whole seconds: drop the "'" marks,
        # then read the first two whitespace-separated numbers as minutes
        # and seconds. ``split()`` tolerates repeated or surrounding
        # whitespace. NOTE(review): presumably the raw text looks like
        # ``01' 51''`` -- confirm against the caller.
        parts = [int(piece) for piece in duration.replace("'", "").split()]
        post['duration'] = parts[0] * 60 + parts[1]

    post['category'] = response.xpath(
        '//span[@class="cate v-center"]/text()').extract_first()
    post['created_at'] = response.xpath(
        '//span[contains(@class,"update-time")]/i/text()').extract_first()
    post['play_counts'] = ci(response.xpath(
        '//i[contains(@class,"play-counts")]/@data-curplaycounts'
    ).extract_first())
    post['like_counts'] = ci(response.xpath(
        '//i[contains(@class,"like-counts")]/@data-counts').extract_first())
    post['description'] = strip(response.xpath(
        '//p[contains(@class,"desc")]/text()').extract_first(default=''))
    yield post

    # Fetch the comments.
    comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi/id-%s/page-1'
    request = Request(comment_api % (post['pid']),
                      callback=self.parse_comment)
    yield request

    composer_url = 'http://www.xinpianchang.com/u%s'
    for elem in response.xpath(
            '//div[@class="user-team"]//ul[@class="creator-list"]/li'):
        cid = elem.xpath(
            './/a[@class="head-wrap"]/@data-userid').extract_first()
        # Fetch the creator's page.
        request = Request(composer_url % cid, callback=self.parse_composer)
        request.meta['cid'] = cid
        yield request

        # Record the copyright (post/creator) relation.
        cr = CopyrightItem()
        cr['pid'] = post['pid']
        cr['cid'] = cid
        cr['pcid'] = '%s_%s' % (cr['pid'], cid)
        cr['roles'] = elem.xpath(
            './/span[contains(@class, "roles")]/text()').extract_first()
        yield cr
def parse_post(self, response):
    """Handle a video detail page.

    Yields the filled ``PostItem``; then, for every creator node, a
    ``CopyrightItem`` followed by a request for that creator's home page;
    and finally a request for the comment API.
    """
    item = PostItem()
    # Video id set by the handler of the previous page.
    pid = response.meta['pid']
    item['pid'] = pid
    item['thumbnail'] = response.meta['thumbnail']

    # Video title.
    title_sel = response.xpath('//div[@class="title-wrap"]/h3/text()')
    item['title'] = title_sel.extract_first()

    # Preview image, i.e. the picture shown when the page first opens.
    preview_sel = response.xpath('//div[@class="filmplay"]//img/@src')
    item['preview'] = preview_sel.extract_first()

    # Video URL.
    item['video'] = response.xpath('//a[@id="player"]/@href').get()

    # Category: all text nodes under the "cate" span, stripped and
    # concatenated without a separator.
    fragments = response.xpath(
        '//span[contains(@class,"cate")]//text()').extract()
    item['category'] = ''.join(fragment.strip() for fragment in fragments)

    # Publication time.
    created_sel = response.xpath(
        '//span[contains(@class,"update-time")]/i/text()')
    item['created_at'] = created_sel.get()

    # Play count.
    plays_sel = response.xpath(
        '//i[contains(@class,"play-counts")]/@data-curplaycounts')
    item['play_counts'] = plays_sel.get()

    # Like count.
    likes_sel = response.xpath(
        '//span[contains(@class,"like-counts")]/@data-counts')
    item['like_counts'] = likes_sel.get()

    # Playing time; left unset when meta carries a falsy duration.
    raw_duration = response.meta['duration']
    if raw_duration:
        # Raw format: 01' 51''
        mins, secs, *_ = raw_duration.split("'")
        item['duration'] = int(mins) * 60 + int(secs)

    # Video description.
    desc_sel = response.xpath('//p[contains(@class, "desc")]/text()')
    item['description'] = desc_sel.get()
    item['video_format'] = '1080p'
    yield item

    # Template for a user's home page.
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'
    # Creator nodes of the current video.
    creator_nodes = response.xpath(
        '//div[@class="user-team"]//ul[@class="creator-list"]/li')
    for node in creator_nodes:
        relation = CopyrightItem()
        cid = node.xpath('./a/@data-userid').get()
        relation['pcid'] = '%s_%s' % (cid, pid)
        relation['cid'] = cid
        relation['pid'] = pid
        # The same creator can hold different roles in different videos.
        relation['roles'] = node.xpath(
            './/span[contains(@class, "roles")]/text()').get()
        yield relation

        # Request the creator's home page.
        yield Request(composer_url % cid, callback=self.parse_composer,
                      meta={'cid': cid})

    # Template for the comment API; request its first page.
    comment_url = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1'
    yield Request(comment_url % pid, callback=self.parse_comment)