def parse_episode_youku(self, response):
    """Scrape one Youku episode page into an ``EpisodeItem``.

    ``response.request.meta`` must carry ``pg_id``, ``cat_name``,
    ``site_id``, ``audit`` and ``priority``; they are copied onto the item
    (``cat_name`` is stored as ``category``).  Owner, title, video id, tags,
    upload time and description are taken from the page when present.

    Returns:
        A one-element list.  When the grandparent of the ``videoTotalPV``
        span has an ``href``, the element is a ``Request`` for that link
        (callback ``parse_vpaction``, item passed as ``meta['item']``);
        otherwise it is the item itself.  If anything raises, the traceback
        is logged at ERROR level and ``None`` is returned.
    """
    try:
        request = response.request
        logging.log(logging.INFO, "episode_youku:%s" % request.url)

        # Required crawl context; a missing key aborts via the handler below.
        meta = request.meta
        pg_id = meta['pg_id']
        cat_name = meta['cat_name']
        site_id = meta['site_id']
        audit = meta['audit']
        priority = meta['priority']

        # Uploader profile link -> owner id.
        owner_links = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
        ).extract()
        owner_show_id = Util.get_owner(owner_links[0]) if owner_links else None

        # Raw page fragments; each is a (possibly empty) list.
        title_parts = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::text()'
        ).extract()
        script_nodes = response.xpath('//script[@type="text/javascript"]')
        video_ids = script_nodes.re(r"videoId = '(\d+)'")
        tags = script_nodes.re('tags="(.+)"')
        upload_texts = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
        ).extract()
        descriptions = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
        ).extract()
        vp_links = response.xpath(
            '//span[@id="videoTotalPV"]/../../@href').extract()

        episode = EpisodeItem()
        episode['show_id'] = Util.get_showid(request.url)
        if video_ids:
            episode['video_id'] = video_ids[0]
        if owner_show_id:
            episode['owner_show_id'] = owner_show_id
        if title_parts:
            # Trim surrounding newlines/whitespace before the shared cleanup.
            raw_title = "".join(title_parts).strip("\n").strip()
            episode['title'] = Util.strip_title(raw_title)
        if tags:
            episode['tag'] = Util.unquote(tags[0]).rstrip('|')
        # The category comes from the request, not from the page.
        episode['category'] = cat_name
        if upload_texts:
            uploaded = Util.get_upload_time(upload_texts[0])
            if uploaded:
                episode['upload_time'] = Util.get_datetime_delta(
                    datetime.now(), uploaded)
        if descriptions:
            episode['description'] = descriptions[0]
        episode['spider_id'] = self.spider_id
        episode['site_id'] = site_id
        episode['url'] = request.url
        episode['pg_id'] = pg_id
        episode['audit'] = audit
        episode['format_id'] = self.format_id
        episode['priority'] = priority

        if vp_links:
            # Hand the item to parse_vpaction through the follow-up request.
            return [Request(url=vp_links[0],
                            callback=self.parse_vpaction,
                            meta={'item': episode})]
        return [episode]
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode(self, response):
    """Scrape an episode page into an ``EpisodeItem``, driven by ``cust_para``.

    ``response.request.meta['cust_para']`` is a required container that may
    hold:

    * ``need_check`` -- when present and a title was scraped, the title is
      passed to ``self.content_is_forbidden``; a truthy result stops
      processing and an empty list is returned.
    * ``category`` -- overrides the category scraped from the page.
    * ``priority`` -- copied onto the item.

    Returns:
        A one-element list holding either a ``Request`` to
        ``self.vpaction_url + video_id`` (callback ``parse_vpaction``, item in
        ``meta['item']``) when a video id was found, or the item itself.
        An empty list for blacklisted titles.  ``None`` after any exception,
        which is logged with its traceback.
    """
    try:
        request = response.request
        cust_para = request.meta['cust_para']
        log.msg('%s: %s' % (request.url, cust_para))
        items = []

        # Uploader profile link -> owner id.
        owner_links = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
        ).extract()
        owner_show_id = Util.get_owner(owner_links[0]) if owner_links else None

        # Raw page fragments; each is a (possibly empty) list.
        title_parts = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
        ).extract()
        page_categories = response.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
        ).extract()
        script_nodes = response.xpath('//script[@type="text/javascript"]')
        video_ids = script_nodes.re(r"videoId = '(\d+)'")
        tags = script_nodes.re('tags="(.+)"')
        upload_texts = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
        ).extract()
        descriptions = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
        ).extract()

        episode = EpisodeItem()
        episode['show_id'] = Util.get_showid(request.url)
        if video_ids:
            episode['video_id'] = video_ids[0]
        if owner_show_id:
            episode['owner_show_id'] = owner_show_id
        if title_parts:
            episode['title'] = Util.strip_title("".join(title_parts))
            # The blacklist check only runs when a title was scraped.
            if ('need_check' in cust_para
                    and self.content_is_forbidden(episode['title'])):
                log.msg('video [ %s ] is in blacklist!' % episode['show_id'])
                return items
        if tags:
            episode['tag'] = Util.unquote(tags[0]).rstrip('|')
        # A caller-supplied category wins over the one shown on the page.
        if 'category' in cust_para:
            episode['category'] = cust_para['category']
        elif page_categories:
            episode['category'] = page_categories[0].replace(u'频道', '')
        if upload_texts:
            uploaded = Util.get_upload_time(upload_texts[0])
            if uploaded:
                episode['upload_time'] = Util.get_datetime_delta(
                    datetime.now(), uploaded)
        if descriptions:
            episode['description'] = descriptions[0]
        if 'priority' in cust_para:
            episode['priority'] = cust_para['priority']
        episode['spider_id'] = self.spider_id
        episode['site_id'] = self.site_id
        episode['url'] = request.url

        if video_ids:
            # Hand the item to parse_vpaction through the follow-up request.
            follow_up = Request(url=self.vpaction_url + video_ids[0],
                                callback=self.parse_vpaction,
                                meta={'item': episode})
            items.append(follow_up)
        else:
            items.append(episode)
        return items
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    """Scrape an episode page into an ``EpisodeItem`` tagged with ``cat_id``.

    ``response.request.meta['cat_id']`` is required and copied onto the item.
    When an owner link is found, its id is checked against
    ``self.channel_exclude``: excluded owners end processing (``None`` is
    returned), any other owner gets a ``Request`` to its page with callback
    ``parse_owner``.

    Returns:
        A list with the optional owner ``Request`` followed by either a
        ``Request`` for the ``href`` on the grandparent of the
        ``videoTotalPV`` span (callback ``parse_vpaction``, item in
        ``meta['item']``) or, when there is no such link, the item itself.
        ``None`` for excluded owners and after any exception, which is logged
        with its traceback.
    """
    try:
        request = response.request
        log.msg('%s' % request.url)
        cat_id = request.meta['cat_id']
        items = []
        page = Selector(response)

        # Uploader profile link -> owner id, exclusion check, owner crawl.
        owner_links = page.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
        ).extract()
        owner_show_id = None
        if owner_links:
            owner_url = owner_links[0]
            owner_show_id = Util.get_owner(owner_url)
            if owner_show_id in self.channel_exclude:
                log.msg("video owner excluded: %s" % owner_show_id)
                return
            items.append(Request(url=owner_url, callback=self.parse_owner))

        # Raw page fragments; each is a (possibly empty) list.
        title_parts = page.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::text()'
        ).extract()
        page_categories = page.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
        ).extract()
        script_nodes = page.xpath('//script[@type="text/javascript"]')
        video_ids = script_nodes.re(r"videoId = '(\d+)'")
        tags = script_nodes.re('tags="(.+)"')
        upload_texts = page.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
        ).extract()
        descriptions = page.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
        ).extract()
        vp_links = page.xpath(
            '//span[@id="videoTotalPV"]/../../@href').extract()

        episode = EpisodeItem()
        episode['show_id'] = Util.get_showid(request.url)
        if video_ids:
            episode['video_id'] = video_ids[0]
        if owner_show_id:
            episode['owner_show_id'] = owner_show_id
        if title_parts:
            # Trim surrounding newlines/whitespace before the shared cleanup.
            raw_title = "".join(title_parts).strip("\n").strip()
            episode['title'] = Util.strip_title(raw_title)
        if tags:
            episode['tag'] = Util.unquote(tags[0]).rstrip('|')
        if page_categories:
            episode['category'] = page_categories[0].replace(u'频道', '')
        if upload_texts:
            uploaded = Util.get_upload_time(upload_texts[0])
            if uploaded:
                episode['upload_time'] = Util.get_datetime_delta(
                    datetime.now(), uploaded)
        if descriptions:
            episode['description'] = descriptions[0]
        episode['spider_id'] = self.spider_id
        episode['site_id'] = self.site_id
        episode['url'] = request.url
        episode['cat_id'] = cat_id

        if vp_links:
            # Hand the item to parse_vpaction through the follow-up request.
            follow_up = Request(url=vp_links[0],
                                callback=self.parse_vpaction,
                                meta={'item': episode})
            items.append(follow_up)
        else:
            items.append(episode)
        return items
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)
def video_parse(self, response):
    """Scrape a video page into an ``EpisodeItem`` for the search/list crawl.

    Optional ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id`` values are
    read from ``response.request.meta`` (``None`` when absent) and copied onto
    the item.  Pages whose category is in ``self.category_exclude``, whose
    owner id is in ``self.channel_exclude``, or for which
    ``Util.get_showid`` returns a falsy value are skipped.

    Returns:
        A list that is empty when the page was skipped or an exception was
        raised (the traceback is logged).  Otherwise it holds one element:
        a ``Request`` to ``self.vpaction_url + video_id`` (callback
        ``vpaction_parse``, item in ``meta['episode_item']``) when a video id
        was found, or the item itself.
    """
    items = []
    try:
        meta = response.request.meta
        kw_id = meta.get('kw_id')
        pg_id = meta.get('pg_id')
        cat_id = meta.get('cat_id')
        subject_id = meta.get('subject_id')

        # Check the video's category against the exclusion list.
        category_str = response.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
        ).extract()
        category = None
        if category_str:
            category = category_str[0].replace(u'频道', '')
        if category and category in self.category_exclude:
            log.msg("video category excluded: %s" % category)
            return items

        # Uploader profile link -> owner id, checked against exclusions.
        owner = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
        ).extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
            if owner_show_id in self.channel_exclude:
                log.msg("video owner excluded: %s" % owner_show_id)
                return items

        # Without a show id there is nothing to emit.
        show_id = Util.get_showid(response.request.url)
        if not show_id:
            return items

        # Episode info; each extraction is a (possibly empty) list.
        title = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
        ).extract()
        upload = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
        ).extract()
        description = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
        ).extract()
        scripts = response.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re(r"videoId = '(\d+)'")
        tag = scripts.re('tags="(.+)"')

        episode_item = EpisodeItem()
        episode_item['show_id'] = show_id
        if video_id:
            episode_item['video_id'] = video_id[0]
        if owner_show_id:
            episode_item['owner_show_id'] = owner_show_id
        if title:
            episode_item['title'] = Util.strip_title("".join(title))
        if tag:
            episode_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        if category:
            episode_item['category'] = category
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                episode_item['upload_time'] = Util.get_datetime_delta(
                    datetime.now(), t)
        if description:
            episode_item['description'] = description[0]
        episode_item['spider_id'] = self.spider_id
        episode_item['site_id'] = self.site_id
        episode_item['url'] = response.request.url
        episode_item['kw_id'] = kw_id
        episode_item['pg_id'] = pg_id
        episode_item['cat_id'] = cat_id
        episode_item['subject_id'] = subject_id

        if video_id:
            # Hand the item to vpaction_parse through the follow-up request.
            items.append(
                Request(url=self.vpaction_url + video_id[0],
                        callback=self.vpaction_parse,
                        meta={'episode_item': episode_item}))
        else:
            items.append(episode_item)
    except Exception:
        # "except X as e" / bare-class form works on Python 2.6+ and 3;
        # the old "except X, e" spelling is a syntax error on Python 3.
        log.msg(traceback.format_exc(), level=log.ERROR)
    # The collected request/item must be returned or the crawl drops it.
    return items
def parse_episode(self, response):
    """Scrape an episode page and, for seed videos, queue its recommendations.

    ``response.request.meta['recommend']`` is required.  Its truthiness
    controls two things: the category exclusion (``self.cat_exclude``) is
    only applied when it is truthy, and a follow-up ``Request`` to
    ``self.ykrec_url + video_id`` (callback ``parse_recommendation``) is only
    queued when it is falsy.  Owners listed in ``self.channel_exclude`` are
    always skipped.

    Returns:
        A list whose first element is either a ``Request`` to
        ``self.vpaction_url + video_id`` (callback ``parse_vpaction``, item in
        ``meta['item']``) when a video id was found, or the item itself;
        optionally followed by the recommendation ``Request``.  ``None`` for
        excluded owners/categories and after any exception, which is logged
        with its traceback.
    """
    try:
        recommend = response.request.meta['recommend']
        log.msg('%s|recommend: %s' % (response.request.url, recommend))
        items = []

        # Uploader profile link -> owner id, checked against exclusions.
        owner = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
        ).extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
            if owner_show_id in self.channel_exclude:
                log.msg("video owner excluded: %s" % owner_show_id)
                return

        # Check the category only when `recommend` is truthy.
        category = response.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
        ).extract()
        cat = None
        if category:
            cat = category[0].replace(u'频道', '')
        if recommend and cat and cat in self.cat_exclude:
            log.msg("video category excluded: %s" % cat)
            return

        # Video info; each extraction is a (possibly empty) list.
        title = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
        ).extract()
        scripts = response.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re(r"videoId = '(\d+)'")
        tag = scripts.re('tags="(.+)"')
        upload = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
        ).extract()
        description = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
        ).extract()

        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_showid(response.request.url)
        if video_id:
            ep_item['video_id'] = video_id[0]
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = Util.strip_title("".join(title))
        if tag:
            ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        if cat:
            ep_item['category'] = cat
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(
                    datetime.now(), t)
        if description:
            ep_item['description'] = description[0]
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url

        if video_id:
            # Hand the item to parse_vpaction through the follow-up request.
            items.append(
                Request(url=self.vpaction_url + video_id[0],
                        callback=self.parse_vpaction,
                        meta={'item': ep_item}))
        else:
            items.append(ep_item)

        # Recommendations need a video id to build the URL.  Without this
        # guard a page lacking one raised IndexError here, and the handler
        # below discarded the episode item that had just been queued.
        if not recommend and video_id:
            items.append(
                Request(url=self.ykrec_url + video_id[0],
                        callback=self.parse_recommendation))
        return items
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)