def parse_episode(self, response):
    """Parse a YouTube watch page into an episode item and follow-up requests.

    Values read from ``response.request.meta``:

    * ``thumb_url``, ``upload_time``, ``category`` -- required; a missing key
      raises ``KeyError``, which is logged and aborts the parse.
    * ``kw_id`` -- optional, defaults to ``1``.

    The page is scraped for owner id/link, title, keywords, description and
    view count, and the following requests are queued:

    * the owner's ``/about`` page (callback ``self.parse_about``), when both
      the owner id and the owner link were found;
    * a ``get_video_info`` request (callback ``self.parse_other_info``) that
      carries the partially filled ``EpisodeItem`` in ``meta['item']``.

    Args:
        response: the downloaded watch-page response.

    Returns:
        A list of ``Request`` objects. If any exception is raised while
        parsing, the traceback is logged at ERROR level and an empty list
        is returned.
    """
    try:
        log.msg('%s' % response.request.url)
        meta = response.request.meta
        thumb_url = meta['thumb_url']
        upload_time = meta['upload_time']
        category = meta['category']
        kw_id = meta.get('kw_id', 1)
        items = []

        # Owner (uploader) info.
        owner = response.xpath(
            '//div[@class="yt-user-info"]/a/@data-ytid').extract()
        owner_url = response.xpath(
            '//div[@class="yt-user-info"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = owner[0]
            # The href is extracted by a separate XPath, so it can be empty
            # even when the id is present; indexing it blindly would raise
            # IndexError and discard the whole episode.
            if owner_url:
                items.append(
                    Request(url=self.url_prefix + owner_url[0] + "/about",
                            callback=self.parse_about))

        # Video info.
        title = response.xpath('//span[@id="eow-title"]/text()').extract()
        tag = response.xpath(
            './head/meta[@name="keywords"]/@content').extract()
        description = response.xpath(
            '//p[@id="eow-description"]/descendant-or-self::*/text()'
        ).extract()
        played = response.xpath(
            '//div[@class="watch-view-count"]/text()').extract()

        # "sts" number embedded in the page source; it is forwarded as a
        # query parameter of the get_video_info request below.
        sts = re.search(r'\"sts\": ?(\d+)', response.body)

        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = title[0].strip()
        if tag:
            ep_item['tag'] = tag[0].replace(', ', '|')
        if category:
            # The category comes from the request meta and is stored as-is.
            ep_item['category'] = category
        if upload_time:
            t = Util.get_youtube_upload_time(upload_time[0].strip())
            if t:
                # NOTE(review): presumably turns a relative age into an
                # absolute time based on "now" -- confirm against Util.
                ep_item['upload_time'] = Util.get_datetime_delta(
                    datetime.now(), t)
        if description:
            ep_item['description'] = "\n".join(description)
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0]
        if played:
            # Normalize once; fall back to '0' when the result is empty.
            ep_item['played'] = Util.normalize_played(played[0]) or '0'

        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = Util.normalize_youtube_url(response.request.url)
        ep_item['kw_id'] = kw_id

        query = Util.encode({
            'video_id': ep_item['show_id'],
            'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
            'sts': sts.groups()[0] if sts else '',
        })
        items.append(
            Request(url='http://www.youtube.com/get_video_info?' + query,
                    callback=self.parse_other_info,
                    meta={'item': ep_item}))
        return items
    except Exception:
        # Best-effort callback: log the failure and schedule nothing.
        log.msg(traceback.format_exc(), level=log.ERROR)
        return []
def video_parse(self, response):
    """Parse a YouTube watch page into an episode item and follow-up requests.

    Optional ids ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id`` are read
    from ``response.request.meta`` and copied onto the item when truthy.
    The page is scraped for owner id/link, title, keywords, description,
    view count, category, publish date and thumbnail URL, and the following
    requests are queued:

    * the owner's ``/about`` page (callback ``self.video_about_parse``), when
      both the owner id and the owner link were found;
    * a ``get_video_info`` request (callback ``self.video_other_info_parse``)
      that carries the partially filled ``EpisodeItem`` in ``meta['item']``.

    Args:
        response: the downloaded watch-page response.

    Returns:
        A list of ``Request`` objects. The list is empty when no show id can
        be derived from the URL. If an exception is raised while parsing,
        the traceback is logged at ERROR level and whatever was queued
        before the failure is returned.
    """
    items = []
    try:
        meta = response.request.meta
        kw_id = meta.get('kw_id')
        pg_id = meta.get('pg_id')
        cat_id = meta.get('cat_id')
        subject_id = meta.get('subject_id')

        show_id = Util.get_youtube_showid(response.request.url)
        if not show_id:
            return items

        # Owner (uploader) info.
        owner = response.xpath(
            '//div[@class="yt-user-info"]/a/@data-ytid').extract()
        owner_url = response.xpath(
            '//div[@class="yt-user-info"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = owner[0]
            # The href is extracted by a separate XPath, so it can be empty
            # even when the id is present; indexing it blindly would raise
            # IndexError and discard the whole episode.
            if owner_url:
                items.append(
                    Request(url=self.youtube_url_prefix + owner_url[0] + "/about",
                            callback=self.video_about_parse))

        # Video info.
        title = response.xpath('//span[@id="eow-title"]/text()').extract()
        tag = response.xpath(
            './head/meta[@name="keywords"]/@content').extract()
        description = response.xpath(
            '//p[@id="eow-description"]/descendant-or-self::*/text()'
        ).extract()
        played = response.xpath(
            '//div[@class="watch-view-count"]/text()').extract()
        category = response.xpath(
            '//div[@id="watch-description"]//ul[@class="content watch-info-tag-list"]/li/a/text()'
        ).extract()
        upload = response.xpath(
            '//meta[@itemprop="datePublished"]/@content').extract()
        # Thumbnail URL as published by the page itself. An alternative
        # noted by the original author is to synthesize it from the show id
        # and one of the size names 'default', 'mqdefault', 'hqdefault',
        # 'sddefault', 'maxresdefault'.
        thumb_url = response.xpath(
            '//link[@itemprop="thumbnailUrl"]/@href').extract()

        # "sts" number embedded in the page source; it is forwarded as a
        # query parameter of the get_video_info request below.
        sts = re.search(r'\"sts\": ?(\d+)', response.body)

        ep_item = EpisodeItem()
        ep_item['show_id'] = show_id
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = title[0].strip()
        if tag:
            ep_item['tag'] = tag[0].replace(', ', '|')
        if description:
            ep_item['description'] = "\n".join(description)
        if played:
            # Normalize once; fall back to '0' when the result is empty.
            ep_item['played'] = Util.normalize_played(played[0]) or '0'
        if kw_id:
            ep_item['kw_id'] = kw_id
        if pg_id:
            ep_item['pg_id'] = pg_id
        if cat_id:
            ep_item['cat_id'] = cat_id
        if subject_id:
            ep_item['subject_id'] = subject_id
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0]
        if category:
            # '&'-joined categories are stored '|'-separated
            # (example page: https://www.youtube.com/watch?v=lwy4qwaByVQ).
            ep_item['category'] = category[0].strip().replace('&', '|')
        if upload:
            upload = upload[0].strip()
            # time.strptime raises ValueError on a mismatch (it never
            # returns a falsy value), so every candidate format has to be
            # tried under its own try/except for the fallbacks to be
            # reachable.
            # NOTE(review): '%Y-%m-%d' was added on the assumption that the
            # datePublished meta may carry an ISO date -- confirm on live pages.
            struct_time = None
            for fmt in ('%b %d, %Y', '%Y年%m月%d日', '%Y-%m-%d'):
                try:
                    struct_time = time.strptime(upload, fmt)
                    break
                except ValueError:
                    continue
            if struct_time:
                ep_item['upload_time'] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', struct_time)

        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = Util.normalize_youtube_url(response.request.url)

        query = Util.encode({
            'video_id': ep_item['show_id'],
            'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
            'sts': sts.groups()[0] if sts else '',
        })
        items.append(
            Request(url='http://www.youtube.com/get_video_info?' + query,
                    callback=self.video_other_info_parse,
                    meta={'item': ep_item}))
    except Exception:
        # Best-effort callback: log the failure and keep what was queued.
        log.msg(traceback.format_exc(), level=log.ERROR)
    # Without this return the callback yields None and every request built
    # above is silently dropped.
    return items