def naver_download_by_url(url, info_only=False, **kwargs):
    ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'
    page = get_content(url)
    og_video_url = re.search(
        r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page
    ).group(1)
    params_dict = urllib.parse.parse_qs(
        urllib.parse.urlparse(og_video_url).query
    )
    vid = params_dict['vid'][0]
    key = params_dict['outKey'][0]

    meta_str = get_content(ep.format(vid, key))
    meta_json = json.loads(meta_str)
    if 'errorCode' in meta_json:
        log.wtf(meta_json['errorCode'])
    title = meta_json['meta']['subject']
    videos = meta_json['videos']['list']
    video_list = sorted(
        videos, key=lambda video: video['encodingOption']['width']
    )
    # highest width sorts last, so take the best-quality stream
    video_url = video_list[-1]['source']
    size = url_size(video_url)
    print_info(site_info, title, 'mp4', size)
    if not info_only:
        download_urls([video_url], title, 'mp4', size, **kwargs)
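# Usage sketch (hypothetical URL; assumes this module's helpers such as
# get_content and download_urls are importable, as elsewhere in the repo):
#
#     naver_download_by_url('http://tv.naver.com/v/1234567', info_only=True)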
def vidto_download(url, info_only=False, **kwargs):
    html = get_content(url)
    params = {}
    r = re.findall(
        r'type="(?:hidden|submit)?"(?:.*?)name="(.+?)"\s+value="?(.+?)">',
        html
    )
    for name, value in r:
        params[name] = value
    data = parse.urlencode(params).encode('utf-8')
    req = request.Request(url, headers=FAKE_HEADERS)
    print('Please wait for 6 seconds...')
    time.sleep(6)
    print('Starting')
    new_html = request.urlopen(req, data).read().decode('utf-8', 'replace')
    new_stuff = re.search(r'lnk_download" href="(.*?)">', new_html)
    if new_stuff:
        url = new_stuff.group(1)
        title = params['fname']
        _type = ''
        ext = ''
        a, b, size = url_info(url)
        print_info(site_info, title, _type, size)
        if not info_only:
            download_urls([url], title, ext, size, **kwargs)
    else:
        log.wtf("Cannot find link, please review")
def panda_download(url, info_only=False, **kwargs):
    roomid = re.search(r'/(\d+)', url)
    if roomid is None:
        log.wtf('Cannot find room id in this URL')
    roomid = roomid.group(1)
    json_request_url = (
        'http://www.panda.tv/api_room_v2?roomid={}&__plat=pc_web&_={}'.format(
            roomid, int(time.time())))
    content = get_content(json_request_url)
    api_json = json.loads(content)

    errno = api_json['errno']
    errmsg = api_json['errmsg']
    if errno:
        raise ValueError('Errno : {}, Errmsg : {}'.format(errno, errmsg))

    data = api_json['data']
    title = data['roominfo']['name']
    room_key = data['videoinfo']['room_key']
    plflag = data['videoinfo']['plflag'].split('_')
    status = data['videoinfo']['status']
    if status != '2':  # `is not` compared identity, not equality
        raise ValueError(
            'The live stream is not online! (status:{})'.format(status))

    data2 = json.loads(data['videoinfo']['plflag_list'])
    rid = data2['auth']['rid']
    sign = data2['auth']['sign']
    ts = data2['auth']['time']
    real_url = ('http://pl{}.live.panda.tv/live_panda/{}.flv?sign={}&ts={}&'
                'rid={}'.format(plflag[1], room_key, sign, ts, rid))

    print_info(site_info, title, 'flv', float('inf'))
    if not info_only:
        download_urls([real_url], title, 'flv', None, **kwargs)
def douyutv_video_download(url, output_dir='.', merge=True, info_only=False,
                           **kwargs):
    ep = 'http://vmobile.douyu.com/video/getInfo?vid='
    patt = r'show/([0-9A-Za-z]+)'
    title_patt = r'<h1>(.+?)</h1>'

    hit = re.search(patt, url)
    if hit is None:
        log.wtf('Unknown url pattern')
    vid = hit.group(1)

    page = get_content(url)
    hit = re.search(title_patt, page)
    if hit is None:
        title = vid
    else:
        title = hit.group(1)

    meta = json.loads(get_content(ep + vid))
    if meta['error'] != 0:
        log.wtf('Error from API server')
    m3u8_url = meta['data']['video_url']
    print_info('Douyu Video', title, 'm3u8', 0, m3u8_url=m3u8_url)
    if not info_only:
        urls = general_m3u8_extractor(m3u8_url)
        download_urls(urls, title, 'ts', 0, output_dir=output_dir,
                      merge=merge, **kwargs)
def wanmen_download(url, info_only=False, **kwargs):
    if 'wanmen.org' not in url:
        log.wtf(
            'You are at the wrong place dude. This is for WanMen University!')
    courseID = int(match1(url, r'course/(\d+)'))
    assert courseID > 0  # without courseID we cannot do anything
    # absent indices default to 0 (falsy) so the branches below can tell
    # them apart; int(None) would raise a TypeError otherwise
    tIndex = int(match1(url, r'tIndex=(\d+)') or 0)
    pIndex = int(match1(url, r'pIndex=(\d+)') or 0)

    json_api_content = _wanmen_get_json_api_content_by_courseID(courseID)

    if pIndex:  # only download ONE single part
        assert tIndex >= 0
        wanmen_download_by_course_topic_part(
            json_api_content, tIndex, pIndex, info_only=info_only, **kwargs)
    elif tIndex:  # download a topic
        wanmen_download_by_course_topic(
            json_api_content, tIndex, info_only=info_only, **kwargs)
    else:  # download the whole course
        wanmen_download_by_course(
            json_api_content, info_only=info_only, **kwargs)
def get_room_id_from_url(self, match_id):
    meta = json.loads(get_content(self.game_ep + str(match_id)))
    if meta['error'] != 0:
        log.wtf('Error while accessing the game_details API')
    rooms = meta['data']['anchor_data']
    for room in rooms:
        if room['is_use_room']:
            return room['room_id']
    log.wtf('No room available for match {}'.format(match_id))
def qq_download_by_vid(vid, title, output_dir='.', merge=True,
                       info_only=False):
    info_api = ('http://vv.video.qq.com/getinfo?otype=json&appver=3.2.19.333'
                '&platform=11&defnpayver=1&vid={}'.format(vid))
    info = get_content(info_api)
    # the API wraps the JSON as 'QZOutputJson=...;'; [:-1] strips the ';'
    video_json = json.loads(match1(info, r'QZOutputJson=(.*)')[:-1])
    fn_pre = video_json['vl']['vi'][0]['lnk']
    title = video_json['vl']['vi'][0]['ti']
    host = video_json['vl']['vi'][0]['ul']['ui'][0]['url']
    streams = video_json['fl']['fi']
    seg_cnt = video_json['vl']['vi'][0]['cl']['fc']
    if seg_cnt == 0:
        seg_cnt = 1

    # best_quality = streams[-1]['name']
    part_format_id = streams[-1]['id']

    part_urls = []
    total_size = 0
    for part in range(1, seg_cnt + 1):
        filename = '{}.p{}.{}.mp4'.format(
            fn_pre, str(part_format_id % 10000), str(part))
        key_api = ('http://vv.video.qq.com/getkey?otype=json&platform=11&'
                   'format={}&vid={}&filename={}&appver=3.2.19.333'.format(
                       part_format_id, vid, filename))
        part_info = get_content(key_api)
        key_json = json.loads(match1(part_info, r'QZOutputJson=(.*)')[:-1])
        if key_json.get('key') is None:
            vkey = video_json['vl']['vi'][0]['fvkey']
            url = '{}{}?vkey={}'.format(
                video_json['vl']['vi'][0]['ul']['ui'][0]['url'],
                fn_pre + '.mp4', vkey)
        else:
            vkey = key_json['key']
            url = '{}{}?vkey={}'.format(host, filename, vkey)
        if not vkey:
            if part == 1:
                log.wtf(key_json['msg'])
            else:
                log.w(key_json['msg'])
            break
        part_urls.append(url)
        _, ext, size = url_info(url)
        total_size += size
    print_info(site_info, title, ext, total_size)
    if not info_only:
        download_urls(part_urls, title, ext, total_size,
                      output_dir=output_dir, merge=merge)
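# Worked example of the part-filename scheme above (values hypothetical):
# with fn_pre='j6cg25u', part_format_id=10403 and part=2, the modulo gives
# 10403 % 10000 == 403, so the key API is asked for 'j6cg25u.p403.2.mp4'.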
def sina_download_by_vid(vid, title=None, info_only=False, **kwargs):
    """Downloads a Sina video by its unique vid.
    http://video.sina.com.cn/
    """
    xml = api_req(vid)
    urls, name, size = video_info(xml)
    if urls is None:
        log.wtf(name)
    title = name
    print_info(site_info, title, 'flv', size)
    if not info_only:
        download_urls(urls, title, 'flv', size, **kwargs)
def get_vid_from_url(self, url):
    """Extracts video ID from live.qq.com.
    """
    hit = re.search(r'live\.qq\.com/(\d+)', url)
    if hit is not None:
        return hit.group(1)
    hit = re.search(r'live\.qq\.com/directory/match/(\d+)', url)
    if hit is not None:
        return self.get_room_id_from_url(hit.group(1))
    html = get_content(url)
    room_id = match1(html, r'room_id\":(\d+)')
    if room_id is None:
        log.wtf('Unknown page {}'.format(url))
    return room_id
def pixnet_download(url, info_only=False, **kwargs):
    if not re.match(r'http://(\w)+.pixnet.net/album/video/(\d)+', url):
        log.wtf('[Failed] Unsupported URL pattern.')
        return
    # http://eric6513.pixnet.net/album/video/206644535
    html = get_content(url)
    title = ''.join(match1(
        html, r'<meta property="og:description" content="([^"]*)"'
    ).split('-')[1:]).strip()

    time_now = int(time())

    m = re.match(r'http://(\w+).pixnet.net/album/video/(\d+)', url)
    username = m.group(1)  # eric6513
    _id = m.group(2)  # 206644535

    data_dict = {
        'username': username,
        'autoplay': 1,
        'id': _id,
        'loop': 0,
        'profile': 9,
        'time': time_now,
    }
    # have to be like this
    data_dict_str = quote(str(data_dict).replace("'", '"'), safe='"')
    url2 = 'http://api.pixnet.tv/content?type=json&customData={}'.format(
        data_dict_str
    )
    # &sig=edb07258e6a9ff40e375e11d30607983 can be blank for now
    # if required, can be obtained from url like
    # http://s.ext.pixnet.tv/user/eric6513/html5/autoplay/206644507.js
    # http://api.pixnet.tv/content?type=json&customData={%22username%22:%22eric6513%22,%22id%22:%22206644535%22,%22time%22:1441823350,%22autoplay%22:0,%22loop%22:0,%22profile%22:7}

    video_json = get_content(url2)
    content = json.loads(video_json)
    url_main = content['element']['video_url']
    url_backup = content['element']['backup_video_uri']

    try:
        # In some rare cases the main URL is IPv6 only...
        # Something like #611
        url_info(url_main)
        url = url_main
    except Exception:
        url = url_backup

    _type, ext, size = url_info(url)
    print_info(site_info, title, _type, size)
    if not info_only:
        download_urls([url], title, ext, size, **kwargs)
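# The str()/replace/quote dance above hand-builds a JSON string. A sketch of
# an arguably clearer equivalent -- assuming the endpoint accepts any valid
# JSON encoding, which the "have to be like this" comment above does not
# guarantee:
#
#     data_dict_str = quote(json.dumps(data_dict), safe='"')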
def funshion_download(url, **kwargs):
    vid_match = re.match(r'https?://www\.fun\.tv/vplay/v-(\w+)', url)
    ep_match = re.match(r'https?://www\.fun\.tv/vplay/.*g-(\w+)', url)
    if vid_match:
        vid = vid_match.group(1)
        Funshion().download_by_vid(vid, single_video=True, **kwargs)
    elif ep_match:
        epid = ep_match.group(1)
        url = ('http://pm.funshion.com/v5/media/episode?id={}&cl=mweb&'
               'uc=111'.format(epid))
        meta = json.loads(get_content(url))
        drama_name = meta['name']

        extractor = Funshion()
        for ep in meta['episodes']:
            title = '{}_{}_{}'.format(drama_name, ep['num'], ep['name'])
            extractor.download_by_vid(ep['id'], title=title, **kwargs)
    else:
        log.wtf('Unknown url pattern')
def vc_entry(self, **kwargs):
    vc_id = match1(self.url, r'video/(\d+)') \
        or match1(self.url, r'vcdetail\?vc=(\d+)')
    if not vc_id:
        log.wtf('Unknown url pattern')
    endpoint = (
        'https://api.vc.bilibili.com/clip/v1/video/detail?video_id={}'
        '&need_playurl=1'.format(vc_id))
    vc_meta = json.loads(get_content(endpoint, headers=FAKE_HEADERS))
    if vc_meta['code'] != 0:
        log.wtf('{}\n{}'.format(vc_meta['msg'], vc_meta['message']))
    item = vc_meta['data']['item']
    self.title = item['description']

    self.streams['vc'] = {}
    self.streams['vc']['src'] = [item['video_playurl']]
    self.streams['vc']['container'] = 'mp4'
    self.streams['vc']['size'] = int(item['video_size'])
def prepare(self, **kwargs):
    self.page = get_content(self.url)
    if self.vid is None:
        self.vid = self.get_vid_from_url()
    self.title = self.get_title()

    meta = json.loads(get_content(self.__class__.ep.format(self.vid)))
    if meta['code'] != 200:
        log.wtf(meta['message'])

    for video in meta['result']['videos']:
        height = video['height']
        url = self.__class__.cdn + video['key']
        stream_meta = dict(m3u8_url=url, size=0, container='mp4')
        video_profile = '{}x{}'.format(video['width'], video['height'])
        stream_meta['video_profile'] = video_profile

        for stream_type in self.__class__.stream_types:
            if height // 10 == int(stream_type['id'][:-1]) // 10:
                # width 481, 482... 489 are all 480p here
                stream_id = stream_type['id']
                self.streams[stream_id] = stream_meta
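# Bucket arithmetic sketch for the height match above (values hypothetical):
# a 486-pixel-high video gives 486 // 10 == 48, and for stream id '480p',
# int('480p'[:-1]) // 10 == 48 as well -- so heights 480..489 all land in
# the '480p' bucket.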
def download_playlist_by_url(self, url, **kwargs):
    self.url = url
    playlist_id = self.__class__.get_playlist_id_from_url(self.url)
    if playlist_id is None:
        log.wtf('[Failed] Unsupported URL pattern.')

    video_page = get_content(
        'https://www.youtube.com/playlist?list={}'.format(playlist_id))
    from html.parser import HTMLParser
    videos = sorted([
        HTMLParser().unescape(video)
        for video in re.findall(r'<a href="(/watch\?[^"]+)"', video_page)
        if parse_query_param(video, 'index')
    ], key=lambda video: parse_query_param(video, 'index'))

    # Parse browse_ajax page for more videos to load
    load_more_href = match1(video_page, r'data-uix-load-more-href="([^"]+)"')
    while load_more_href:
        browse_ajax = get_content(
            'https://www.youtube.com/{}'.format(load_more_href))
        browse_data = json.loads(browse_ajax)
        load_more_widget_html = browse_data['load_more_widget_html']
        content_html = browse_data['content_html']
        vs = set(re.findall(r'href="(/watch\?[^"]+)"', content_html))
        videos += sorted([
            HTMLParser().unescape(video)
            for video in list(vs)
            if parse_query_param(video, 'index')
        ])
        load_more_href = match1(load_more_widget_html,
                                r'data-uix-load-more-href="([^"]+)"')

    self.title = re.search(r'<meta name="title" content="([^"]+)"',
                           video_page).group(1)
    self.p_playlist()
    for video in videos:
        vid = parse_query_param(video, 'v')
        index = parse_query_param(video, 'index')
        self.__class__().download_by_url(
            self.__class__.get_url_from_vid(vid), index=index, **kwargs)
def prepare(self, **kwargs):
    if self.vid is None:
        hit = re.search(self.__class__.mobile_pt, self.url)
        self.vid = (hit.group(1), hit.group(2))

    ep_url = self.__class__.ep.format(self.vid[0], self.vid[1])
    meta = json.loads(get_content(ep_url))
    if meta['code'] != 0:
        log.wtf(meta['message']['errormsg'])

    file_path = self.__class__.file_host.format(meta['data']['file_path'])
    self.title = meta['data']['name']
    duration = str(meta['data']['duration']) + 's'
    self.streams['_default'] = {
        'src': [file_path],
        'video_profile': duration,
        'container': 'm4a'
    }
def prepare(self, vid='', title=None, **kwargs):
    assert vid

    api_url = self.API_ENDPOINT + \
        'servlet/playinfo?vid={vid}&m=0'.format(vid=vid)  # return XML
    html = get_content(api_url)
    self.tree = ET.ElementTree(ET.fromstring(html))

    if self.tree.find('result').text != '1':
        log.wtf('API result says failed!')  # log.wtf() exits

    if title is None:
        self.title = '_'.join([
            i.text for i in self.tree.iterfind(
                'video/videomarks/videomark/markdesc')
        ])
    else:
        self.title = title
    if not self.title:
        # fall back to the vid when no usable title was found
        self.title = vid

    for i in self.tree.iterfind('video/quality'):
        quality = i.attrib['value']
        url = i[0].attrib['playurl']
        self.stream_types.append({
            'id': quality,
            'video_profile': i.attrib['desp']
        })
        self.streams[quality] = {
            'url': url,
            'video_profile': i.attrib['desp']
        }

    self.streams_sorted = [
        dict([('id', stream_type['id'])] +
             list(self.streams[stream_type['id']].items()))
        for stream_type in self.__class__.stream_types
        if stream_type['id'] in self.streams
    ]
def sina_zxt(url, info_only=False, **kwargs):
    ep = 'http://s.video.sina.com.cn/video/play?video_id='
    frag = urllib.parse.urlparse(url).fragment
    if not frag:
        log.wtf('No video specified with fragment')
    meta = json.loads(get_content(ep + frag))
    if meta['code'] != 1:
        # Yes, they use 1 for success.
        log.wtf(meta['message'])
    title = meta['data']['title']
    videos = sorted(meta['data']['videos'], key=lambda i: int(i['size']))
    if len(videos) == 0:
        log.wtf('No video file returned by API server')

    vid = videos[-1]['file_id']
    container = videos[-1]['type']
    size = int(videos[-1]['size'])

    if container == 'hlv':
        container = 'flv'

    urls, _, _ = video_info(api_req(vid))
    print_info(site_info, title, container, size)
    if not info_only:
        download_urls(urls, title, container, size, **kwargs)
    return
def prepare(self, **kwargs):
    page = get_content(self.url)
    server_data = re.search(r'window\.__NUXT__=({.+?});', page)
    if server_data is None:
        log.wtf('cannot find server_data')
    json_data = json.loads(server_data.group(1))['state']
    live_info = json_data['live-info']['liveInfo']

    self.title = '{}_{}'.format(
        json_data['anchor-info']['anchorInfo']['nickName'],
        live_info['videoInfo']['title']
    )

    for existing_stream in live_info['videoInfo']['streamInfos']:
        for s in self.__class__.stream_types:
            if s['video_profile'] == existing_stream['bitrate']:
                current_stream_id = s['id']
                stream_info = dict(
                    src=[unescape(existing_stream['playUrl'])]
                )
                stream_info['video_profile'] = existing_stream['desc']
                stream_info['container'] = s['container']
                stream_info['size'] = float('inf')
                self.streams[current_stream_id] = stream_info
def download(self, **kwargs): """Override the original one Ugly ugly dirty hack """ if 'json_output' in kwargs and kwargs['json_output']: json_output.output(self) elif 'info_only' in kwargs and kwargs['info_only']: if 'stream_id' in kwargs and kwargs['stream_id']: # Display the stream stream_id = kwargs['stream_id'] if 'index' not in kwargs: self.p(stream_id) else: self.p_i(stream_id) else: # Display all available streams if 'index' not in kwargs: self.p([]) else: stream_id = self.streams_sorted[0]['id'] \ if 'id' in self.streams_sorted[0] \ else self.streams_sorted[0]['itag'] self.p_i(stream_id) else: if 'stream_id' in kwargs and kwargs['stream_id']: # Download the stream stream_id = kwargs['stream_id'] else: # Download stream with the best quality stream_id = self.streams_sorted[0]['id'] \ if 'id' in self.streams_sorted[0] \ else self.streams_sorted[0]['itag'] if 'index' not in kwargs: self.p(stream_id) else: self.p_i(stream_id) if stream_id in self.streams: urls = self.streams[stream_id]['src'] # ext = self.streams[stream_id]['container'] # total_size = self.streams[stream_id]['size'] else: urls = self.dash_streams[stream_id]['src'] # ext = self.dash_streams[stream_id]['container'] # total_size = self.dash_streams[stream_id]['size'] if not urls: log.wtf('[Failed] Cannot extract video source.') # Here's the change! download_url_ffmpeg(urls[0], self.title, 'mp4', output_dir=kwargs['output_dir'], merge=kwargs['merge'], stream=False) if not kwargs['caption']: print('Skipping captions.') return for lang in self.caption_tracks: filename = '{}.{}.srt'.format(get_filename(self.title), lang) print('Saving {} ... '.format(filename), end='', flush=True) srt = self.caption_tracks[lang] with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf-8') as x: x.write(srt) print('Done.')
def prepare(self, **kwargs):
    assert self.url or self.vid

    if not self.vid and self.url:
        self.vid = self.__class__.get_vid_from_url(self.url)

        if self.vid is None:
            self.download_playlist_by_url(self.url, **kwargs)
            exit(0)

    video_info = parse.parse_qs(
        get_content(
            'https://www.youtube.com/get_video_info?video_id={}'.format(
                self.vid)))

    ytplayer_config = None
    if 'status' not in video_info:
        log.wtf('[Failed] Unknown status.')
    elif video_info['status'] == ['ok']:
        if 'use_cipher_signature' not in video_info \
                or video_info['use_cipher_signature'] == ['False']:
            self.title = parse.unquote_plus(video_info['title'][0])
            # Parse video page (for DASH)
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            try:
                ytplayer_config = json.loads(
                    re.search(r'ytplayer\.config\s*=\s*([^\n]+?});',
                              video_page).group(1))
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                # Workaround: get_video_info returns bad s. Why?
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')
            except Exception:
                stream_list = video_info['url_encoded_fmt_stream_map'][
                    0].split(',')
                self.html5player = None
        else:
            # Parse video page instead
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            ytplayer_config = json.loads(
                re.search(r'ytplayer\.config\s*=\s*([^\n]+?});',
                          video_page).group(1))
            self.title = ytplayer_config['args']['title']
            self.html5player = 'https://www.youtube.com{}'.format(
                ytplayer_config['assets']['js'])
            stream_list = ytplayer_config['args'][
                'url_encoded_fmt_stream_map'].split(',')
    elif video_info['status'] == ['fail']:
        if video_info['errorcode'] == ['150']:
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            try:
                ytplayer_config = json.loads(
                    re.search(r'ytplayer\.config\s*=\s*([^\n]+});ytplayer',
                              video_page).group(1))
            except Exception:
                msg = re.search(r'class="message">([^<]+)<',
                                video_page).group(1)
                log.wtf('[Failed] "%s"' % msg.strip())

            if 'title' in ytplayer_config['args']:
                # 150 Restricted from playback on certain sites
                # Parse video page instead
                self.title = ytplayer_config['args']['title']
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')
            else:
                log.wtf('[Error] The uploader has not made this video '
                        'available in your country.')
        elif video_info['errorcode'] == ['100']:
            log.wtf('[Failed] This video does not exist.',
                    exit_code=int(video_info['errorcode'][0]))
        else:
            log.wtf('[Failed] %s' % video_info['reason'][0],
                    exit_code=int(video_info['errorcode'][0]))
    else:
        log.wtf('[Failed] Invalid status.')

    # YouTube Live
    if ytplayer_config and (
            ytplayer_config['args'].get('livestream') == '1'
            or ytplayer_config['args'].get('live_playback') == '1'):
        hlsvp = ytplayer_config['args']['hlsvp']
        if 'info_only' in kwargs and kwargs['info_only']:
            return
        else:
            download_url_ffmpeg(hlsvp, self.title, 'mp4')
            exit(0)

    for stream in stream_list:
        metadata = parse.parse_qs(stream)
        stream_itag = metadata['itag'][0]
        self.streams[stream_itag] = {
            'itag': metadata['itag'][0],
            'url': metadata['url'][0],
            'sig': metadata['sig'][0] if 'sig' in metadata else None,
            's': metadata['s'][0] if 's' in metadata else None,
            'quality': metadata['quality'][0],
            'type': metadata['type'][0],
            'mime': metadata['type'][0].split(';')[0],
            'container': mime_to_container(
                metadata['type'][0].split(';')[0]),
        }

    # Prepare caption tracks
    try:
        caption_tracks = json.loads(
            ytplayer_config['args']['player_response']
        )['captions']['playerCaptionsTracklistRenderer']['captionTracks']
        for ct in caption_tracks:
            ttsurl, lang = ct['baseUrl'], ct['languageCode']
            tts_xml = parseString(get_content(ttsurl))
            transcript = tts_xml.getElementsByTagName('transcript')[0]
            texts = transcript.getElementsByTagName('text')
            srt = ''
            seq = 0
            for text in texts:
                if text.firstChild is None:
                    continue  # empty element
                seq += 1
                start = float(text.getAttribute('start'))
                if text.getAttribute('dur'):
                    dur = float(text.getAttribute('dur'))
                else:
                    dur = 1.0  # could be ill-formed XML
                finish = start + dur
                m, s = divmod(start, 60)
                h, m = divmod(m, 60)
                start = '{:0>2}:{:0>2}:{:06.3f}'.format(
                    int(h), int(m), s).replace('.', ',')
                m, s = divmod(finish, 60)
                h, m = divmod(m, 60)
                finish = '{:0>2}:{:0>2}:{:06.3f}'.format(
                    int(h), int(m), s).replace('.', ',')
                content = unescape(text.firstChild.nodeValue)

                srt += '{}\n'.format(str(seq))
                srt += '{} --> {}\n'.format(start, finish)
                srt += '{}\n\n'.format(content)

            self.caption_tracks[lang] = srt
    except Exception:
        pass

    # Prepare DASH streams
    try:
        dashmpd = ytplayer_config['args']['dashmpd']
        dash_xml = parseString(get_content(dashmpd))
        for aset in dash_xml.getElementsByTagName('AdaptationSet'):
            mimeType = aset.getAttribute('mimeType')
            if mimeType == 'audio/mp4':
                rep = aset.getElementsByTagName('Representation')[-1]
                burls = rep.getElementsByTagName('BaseURL')
                dash_mp4_a_url = burls[0].firstChild.nodeValue
                dash_mp4_a_size = burls[0].getAttribute('yt:contentLength')
                if not dash_mp4_a_size:
                    try:
                        dash_mp4_a_size = url_size(dash_mp4_a_url)
                    except Exception:
                        continue
            elif mimeType == 'audio/webm':
                rep = aset.getElementsByTagName('Representation')[-1]
                burls = rep.getElementsByTagName('BaseURL')
                dash_webm_a_url = burls[0].firstChild.nodeValue
                dash_webm_a_size = burls[0].getAttribute('yt:contentLength')
                if not dash_webm_a_size:
                    try:
                        dash_webm_a_size = url_size(dash_webm_a_url)
                    except Exception:
                        continue
            elif mimeType == 'video/mp4':
                for rep in aset.getElementsByTagName('Representation'):
                    w = int(rep.getAttribute('width'))
                    h = int(rep.getAttribute('height'))
                    itag = rep.getAttribute('id')
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_url = burls[0].firstChild.nodeValue
                    dash_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_size:
                        try:
                            dash_size = url_size(dash_url)
                        except Exception:
                            continue
                    self.dash_streams[itag] = {
                        'quality': '{}x{}'.format(w, h),
                        'itag': itag,
                        'type': mimeType,
                        'mime': mimeType,
                        'container': 'mp4',
                        'src': [dash_url, dash_mp4_a_url],
                        'size': int(dash_size) + int(dash_mp4_a_size)
                    }
            elif mimeType == 'video/webm':
                for rep in aset.getElementsByTagName('Representation'):
                    w = int(rep.getAttribute('width'))
                    h = int(rep.getAttribute('height'))
                    itag = rep.getAttribute('id')
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_url = burls[0].firstChild.nodeValue
                    dash_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_size:
                        try:
                            dash_size = url_size(dash_url)
                        except Exception:
                            continue
                    self.dash_streams[itag] = {
                        'quality': '{}x{}'.format(w, h),
                        'itag': itag,
                        'type': mimeType,
                        'mime': mimeType,
                        'container': 'webm',
                        'src': [dash_url, dash_webm_a_url],
                        'size': int(dash_size) + int(dash_webm_a_size)
                    }
    except Exception:
        # VEVO
        if not self.html5player:
            return
        self.js = get_content(self.html5player)
        if 'adaptive_fmts' in ytplayer_config['args']:
            streams = [
                dict([(i.split('=')[0], parse.unquote(i.split('=')[1]))
                      for i in afmt.split('&')])
                for afmt in
                ytplayer_config['args']['adaptive_fmts'].split(',')
            ]
            for stream in streams:  # get over speed limiting
                stream['url'] += '&ratebypass=yes'
            for stream in streams:  # audio
                if stream['type'].startswith('audio/mp4'):
                    dash_mp4_a_url = stream['url']
                    if 's' in stream:
                        sig = self.__class__.decipher(self.js, stream['s'])
                        dash_mp4_a_url += '&signature={}'.format(sig)
                    dash_mp4_a_size = stream['clen']
                elif stream['type'].startswith('audio/webm'):
                    dash_webm_a_url = stream['url']
                    if 's' in stream:
                        sig = self.__class__.decipher(self.js, stream['s'])
                        dash_webm_a_url += '&signature={}'.format(sig)
                    dash_webm_a_size = stream['clen']
            for stream in streams:  # video
                if 'size' in stream:
                    if stream['type'].startswith('video/mp4'):
                        mimeType = 'video/mp4'
                        dash_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(
                                self.js, stream['s'])
                            dash_url += '&signature={}'.format(sig)
                        dash_size = stream['clen']
                        itag = stream['itag']
                        self.dash_streams[itag] = {
                            'quality': stream['size'],
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'mp4',
                            'src': [dash_url, dash_mp4_a_url],
                            'size': int(dash_size) + int(dash_mp4_a_size)
                        }
                    elif stream['type'].startswith('video/webm'):
                        mimeType = 'video/webm'
                        dash_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(
                                self.js, stream['s'])
                            dash_url += '&signature={}'.format(sig)
                        dash_size = stream['clen']
                        itag = stream['itag']
                        self.dash_streams[itag] = {
                            'quality': stream['size'],
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'webm',
                            'src': [dash_url, dash_webm_a_url],
                            'size': int(dash_size) + int(dash_webm_a_size)
                        }
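# --- illustration only, not part of the extractor ---
# A self-contained sketch of the SRT timestamp conversion used in the
# caption loop above; the helper name _srt_timestamp is hypothetical.
def _srt_timestamp(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    # '{:06.3f}' zero-pads the seconds field (e.g. '01.500');
    # SRT uses a comma as the decimal separator
    return '{:0>2}:{:0>2}:{:06.3f}'.format(
        int(h), int(m), s).replace('.', ',')


if __name__ == '__main__':
    # 3661.5 s == 1 h, 1 min, 1.5 s
    assert _srt_timestamp(3661.5) == '01:01:01,500'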
def download(self, **kwargs):
    if 'json_output' in kwargs and kwargs['json_output']:
        json_output.output(self)
    elif 'info_only' in kwargs and kwargs['info_only']:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Display the stream
            stream_id = kwargs['stream_id']
            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)
        else:
            # Display all available streams
            if 'index' not in kwargs:
                self.p([])
            else:
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']
                self.p_i(stream_id)
    else:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Download the stream
            stream_id = kwargs['stream_id']
        else:
            # Download stream with the best quality
            stream_id = self.streams_sorted[0]['id'] \
                if 'id' in self.streams_sorted[0] \
                else self.streams_sorted[0]['itag']

        if 'index' not in kwargs:
            self.p(stream_id)
        else:
            self.p_i(stream_id)

        if stream_id in self.streams:
            urls = self.streams[stream_id]['src']
            ext = self.streams[stream_id]['container']
            total_size = self.streams[stream_id]['size']
        else:
            urls = self.dash_streams[stream_id]['src']
            ext = self.dash_streams[stream_id]['container']
            total_size = self.dash_streams[stream_id]['size']
        if not urls:
            log.wtf('[Failed] Cannot extract video source.')

        if ext == 'm3u8':
            ffmpeg_kwargs = {}
            if 'iqiyi' in self.name:
                # ffmpeg_kwargs['override'] = True
                # ffmpeg_kwargs['params'] = {
                #     '-c:a': 'copy', '-bsf:a': 'aac_adtstoasc'
                # }
                m3u8_urls = general_m3u8_extractor(urls[0])
                # FIXME(iawia002): computing the total size here costs
                # too much time
                if len(m3u8_urls) <= 100:
                    size = urls_size(m3u8_urls)
                else:
                    size = float('inf')
                download_urls(m3u8_urls, self.title, 'mp4', size, **kwargs)
            else:
                download_url_ffmpeg(urls[0], self.title, 'mp4',
                                    output_dir=kwargs['output_dir'],
                                    merge=kwargs['merge'],
                                    stream=False, **ffmpeg_kwargs)
        else:
            headers = copy(config.FAKE_HEADERS)
            if self.ua is not None:
                headers['User-Agent'] = self.ua
            if self.referer is not None:
                headers['Referer'] = self.referer
            download_urls(urls, self.title, ext, total_size,
                          headers=headers,
                          output_dir=kwargs['output_dir'],
                          merge=kwargs['merge'],
                          av=stream_id in self.dash_streams)

        if 'caption' not in kwargs or not kwargs['caption']:
            print('Skipping captions or danmuku.')
            return

        for lang in self.caption_tracks:
            filename = '%s.%s.srt' % (get_filename(self.title), lang)
            print('Saving %s ... ' % filename, end="", flush=True)
            srt = self.caption_tracks[lang]
            with open(os.path.join(kwargs['output_dir'], filename), 'w',
                      encoding='utf-8') as x:
                x.write(srt)
            print('Done.')

        if self.danmuku is not None and not dry_run:
            filename = '{}.cmt.xml'.format(get_filename(self.title))
            print('Downloading {} ...\n'.format(filename))
            with open(os.path.join(kwargs['output_dir'], filename), 'w',
                      encoding='utf8') as fp:
                fp.write(self.danmuku)

    keep_obj = kwargs.get('keep_obj', False)
    if not keep_obj:
        self.__init__()
def get_vid_from_url(self):
    hit = re.search(self.__class__.vid_patt, self.page)
    if hit is None:
        log.wtf('Cannot get stream_id')
    return hit.group(1)
def prepare(self, **kwargs):
    assert self.url or self.vid

    if self.url and not self.vid:
        self.get_vid_from_url()

        if self.vid is None:
            self.get_vid_from_page()

            if self.vid is None:
                log.wtf('Cannot fetch vid')

    if kwargs.get('password') and kwargs['password']:
        self.password_protected = True
        self.password = kwargs['password']

    self.utid = fetch_cna()
    time.sleep(3)
    self.youku_ups()

    if self.api_data.get('stream') is None:
        if self.api_error_code == -6001:  # wrong vid parsed from the page
            vid_from_url = self.vid
            self.get_vid_from_page()
            if vid_from_url == self.vid:
                log.wtf(self.api_error_msg)
            self.youku_ups()

    if self.api_data.get('stream') is None:
        if self.api_error_code == -2002:  # wrong password
            self.password_protected = True
            # it can be True already (from cli);
            # offer another chance to retry
            self.password = input(log.sprint('Password: ', log.YELLOW))
            self.youku_ups()

    if self.api_data.get('stream') is None:
        if self.api_error_msg:
            log.wtf(self.api_error_msg)
        else:
            log.wtf('Unknown error')

    self.title = self.api_data['video']['title']
    stream_types = dict([(i['id'], i) for i in self.stream_types])
    audio_lang = self.api_data['stream'][0]['audio_lang']

    for stream in self.api_data['stream']:
        stream_id = stream['stream_type']
        is_preview = False
        if stream_id in stream_types \
                and stream['audio_lang'] == audio_lang:
            if 'alias-of' in stream_types[stream_id]:
                stream_id = stream_types[stream_id]['alias-of']

            if stream_id not in self.streams:
                self.streams[stream_id] = {
                    'container': stream_types[stream_id]['container'],
                    'video_profile':
                        stream_types[stream_id]['video_profile'],
                    'size': stream['size'],
                    'pieces': [{
                        'segs': stream['segs']
                    }],
                    'm3u8_url': stream['m3u8_url']
                }
                src = []
                for seg in stream['segs']:
                    if seg.get('cdn_url'):
                        src.append(
                            self.__class__.change_cdn(seg['cdn_url']))
                    else:
                        is_preview = True
                self.streams[stream_id]['src'] = src
            else:
                self.streams[stream_id]['size'] += stream['size']
                self.streams[stream_id]['pieces'].append(
                    {'segs': stream['segs']})
                src = []
                for seg in stream['segs']:
                    if seg.get('cdn_url'):
                        src.append(
                            self.__class__.change_cdn(seg['cdn_url']))
                    else:
                        is_preview = True
                self.streams[stream_id]['src'].extend(src)
        if is_preview:
            log.w('{} is a preview'.format(stream_id))

    # Audio languages
    if 'dvd' in self.api_data:
        al = self.api_data['dvd'].get('audiolang')
        if al:
            self.audiolang = al
            for i in self.audiolang:
                i['url'] = 'http://v.youku.com/v_show/id_{}'.format(
                    i['vid'])
def youku_download_playlist_by_url(url, **kwargs):
    video_page_pt = r'https?://v\.youku\.com/v_show/id_([A-Za-z0-9=]+)'
    js_cb_pt = r'\(({.+})\)'
    if re.match(video_page_pt, url):
        youku_obj = Youku()
        youku_obj.url = url
        youku_obj.prepare(**kwargs)

        total_episode = None
        try:
            total_episode = youku_obj.api_data['show']['episode_total']
        except KeyError:
            log.wtf('Cannot get total_episode for {}'.format(url))
        next_vid = youku_obj.vid
        for _ in range(total_episode):
            this_extractor = Youku()
            this_extractor.download_by_vid(next_vid, keep_obj=True, **kwargs)
            next_vid = this_extractor.video_next['encodevid']
        '''
        if youku_obj.video_list is None:
            log.wtf('Cannot find video list for {}'.format(url))
        else:
            vid_list = [v['encodevid'] for v in youku_obj.video_list]
            for v in vid_list:
                Youku().download_by_vid(v, **kwargs)
        '''
    elif re.match(r'https?://list\.youku\.com/show/id_', url):
        # http://list.youku.com/show/id_z2ae8ee1c837b11e18195.html
        # official playlist
        page = get_content(url)
        show_id = re.search(r'showid:"(\d+)"', page).group(1)
        ep = ('http://list.youku.com/show/module?id={}&tab=showInfo&'
              'callback=jQuery'.format(show_id))
        xhr_page = get_content(ep).replace('\\/', '/').replace('\\"', '"')
        video_url = re.search(
            r'(v\.youku\.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)',
            xhr_page).group(1)
        youku_download_playlist_by_url('http://' + video_url, **kwargs)
        return
    elif re.match(r'https?://list\.youku\.com/albumlist/show/id_(\d+)\.html',
                  url):
        # http://list.youku.com/albumlist/show/id_2336634.html
        # UGC playlist
        list_id = re.search(
            r'https?://list\.youku\.com/albumlist/show/id_(\d+)\.html',
            url).group(1)
        ep = ('http://list.youku.com/albumlist/items?id={}&page={}&size=20&'
              'ascending=1&callback=tuijsonp6')

        first_u = ep.format(list_id, 1)
        xhr_page = get_content(first_u)
        json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1))
        video_cnt = json_data['data']['total']
        xhr_html = json_data['html']

        v_urls = re.findall(
            r'(v\.youku\.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_html)

        if video_cnt > 20:
            req_cnt = video_cnt // 20
            for i in range(2, req_cnt + 2):
                req_u = ep.format(list_id, i)
                xhr_page = get_content(req_u)
                json_data = json.loads(
                    re.search(js_cb_pt, xhr_page).group(1).replace('\\/',
                                                                   '/'))
                xhr_html = json_data['html']
                page_videos = re.findall(
                    r'(v\.youku\.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)',
                    xhr_html)
                v_urls.extend(page_videos)

        # every other match: each link appears twice in the markup
        for u in v_urls[0::2]:
            url = 'http://' + u
            Youku().download_by_url(url, **kwargs)
        return
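# Pagination sketch for the album list above (values hypothetical): with
# video_cnt == 45 and 20 items per page, req_cnt == 45 // 20 == 2, so pages
# 2 and 3 are fetched after the initial request -- three pages covering all
# 45 entries.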
def download(self, **kwargs):
    if 'json_output' in kwargs and kwargs['json_output']:
        json_output.output(self)
    elif 'info_only' in kwargs and kwargs['info_only']:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Display the stream
            stream_id = kwargs['stream_id']
            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)
        else:
            # Display all available streams
            if 'index' not in kwargs:
                self.p([])
            else:
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']
                self.p_i(stream_id)
    else:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Download the stream
            stream_id = kwargs['stream_id']
        else:
            # Download stream with the best quality
            stream_id = self.streams_sorted[0]['id'] \
                if 'id' in self.streams_sorted[0] \
                else self.streams_sorted[0]['itag']

        if 'index' not in kwargs:
            self.p(stream_id)
        else:
            self.p_i(stream_id)

        if stream_id in self.streams:
            urls = self.streams[stream_id]['src']
            ext = self.streams[stream_id]['container']
            total_size = self.streams[stream_id]['size']
        else:
            urls = self.dash_streams[stream_id]['src']
            ext = self.dash_streams[stream_id]['container']
            total_size = self.dash_streams[stream_id]['size']

        if ext == 'm3u8':
            ext = 'mp4'

        if not urls:
            log.wtf('[Failed] Cannot extract video source.')

        # For legacy main()
        headers = copy(config.FAKE_HEADERS)
        if self.ua is not None:
            headers['User-Agent'] = self.ua
        if self.referer is not None:
            headers['Referer'] = self.referer
        download_urls(
            urls, self.title, ext, total_size, headers=headers,
            output_dir=kwargs['output_dir'], merge=kwargs['merge'],
            av=stream_id in self.dash_streams
        )

        if 'caption' not in kwargs or not kwargs['caption']:
            print('Skipping captions or danmuku.')
            return

        for lang in self.caption_tracks:
            filename = '%s.%s.srt' % (get_filename(self.title), lang)
            print('Saving %s ... ' % filename, end="", flush=True)
            srt = self.caption_tracks[lang]
            with open(
                    os.path.join(kwargs['output_dir'], filename),
                    'w', encoding='utf-8'
            ) as x:
                x.write(srt)
            print('Done.')

        if self.danmuku is not None and not dry_run:
            filename = '{}.cmt.xml'.format(get_filename(self.title))
            print('Downloading {} ...\n'.format(filename))
            with open(
                    os.path.join(kwargs['output_dir'], filename),
                    'w', encoding='utf8'
            ) as fp:
                fp.write(self.danmuku)

        # For main_dev()
        # download_urls(
        #     urls, self.title, self.streams[stream_id]['container'],
        #     self.streams[stream_id]['size']
        # )

    keep_obj = kwargs.get('keep_obj', False)
    if not keep_obj:
        self.__init__()