def zhanqi_live(room_id, merge=True, output_dir='.', info_only=False, **kwargs):
    """Download a Zhanqi TV live stream by room id.

    Queries the room metadata API, decodes the base64-encoded flashvars
    to find the master m3u8 playlist, and hands it to ffmpeg.
    Raises Exception when the room is not live (status != '4').
    """
    api_url = (
        'https://www.zhanqi.tv/api/static/v2.2/room/domain/{}.json'.format(
            room_id))
    room = json.loads(get_content(api_url))['data']

    # Status '4' is the only value observed for an online stream.
    if room['status'] != '4':
        raise Exception('The live stream is not online!')

    title = '{}:{}'.format(room['nickname'], room['title'])

    # The playlist location is hidden inside base64-encoded flashvars.
    levels_json = base64.b64decode(
        room['flashvars']['VideoLevels']).decode('utf8')
    m3u8_url = json.loads(levels_json)['streamUrl']

    print_info(site_info, title, 'm3u8', 0, m3u8_url=m3u8_url,
               m3u8_type='master')
    if info_only:
        return
    download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir,
                        merge=merge)
def douyutv_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    """Download a Douyu live stream (recorded shows are delegated to
    douyutv_video_download).

    Resolves the room id from the mobile page, signs the API request
    with the known MD5 salt, and streams the resulting RTMP URL via
    ffmpeg.  Raises ValueError when the API reports an error or the
    room is offline.
    """
    # Recorded shows use a different endpoint/extractor.
    if 'v.douyu.com/show/' in url:
        douyutv_video_download(url, output_dir=output_dir, merge=merge,
                               info_only=info_only, **kwargs)
        return

    # BUG FIX: the dot in 'douyu.com' was unescaped and matched any char.
    url = re.sub(r'[\w.]*douyu\.com', 'm.douyu.com', url)
    html = get_content(url)
    room_id_patt = r'room_id\s*:\s*(\d+),'
    room_id = match1(html, room_id_patt)
    if room_id == '0':
        # Mobile page did not expose the id; fall back to the URL tail.
        room_id = url[url.rfind('/') + 1:]

    api_url = 'http://www.douytv.com/api/v1/'.replace('douytv', 'douyutv')
    args = 'room/{}?aid=wp&client_sys=wp&time={}'.format(
        room_id, int(time.time()))
    # The API expects auth = md5(query + fixed salt).
    auth_md5 = (args + 'zNzMV1y4EMxOHS6I5WKm').encode('utf-8')
    auth_str = hashlib.md5(auth_md5).hexdigest()
    json_request_url = '{}{}&auth={}'.format(api_url, args, auth_str)

    content = get_content(json_request_url)
    json_content = json.loads(content)
    data = json_content['data']
    server_status = json_content.get('error', 0)
    # BUG FIX: was `is not 0` (identity comparison against a literal,
    # which only works by CPython interning accident); use equality.
    if server_status != 0:
        raise ValueError('Server returned error: {}'.format(server_status))

    title = data.get('room_name')
    show_status = data.get('show_status')
    # BUG FIX: was `is not '1'` (string identity, never reliable).
    if show_status != '1':
        raise ValueError(
            'The live stream is not online! (Errno: {})'.format(server_status))

    real_url = '{}/{}'.format(data.get('rtmp_url'), data.get('rtmp_live'))
    print_info(site_info, title, 'flv', float('inf'))
    if not info_only:
        download_url_ffmpeg(real_url, title, 'flv', None,
                            output_dir=output_dir, merge=merge)
def metacafe_download(url, info_only=False, **kwargs):
    """Download a Metacafe video.

    Only acts on URLs matching http://www.metacafe.com/watch/...; the
    embedded JSON video data supplies the master m3u8 playlist.
    """
    # Guard clause: silently ignore URLs that are not watch pages.
    if not re.match(r'http://www.metacafe.com/watch/\w+', url):
        return

    page = get_content(url)
    title = match1(page, r'<meta property="og:title" content="([^"]*)"')
    raw_json = match1(
        page,
        r"<script type='text/json' id='json_video_data'>(.+)</script>"
    )
    video_data = json.loads(raw_json)
    m3u8_url = video_data['sources'][0]['src']

    print_info(
        site_info, title, 'm3u8', 0, m3u8_url=m3u8_url, m3u8_type='master'
    )
    if not info_only:
        download_url_ffmpeg(m3u8_url, title, 'mp4', **kwargs)
def huomaotv_download(
    url, output_dir='.', merge=True, info_only=False, **kwargs
):
    """Download a Huomao live stream.

    Extracts the room id from the URL, scrapes the mobile live page for
    the <source src> m3u8 URL and the page title, then streams via
    ffmpeg.
    """
    room_id = match1(url, r'huomao.com/(\d+)')
    page = get_content(
        'http://m.huomao.com/mobile/mob_live/{}'.format(room_id)
    )
    dom = get_parser(page)
    m3u8_url = dom.source['src']
    title = dom.title.text

    print_info(site_info, title, 'm3u8', float('inf'))
    if info_only:
        return
    download_url_ffmpeg(
        m3u8_url, title, 'mp4', None, output_dir=output_dir, merge=merge
    )
def showroom_download_by_room_id(room_id, info_only=False, **kwargs):
    '''Source: Android mobile

    Polls the streaming_url API until the room is live, picks the
    default HLS entry, resolves a title from the profile API, and
    streams via ffmpeg.
    '''
    # Keep polling until the API returns a non-empty payload.
    while True:
        ts = str(int(time() * 1000))
        endpoint = (
            'https://www.showroom-live.com/api/live/streaming_url?room_id='
            '{room_id}&_={timestamp}'.format(room_id=room_id, timestamp=ts))
        payload = json.loads(get_content(endpoint))
        if len(payload) >= 1:
            break
        log.w('The live show is currently offline.')
        sleep(1)

    # This is mainly for testing the M3U FFmpeg parser so I would ignore
    # any non-m3u ones
    hls_entries = [
        entry['url']
        for entry in payload['streaming_url_list']
        if entry['is_default'] and entry['type'] == 'hls'
    ]
    stream_url = hls_entries[0]
    assert stream_url

    # Resolve a human-readable title; fall back to the room id.
    profile_api = ('https://www.showroom-live.com/api/room/profile?room_id='
                   '{room_id}'.format(room_id=room_id))
    profile = json.loads(get_content(profile_api))
    try:
        title = profile['main_name']
    except KeyError:
        title = 'Showroom_{room_id}'.format(room_id=room_id)

    type_, ext, size = url_info(stream_url)
    print_info(site_info, title, type_, size)
    if not info_only:
        download_url_ffmpeg(url=stream_url, title=title, ext='mp4', **kwargs)
def universal_download(url, output_dir='.', merge=True, info_only=False,
                       **kwargs):
    """Best-effort downloader for arbitrary URLs.

    For HTML pages: first try embedded-player extraction, then scan the
    page for HLS playlists, common media-file URLs (plain, URL-encoded
    and JS-escaped), image hrefs, and MPEG-DASH MPD references, and
    download every plausible candidate.  For non-HTML content types,
    download the URL directly.
    """
    # HEAD may be rejected by some servers; retry with GET.
    try:
        content_type = get_head(url, headers=FAKE_HEADERS)['Content-Type']
    except Exception:
        content_type = get_head(
            url, headers=FAKE_HEADERS, get_method='GET')['Content-Type']
    if content_type.startswith('text/html'):
        # Try to extract an embedded player first; on success we're done.
        try:
            embed_download(url, output_dir=output_dir, merge=merge,
                           info_only=info_only, **kwargs)
        except Exception:
            pass
        else:
            return

    # Derive a short site label from the host name (drop one subdomain).
    domains = url.split('/')[2].split('.')
    if len(domains) > 2:
        domains = domains[1:]
    site_info = '.'.join(domains)

    if content_type.startswith('text/html'):
        # extract an HTML page
        response = get_content(url)
        page = str(response)

        page_title = match1(page, r'<title>([^<]*)')
        if page_title:
            page_title = unescape_html(page_title)

        # HLS playlists win outright: stream each via ffmpeg and stop.
        hls_urls = re.findall(
            r'(https?://[^;"\'\\]+' + r'\.m3u8?' + r'[^;"\'\\]*)', page)
        if hls_urls:
            for hls_url in hls_urls:
                type_, ext, size = url_info(hls_url)
                print_info(site_info, page_title, type_, size)
                if not info_only:
                    download_url_ffmpeg(url=hls_url, title=page_title,
                                        ext='mp4', output_dir=output_dir)
            return

        # most common media file extensions on the Internet
        # (raw strings: the originals triggered invalid-escape warnings)
        media_exts = [
            r'\.flv', r'\.mp3', r'\.mp4', r'\.webm',
            r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g',  # tumblr
            r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g',
            r'[-_][6-9]\d\dx1\d\d\d\.jpe?g',
            r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g',
            r's1600/[\w%]+\.jpe?g',  # blogger
            r'img[6-9]\d\d/[\w%]+\.jpe?g',  # oricon?
        ]

        urls = []
        for i in media_exts:
            # Plain URLs.
            urls += re.findall(
                r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)
            # URL-encoded URLs (https%3A%2F%2F...).
            p_urls = re.findall(
                r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
            urls += [parse.unquote(url) for url in p_urls]
            # JS-escaped URLs (https:\\/\\/...).
            q_urls = re.findall(
                r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
            urls += [url.replace('\\\\/', '/') for url in q_urls]

        # a link href to an image is often an interesting one
        urls += re.findall(r'href="(https?://[^"]+\.jpe?g)"', page, re.I)
        urls += re.findall(r'href="(https?://[^"]+\.png)"', page, re.I)
        urls += re.findall(r'href="(https?://[^"]+\.gif)"', page, re.I)

        # MPEG-DASH MPD
        mpd_urls = re.findall(r'src="(https?://[^"]+\.mpd)"', page)
        for mpd_url in mpd_urls:
            cont = get_content(mpd_url)
            base_url = match1(cont, r'<BaseURL>(.*)</BaseURL>')
            urls += [match1(mpd_url, r'(.*/)[^/]*') + base_url]

        # have some candy!
        candies = []
        i = 1
        for url in set(urls):
            filename = parse.unquote(url.split('/')[-1])
            if 5 <= len(filename) <= 80:
                title = '.'.join(filename.split('.')[:-1])
            else:
                title = '{}'.format(i)
                i += 1
            candies.append({'url': url, 'title': title})

        for candy in candies:
            try:
                mime, ext, size = url_info(candy['url'], faker=True)
                if not size:
                    # BUG FIX: was float('Int'), which raises ValueError
                    # and made the except silently skip zero-size URLs.
                    size = float('inf')
            except Exception:
                continue
            else:
                print_info(site_info, candy['title'], ext, size)
                if not info_only:
                    download_urls([candy['url']], candy['title'], ext,
                                  size, output_dir=output_dir,
                                  merge=merge, faker=True)
        return
    else:
        # direct download
        filename = parse.unquote(url.split('/')[-1])
        title = '.'.join(filename.split('.')[:-1])
        ext = filename.split('.')[-1]
        _, _, size = url_info(url, faker=True)
        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls([url], title, ext, size, output_dir=output_dir,
                          merge=merge, faker=True)
        return
def prepare(self, **kwargs):
    """Populate stream metadata for a YouTube video.

    Fills self.streams (itag-keyed progressive streams),
    self.dash_streams (DASH video/audio pairs) and self.caption_tracks
    (SRT text per language), resolving self.vid from self.url when
    necessary.  URLs without a single video id are handed to
    download_playlist_by_url, and live streams are handed directly to
    ffmpeg; both of those paths terminate the process.
    """
    assert self.url or self.vid

    if not self.vid and self.url:
        self.vid = self.__class__.get_vid_from_url(self.url)
        if self.vid is None:
            # No video id in the URL: treat it as a playlist and stop.
            self.download_playlist_by_url(self.url, **kwargs)
            exit(0)

    # get_video_info returns an urlencoded blob of video metadata.
    video_info = parse.parse_qs(
        get_content(
            'https://www.youtube.com/get_video_info?video_id={}'.format(
                self.vid)))

    ytplayer_config = None
    if 'status' not in video_info:
        log.wtf('[Failed] Unknown status.')
    elif video_info['status'] == ['ok']:
        if 'use_cipher_signature' not in video_info \
                or video_info['use_cipher_signature'] == ['False']:
            self.title = parse.unquote_plus(video_info['title'][0])
            # Parse video page (for DASH)
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            try:
                ytplayer_config = json.loads(
                    re.search('ytplayer.config\s*=\s*([^\n]+?});',
                              video_page).group(1))
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                # Workaround: get_video_info returns bad s. Why?
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')
            except Exception:
                # Page scrape failed: fall back to the get_video_info
                # stream map; no player JS is available in that case.
                stream_list = video_info['url_encoded_fmt_stream_map'][
                    0].split(',')
                self.html5player = None
        else:
            # Ciphered signature: parse video page instead.
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            ytplayer_config = json.loads(
                re.search('ytplayer.config\s*=\s*([^\n]+?});',
                          video_page).group(1))
            self.title = ytplayer_config['args']['title']
            self.html5player = 'https://www.youtube.com{}'.format(
                ytplayer_config['assets']['js'])
            stream_list = ytplayer_config['args'][
                'url_encoded_fmt_stream_map'].split(',')
    elif video_info['status'] == ['fail']:
        if video_info['errorcode'] == ['150']:
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            try:
                ytplayer_config = json.loads(
                    re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer',
                              video_page).group(1))
            except Exception:
                # No player config at all: surface the page's message.
                msg = re.search('class="message">([^<]+)<',
                                video_page).group(1)
                log.wtf('[Failed] "%s"' % msg.strip())
            if 'title' in ytplayer_config['args']:
                # 150 Restricted from playback on certain sites
                # Parse video page instead
                self.title = ytplayer_config['args']['title']
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')
            else:
                log.wtf('[Error] The uploader has not made this video '
                        'available in your country.')
        elif video_info['errorcode'] == ['100']:
            log.wtf('[Failed] This video does not exist.',
                    exit_code=int(video_info['errorcode'][0]))
        else:
            log.wtf('[Failed] %s' % video_info['reason'][0],
                    exit_code=int(video_info['errorcode'][0]))
    else:
        log.wtf('[Failed] Invalid status.')

    # YouTube Live: stream the HLS playlist via ffmpeg and exit.
    if ytplayer_config and (
            ytplayer_config['args'].get('livestream') == '1'
            or ytplayer_config['args'].get('live_playback') == '1'):
        hlsvp = ytplayer_config['args']['hlsvp']
        if 'info_only' in kwargs and kwargs['info_only']:
            return
        else:
            download_url_ffmpeg(hlsvp, self.title, 'mp4')
            exit(0)

    # Index progressive streams by itag.
    for stream in stream_list:
        metadata = parse.parse_qs(stream)
        stream_itag = metadata['itag'][0]
        self.streams[stream_itag] = {
            'itag': metadata['itag'][0],
            'url': metadata['url'][0],
            'sig': metadata['sig'][0] if 'sig' in metadata else None,
            's': metadata['s'][0] if 's' in metadata else None,
            'quality': metadata['quality'][0],
            'type': metadata['type'][0],
            'mime': metadata['type'][0].split(';')[0],
            'container': mime_to_container(
                metadata['type'][0].split(';')[0]),
        }

    # Prepare caption tracks: convert each timed-text XML track to SRT.
    try:
        caption_tracks = json.loads(
            ytplayer_config['args']['player_response']
        )['captions']['playerCaptionsTracklistRenderer']['captionTracks']
        for ct in caption_tracks:
            ttsurl, lang = ct['baseUrl'], ct['languageCode']
            tts_xml = parseString(get_content(ttsurl))
            transcript = tts_xml.getElementsByTagName('transcript')[0]
            texts = transcript.getElementsByTagName('text')
            srt = ''
            seq = 0
            for text in texts:
                if text.firstChild is None:
                    continue  # empty element
                seq += 1
                start = float(text.getAttribute('start'))
                if text.getAttribute('dur'):
                    dur = float(text.getAttribute('dur'))
                else:
                    dur = 1.0  # could be ill-formed XML
                finish = start + dur
                # Render start/finish as SRT timestamps (HH:MM:SS,mmm).
                m, s = divmod(start, 60)
                h, m = divmod(m, 60)
                start = '{:0>2}:{:0>2}:{:06.3f}'.format(
                    int(h), int(m), s).replace('.', ',')
                m, s = divmod(finish, 60)
                h, m = divmod(m, 60)
                finish = '{:0>2}:{:0>2}:{:06.3f}'.format(
                    int(h), int(m), s).replace('.', ',')
                content = unescape(text.firstChild.nodeValue)
                srt += '{}\n'.format(str(seq))
                srt += '{} --> {}\n'.format(start, finish)
                srt += '{}\n\n'.format(content)
            self.caption_tracks[lang] = srt
    except Exception:
        # Captions are best-effort; any failure leaves them unset.
        pass

    # Prepare DASH streams from the MPD manifest: each video
    # representation is paired with the matching audio track's URL/size.
    try:
        dashmpd = ytplayer_config['args']['dashmpd']
        dash_xml = parseString(get_content(dashmpd))
        for aset in dash_xml.getElementsByTagName('AdaptationSet'):
            mimeType = aset.getAttribute('mimeType')
            if mimeType == 'audio/mp4':
                rep = aset.getElementsByTagName('Representation')[-1]
                burls = rep.getElementsByTagName('BaseURL')
                dash_mp4_a_url = burls[0].firstChild.nodeValue
                dash_mp4_a_size = burls[0].getAttribute('yt:contentLength')
                if not dash_mp4_a_size:
                    try:
                        dash_mp4_a_size = url_size(dash_mp4_a_url)
                    except Exception:
                        continue
            elif mimeType == 'audio/webm':
                rep = aset.getElementsByTagName('Representation')[-1]
                burls = rep.getElementsByTagName('BaseURL')
                dash_webm_a_url = burls[0].firstChild.nodeValue
                dash_webm_a_size = burls[0].getAttribute(
                    'yt:contentLength')
                if not dash_webm_a_size:
                    try:
                        dash_webm_a_size = url_size(dash_webm_a_url)
                    except Exception:
                        continue
            elif mimeType == 'video/mp4':
                for rep in aset.getElementsByTagName('Representation'):
                    w = int(rep.getAttribute('width'))
                    h = int(rep.getAttribute('height'))
                    itag = rep.getAttribute('id')
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_url = burls[0].firstChild.nodeValue
                    dash_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_size:
                        try:
                            dash_size = url_size(dash_url)
                        except Exception:
                            continue
                    self.dash_streams[itag] = {
                        'quality': '{}x{}'.format(w, h),
                        'itag': itag,
                        'type': mimeType,
                        'mime': mimeType,
                        'container': 'mp4',
                        'src': [dash_url, dash_mp4_a_url],
                        'size': int(dash_size) + int(dash_mp4_a_size)
                    }
            elif mimeType == 'video/webm':
                for rep in aset.getElementsByTagName('Representation'):
                    w = int(rep.getAttribute('width'))
                    h = int(rep.getAttribute('height'))
                    itag = rep.getAttribute('id')
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_url = burls[0].firstChild.nodeValue
                    dash_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_size:
                        try:
                            dash_size = url_size(dash_url)
                        except Exception:
                            continue
                    self.dash_streams[itag] = {
                        'quality': '%sx%s' % (w, h),
                        'itag': itag,
                        'type': mimeType,
                        'mime': mimeType,
                        'container': 'webm',
                        'src': [dash_url, dash_webm_a_url],
                        'size': int(dash_size) + int(dash_webm_a_size)
                    }
    except Exception:
        # VEVO
        if not self.html5player:
            return
        self.js = get_content(self.html5player)
        if 'adaptive_fmts' in ytplayer_config['args']:
            # Each adaptive format is an urlencoded k=v '&' list.
            streams = [
                dict([(i.split('=')[0], parse.unquote(i.split('=')[1]))
                      for i in afmt.split('&')])
                for afmt in
                ytplayer_config['args']['adaptive_fmts'].split(',')
            ]
            for stream in streams:  # get over speed limiting
                stream['url'] += '&ratebypass=yes'
            for stream in streams:  # audio
                if stream['type'].startswith('audio/mp4'):
                    dash_mp4_a_url = stream['url']
                    if 's' in stream:
                        sig = self.__class__.decipher(self.js, stream['s'])
                        dash_mp4_a_url += '&signature={}'.format(sig)
                    dash_mp4_a_size = stream['clen']
                elif stream['type'].startswith('audio/webm'):
                    dash_webm_a_url = stream['url']
                    if 's' in stream:
                        sig = self.__class__.decipher(self.js, stream['s'])
                        dash_webm_a_url += '&signature={}'.format(sig)
                    dash_webm_a_size = stream['clen']
            for stream in streams:  # video
                if 'size' in stream:
                    if stream['type'].startswith('video/mp4'):
                        mimeType = 'video/mp4'
                        dash_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(
                                self.js, stream['s'])
                            dash_url += '&signature={}'.format(sig)
                        dash_size = stream['clen']
                        itag = stream['itag']
                        self.dash_streams[itag] = {
                            'quality': stream['size'],
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'mp4',
                            'src': [dash_url, dash_mp4_a_url],
                            'size': int(dash_size) + int(dash_mp4_a_size)
                        }
                    elif stream['type'].startswith('video/webm'):
                        mimeType = 'video/webm'
                        dash_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(
                                self.js, stream['s'])
                            dash_url += '&signature={}'.format(sig)
                        dash_size = stream['clen']
                        itag = stream['itag']
                        self.dash_streams[itag] = {
                            'quality': stream['size'],
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'webm',
                            'src': [dash_url, dash_webm_a_url],
                            'size': int(dash_size) + int(dash_webm_a_size)
                        }
def download(self, **kwargs):
    """Display or download the streams prepared by self.prepare().

    Dispatch on kwargs: 'json_output' dumps machine-readable info;
    'info_only' prints stream details; otherwise the selected (or
    best-quality) stream is downloaded.  Caption tracks and danmuku are
    then written unless 'caption' is absent or falsy, and the extractor
    state is reset unless 'keep_obj' is truthy.
    """
    if 'json_output' in kwargs and kwargs['json_output']:
        json_output.output(self)
    elif 'info_only' in kwargs and kwargs['info_only']:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Display the stream
            stream_id = kwargs['stream_id']
            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)
        else:
            # Display all available streams
            if 'index' not in kwargs:
                self.p([])
            else:
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']
                self.p_i(stream_id)
    else:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Download the stream
            stream_id = kwargs['stream_id']
        else:
            # Download stream with the best quality
            stream_id = self.streams_sorted[0]['id'] \
                if 'id' in self.streams_sorted[0] \
                else self.streams_sorted[0]['itag']
        if 'index' not in kwargs:
            self.p(stream_id)
        else:
            self.p_i(stream_id)

        # Look up the source URLs: regular streams first, then DASH.
        if stream_id in self.streams:
            urls = self.streams[stream_id]['src']
            ext = self.streams[stream_id]['container']
            total_size = self.streams[stream_id]['size']
        else:
            urls = self.dash_streams[stream_id]['src']
            ext = self.dash_streams[stream_id]['container']
            total_size = self.dash_streams[stream_id]['size']
        if not urls:
            log.wtf('[Failed] Cannot extract video source.')

        if ext == 'm3u8':
            ffmpeg_kwargs = {}
            if 'iqiyi' in self.name:
                # ffmpeg_kwargs['override'] = True
                # ffmpeg_kwargs['params'] = {
                #     '-c:a': 'copy', '-bsf:a': 'aac_adtstoasc'
                # }
                m3u8_urls = general_m3u8_extractor(urls[0])
                # FIXME(iawia002): computing the total size would take
                # too much time
                if len(m3u8_urls) <= 100:
                    size = urls_size(m3u8_urls)
                else:
                    size = float('inf')
                download_urls(m3u8_urls, self.title, 'mp4', size,
                              **kwargs)
            else:
                download_url_ffmpeg(urls[0], self.title, 'mp4',
                                    output_dir=kwargs['output_dir'],
                                    merge=kwargs['merge'],
                                    stream=False, **ffmpeg_kwargs)
        else:
            # Forward UA/Referer overrides to the downloader.
            headers = copy(config.FAKE_HEADERS)
            if self.ua is not None:
                headers['User-Agent'] = self.ua
            if self.referer is not None:
                headers['Referer'] = self.referer
            download_urls(urls, self.title, ext, total_size,
                          headers=headers,
                          output_dir=kwargs['output_dir'],
                          merge=kwargs['merge'],
                          av=stream_id in self.dash_streams)

    if 'caption' not in kwargs or not kwargs['caption']:
        print('Skipping captions or danmuku.')
        return

    # Save each caption track as a UTF-8 .srt file.
    for lang in self.caption_tracks:
        filename = '%s.%s.srt' % (get_filename(self.title), lang)
        print('Saving %s ... ' % filename, end="", flush=True)
        srt = self.caption_tracks[lang]
        with open(os.path.join(kwargs['output_dir'], filename),
                  'w', encoding='utf-8') as x:
            x.write(srt)
        print('Done.')

    # Save danmuku (comment overlay) XML when present and not a dry run.
    if self.danmuku is not None and not dry_run:
        filename = '{}.cmt.xml'.format(get_filename(self.title))
        print('Downloading {} ...\n'.format(filename))
        with open(os.path.join(kwargs['output_dir'], filename), 'w',
                  encoding='utf8') as fp:
            fp.write(self.danmuku)

    # Reset extractor state unless the caller asks to keep it.
    keep_obj = kwargs.get('keep_obj', False)
    if not keep_obj:
        self.__init__()
def download(self, **kwargs):
    """Override the original one
    Ugly ugly dirty hack

    Same stream-selection front matter as the base implementation, but
    the chosen source is always handed to ffmpeg, so container/size
    bookkeeping is skipped.
    """
    if 'json_output' in kwargs and kwargs['json_output']:
        json_output.output(self)
    elif 'info_only' in kwargs and kwargs['info_only']:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Display the stream
            stream_id = kwargs['stream_id']
            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)
        else:
            # Display all available streams
            if 'index' not in kwargs:
                self.p([])
            else:
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']
                self.p_i(stream_id)
    else:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Download the stream
            stream_id = kwargs['stream_id']
        else:
            # Download stream with the best quality
            stream_id = self.streams_sorted[0]['id'] \
                if 'id' in self.streams_sorted[0] \
                else self.streams_sorted[0]['itag']
        if 'index' not in kwargs:
            self.p(stream_id)
        else:
            self.p_i(stream_id)

        if stream_id in self.streams:
            urls = self.streams[stream_id]['src']
            # ext = self.streams[stream_id]['container']
            # total_size = self.streams[stream_id]['size']
        else:
            urls = self.dash_streams[stream_id]['src']
            # ext = self.dash_streams[stream_id]['container']
            # total_size = self.dash_streams[stream_id]['size']
        if not urls:
            log.wtf('[Failed] Cannot extract video source.')

        # Here's the change!
        download_url_ffmpeg(urls[0], self.title, 'mp4',
                            output_dir=kwargs['output_dir'],
                            merge=kwargs['merge'],
                            stream=False)

    # BUG FIX: kwargs['caption'] raised KeyError when the caller omitted
    # the flag; use the same defensive membership test as the base
    # extractor's download().
    if 'caption' not in kwargs or not kwargs['caption']:
        print('Skipping captions.')
        return

    # Save each caption track as a UTF-8 .srt file.
    for lang in self.caption_tracks:
        filename = '{}.{}.srt'.format(get_filename(self.title), lang)
        print('Saving {} ... '.format(filename), end='', flush=True)
        srt = self.caption_tracks[lang]
        with open(os.path.join(kwargs['output_dir'], filename),
                  'w', encoding='utf-8') as x:
            x.write(srt)
        print('Done.')