def google_search(url):
    """Search Google Videos for the query embedded in *url*.

    The query is whatever follows the scheme in *url*.  Every hit is
    printed (title, duration if detected, and a ready-to-run lulu command
    line); the URL of the first hit is returned.
    """
    query = r1(r'https?://(.*)', url)
    search_url = (
        'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(query)
    )
    html = get_content(search_url)

    # Each hit is an (url, title) pair; durations come from a sibling span
    # and are matched up positionally (zip truncates to the shorter list).
    hits = re.findall(
        r'<a href="(https?://[^"]+)" onmousedown="[^"]+">([^<]+)<', html)
    raw_durations = re.findall(r'<span class="vdur _dwc">([^<]+)<', html)
    durations = [
        r1(r'(\d+:\d+)', unescape_html(raw)) for raw in raw_durations
    ]

    print('Google Videos search:')
    for (hit_url, hit_title), duration in zip(hits, durations):
        print('- video: {} [{}]'.format(
            unescape_html(hit_title),
            duration if duration else '?'))
        print('# lulu %s' % log.sprint(hit_url, log.UNDERLINE))
        print()
    print('Best matched result:')
    return hits[0][0]
def mtv81_download(url, output_dir='.', merge=True, info_only=False,
                   **kwargs):
    """Download a video from mtv81.com through its RTMP media gateway."""
    html = get_content(url)
    # The page title is pipe-separated; drop the trailing site/branding
    # segments and keep the rest as the video title.
    title = unescape_html('|'.join(
        match1(html, r'<title>(.*?)</title>').split('|')[:-2]))

    # e.g. mgid%3Auma%3Avideo%3Amtv81.com%3A897974
    vid = match1(html, r'getTheVideo\("(.*?)"')
    media_xml = parseString(
        get_content(
            'http://intl.esperanto.mtvi.com/www/xml/media/mediaGen.jhtml?'
            'uri={}&flashPlayer=LNX%2013,0,0,206&geo=CN&sid=123456'.format(
                vid)))

    # Every <src> node is a stream variant; the bitrate is embedded in the
    # filename as "_<kbps>_", so sort by it and take the highest.
    sources = [node.firstChild.nodeValue
               for node in media_xml.getElementsByTagName('src')]
    url = sorted(sources, key=lambda src: int(match1(src, r'_(\d+?)_')))[-1]

    media_type, ext, size = 'mp4', 'mp4', 0
    print_info(site_info, title, media_type, size)
    # Example manual invocation:
    # rtmpdump -r 'rtmpe://cp30865.edgefcs.net/ondemand/...mp4' -o "title.mp4"
    #   --swfVfy http://media.mtvnservices.com/player/prime/mediaplayerprime.1.10.8.swf  # noqa
    # rtmpdump is unstable, so the download may need several attempts.
    if not info_only:
        download_rtmp_url(
            url=url, title=title, ext=ext,
            params={
                '--swfVfy': (
                    'http://media.mtvnservices.com/player/prime/mediaplayer'
                    'prime.1.10.8.swf'
                )
            },
            output_dir=output_dir)
def theplatform_download_by_pid(
    pid, title, output_dir='.', merge=True, info_only=False, **kwargs
):
    """Download a video from thePlatform by its pid.

    Fetches the SMIL manifest, picks the best available height from a
    fixed preference list, and hands the RTMP base + stream path to
    ``download_rtmp_url``.
    """
    smil_url = (
        'http://link.theplatform.com/s/dJ5BDC/{}/meta.smil?format=smil'
        '&mbr=true'.format(pid)
    )
    smil = get_content(smil_url)
    smil_base = unescape_html(match1(smil, r'<meta base="([^"]+)"'))
    # Invert the (src, height) pairs into height -> src; building the dict
    # first keeps the last src seen for each height.
    smil_videos = {
        y: x for x, y in dict(
            re.findall(r'<video src="([^"]+)".+height="([^"]+)"', smil)
        ).items()
    }
    # Bug fix: initialize before the loop — previously, if no listed height
    # was present, the assert below raised NameError instead of failing.
    smil_video = None
    for height in ['1080', '720', '480', '360', '240', '216']:
        if height in smil_videos:
            smil_video = smil_videos[height]
            break
    assert smil_video, 'no playable video found in SMIL manifest'

    _type, ext, size = 'mp4', 'mp4', 0
    print_info(site_info, title, _type, size)
    if not info_only:
        download_rtmp_url(
            url=smil_base, title=title, ext=ext,
            params={"-y": '{}:{}'.format(ext, smil_video)},
            output_dir=output_dir
        )
def ifeng_download_by_id(_id, title=None, info_only=False, **kwargs):
    """Download an ifeng.com video by its UUID-style id.

    _id: a UUID-like hex id (validated below).
    title: optional caller-supplied title; if None, the title is taken
        from the video-info XML.
    """
    assert match1(
        _id,
        r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'
    ), _id
    url = 'http://vxml.ifengimg.com/video_info_new/{}/{}/{}.xml'.format(
        _id[-2], _id[-2:], _id)
    xml = get_content(url)
    # Bug fix: the title parameter used to be unconditionally overwritten,
    # making it dead; now a caller-supplied title is honored.
    if title is None:
        title = unescape_html(match1(xml, r'Name="([^"]+)"'))
    url = match1(xml, r'VideoPlayUrl="([^"]+)"')
    # Reroute through the ips.ifeng.com front so the file is reachable.
    url = url.replace(
        'http://wideo.ifeng.com/', 'http://ips.ifeng.com/wideo.ifeng.com/')
    _, ext, size = url_info(url)
    print_info(site_info, title, ext, size)
    if not info_only:
        download_urls([url], title, ext, size, **kwargs)
def universal_download(url, output_dir='.', merge=True, info_only=False,
                       **kwargs):
    """Best-effort downloader for an arbitrary URL.

    HTML pages are first handed to ``embed_download``; failing that, the
    page is scraped for HLS playlists, common media-file URLs, image
    links, and MPEG-DASH manifests.  Non-HTML URLs are downloaded
    directly.
    """
    try:
        content_type = get_head(url, headers=FAKE_HEADERS)['Content-Type']
    except Exception:
        # Some servers reject HEAD; retry with a GET-based probe.
        content_type = get_head(
            url, headers=FAKE_HEADERS, get_method='GET')['Content-Type']
    if content_type.startswith('text/html'):
        # Try dedicated embedded-player extractors first (best effort).
        try:
            embed_download(url, output_dir=output_dir, merge=merge,
                           info_only=info_only, **kwargs)
        except Exception:
            pass
        else:
            return

    # Derive a short site label from the host name (drop one subdomain).
    domains = url.split('/')[2].split('.')
    if len(domains) > 2:
        domains = domains[1:]
    site_info = '.'.join(domains)

    if content_type.startswith('text/html'):
        # extract an HTML page
        response = get_content(url)
        page = str(response)

        page_title = match1(page, r'<title>([^<]*)')
        if page_title:
            page_title = unescape_html(page_title)

        # Any HLS (.m3u8 / .m3u) playlist wins immediately.
        hls_urls = re.findall(
            r'(https?://[^;"\'\\]+' + r'\.m3u8?' + r'[^;"\'\\]*)', page)
        if hls_urls:
            for hls_url in hls_urls:
                type_, ext, size = url_info(hls_url)
                print_info(site_info, page_title, type_, size)
                if not info_only:
                    download_url_ffmpeg(
                        url=hls_url, title=page_title, ext='mp4',
                        output_dir=output_dir)
            return

        # most common media file extensions on the Internet
        # (raw strings: '\.' in a non-raw literal is an invalid escape)
        media_exts = [
            r'\.flv', r'\.mp3', r'\.mp4', r'\.webm',
            r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g',  # tumblr
            r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g',
            r'[-_][6-9]\d\dx1\d\d\d\.jpe?g',
            r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g',
            r's1600/[\w%]+\.jpe?g',  # blogger
            r'img[6-9]\d\d/[\w%]+\.jpe?g',  # oricon?
        ]

        urls = []
        for i in media_exts:
            # Plain URLs.
            urls += re.findall(
                r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)

            # Percent-encoded URLs (https%3A%2F%2F...).
            p_urls = re.findall(
                r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
            urls += [parse.unquote(url) for url in p_urls]

            # JSON-escaped URLs (https:\\/\\/...).
            q_urls = re.findall(
                r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
            urls += [url.replace('\\\\/', '/') for url in q_urls]

        # a link href to an image is often an interesting one
        urls += re.findall(r'href="(https?://[^"]+\.jpe?g)"', page, re.I)
        urls += re.findall(r'href="(https?://[^"]+\.png)"', page, re.I)
        urls += re.findall(r'href="(https?://[^"]+\.gif)"', page, re.I)

        # MPEG-DASH MPD manifests: resolve their <BaseURL> against the
        # manifest's own directory.
        mpd_urls = re.findall(r'src="(https?://[^"]+\.mpd)"', page)
        for mpd_url in mpd_urls:
            cont = get_content(mpd_url)
            base_url = match1(cont, r'<BaseURL>(.*)</BaseURL>')
            urls += [match1(mpd_url, r'(.*/)[^/]*') + base_url]

        # have some candy!
        candies = []
        i = 1
        for url in set(urls):
            filename = parse.unquote(url.split('/')[-1])
            if 5 <= len(filename) <= 80:
                title = '.'.join(filename.split('.')[:-1])
            else:
                title = '{}'.format(i)
                i += 1
            candies.append({'url': url, 'title': title})

        for candy in candies:
            try:
                mime, ext, size = url_info(candy['url'], faker=True)
                if not size:
                    # Bug fix: was float('Int'), whose ValueError was
                    # swallowed by the except below, silently skipping
                    # every candidate with an unknown size.
                    size = float('Inf')
            except Exception:
                continue
            else:
                print_info(site_info, candy['title'], ext, size)
                if not info_only:
                    download_urls(
                        [candy['url']], candy['title'], ext, size,
                        output_dir=output_dir, merge=merge, faker=True)
        return
    else:
        # direct download
        filename = parse.unquote(url.split('/')[-1])
        title = '.'.join(filename.split('.')[:-1])
        ext = filename.split('.')[-1]
        _, _, size = url_info(url, faker=True)
        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls(
                [url], title, ext, size,
                output_dir=output_dir, merge=merge, faker=True)
        return
def google_download(url, info_only=False, **kwargs):
    """Download media from Google services.

    Handles two URL families, dispatched on the subdomain:
      * ``plus``        — Google+ posts: scrape images (and, when an album
                          link is found, a video) from the post page.
      * ``docs``/``drive`` — Google Docs/Drive files via the
                          uc?export=download endpoint.
    """
    # Percent-encoding Unicode URL
    url = parse.quote(url, safe=':/+%?=')
    service = url.split('/')[2].split('.')[0]

    if service == 'plus':  # Google Plus
        # attempt to extract images first
        # TBD: posts with > 4 images
        # TBD: album links
        html = get_content(parse.unquote(url))
        real_urls = []
        for src in re.findall(r'src="([^"]+)"[^>]*itemprop="image"', html):
            t = src.split('/')
            # Force an https scheme and swap the size segment for 's0-d'
            # (presumably the full-resolution variant — TODO confirm).
            t[0], t[-2] = t[0] or 'https:', 's0-d'
            u = '/'.join(t)
            real_urls.append(u)
        if not real_urls:
            # Fall back to the OpenGraph preview image.
            real_urls = [
                match1(html, r'<meta property="og:image" content="([^"]+)')
            ]
        # Strip the wNNN-hNNN-p size constraint to get the original size.
        real_urls = [re.sub(r'w\d+-h\d+-p', 's0', u) for u in real_urls]
        post_date = match1(html, r'"?(20\d\d[-/]?[01]\d[-/]?[0123]\d)"?')
        post_id = match1(html, r'/posts/([^"]+)')
        title = '{}_{}'.format(post_date, post_id)

        # Best-effort: if the post links an album, it may hold a video.
        try:
            url = 'https://plus.google.com/{}'.format(
                match1(html, r'(photos/\d+/albums/\d+/\d+)\?authkey'))
            html = get_content(url)
            temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html)
            # fmt_level (module-level table) ranks format ids; keep only
            # entries at the best-ranked format.
            temp = sorted(temp, key=lambda x: fmt_level[x[0]])
            urls = [unicodize(i[1]) for i in temp if i[0] == temp[0][0]]
            assert urls
            real_urls = urls  # Look ma, there's really a video!

            # Prefer a vanity (+username) post URL for the title lookup.
            post_url = match1(
                html, r'"(https://plus.google.com/[^/]+/posts/[^"]*)"')
            post_author = match1(post_url, r'/\+([^/]+)/posts')
            if post_author:
                post_url = 'https://plus.google.com/+{}/posts/{}'.format(
                    parse.quote(post_author),
                    match1(post_url, r'posts/(.+)'))
            post_html = get_content(post_url)
            title = match1(post_html, r'<title[^>]*>([^<\n]+)')
        except Exception:
            # No album/video found — keep the image URLs gathered above.
            pass

        for i, real_url in enumerate(real_urls):
            # Suffix "[i]" only when there are multiple files.
            title_i = '{}[{}]'.format(title, i) if len(real_urls) > 1 \
                else title
            _, ext, size = url_info(real_url)
            if ext is None:
                ext = 'mp4'
            print_info(site_info, title_i, ext, size)
            if not info_only:
                download_urls([real_url], title_i, ext, size, **kwargs)

    elif service in ['docs', 'drive']:  # Google Docs
        html = get_content(url)
        title = match1(html, r'"title":"([^"]*)"') or match1(
            html, r'<meta itemprop="name" content="([^"]*)"')
        # Drop the file extension from the displayed title, if any.
        if len(title.split('.')) > 1:
            title = ".".join(title.split('.')[:-1])
        docid = match1(url, '/file/d/([^/]+)')
        # Cookie support is needed for the antivirus-confirm flow below.
        request.install_opener(
            request.build_opener(request.HTTPCookieProcessor()))
        real_url = (
            'https://docs.google.com/uc?export=download&confirm=no_antivirus&'
            'id={}'.format(docid))
        redirected_url = get_location(real_url)
        if real_url != redirected_url:
            # tiny file - get real url here
            _, ext, size = url_info(redirected_url)
            real_url = redirected_url
        else:
            # huge file - the real_url is a confirm page and real url is in it
            confirm_page = get_content(real_url)
            hrefs = re.findall(r'href="(.+?)"', confirm_page)
            # Keep the last matching confirm link on the page.
            for u in hrefs:
                if u.startswith('/uc?export=download'):
                    rel = unescape_html(u)
            confirm_url = 'https://docs.google.com' + rel
            real_url = get_location(confirm_url)
            _, ext, size = url_info(real_url)
        if size is None:
            size = 0
        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls([real_url], title, ext, size, **kwargs)
def print_info(site_info, title, type, size, **kwargs):
    """Print media information (site, title, type, size) to stdout.

    When the module-level ``json_output`` flag is set, delegates to the
    JSON reporter instead.  ``type`` may be a bare file extension or a
    MIME type; extensions are normalized to MIME types first.
    ``kwargs`` may carry ``m3u8_type`` ('master' or other) and
    ``m3u8_url`` for HLS playlists.
    """
    if json_output:
        json_output_.print_info(
            site_info=site_info, title=title, type=type, size=size)
        return
    if type:
        type = type.lower()

    # Normalize bare file extensions to MIME types (data-driven instead
    # of the former 14-branch elif chain; same mappings).
    ext_to_mime = {
        '3gp': 'video/3gpp',
        'asf': 'video/x-ms-asf',
        'wmv': 'video/x-ms-asf',
        'flv': 'video/x-flv',
        'f4v': 'video/x-flv',
        'mkv': 'video/x-matroska',
        'mp3': 'audio/mpeg',
        'mp4': 'video/mp4',
        'mov': 'video/quicktime',
        'ts': 'video/MP2T',
        'webm': 'video/webm',
        'jpg': 'image/jpeg',
        'png': 'image/png',
        'gif': 'image/gif',
    }
    type = ext_to_mime.get(type, type)

    # Human-readable description for each recognized MIME type.
    mime_to_info = {
        'video/3gpp': '3GPP multimedia file',
        'video/x-flv': 'Flash video',
        'video/f4v': 'Flash video',
        'video/mp4': 'MPEG-4 video',
        'video/x-m4v': 'MPEG-4 video',
        'video/MP2T': 'MPEG-2 transport stream',
        'video/webm': 'WebM video',
        'video/quicktime': 'QuickTime video',
        'video/x-matroska': 'Matroska video',
        'video/x-ms-asf': 'Advanced Systems Format',
        'audio/mp4': 'MPEG-4 audio',
        'audio/m4a': 'MPEG-4 audio',
        'audio/mpeg': 'MP3',
        'audio/wav': 'Waveform Audio File Format',
        'audio/wave': 'Waveform Audio File Format',
        'audio/x-wav': 'Waveform Audio File Format',
        'image/jpeg': 'JPEG Image',
        'image/png': 'Portable Network Graphics',
        'image/gif': 'Graphics Interchange Format',
    }
    if type in mime_to_info:
        type_info = '{} ({})'.format(mime_to_info[type], type)
    elif type in ['m3u8']:
        if kwargs.get('m3u8_type') == 'master':
            type_info = 'M3U8 Master {}'.format(type)
        else:
            # Bug fix: type_info could previously be left unbound for
            # m3u8 input, raising NameError at the print below.
            type_info = 'M3U8 Playlist {}'.format(type)
    else:
        type_info = 'Unknown type (%s)' % type

    maybe_print('Site: ', site_info)
    maybe_print('Title: ', unescape_html(tr(title)))
    print('Type: ', type_info)
    if type != 'm3u8':
        print('Size: ', round(size / 1048576, 2),
              'MiB (' + str(size) + ' Bytes)')
    if type == 'm3u8' and 'm3u8_url' in kwargs:
        print('M3U8 Url: {}'.format(kwargs['m3u8_url']))
    print()
def prepare(self, **kwargs):
    """Resolve a YouTube video id and populate extractor state.

    Fills ``self.title``, ``self.streams`` (plain streams),
    ``self.dash_streams`` (DASH video+audio pairs) and
    ``self.caption_tracks`` (SRT subtitles).  Live streams are handed
    straight to ffmpeg; playlist URLs are delegated to
    ``download_playlist_by_url`` and the process exits.
    """
    assert self.url or self.vid

    if not self.vid and self.url:
        self.vid = self.__class__.get_vid_from_url(self.url)
        # No video id extractable: treat the URL as a playlist.
        if self.vid is None:
            self.download_playlist_by_url(self.url, **kwargs)
            exit(0)

    # get_video_info returns an urlencoded query string.
    video_info = parse.parse_qs(
        get_content(
            'https://www.youtube.com/get_video_info?video_id={}'.format(
                self.vid)))

    ytplayer_config = None
    if 'status' not in video_info:
        log.wtf('[Failed] Unknown status.')
    elif video_info['status'] == ['ok']:
        if 'use_cipher_signature' not in video_info \
                or video_info['use_cipher_signature'] == ['False']:
            self.title = parse.unquote_plus(video_info['title'][0])
            # Parse video page (for DASH)
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            try:
                ytplayer_config = json.loads(
                    re.search('ytplayer.config\s*=\s*([^\n]+?});',
                              video_page).group(1))
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                # Workaround: get_video_info returns bad s. Why?
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')
            except Exception:
                # Player config not found on the page: fall back to the
                # stream map from get_video_info.
                stream_list = video_info['url_encoded_fmt_stream_map'][
                    0].split(',')
                self.html5player = None
        else:
            # Parse video page instead
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            ytplayer_config = json.loads(
                re.search('ytplayer.config\s*=\s*([^\n]+?});',
                          video_page).group(1))
            self.title = ytplayer_config['args']['title']
            self.html5player = 'https://www.youtube.com{}'.format(
                ytplayer_config['assets']['js'])
            stream_list = ytplayer_config['args'][
                'url_encoded_fmt_stream_map'].split(',')
    elif video_info['status'] == ['fail']:
        # errorcode 150: embedding restricted; try the watch page itself.
        if video_info['errorcode'] == ['150']:
            video_page = get_content(
                'https://www.youtube.com/watch?v={}'.format(self.vid))
            try:
                ytplayer_config = json.loads(
                    re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer',
                              video_page).group(1))
            except Exception:
                msg = re.search('class="message">([^<]+)<',
                                video_page).group(1)
                log.wtf('[Failed] "%s"' % msg.strip())

            if 'title' in ytplayer_config['args']:
                # 150 Restricted from playback on certain sites
                # Parse video page instead
                self.title = ytplayer_config['args']['title']
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')
            else:
                log.wtf('[Error] The uploader has not made this video '
                        'available in your country.')
        elif video_info['errorcode'] == ['100']:
            log.wtf('[Failed] This video does not exist.',
                    exit_code=int(video_info['errorcode'][0]))
        else:
            log.wtf('[Failed] %s' % video_info['reason'][0],
                    exit_code=int(video_info['errorcode'][0]))
    else:
        log.wtf('[Failed] Invalid status.')

    # YouTube Live
    if ytplayer_config and (
            ytplayer_config['args'].get('livestream') == '1'
            or ytplayer_config['args'].get('live_playback') == '1'):
        hlsvp = ytplayer_config['args']['hlsvp']
        if 'info_only' in kwargs and kwargs['info_only']:
            return
        else:
            download_url_ffmpeg(hlsvp, self.title, 'mp4')
            exit(0)

    # Each entry of the stream map is itself an urlencoded record.
    for stream in stream_list:
        metadata = parse.parse_qs(stream)
        stream_itag = metadata['itag'][0]
        self.streams[stream_itag] = {
            'itag': metadata['itag'][0],
            'url': metadata['url'][0],
            'sig': metadata['sig'][0] if 'sig' in metadata else None,
            's': metadata['s'][0] if 's' in metadata else None,
            'quality': metadata['quality'][0],
            'type': metadata['type'][0],
            'mime': metadata['type'][0].split(';')[0],
            'container': mime_to_container(
                metadata['type'][0].split(';')[0]),
        }

    # Prepare caption tracks
    try:
        caption_tracks = json.loads(
            ytplayer_config['args']['player_response']
        )['captions']['playerCaptionsTracklistRenderer']['captionTracks']
        for ct in caption_tracks:
            ttsurl, lang = ct['baseUrl'], ct['languageCode']
            tts_xml = parseString(get_content(ttsurl))
            transcript = tts_xml.getElementsByTagName('transcript')[0]
            texts = transcript.getElementsByTagName('text')
            srt = ''
            seq = 0
            for text in texts:
                if text.firstChild is None:
                    continue  # empty element
                seq += 1
                start = float(text.getAttribute('start'))
                if text.getAttribute('dur'):
                    dur = float(text.getAttribute('dur'))
                else:
                    dur = 1.0  # could be ill-formed XML
                finish = start + dur
                # Convert seconds to SRT "HH:MM:SS,mmm" timestamps.
                m, s = divmod(start, 60)
                h, m = divmod(m, 60)
                start = '{:0>2}:{:0>2}:{:06.3f}'.format(
                    int(h), int(m), s).replace('.', ',')
                m, s = divmod(finish, 60)
                h, m = divmod(m, 60)
                finish = '{:0>2}:{:0>2}:{:06.3f}'.format(
                    int(h), int(m), s).replace('.', ',')
                content = unescape_html(text.firstChild.nodeValue)

                srt += '{}\n'.format(str(seq))
                srt += '{} --> {}\n'.format(start, finish)
                srt += '{}\n\n'.format(content)

            self.caption_tracks[lang] = srt
    except Exception:
        # Captions are optional; ignore any failure.
        pass

    # Prepare DASH streams
    try:
        dashmpd = ytplayer_config['args']['dashmpd']
        dash_xml = parseString(get_content(dashmpd))
        for aset in dash_xml.getElementsByTagName('AdaptationSet'):
            mimeType = aset.getAttribute('mimeType')
            if mimeType == 'audio/mp4':
                # Last Representation is assumed best — TODO confirm.
                rep = aset.getElementsByTagName('Representation')[-1]
                burls = rep.getElementsByTagName('BaseURL')
                dash_mp4_a_url = burls[0].firstChild.nodeValue
                dash_mp4_a_size = burls[0].getAttribute('yt:contentLength')
                if not dash_mp4_a_size:
                    try:
                        dash_mp4_a_size = url_size(dash_mp4_a_url)
                    except Exception:
                        continue
            elif mimeType == 'audio/webm':
                rep = aset.getElementsByTagName('Representation')[-1]
                burls = rep.getElementsByTagName('BaseURL')
                dash_webm_a_url = burls[0].firstChild.nodeValue
                dash_webm_a_size = burls[0].getAttribute(
                    'yt:contentLength')
                if not dash_webm_a_size:
                    try:
                        dash_webm_a_size = url_size(dash_webm_a_url)
                    except Exception:
                        continue
            elif mimeType == 'video/mp4':
                for rep in aset.getElementsByTagName('Representation'):
                    w = int(rep.getAttribute('width'))
                    h = int(rep.getAttribute('height'))
                    itag = rep.getAttribute('id')
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_url = burls[0].firstChild.nodeValue
                    dash_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_size:
                        try:
                            dash_size = url_size(dash_url)
                        except Exception:
                            continue
                    # Pair the video stream with the mp4 audio found above.
                    self.dash_streams[itag] = {
                        'quality': '{}x{}'.format(w, h),
                        'itag': itag,
                        'type': mimeType,
                        'mime': mimeType,
                        'container': 'mp4',
                        'src': [dash_url, dash_mp4_a_url],
                        'size': int(dash_size) + int(dash_mp4_a_size)
                    }
            elif mimeType == 'video/webm':
                for rep in aset.getElementsByTagName('Representation'):
                    w = int(rep.getAttribute('width'))
                    h = int(rep.getAttribute('height'))
                    itag = rep.getAttribute('id')
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_url = burls[0].firstChild.nodeValue
                    dash_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_size:
                        try:
                            dash_size = url_size(dash_url)
                        except Exception:
                            continue
                    self.dash_streams[itag] = {
                        'quality': '%sx%s' % (w, h),
                        'itag': itag,
                        'type': mimeType,
                        'mime': mimeType,
                        'container': 'webm',
                        'src': [dash_url, dash_webm_a_url],
                        'size': int(dash_size) + int(dash_webm_a_size)
                    }
    except Exception:
        # VEVO
        # No DASH MPD: fall back to the adaptive_fmts map, deciphering
        # signatures with the html5 player JS when required.
        if not self.html5player:
            return
        self.js = get_content(self.html5player)
        if 'adaptive_fmts' in ytplayer_config['args']:
            streams = [
                dict([(i.split('=')[0],
                       parse.unquote(i.split('=')[1]))
                      for i in afmt.split('&')])
                for afmt in
                ytplayer_config['args']['adaptive_fmts'].split(',')
            ]

            for stream in streams:  # get over speed limiting
                stream['url'] += '&ratebypass=yes'
            for stream in streams:  # audio
                if stream['type'].startswith('audio/mp4'):
                    dash_mp4_a_url = stream['url']
                    if 's' in stream:
                        sig = self.__class__.decipher(
                            self.js, stream['s'])
                        dash_mp4_a_url += '&signature={}'.format(sig)
                    dash_mp4_a_size = stream['clen']
                elif stream['type'].startswith('audio/webm'):
                    dash_webm_a_url = stream['url']
                    if 's' in stream:
                        sig = self.__class__.decipher(
                            self.js, stream['s'])
                        dash_webm_a_url += '&signature={}'.format(sig)
                    dash_webm_a_size = stream['clen']
            for stream in streams:  # video
                if 'size' in stream:
                    if stream['type'].startswith('video/mp4'):
                        mimeType = 'video/mp4'
                        dash_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(
                                self.js, stream['s'])
                            dash_url += '&signature={}'.format(sig)
                        dash_size = stream['clen']
                        itag = stream['itag']
                        self.dash_streams[itag] = {
                            'quality': stream['size'],
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'mp4',
                            'src': [dash_url, dash_mp4_a_url],
                            'size': int(dash_size) + int(dash_mp4_a_size)
                        }
                    elif stream['type'].startswith('video/webm'):
                        mimeType = 'video/webm'
                        dash_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(
                                self.js, stream['s'])
                            dash_url += '&signature={}'.format(sig)
                        dash_size = stream['clen']
                        itag = stream['itag']
                        self.dash_streams[itag] = {
                            'quality': stream['size'],
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'webm',
                            'src': [dash_url, dash_webm_a_url],
                            'size': int(dash_size) + int(dash_webm_a_size)
                        }