def letvcloud_download(url, info_only=False, **kwargs):
    qs = urllib.parse.urlparse(url).query
    vu = match1(qs, r'vu=([\w]+)')
    uu = match1(qs, r'uu=([\w]+)')
    title = 'LETV-{}'.format(vu)
    letvcloud_download_by_vu(
        vu, uu, title=title, info_only=info_only, **kwargs
    )
def get_vid_from_url(url):
    """Extracts video ID from URL.
    """
    return match1(url, r'youtu\.be/([^?/]+)') or \
        match1(url, r'youtube\.com/embed/([^/?]+)') or \
        match1(url, r'youtube\.com/v/([^/?]+)') or \
        match1(url, r'youtube\.com/watch/([^/?]+)') or \
        parse_query_param(url, 'v') or \
        parse_query_param(parse_query_param(url, 'u'), 'v')
def veoh_download(url, info_only=False, **kwargs):
    '''Get item_id'''
    if re.match(r'http://www.veoh.com/watch/\w+', url):
        item_id = match1(url, r'http://www.veoh.com/watch/(\w+)')
    elif re.match(r'http://www.veoh.com/m/watch.php\?v=\.*', url):
        item_id = match1(url, r'http://www.veoh.com/m/watch.php\?v=(\w+)')
    else:
        raise NotImplementedError('Cannot find item ID')
    veoh_download_by_id(item_id, info_only=info_only, **kwargs)
def w56_download(url, info_only=False, **kwargs):
    content = get_content(url)
    sohu_url = match1(content, r'url:\s*"(.+)"')
    if sohu_url:
        sohu_download(sohu_url, info_only=info_only, **kwargs)
        return

    _id = match1(url, r'http://www.56.com/u\d+/v_(\w+).html') or \
        match1(url, r'http://www.56.com/.*vid-(\w+).html')
    w56_download_by_id(_id, info_only=info_only, **kwargs)
def tudou_download(url, **kwargs):
    if 'video.tudou.com' in url:
        vid = match1(url, r'.*?video.tudou.com/v/([\w=]+)')
    else:
        page = get_content(url)
        video_info = json.loads(
            match1(page, r'window.__INITIAL_STATE__=\s*(.+?);</script>'))
        vid = video_info['videoDesc']['detail']['videoid']
    youku_download_by_vid(vid, **kwargs)
def sohu_download(url, info_only=False, **kwargs):
    if re.match(r'http://share.vrs.sohu.com', url):
        vid = match1(url, r'id=(\d+)')
    else:
        html = get_content(url)
        vid = match1(html, r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?')
    assert vid

    if re.match(r'https?://tv.sohu.com/', url):
        info = json.loads(
            get_content(
                'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.format(vid)))
        for qtyp in ['oriVid', 'superVid', 'highVid', 'norVid', 'relativeId']:
            if 'data' in info:
                hqvid = info['data'][qtyp]
            else:
                hqvid = info[qtyp]
            if hqvid != 0 and hqvid != vid:
                info = json.loads(
                    get_content(
                        'http://hot.vrs.sohu.com/vrs_flash.action?'
                        'vid={}'.format(hqvid)))
                if 'allot' not in info:
                    continue
                break
        host = info['allot']
        tvid = info['tvid']
        urls = []
        data = info['data']
        title = data['tvName']
        size = sum(data['clipsBytes'])
        assert len(data['clipsURL']) == len(data['clipsBytes']) \
            == len(data['su'])
        for new, clip, ck in zip(data['su'], data['clipsURL'], data['ck']):
            clipURL = urlparse(clip).path
            urls.append(real_url(host, hqvid, tvid, new, clipURL, ck))
    else:
        info = json.loads(
            get_content(
                'http://my.tv.sohu.com/play/videonew.do?vid={}&referer='
                'http://my.tv.sohu.com'.format(vid)))
        host = info['allot']
        tvid = info['tvid']
        urls = []
        data = info['data']
        title = data['tvName']
        size = sum(map(int, data['clipsBytes']))
        assert len(data['clipsURL']) == len(data['clipsBytes']) \
            == len(data['su'])
        for new, clip, ck in zip(data['su'], data['clipsURL'], data['ck']):
            clipURL = urlparse(clip).path
            urls.append(real_url(host, vid, tvid, new, clipURL, ck))

    print_info(site_info, title, 'mp4', size)
    if not info_only:
        download_urls(urls, title, 'mp4', size, refer=url, **kwargs)
def ku6_download(url, info_only=False, **kwargs):
    page = get_content(url)
    video = match1(page, r'type: "video/mp4", src: "(.+)"').replace(' ', '%20')
    video = parse.quote(video, safe=string.printable)
    title = match1(page, r'document.title = "(.+)"')
    _type, ext, size = url_info(video)
    print_info(site_info, title, _type, size)
    if not info_only:
        download_urls([video], title, ext, size, **kwargs)
def get_vid_from_url(url):
    """Extracts video ID from URL.
    """
    vid = match1(url, r'https?://www.mgtv.com/(?:b|l)/\d+/(\d+).html')
    if not vid:
        vid = match1(url, r'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html')
    if not vid:
        vid = match1(get_content(url), r'vid: (\d+),')
    return vid
def qq_download_by_vid(vid, title, output_dir='.', merge=True,
                       info_only=False):
    info_api = ('http://vv.video.qq.com/getinfo?otype=json&appver=3.2.19.333'
                '&platform=11&defnpayver=1&vid={}'.format(vid))
    info = get_content(info_api)
    video_json = json.loads(match1(info, r'QZOutputJson=(.*)')[:-1])
    fn_pre = video_json['vl']['vi'][0]['lnk']
    title = video_json['vl']['vi'][0]['ti']
    host = video_json['vl']['vi'][0]['ul']['ui'][0]['url']
    streams = video_json['fl']['fi']
    seg_cnt = video_json['vl']['vi'][0]['cl']['fc']
    if seg_cnt == 0:
        seg_cnt = 1

    # best_quality = streams[-1]['name']
    part_format_id = streams[-1]['id']

    part_urls = []
    total_size = 0
    for part in range(1, seg_cnt + 1):
        filename = '{}.p{}.{}.mp4'.format(
            fn_pre, str(part_format_id % 10000), str(part))
        key_api = ('http://vv.video.qq.com/getkey?otype=json&platform=11&'
                   'format={}&vid={}&filename={}&appver=3.2.19.333'.format(
                       part_format_id, vid, filename))
        part_info = get_content(key_api)
        key_json = json.loads(match1(part_info, r'QZOutputJson=(.*)')[:-1])
        if key_json.get('key') is None:
            vkey = video_json['vl']['vi'][0]['fvkey']
            url = '{}{}?vkey={}'.format(
                video_json['vl']['vi'][0]['ul']['ui'][0]['url'],
                fn_pre + '.mp4', vkey)
        else:
            vkey = key_json['key']
            url = '{}{}?vkey={}'.format(host, filename, vkey)
        if not vkey:
            if part == 1:
                log.wtf(key_json['msg'])
            else:
                log.w(key_json['msg'])
                break
        part_urls.append(url)
        _, ext, size = url_info(url)
        total_size += size
    print_info(site_info, title, ext, total_size)
    if not info_only:
        download_urls(part_urls, title, ext, total_size,
                      output_dir=output_dir, merge=merge)
def toutiao_download(url, info_only=False, **kwargs):
    html = get_content(url)
    video_id = match1(html, r"videoid\s*:\s*'([^']+)',\n")
    title = match1(html, r"title: '([^']+)'.replace")
    video_file_list = get_file_by_vid(video_id)  # call the API to get the source video files
    _type, ext, size = url_info(video_file_list[0].url)
    print_info(site_info=site_info, title=title, type=_type, size=size)
    if not info_only:
        download_urls([video_file_list[0].url], title, ext, size, **kwargs)
def prepare(self, **kwargs):
    if socket.getdefaulttimeout() == 600:  # no timeout specified
        socket.setdefaulttimeout(2)  # fail fast, very speedy!

    # handle 'watchlater' URLs
    if '/watchlater/' in self.url:
        aid = match1(self.url, r'av(\d+)')
        self.url = 'https://www.bilibili.com/video/av{}/'.format(aid)

    self.ua = FAKE_HEADERS['User-Agent']
    if 'bangumi' not in self.url:
        # bangumi redirect will miss the fragment argument here
        # http://bangumi.bilibili.com/anime/21542/play#173286 ->
        # https://www.bilibili.com/bangumi/play/ss21542
        # It should be https://www.bilibili.com/bangumi/play/ss21542#173286
        self.url = url_locations([self.url])[0]

    frag = urllib.parse.urlparse(self.url).fragment
    # http://www.bilibili.com/video/av3141144/index_2.html#page=3
    if frag:
        page = match1(frag, r'page=(\d+)')
        if page:
            aid = match1(self.url, r'av(\d+)')
            self.url = (
                'https://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, page))

    # handle bangumi urls like these
    # http://bangumi.bilibili.com/anime/21542/play#173286
    # https://www.bilibili.com/bangumi/play/ss21542#173286
    # https://www.bilibili.com/bangumi/play/ep173286
    bangumi_ep_id = match1(self.url, r'/anime/\d+/play#(\d+)') or \
        match1(self.url, r'/bangumi/play/ss\d+#(\d+)')
    if bangumi_ep_id:
        self.url = 'https://www.bilibili.com/bangumi/play/ep{}'.format(
            bangumi_ep_id)

    self.referer = self.url
    self.page = get_content(self.url)
    self.parser = get_parser(self.page)
    if self.parser.h1:
        self.title = self.parser.h1.text.strip()
    else:
        # Some movie pages have no h1 tag
        self.title = self.parser.find('meta', property='og:title')['content']
    if 'subtitle' in kwargs:
        subtitle = kwargs['subtitle']
        self.title = '{} {}'.format(self.title, subtitle)

    if 'live.bilibili.com' in self.url:
        self.live_entry(**kwargs)
    elif 'vc.bilibili.com' in self.url:
        self.vc_entry(**kwargs)
    else:
        # bangumi, movie use this entry too
        self.entry(**kwargs)
def icourses_download(url, **kwargs):
    page = get_content(url)
    data = json.loads(match1(page, r'var _sourceArrStr = (.+?);'))
    if 'resId' in url:
        # download the video specified in the playlist
        _id = match1(url, r'resId=([\w-]+)')
        results = list(filter(lambda x: x['id'] == _id, data))
        _download(results[0], **kwargs)
    else:
        # download the first one
        _download(data[0], **kwargs)
def vimeo_download(url, info_only=False, **kwargs):
    if re.match(r'https?://vimeo.com/channels/\w+', url):
        vimeo_download_by_channel(url, info_only, **kwargs)
    else:
        _id = match1(url, r'https?://[\w.]*vimeo.com[/\w]*/(\d+)')
        if _id is None:
            video_page = get_content(url)
            _id = match1(video_page, r'"clip_id":(\d+)')
        assert _id
        vimeo_download_by_id(_id, None, info_only=info_only, **kwargs)
def instagram_download(url, output_dir='.', merge=True, info_only=False,
                       **kwargs):
    url = match1(url, r'([^?]*)')
    html = get_content(url)

    vid = match1(url, r'instagram.com/p/([^/]+)')
    parser = get_parser(html)
    description = parser.find('meta', property='og:title')['content']
    title = '{} [{}]'.format(description, vid)
    stream = parser.find('meta', property='og:video')
    if stream:
        stream = stream['content']
        _, ext, size = url_info(stream)

        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls([stream], title, ext, size, output_dir, merge=merge)
    else:
        data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', html)
        info = json.loads(data.group(1))

        if 'edge_sidecar_to_children' in info['entry_data']['PostPage'][0][
                'graphql']['shortcode_media']:
            edges = info['entry_data']['PostPage'][0]['graphql'][
                'shortcode_media']['edge_sidecar_to_children']['edges']
            for edge in edges:
                title = edge['node']['shortcode']
                image_url = edge['node']['display_url']
                ext = image_url.split('.')[-1]
                size = url_size(image_url)
                print_info(site_info, title, ext, size)
                if not info_only:
                    download_urls(urls=[image_url], title=title, ext=ext,
                                  total_size=size, output_dir=output_dir)
        else:
            title = info['entry_data']['PostPage'][0]['graphql'][
                'shortcode_media']['shortcode']
            image_url = info['entry_data']['PostPage'][0]['graphql'][
                'shortcode_media']['display_url']
            ext = image_url.split('.')[-1]
            size = url_size(image_url)
            print_info(site_info, title, ext, size)
            if not info_only:
                download_urls(urls=[image_url], title=title, ext=ext,
                              total_size=size, output_dir=output_dir)
def prepare(self, **kwargs):
    html = get_content(self.url)
    self.title = match1(html,
                        r'<meta property="og:title" content="([^"]*)"')

    account_number = match1(html, r'data-account="(\d+)"')
    video_id = match1(html, r'data-brightcove-id="(\d+)"')
    # both IDs are required before querying the stream list
    assert account_number and video_id
    link_list = self.get_streams_by_id(account_number, video_id)

    for i in link_list:
        self.stream_types.append({'id': str(i[0])})
        self.streams[i[0]] = {'url': i[1]}
def prepare(self, **kwargs):
    content = get_content(self.url)
    self.title = match1(content, r'setup\[\'title\'\] = "([^"]+)";')
    for s in self.stream_types:
        quality = s['id']
        src = match1(
            content,
            r'src: "([^"]+)", "data-res": "{}"'.format(quality))
        if src is not None:
            url = 'https://en.musicplayon.com{}'.format(src)
            self.streams[quality] = {'url': url}
def get_api_key(page):
    match = match1(page, pattern_inline_api_key)
    # this happens only when the url points to a gallery page
    # that contains no inline api_key (and never makes xhr api calls)
    # in fact this might be a better approach for getting a temporary api key
    # since there's no place for a user to add custom information that may
    # misguide the regex in the homepage
    if not match:
        return match1(
            get_content('https://flickr.com'), pattern_inline_api_key
        )
    return match
def weibo_download_by_fid(fid, info_only=False, **kwargs):
    page_url = 'http://video.weibo.com/show?fid={}&type=mp4'.format(fid)
    mobile_page = get_content(page_url, headers=config.FAKE_HEADERS_MOBILE)
    url = match1(mobile_page, r'<video id=.*?src=[\'"](.*?)[\'"]\W')
    title = match1(mobile_page, r'<title>((.|\n)+?)</title>')
    if not title:
        title = fid
    title = title.replace('\n', '_')
    ext, size = 'mp4', url_size(url)
    print_info(site_info, title, ext, size)
    if not info_only:
        download_urls([url], title, ext, size, **kwargs)
def iwara_download(url, info_only=False, **kwargs):
    video_hash = match1(url, r'http://\w+.iwara.tv/videos/(\w+)')
    video_url = match1(url, r'(http://\w+.iwara.tv)/videos/\w+')
    html = get_content(url, headers=headers)
    title = match1(html, r'<title>(.*)</title>')
    api_url = '{}/api/video/{}'.format(video_url, video_hash)
    content = get_content(api_url, headers=headers)
    data = json.loads(content)
    _type, ext, size = url_info(data[0]['uri'], headers=headers)
    down_urls = data[0]['uri']
    print_info(down_urls, title, _type, size)
    if not info_only:
        download_urls([down_urls], title, ext, size, headers=headers,
                      **kwargs)
def get_video_info(url):
    ep = 'https://vk.com/al_video.php'
    to_post = dict(act='show', al=1, module='direct',
                   video=re.search(r'video(-\d+_\d+)', url).group(1))
    page = post_content(ep, post_data=to_post)
    url = match1(page, r'"url720":"(.+?)",')
    url = url.replace('\\', '')
    title = match1(page, r'<div class="mv_title".+?>(.+?)</div>')
    mime, ext, size = url_info(url)
    print_info(site_info, title, mime, size)
    return url, title, ext, size
def get_single_photo_url(url):
    page = get_content(url)
    pid = get_photo_id(url, page)
    title = match1(page, pattern_inline_title)
    if match1(page, pattern_inline_video_mark):
        api_key = get_api_key(page)
        reply = get_content(
            tmpl_api_call_photo_info(api_key, get_photo_id(url, page))
        )
        secret = json.loads(reply)['photo']['secret']
        return get_orig_video_source(api_key, pid, secret), title
    # last match always has the best resolution
    match = match1(page, pattern_inline_img_url)
    return 'https:{}'.format(match.replace('\\', '')), title
def nanagogo_download(url, info_only=False, **kwargs):
    if re.match(r'https?://stat.7gogo.jp', url):
        universal_download(url, info_only=info_only, **kwargs)
        return

    talk_id = match1(url, r'7gogo.jp/([^/]+)/')
    post_id = match1(url, r'7gogo.jp/[^/]+/(\d+)')
    title = '{}_{}'.format(talk_id, post_id)
    api_url = 'https://api.7gogo.jp/web/v2/talks/{}/posts/{}'.format(
        talk_id, post_id)
    info = json.loads(get_content(api_url))

    items = []
    if info['data']['posts']['post'] is None:
        return
    if info['data']['posts']['post']['body'] is None:
        return
    for i in info['data']['posts']['post']['body']:
        if 'image' in i:
            image_url = i['image']
            if image_url[:2] == '//':
                continue  # skip stamp images
            _, ext, size = url_info(image_url)
            items.append({
                'title': title,
                'url': image_url,
                'ext': ext,
                'size': size
            })
        elif 'movieUrlHq' in i:
            movie_url = i['movieUrlHq']
            _, ext, size = url_info(movie_url)
            items.append({
                'title': title,
                'url': movie_url,
                'ext': ext,
                'size': size
            })

    size = sum([i['size'] for i in items])
    if size == 0:
        return  # do not fail the whole process
    print_info(site_info, title, ext, size)
    if not info_only:
        for i in items:
            print_info(site_info, i['title'], i['ext'], i['size'])
            download_urls([i['url']], i['title'], i['ext'], i['size'],
                          **kwargs)
def metacafe_download(url, info_only=False, **kwargs):
    if re.match(r'http://www.metacafe.com/watch/\w+', url):
        html = get_content(url)
        title = match1(html, r'<meta property="og:title" content="([^"]*)"')

        data = match1(
            html,
            r"<script type='text/json' id='json_video_data'>(.+)</script>"
        )
        data = json.loads(data)
        m3u8_url = data['sources'][0]['src']
        print_info(
            site_info, title, 'm3u8', 0, m3u8_url=m3u8_url, m3u8_type='master'
        )
        if not info_only:
            download_url_ffmpeg(m3u8_url, title, 'mp4', **kwargs)
def facebook_download(
    url, output_dir='.', merge=True, info_only=False, **kwargs
):
    html = get_content(url)

    title = match1(html, r'<title id="pageTitle">(.+)</title>')
    if title is None:
        title = url

    sd_urls = list(set([
        unicodize(str.replace(i, '\\/', '/'))
        for i in re.findall(r'sd_src_no_ratelimit:"([^"]*)"', html)
    ]))
    hd_urls = list(set([
        unicodize(str.replace(i, '\\/', '/'))
        for i in re.findall(r'hd_src_no_ratelimit:"([^"]*)"', html)
    ]))
    urls = hd_urls if hd_urls else sd_urls

    _type, ext, size = url_info(urls[0], True)
    size = urls_size(urls)

    print_info(site_info, title, _type, size)
    if not info_only:
        download_urls(urls, title, ext, size, output_dir, merge=False)
def baomihua_download(url, output_dir='.', merge=True, info_only=False,
                      **kwargs):
    html = get_content(url)
    title = match1(html, r'<title>(.*)</title>')
    assert title
    _id = match1(html, r'flvid\s*=\s*(\d+)')
    assert _id
    baomihua_download_by_id(_id, title, output_dir=output_dir, merge=merge,
                            info_only=info_only, **kwargs)
def theplatform_download_by_pid(
    pid, title, output_dir='.', merge=True, info_only=False, **kwargs
):
    smil_url = (
        'http://link.theplatform.com/s/dJ5BDC/{}/meta.smil?format=smil'
        '&mbr=true'.format(pid)
    )
    smil = get_content(smil_url)
    smil_base = unescape_html(match1(smil, r'<meta base="([^"]+)"'))
    smil_videos = {
        y: x for x, y in dict(
            re.findall(r'<video src="([^"]+)".+height="([^"]+)"', smil)
        ).items()
    }
    for height in ['1080', '720', '480', '360', '240', '216']:
        if height in smil_videos:
            smil_video = smil_videos[height]
            break
    assert smil_video

    _type, ext, size = 'mp4', 'mp4', 0
    print_info(site_info, title, _type, size)
    if not info_only:
        download_rtmp_url(
            url=smil_base, title=title, ext=ext,
            params={"-y": '{}:{}'.format(ext, smil_video)},
            output_dir=output_dir
        )
def _get_video_query_url(resourceID):
    # has to be like this
    headers = FAKE_HEADERS.copy()
    headers.update({
        'DNT': '1',
        'Referer': 'http://v.ucas.ac.cn/',
        'Connection': 'keep-alive',
    })
    conn = http.client.HTTPConnection('210.76.211.10')
    conn.request(
        'GET',
        '/vplus/remote.do?method=query2&loginname=videocas&pwd=af1c7a4c5f77'
        'f790722f7cae474c37e281203765d423a23b&resource=%5B%7B%22resourceID%2'
        '2%3A%22{}%22%2C%22on%22%3A1%2C%22time%22%3A600%2C%22eid%22%3A100%2C'
        '%22w%22%3A800%2C%22h%22%3A600%7D%5D&timeStamp={}'.format(
            resourceID, str(int(time()))
        ),
        headers=headers
    )
    res = conn.getresponse()
    data = res.read()
    info = data.decode("utf-8")
    return match1(info, r'video":"(.+)"')
def prepare(self, **kwargs):
    if self.url and not self.vid:
        if not re.match(r'http://v.pptv.com/show/(\w+)\.html', self.url):
            raise Exception('Unknown url pattern')
        page_content = get_content(self.url)
        self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)')
        if not self.vid:
            raise Exception('Cannot find id')

    self.referer = self.url
    api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid)
    api_url += (
        '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type='
        '&version=4')
    dom = parseString(get_content(api_url))
    self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom)
    xml_streams = merge_meta(m_items, m_streams, m_segs)
    for stream_id in xml_streams:
        stream_data = xml_streams[stream_id]
        src = make_url(stream_data)
        self.streams[stream_id] = {
            'container': 'mp4',
            'video_profile': stream_data['res'],
            'size': int(stream_data['size']),
            'src': src
        }
def kugou_download(url, info_only=False, **kwargs):
    html = get_content(url)
    if url.lower().find('5sing') != -1:
        # for 5sing.kugou.com
        ticket = match1(html, r'"ticket":\s*"(.*)"')
        j = json.loads(str(b64decode(ticket), encoding='utf-8'))
        url = j['file']
        title = j['songName']
        songtype, ext, size = url_info(url)
        print_info(site_info, title, songtype, size)
        if not info_only:
            download_urls([url], title, ext, size, **kwargs)
    else:
        # for www.kugou.com
        hash_val = match1(url, r'hash=(\w+)')
        kugou_download_by_hash(hash_val, info_only, **kwargs)
def baidu_download_album(aid, output_dir='.', merge=True, info_only=False):
    html = get_content('http://music.baidu.com/album/{}'.format(aid))
    parser = get_parser(html)
    album_name = parser.find('h2', class_='album-name').text
    artist = parser.find('span', class_='author_list')['title']
    output_dir = '{}/{} - {}'.format(output_dir, artist, album_name)
    # the data-adddata attribute is HTML-escaped JSON; turn '&quot;' back
    # into real double quotes before parsing
    ids = json.loads(
        match1(
            html, r'<span class="album-add" data-adddata=\'(.+?)\'>'
        ).replace('&quot', '').replace(';', '"'))['ids']
    track_nr = 1
    for _id in ids:
        song_data = baidu_get_song_data(_id)
        song_url = song_data['songLink']
        song_title = song_data['songName']
        song_lrc = song_data['lrcLink']
        file_name = '{:0>2d}.{}'.format(track_nr, song_title)

        _type, ext, size = url_info(song_url)
        print_info(site_info, song_title, _type, size)
        if not info_only:
            download_urls([song_url], file_name, ext, size, output_dir,
                          merge=merge)

        if song_lrc:
            _type, ext, size = url_info(song_lrc)
            print_info(site_info, song_title, _type, size)
            if not info_only:
                download_urls([song_lrc], file_name, ext, size, output_dir)

        track_nr += 1