def tudou_download(url, **kwargs):
    """Download a Tudou video by URL.

    For new-style video.tudou.com URLs the vid is embedded in the URL path;
    otherwise it is extracted from the page's ``__INITIAL_STATE__`` JSON
    blob. Either way the actual download is delegated to the Youku
    extractor (Tudou is served from Youku's backend).
    """
    if 'video.tudou.com' in url:
        # FIX: dots escaped -- the previous pattern's bare '.' matched any
        # character, so unrelated hosts could slip through.
        vid = match1(url, r'.*?video\.tudou\.com/v/([\w=]+)')
    else:
        page = get_content(url)
        video_info = json.loads(
            match1(page, r'window\.__INITIAL_STATE__=\s*(.+?);</script>'))
        vid = video_info['videoDesc']['detail']['videoid']
    youku_download_by_vid(vid, **kwargs)
def entry(self, **kwargs):
    """Dispatch a single-video page to the appropriate downloader.

    Tries, in order: an embedded Tencent player, a native cid found in the
    page, and finally legacy ``flashvars`` pointing at Sina or Youku
    hosting.

    Raises:
        Exception: when the page has no recognizable player data.
        NotImplementedError: for flashvars of an unknown type.
    """
    # tencent player
    tc_match = re.search(
        r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page
    )
    if tc_match:
        self.out = True
        qq_download_by_vid(
            tc_match.group(1), self.title,
            output_dir=kwargs['output_dir'],
            merge=kwargs['merge'],
            info_only=kwargs['info_only']
        )
        return

    has_plist = re.search(r'<option', self.page)
    if has_plist and r1(r'index_(\d+).html', self.url) is None:
        log.w(
            'This page contains a playlist. (use --playlist to download '
            'all videos.)'
        )

    # FIX: previously ``re.search(...).group(1)`` raised AttributeError
    # when neither cid pattern matched, which made the flashvars fallback
    # below unreachable. Now a failed match leaves cid as None.
    cid = None
    for pattern in (r'cid=(\d+)', r'"cid":(\d+)'):
        m = re.search(pattern, self.page)
        if m:
            cid = m.group(1)
            break

    if cid is not None:
        self.download_by_vid(
            cid,
            re.search('bangumi', self.url) is not None,
            **kwargs
        )
    else:
        # flashvars?
        # FIX: check the match object before calling .group(1) -- the old
        # code called .group(1) first, so an unsupported page raised
        # AttributeError instead of the intended Exception.
        flashvars_match = re.search(r'flashvars="([^"]+)"', self.page)
        if flashvars_match is None:
            raise Exception('Unsupported page {}'.format(self.url))
        flashvars = flashvars_match.group(1)
        param = flashvars.split('&')[0]
        t, cid = param.split('=')
        t = t.strip()
        cid = cid.strip()
        if t == 'vid':
            sina_download_by_vid(
                cid, self.title, output_dir=kwargs['output_dir'],
                merge=kwargs['merge'], info_only=kwargs['info_only']
            )
        elif t == 'ykid':
            youku_download_by_vid(
                cid, self.title, output_dir=kwargs['output_dir'],
                merge=kwargs['merge'], info_only=kwargs['info_only']
            )
        else:
            raise NotImplementedError(
                'Unknown flashvars {}'.format(flashvars)
            )
    return
def acfun_download_by_vid(vid, title, output_dir='.', merge=True,
                          info_only=False, **kwargs):
    """str, str, str, bool, bool -> None

    Download Acfun video by vid.

    Call Acfun API, decide which site actually hosts the video, and pass
    the job to that site's extractor. Optionally fetches the danmaku
    (comment) file afterwards when ``caption`` is truthy in kwargs.
    """
    # first call the main parsing API
    info = json.loads(
        get_content(
            'http://www.acfun.cn/video/getVideo.aspx?id={}'.format(vid)))
    sourceType = info['sourceType']

    # decide sourceId to know which extractor to use
    if 'sourceId' in info:
        sourceId = info['sourceId']
    # danmakuId = info['danmakuId']

    # call extractor decided by sourceType
    if sourceType == 'sina':
        sina_download_by_vid(sourceId, title, output_dir=output_dir,
                             merge=merge, info_only=info_only)
    elif sourceType == 'youku':
        youku_download_by_vid(sourceId, title=title, output_dir=output_dir,
                              merge=merge, info_only=info_only, **kwargs)
    elif sourceType == 'qq':
        qq_download_by_vid(sourceId, title, output_dir=output_dir,
                           merge=merge, info_only=info_only)
    elif sourceType == 'letv':
        letvcloud_download_by_vu(sourceId, '2d8c027396', title,
                                 output_dir=output_dir, merge=merge,
                                 info_only=info_only)
    elif sourceType == 'zhuzhan':
        # As in Jul.28.2016, Acfun is using embsig to anti hotlink so we
        # need to pass this. In Mar. 2017 there is a dedicated
        # ``acfun_proxy'' in youku cloud player; old code removed.
        url = 'http://www.acfun.cn/v/ac{}'.format(vid)
        yk_streams = youku_acfun_proxy(info['sourceId'], info['encode'], url)

        # Pick the best available quality, highest first.
        seq = ['mp4hd3', 'mp4hd2', 'mp4hd', 'flvhd']
        # FIX: 'preferred' was left unbound (NameError) when none of the
        # expected stream qualities was present; fail with a clear error
        # instead.
        preferred = None
        for t in seq:
            if yk_streams.get(t):
                preferred = yk_streams[t]
                break
        if preferred is None:
            raise NotImplementedError(
                'No supported stream found for ac{}'.format(vid))

        # total_size in the json could be incorrect (e.g. 0), so sum the
        # sizes of the individual segments instead.
        size = 0
        for seg_url in preferred[0]:
            _, _, seg_size = url_info(seg_url)
            size += seg_size

        # fallback to flvhd is not quite possible
        print_info(site_info, title, 'mp4', size)
        if not info_only:
            download_urls(preferred[0], title, 'mp4', size,
                          output_dir=output_dir, merge=merge)
    else:
        raise NotImplementedError(sourceType)

    if not info_only and not dry_run:
        # FIX: kwargs['caption'] raised KeyError when the caller did not
        # pass the flag at all; .get() treats a missing flag as falsy.
        if not kwargs.get('caption'):
            print('Skipping danmaku.')
            return
        try:
            title = get_filename(title)
            print('Downloading %s ...\n' % (title + '.cmt.json'))
            cmt = get_srt_json(vid)
            with open(os.path.join(output_dir, title + '.cmt.json'), 'w',
                      encoding='utf-8') as x:
                x.write(cmt)
        except Exception:
            # Danmaku download is best-effort; never fail the video for it.
            pass
def embed_download(url, output_dir='.', merge=True, info_only=False,
                   **kwargs):
    """Scan a generic web page for embedded players and download them.

    Tries a battery of known embed patterns (Youku, Yinyuetai, iQiyi,
    NetEase, Vimeo, Dailymotion, bilibili, bokecc); when none match,
    recurses into ``<iframe>`` sources up to ``recur_limit`` levels deep.

    Returns:
        bool: True when at least one embedded video was handled.

    Raises:
        NotImplementedError: at the top recursion level, when nothing
            downloadable was found.
    """
    content = get_content(url)
    found = False
    title = match1(content, '<title>([^<>]+)</title>')

    vids = matchall(content, youku_embed_patterns)
    for vid in set(vids):
        found = True
        youku_download_by_vid(
            vid, title=title, output_dir=output_dir, merge=merge,
            info_only=info_only
        )

    vids = matchall(content, yinyuetai_embed_patterns)
    for vid in vids:
        found = True
        yinyuetai_download_by_id(
            vid, title=title, output_dir=output_dir, merge=merge,
            info_only=info_only
        )

    vids = matchall(content, iqiyi_embed_patterns)
    for vid in vids:
        found = True
        iqiyi_download_by_vid(
            (vid[1], vid[0]), title=title, output_dir=output_dir,
            merge=merge, info_only=info_only
        )

    # NOTE: loop variables renamed so they no longer shadow the ``url``
    # parameter, which is still needed below for iframe resolution.
    netease_urls = matchall(content, netease_embed_patterns)
    for netease_url in netease_urls:
        found = True
        netease_download(
            netease_url, output_dir=output_dir, merge=merge,
            info_only=info_only
        )

    vimeo_urls = matchall(content, vimeo_embed_patters)
    for vimeo_url in vimeo_urls:
        found = True
        vimeo_download_by_id(
            vimeo_url, title=title, output_dir=output_dir, merge=merge,
            info_only=info_only, referer=vimeo_url
        )

    dailymotion_urls = matchall(content, dailymotion_embed_patterns)
    for dailymotion_url in dailymotion_urls:
        found = True
        dailymotion_download(
            dailymotion_url, output_dir=output_dir, merge=merge,
            info_only=info_only
        )

    aids = matchall(content, bilibili_embed_patterns)
    for aid in aids:
        found = True
        bili_url = 'http://www.bilibili.com/video/av{}/'.format(aid)
        bilibili_download(
            bili_url, output_dir=output_dir, merge=merge,
            info_only=info_only
        )

    iqiyi_urls = matchall(content, iqiyi_patterns)
    for iqiyi_url in iqiyi_urls:
        found = True
        iqiyi.download(
            iqiyi_url, output_dir=output_dir, merge=merge,
            info_only=info_only, **kwargs
        )

    bokecc_metas = matchall(content, bokecc_patterns)
    for meta in bokecc_metas:
        found = True
        bokecc.bokecc_download_by_id(
            meta[1], output_dir=output_dir, merge=merge,
            info_only=info_only, **kwargs
        )

    if found:
        return True

    # Try harder, check all iframes.
    # FIX: 'recur_lv' must be *removed* from kwargs before recursing --
    # the old code forwarded it both explicitly (recur_lv=r) and inside
    # **kwargs, raising "got multiple values for keyword argument".
    recur_lv = kwargs.pop('recur_lv', None)
    top_level = recur_lv is None
    if top_level or recur_lv < recur_limit:
        r = 1 if top_level else recur_lv + 1
        iframes = matchall(
            content, [r'<iframe.+?src=(?:\"|\')(.+?)(?:\"|\')']
        )
        for iframe in iframes:
            if not iframe.startswith('http'):
                src = urllib.parse.urljoin(url, iframe)
            else:
                src = iframe
            found = embed_download(
                src, output_dir=output_dir, merge=merge,
                info_only=info_only, recur_lv=r, **kwargs
            )
            if found:
                return True

    if not found and top_level:
        raise NotImplementedError(url)
    return found