def extract(self, **kwargs):
    if not self.streams_sorted:
        # No stream is available
        return

    if 'stream_id' in kwargs and kwargs['stream_id']:
        # Extract the stream
        stream_id = kwargs['stream_id']

        if stream_id not in self.streams \
                and stream_id not in self.dash_streams:
            log.e('[Error] Invalid video format.')
            log.e('Run \'-i\' command with no specific video format to '
                  'view all available formats.')
            exit(2)
    else:
        # Extract stream with the best quality
        stream_id = self.streams_sorted[0]['itag']

    if stream_id in self.streams:
        src = self.streams[stream_id]['url']

        if self.streams[stream_id]['sig'] is not None:
            sig = self.streams[stream_id]['sig']
            src += '&signature={}'.format(sig)
        elif self.streams[stream_id]['s'] is not None:
            if not hasattr(self, 'js'):
                self.js = get_content(self.html5player)
            s = self.streams[stream_id]['s']
            sig = self.__class__.decipher(self.js, s)
            src += '&signature={}'.format(sig)

        self.streams[stream_id]['src'] = [src]
        self.streams[stream_id]['size'] = urls_size(
            self.streams[stream_id]['src'])

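
# The hasattr() check above lazily fetches the HTML5 player JS once and
# reuses it for every stream that needs deciphering. A minimal,
# self-contained sketch of that caching pattern (PlayerJSCache and its
# stub _fetch are illustrative, not part of the extractor):
class PlayerJSCache:
    def _fetch(self):
        print('fetching player JS once')
        return '/* player code */'

    def get_js(self):
        # The first call populates self.js; later calls hit the cache.
        if not hasattr(self, 'js'):
            self.js = self._fetch()
        return self.js

cache = PlayerJSCache()
cache.get_js()  # prints once
cache.get_js()  # served from cache, no second fetch
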
def facebook_download(
    url, output_dir='.', merge=True, info_only=False, **kwargs
):
    html = get_content(url)

    title = match1(html, r'<title id="pageTitle">(.+)</title>')
    if title is None:
        title = url

    # The media URLs are embedded JSON-escaped; unescape '\/' and
    # deduplicate, preferring the HD variants when present.
    sd_urls = list(set([
        unicodize(str.replace(i, '\\/', '/'))
        for i in re.findall(r'sd_src_no_ratelimit:"([^"]*)"', html)
    ]))
    hd_urls = list(set([
        unicodize(str.replace(i, '\\/', '/'))
        for i in re.findall(r'hd_src_no_ratelimit:"([^"]*)"', html)
    ]))
    urls = hd_urls if hd_urls else sd_urls

    # url_info() probes the first URL for MIME type and extension; its
    # size is then replaced by the total over all URLs.
    _type, ext, size = url_info(urls[0], True)
    size = urls_size(urls)

    print_info(site_info, title, _type, size)
    if not info_only:
        download_urls(urls, title, ext, size, output_dir, merge=False)

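
# A self-contained sketch of the URL cleanup above, minus you-get's
# unicodize() helper (which additionally decodes \uXXXX escapes). The
# sample HTML fragment is made up for illustration:
import re

sample_html = r'sd_src_no_ratelimit:"https:\/\/video.example.com\/a.mp4"'
cleaned = list(set(
    m.replace('\\/', '/')
    for m in re.findall(r'sd_src_no_ratelimit:"([^"]*)"', sample_html)
))
print(cleaned)  # ['https://video.example.com/a.mp4']
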
def __call__(self, url, **kwargs):
    '''
    data = {
        'urls': [],
        'title': '',
        'file_format': '',
        'size': '',
    }
    '''
    data = self.extract(url, **kwargs)
    if not self.need_download:
        return

    file_format = data.get('file_format', 'mp4')
    size = data.get('size')
    urls = data['urls']
    if not size:
        if len(urls) == 1:
            size = url_size(urls[0])
        else:
            size = urls_size(urls)
    print_info(
        site_info=self.site_info, title=data['title'],
        type=file_format, size=size
    )
    if not kwargs['info_only']:
        download_urls(
            urls=urls, title=data['title'], ext=file_format,
            total_size=size, **kwargs
        )

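
# Sketch of the size fallback above: when the extractor does not report
# a size, probe Content-Length for a single URL or sum it over several.
# head_size/total_size are illustrative stand-ins for you-get's
# url_size/urls_size helpers, using only the standard library:
import urllib.request

def head_size(url):
    req = urllib.request.Request(url, method='HEAD')
    with urllib.request.urlopen(req) as resp:
        return int(resp.headers.get('Content-Length', 0))

def total_size(urls):
    if len(urls) == 1:
        return head_size(urls[0])
    return sum(head_size(u) for u in urls)
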
def extract(self, **kwargs):
    for s in self.streams:
        self.streams[s]['size'] = urls_size(self.streams[s]['src'])

    master_m3u8s = []
    for m in self.master_m3u8:
        master_m3u8s.append(self.master_m3u8[m]['url'])

    master_content = None
    master_url = None

    # Try each candidate master playlist; the last reachable one wins.
    for master_u in master_m3u8s:
        try:
            master_content = get_content(master_u).split('\n')
        except urllib.error.URLError:
            continue
        else:
            master_url = master_u

    if master_content is None:
        return

    lines = []
    for line in master_content:
        if len(line.strip()) > 0:
            lines.append(line.strip())

    pos = 0
    while pos < len(lines):
        if lines[pos].startswith('#EXT-X-STREAM-INF'):
            patt = r'RESOLUTION=(\d+)x(\d+)'
            hit = re.search(patt, lines[pos])
            if hit is None:
                # Skip a malformed tag; without advancing pos the loop
                # would never terminate.
                pos += 1
                continue
            # width = hit.group(1)
            height = hit.group(2)

            if height in ('2160', '1440'):
                m3u8_url = urllib.parse.urljoin(master_url, lines[pos+1])
                meta = dict(m3u8_url=m3u8_url, container='m3u8')
                if height == '1440':
                    meta['video_profile'] = '2560x1440'
                else:
                    meta['video_profile'] = '3840x2160'
                meta['size'] = 0
                meta['src'] = general_m3u8_extractor(m3u8_url)
                self.streams[height + 'p'] = meta

            # Each stream tag is followed by its URI line.
            pos += 2
        else:
            pos += 1

    self.streams_sorted = []
    for stream_type in self.stream_types:
        if stream_type['id'] in self.streams:
            item = [('id', stream_type['id'])] + list(
                self.streams[stream_type['id']].items()
            )
            self.streams_sorted.append(dict(item))

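
# A self-contained sketch of the master-playlist walk above: pair each
# '#EXT-X-STREAM-INF' tag with the URI on the following line and pull
# out the RESOLUTION attribute. The sample playlist text is made up:
import re

SAMPLE_MASTER = '''#EXTM3U
#EXT-X-STREAM-INF:BANDWIDTH=800000,RESOLUTION=1920x1080
1080p/index.m3u8
#EXT-X-STREAM-INF:BANDWIDTH=2400000,RESOLUTION=3840x2160
2160p/index.m3u8
'''

def parse_variants(text):
    lines = [l.strip() for l in text.split('\n') if l.strip()]
    variants = []
    pos = 0
    while pos < len(lines):
        if lines[pos].startswith('#EXT-X-STREAM-INF'):
            hit = re.search(r'RESOLUTION=(\d+)x(\d+)', lines[pos])
            if hit is not None:
                variants.append((hit.group(2), lines[pos + 1]))
            pos += 2  # skip the URI line in either case
        else:
            pos += 1
    return variants

# [('1080', '1080p/index.m3u8'), ('2160', '2160p/index.m3u8')]
print(parse_variants(SAMPLE_MASTER))
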
def letvcloud_download_by_vu(vu, uu, title=None, info_only=False, **kwargs):
    argument_dict = {
        'cf': 'flash',
        'format': 'json',
        'ran': str(int(time.time())),
        'uu': str(uu),
        'ver': '2.2',
        'vu': str(vu),
    }
    # ALL YOUR BASE ARE BELONG TO US
    sign_key = '2f9d6924b33a165a6d8b5d3d42f4f987'
    str2Hash = ''.join(
        [i + argument_dict[i] for i in sorted(argument_dict)]
    ) + sign_key
    sign = hashlib.md5(str2Hash.encode('utf-8')).hexdigest()
    request_info = urllib.request.Request(
        'http://api.letvcloud.com/gpc.php?{}&sign={}'.format(
            '&'.join(
                ['{}={}'.format(i, argument_dict[i]) for i in argument_dict]
            ),
            sign
        )
    )
    response = urllib.request.urlopen(request_info)
    data = response.read()
    info = json.loads(data.decode('utf-8'))
    type_available = []
    for video_type in info['data']['video_info']['media']:
        type_available.append({
            'video_url': info['data']['video_info']['media'][video_type][
                'play_url'
            ]['main_url'],
            'video_quality': int(
                info['data']['video_info']['media'][video_type][
                    'play_url'
                ]['vtype']
            )
        })
    # Pick the highest-quality variant; its URL is base64-encoded.
    urls = [base64.b64decode(sorted(
        type_available, key=lambda x: x['video_quality'])[-1]['video_url']
    ).decode('utf-8')]
    size = urls_size(urls)
    ext = 'mp4'
    print_info(site_info, title, ext, size)
    if not info_only:
        download_urls(urls, title, ext, size, **kwargs)

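
# Self-contained sketch of the signing scheme above: join the request
# parameters as key+value pairs in sorted key order, append the secret
# key, and take the MD5 hex digest. The parameter values here are
# placeholders, not a real request:
import hashlib

def letv_style_sign(params, secret):
    payload = ''.join(k + params[k] for k in sorted(params)) + secret
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

print(letv_style_sign(
    {'cf': 'flash', 'format': 'json', 'uu': 'demo-uu', 'vu': 'demo-vu'},
    '2f9d6924b33a165a6d8b5d3d42f4f987',
))
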
def download(self, **kwargs):
    if 'json_output' in kwargs and kwargs['json_output']:
        json_output.output(self)
    elif 'info_only' in kwargs and kwargs['info_only']:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Display the stream
            stream_id = kwargs['stream_id']
            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)
        else:
            # Display all available streams
            if 'index' not in kwargs:
                self.p([])
            else:
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']
                self.p_i(stream_id)
    else:
        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Download the stream
            stream_id = kwargs['stream_id']
        else:
            # Download stream with the best quality
            stream_id = self.streams_sorted[0]['id'] \
                if 'id' in self.streams_sorted[0] \
                else self.streams_sorted[0]['itag']

        if 'index' not in kwargs:
            self.p(stream_id)
        else:
            self.p_i(stream_id)

        if stream_id in self.streams:
            urls = self.streams[stream_id]['src']
            ext = self.streams[stream_id]['container']
            total_size = self.streams[stream_id]['size']
        else:
            urls = self.dash_streams[stream_id]['src']
            ext = self.dash_streams[stream_id]['container']
            total_size = self.dash_streams[stream_id]['size']

        if not urls:
            log.wtf('[Failed] Cannot extract video source.')

        if ext == 'm3u8':
            ffmpeg_kwargs = {}
            if 'iqiyi' in self.name:
                # ffmpeg_kwargs['override'] = True
                # ffmpeg_kwargs['params'] = {
                #     '-c:a': 'copy', '-bsf:a': 'aac_adtstoasc'
                # }
                m3u8_urls = general_m3u8_extractor(urls[0])
                # FIXME(iawia002): computing the total size of all
                # segments takes too much time
                if len(m3u8_urls) <= 100:
                    size = urls_size(m3u8_urls)
                else:
                    size = float('inf')
                download_urls(m3u8_urls, self.title, 'mp4', size,
                              **kwargs)
            else:
                download_url_ffmpeg(urls[0], self.title, 'mp4',
                                    output_dir=kwargs['output_dir'],
                                    merge=kwargs['merge'],
                                    stream=False, **ffmpeg_kwargs)
        else:
            headers = copy(config.FAKE_HEADERS)
            if self.ua is not None:
                headers['User-Agent'] = self.ua
            if self.referer is not None:
                headers['Referer'] = self.referer
            download_urls(urls, self.title, ext, total_size,
                          headers=headers,
                          output_dir=kwargs['output_dir'],
                          merge=kwargs['merge'],
                          av=stream_id in self.dash_streams)

    if 'caption' not in kwargs or not kwargs['caption']:
        print('Skipping captions or danmaku.')
        return

    for lang in self.caption_tracks:
        filename = '%s.%s.srt' % (get_filename(self.title), lang)
        print('Saving %s ... ' % filename, end="", flush=True)
        srt = self.caption_tracks[lang]
        with open(os.path.join(kwargs['output_dir'], filename),
                  'w', encoding='utf-8') as x:
            x.write(srt)
        print('Done.')

    if self.danmuku is not None and not dry_run:
        filename = '{}.cmt.xml'.format(get_filename(self.title))
        print('Downloading {} ...\n'.format(filename))
        with open(os.path.join(kwargs['output_dir'], filename),
                  'w', encoding='utf8') as fp:
            fp.write(self.danmuku)

    keep_obj = kwargs.get('keep_obj', False)
    if not keep_obj:
        self.__init__()

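
# Sketch of the m3u8 size guard above: probing Content-Length for every
# segment of a long playlist is slow, so the total is only computed for
# short lists and reported as unknown otherwise. playlist_size and the
# injected probe function are illustrative, not you-get API:
def playlist_size(segment_urls, probe, limit=100):
    if len(segment_urls) <= limit:
        return sum(probe(u) for u in segment_urls)
    return float('inf')  # too many segments; skip the probe

print(playlist_size(['a', 'b'], probe=lambda u: 1000))   # 2000
print(playlist_size(['x'] * 500, probe=lambda u: 1000))  # inf
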
def twitter_download(url, info_only=False, **kwargs):
    html = get_content(url)
    screen_name = match1(html, r'data-screen-name="([^"]*)"') or \
        match1(html, r'<meta name="twitter:title" content="([^"]*)"')
    item_id = match1(html, r'data-item-id="([^"]*)"') or \
        match1(html, r'<meta name="twitter:site:id" content="([^"]*)"')
    page_title = '{} [{}]'.format(screen_name, item_id)

    try:  # extract images
        image_urls = re.findall(
            r'property="og:image"\s*content="([^"]+:large)"', html
        )
        assert image_urls
        images = []
        # Use a separate loop variable so the page `url` stays intact
        # for the video fallback below.
        for image_url in image_urls:
            image_url = ':'.join(image_url.split(':')[:-1]) + ':orig'
            filename = parse.unquote(image_url.split('/')[-1])
            title = '.'.join(filename.split('.')[:-1])
            ext = image_url.split(':')[-2].split('.')[-1]
            size = int(get_head(image_url)['Content-Length'])
            images.append({
                'title': title, 'url': image_url, 'ext': ext, 'size': size
            })
        size = sum([image['size'] for image in images])
        print_info(site_info, page_title, images[0]['ext'], size)

        if not info_only:
            for image in images:
                title = image['title']
                ext = image['ext']
                size = image['size']
                url = image['url']
                print_info(site_info, title, ext, size)
                download_urls([url], title, ext, size, **kwargs)
    except Exception:  # extract video
        # always use i/cards or videos url
        if not re.match(r'https?://twitter.com/i/', url):
            url = match1(
                html, r'<meta\s*property="og:video:url"\s*content="([^"]+)"'
            )
            if not url:
                url = 'https://twitter.com/i/videos/{}'.format(item_id)
            html = get_content(url)

        data_config = match1(html, r'data-config="([^"]*)"') or \
            match1(html, r'data-player-config="([^"]*)"')
        i = json.loads(unescape(data_config))
        if 'video_url' in i:
            source = i['video_url']
            item_id = i['tweet_id']
            page_title = '{} [{}]'.format(screen_name, item_id)
        elif 'playlist' in i:
            source = i['playlist'][0]['source']
            if not item_id:
                page_title = i['playlist'][0]['contentId']
        elif 'vmap_url' in i:
            vmap_url = i['vmap_url']
            vmap = get_content(vmap_url)
            source = match1(vmap, r'<MediaFile>\s*<!\[CDATA\[(.*)\]\]>')
            item_id = i['tweet_id']
            page_title = '{} [{}]'.format(screen_name, item_id)
        elif 'scribe_playlist_url' in i:
            scribe_playlist_url = i['scribe_playlist_url']
            return vine_download(
                scribe_playlist_url, info_only=info_only, **kwargs
            )

        try:
            urls = extract_m3u(source)
        except Exception:
            urls = [source]
        size = urls_size(urls)
        mime, ext = 'video/mp4', 'mp4'

        print_info(site_info, page_title, mime, size)
        if not info_only:
            download_urls(urls, page_title, ext, size, **kwargs)

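
# Self-contained sketch of the image-URL rewrite above: a photo URL
# ending in ':large' is switched to the ':orig' (original size) variant
# and the title/extension are derived from the path. The example URL is
# made up for illustration:
from urllib import parse

def to_orig(image_url):
    # Drop the ':large' suffix (the last ':'-separated field) and
    # request the original-size variant instead.
    base = ':'.join(image_url.split(':')[:-1])
    orig_url = base + ':orig'
    filename = parse.unquote(orig_url.split('/')[-1])
    title = '.'.join(filename.split('.')[:-1])
    ext = orig_url.split(':')[-2].split('.')[-1]
    return orig_url, title, ext

# ('https://pbs.twimg.com/media/ABCDEF.jpg:orig', 'ABCDEF', 'jpg')
print(to_orig('https://pbs.twimg.com/media/ABCDEF.jpg:large'))
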
def extract(self, **kwargs):
    if 'stream_id' in kwargs and kwargs['stream_id']:
        i = kwargs['stream_id']
        # Only probe the size of the requested stream, and only once.
        if 'size' not in self.streams[i]:
            self.streams[i]['size'] = urls_size(self.streams[i]['src'])