def baidu_download_album(aid, output_dir='.', merge=True, info_only=False):
    """Download every track (and its lyric file, when present) of the
    Baidu Music album *aid* into '<output_dir>/<artist> - <album>'.
    """
    html = get_content('http://music.baidu.com/album/{}'.format(aid))
    parser = get_parser(html)
    album_name = parser.find('h2', class_='album-name').text
    artist = parser.find('span', class_='author_list')['title']
    output_dir = '{}/{} - {}'.format(output_dir, artist, album_name)
    # The data-adddata attribute embeds JSON with '"' encoded as ';';
    # undo that encoding before parsing.
    raw = match1(html, r'<span class="album-add" data-adddata=\'(.+?)\'>')
    ids = json.loads(raw.replace('"', '').replace(';', '"'))['ids']
    for track_nr, _id in enumerate(ids, start=1):
        song_data = baidu_get_song_data(_id)
        song_url = song_data['songLink']
        song_title = song_data['songName']
        song_lrc = song_data['lrcLink']
        # Zero-pad the track number so files sort in album order.
        file_name = '{:0>2d}.{}'.format(track_nr, song_title)

        _type, ext, size = url_info(song_url)
        print_info(site_info, song_title, _type, size)
        if not info_only:
            download_urls(
                [song_url], file_name, ext, size, output_dir, merge=merge)

        if song_lrc:
            _type, ext, size = url_info(song_lrc)
            print_info(site_info, song_title, _type, size)
            if not info_only:
                download_urls([song_lrc], file_name, ext, size, output_dir)
def joy_download(url, info_only=False, **kwargs):
    """Download the single video embedded in a Joy page."""
    page = get_content(url)
    parser = get_parser(page)
    video_url = parser.source['src']
    title = parser.h1.text.strip()
    _, ext, size = url_info(video_url)
    print_info(site_info, title, ext, size)
    if info_only:
        return
    download_urls([video_url], title, ext, size, **kwargs)
def prepare(self, **kwargs):
    """Canonicalize the bilibili URL, fetch the page, derive the title
    and dispatch to the live / vc / regular (bangumi, movie) entry.
    """
    if socket.getdefaulttimeout() == 600:  # no timeout specified
        socket.setdefaulttimeout(2)  # fail fast, very speedy!
    # handle 'watchlater' URLs
    if '/watchlater/' in self.url:
        aid = match1(self.url, r'av(\d+)')
        self.url = 'https://www.bilibili.com/video/av{}/'.format(aid)
    self.ua = FAKE_HEADERS['User-Agent']
    if 'bangumi' not in self.url:
        # bangumi redirect will miss fragment argument here
        # http://bangumi.bilibili.com/anime/21542/play#173286 ->
        # https://www.bilibili.com/bangumi/play/ss21542
        # It should be https://www.bilibili.com/bangumi/play/ss21542#173286
        self.url = url_locations([self.url])[0]
    frag = urllib.parse.urlparse(self.url).fragment
    # http://www.bilibili.com/video/av3141144/index_2.html#page=3
    if frag:
        page = match1(frag, r'page=(\d+)')
        if page:
            aid = match1(self.url, r'av(\d+)')
            self.url = (
                'https://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, page))
    # handle bangumi url like this
    # http://bangumi.bilibili.com/anime/21542/play#173286
    # https://www.bilibili.com/bangumi/play/ss21542#173286
    # https://www.bilibili.com/bangumi/play/ep173286
    bangumi_ep_id = match1(self.url, r'/anime/\d+/play#(\d+)') or \
        match1(self.url, r'/bangumi/play/ss\d+#(\d+)')
    if bangumi_ep_id:
        self.url = 'https://www.bilibili.com/bangumi/play/ep{}'.format(
            bangumi_ep_id)

    self.referer = self.url
    self.page = get_content(self.url)
    self.parser = get_parser(self.page)
    if self.parser.h1:
        self.title = self.parser.h1.text.strip()
    else:
        # Some movie page got no h1 tag
        self.title = self.parser.find(
            'meta', property='og:title')['content']
    if 'subtitle' in kwargs:
        subtitle = kwargs['subtitle']
        self.title = '{} {}'.format(self.title, subtitle)
    if 'live.bilibili.com' in self.url:
        self.live_entry(**kwargs)
    elif 'vc.bilibili.com' in self.url:
        self.vc_entry(**kwargs)
    else:
        # bangumi, movie use this entry too
        self.entry(**kwargs)
def douban_download_playlist(url, output_dir='.', **kwargs):
    """Download every video linked from a Douban playlist page.

    Creates a sub-directory named after the playlist title under
    *output_dir* and delegates each entry to douban_download().
    """
    html = get_content(url)
    parser = get_parser(html)
    video_dir = Path(output_dir) / parser.h1.a.text
    # Use .get(): 'info_only' may be absent from kwargs, and the
    # original kwargs['info_only'] raised KeyError in that case.
    if not kwargs.get('info_only'):
        # parents=True also creates missing intermediate directories,
        # which the original bare os.mkdir() could not.
        video_dir.mkdir(parents=True, exist_ok=True)
    # Avoid rebinding the 'url' parameter inside the loop.
    for entry in parser.find_all('a', class_='pr-video'):
        douban_download(entry['href'], output_dir=video_dir, **kwargs)
def instagram_download(url, output_dir='.', merge=True, info_only=False,
                       **kwargs):
    """Download the video or image(s) of an Instagram post."""
    # Strip the query string; the path alone identifies the post.
    url = match1(url, r'([^?]*)')
    html = get_content(url)
    vid = match1(url, r'instagram.com/p/([^/]+)')
    parser = get_parser(html)
    description = parser.find('meta', property='og:title')['content']
    title = '{} [{}]'.format(description, vid)
    stream = parser.find('meta', property='og:video')
    if stream:
        # Video post: og:video carries the direct stream URL.
        stream = stream['content']
        _, ext, size = url_info(stream)
        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls([stream], title, ext, size, output_dir,
                          merge=merge)
    else:
        # Image post: metadata lives in the inline window._sharedData JSON.
        data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', html)
        info = json.loads(data.group(1))
        if 'edge_sidecar_to_children' in info['entry_data']['PostPage'][0][
                'graphql']['shortcode_media']:
            # Multi-image (sidecar) post: download every child image,
            # each titled by its own shortcode.
            edges = info['entry_data']['PostPage'][0]['graphql'][
                'shortcode_media']['edge_sidecar_to_children']['edges']
            for edge in edges:
                title = edge['node']['shortcode']
                image_url = edge['node']['display_url']
                ext = image_url.split('.')[-1]
                size = url_size(image_url)
                print_info(site_info, title, ext, size)
                if not info_only:
                    download_urls(urls=[image_url], title=title, ext=ext,
                                  total_size=size, output_dir=output_dir)
        else:
            # Single-image post.
            title = info['entry_data']['PostPage'][0]['graphql'][
                'shortcode_media']['shortcode']
            image_url = info['entry_data']['PostPage'][0]['graphql'][
                'shortcode_media']['display_url']
            ext = image_url.split('.')[-1]
            size = url_size(image_url)
            print_info(site_info, title, ext, size)
            if not info_only:
                download_urls(urls=[image_url], title=title, ext=ext,
                              total_size=size, output_dir=output_dir)
def giphy_download(url, info_only=False, **kwargs):
    """Download both the GIF and the video rendition of a Giphy page."""
    html = get_content(url)
    parser = get_parser(html)
    title = parser.find('meta', property='og:title')['content']
    gif = parser.find('meta', property='og:image')['content']
    video = parser.find('meta', property='og:video')['content']
    # Fetch both renditions; do not rebind the 'url' parameter.
    for media_url in (gif, video):
        _type, ext, size = url_info(media_url)
        print_info(site_info, title, _type, size)
        if not info_only:
            download_urls([media_url], title, ext, size, **kwargs)
def bcy_download(url, output_dir='.', info_only=False, **kwargs):
    """Download every illustration of a bcy.net post into a directory
    named after the post title.
    """
    html = get_content(url)
    parser = get_parser(html)
    title = parser.h1.text.strip()
    output_dir = Path(output_dir) / title
    imgs = parser.find_all('img', class_='detail_std detail_clickable')
    print_info(site_info, title, 'jpg', 0)
    if info_only:
        return
    for img in imgs:
        # https://img9.bcyimg.com/drawer/15294/post/1799t/1f5a87801a0711e898b12b640777720f.jpg/w650  # noqa
        # Drop the 5-char '/w650' size suffix to get the full image.
        full_url = img['src'][:-5]
        filename, ext = full_url.split('/')[-1].split('.')
        download_urls([full_url], filename, ext, 0, output_dir, **kwargs)
def ehow_download(url, info_only=False, **kwargs):
    """Download the video from an eHow video page.

    Raises:
        ValueError: if *url* is not an eHow video page URL.
    """
    # Validate with an explicit raise: the original used assert, which
    # is stripped when Python runs with -O and would silently accept
    # unsupported URLs.
    if not re.search(r'https?://www.ehow.com/video_', url):
        raise ValueError('URL you entered is not supported')
    html = get_content(url)
    parser = get_parser(html)
    title = parser.find('meta', property='og:title')['content']
    video = parser.find('meta', property='og:video')['content']
    # The og:video URL wraps the real stream in a 'source=' query param.
    url = match1(video, r'source=(.+?)&')
    _type, ext, size = url_info(url)
    print_info(site_info, title, _type, size)
    if not info_only:
        download_urls([url], title, ext, size, **kwargs)
def huomaotv_download(
    url, output_dir='.', merge=True, info_only=False, **kwargs
):
    """Download (via ffmpeg) the live stream of a huomao.com room."""
    room_id = match1(url, r'huomao.com/(\d+)')
    # The mobile page exposes the m3u8 source directly in a <source> tag.
    html = get_content(
        'http://m.huomao.com/mobile/mob_live/{}'.format(room_id)
    )
    parser = get_parser(html)
    m3u8_url = parser.source['src']
    title = parser.title.text
    # Live stream: total size is unknown, report it as infinite.
    print_info(site_info, title, 'm3u8', float('inf'))
    if info_only:
        return
    download_url_ffmpeg(
        m3u8_url, title, 'mp4', None, output_dir=output_dir, merge=merge
    )
def prepare(self, **kwargs):
    """Resolve (tvid, videoid) from the URL or page, fetch the title,
    and populate self.streams from the iQiyi VMS playback info.
    """
    assert self.url or self.vid

    if self.url and not self.vid:
        html = get_content(self.url)
        # tvid / videoid may appear in the URL fragment, the query
        # string, or in several inline-script variants; try each
        # pattern in turn.
        tvid = match1(self.url, r'#curid=(.+)_') or match1(
            self.url, r'tvid=([^&]+)') or match1(
            html, r'data-player-tvid="([^"]+)"') or match1(
            html, r'tv(?:i|I)d=(.+?)\&') or match1(
            html, r'param\[\'tvid\'\]\s*=\s*"(.+?)"')
        videoid = match1(self.url, r'#curid=.+_(.*)$') or match1(
            self.url, r'vid=([^&]+)') or match1(
            html, r'data-player-videoid="([^"]+)"') or match1(
            html, r'vid=(.+?)\&') or match1(
            html, r'param\[\'vid\'\]\s*=\s*"(.+?)"')
        self.vid = (tvid, videoid)
        # The mixer endpoint answers with 'var tvInfoJs=<json>'; strip
        # that prefix, then follow the canonical page URL for the title.
        info_u = 'http://mixer.video.iqiyi.com/jp/mixin/videos/{}'.format(
            tvid)
        mixin = get_content(info_u)
        mixin_json = json.loads(mixin[len('var tvInfoJs='):])
        real_u = mixin_json['url']
        real_html = get_content(real_u)
        parser = get_parser(real_html)
        self.title = parser.find('meta', property='og:title')['content']

    tvid, videoid = self.vid
    info = getVMS(tvid, videoid)
    # 'A00000' is the VMS success code.
    assert info['code'] == 'A00000', "can't play this video"

    for stream in info['data']['vidl']:
        try:
            stream_id = self.vd_2_id[stream['vd']]
            # Presumably skips quality ids already recorded — confirm
            # against the definition of self.stream_types.
            if stream_id in self.stream_types:
                continue
            stream_profile = self.id_2_profile[stream_id]
            self.streams[stream_id] = {
                'video_profile': stream_profile,
                'container': 'm3u8',
                'src': [stream['m3u']],
                'size': 0,
                'm3u8_url': stream['m3u']
            }
        except Exception as e:
            # Unknown 'vd' ids are logged and skipped, not fatal.
            log.i('vd: {} is not handled'.format(stream['vd']))
            log.i('info is {}'.format(stream))
def acfun_download(url, output_dir='.', merge=True, info_only=False,
                   **kwargs):
    """Download an AcFun video page (handles the current part of a
    multi-part video) by resolving its vid and delegating to
    acfun_download_by_vid().
    """
    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
    html = get_content(url)
    parser = get_parser(html)
    title = parser.find(id='pageInfo')['data-title']
    if match1(url, r'_(\d+)$'):  # current P
        # Multi-part video: append the active part's label to the title.
        title = '{} {}'.format(title, match1(html, r'active">([^<]*)'))
    # Fix: these patterns were plain strings; '\d' in a non-raw literal
    # is an invalid escape sequence (SyntaxWarning on modern Python).
    vid = match1(html, r'data-vid="(\d+)"')
    up = match1(html, r'data-name="([^"]+)"')
    p_title = match1(html, r'active">([^<]+)')
    title = '{} ({})'.format(title, up)
    if p_title:
        title = '{} - {}'.format(title, p_title)
    acfun_download_by_vid(
        vid, title, output_dir=output_dir, merge=merge,
        info_only=info_only, **kwargs)
def prepare(self, **kwargs):
    """Canonicalize the bilibili URL, fetch the page and title, and
    dispatch to the movie / bangumi / live / vc / regular entry point.
    """
    if socket.getdefaulttimeout() == 600:  # no timeout specified
        socket.setdefaulttimeout(2)  # fail fast, very speedy!
    # handle "watchlater" URLs
    if '/watchlater/' in self.url:
        aid = re.search(r'av(\d+)', self.url).group(1)
        self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
    self.ua = FAKE_HEADERS['User-Agent']
    # Follow redirects to the canonical location.
    self.url = url_locations([self.url])[0]
    frag = urllib.parse.urlparse(self.url).fragment
    # http://www.bilibili.com/video/av3141144/index_2.html#page=3
    if frag:
        hit = re.search(r'page=(\d+)', frag)
        if hit is not None:
            # Rebuild the URL so it points at the selected part (P).
            page = hit.group(1)
            aid = re.search(r'av(\d+)', self.url).group(1)
            self.url = (
                'http://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, page
                )
            )
    self.referer = self.url
    self.page = get_content(self.url)
    self.parser = get_parser(self.page)
    self.title = self.parser.h1.text.strip()
    if 'subtitle' in kwargs:
        subtitle = kwargs['subtitle']
        self.title = '{} {}'.format(self.title, subtitle)
    if 'bangumi.bilibili.com/movie' in self.url:
        self.movie_entry(**kwargs)
    elif 'bangumi.bilibili.com' in self.url:
        self.bangumi_entry(**kwargs)
    elif 'live.bilibili.com' in self.url:
        self.live_entry(**kwargs)
    elif 'vc.bilibili.com' in self.url:
        self.vc_entry(**kwargs)
    else:
        self.entry(**kwargs)
def douban_download(
    url, output_dir='.', merge=True, info_only=False, **kwargs
):
    """Download the trailer video from a movie.douban.com page.

    Non-movie douban URLs are fetched but otherwise ignored (no else
    branch — this mirrors the original behavior).
    """
    html = get_content(url)
    parser = get_parser(html)
    if re.match(r'https?://movie', url):
        # The h1 may contain several text nodes; join them into one title.
        title = ' '.join(
            [string.strip() for string in parser.h1.strings]
        )
        # Fix: pattern was a plain string; '\d' in a non-raw literal is
        # an invalid escape sequence (SyntaxWarning on modern Python).
        tid = match1(url, r'trailer/(\d+)')
        real_url = 'https://movie.douban.com/trailer/video_url?tid={}'.format(
            tid
        )
        _type, ext, size = url_info(real_url)
        print_info(site_info, title, _type, size)
        if not info_only:
            download_urls(
                [real_url], title, ext, size, output_dir, merge=merge
            )
def freesound_download(url, output_dir='.', merge=True, info_only=False,
                       **kwargs):
    """Download the preview audio advertised by a Freesound page."""
    page = get_content(url)
    parser = get_parser(page)
    title = parser.find('meta', property='og:title')['content']
    preview_url = parser.find('meta', property='og:audio')['content']
    _type, ext, size = url_info(preview_url)
    print_info(site_info, title, _type, size)
    if info_only:
        return
    download_urls(
        [preview_url], title, ext, size, output_dir, merge=merge, **kwargs)
def pixivision_download(url, output_dir='.', info_only=False, **kwargs):
    """Download all illustrations of a pixivision article into a
    directory named after the article title.
    """
    html = get_content(url)
    parser = get_parser(html)
    title = parser.h1.text.strip()
    output_dir = Path(output_dir) / title
    imgs = parser.find_all('img', class_='am__work__illust')
    print_info(site_info, title, 'jpg', 0)
    if info_only:
        return
    # The image host checks the Referer header when sizing/serving files.
    headers = FAKE_HEADERS.copy()
    headers['Referer'] = url
    for img in imgs:
        image_url = img['src']
        size = url_size(image_url, headers=headers)
        filename, ext = image_url.split('/')[-1].split('.')
        download_urls(
            [image_url], filename, ext, size, output_dir, refer=url,
            **kwargs)
def test_parser(self):
    """get_parser() should build a soup exposing stripped tag text."""
    soup = parser.get_parser('<h1> hello</h1>')
    self.assertEqual(soup.h1.string.strip(), 'hello')
def longzhu_download(url, info_only=False, **kwargs):
    """Dispatch a longzhu.com URL by sub-domain: live rooms
    (star./y.), replays (replay.) or plain videos (v.).

    Raises:
        ValueError: for unsupported URLs or when a live stream is
            offline.
    """
    web_domain = url.split('/')[2]
    if (web_domain == 'star.longzhu.com') or (web_domain == 'y.longzhu.com'):
        # Live room: resolve the room id via the mobile page, then ask
        # the live API for status and for the actual stream URL.
        domain = url.split('/')[3].split('?')[0]
        m_url = 'http://m.longzhu.com/{}'.format(domain)
        m_html = get_content(m_url)
        room_id_patt = r'var\s*roomId\s*=\s*(\d+);'
        room_id = match1(m_html, room_id_patt)

        json_url = ('http://liveapi.plu.cn/liveapp/roomstatus?roomId={}'.
                    format(room_id))
        content = get_content(json_url)
        data = json.loads(content)
        streamUri = data['streamUri']
        # A very short streamUri is treated as "nothing being published".
        if len(streamUri) <= 4:
            raise ValueError('The live stream is not online!')
        title = data['title']
        streamer = data['userName']
        title = '{}:{}'.format(streamer, title)

        steam_api_url = (
            'http://livestream.plu.cn/live/getlivePlayurl?roomId={}'.format(
                room_id))
        content = get_content(steam_api_url)
        data = json.loads(content)
        isonline = data.get('isTransfer')
        # The API reports '0' when the room exists but is not streaming.
        if isonline == '0':
            raise ValueError('The live stream is not online!')
        real_url = data['playLines'][0]['urls'][0]['securityUrl']

        # Live stream: size unknown, report as infinite.
        print_info(site_info, title, 'flv', float('inf'))
        if not info_only:
            download_urls([real_url], title, 'flv', None, **kwargs)
    elif web_domain == 'replay.longzhu.com':
        # Replay: the replay API returns an m3u8 playlist URL.
        videoid = match1(url, r'(\d+)$')
        json_url = (
            'http://liveapi.longzhu.com/livereplay/getreplayfordisplay?'
            'videoId={}'.format(videoid))
        content = get_content(json_url)
        data = json.loads(content)

        username = data['userName']
        title = data['title']
        title = '{}:{}'.format(username, title)
        real_url = data['videoUrl']

        print_info(site_info, title, 'm3u8', 0)
        # NOTE(review): 'player' looks like a module-level flag for an
        # external player; when set, the playlist URL is handed over
        # directly without honoring info_only — confirm this is intended.
        if player:
            download_urls([real_url], title, 'm3u8', 0, **kwargs)
        else:
            urls = general_m3u8_extractor(real_url)
            if not info_only:
                download_urls(urls, title, 'ts', 0, **kwargs)
    elif web_domain == 'v.longzhu.com':
        # Plain video: pick the mp4 rendition from the media API.
        page = get_content(url)
        parser = get_parser(page)
        title = parser.title.text
        media_id = match1(url, r'(\d+)$')
        # http://r.plures.net/ov/video/mobile/channel/video-1525da26174.js
        json_url = ('http://api.v.plu.cn/CloudMedia/GetInfoForPlayer?'
                    'mediaId={}'.format(media_id))
        content = get_content(json_url)
        data = json.loads(content)

        video = list(filter(lambda x: x['Ext'] == 'mp4', data['urls']))[0]
        video_url = video['SecurityUrl']
        ext = video['Ext']
        size = url_size(video_url)

        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls([video_url], title, ext, size, **kwargs)
    else:
        raise ValueError('Wrong url or unsupported link ... {}'.format(url))