Beispiel #1
0
def naver_download_by_url(url, info_only=False, **kwargs):
    """Download a Naver VOD given its watch-page URL."""
    ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'
    page = get_content(url)
    og_video_url = re.search(
        r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page
    ).group(1)
    # The og:video:url carries the vid and outKey as query parameters.
    query = urllib.parse.urlparse(og_video_url).query
    qs = urllib.parse.parse_qs(query)
    vid, key = qs['vid'][0], qs['outKey'][0]
    meta_json = json.loads(get_content(ep.format(vid, key)))
    if 'errorCode' in meta_json:
        log.wtf(meta_json['errorCode'])
    title = meta_json['meta']['subject']
    # Sort ascending by encoded width; the last entry is the widest.
    by_width = sorted(
        meta_json['videos']['list'],
        key=lambda video: video['encodingOption']['width']
    )
    video_url = by_width[-1]['source']
    size = url_size(video_url)
    print_info(site_info, title, 'mp4', size)
    if not info_only:
        download_urls([video_url], title, 'mp4', size, **kwargs)
Beispiel #2
0
def vidto_download(url, info_only=False, **kwargs):
    """Download a video from vidto.me.

    The site reveals the download link only after the hidden form
    fields are POSTed back following a 6-second countdown.
    """
    html = get_content(url)
    # Collect the hidden/submit form fields that must be POSTed back.
    params = {}
    r = re.findall(
        r'type="(?:hidden|submit)?"(?:.*?)name="(.+?)"\s* value="?(.+?)">',
        html
    )
    for name, value in r:
        params[name] = value
    data = parse.urlencode(params).encode('utf-8')
    req = request.Request(url, headers=FAKE_HEADERS)
    print('Please wait for 6 seconds...')
    time.sleep(6)  # the server rejects submissions made too quickly
    print('Starting')
    new_html = request.urlopen(req, data).read().decode('utf-8', 'replace')
    new_stff = re.search(r'lnk_download" href="(.*?)">', new_html)
    if new_stff is None:
        # BUG FIX: corrected the garbled error message ("Cann't").
        log.wtf("Cannot find link, please review")
    url = new_stff.group(1)
    title = params['fname']
    # Type and extension are unknown for this host; url_info supplies
    # the size only (first two return values are unused).
    _, _, size = url_info(url)
    print_info(site_info, title, '', size)
    if not info_only:
        download_urls([url], title, '', size, **kwargs)
Beispiel #3
0
def panda_download(url, info_only=False, **kwargs):
    """Download a live stream from panda.tv.

    Raises ValueError when the API reports an error or the room is not
    currently live.
    """
    roomid = re.search(r'/(\d+)', url)
    if roomid is None:
        log.wtf('Cannot found room id for this url')
    roomid = roomid.group(1)
    json_request_url = (
        'http://www.panda.tv/api_room_v2?roomid={}&__plat=pc_web&_={}'.format(
            roomid, int(time.time())))
    content = get_content(json_request_url)
    api_json = json.loads(content)

    errno = api_json['errno']
    errmsg = api_json['errmsg']
    if errno:
        raise ValueError('Errno : {}, Errmsg : {}'.format(errno, errmsg))
    data = api_json['data']
    title = data['roominfo']['name']
    room_key = data['videoinfo']['room_key']
    plflag = data['videoinfo']['plflag'].split('_')
    status = data['videoinfo']['status']
    # BUG FIX: `status is not '2'` compared object identity, which is
    # unreliable for strings; use value equality. '2' means "live".
    if status != '2':
        raise ValueError(
            'The live stream is not online! (status:{})'.format(status))

    data2 = json.loads(data['videoinfo']['plflag_list'])
    rid = data2['auth']['rid']
    sign = data2['auth']['sign']
    ts = data2['auth']['time']
    real_url = ('http://pl{}.live.panda.tv/live_panda/{}.flv?sign={}&ts={}&'
                'rid={}'.format(plflag[1], room_key, sign, ts, rid))
    # Live streams have no known size.
    print_info(site_info, title, 'flv', float('inf'))
    if not info_only:
        download_urls([real_url], title, 'flv', None, **kwargs)
Beispiel #4
0
def douyutv_video_download(url,
                           output_dir='.',
                           merge=True,
                           info_only=False,
                           **kwargs):
    """Download a recorded video from vmobile.douyu.com."""
    ep = 'http://vmobile.douyu.com/video/getInfo?vid='

    vid_match = re.search(r'show/([0-9A-Za-z]+)', url)
    if vid_match is None:
        log.wtf('Unknown url pattern')
    vid = vid_match.group(1)

    # Fall back to the vid when the page carries no <h1> title.
    page = get_content(url)
    title_match = re.search(r'<h1>(.+?)</h1>', page)
    title = vid if title_match is None else title_match.group(1)

    meta = json.loads(get_content(ep + vid))
    if meta['error'] != 0:
        log.wtf('Error from API server')
    m3u8_url = meta['data']['video_url']
    print_info('Douyu Video', title, 'm3u8', 0, m3u8_url=m3u8_url)
    if not info_only:
        urls = general_m3u8_extractor(m3u8_url)
        download_urls(urls, title, 'ts', 0,
                      output_dir=output_dir, merge=merge, **kwargs)
Beispiel #5
0
def wanmen_download(url, info_only=False, **kwargs):
    """Dispatch a wanmen.org course URL to the right downloader.

    Depending on which query parameters the URL carries, downloads a
    single part (pIndex), a whole topic (tIndex), or the entire course.
    """
    if 'wanmen.org' not in url:
        log.wtf(
            'You are at the wrong place dude. This is for WanMen University!')

    courseID = int(match1(url, r'course\/(\d+)'))
    assert courseID > 0  # without courseID we cannot do anything

    # BUG FIX: match1() returns None when the parameter is absent, and
    # int(None) raises TypeError.  Default to 0 ("not given") so URLs
    # without tIndex/pIndex fall through to the whole-course branch.
    tIndex = int(match1(url, r'tIndex=(\d+)') or 0)

    pIndex = int(match1(url, r'pIndex=(\d+)') or 0)

    json_api_content = _wanmen_get_json_api_content_by_courseID(courseID)

    if pIndex:  # only download ONE single part
        assert tIndex >= 0
        wanmen_download_by_course_topic_part(json_api_content,
                                             tIndex,
                                             pIndex,
                                             info_only=info_only,
                                             **kwargs)
    elif tIndex:  # download a topic
        wanmen_download_by_course_topic(json_api_content,
                                        tIndex,
                                        info_only=info_only,
                                        **kwargs)
    else:  # download the whole course
        wanmen_download_by_course(json_api_content,
                                  info_only=info_only,
                                  **kwargs)
Beispiel #6
0
 def get_room_id_from_url(self, match_id):
     """Resolve a game match id to the room id currently in use."""
     details = json.loads(get_content(self.game_ep + str(match_id)))
     if details['error'] != 0:
         log.wtf('Error happens when accessing game_details api')
     # Return the first anchor entry whose room is flagged as in use.
     for candidate in details['data']['anchor_data']:
         if candidate['is_use_room']:
             return candidate['room_id']
     log.wtf('No room available for match {}'.format(match_id))
Beispiel #7
0
def qq_download_by_vid(vid,
                       title,
                       output_dir='.',
                       merge=True,
                       info_only=False):
    """Download a QQ video by vid.

    Queries the getinfo API for the segment layout, then fetches a vkey
    for each segment via the getkey API and downloads all parts.
    """
    info_api = ('http://vv.video.qq.com/getinfo?otype=json&appver=3.2.19.333'
                '&platform=11&defnpayver=1&vid={}'.format(vid))
    info = get_content(info_api)
    # The API wraps its JSON in 'QZOutputJson=...;' — strip the trailing ';'.
    video_json = json.loads(match1(info, r'QZOutputJson=(.*)')[:-1])
    fn_pre = video_json['vl']['vi'][0]['lnk']
    title = video_json['vl']['vi'][0]['ti']
    host = video_json['vl']['vi'][0]['ul']['ui'][0]['url']
    streams = video_json['fl']['fi']
    seg_cnt = video_json['vl']['vi'][0]['cl']['fc']
    if seg_cnt == 0:
        seg_cnt = 1  # unsegmented videos report 0 but still have one file

    # best_quality = streams[-1]['name']
    part_format_id = streams[-1]['id']  # last entry is the best quality

    part_urls = []
    total_size = 0
    for part in range(1, seg_cnt + 1):
        filename = '{}.p{}.{}.mp4'.format(fn_pre, str(part_format_id % 10000),
                                          str(part))
        key_api = ('http://vv.video.qq.com/getkey?otype=json&platform=11&'
                   'format={}&vid={}&filename={}&appver=3.2.19.333'.format(
                       part_format_id, vid, filename))
        part_info = get_content(key_api)
        key_json = json.loads(match1(part_info, r'QZOutputJson=(.*)')[:-1])
        if key_json.get('key') is None:
            # No per-part key returned: fall back to the whole-file vkey
            # and the single un-segmented filename.
            vkey = video_json['vl']['vi'][0]['fvkey']
            url = '{}{}?vkey={}'.format(
                video_json['vl']['vi'][0]['ul']['ui'][0]['url'],
                fn_pre + '.mp4', vkey)
        else:
            vkey = key_json['key']
            url = '{}{}?vkey={}'.format(host, filename, vkey)
        if not vkey:
            # Fatal if even the first part has no key; otherwise keep
            # whatever parts were already collected and stop.
            if part == 1:
                log.wtf(key_json['msg'])
            else:
                log.w(key_json['msg'])
            break

        part_urls.append(url)
        _, ext, size = url_info(url)
        total_size += size

    print_info(site_info, title, ext, total_size)
    if not info_only:
        download_urls(part_urls,
                      title,
                      ext,
                      total_size,
                      output_dir=output_dir,
                      merge=merge)
Beispiel #8
0
def sina_download_by_vid(vid, title=None, info_only=False, **kwargs):
    """Downloads a Sina video by its unique vid.
    http://video.sina.com.cn/
    """
    urls, name, size = video_info(api_req(vid))
    if urls is None:
        log.wtf(name)  # on failure `name` carries the error message
    # The API-reported name overrides any caller-supplied title.
    title = name
    print_info(site_info, title, 'flv', size)
    if not info_only:
        download_urls(urls, title, 'flv', size, **kwargs)
Beispiel #9
0
 def get_vid_from_url(self, url):
     """Extracts video ID from live.qq.com.
     """
     # Plain room URL: the room id follows the host directly.
     direct = re.search(r'live.qq.com/(\d+)', url)
     if direct:
         return direct.group(1)
     # Match page URL: resolve the match id to a room id via the API.
     match_page = re.search(r'live.qq.com/directory/match/(\d+)', url)
     if match_page:
         return self.get_room_id_from_url(match_page.group(1))
     # Last resort: scrape the page for an embedded room_id.
     room_id = match1(get_content(url), r'room_id\":(\d+)')
     if room_id is None:
         log.wtf('Unknown page {}'.format(url))
     return room_id
Beispiel #10
0
def pixnet_download(url, info_only=False, **kwargs):
    """Download a video from a pixnet.net album page.

    Builds a customData JSON blob for api.pixnet.tv and prefers the
    main video URL, falling back to the backup URL if the main one is
    unreachable.
    """
    if not re.match(r'http://(\w)+.pixnet.net/album/video/(\d)+', url):
        log.wtf('[Failed] Unsupported URL pattern.')
        return
    # http://eric6513.pixnet.net/album/video/206644535
    html = get_content(url)
    # Take everything after the first '-' of og:description (remaining
    # dashes are dropped by the join).
    title = ''.join(match1(
        html, r'<meta property="og:description\" content="([^"]*)"'
    ).split('-')[1:]).strip()

    # NOTE(review): bare time() implies `from time import time` at the
    # top of the file — confirm the import style there.
    time_now = int(time())

    m = re.match(r'http://(\w+).pixnet.net/album/video/(\d+)', url)

    username = m.group(1)
    # eric6513
    _id = m.group(2)
    # 206644535

    data_dict = {
        'username': username, 'autoplay': 1, 'id': _id, 'loop': 0,
        'profile': 9, 'time': time_now,
    }
    # have to be like this
    # (the API expects double-quoted JSON, so str()'s single quotes are
    # swapped before percent-encoding)
    data_dict_str = quote(str(data_dict).replace("'", '"'), safe='"')
    url2 = 'http://api.pixnet.tv/content?type=json&customData={}'.format(
        data_dict_str
    )
    # &sig=edb07258e6a9ff40e375e11d30607983  can be blank for now
    # if required, can be obtained from url like
    # http://s.ext.pixnet.tv/user/eric6513/html5/autoplay/206644507.js
    # http://api.pixnet.tv/content?type=json&customData={%22username%22:%22eric6513%22,%22id%22:%22206644535%22,%22time%22:1441823350,%22autoplay%22:0,%22loop%22:0,%22profile%22:7}

    video_json = get_content(url2)
    content = json.loads(video_json)
    url_main = content['element']['video_url']
    url_backup = content['element']['backup_video_uri']

    try:
        # In some rare cases the main URL is IPv6 only...
        # Something like #611
        url_info(url_main)
        url = url_main
    except Exception:
        url = url_backup

    _type, ext, size = url_info(url)
    print_info(site_info, title, _type, size)
    if not info_only:
        download_urls([url], title, ext, size, **kwargs)
Beispiel #11
0
def funshion_download(url, **kwargs):
    """Download video(s) from fun.tv.

    A ``v-<vid>`` URL downloads a single video; a ``g-<epid>`` URL
    downloads every episode of the drama.
    """
    # IMPROVED: the original evaluated each pattern twice (re.match to
    # test, then re.search to extract); match once and reuse the result.
    single = re.match(r'https?://www.fun.tv/vplay/v-(\w+)', url)
    drama = re.match(r'https?://www.fun.tv/vplay/.*g-(\w+)', url)
    if single:
        Funshion().download_by_vid(single.group(1), single_video=True,
                                   **kwargs)
    elif drama:
        epid = drama.group(1)
        url = ('http://pm.funshion.com/v5/media/episode?id={}&cl=mweb&'
               'uc=111'.format(epid))
        meta = json.loads(get_content(url))
        drama_name = meta['name']

        extractor = Funshion()
        for ep in meta['episodes']:
            title = '{}_{}_{}'.format(drama_name, ep['num'], ep['name'])
            extractor.download_by_vid(ep['id'], title=title, **kwargs)
    else:
        log.wtf('Unknown url pattern')
Beispiel #12
0
    def vc_entry(self, **kwargs):
        """Populate self.streams for a Bilibili vc (short clip) URL."""
        vc_id = (match1(self.url, r'video/(\d+)')
                 or match1(self.url, r'vcdetail\?vc=(\d+)'))
        if not vc_id:
            log.wtf('Unknown url pattern')
        endpoint = (
            'https://api.vc.bilibili.com/clip/v1/video/detail?video_id={}'
            '&need_playurl=1'.format(vc_id))
        vc_meta = json.loads(get_content(endpoint, headers=FAKE_HEADERS))
        if vc_meta['code'] != 0:
            log.wtf('{}\n{}'.format(vc_meta['msg'], vc_meta['message']))
        item = vc_meta['data']['item']
        self.title = item['description']

        # Single mp4 stream, registered under the 'vc' id.
        self.streams['vc'] = {
            'src': [item['video_playurl']],
            'container': 'mp4',
            'size': int(item['video_size']),
        }
Beispiel #13
0
 def prepare(self, **kwargs):
     """Fetch video metadata and populate self.streams.

     Resolves the vid from the page if needed, queries the JSON
     endpoint, and registers one m3u8-backed stream per listed video,
     keyed by the matching stream type.
     """
     self.page = get_content(self.url)
     if self.vid is None:
         self.vid = self.get_vid_from_url()
     self.title = self.get_title()
     meta = json.loads(get_content(self.__class__.ep.format(self.vid)))
     if meta['code'] != 200:
         log.wtf(meta['message'])
     for video in meta['result']['videos']:
         height = video['height']
         url = self.__class__.cdn + video['key']
         stream_meta = dict(m3u8_url=url, size=0, container='mp4')
         video_profile = '{}x{}'.format(video['width'], video['height'])
         stream_meta['video_profile'] = video_profile
         for stream_type in self.__class__.stream_types:
             # Match on the height's tens bucket: e.g. 481..489 all map
             # onto the stream type whose id starts with "48x".
             if height // 10 == int(stream_type['id'][:-1]) // 10:
                 # width 481, 482... 489 are all 480p here
                 stream_id = stream_type['id']
                 self.streams[stream_id] = stream_meta
Beispiel #14
0
    def download_playlist_by_url(self, url, **kwargs):
        """Download every video of a YouTube playlist, in index order."""
        self.url = url

        playlist_id = self.__class__.get_playlist_id_from_url(self.url)
        if playlist_id is None:
            log.wtf('[Failed] Unsupported URL pattern.')

        video_page = get_content(
            'https://www.youtube.com/playlist?list={}'.format(playlist_id))
        from html.parser import HTMLParser
        videos = sorted([
            HTMLParser().unescape(video)
            for video in re.findall(r'<a href="(/watch\?[^"]+)"', video_page)
            if parse_query_param(video, 'index')
        ],
                        key=lambda video: parse_query_param(video, 'index'))

        # Parse browse_ajax page for more videos to load
        load_more_href = match1(video_page,
                                r'data-uix-load-more-href="([^"]+)"')
        while load_more_href:
            browse_ajax = get_content(
                'https://www.youtube.com/{}'.format(load_more_href))
            browse_data = json.loads(browse_ajax)
            load_more_widget_html = browse_data['load_more_widget_html']
            content_html = browse_data['content_html']
            vs = set(re.findall(r'href="(/watch\?[^"]+)"', content_html))
            # BUG FIX: this sorted() call lacked the `index` key used for
            # the first page, so continuation-page videos were appended
            # in arbitrary (set-iteration) order.
            videos += sorted(
                [
                    HTMLParser().unescape(video) for video in list(vs)
                    if parse_query_param(video, 'index')
                ],
                key=lambda video: parse_query_param(video, 'index'))
            load_more_href = match1(load_more_widget_html,
                                    r'data-uix-load-more-href="([^"]+)"')

        self.title = re.search(r'<meta name="title" content="([^"]+)"',
                               video_page).group(1)
        self.p_playlist()
        for video in videos:
            vid = parse_query_param(video, 'v')
            index = parse_query_param(video, 'index')
            self.__class__().download_by_url(
                self.__class__.get_url_from_vid(vid), index=index, **kwargs)
Beispiel #15
0
    def prepare(self, **kwargs):
        """Resolve metadata for a mobile URL and register the single
        default (m4a) stream."""
        if self.vid is None:
            hit = re.search(self.__class__.mobile_pt, self.url)
            self.vid = (hit.group(1), hit.group(2))

        meta = json.loads(
            get_content(self.__class__.ep.format(self.vid[0], self.vid[1])))

        if meta['code'] != 0:
            log.wtf(meta['message']['errormsg'])

        info = meta['data']
        self.title = info['name']
        # The duration doubles as the "profile" label, e.g. '123s'.
        self.streams['_default'] = {
            'src': [self.__class__.file_host.format(info['file_path'])],
            'video_profile': str(info['duration']) + 's',
            'container': 'm4a'
        }
Beispiel #16
0
    def prepare(self, vid='', title=None, **kwargs):
        """Query the playinfo XML API for `vid` and populate streams.

        Title fallback chain: caller-supplied title -> joined video
        mark descriptions -> the vid itself.
        """
        assert vid

        api_url = self.API_ENDPOINT + \
            'servlet/playinfo?vid={vid}&m=0'.format(vid=vid)  # return XML

        html = get_content(api_url)
        self.tree = ET.ElementTree(ET.fromstring(html))

        if self.tree.find('result').text != '1':
            log.wtf('API result says failed!')
            # NOTE(review): bare `raise` with no active exception would
            # itself error; presumably log.wtf() never returns.
            raise

        # BUG FIX: the original unconditionally overwrote the title
        # derived from the markdesc nodes with `vid` whenever no
        # explicit title was passed.  Use a proper fallback chain.
        if title:
            self.title = title
        else:
            self.title = '_'.join([
                i.text for i in self.tree.iterfind(
                    'video/videomarks/videomark/markdesc')
            ]) or vid

        for i in self.tree.iterfind('video/quality'):
            quality = i.attrib['value']
            url = i[0].attrib['playurl']
            self.stream_types.append({
                'id': quality,
                'video_profile': i.attrib['desp']
            })
            self.streams[quality] = {
                'url': url,
                'video_profile': i.attrib['desp']
            }

        # Hoisted out of the loop: only the final ordering matters.
        self.streams_sorted = [
            dict([('id', stream_type['id'])] +
                 list(self.streams[stream_type['id']].items()))
            for stream_type in self.__class__.stream_types
            if stream_type['id'] in self.streams
        ]
Beispiel #17
0
def sina_zxt(url, info_only=False, **kwargs):
    """Download the largest variant of a Sina zxt video, which is
    addressed by the URL fragment."""
    ep = 'http://s.video.sina.com.cn/video/play?video_id='
    frag = urllib.parse.urlparse(url).fragment
    if not frag:
        log.wtf('No video specified with fragment')
    meta = json.loads(get_content(ep + frag))
    if meta['code'] != 1:
        # Yes they use 1 for success.
        log.wtf(meta['message'])
    title = meta['data']['title']
    videos = sorted(meta['data']['videos'], key=lambda i: int(i['size']))
    if not videos:
        log.wtf('No video file returned by API server')

    # Largest file (last after the ascending sort) wins.
    best = videos[-1]
    vid = best['file_id']
    container = best['type']
    size = int(best['size'])
    if container == 'hlv':
        container = 'flv'

    urls, _, _ = video_info(api_req(vid))
    print_info(site_info, title, container, size)
    if not info_only:
        download_urls(urls, title, container, size, **kwargs)
Beispiel #18
0
 def prepare(self, **kwargs):
     """Scrape the NUXT state blob from the live page and register each
     advertised stream that matches a known bitrate profile."""
     page = get_content(self.url)
     blob = re.search(r'window\.__NUXT__=({.+?});', page)
     if blob is None:
         log.wtf('cannot find server_data')
     state = json.loads(blob.group(1))['state']
     live_info = state['live-info']['liveInfo']
     self.title = '{}_{}'.format(
         state['anchor-info']['anchorInfo']['nickName'],
         live_info['videoInfo']['title']
     )
     for advertised in live_info['videoInfo']['streamInfos']:
         for stype in self.__class__.stream_types:
             if stype['video_profile'] != advertised['bitrate']:
                 continue
             # Live stream: size is unbounded.
             self.streams[stype['id']] = {
                 'src': [unescape(advertised['playUrl'])],
                 'video_profile': advertised['desc'],
                 'container': stype['container'],
                 'size': float('inf'),
             }
Beispiel #19
0
    def download(self, **kwargs):
        """Override the base downloader.

        Keeps the upstream stream-selection and info-printing logic, but
        hands the actual transfer to download_url_ffmpeg and then saves
        caption tracks as .srt files.
        """
        if 'json_output' in kwargs and kwargs['json_output']:
            json_output.output(self)
        elif 'info_only' in kwargs and kwargs['info_only']:
            if 'stream_id' in kwargs and kwargs['stream_id']:
                # Display the stream
                stream_id = kwargs['stream_id']
                if 'index' not in kwargs:
                    self.p(stream_id)
                else:
                    self.p_i(stream_id)
            else:
                # Display all available streams
                if 'index' not in kwargs:
                    self.p([])
                else:
                    # Best stream first; some extractors key streams by
                    # 'id', others by 'itag'.
                    stream_id = self.streams_sorted[0]['id'] \
                        if 'id' in self.streams_sorted[0] \
                        else self.streams_sorted[0]['itag']
                    self.p_i(stream_id)

        else:
            if 'stream_id' in kwargs and kwargs['stream_id']:
                # Download the stream
                stream_id = kwargs['stream_id']
            else:
                # Download stream with the best quality
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']

            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)

            # Regular and DASH streams live in separate dicts.
            if stream_id in self.streams:
                urls = self.streams[stream_id]['src']
                # ext = self.streams[stream_id]['container']
                # total_size = self.streams[stream_id]['size']
            else:
                urls = self.dash_streams[stream_id]['src']
                # ext = self.dash_streams[stream_id]['container']
                # total_size = self.dash_streams[stream_id]['size']

            if not urls:
                log.wtf('[Failed] Cannot extract video source.')

            # Here's the change!
            download_url_ffmpeg(urls[0],
                                self.title,
                                'mp4',
                                output_dir=kwargs['output_dir'],
                                merge=kwargs['merge'],
                                stream=False)

            if not kwargs['caption']:
                print('Skipping captions.')
                return
            # Write one .srt file per available caption language.
            for lang in self.caption_tracks:
                filename = '{}.{}.srt'.format(get_filename(self.title), lang)
                print('Saving {} ... '.format(filename), end='', flush=True)
                srt = self.caption_tracks[lang]
                with open(os.path.join(kwargs['output_dir'], filename),
                          'w',
                          encoding='utf-8') as x:
                    x.write(srt)
                print('Done.')
Beispiel #20
0
    def prepare(self, **kwargs):
        assert self.url or self.vid

        if not self.vid and self.url:
            self.vid = self.__class__.get_vid_from_url(self.url)

            if self.vid is None:
                self.download_playlist_by_url(self.url, **kwargs)
                exit(0)

        video_info = parse.parse_qs(
            get_content(
                'https://www.youtube.com/get_video_info?video_id={}'.format(
                    self.vid)))

        ytplayer_config = None
        if 'status' not in video_info:
            log.wtf('[Failed] Unknown status.')
        elif video_info['status'] == ['ok']:
            if 'use_cipher_signature' not in video_info \
                    or video_info['use_cipher_signature'] == ['False']:
                self.title = parse.unquote_plus(video_info['title'][0])

                # Parse video page (for DASH)
                video_page = get_content(
                    'https://www.youtube.com/watch?v={}'.format(self.vid))
                try:
                    ytplayer_config = json.loads(
                        re.search('ytplayer.config\s*=\s*([^\n]+?});',
                                  video_page).group(1))
                    self.html5player = 'https://www.youtube.com{}'.format(
                        ytplayer_config['assets']['js'])
                    # Workaround: get_video_info returns bad s. Why?
                    stream_list = ytplayer_config['args'][
                        'url_encoded_fmt_stream_map'].split(',')
                except Exception:
                    stream_list = video_info['url_encoded_fmt_stream_map'][
                        0].split(',')
                    self.html5player = None

            else:
                # Parse video page instead
                video_page = get_content(
                    'https://www.youtube.com/watch?v={}'.format(self.vid))
                ytplayer_config = json.loads(
                    re.search('ytplayer.config\s*=\s*([^\n]+?});',
                              video_page).group(1))

                self.title = ytplayer_config['args']['title']
                self.html5player = 'https://www.youtube.com{}'.format(
                    ytplayer_config['assets']['js'])
                stream_list = ytplayer_config['args'][
                    'url_encoded_fmt_stream_map'].split(',')

        elif video_info['status'] == ['fail']:
            if video_info['errorcode'] == ['150']:
                video_page = get_content(
                    'https://www.youtube.com/watch?v={}'.format(self.vid))
                try:
                    ytplayer_config = json.loads(
                        re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer',
                                  video_page).group(1))
                except Exception:
                    msg = re.search('class="message">([^<]+)<',
                                    video_page).group(1)
                    log.wtf('[Failed] "%s"' % msg.strip())

                if 'title' in ytplayer_config['args']:
                    # 150 Restricted from playback on certain sites
                    # Parse video page instead
                    self.title = ytplayer_config['args']['title']
                    self.html5player = 'https://www.youtube.com{}'.format(
                        ytplayer_config['assets']['js'])
                    stream_list = ytplayer_config['args'][
                        'url_encoded_fmt_stream_map'].split(',')
                else:
                    log.wtf('[Error] The uploader has not made this video '
                            'available in your country.')

            elif video_info['errorcode'] == ['100']:
                log.wtf('[Failed] This video does not exist.',
                        exit_code=int(video_info['errorcode'][0]))

            else:
                log.wtf('[Failed] %s' % video_info['reason'][0],
                        exit_code=int(video_info['errorcode'][0]))

        else:
            log.wtf('[Failed] Invalid status.')

        # YouTube Live
        if ytplayer_config and (
                ytplayer_config['args'].get('livestream') == '1'
                or ytplayer_config['args'].get('live_playback') == '1'):
            hlsvp = ytplayer_config['args']['hlsvp']

            if 'info_only' in kwargs and kwargs['info_only']:
                return
            else:
                download_url_ffmpeg(hlsvp, self.title, 'mp4')
                exit(0)

        for stream in stream_list:
            metadata = parse.parse_qs(stream)
            stream_itag = metadata['itag'][0]
            self.streams[stream_itag] = {
                'itag': metadata['itag'][0],
                'url': metadata['url'][0],
                'sig': metadata['sig'][0] if 'sig' in metadata else None,
                's': metadata['s'][0] if 's' in metadata else None,
                'quality': metadata['quality'][0],
                'type': metadata['type'][0],
                'mime': metadata['type'][0].split(';')[0],
                'container':
                mime_to_container(metadata['type'][0].split(';')[0]),
            }

        # Prepare caption tracks
        try:
            caption_tracks = json.loads(
                ytplayer_config['args']['player_response']
            )['captions']['playerCaptionsTracklistRenderer']['captionTracks']
            for ct in caption_tracks:
                ttsurl, lang = ct['baseUrl'], ct['languageCode']

                tts_xml = parseString(get_content(ttsurl))
                transcript = tts_xml.getElementsByTagName('transcript')[0]
                texts = transcript.getElementsByTagName('text')
                srt = ''
                seq = 0
                for text in texts:
                    if text.firstChild is None:
                        continue  # empty element
                    seq += 1
                    start = float(text.getAttribute('start'))
                    if text.getAttribute('dur'):
                        dur = float(text.getAttribute('dur'))
                    else:
                        dur = 1.0  # could be ill-formed XML
                    finish = start + dur
                    m, s = divmod(start, 60)
                    h, m = divmod(m, 60)
                    start = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m),
                                                            s).replace(
                                                                '.', ',')
                    m, s = divmod(finish, 60)
                    h, m = divmod(m, 60)
                    finish = '{:0>2}:{:0>2}:{:06.3f}'.format(
                        int(h), int(m), s).replace('.', ',')
                    content = unescape(text.firstChild.nodeValue)

                    srt += '{}\n'.format(str(seq))
                    srt += '{} --> {}\n'.format(start, finish)
                    srt += '{}\n\n'.format(content)

                self.caption_tracks[lang] = srt
        except Exception:
            pass

        # Prepare DASH streams
        try:
            dashmpd = ytplayer_config['args']['dashmpd']
            dash_xml = parseString(get_content(dashmpd))
            for aset in dash_xml.getElementsByTagName('AdaptationSet'):
                mimeType = aset.getAttribute('mimeType')
                if mimeType == 'audio/mp4':
                    rep = aset.getElementsByTagName('Representation')[-1]
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_mp4_a_url = burls[0].firstChild.nodeValue
                    dash_mp4_a_size = burls[0].getAttribute('yt:contentLength')
                    if not dash_mp4_a_size:
                        try:
                            dash_mp4_a_size = url_size(dash_mp4_a_url)
                        except Exception:
                            continue
                elif mimeType == 'audio/webm':
                    rep = aset.getElementsByTagName('Representation')[-1]
                    burls = rep.getElementsByTagName('BaseURL')
                    dash_webm_a_url = burls[0].firstChild.nodeValue
                    dash_webm_a_size = burls[0].getAttribute(
                        'yt:contentLength')
                    if not dash_webm_a_size:
                        try:
                            dash_webm_a_size = url_size(dash_webm_a_url)
                        except Exception:
                            continue
                elif mimeType == 'video/mp4':
                    for rep in aset.getElementsByTagName('Representation'):
                        w = int(rep.getAttribute('width'))
                        h = int(rep.getAttribute('height'))
                        itag = rep.getAttribute('id')
                        burls = rep.getElementsByTagName('BaseURL')
                        dash_url = burls[0].firstChild.nodeValue
                        dash_size = burls[0].getAttribute('yt:contentLength')
                        if not dash_size:
                            try:
                                dash_size = url_size(dash_url)
                            except Exception:
                                continue
                        self.dash_streams[itag] = {
                            'quality': '{}x{}'.format(w, h),
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'mp4',
                            'src': [dash_url, dash_mp4_a_url],
                            'size': int(dash_size) + int(dash_mp4_a_size)
                        }
                elif mimeType == 'video/webm':
                    for rep in aset.getElementsByTagName('Representation'):
                        w = int(rep.getAttribute('width'))
                        h = int(rep.getAttribute('height'))
                        itag = rep.getAttribute('id')
                        burls = rep.getElementsByTagName('BaseURL')
                        dash_url = burls[0].firstChild.nodeValue
                        dash_size = burls[0].getAttribute('yt:contentLength')
                        if not dash_size:
                            try:
                                dash_size = url_size(dash_url)
                            except Exception:
                                continue
                        self.dash_streams[itag] = {
                            'quality': '%sx%s' % (w, h),
                            'itag': itag,
                            'type': mimeType,
                            'mime': mimeType,
                            'container': 'webm',
                            'src': [dash_url, dash_webm_a_url],
                            'size': int(dash_size) + int(dash_webm_a_size)
                        }
        except Exception:
            # VEVO
            if not self.html5player:
                return
            self.js = get_content(self.html5player)
            if 'adaptive_fmts' in ytplayer_config['args']:
                streams = [
                    dict([(i.split('=')[0], parse.unquote(i.split('=')[1]))
                          for i in afmt.split('&')]) for afmt in
                    ytplayer_config['args']['adaptive_fmts'].split(',')
                ]
                for stream in streams:  # get over speed limiting
                    stream['url'] += '&ratebypass=yes'
                for stream in streams:  # audio
                    if stream['type'].startswith('audio/mp4'):
                        dash_mp4_a_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(self.js, stream['s'])
                            dash_mp4_a_url += '&signature={}'.format(sig)
                        dash_mp4_a_size = stream['clen']
                    elif stream['type'].startswith('audio/webm'):
                        dash_webm_a_url = stream['url']
                        if 's' in stream:
                            sig = self.__class__.decipher(self.js, stream['s'])
                            dash_webm_a_url += '&signature={}'.format(sig)
                        dash_webm_a_size = stream['clen']
                for stream in streams:  # video
                    if 'size' in stream:
                        if stream['type'].startswith('video/mp4'):
                            mimeType = 'video/mp4'
                            dash_url = stream['url']
                            if 's' in stream:
                                sig = self.__class__.decipher(
                                    self.js, stream['s'])
                                dash_url += '&signature={}'.format(sig)
                            dash_size = stream['clen']
                            itag = stream['itag']
                            self.dash_streams[itag] = {
                                'quality': stream['size'],
                                'itag': itag,
                                'type': mimeType,
                                'mime': mimeType,
                                'container': 'mp4',
                                'src': [dash_url, dash_mp4_a_url],
                                'size': int(dash_size) + int(dash_mp4_a_size)
                            }
                        elif stream['type'].startswith('video/webm'):
                            mimeType = 'video/webm'
                            dash_url = stream['url']
                            if 's' in stream:
                                sig = self.__class__.decipher(
                                    self.js, stream['s'])
                                dash_url += '&signature={}'.format(sig)
                            dash_size = stream['clen']
                            itag = stream['itag']
                            self.dash_streams[itag] = {
                                'quality': stream['size'],
                                'itag': itag,
                                'type': mimeType,
                                'mime': mimeType,
                                'container': 'webm',
                                'src': [dash_url, dash_webm_a_url],
                                'size': int(dash_size) + int(dash_webm_a_size)
                            }
Beispiel #21
0
    def download(self, **kwargs):
        """Print stream info or download the selected stream.

        Depending on kwargs this either emits JSON (``json_output``),
        prints stream info only (``info_only``), or downloads the stream
        chosen by ``stream_id`` (falling back to the best available),
        then optionally saves caption tracks and danmuku.

        Recognized kwargs: json_output, info_only, stream_id, index,
        output_dir, merge, caption, keep_obj.
        """
        if 'json_output' in kwargs and kwargs['json_output']:
            json_output.output(self)
        elif 'info_only' in kwargs and kwargs['info_only']:
            if 'stream_id' in kwargs and kwargs['stream_id']:
                # Display the stream
                stream_id = kwargs['stream_id']
                if 'index' not in kwargs:
                    self.p(stream_id)
                else:
                    self.p_i(stream_id)
            else:
                # Display all available streams
                if 'index' not in kwargs:
                    self.p([])
                else:
                    stream_id = self.streams_sorted[0]['id'] \
                        if 'id' in self.streams_sorted[0] \
                        else self.streams_sorted[0]['itag']
                    self.p_i(stream_id)

        else:
            if 'stream_id' in kwargs and kwargs['stream_id']:
                # Download the stream
                stream_id = kwargs['stream_id']
            else:
                # Download stream with the best quality
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']

            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)

            # Regular and DASH streams are kept in separate maps; look
            # the id up in whichever one holds it.
            if stream_id in self.streams:
                urls = self.streams[stream_id]['src']
                ext = self.streams[stream_id]['container']
                total_size = self.streams[stream_id]['size']
            else:
                urls = self.dash_streams[stream_id]['src']
                ext = self.dash_streams[stream_id]['container']
                total_size = self.dash_streams[stream_id]['size']

            if not urls:
                log.wtf('[Failed] Cannot extract video source.')

            if ext == 'm3u8':
                ffmpeg_kwargs = {}
                if 'iqiyi' in self.name:
                    # ffmpeg_kwargs['override'] = True
                    # ffmpeg_kwargs['params'] = {
                    #     '-c:a': 'copy', '-bsf:a': 'aac_adtstoasc'
                    # }
                    m3u8_urls = general_m3u8_extractor(urls[0])
                    # FIXME(iawia002): computing the total size would
                    # cost too much time for long segment lists
                    if len(m3u8_urls) <= 100:
                        size = urls_size(m3u8_urls)
                    else:
                        size = float('inf')
                    download_urls(m3u8_urls, self.title, 'mp4', size, **kwargs)
                else:
                    # Hand the playlist URL to ffmpeg for muxing.
                    download_url_ffmpeg(urls[0],
                                        self.title,
                                        'mp4',
                                        output_dir=kwargs['output_dir'],
                                        merge=kwargs['merge'],
                                        stream=False,
                                        **ffmpeg_kwargs)
            else:
                # Plain HTTP download; forward the extractor's own
                # User-Agent/Referer when it set them.
                headers = copy(config.FAKE_HEADERS)
                if self.ua is not None:
                    headers['User-Agent'] = self.ua
                if self.referer is not None:
                    headers['Referer'] = self.referer
                download_urls(urls,
                              self.title,
                              ext,
                              total_size,
                              headers=headers,
                              output_dir=kwargs['output_dir'],
                              merge=kwargs['merge'],
                              av=stream_id in self.dash_streams)
            if 'caption' not in kwargs or not kwargs['caption']:
                print('Skipping captions or danmuku.')
                return
            # Save each caption track as an .srt next to the video.
            for lang in self.caption_tracks:
                filename = '%s.%s.srt' % (get_filename(self.title), lang)
                print('Saving %s ... ' % filename, end="", flush=True)
                srt = self.caption_tracks[lang]
                with open(os.path.join(kwargs['output_dir'], filename),
                          'w',
                          encoding='utf-8') as x:
                    x.write(srt)
                print('Done.')
            # Danmuku (comment overlay) is written out as raw XML.
            if self.danmuku is not None and not dry_run:
                filename = '{}.cmt.xml'.format(get_filename(self.title))
                print('Downloading {} ...\n'.format(filename))
                with open(os.path.join(kwargs['output_dir'], filename),
                          'w',
                          encoding='utf8') as fp:
                    fp.write(self.danmuku)

        # Reset the extractor state unless the caller wants to keep it
        # (e.g. playlist walkers that read video_next afterwards).
        keep_obj = kwargs.get('keep_obj', False)
        if not keep_obj:
            self.__init__()
Beispiel #22
0
 def get_vid_from_url(self):
     """Pull the stream id out of the cached page HTML.

     Matches the class-level ``vid_patt`` against ``self.page`` and
     returns the first capture group; aborts via ``log.wtf`` when the
     pattern does not match.
     """
     matched = re.search(self.__class__.vid_patt, self.page)
     if not matched:
         log.wtf('Cannot get stream_id')
     return matched.group(1)
Beispiel #23
0
    def prepare(self, **kwargs):
        """Resolve the video id, call the Youku UPS API and build
        ``self.streams``.

        Retries the API call once when the vid was mis-parsed from the
        page (error -6001) or the password was wrong (error -2002);
        aborts via ``log.wtf`` on any remaining API error.

        Recognized kwargs: password.
        """
        assert self.url or self.vid

        if self.url and not self.vid:
            self.get_vid_from_url()

            if self.vid is None:
                self.get_vid_from_page()

                if self.vid is None:
                    log.wtf('Cannot fetch vid')

        if kwargs.get('password') and kwargs['password']:
            self.password_protected = True
            self.password = kwargs['password']

        self.utid = fetch_cna()
        time.sleep(3)
        self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_code == -6001:  # wrong vid parsed from the page
                vid_from_url = self.vid
                self.get_vid_from_page()
                if vid_from_url == self.vid:
                    log.wtf(self.api_error_msg)
                self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_code == -2002:  # wrong password
                self.password_protected = True
                # it can be True already (from cli);
                # offer another chance to retry
                # NOTE(review): this prompt/retry block was garbled in
                # the source; reconstructed from the surrounding retry
                # pattern -- confirm the exact prompt arguments upstream.
                self.password = input(log.sprint('Password: '))
                self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_msg:
                log.wtf(self.api_error_msg)
            else:
                log.wtf('Unknown error')

        self.title = self.api_data['video']['title']
        stream_types = dict([(i['id'], i) for i in self.stream_types])
        # Only keep streams in the audio language of the first stream.
        audio_lang = self.api_data['stream'][0]['audio_lang']

        for stream in self.api_data['stream']:
            stream_id = stream['stream_type']
            is_preview = False
            if stream_id in stream_types \
                    and stream['audio_lang'] == audio_lang:
                if 'alias-of' in stream_types[stream_id]:
                    stream_id = stream_types[stream_id]['alias-of']

                if stream_id not in self.streams:
                    self.streams[stream_id] = {
                        'container': stream_types[stream_id]['container'],
                        'video_profile':
                        stream_types[stream_id]['video_profile'],
                        'size': stream['size'],
                        'pieces': [{
                            'segs': stream['segs']
                        }],
                        'm3u8_url': stream['m3u8_url']
                    }
                    src = []
                    for seg in stream['segs']:
                        if seg.get('cdn_url'):
                            src.append(
                                self.__class__.change_cdn(seg['cdn_url']))
                        else:
                            # a segment without cdn_url means only a
                            # preview is served for this stream
                            is_preview = True
                    self.streams[stream_id]['src'] = src
                else:
                    # another piece of an already-seen stream type:
                    # accumulate size, pieces and segment URLs
                    self.streams[stream_id]['size'] += stream['size']
                    self.streams[stream_id]['pieces'].append(
                        {'segs': stream['segs']})
                    src = []
                    for seg in stream['segs']:
                        if seg.get('cdn_url'):
                            src.append(
                                self.__class__.change_cdn(seg['cdn_url']))
                        else:
                            is_preview = True
                    self.streams[stream_id]['src'].extend(src)
            if is_preview:
                log.w('{} is a preview'.format(stream_id))

        # Audio languages
        if 'dvd' in self.api_data:
            al = self.api_data['dvd'].get('audiolang')
            if al:
                self.audiolang = al
                for i in self.audiolang:
                    i['url'] = 'http://v.youku.com/v_show/id_{}'.format(
                        i['vid'])
Beispiel #24
0
def youku_download_playlist_by_url(url, **kwargs):
    """Download every episode of a Youku playlist.

    Three URL shapes are supported:

    * a single video page  -- walk the show via ``video_next`` links;
    * an official show list -- resolve one episode page and recurse;
    * a UGC album list      -- page through the album's XHR endpoint.
    """
    # Raw strings: the originals carried invalid escapes ('\d', '\/')
    # that raise DeprecationWarning on modern Python.
    video_page_pt = r'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)'
    # capture the JSON payload inside a JSONP callback wrapper
    js_cb_pt = r'\(({.+})\)'
    if re.match(video_page_pt, url):
        youku_obj = Youku()
        youku_obj.url = url
        youku_obj.prepare(**kwargs)
        total_episode = None
        try:
            total_episode = youku_obj.api_data['show']['episode_total']
        except KeyError:
            log.wtf('Cannot get total_episode for {}'.format(url))
        # walk the show: each download exposes the next episode's vid
        next_vid = youku_obj.vid
        for _ in range(total_episode):
            this_extractor = Youku()
            this_extractor.download_by_vid(next_vid, keep_obj=True, **kwargs)
            next_vid = this_extractor.video_next['encodevid']

    elif re.match(r'https?://list.youku.com/show/id_', url):
        # http://list.youku.com/show/id_z2ae8ee1c837b11e18195.html
        # official playlist: resolve any episode URL and recurse
        page = get_content(url)
        show_id = re.search(r'showid:"(\d+)"', page).group(1)
        ep = ('http://list.youku.com/show/module?id={}&tab=showInfo&'
              'callback=jQuery'.format(show_id))
        # Un-escape the JSONP response before matching URLs. The
        # original second replace used '\"' (just '"'), a no-op; the
        # intent was clearly to strip escaped quotes.
        xhr_page = get_content(ep).replace('\\/', '/').replace('\\"', '"')
        video_url = re.search(
            r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)',
            xhr_page).group(1)
        youku_download_playlist_by_url('http://' + video_url, **kwargs)
        return
    elif re.match(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
                  url):
        # http://list.youku.com/albumlist/show/id_2336634.html
        # UGC playlist: page through the album XHR endpoint
        list_id = re.search(
            r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
            url).group(1)
        ep = ('http://list.youku.com/albumlist/items?id={}&page={}&size=20&'
              'ascending=1&callback=tuijsonp6')

        first_u = ep.format(list_id, 1)
        xhr_page = get_content(first_u)
        json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1))
        video_cnt = json_data['data']['total']
        xhr_html = json_data['html']
        v_urls = re.findall(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)',
                            xhr_html)

        if video_cnt > 20:
            # fetch the remaining pages (20 entries per page)
            req_cnt = video_cnt // 20
            for i in range(2, req_cnt + 2):
                req_u = ep.format(list_id, i)
                xhr_page = get_content(req_u)
                json_data = json.loads(
                    re.search(js_cb_pt, xhr_page).group(1).replace('\\/', '/'))
                xhr_html = json_data['html']
                page_videos = re.findall(
                    r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)',
                    xhr_html)
                v_urls.extend(page_videos)
        # every URL appears twice in the markup; take every other one
        for u in v_urls[0::2]:
            url = 'http://' + u
            Youku().download_by_url(url, **kwargs)
        return
Beispiel #25
0
    def download(self, **kwargs):
        """Print stream info or download the selected stream.

        Depending on kwargs this either emits JSON (``json_output``),
        prints stream info only (``info_only``), or downloads the
        stream chosen by ``stream_id`` (falling back to the best
        available), then optionally saves caption tracks and danmuku.
        For m3u8 sources the output container is rewritten to mp4.

        Recognized kwargs: json_output, info_only, stream_id, index,
        output_dir, merge, caption, keep_obj.
        """
        if 'json_output' in kwargs and kwargs['json_output']:
            json_output.output(self)
        elif 'info_only' in kwargs and kwargs['info_only']:
            if 'stream_id' in kwargs and kwargs['stream_id']:
                # Display the stream
                stream_id = kwargs['stream_id']
                if 'index' not in kwargs:
                    self.p(stream_id)
                else:
                    self.p_i(stream_id)
            else:
                # Display all available streams
                if 'index' not in kwargs:
                    self.p([])
                else:
                    stream_id = self.streams_sorted[0]['id'] \
                        if 'id' in self.streams_sorted[0] \
                        else self.streams_sorted[0]['itag']
                    self.p_i(stream_id)

        else:
            if 'stream_id' in kwargs and kwargs['stream_id']:
                # Download the stream
                stream_id = kwargs['stream_id']
            else:
                # Download stream with the best quality
                stream_id = self.streams_sorted[0]['id'] \
                    if 'id' in self.streams_sorted[0] \
                    else self.streams_sorted[0]['itag']

            if 'index' not in kwargs:
                self.p(stream_id)
            else:
                self.p_i(stream_id)

            # Regular and DASH streams are kept in separate maps; look
            # the id up in whichever one holds it.
            if stream_id in self.streams:
                urls = self.streams[stream_id]['src']
                ext = self.streams[stream_id]['container']
                total_size = self.streams[stream_id]['size']
            else:
                urls = self.dash_streams[stream_id]['src']
                ext = self.dash_streams[stream_id]['container']
                total_size = self.dash_streams[stream_id]['size']

            # m3u8 segments are downloaded and merged into an mp4 file.
            if ext == 'm3u8':
                ext = 'mp4'

            if not urls:
                log.wtf('[Failed] Cannot extract video source.')
            # For legacy main()
            # Forward the extractor's own User-Agent/Referer when set.
            headers = copy(config.FAKE_HEADERS)
            if self.ua is not None:
                headers['User-Agent'] = self.ua
            if self.referer is not None:
                headers['Referer'] = self.referer
            download_urls(
                urls,
                self.title,
                ext,
                total_size,
                headers=headers,
                output_dir=kwargs['output_dir'],
                merge=kwargs['merge'],
                av=stream_id in self.dash_streams
            )
            if 'caption' not in kwargs or not kwargs['caption']:
                print('Skipping captions or danmuku.')
                return
            # Save each caption track as an .srt next to the video.
            for lang in self.caption_tracks:
                filename = '%s.%s.srt' % (get_filename(self.title), lang)
                print('Saving %s ... ' % filename, end="", flush=True)
                srt = self.caption_tracks[lang]
                with open(
                    os.path.join(kwargs['output_dir'], filename),
                    'w',
                    encoding='utf-8'
                ) as x:
                    x.write(srt)
                print('Done.')
            # Danmuku (comment overlay) is written out as raw XML.
            if self.danmuku is not None and not dry_run:
                filename = '{}.cmt.xml'.format(get_filename(self.title))
                print('Downloading {} ...\n'.format(filename))
                with open(
                    os.path.join(kwargs['output_dir'], filename),
                    'w',
                    encoding='utf8'
                ) as fp:
                    fp.write(self.danmuku)

            # For main_dev()
            # download_urls(
            #     urls, self.title, self.streams[stream_id]['container'],
            #     self.streams[stream_id]['size']
            # )
        # Reset the extractor state unless the caller wants to keep it
        # (e.g. playlist walkers that read video_next afterwards).
        keep_obj = kwargs.get('keep_obj', False)
        if not keep_obj:
            self.__init__()