def get_vid_from_url(url):
    """Return the video ID embedded in ``url``.

    The URL is checked against the known path layouts first
    (``youtu.be/<id>``, ``youtube.com/embed/<id>``, ``youtube.com/v/<id>``,
    ``youtube.com/watch/<id>``), then the ``v`` query parameter, and finally
    the ``v`` query parameter of the URL carried in the ``u`` query parameter.
    The first truthy result wins; if nothing matches, the result of the last
    lookup is returned as-is.
    """
    path_patterns = (
        r'youtu\.be/([^?/]+)',
        r'youtube\.com/embed/([^/?]+)',
        r'youtube\.com/v/([^/?]+)',
        r'youtube\.com/watch/([^/?]+)',
    )
    for pattern in path_patterns:
        vid = match1(url, pattern)
        if vid:
            return vid

    # Plain "?v=<id>" style URL.
    vid = parse_query_param(url, 'v')
    if vid:
        return vid

    # Last resort: a wrapped URL passed in "u" that itself carries "v".
    wrapped_url = parse_query_param(url, 'u')
    return parse_query_param(wrapped_url, 'v')
def download_playlist_by_url(self, url, **kwargs):
    """Download every indexed video of the playlist referenced by ``url``.

    The playlist page is fetched, all ``/watch?...`` links carrying an
    ``index`` query parameter are collected (following the "load more"
    AJAX pages while the page advertises one), and each video is then
    downloaded through a fresh instance of this class.

    Args:
        url: Playlist URL; stored on ``self.url``.
        **kwargs: Forwarded unchanged to ``download_by_url`` for each video.

    Side effects:
        Sets ``self.url`` and ``self.title`` and calls ``self.p_playlist()``.

    Raises:
        AttributeError: if the playlist page has no ``<meta name="title">``
            tag (the title lookup is not guarded).
    """
    # html.unescape replaces HTMLParser().unescape, which was removed in
    # Python 3.9.
    from html import unescape

    def _index_key(video):
        """Sort key: numeric playlist position, URL as a tie-breaker."""
        raw_index = parse_query_param(video, 'index')
        try:
            return (0, int(raw_index), video)
        except (TypeError, ValueError):
            # Non-numeric indices are not expected; keep them sortable and
            # place them after the numeric ones instead of crashing.
            return (1, 0, video)

    def _indexed_videos(hrefs):
        """Unescape hrefs, keep those with an ``index`` param, sort by it."""
        # Unescape *before* parsing the query string: the hrefs come out of
        # HTML attributes, so parameters are separated by "&amp;" and the
        # "index" parameter is not reliably visible until that is decoded.
        unescaped = (unescape(href) for href in hrefs)
        return sorted(
            (video for video in unescaped
             if parse_query_param(video, 'index')),
            key=_index_key)

    self.url = url
    playlist_id = self.__class__.get_playlist_id_from_url(self.url)
    if playlist_id is None:
        log.wtf('[Failed] Unsupported URL pattern.')
        # NOTE(review): log.wtf presumably terminates the program; the
        # explicit return only guards against requesting "list=None" if
        # it ever returns.
        return

    video_page = get_content(
        'https://www.youtube.com/playlist?list={}'.format(playlist_id))
    videos = _indexed_videos(
        re.findall(r'<a href="(/watch\?[^"]+)"', video_page))

    # Parse browse_ajax page for more videos to load
    load_more_href = match1(video_page,
                            r'data-uix-load-more-href="([^"]+)"')
    while load_more_href:
        browse_ajax = get_content(
            'https://www.youtube.com/{}'.format(load_more_href))
        browse_data = json.loads(browse_ajax)
        load_more_widget_html = browse_data['load_more_widget_html']
        content_html = browse_data['content_html']
        # De-duplicate links within one AJAX page before ordering them.
        page_hrefs = set(re.findall(r'href="(/watch\?[^"]+)"', content_html))
        videos += _indexed_videos(page_hrefs)
        load_more_href = match1(load_more_widget_html,
                                r'data-uix-load-more-href="([^"]+)"')

    self.title = re.search(r'<meta name="title" content="([^"]+)"',
                           video_page).group(1)
    self.p_playlist()
    for video in videos:
        vid = parse_query_param(video, 'v')
        index = parse_query_param(video, 'index')
        # A fresh extractor instance per video keeps per-video state
        # separate from this playlist-level instance.
        self.__class__().download_by_url(
            self.__class__.get_url_from_vid(vid), index=index, **kwargs)
def get_playlist_id_from_url(url):
    """Return the playlist ID carried in the query string of ``url``.

    The ``list`` query parameter is preferred; ``p`` is used as a fallback.
    If neither yields a truthy value, the result of the ``p`` lookup is
    returned as-is.
    """
    for param_name in ('list', 'p'):
        playlist_id = parse_query_param(url, param_name)
        if playlist_id:
            return playlist_id
    # Nothing truthy found: hand back whatever the last lookup produced.
    return playlist_id