def is_channel(string):
    """Report whether ``string`` looks like a YouTube channel url.

    A url qualifies when it contains ``channel/`` followed by at least 24
    id characters, for example
    ``https://www.youtube.com/channel/UCFdTiwvDjyc62DBWrlYDtlQs``.

    :param str string:
        The text to inspect.
    :rtype: bool
    :returns:
        ``True`` when ``regex_search`` finds the pattern, ``False`` when it
        raises for any reason.
    """
    try:
        regex_search(r"(channel/)([0-9A-Za-z_-]{24}).*", string, group=1)
    except Exception:
        # Deliberately broad: any lookup failure means "not a channel url".
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate.
        return False
    return True
def is_watchUrl(string):
    """Report whether ``string`` looks like a YouTube video url.

    The following shapes are recognised:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str string:
        The text to inspect.
    :rtype: bool
    :returns:
        ``True`` when ``regex_search`` finds an 11 character id preceded by
        ``v=`` or ``/``, ``False`` when it raises for any reason.
    """
    try:
        regex_search(r"(?:v=|/)([0-9A-Za-z_-]{11}).*", string, group=1)
    except Exception:
        # Deliberately broad: any lookup failure means "not a watch url".
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate.
        return False
    return True
def is_playList(string):
    """Report whether ``string`` looks like a YouTube playlist url.

    A url qualifies when it contains ``playlist?list=`` followed by 24 to 34
    id characters, for example
    ``https://www.youtube.com/playlist?list=PL-g0fdC5RMboYEyt6QS2iLb_1m7QcgfHk``.

    :param str string:
        The text to inspect.
    :rtype: bool
    :returns:
        ``True`` when ``regex_search`` finds the pattern, ``False`` when it
        raises for any reason.
    """
    try:
        regex_search(
            r"(playlist\?list=)([0-9A-Za-z_-]{24,34}).*", string, group=1,
        )
    except Exception:
        # Deliberately broad: any lookup failure means "not a playlist url".
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate.
        return False
    return True
def is_age_restricted(watch_html: str) -> bool:
    """Tell whether the watch page carries the age restriction marker.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``True`` when ``og:restrictions:age`` is found in the page, ``False``
        when the search raises :class:`RegexMatchError`.
    """
    try:
        regex_search(r"og:restrictions:age", watch_html, group=0)
    except RegexMatchError:
        restricted = False
    else:
        restricted = True
    return restricted
def is_age_restricted(watch_html):
    """Tell whether the watch page carries the age restriction marker.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``True`` when ``og:restrictions:age`` is found in the page, ``False``
        when the search raises :class:`RegexMatchError`.
    """
    try:
        regex_search(r'og:restrictions:age', watch_html, group=0)
    except RegexMatchError:
        restricted = False
    else:
        restricted = True
    return restricted
def video_info_url(
    video_id, watch_url, watch_html, embed_html, age_restricted,
):
    """Build the ``get_video_info`` url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page.
    :param str embed_html:
        The html contents of the embed page (only read for age restricted
        videos).
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    endpoint = 'https://youtube.com/get_video_info?'

    if not age_restricted:
        # The meaning of ``t`` is not known here; the whole matched text
        # (group 0) is forwarded.
        t = regex_search(
            r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]', watch_html,
            group=0,
        )
        # ``OrderedDict`` keeps the parameter order stable on every
        # supported Python version.
        params = OrderedDict()
        params['video_id'] = video_id
        params['el'] = '$el'
        params['ps'] = 'default'
        params['eurl'] = quote(watch_url)
        params['hl'] = 'en_US'
        params['t'] = quote(t)
        return endpoint + urlencode(params)

    # Age restricted videos take the ``sts`` value from the embed page.
    sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
    params = OrderedDict()
    params['video_id'] = video_id
    params['eurl'] = eurl(video_id)
    params['sts'] = sts
    return endpoint + urlencode(params)
def get_transform_object(js, var):
    """Pull the "transform object" out of the player javascript.

    The transform object holds the function definitions that the
    "transform plan" refers to. ``var`` is the obfuscated variable the
    object is assigned to: for a plan entry such as ``DE.AJ(a,15)`` the var
    is ``DE``.

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all
        functions that descramble the signature.
    :rtype: list
    :returns:
        The object body with newlines replaced by spaces, split on ``", "``.

    **Example**:

    >>> get_transform_object(js, 'DE')
    ['AJ:function(a){a.reverse()}',
    'VR:function(a,b){a.splice(0,b)}',
    'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']

    """
    pattern = r"var %s={(.*?)};" % re.escape(var)
    logger.debug("getting transform object")
    # DOTALL lets ``.*?`` run across the line breaks inside the object.
    object_body = regex_search(pattern, js, group=1, flags=re.DOTALL)
    single_line = object_body.replace("\n", " ")
    return single_line.split(", ")
def get_transform_object(js, var):
    """Pull the "transform object" out of the player javascript.

    The transform object holds the function definitions that the
    "transform plan" refers to. ``var`` is the obfuscated variable the
    object is assigned to: for a plan entry such as ``DE.AJ(a,15)`` the var
    is ``DE``.

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all
        functions that descramble the signature.
    :rtype: list
    :returns:
        The object body with newlines replaced by spaces, split on ``', '``.

    **Example**:

    >>> get_transform_object(js, 'DE')
    ['AJ:function(a){a.reverse()}',
    'VR:function(a,b){a.splice(0,b)}',
    'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']

    """
    pattern = r'var %s={(.*?)};' % re.escape(var)
    logger.debug('getting transform object')
    # DOTALL lets ``.*?`` run across the line breaks inside the object.
    object_body = regex_search(pattern, js, group=1, flags=re.DOTALL)
    single_line = object_body.replace('\n', ' ')
    return single_line.split(', ')
def video_info_url(video_id, watch_url, watch_html):
    """Build the ``get_video_info`` url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    # The meaning of ``t`` is not known here; the whole matched text
    # (group 0) is forwarded.
    t = regex_search(
        r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]', watch_html, group=0,
    )

    # ``OrderedDict`` keeps the parameter order stable on every supported
    # Python version.
    params = OrderedDict()
    params['video_id'] = video_id
    params['el'] = '$el'
    params['ps'] = 'default'
    params['eurl'] = quote(watch_url)
    params['hl'] = 'en_US'
    params['t'] = quote(t)

    query_string = urlencode(params)
    return 'https://youtube.com/get_video_info?' + query_string
def get_transform_plan(js):
    """Pull the "transform plan" out of the player javascript.

    The transform plan is the sequence of function calls the ciphered
    signature is passed through to obtain the actual signature.

    :param str js:
        The contents of the base.js asset file.
    :rtype: list
    :returns:
        The statements of the signature function body, split on ``;``.

    **Example**:

    >>> get_transform_plan(js)
    ['DE.AJ(a,15)',
    'DE.VR(a,3)',
    'DE.AJ(a,51)',
    'DE.VR(a,3)',
    'DE.kT(a,51)',
    'DE.kT(a,8)',
    'DE.VR(a,3)',
    'DE.kT(a,21)']
    """
    function_name = get_initial_function_name(js)
    # Escape the name because obfuscated names may contain ``$``.
    pattern = (
        r'%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}'
        % re.escape(function_name)
    )
    logger.debug('getting transform plan')
    plan_source = regex_search(pattern, js, group=1)
    return plan_source.split(';')
def video_info_url_age_restricted(video_id: str, embed_html: str) -> str:
    """Build the ``get_video_info`` url for an age restricted video.

    :param str video_id:
        A YouTube video identifier.
    :param str embed_html:
        The html contents of the embed page (for age restricted videos).
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    try:
        sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
    except RegexMatchError:
        # No ``sts`` on the embed page: send the parameter empty.
        sts = ""

    # ``OrderedDict`` keeps the parameter order stable on every supported
    # Python version.
    params = OrderedDict()
    params["video_id"] = video_id
    params["eurl"] = f"https://youtube.googleapis.com/v/{video_id}"
    params["sts"] = sts
    params["html5"] = "1"
    return _video_info_url(params)
def get_initial_function_name(js):
    """Extract the name of the function responsible for computing the signature.

    The patterns are handed to ``regex_search`` as a list and the text
    captured by group 1 is returned, so every pattern must keep the function
    name in its first capturing group.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The (obfuscated) name of the signature function.
    """
    # c&&d.set("signature", EE(c));
    # Fix set: https://github.com/nficano/pytube/pull/701/commits/773866382c3412e01f97f242e753cf32f52aaefa
    # 30.07.2020 Fix github.com/H4KKR/pytubeX/commit/f35b948afe3029ef60b427a1afd14e4551f2b7a7
    pattern = [
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        # Was ``(["\'])signature\1``: that made the quote character group 1,
        # so ``group=1`` returned a quote instead of the function name. The
        # non-capturing alternation matches the same text and leaves ``sig``
        # as group 1.
        r'(?:"signature"|\'signature\')\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
        r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        # This pattern used to be listed twice; the duplicate could never
        # match anything the first copy had not, so it was dropped.
        r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
    ]
    PLog('cipher: finding initial function name')
    return regex_search(pattern, js, group=1)
def is_region_blocked(watch_html: str) -> bool:
    """Determine if a video is not available in the user's region.

    The country code detected on the watch page is compared with the
    ``availableCountries`` list of the player response.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        True if the video is blocked in the users region.
        False if not, or if unknown.
    """
    import json

    player_response = initial_player_response(watch_html)
    if isinstance(player_response, str):
        # The helper may hand back the raw JSON text; indexing a ``str``
        # with string keys would raise TypeError, so decode it first.
        try:
            player_response = json.loads(player_response)
        except ValueError:
            return False

    # The country list does not depend on the pattern, so look it up once.
    # Without it nothing can be decided ("unknown" -> False).
    try:
        available_countries = player_response['microformat'][
            'playerMicroformatRenderer']['availableCountries']
    except (KeyError, TypeError):
        return False

    country_code_patterns = [
        r"gl\s*=\s*['\"](\w{2})['\"]",  # gl="US"
        r"['\"]gl['\"]\s*:\s*['\"](\w{2})['\"]"  # "gl":"US"
    ]
    for pattern in country_code_patterns:
        try:
            yt_detected_country = regex_search(pattern, watch_html, 1)
        except RegexMatchError:
            continue
        if yt_detected_country not in available_countries:
            return True
    return False
def get_initial_function_name(js):
    """Extract the name of the function responsible for computing the signature.

    The patterns are handed to ``regex_search`` as a list and the text
    captured by group 1 is returned, so every pattern must keep the function
    name in its first capturing group.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The (obfuscated) name of the signature function.
    """
    # c&&d.set("signature", EE(c));
    pattern = [
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        # Was ``(["\'])signature\1``: that made the quote character group 1,
        # so ``group=1`` returned a quote instead of the function name. The
        # non-capturing alternation matches the same text and leaves ``sig``
        # as group 1.
        r'(?:"signature"|\'signature\')\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
        # This pattern had been cut off at ``(?P<si$``, which is not a valid
        # regular expression; the tail of the named group is restored.
        r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        # This pattern used to be listed twice; the duplicate could never
        # match anything the first copy had not, so it was dropped.
        r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
    ]
    logger.debug("finding initial function name")
    return regex_search(pattern, js, group=1)
def get_transform_plan(js):
    """Pull the "transform plan" out of the player javascript.

    The transform plan is the sequence of function calls the ciphered
    signature is passed through to obtain the actual signature.

    :param str js:
        The contents of the base.js asset file.
    :rtype: list
    :returns:
        The statements of the signature function body, split on ``;``.

    **Example**:

    >>> get_transform_plan(js)
    ['DE.AJ(a,15)',
    'DE.VR(a,3)',
    'DE.AJ(a,51)',
    'DE.VR(a,3)',
    'DE.kT(a,51)',
    'DE.kT(a,8)',
    'DE.VR(a,3)',
    'DE.kT(a,21)']
    """
    function_name = get_initial_function_name(js)
    # Escape the name because obfuscated names may contain ``$``.
    pattern = (
        r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}"
        % re.escape(function_name)
    )
    logger.debug("getting transform plan")
    plan_source = regex_search(pattern, js, group=1)
    return plan_source.split(";")
def video_info_url(
    video_id, watch_url, watch_html, embed_html, age_restricted,
):
    """Build the ``get_video_info`` url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page.
    :param str embed_html:
        The html contents of the embed page (only read for age restricted
        videos).
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    endpoint = 'https://youtube.com/get_video_info?'

    if not age_restricted:
        # The meaning of ``t`` is not known here; the whole matched text
        # (group 0) is forwarded.
        t = regex_search(
            r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]', watch_html,
            group=0,
        )
        # ``OrderedDict`` keeps the parameter order stable on every
        # supported Python version.
        params = OrderedDict()
        params['video_id'] = video_id
        params['el'] = '$el'
        params['ps'] = 'default'
        params['eurl'] = quote(watch_url)
        params['hl'] = 'en_US'
        params['t'] = quote(t)
        return endpoint + urlencode(params)

    # Age restricted videos take the ``sts`` value from the embed page.
    sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
    params = OrderedDict()
    params['video_id'] = video_id
    params['eurl'] = eurl(video_id)
    params['sts'] = sts
    return endpoint + urlencode(params)
def title(self) -> Optional[str]:
    """Extract the playlist title from the page html.

    :return: the text of the ``<title>`` element with ``- YouTube`` removed
        and surrounding whitespace stripped
    :rtype: Optional[str]
    """
    raw_title = regex_search(r"<title>(.+?)</title>", self.html, 1)
    without_site_name = raw_title.replace("- YouTube", "")
    return without_site_name.strip()
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    The name is taken from a call of the form
    ``c&&d.set("signature", EE(c));``.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The name captured right after ``"signature",``.
    """
    signature_call = r'"signature",\s?([a-zA-Z0-9$]+)\('
    logger.debug('finding initial function name')
    function_name = regex_search(signature_call, js, group=1)
    return function_name
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    The name is taken from a call of the form
    ``c&&d.set("signature", EE(c));``.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The name captured right after ``"signature",``.
    """
    signature_call = r'"signature",\s?([a-zA-Z0-9$]+)\('
    logger.debug('finding initial function name')
    function_name = regex_search(signature_call, js, group=1)
    return function_name
def video_id(url: str) -> str:
    """Pull the ``video_id`` out of a YouTube url.

    Supported url shapes:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id.
    """
    # Eleven id characters directly after ``v=`` or a slash.
    id_pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
    return regex_search(id_pattern, url, group=1)
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The text captured by the ``sig`` group of the pattern.
    """
    # c&&d.set("signature", EE(c));
    set_call = (
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('
    )
    # ``regex_search`` is given a list here even though only one pattern is
    # listed.
    pattern = [set_call]
    logger.debug('finding initial function name')
    return regex_search(pattern, js, group=1)
def initial_player_response(watch_html: str) -> str:
    """Extract the ytInitialPlayerResponse json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    :param str watch_html:
        Html of the watch page.
    :rtype: str
    :returns:
        The JSON text assigned to ``window["ytInitialPlayerResponse"]``, or
        ``"{}"`` when the assignment is not found.
    """
    pattern = r"window\[['\"]ytInitialPlayerResponse['\"]]\s*=\s*({[^\n]+});"
    try:
        response_json = regex_search(pattern, watch_html, 1)
    except RegexMatchError:
        response_json = "{}"
    return response_json
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    The 0th sequential request is downloaded in full and its length counted;
    the remaining segments are measured through their ``content-length``
    headers.

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    :raises RegexMatchError: if no non-zero ``Segment-Count`` is found in the
        response to the 0th request
    """
    total_filesize = 0
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    # ``split_url.path`` already starts with "/", so it is appended without
    # an extra separator (the previous format produced ``host//path``).
    base_url = '%s://%s%s?' % (
        split_url.scheme, split_url.netloc, split_url.path,
    )

    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    response = _execute_request(
        base_url + parse.urlencode(querys), method="GET"
    )
    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize += len(response_value)

    # One of the lines should contain the segment count, but we don't know
    # which, so every line is tried.
    segment_count = 0
    segment_regex = b'Segment-Count: (\\d+)'
    for line in response_value.split(b'\r\n'):
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
        except RegexMatchError:
            pass

    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # HEAD requests to segments 1..segment_count give the remaining sizes.
    for seq_num in range(1, segment_count + 1):
        querys['sq'] = seq_num
        segment_url = base_url + parse.urlencode(querys)
        total_filesize += int(head(segment_url)['content-length'])
    return total_filesize
def initial_data(watch_html: str) -> str:
    """Extract the ytInitialData json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    :param str watch_html:
        Html of the watch page.
    :rtype: str
    :returns:
        The text assigned to ``window["ytInitialData"]`` minus its last
        character, or ``"{}"`` when the assignment is not found.
    """
    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+)"
    try:
        assignment = regex_search(initial_data_pattern, watch_html, 1)
    except RegexMatchError:
        return "{}"
    # The capture runs to the end of the line; drop its final character
    # (presumably the statement's trailing ``;``).
    return assignment[:-1]
def video_id(url):
    """Pull the ``video_id`` out of a YouTube url.

    Supported url shapes:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id.
    """
    # Eleven id characters directly after ``v=`` or a slash.
    id_pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    return regex_search(id_pattern, url, group=1)
def get_ytplayer_config(watch_html):
    """Read the YouTube player configuration out of the watch html.

    ``ytplayer.config`` is json data embedded within the watch html and
    serves as the primary source of obtaining the stream manifest data.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The object assigned to ``ytplayer.config``, decoded with
        :func:`json.loads`.
    """
    config_pattern = r';ytplayer\.config\s*=\s*({.*?});'
    config_json = regex_search(config_pattern, watch_html, group=1)
    parsed_config = json.loads(config_json)
    return parsed_config
def get_ytplayer_config(watch_html):
    """Read the YouTube player configuration out of the watch html.

    ``ytplayer.config`` is json data embedded within the watch html and
    serves as the primary source of obtaining the stream manifest data.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The object assigned to ``ytplayer.config``, decoded with
        :func:`json.loads`.
    """
    config_pattern = r';ytplayer\.config\s*=\s*({.*?});'
    config_json = regex_search(config_pattern, watch_html, group=1)
    parsed_config = json.loads(config_json)
    return parsed_config
def publish_date(watch_html: str):
    """Extract the publish date from the watch page.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The ``datePublished`` value parsed with the ``%Y-%m-%d`` format, or
        ``None`` when the page has no such value.
    """
    date_pattern = (
        r"(?<=itemprop=\"datePublished\" content=\")\d{4}-\d{2}-\d{2}"
    )
    try:
        date_text = regex_search(date_pattern, watch_html, group=0)
    except RegexMatchError:
        return None
    else:
        return datetime.strptime(date_text, '%Y-%m-%d')
def get_initial_function_name(js):
    """Extract the name of the function responsible for computing the signature.

    The patterns are handed to ``regex_search`` as a list and the text
    captured by group 1 is returned, so every pattern must keep the function
    name in its first capturing group.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The (obfuscated) name of the signature function.
    """
    # c&&d.set("signature", EE(c));
    pattern = [
        # Was ``(["\'])signature\1``: that made the quote character group 1,
        # so ``group=1`` returned a quote instead of the function name. The
        # non-capturing alternation matches the same text and leaves ``sig``
        # as group 1.
        r'(?:"signature"|\'signature\')\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',  # noqa: E501
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',  # noqa: E501
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',  # noqa: E501
    ]
    logger.debug('finding initial function name')
    return regex_search(pattern, js, group=1)
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    Three patterns are handed to ``regex_search``; each keeps the function
    name in its ``sig`` group, which is also group 1.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The (obfuscated) name of the signature function.
    """
    # c&&d.set("signature", EE(c));
    after_akamaized = (
        r'yt\.akamaized\.net/\)\s*\|\|\s*'
        r'.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent'
        r'\s*\()?(?P<sig>[a-zA-Z0-9$]+)\('
    )
    sig_fallback = r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('
    plain_set_call = (
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent'
        r'\s*\()?(?P<sig>[a-zA-Z0-9$]+)\('
    )
    pattern = [after_akamaized, sig_fallback, plain_set_call]
    logger.debug('finding initial function name')
    return regex_search(pattern, js, group=1)
def get_videos_from_channel(url):
    """List the watch urls linked from a channel's ``/videos`` page.

    :param str url:
        A channel url, or just the 24 character channel id.
    :rtype: list
    :returns:
        ``https://www.youtube.com/watch?v=...`` urls found in the page html,
        de-duplicated by ``uniqueify``.
    """
    try:
        channel_id: str = regex_search(
            r"(?:channel|\/)([0-9A-Za-z_-]{24}).*", url, group=1,
        )
    except (IndexError, RegexMatchError):
        # ``regex_search`` reports "no match" with RegexMatchError, so the
        # bare-id fallback has to catch it; IndexError is kept for callers
        # that relied on it.
        # assume that url is just the id
        channel_id = url

    channel_url = f"https://www.youtube.com/channel/{channel_id}/videos"
    html = request.get(channel_url)

    video_regex = re.compile(r"href=\"(/watch\?v=[\w-]*)")
    watch_paths = uniqueify(video_regex.findall(html))
    return [f"https://www.youtube.com{path}" for path in watch_paths]
def parse_function(js_func):
    """Split a JavaScript transform call into its name and argument.

    :param str js_func:
        The JavaScript version of the transform function, e.g.
        ``DE.AJ(a,15)``.
    :rtype: tuple
    :returns:
        Whatever ``regex_search`` yields for ``groups=True``; the two
        captured groups are the function name and its numeric argument.

    **Example**:

    >>> parse_function('DE.AJ(a,15)')
    ('AJ', 15)

    """
    logger.debug("parsing transform function")
    # <object>.<function>(<one-letter variable>,<digits>)
    call_pattern = r"\w+\.(\w+)\(\w,(\d+)\)"
    return regex_search(call_pattern, js_func, groups=True)
def parse_function(js_func):
    """Split a JavaScript transform call into its name and argument.

    :param str js_func:
        The JavaScript version of the transform function, e.g.
        ``DE.AJ(a,15)``.
    :rtype: tuple
    :returns:
        Whatever ``regex_search`` yields for ``groups=True``; the two
        captured groups are the function name and its numeric argument.

    **Example**:

    >>> parse_function('DE.AJ(a,15)')
    ('AJ', 15)

    """
    logger.debug('parsing transform function')
    # <object>.<function>(<one-letter variable>,<digits>)
    call_pattern = r'\w+\.(\w+)\(\w,(\d+)\)'
    return regex_search(call_pattern, js_func, groups=True)
def get_ytplayer_config(html: str, age_restricted: bool = False) -> Any:
    """Read the YouTube player configuration out of the page html.

    The configuration is json data embedded within the html and serves as
    the primary source of obtaining the stream manifest data. Age restricted
    pages are searched for ``yt.setConfig({'PLAYER_CONFIG': ...``, all
    others for ``ytplayer.config = ...``.

    :param str html:
        The html contents of the watch page.
    :param bool age_restricted:
        Is video age restricted.
    :returns:
        The matched configuration, decoded with :func:`json.loads`.
    """
    pattern = (
        r";yt\.setConfig\(\{'PLAYER_CONFIG':\s*({.*})(,'EXPERIMENT_FLAGS'|;)"  # noqa: E501
        if age_restricted
        else r";ytplayer\.config\s*=\s*({.*?});"
    )
    config_json = regex_search(pattern, html, group=1)
    return json.loads(config_json)
def get_ytplayer_config(html, age_restricted=False):
    """Read the YouTube player configuration out of the page html.

    The configuration is json data embedded within the html and serves as
    the primary source of obtaining the stream manifest data. Age restricted
    pages are searched for ``yt.setConfig({'PLAYER_CONFIG': ...``, all
    others for ``ytplayer.config = ...``.

    :param str html:
        The html contents of the watch page.
    :param bool age_restricted:
        Is video age restricted.
    :returns:
        The matched configuration, decoded with :func:`json.loads`.
    """
    pattern = (
        r";yt\.setConfig\(\{'PLAYER_CONFIG':\s*({.*})(,'EXPERIMENT_FLAGS'|;)"  # noqa: E501
        if age_restricted
        else r';ytplayer\.config\s*=\s*({.*?});'
    )
    config_json = regex_search(pattern, html, group=1)
    return json.loads(config_json)
def video_info_url(
    video_id: str,
    watch_url: str,
    embed_html: Optional[str],
    age_restricted: bool,
) -> str:
    """Build the ``get_video_info`` url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str embed_html:
        The html contents of the embed page (for age restricted videos).
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    endpoint = "https://youtube.com/get_video_info?"

    # ``OrderedDict`` keeps the parameter order stable on every supported
    # Python version.
    if not age_restricted:
        params = OrderedDict()
        params["video_id"] = video_id
        params["el"] = "$el"
        params["ps"] = "default"
        params["eurl"] = quote(watch_url)
        params["hl"] = "en_US"
        return endpoint + urlencode(params)

    # The embed page is only needed (and required) for restricted videos.
    assert embed_html is not None
    sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
    params = OrderedDict()
    params["video_id"] = video_id
    params["eurl"] = eurl(video_id)
    params["sts"] = sts
    return endpoint + urlencode(params)
def mime_type_codec(mime_type_codec):
    """Split a manifest ``type`` value into mime type and codecs.

    The ``type`` key of the manifest serializes the mime type and the codecs
    together; this separates them again.

    **Example**:

    >>> mime_type_codec('audio/webm; codecs="opus"')
    ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs.

    """
    pattern = r'(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"'
    captured = regex_search(pattern, mime_type_codec, groups=True)
    mime_type, codecs = captured
    # Codecs are comma separated and may carry surrounding whitespace.
    codec_names = []
    for raw_codec in codecs.split(','):
        codec_names.append(raw_codec.strip())
    return mime_type, codec_names
def mime_type_codec(mime_type_codec):
    """Split a manifest ``type`` value into mime type and codecs.

    The ``type`` key of the manifest serializes the mime type and the codecs
    together; this separates them again.

    **Example**:

    >>> mime_type_codec('audio/webm; codecs="opus"')
    ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs.

    """
    pattern = r'(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"'
    captured = regex_search(pattern, mime_type_codec, groups=True)
    mime_type, codecs = captured
    # Codecs are comma separated and may carry surrounding whitespace.
    codec_names = []
    for raw_codec in codecs.split(','):
        codec_names.append(raw_codec.strip())
    return mime_type, codec_names
def test_regex_search_no_match():
    """``regex_search`` raises RegexMatchError when nothing matches."""
    pattern, haystack = "^a$", ""
    with pytest.raises(RegexMatchError):
        helpers.regex_search(pattern, haystack, group=0)
def test_regex_search():
    """``regex_search`` returns something for a matching pattern."""
    # TODO(nficano): should check isinstance
    result = helpers.regex_search('^a$', 'a')
    assert result is not None
def test_regex_search_no_match():
    """``regex_search`` raises RegexMatchError when nothing matches."""
    pattern, haystack = '^a$', ''
    with pytest.raises(RegexMatchError):
        helpers.regex_search(pattern, haystack, groups=True)
def load_streams(self):
    """Resolve the url from the download manager and fill its stream tree.

    The url is first loaded as a single video; if that raises
    ``RegexMatchError`` and the url contains ``playlist`` it is loaded as a
    playlist instead. Every stream of every loaded video is then added to
    the manager's ``stream_tree``, audio-only streams grouped under an
    "Audio Only" node.

    Progress and results are reported only through the ``sig_*`` signals;
    nothing is returned. Every exit path emits ``sig_done`` with this
    worker's id. Any failure while loading, including one inside the
    playlist fallback, is reported through ``sig_error``.
    """
    manager = self.__download_manager

    def stop_if_aborted():
        # Emit the abort notifications; tell the caller whether to return.
        if not self.__abort:
            return False
        self.sig_progress_status.emit('Aborted!')
        self.sig_done.emit(self.id)
        return True

    def report_failure(message):
        # Report an error and mark this worker as finished.
        self.sig_error.emit(message)
        self.sig_done.emit(self.id)

    while manager.thread_count > 1:
        self.sig_step.emit(self.id, 'Waiting for threads to clear...')
        # Yield between polls instead of spinning and flooding the signal.
        QThread.msleep(100)

    thread_name = QThread.currentThread().objectName()
    thread_id = int(QThread.currentThreadId())
    self.sig_step.emit(
        self.id, f'{thread_id}: {thread_name} thread starting...')

    manager.videos = []
    manager.streams = []
    proxies = manager.get_proxies()
    # ``clear()`` empties the tree; the former index-by-index
    # ``takeTopLevelItem`` loop skipped items as indices shifted and was
    # redundant.
    manager.stream_tree.clear()
    manager.streams_to_download = {}

    # Read the url once so every step below works on the same text.
    url_text = manager.url.text()

    try:
        print('get video id')
        print(extract.video_id(url_text))
        self.sig_step.emit(self.id, 'Loading video')
        loaded_url = YouTube(url_text, proxies=proxies)
        self.sig_step.emit(self.id, f'Loaded video: {loaded_url.title}')
        self.sig_msg.emit(f'Found {loaded_url.title}')
        if stop_if_aborted():
            return
        manager.videos.append(loaded_url)
    except RegexMatchError:
        print('playlist')
        if 'playlist' not in url_text:
            report_failure('Could not determine Video '
                           'or Playlist ID from provided URL!\n'
                           'Please check input!')
            return
        # An exception raised inside an ``except`` handler is not seen by
        # the sibling ``except Exception`` clause, so the playlist path
        # needs its own guard to keep failures reported.
        try:
            # The result is unused; the call only matters because it raises
            # when the url carries no id-like token.
            regex_search(r'(?:list=|\/)([0-9A-Za-z_-]{11}).*',
                         url_text, group=1)
            playlist = Playlist(url_text)
            self.sig_msg.emit('Loaded playlist. Discovering videos...')
            playlist.populate_video_urls()
            self.sig_progress_status.emit(0)
            for i, video_url in enumerate(playlist.video_urls):
                self.sig_step.emit(self.id, f'Loading video {i}')
                if stop_if_aborted():
                    return
                # Loading videos covers the first half of total progress.
                self.sig_progress_total.emit(
                    int((i / (len(playlist.video_urls) * 2)) * 100))
                vid = YouTube(video_url, proxies=proxies)
                self.sig_step.emit(self.id, f'Loaded video: {vid.title}')
                if stop_if_aborted():
                    return
                self.sig_msg.emit(f'Found {vid.title}')
                manager.videos.append(vid)
                self.sig_progress_status.emit(
                    int((i / len(playlist.video_urls)) * 100))
            self.sig_progress_total.emit(50)
        except Exception as e:
            report_failure(str(e))
            return
    except Exception as e:
        report_failure(str(e))
        return

    self.sig_msg.emit('Loading Streams..')
    print('loading streams')
    for i, video in enumerate(manager.videos):
        self.sig_progress_status.emit(0)
        self.sig_step.emit(self.id, f'Loading streams for video {i}')
        if stop_if_aborted():
            return
        audio_streams = QTreeWidgetItem(['Audio Only'])
        tree_item = StreamTreeWidgetItem(
            [video.title], f'video_{i}', manager, video, None)
        manager.streams = video.streams.all()
        for x, stream in enumerate(manager.streams):
            self.sig_step.emit(self.id, f'Loading stream {x}')
            if stop_if_aborted():
                return
            self.sig_msg.emit(
                f'Video {i + 1}/{len(manager.videos)}: '
                f'Loading Stream ITAG ID: {stream.itag}')
            file_type = stream.mime_type.split("/")[1]
            size_kb = stream.filesize // 1024
            if stream.video_codec is None:
                # No video codec: list the stream under "Audio Only".
                parent_item = audio_streams
                label = (
                    f'Codec: {stream.audio_codec}, '
                    f'ABR: {stream.abr}, '
                    f'File Type: {file_type}, '
                    f'Size: {size_kb} KB'
                )
            else:
                parent_item = tree_item
                label = (
                    f'Res: {stream.resolution}, FPS: {stream.fps}, '
                    f' Video Codec: {stream.video_codec}, '
                    f'Audio Codec: {stream.audio_codec}, '
                    f'File Type: {file_type}, '
                    f'Size: {size_kb} KB'
                )
            stream_item = StreamTreeWidgetItem(
                [label], f'video_{i}_stream{x}', manager, video, stream)
            self.sig_step.emit(self.id, f'Loaded stream {x}')
            if stop_if_aborted():
                return
            parent_item.addChild(stream_item)
            stream_item.setCheckState(0, Qt.Unchecked)
            self.sig_progress_status.emit(
                int(((x + 1) / len(manager.streams)) * 100))
        tree_item.addChild(audio_streams)
        self.sig_step.emit(self.id, f'Adding video {i} to tree')
        if stop_if_aborted():
            return
        manager.stream_tree.addTopLevelItem(tree_item)
        self.sig_progress_status.emit(100)
        # Stream loading covers the second half of total progress.
        self.sig_progress_total.emit(
            int(((i + 1) / (len(manager.videos) * 2)) * 100) + 50)
    self.sig_msg.emit('Streams Loaded!')
    self.sig_done.emit(self.id)
def test_regex_search():
    """``group=0`` returns the whole matched text."""
    matched_text = helpers.regex_search("^a$", "a", group=0)
    assert matched_text == "a"