def is_channel(string):
    """Report whether ``string`` looks like a YouTube channel URL.

    The pattern requires ``channel/`` followed by at least 24 characters
    from ``[0-9A-Za-z_-]``, for example
    ``https://www.youtube.com/channel/UCFdTiwvDjyc62DBWrlYDtlQs``.

    :param str string:
        The text (normally a URL) to inspect.
    :rtype: bool
    :returns:
        ``True`` when ``regex_search`` returns normally, ``False`` when it
        raises.
    """
    try:
        regex_search(r"(channel/)([0-9A-Za-z_-]{24}).*", string, group=1)
    except Exception:
        # The helper is relied on to raise when nothing matches. Catching
        # ``Exception`` instead of using a bare ``except`` keeps the
        # best-effort ``False`` result without also swallowing
        # ``KeyboardInterrupt`` or ``SystemExit``.
        return False
    return True
def is_watchUrl(string):
    """Report whether ``string`` looks like a YouTube video URL.

    The pattern is meant for these shapes:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    It accepts ``v=`` or ``/`` followed by at least 11 characters from
    ``[0-9A-Za-z_-]``, so any path segment of that length also matches.

    :param str string:
        The text (normally a URL) to inspect.
    :rtype: bool
    :returns:
        ``True`` when ``regex_search`` returns normally, ``False`` when it
        raises.
    """
    try:
        regex_search(r"(?:v=|/)([0-9A-Za-z_-]{11}).*", string, group=1)
    except Exception:
        # The helper is relied on to raise when nothing matches. Catching
        # ``Exception`` instead of using a bare ``except`` keeps the
        # best-effort ``False`` result without also swallowing
        # ``KeyboardInterrupt`` or ``SystemExit``.
        return False
    return True
def is_playList(string):
    """Report whether ``string`` looks like a YouTube playlist URL.

    The pattern requires ``playlist?list=`` followed by 24 to 34 characters
    from ``[0-9A-Za-z_-]``, for example
    ``https://www.youtube.com/playlist?list=PL-g0fdC5RMboYEyt6QS2iLb_1m7QcgfHk``.

    :param str string:
        The text (normally a URL) to inspect.
    :rtype: bool
    :returns:
        ``True`` when ``regex_search`` returns normally, ``False`` when it
        raises.
    """
    try:
        regex_search(
            r"(playlist\?list=)([0-9A-Za-z_-]{24,34}).*",
            string,
            group=1,
        )
    except Exception:
        # The helper is relied on to raise when nothing matches. Catching
        # ``Exception`` instead of using a bare ``except`` keeps the
        # best-effort ``False`` result without also swallowing
        # ``KeyboardInterrupt`` or ``SystemExit``.
        return False
    return True
Beispiel #4
0
def is_age_restricted(watch_html: str) -> bool:
    """Tell whether the watch page carries the age-restriction marker.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``True`` if ``og:restrictions:age`` is found in the page, ``False``
        if the search raises ``RegexMatchError``.
    """
    marker = r"og:restrictions:age"
    try:
        regex_search(marker, watch_html, group=0)
    except RegexMatchError:
        return False
    else:
        return True
Beispiel #5
0
def is_age_restricted(watch_html):
    """Tell whether the watch page carries the age-restriction marker.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``True`` if ``og:restrictions:age`` is found in the page, ``False``
        if the search raises ``RegexMatchError``.
    """
    restricted = True
    try:
        regex_search(r'og:restrictions:age', watch_html, group=0)
    except RegexMatchError:
        # No marker on the page.
        restricted = False
    return restricted
def video_info_url(
    video_id,
    watch_url,
    watch_html,
    embed_html,
    age_restricted,
):
    """Build the ``get_video_info`` request url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page (read only when the video is
        not age restricted).
    :param str embed_html:
        The html contents of the embed page (read only when the video is
        age restricted).
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    if not age_restricted:
        # The meaning of ``t`` is unclear; it looks like a boolean flag.
        # NOTE(review): ``group=0`` presumably yields the whole match rather
        # than only the captured value - confirm against ``regex_search``.
        t_value = regex_search(
            r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]',
            watch_html,
            group=0,
        )
        query = [
            ('video_id', video_id),
            ('el', '$el'),
            ('ps', 'default'),
            ('eurl', quote(watch_url)),
            ('hl', 'en_US'),
            ('t', quote(t_value)),
        ]
    else:
        sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
        query = [
            ('video_id', video_id),
            ('eurl', eurl(video_id)),
            ('sts', sts),
        ]
    # ``OrderedDict`` keeps the parameter order stable across Python 2.7+.
    params = OrderedDict(query)
    return 'https://youtube.com/get_video_info?' + urlencode(params)
def get_transform_object(js, var):
    """Pull the "transform object" definition out of the player script.

    The "transform object" holds the function definitions that the
    "transform plan" refers to. ``var`` is the obfuscated variable name the
    functions live under; for the plan entry ``DE.AJ(a,15)`` the var would
    be "DE".

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all functions
        that descrambles the signature.
    :rtype: list
    :returns:
        The text between ``var <var>={`` and ``};`` with newlines turned
        into spaces, split on ``", "``.

    **Example**:

    >>> get_transform_object(js, 'DE')
    ['AJ:function(a){a.reverse()}',
    'VR:function(a,b){a.splice(0,b)}',
    'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']

    """
    pattern = r"var %s={(.*?)};" % re.escape(var)
    logger.debug("getting transform object")
    # DOTALL lets the lazy ``.*?`` run across line breaks inside the object.
    object_body = regex_search(pattern, js, group=1, flags=re.DOTALL)
    single_line = object_body.replace("\n", " ")
    return single_line.split(", ")
Beispiel #8
0
def get_transform_object(js, var):
    """Pull the "transform object" definition out of the player script.

    The "transform object" holds the function definitions that the
    "transform plan" refers to. ``var`` is the obfuscated variable name the
    functions live under; for the plan entry ``DE.AJ(a,15)`` the var would
    be "DE".

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all functions
        that descrambles the signature.
    :rtype: list
    :returns:
        The text between ``var <var>={`` and ``};`` with newlines turned
        into spaces, split on ``', '``.

    **Example**:

    >>> get_transform_object(js, 'DE')
    ['AJ:function(a){a.reverse()}',
    'VR:function(a,b){a.splice(0,b)}',
    'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']

    """
    escaped_var = re.escape(var)
    pattern = r'var %s={(.*?)};' % escaped_var
    logger.debug('getting transform object')
    # DOTALL lets the lazy ``.*?`` run across line breaks inside the object.
    raw_object = regex_search(pattern, js, group=1, flags=re.DOTALL)
    return raw_object.replace('\n', ' ').split(', ')
Beispiel #9
0
def video_info_url(video_id, watch_url, watch_html):
    """Construct the video_info url.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page.

    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    # The meaning of ``t`` is unclear; it looks like a boolean flag.
    # NOTE(review): ``group=0`` presumably yields the whole match rather than
    # only the captured value - confirm against ``regex_search``.
    t_pattern = r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]'
    t_value = regex_search(t_pattern, watch_html, group=0)
    # ``OrderedDict`` keeps the parameter order consistent across
    # Python 2.7+.
    params = OrderedDict()
    params['video_id'] = video_id
    params['el'] = '$el'
    params['ps'] = 'default'
    params['eurl'] = quote(watch_url)
    params['hl'] = 'en_US'
    params['t'] = quote(t_value)
    query_string = urlencode(params)
    return 'https://youtube.com/get_video_info?' + query_string
Beispiel #10
0
def get_transform_plan(js):
    """Pull the "transform plan" out of the player script.

    The "transform plan" is the sequence of function calls the ciphered
    signature is run through to obtain the actual signature.

    :param str js:
        The contents of the base.js asset file.
    :rtype: list
    :returns:
        The captured body of the initial function, split on ``;``.

    **Example**:

    >>> get_transform_plan(js)
    ['DE.AJ(a,15)',
    'DE.VR(a,3)',
    'DE.AJ(a,51)',
    'DE.VR(a,3)',
    'DE.kT(a,51)',
    'DE.kT(a,8)',
    'DE.VR(a,3)',
    'DE.kT(a,21)']
    """
    initial_name = get_initial_function_name(js)
    # The name may contain regex metacharacters such as ``$``.
    name = re.escape(initial_name)
    pattern = r'%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}' % name
    logger.debug('getting transform plan')
    plan_source = regex_search(pattern, js, group=1)
    return plan_source.split(';')
Beispiel #11
0
def video_info_url_age_restricted(video_id: str, embed_html: str) -> str:
    """Build the video_info url for an age restricted video.

    :param str video_id:
        A YouTube video identifier.
    :param str embed_html:
        The html contents of the embed page (for age restricted videos).
    :rtype: str
    :returns:
        The value returned by ``_video_info_url`` for the assembled
        parameters.
    """
    sts_pattern = r'"sts"\s*:\s*(\d+)'
    try:
        sts = regex_search(sts_pattern, embed_html, group=1)
    except RegexMatchError:
        # Fall back to an empty ``sts`` when the embed page has none.
        sts = ""
    # ``OrderedDict`` keeps the parameter order consistent across
    # Python 2.7+.
    params = OrderedDict()
    params["video_id"] = video_id
    params["eurl"] = f"https://youtube.googleapis.com/v/{video_id}"
    params["sts"] = sts
    params["html5"] = "1"
    return _video_info_url(params)
def get_initial_function_name(js):
	"""Find the name of the function that computes the signature.

	:param str js:
		The contents of the base.js asset file.
	:returns:
		Whatever ``regex_search`` yields for ``group=1`` given the list of
		candidate patterns below.

	"""
	# Typical call site in the player script: c&&d.set("signature", EE(c));
	# Fix set: https://github.com/nficano/pytube/pull/701/commits/773866382c3412e01f97f242e753cf32f52aaefa
	# 30.07.2020 Fix github.com/H4KKR/pytubeX/commit/f35b948afe3029ef60b427a1afd14e4551f2b7a7
	#
	# NOTE(review): in the ``signature`` pattern group 1 is the opening
	# quote and the name sits in the named group ``sig`` - confirm that
	# ``group=1`` is what is wanted there.
	# NOTE(review): the last two entries are identical.
	candidate_patterns = [
		r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
		r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
		r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
		r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
		r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
		r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
	]

	PLog('cipher: finding initial function name')
	return regex_search(candidate_patterns, js, group=1)
Beispiel #13
0
def is_region_blocked(watch_html: str) -> bool:
    """Determine if a video is not available in the user's region.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        True if a country code detected in the page is missing from the
        video's ``availableCountries``.
        False if not, or if either piece of information cannot be found.
    """
    # NOTE(review): the subscripting below assumes this helper returns a
    # mapping - confirm against its definition.
    player_response = initial_player_response(watch_html)
    country_code_patterns = (
        r"gl\s*=\s*['\"](\w{2})['\"]",  # gl="US"
        r"['\"]gl['\"]\s*:\s*['\"](\w{2})['\"]",  # "gl":"US"
    )
    for country_pattern in country_code_patterns:
        try:
            detected_country = regex_search(country_pattern, watch_html, 1)
            microformat = player_response['microformat']
            renderer = microformat['playerMicroformatRenderer']
            allowed_countries = renderer['availableCountries']
        except (KeyError, RegexMatchError):
            # Nothing to compare for this pattern; try the next one.
            continue
        if detected_country not in allowed_countries:
            return True
    return False
def get_initial_function_name(js):
    """Extract the name of the function responsible for computing the signature.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        Whatever ``regex_search`` yields for ``group=1`` given the list of
        candidate patterns below.

    """
    # Typical call site in the player script: c&&d.set("signature", EE(c));
    #
    # NOTE(review): in the ``signature`` pattern group 1 is the opening
    # quote and the name sits in the named group ``sig`` - confirm that
    # ``group=1`` is what is wanted there.
    pattern = [
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
        # This entry was previously cut off at ``(?P<si$``, which is not a
        # valid regular expression; the tail is restored to the same
        # ``(?P<sig>...)\(`` ending the other entries use.
        r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        # An exact duplicate of the following entry used to be listed
        # twice; trying an identical pattern again cannot change the result.
        r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
    ]

    logger.debug("finding initial function name")
    return regex_search(pattern, js, group=1)
def get_transform_plan(js):
    """Pull the "transform plan" out of the player script.

    The "transform plan" is the sequence of function calls the ciphered
    signature is run through to obtain the actual signature.

    :param str js:
        The contents of the base.js asset file.
    :rtype: list
    :returns:
        The captured body of the initial function, split on ``;``.

    **Example**:

    >>> get_transform_plan(js)
    ['DE.AJ(a,15)',
    'DE.VR(a,3)',
    'DE.AJ(a,51)',
    'DE.VR(a,3)',
    'DE.kT(a,51)',
    'DE.kT(a,8)',
    'DE.VR(a,3)',
    'DE.kT(a,21)']
    """
    # Escape the name first: it may contain regex metacharacters like ``$``.
    name = re.escape(get_initial_function_name(js))
    template = r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}"
    pattern = template % name
    logger.debug("getting transform plan")
    steps = regex_search(pattern, js, group=1)
    return steps.split(";")
Beispiel #16
0
def video_info_url(
    video_id, watch_url, watch_html, embed_html,
    age_restricted,
):
    """Build the ``get_video_info`` request url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page (read only when the video is
        not age restricted).
    :param str embed_html:
        The html contents of the embed page (read only when the video is
        age restricted).
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    # ``OrderedDict`` keeps the parameter order consistent across
    # Python 2.7+.
    params = OrderedDict()
    if age_restricted:
        sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
        params['video_id'] = video_id
        params['eurl'] = eurl(video_id)
        params['sts'] = sts
    else:
        # The meaning of ``t`` is unclear; it looks like a boolean flag.
        t_pattern = r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]'
        t = regex_search(t_pattern, watch_html, group=0)
        params['video_id'] = video_id
        params['el'] = '$el'
        params['ps'] = 'default'
        params['eurl'] = quote(watch_url)
        params['hl'] = 'en_US'
        params['t'] = quote(t)
    return 'https://youtube.com/get_video_info?' + urlencode(params)
Beispiel #17
0
    def title(self) -> Optional[str]:
        """Return the playlist name taken from the page ``<title>`` element.

        The ``"- YouTube"`` text is removed and surrounding whitespace is
        stripped.

        :return: playlist title (name)
        :rtype: Optional[str]
        """
        title_pattern = r"<title>(.+?)</title>"
        raw_title = regex_search(title_pattern, self.html, 1)
        without_suffix = raw_title.replace("- YouTube", "")
        return without_suffix.strip()
Beispiel #18
0
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        The identifier captured right after ``"signature",``.

    """
    # Typical call site in the player script: c&&d.set("signature", EE(c));
    signature_call = r'"signature",\s?([a-zA-Z0-9$]+)\('
    logger.debug('finding initial function name')
    function_name = regex_search(signature_call, js, group=1)
    return function_name
def get_initial_function_name(js):
    """Locate the function the player script uses to compute the signature.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        The identifier captured right after ``"signature",``.

    """
    logger.debug('finding initial function name')
    # The name is the callee in a statement shaped like
    # c&&d.set("signature", EE(c));
    return regex_search(r'"signature",\s?([a-zA-Z0-9$]+)\(', js, group=1)
Beispiel #20
0
def video_id(url: str) -> str:
    """Pull the ``video_id`` out of a YouTube url.

    The following url shapes are supported:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id.
    """
    # Eleven id characters directly after ``v=`` or a slash.
    id_pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
    return regex_search(id_pattern, url, group=1)
Beispiel #21
0
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        Whatever ``regex_search`` yields for ``group=1`` given the single
        candidate pattern below.

    """
    # Typical call site in the player script: c&&d.set("signature", EE(c));
    candidates = [
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    ]
    logger.debug('finding initial function name')
    function_name = regex_search(candidates, js, group=1)
    return function_name
Beispiel #22
0
def initial_player_response(watch_html: str) -> str:
    """Pull the ytInitialPlayerResponse json text out of the watch page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    @param watch_html: Html of the watch page
    @return: the captured ``{...}`` text, or ``"{}"`` when the pattern is
        not found
    """
    fallback = "{}"
    pattern = r"window\[['\"]ytInitialPlayerResponse['\"]]\s*=\s*({[^\n]+});"
    try:
        response_text = regex_search(pattern, watch_html, 1)
    except RegexMatchError:
        return fallback
    return response_text
Beispiel #23
0
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    The size is the length of the body returned for sequence number 0 plus
    the ``content-length`` reported for each of the remaining segments.

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    :raises RegexMatchError: if no non-zero ``Segment-Count`` is found in
        the first response
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    # The path of an absolute URL already starts with "/", and the template
    # adds its own separator, so strip one leading slash to avoid building
    # "host//path?".
    path = split_url.path
    if path.startswith('/'):
        path = path[1:]
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    #  information about how the file is segmented.
    querys['sq'] = 0
    response = _execute_request(
        base_url + parse.urlencode(querys), method="GET"
    )
    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize = len(response_value)

    # We can then parse the header to find the number of segments
    segment_count = 0
    segment_regex = b'Segment-Count: (\\d+)'
    for line in response_value.split(b'\r\n'):
        # One of the lines should contain the segment count, but we don't know
        #  which, so we need to iterate through the lines to find it
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
        except RegexMatchError:
            pass

    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # We make HEAD requests to the segments sequentially to find the total
    # filesize.
    for seq_num in range(1, segment_count + 1):
        # Create sequential request URL
        querys['sq'] = seq_num
        segment_url = base_url + parse.urlencode(querys)
        total_filesize += int(head(segment_url)['content-length'])
    return total_filesize
Beispiel #24
0
def initial_data(watch_html: str) -> str:
    """Pull the ytInitialData json text out of the watch page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    @param watch_html: Html of the watch page
    @return: the captured text without its final character, or ``"{}"``
        when the pattern is not found
    """
    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+)"
    try:
        captured = regex_search(initial_data_pattern, watch_html, 1)
    except RegexMatchError:
        return "{}"
    # The capture runs to the end of the line; drop its last character
    # (presumably the statement's trailing ";").
    return captured[:-1]
Beispiel #25
0
def video_id(url):
    """Pull the ``video_id`` out of a YouTube url.

    The following url shapes are supported:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id.
    """
    # Eleven id characters directly after ``v=`` or a slash.
    id_pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    found_id = regex_search(id_pattern, url, group=1)
    return found_id
Beispiel #26
0
def get_ytplayer_config(watch_html):
    """Read the YouTube player configuration out of the watch html.

    The ``ytplayer_config`` is json data embedded within the watch html and
    serves as the primary source of obtaining the stream manifest data.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The result of ``json.loads`` applied to the text assigned to
        ``ytplayer.config``.
    """
    config_pattern = r';ytplayer\.config\s*=\s*({.*?});'
    config_json = regex_search(config_pattern, watch_html, group=1)
    parsed_config = json.loads(config_json)
    return parsed_config
def get_ytplayer_config(watch_html):
    """Read the YouTube player configuration out of the watch html.

    The ``ytplayer_config`` is json data embedded within the watch html and
    serves as the primary source of obtaining the stream manifest data.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The result of ``json.loads`` applied to the text assigned to
        ``ytplayer.config``.
    """
    # Capture the ``{...}`` literal assigned to ``ytplayer.config``.
    return json.loads(
        regex_search(r';ytplayer\.config\s*=\s*({.*?});', watch_html, group=1)
    )
Beispiel #28
0
def publish_date(watch_html: str):
    """Extract the publish date from the watch page.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The ``datePublished`` value parsed with ``datetime.strptime`` using
        ``%Y-%m-%d``, or ``None`` when the page has no such value.
    """
    # Look-behind anchors the date to the ``datePublished`` attribute, so
    # the whole match (group 0) is the date text itself.
    date_pattern = r"(?<=itemprop=\"datePublished\" content=\")\d{4}-\d{2}-\d{2}"
    try:
        date_text = regex_search(date_pattern, watch_html, group=0)
    except RegexMatchError:
        return None
    else:
        return datetime.strptime(date_text, '%Y-%m-%d')
Beispiel #29
0
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        Whatever ``regex_search`` yields for ``group=1`` given the list of
        candidate patterns below.

    """
    # Typical call site in the player script: c&&d.set("signature", EE(c));
    #
    # NOTE(review): in the first pattern group 1 is the opening quote and
    # the name sits in the named group ``sig`` - confirm that ``group=1``
    # is what is wanted there.
    candidates = [
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    ]
    logger.debug('finding initial function name')
    function_name = regex_search(candidates, js, group=1)
    return function_name
Beispiel #30
0
def get_initial_function_name(js):
    """Find the name of the function that computes the signature.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        Whatever ``regex_search`` yields for ``group=1`` given the three
        candidate patterns below.

    """
    # Typical call site in the player script: c&&d.set("signature", EE(c));
    # Each entry is assembled from adjacent raw-string fragments.
    akamaized_pattern = (
        r'yt\.akamaized\.net/\)\s*\|\|\s*'
        r'.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent'
        r'\s*\()?(?P<sig>[a-zA-Z0-9$]+)\('
    )
    sig_fallback_pattern = r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('
    set_call_pattern = (
        r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent'
        r'\s*\()?(?P<sig>[a-zA-Z0-9$]+)\('
    )
    candidates = [akamaized_pattern, sig_fallback_pattern, set_call_pattern]
    logger.debug('finding initial function name')
    return regex_search(candidates, js, group=1)
def get_videos_from_channel(url):
    """Collect the watch urls linked from a channel's videos page.

    :param str url:
        A channel url, or a bare channel id.
    :rtype: list
    :returns:
        Absolute ``https://www.youtube.com/watch?v=...`` urls built from
        the ``href`` values found in the page, passed through
        ``uniqueify``.
    """
    id_pattern = r"(?:channel|\/)([0-9A-Za-z_-]{24}).*"
    try:
        channel_id: str = regex_search(id_pattern, url, group=1)
    # NOTE(review): only ``IndexError`` triggers this fallback - confirm
    # that is what ``regex_search`` raises when nothing matches.
    except IndexError:  # assume that url is just the id
        channel_id = url

    html = request.get(f"https://www.youtube.com/channel/{channel_id}/videos")

    watch_link = re.compile(r"href=\"(/watch\?v=[\w-]*)")
    watch_paths = uniqueify(watch_link.findall(html))

    # The captured hrefs are site-relative, so prefix the host.
    return [f"https://www.youtube.com{path}" for path in watch_paths]
def parse_function(js_func):
    """Break a JavaScript transform call into its name and argument.

    The call is expected to look like ``<object>.<name>(<var>,<digits>)``;
    the two captured parts are the function name and the numeric argument.

    :param str js_func:
        The JavaScript version of the transform function.
    :rtype: tuple
    :returns:
        two element tuple containing the function name and an argument.

    NOTE(review): the value comes straight from ``regex_search`` with
    ``groups=True``; if that is the raw capture groups, the argument is the
    text ``'15'`` rather than the int shown below - confirm.

    **Example**:

    >>> parse_function('DE.AJ(a,15)')
    ('AJ', 15)

    """
    logger.debug("parsing transform function")
    call_pattern = r"\w+\.(\w+)\(\w,(\d+)\)"
    parsed = regex_search(call_pattern, js_func, groups=True)
    return parsed
Beispiel #33
0
def parse_function(js_func):
    """Break a JavaScript transform call into its name and argument.

    The call is expected to look like ``<object>.<name>(<var>,<digits>)``;
    the two captured parts are the function name and the numeric argument.

    :param str js_func:
        The JavaScript version of the transform function.
    :rtype: tuple
    :returns:
        two element tuple containing the function name and an argument.

    NOTE(review): the value comes straight from ``regex_search`` with
    ``groups=True``; if that is the raw capture groups, the argument is the
    text ``'15'`` rather than the int shown below - confirm.

    **Example**:

    >>> parse_function('DE.AJ(a,15)')
    ('AJ', 15)

    """
    logger.debug('parsing transform function')
    name_and_argument = regex_search(
        r'\w+\.(\w+)\(\w,(\d+)\)',
        js_func,
        groups=True,
    )
    return name_and_argument
Beispiel #34
0
def get_ytplayer_config(html: str, age_restricted: bool = False) -> Any:
    """Read the YouTube player configuration out of the page html.

    The ``ytplayer_config`` is json data embedded within the html and serves
    as the primary source of obtaining the stream manifest data.

    :param str html:
        The html contents of the watch page.
    :param bool age_restricted:
        Is video age restricted. Selects the ``yt.setConfig`` pattern
        instead of the ``ytplayer.config`` one.
    :returns:
        The result of ``json.loads`` applied to the captured text.
    """
    pattern = (
        r";yt\.setConfig\(\{'PLAYER_CONFIG':\s*({.*})(,'EXPERIMENT_FLAGS'|;)"  # noqa: E501
        if age_restricted
        else r";ytplayer\.config\s*=\s*({.*?});"
    )
    config_json = regex_search(pattern, html, group=1)
    return json.loads(config_json)
Beispiel #35
0
def get_ytplayer_config(html, age_restricted=False):
    """Read the YouTube player configuration out of the page html.

    The ``ytplayer_config`` is json data embedded within the html and serves
    as the primary source of obtaining the stream manifest data.

    :param str html:
        The html contents of the watch page.
    :param bool age_restricted:
        Is video age restricted. Selects the ``yt.setConfig`` pattern
        instead of the ``ytplayer.config`` one.
    :returns:
        The result of ``json.loads`` applied to the captured text.
    """
    # Default: the regular watch-page assignment.
    pattern = r';ytplayer\.config\s*=\s*({.*?});'
    if age_restricted:
        pattern = r";yt\.setConfig\(\{'PLAYER_CONFIG':\s*({.*})(,'EXPERIMENT_FLAGS'|;)"  # noqa: E501
    return json.loads(regex_search(pattern, html, group=1))
Beispiel #36
0
def video_info_url(
    video_id: str,
    watch_url: str,
    embed_html: Optional[str],
    age_restricted: bool,
) -> str:
    """Build the ``get_video_info`` request url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str embed_html:
        The html contents of the embed page (for age restricted videos).
        Must not be ``None`` when ``age_restricted`` is true.
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    # ``OrderedDict`` keeps the parameter order consistent across
    # Python 2.7+.
    if not age_restricted:
        params = OrderedDict([
            ("video_id", video_id),
            ("el", "$el"),
            ("ps", "default"),
            ("eurl", quote(watch_url)),
            ("hl", "en_US"),
        ])
    else:
        assert embed_html is not None
        sts_pattern = r'"sts"\s*:\s*(\d+)'
        sts = regex_search(sts_pattern, embed_html, group=1)
        params = OrderedDict([
            ("video_id", video_id),
            ("eurl", eurl(video_id)),
            ("sts", sts),
        ])
    query_string = urlencode(params)
    return "https://youtube.com/get_video_info?" + query_string
Beispiel #37
0
def mime_type_codec(mime_type_codec):
    """Split a manifest ``type`` value into mime type and codecs.

    The ``type`` key of the manifest serializes the mime type and the codecs
    together; this separates them and splits the codecs on commas.

    **Example**:

    >>> mime_type_codec('audio/webm; codecs="opus"')
    ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs.

    """
    pattern = r'(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"'
    mime_type, codecs = regex_search(pattern, mime_type_codec, groups=True)
    codec_names = []
    for codec in codecs.split(','):
        # Entries may carry whitespace around the separating comma.
        codec_names.append(codec.strip())
    return mime_type, codec_names
Beispiel #38
0
def mime_type_codec(mime_type_codec):
    """Split a manifest ``type`` value into mime type and codecs.

    The ``type`` key of the manifest serializes the mime type and the codecs
    together; this separates them and splits the codecs on commas.

    **Example**:

    >>> mime_type_codec('audio/webm; codecs="opus"')
    ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs.

    """
    parts = regex_search(
        r'(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"',
        mime_type_codec,
        groups=True,
    )
    mime_type, codecs = parts
    # Entries may carry whitespace around the separating comma.
    stripped_codecs = [name.strip() for name in codecs.split(',')]
    return mime_type, stripped_codecs
Beispiel #39
0
def test_regex_search_no_match():
    # An empty string cannot satisfy ``^a$``, so the helper is expected to
    # raise instead of returning.
    pattern, text = "^a$", ""
    with pytest.raises(RegexMatchError):
        helpers.regex_search(pattern, text, group=0)
Beispiel #40
0
def test_regex_search():
    # A matching input must produce some result.
    # TODO(nficano): should check isinstance
    result = helpers.regex_search('^a$', 'a')
    assert result is not None
Beispiel #41
0
def test_regex_search_no_match():
    """``regex_search`` with ``groups=True`` raises when nothing matches."""
    pattern, haystack = '^a$', ''
    with pytest.raises(RegexMatchError):
        helpers.regex_search(pattern, haystack, groups=True)
    def load_streams(self):
        """Load the video(s) behind the entered URL and list their streams.

        The URL text is first treated as a single video.  When that raises
        ``RegexMatchError`` and the text contains ``'playlist'``, it is
        loaded as a playlist instead.  Every stream of every loaded video is
        then added to the download manager's stream tree, with audio-only
        streams (no video codec) grouped under an ``'Audio Only'`` node.

        Progress is reported through ``sig_step``, ``sig_msg``,
        ``sig_progress_status`` and ``sig_progress_total`` (0-50 while
        loading videos, 50-100 while loading streams).  Any failure while
        loading the video(s) is reported through ``sig_error``.  ``sig_done``
        is emitted on normal completion, on abort, and on such a failure.
        """
        manager = self.__download_manager

        def aborted():
            """Emit the abort notifications; return True if abort was requested."""
            if not self.__abort:
                return False
            # NOTE(review): sig_progress_status otherwise carries ints here;
            # confirm the signal accepts this string payload.
            self.sig_progress_status.emit('Aborted!')
            self.sig_done.emit(self.id)
            return True

        while manager.thread_count > 1:
            self.sig_step.emit(self.id, 'Waiting for threads to clear...')
            # Sleep between polls so the wait neither burns a CPU core nor
            # floods the receiver with queued signals.
            QThread.msleep(100)

        thread_name = QThread.currentThread().objectName()
        thread_id = int(QThread.currentThreadId())
        self.sig_step.emit(self.id, f'{thread_id}: {thread_name} thread starting...')

        manager.videos = []
        manager.streams = []
        proxies = manager.get_proxies()
        # clear() removes every top-level item, so no per-index removal loop
        # is needed (indexing while removing would skip every other item).
        manager.stream_tree.clear()
        manager.streams_to_download = {}

        url = manager.url.text()
        # The outer try also covers the playlist fallback: an exception raised
        # inside an ``except`` handler is not caught by its sibling clauses,
        # so without this nesting a playlist failure would escape unreported
        # and ``sig_done`` would never fire.
        try:
            try:
                print('get video id')
                # Raises RegexMatchError when the URL holds no video id,
                # which routes execution to the playlist fallback below.
                print(extract.video_id(url))
                self.sig_step.emit(self.id, 'Loading video')
                loaded_video = YouTube(url, proxies=proxies)
                self.sig_step.emit(self.id, f'Loaded video: {loaded_video.title}')
                self.sig_msg.emit(f'Found {loaded_video.title}')
                if aborted():
                    return
                manager.videos.append(loaded_video)
            except RegexMatchError:
                print('playlist')
                if 'playlist' not in url:
                    self.sig_error.emit('Could not determine Video '
                                        'or Playlist ID from provided URL!\n'
                                        'Please check input!')
                    self.sig_done.emit(self.id)
                    return

                # Validation only: raises RegexMatchError if no id-like token
                # follows "list=" or "/" in the URL.
                regex_search(r'(?:list=|\/)([0-9A-Za-z_-]{11}).*', url, group=1)
                playlist = Playlist(url)
                self.sig_msg.emit('Loaded playlist. Discovering videos...')
                playlist.populate_video_urls()
                self.sig_progress_status.emit(0)

                video_urls = playlist.video_urls
                url_count = len(video_urls)
                for i, video_url in enumerate(video_urls):
                    self.sig_step.emit(self.id, f'Loading video {i}')
                    if aborted():
                        return
                    # Loading videos accounts for the first half of the
                    # overall progress, hence the doubled denominator.
                    self.sig_progress_total.emit(int((i / (url_count * 2)) * 100))
                    vid = YouTube(video_url, proxies=proxies)
                    self.sig_step.emit(self.id, f'Loaded video: {vid.title}')
                    if aborted():
                        return
                    self.sig_msg.emit(f'Found {vid.title}')

                    manager.videos.append(vid)
                    self.sig_progress_status.emit(int((i / url_count) * 100))
                self.sig_progress_total.emit(50)
        except Exception as e:
            self.sig_error.emit(str(e))
            self.sig_done.emit(self.id)
            return

        self.sig_msg.emit('Loading Streams..')
        print('loading streams')
        video_count = len(manager.videos)
        for i, video in enumerate(manager.videos):
            self.sig_progress_status.emit(0)
            self.sig_step.emit(self.id, f'Loading streams for video {i}')
            if aborted():
                return
            audio_streams = QTreeWidgetItem(['Audio Only'])
            tree_item = StreamTreeWidgetItem([video.title], f'video_{i}',
                                             manager, video, None)
            manager.streams = video.streams.all()
            streams = manager.streams
            for x, stream in enumerate(streams):
                self.sig_step.emit(self.id, f'Loading stream {x}')
                if aborted():
                    return
                self.sig_msg.emit(f'Video {i + 1}/{video_count}: '
                                  f'Loading Stream ITAG ID: {stream.itag}')
                if stream.video_codec is None:
                    # No video codec: list the stream under "Audio Only".
                    label = (
                        f'Codec: {stream.audio_codec}, '
                        f'ABR: {stream.abr}, '
                        f'File Type: {stream.mime_type.split("/")[1]}, '
                        f'Size: {stream.filesize // 1024} KB'
                    )
                    parent_item = audio_streams
                else:
                    label = (
                        f'Res: {stream.resolution}, FPS: {stream.fps}, '
                        f' Video Codec: {stream.video_codec}, Audio Codec: {stream.audio_codec}, '
                        f'File Type: {stream.mime_type.split("/")[1]}, '
                        f'Size: {stream.filesize // 1024} KB'
                    )
                    parent_item = tree_item
                stream_item = StreamTreeWidgetItem([label], f'video_{i}_stream{x}',
                                                   manager, video, stream)
                self.sig_step.emit(self.id, f'Loaded stream {x}')
                if aborted():
                    return
                parent_item.addChild(stream_item)
                stream_item.setCheckState(0, Qt.Unchecked)
                self.sig_progress_status.emit(int(((x + 1) / len(streams)) * 100))
            tree_item.addChild(audio_streams)
            self.sig_step.emit(self.id, f'Adding video {i} to tree')
            if aborted():
                return
            manager.stream_tree.addTopLevelItem(tree_item)
            self.sig_progress_status.emit(100)
            # Stream loading covers the second half (50-100) of the total.
            self.sig_progress_total.emit(int(((i + 1) / (video_count * 2)) * 100) + 50)
        self.sig_msg.emit('Streams Loaded!')
        self.sig_done.emit(self.id)
Beispiel #43
0
def test_regex_search():
    """Requesting group 0 from ``regex_search`` returns the matched text."""
    matched = helpers.regex_search("^a$", "a", group=0)
    assert matched == "a"