Example #1
    def prefetch(self) -> None:
        """Eagerly download all necessary data.

        Eagerly executes all necessary network requests so all other
        operations don't need to make calls outside of the interpreter,
        which would block for long periods of time.

        :rtype: None
        """
        self.watch_html = request.get(url=self.watch_url,
                                      request_headers=self.request_headers)
        if self.watch_html is None:
            raise VideoUnavailable(video_id=self.video_id)
        self.age_restricted = extract.is_age_restricted(self.watch_html)

        if not self.age_restricted and "This video is private" in self.watch_html:
            raise VideoUnavailable(video_id=self.video_id)

        if self.age_restricted:
            if not self.embed_html:
                self.embed_html = request.get(
                    url=self.embed_url, request_headers=self.request_headers)
            self.vid_info_url = extract.video_info_url_age_restricted(
                self.video_id, self.watch_url)
        else:
            self.vid_info_url = extract.video_info_url(
                video_id=self.video_id, watch_url=self.watch_url)

        self.vid_info_raw = request.get(self.vid_info_url,
                                        request_headers=self.request_headers)
        if not self.age_restricted:
            self.js_url = extract.js_url(self.watch_html)
            self.js = request.get(self.js_url,
                                  request_headers=self.request_headers)
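
A minimal usage sketch of the prefetch() method above, assuming the surrounding class is importable as YouTube and that the constructor does not already call prefetch() itself (both assumptions; import paths below are hypothetical):

from pytube import YouTube                      # hypothetical import path
from pytube.exceptions import VideoUnavailable  # hypothetical import path

yt = YouTube("https://www.youtube.com/watch?v=9bZkp7q19f0")
try:
    yt.prefetch()  # perform every network request up front
except VideoUnavailable:
    print("video is private or otherwise unavailable")
else:
    # later operations can read the cached attributes without blocking on I/O
    assert yt.watch_html is not None
    assert yt.vid_info_raw is not None
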
Example #2
    def _paginate(self, until_watch_id: Optional[str] = None) -> Iterable[List[str]]:
        """Parse the video links from the page source, yielding the
        /watch?v= part of each video link.
        """
        req = self.html
        videos_urls = self._extract_videos(req)
        if until_watch_id:
            try:
                trim_index = videos_urls.index(f"/watch?v={until_watch_id}")
                yield videos_urls[:trim_index]
                return
            except ValueError:
                pass
        yield videos_urls

        # The above only returns 100 or fewer links
        # Simulating a browser request for the load more link
        load_more_url = self._find_load_more_url(req)

        while load_more_url:  # as long as a "load more" URL is found
            logger.debug("load more url: %s", load_more_url)
            req = request.get(load_more_url)
            load_more = json.loads(req)
            try:
                html = load_more["content_html"]
            except KeyError:
                logger.debug("Could not find content_html")
                return
            videos_urls = self._extract_videos(html)
            if until_watch_id:
                try:
                    trim_index = videos_urls.index(f"/watch?v={until_watch_id}")
                    yield videos_urls[:trim_index]
                    return
                except ValueError:
                    pass
            yield videos_urls

            load_more_url = self._find_load_more_url(
                load_more["load_more_widget_html"],
            )

        return
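
A hedged sketch of how a caller might consume the generator above, flattening the per-page batches (100 links or fewer each) into a single list of /watch?v= paths; the helper name and example watch id are illustrative:

def collect_watch_paths(playlist, until_watch_id=None):
    """Flatten the batches yielded by _paginate into one list of paths."""
    paths = []
    for batch in playlist._paginate(until_watch_id=until_watch_id):
        # each batch is a list like ["/watch?v=abc123", ...]
        paths.extend(batch)
    return paths

# stop collecting once a known video id is reached
# watch_paths = collect_watch_paths(playlist, until_watch_id="9bZkp7q19f0")
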
Example #3
    def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
        if proxies:
            install_proxy(proxies)

        try:
            self.playlist_id: str = parse_qs(url.split("?")[1])["list"][0]
        except IndexError:  # assume that url is just the id
            self.playlist_id = url

        self.playlist_url = f"https://www.youtube.com/playlist?list={self.playlist_id}"
        self.html = request.get(self.playlist_url)

        # Needs testing with non-English
        self.last_update: Optional[date] = None
        date_match = re.search(
            r"<li>Last updated on (\w{3}) (\d{1,2}), (\d{4})</li>", self.html
        )
        if date_match:
            month, day, year = date_match.groups()
            self.last_update = datetime.strptime(
                f"{month} {day:0>2} {year}", "%b %d %Y"
            ).date()

        self._video_regex = re.compile(r"href=\"(/watch\?v=[\w-]*)")
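
For reference, a small sketch of what the playlist-id parsing in the constructor above does with a full URL versus a bare id (the URL and id below are illustrative):

from urllib.parse import parse_qs

url = "https://www.youtube.com/playlist?list=PLillustrative123"
query = url.split("?")[1]      # "list=PLillustrative123"
parse_qs(query)["list"][0]     # -> "PLillustrative123"

# A bare id contains no "?", so url.split("?")[1] raises IndexError and the
# except branch keeps the input itself as the playlist id.
"PLillustrative123".split("?")  # -> ["PLillustrative123"]; index 1 is missing
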
Example #4
    def descramble(self) -> None:
        """Descramble the stream data and build Stream instances.

        The initialization process takes advantage of Python's
        call-by-object-reference ("call by sharing") semantics, which allows
        dictionary transforms to be applied in place instead of holding
        references to mutations at each interstitial step.

        :rtype: None

        """
        logger.info("init started")

        self.vid_info = dict(parse_qsl(self.vid_info_raw))
        if self.age_restricted:
            self.player_config_args = self.vid_info
        else:
            assert self.watch_html is not None
            self.player_config_args = get_ytplayer_config(
                self.watch_html)["args"]

            # Fix for KeyError: 'title' issue #434
            if "title" not in self.player_config_args:  # type: ignore
                i_start = self.watch_html.lower().index("<title>") + len(
                    "<title>")
                i_end = self.watch_html.lower().index("</title>")
                title = self.watch_html[i_start:i_end].strip()
                index = title.lower().rfind(" - youtube")
                title = title[:index] if index > 0 else title
                self.player_config_args["title"] = unescape(title)

        # https://github.com/nficano/youpy/issues/165
        stream_maps = ["url_encoded_fmt_stream_map"]
        if "adaptive_fmts" in self.player_config_args:
            stream_maps.append("adaptive_fmts")

        # unscramble the progressive and adaptive stream manifests.
        for fmt in stream_maps:
            if not self.age_restricted and fmt in self.vid_info:
                apply_descrambler(self.vid_info, fmt)
            apply_descrambler(self.player_config_args, fmt)

            if not self.js:
                if not self.embed_html:
                    self.embed_html = request.get(url=self.embed_url)
                self.js_url = extract.js_url(self.embed_html)
                self.js = request.get(self.js_url)

            apply_signature(self.player_config_args, fmt, self.js)

            # build instances of :class:`Stream <Stream>`
            self.initialize_stream_objects(fmt)

        # load the player_response object (contains subtitle information)
        self.player_response = json.loads(
            self.player_config_args["player_response"])
        del self.player_config_args["player_response"]
        self.stream_progress_state.title = self.title
        self.stream_progress_state.duration = self.length

        logger.info("init finished successfully")
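
A small illustration of the in-place dictionary mutation the docstring above refers to: helpers such as apply_descrambler receive a reference to the dict, so their changes are visible to the caller without returning anything. The transform below is illustrative only, not the real descrambling logic:

def apply_transform(stream_data: dict, key: str) -> None:
    # mutate the caller's dict in place; nothing is returned
    stream_data[key] = [chunk.strip() for chunk in stream_data[key].split(",")]

player_config_args = {"url_encoded_fmt_stream_map": "a, b, c"}
apply_transform(player_config_args, "url_encoded_fmt_stream_map")
print(player_config_args)  # {'url_encoded_fmt_stream_map': ['a', 'b', 'c']}
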
Example #5
def test_get_non_http():
    with pytest.raises(ValueError):
        request.get("file://bad", None)
Example #6
def test_get(mock_urlopen):
    response = mock.Mock()
    response.read.return_value = "<html></html>".encode("utf-8")
    mock_urlopen.return_value = response
    result = request.get("http://fakeassurl.gov", None)
    assert result == "<html></html>"
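
The mock_urlopen argument is not defined in the snippet above; one plausible way to provide it is a pytest fixture that patches the urlopen call used by the request module. The patch target below is an assumption and must match wherever request.get resolves urlopen in this codebase:

from unittest import mock

import pytest


@pytest.fixture
def mock_urlopen():
    # the dotted path is a guess; point it at this codebase's request module
    with mock.patch("pytube.request.urlopen") as patched:
        yield patched
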
Example #7
    def xml_captions(self) -> str:
        """Download the xml caption tracks."""
        return request.get(self.url, request_headers=self.request_headers)