def test_metadata_from_initial_data(stream_dict):
    """Metadata built from the extracted initial data exposes both the raw
    and the parsed entries."""
    raw = extract.initial_data(stream_dict)
    ytmd = extract.metadata(json.loads(raw))
    assert ytmd.raw_metadata
    assert 'contents' in ytmd.raw_metadata[0]
    assert ytmd.metadata
    assert 'Song' in ytmd.metadata[0]
def prefetch(self) -> None:
    """Eagerly download all necessary data.

    Eagerly executes all necessary network requests so all other
    operations don't need to make calls outside of the interpreter
    which blocks for long periods of time.

    :rtype: None
    """
    self.watch_html = request.get(url=self.watch_url)
    # Fail fast before making further requests if the video is unavailable.
    self.check_availability()
    self.age_restricted = extract.is_age_restricted(self.watch_html)
    if self.age_restricted:
        # Age-gated videos are accessed through the embed page instead of
        # the regular watch page.
        if not self.embed_html:
            self.embed_html = request.get(url=self.embed_url)
        self.vid_info_url = extract.video_info_url_age_restricted(
            self.video_id, self.watch_url
        )
    else:
        self.vid_info_url = extract.video_info_url(
            video_id=self.video_id, watch_url=self.watch_url
        )
    self.initial_data = extract.initial_data(self.watch_html)
    self.vid_info_raw = request.get(self.vid_info_url)
    if not self.age_restricted:
        # The player javascript is only resolved from the regular watch
        # page, so it is skipped for the age-restricted (embed) path.
        self.js_url = extract.js_url(self.watch_html)
        self.js = request.get(self.js_url)
def _paginate(
    self, until_watch_id: Optional[str] = None
) -> Iterable[List[str]]:
    """Parse the video links from the page source, yields the /watch?v=
    part from video link

    :param until_watch_id Optional[str]: YouTube Video watch id until
        which the playlist should be read.

    :rtype: Iterable[List[str]]
    :returns: Iterable of lists of YouTube watch ids
    """
    # NOTE: the original assigned ``req = self.html`` here, but that value
    # was never read before being overwritten in the loop below — dead
    # store removed.
    videos_urls, continuation = self._extract_videos(
        json.dumps(extract.initial_data(self.html))
    )
    if until_watch_id:
        try:
            trim_index = videos_urls.index(f"/watch?v={until_watch_id}")
            yield videos_urls[:trim_index]
            return
        except ValueError:
            pass
    yield videos_urls

    # Extraction from a playlist only returns 100 videos at a time
    # if self._extract_videos returns a continuation there are more
    # than 100 songs inside a playlist, so we need to add further requests
    # to gather all of them
    if continuation:
        load_more_url, headers = self._build_continuation_url(continuation)
    else:
        load_more_url, headers = None, None

    while load_more_url and headers:  # there is an url found
        logger.debug("load more url: %s", load_more_url)
        # requesting the next page of videos with the url generated from the
        # previous page
        req = request.get(load_more_url, extra_headers=headers)
        # extract up to 100 songs from the page loaded
        # returns another continuation if more videos are available
        videos_urls, continuation = self._extract_videos(req)
        if until_watch_id:
            try:
                trim_index = videos_urls.index(f"/watch?v={until_watch_id}")
                yield videos_urls[:trim_index]
                return
            except ValueError:
                pass
        yield videos_urls

        if continuation:
            load_more_url, headers = self._build_continuation_url(
                continuation
            )
        else:
            load_more_url, headers = None, None
def initial_data(self):
    """Extract the initial data from the playlist page html.

    The result is computed once and cached on the instance; subsequent
    calls return the cached value.

    :rtype: dict
    """
    if not self._initial_data:
        self._initial_data = extract.initial_data(self.html)
    return self._initial_data
def prefetch(self) -> None:
    """Eagerly download all necessary data.

    Eagerly executes all necessary network requests so all other
    operations don't need to make calls outside of the interpreter
    which blocks for long periods of time.

    :rtype: None
    """
    self.watch_html = request.get(url=self.watch_url)
    # Surface the specific unavailability reason before doing any further
    # network work.
    if self.watch_html is None:
        raise VideoUnavailable(video_id=self.video_id)
    self.age_restricted = extract.is_age_restricted(self.watch_html)
    if extract.is_private(self.watch_html):
        raise VideoPrivate(video_id=self.video_id)
    if not extract.recording_available(self.watch_html):
        raise RecordingUnavailable(video_id=self.video_id)
    if self.age_restricted:
        # Age-gated videos are accessed through the embed page instead of
        # the regular watch page.
        if not self.embed_html:
            self.embed_html = request.get(url=self.embed_url)
        self.vid_info_url = extract.video_info_url_age_restricted(
            self.video_id, self.watch_url)
    else:
        self.vid_info_url = extract.video_info_url(
            video_id=self.video_id, watch_url=self.watch_url)
    # Keep both the raw JSON string and the parsed dict around.
    self.initial_data_raw = extract.initial_data(self.watch_html)
    self.initial_data = json.loads(self.initial_data_raw)
    self.vid_info_raw = request.get(self.vid_info_url)
    if not self.age_restricted:
        # The player javascript is only resolved from the regular watch
        # page, so it is skipped for the age-restricted (embed) path.
        self.js_url = extract.js_url(self.watch_html)
        self.js = request.get(self.js_url)
def initial_data(self):
    """Return the initial data parsed from the watch page html,
    computing and caching it on first access."""
    if not self._initial_data:
        self._initial_data = extract.initial_data(self.watch_html)
    return self._initial_data
def test_initial_data(stream_dict):
    """The extracted initial data contains the top-level contents key."""
    data = extract.initial_data(stream_dict)
    assert 'contents' in data
def test_initial_data_missing():
    """Extraction from a page with no initial data raises RegexMatchError."""
    with pytest.raises(RegexMatchError):
        extract.initial_data('')
def test_initial_data_missing():
    """Extraction from a page with no initial data falls back to "{}"."""
    result = extract.initial_data('')
    assert result == "{}"
async def initial_data(self):
    """Return the initial data parsed from the page html, awaiting the
    html download only on a cache miss."""
    if not self._initial_data:
        self._initial_data = extract.initial_data(await self.html)
    return self._initial_data