def parse_links(self):
    """Parse the video links from the page source.

    Extracts and returns the /watch?v= part of each video link's href.
    It's an alternative to using BeautifulSoup.
    """
    url = self.construct_playlist_url()
    req = request.get(url)

    # split the page source by line and process each line
    content = [x for x in req.split('\n') if 'pl-video-title-link' in x]
    link_list = [x.split('href="', 1)[1].split('&', 1)[0] for x in content]

    # The above only returns 100 or fewer links.
    # Simulate a browser request for the "load more" link to fetch the rest.
    load_more_url = self._load_more_url(req)
    while len(load_more_url):  # a load-more URL was found
        logger.debug('load more url: %s' % load_more_url)
        req = request.get(load_more_url)
        load_more = json.loads(req)
        videos = re.findall(
            r'href=\"(/watch\?v=[\w-]*)',
            load_more['content_html'],
        )
        # remove duplicates
        link_list.extend(list(OrderedDict.fromkeys(videos)))
        load_more_url = self._load_more_url(
            load_more['load_more_widget_html'],
        )

    return link_list
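# A minimal usage sketch for parse_links, assuming it is a method of a
# Playlist-style class constructed from a playlist URL; the Playlist import
# and the placeholder list ID below are assumptions, not taken from the source.
from pytube import Playlist

playlist = Playlist('https://www.youtube.com/playlist?list=PLxxxxxxxx')
# parse_links returns relative paths such as '/watch?v=abc123', so join them
# with the YouTube host to get full video URLs.
watch_urls = [
    'https://www.youtube.com{}'.format(path) for path in playlist.parse_links()
]
print(len(watch_urls), 'videos found')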
def prefetch(self):
    """Eagerly download all necessary data.

    Eagerly executes all necessary network requests so all other
    operations don't need to make calls outside of the interpreter
    which blocks for long periods of time.

    :rtype: None
    """
    self.watch_html = request.get(url=self.watch_url)
    if '<img class="icon meh" src="/yts/img' not in self.watch_html:
        raise VideoUnavailable('This video is unavailable.')
    self.embed_html = request.get(url=self.embed_url)
    self.age_restricted = extract.is_age_restricted(self.watch_html)
    self.vid_info_url = extract.video_info_url(
        video_id=self.video_id,
        watch_url=self.watch_url,
        watch_html=self.watch_html,
        embed_html=self.embed_html,
        age_restricted=self.age_restricted,
    )
    self.vid_info = request.get(self.vid_info_url)
    if not self.age_restricted:
        self.js_url = extract.js_url(self.watch_html, self.age_restricted)
        self.js = request.get(self.js_url)
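# A hedged sketch of how prefetch is assumed to be used: the surrounding class
# is assumed to call it during construction, so a caller pays the network cost
# once and then works with the cached attributes (watch_html, vid_info, js).
# The YouTube constructor and the VideoUnavailable import path are assumptions.
from pytube import YouTube
from pytube.exceptions import VideoUnavailable  # assumed import location

try:
    yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')
except VideoUnavailable:
    print('video is unavailable; nothing was fetched beyond the watch page')
else:
    # later attribute access reuses the prefetched data, no extra requests
    print(len(yt.watch_html), 'bytes of watch page HTML cached')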
@property
def filesize(self):
    """File size of the media stream in bytes.

    :rtype: int
    :returns:
        Filesize (in bytes) of the stream.
    """
    if self._filesize is None:
        headers = request.get(self.url, headers=True)
        self._filesize = int(headers['content-length'])
    return self._filesize
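# A small sketch, assuming a Stream instance is already in hand: because
# filesize only reads the content-length response header (and caches it), it
# can report the expected download size before any media bytes are fetched.
# The streams accessor below is an assumption about the surrounding library.
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')
stream = yt.streams.first()  # assumed accessor; any Stream instance works
size_mb = stream.filesize / (1024.0 * 1024.0)
print('about to fetch {:.1f} MB'.format(size_mb))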
def download(self, output_path=None, filename=None, filename_prefix=None):
    """Write the media stream to disk.

    :param output_path:
        (optional) Output path for writing media file. If one is not
        specified, defaults to the current working directory.
    :type output_path: str or None
    :param filename:
        (optional) Output filename (stem only) for writing media file.
        If one is not specified, the default filename is used.
    :type filename: str or None
    :param filename_prefix:
        (optional) A string that will be prepended to the filename.
        For example a number in a playlist or the name of a series.
        If one is not specified, nothing will be prepended.
        This is separate from filename so you can use the default
        filename but still add a prefix.
    :type filename_prefix: str or None

    :rtype: str
    """
    output_path = output_path or os.getcwd()
    if filename:
        safe = safe_filename(filename)
        filename = '{filename}.{s.subtype}'.format(filename=safe, s=self)
    filename = filename or self.default_filename
    if filename_prefix:
        filename = '{prefix}{filename}'.format(
            prefix=safe_filename(filename_prefix),
            filename=filename,
        )

    # file path
    fp = os.path.join(output_path, filename)
    bytes_remaining = self.filesize
    logger.debug(
        'downloading (%s total bytes) file to %s',
        self.filesize, fp,
    )

    with open(fp, 'wb') as fh:
        for chunk in request.get(self.url, streaming=True):
            # reduce the (bytes) remainder by the length of the chunk.
            bytes_remaining -= len(chunk)

            # send to the on_progress callback.
            self.on_progress(chunk, fh, bytes_remaining)
        self.on_complete(fh)
    return fp
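# A hedged usage sketch for download. The YouTube class and the streams query
# chain (filter/first) are assumptions about the surrounding library; the
# keyword arguments match the signature defined above.
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')
stream = yt.streams.filter(progressive=True).first()  # assumed query API
saved_path = stream.download(
    output_path='/tmp/videos',   # defaults to os.getcwd() when omitted
    filename_prefix='001 - ',    # prepended to the default filename
)
print('written to', saved_path)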
def init(self):
    """Descramble the stream data and build Stream instances.

    The initialization process takes advantage of Python's
    "call-by-reference evaluation," which allows dictionary transforms
    to be applied in-place, instead of holding references to mutations
    at each interstitial step.

    :rtype: None
    """
    logger.info('init started')

    self.vid_info = {k: v for k, v in parse_qsl(self.vid_info)}
    if self.age_restricted:
        self.player_config_args = self.vid_info
    else:
        self.player_config_args = extract.get_ytplayer_config(
            self.watch_html,
        )['args']

    self.vid_descr = extract.get_vid_descr(self.watch_html)

    # https://github.com/nficano/pytube/issues/165
    stream_maps = ['url_encoded_fmt_stream_map']
    if 'adaptive_fmts' in self.player_config_args:
        stream_maps.append('adaptive_fmts')

    # unscramble the progressive and adaptive stream manifests.
    for fmt in stream_maps:
        if not self.age_restricted and fmt in self.vid_info:
            mixins.apply_descrambler(self.vid_info, fmt)
        mixins.apply_descrambler(self.player_config_args, fmt)

        try:
            mixins.apply_signature(self.player_config_args, fmt, self.js)
        except TypeError:
            self.js_url = extract.js_url(
                self.embed_html, self.age_restricted,
            )
            self.js = request.get(self.js_url)
            mixins.apply_signature(self.player_config_args, fmt, self.js)

        # build instances of :class:`Stream <Stream>`
        self.initialize_stream_objects(fmt)

    # load the player_response object (contains subtitle information)
    apply_mixin(self.player_config_args, 'player_response', json.loads)

    self.initialize_caption_objects()
    logger.info('init finished successfully')
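# A minimal illustration of the in-place transform pattern the init docstring
# refers to: helpers receive the dict itself, mutate it, and return nothing,
# so no intermediate copies are kept between steps. This apply_mixin is a
# simplified stand-in inferred from how it is called above (dict, key,
# callable), not the library's actual implementation.
import json


def apply_mixin(dct, key, func, *args, **kwargs):
    """Replace dct[key] with func(dct[key], ...) in place."""
    dct[key] = func(dct[key], *args, **kwargs)


config = {'player_response': '{"captions": {}}'}
apply_mixin(config, 'player_response', json.loads)
assert isinstance(config['player_response'], dict)  # decoded in place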
def title(self):
    """Return the playlist title (name)."""
    try:
        url = self.construct_playlist_url()
        req = request.get(url)
        open_tag = "<title>"
        end_tag = "</title>"
        pattern = re.compile(open_tag + "(.+?)" + end_tag)
        matchresult = pattern.search(req).group()
        matchresult = matchresult.replace(open_tag, "")
        matchresult = matchresult.replace(end_tag, "")
        matchresult = matchresult.replace("- YouTube", "")
        matchresult = matchresult.strip()
        return matchresult
    except Exception as e:
        logger.debug(e)
        return None
def stream_to_buffer(self):
    """Write the media stream to an in-memory buffer.

    :rtype: io.BytesIO
    """
    buffer = io.BytesIO()
    bytes_remaining = self.filesize
    logger.debug(
        'downloading (%s total bytes) file to BytesIO buffer',
        self.filesize,
    )

    for chunk in request.get(self.url, streaming=True):
        # reduce the (bytes) remainder by the length of the chunk.
        bytes_remaining -= len(chunk)

        # send to the on_progress callback.
        self.on_progress(chunk, buffer, bytes_remaining)

    self.on_complete(buffer)
    return buffer
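# A hedged usage sketch for stream_to_buffer: the entire media payload ends up
# in an io.BytesIO object, useful when the bytes will be post-processed or
# re-uploaded without touching disk. The streams accessor is an assumption.
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')
stream = yt.streams.first()  # assumed accessor; any Stream instance works
buffer = stream.stream_to_buffer()
buffer.seek(0)  # rewind before reading the payload back out
print(len(buffer.read()), 'bytes held in memory')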
def xml_captions(self):
    """Download the xml caption tracks."""
    return request.get(self.url)
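# A hedged usage sketch for xml_captions. The caption lookup helper
# (get_by_language_code) and treating xml_captions as a property are
# assumptions about the surrounding library, not confirmed by this snippet.
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')
caption = yt.captions.get_by_language_code('en')  # assumed query helper
if caption is not None:
    with open('captions.xml', 'w') as fh:
        fh.write(caption.xml_captions)  # raw XML string returned above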