def youtube_get_new_endpoint(vid):
    url = WATCH_ENDPOINT + vid
    r = util.urlopen(url)
    if not r.ok:
        raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))

    ipr = get_ipr(r.text)
    if ipr is None:
        try:
            url = get_gdpr_consent_url(r.text)
        except YouTubeError as e:
            raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found and %s'
                               % (url, str(e)))
        r = util.urlopen(url)
        if not r.ok:
            raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))

        ipr = get_ipr(r.text)
        if ipr is None:
            raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found' % url)

    return None, ipr.group(1)
def get_show(self):
    content_media = util.urlopen(
        'https://api-v4.audionow.de/api/v4/media/{}.json'.format(self.uuid)).json()

    tagged_episodes = []
    page_number = 1
    while True:
        content_episodes = util.urlopen(
            f'https://api-v4.audionow.de/api/v4/podcast/{self.uuid}/episodes.json?page={page_number}'
        ).json()
        for episode in content_episodes['data']:
            tagged_episodes.append(Episode(
                episode["mediaURL"],
                episode["description"],
                episode["title"],
                episode["uid"],
                datetime.datetime.fromisoformat(episode["publicationDate"]).timestamp(),
                episode["duration"],
                episode["fileSize"]))
        if content_episodes['meta']['pagination']["total_pages"] <= page_number:
            break
        page_number += 1

    image_resolution = max(map(int, content_media["imageInfo"]["variantSourceWidths"]))
    image_url = content_media['imageInfo']["optimizedImageUrls"][str(image_resolution)]
    return Show(tagged_episodes, content_media["description"],
                content_media["title"], image_url)
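
# Sketch of the pagination pattern used in get_show() above, isolated with the
# requests library so it can run standalone. The endpoint matches the code
# above; the UUID passed in would be a real AudioNow podcast UUID.
import requests

def fetch_all_episodes(uuid):
    episodes = []
    page_number = 1
    while True:
        data = requests.get(
            f'https://api-v4.audionow.de/api/v4/podcast/{uuid}/episodes.json',
            params={'page': page_number}, timeout=10).json()
        episodes.extend(data['data'])
        if data['meta']['pagination']['total_pages'] <= page_number:
            break
        page_number += 1
    return episodes
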
def get_channels_for_user(username, api_key_v3):
    # already a channel ID: return videos.xml.
    # Can't rely on automatic discovery, see #371
    if username.startswith('UC'):
        try:
            url = '{0}?channel_id={1}'.format(CHANNEL_VIDEOS_XML, username)
            stream = util.urlopen(url)
            return [url]
        except urllib.error.HTTPError as e:
            logger.debug("get_channels_for_user(%s) not a channel id (got %i response code)",
                         username, e.code)
        except:
            logger.error("get_channels_for_user(%s) not a channel id (got unexpected exception)",
                         username)

    # try username to channel ID conversion
    stream = util.urlopen('{0}/channels?forUsername={1}&part=id&key={2}'.format(
        V3_API_ENDPOINT, username, api_key_v3))
    data = json.load(stream)
    return ['{0}?channel_id={1}'.format(CHANNEL_VIDEOS_XML, item['id'])
            for item in data['items']]
def get_total_time(episode):
    try:
        vid = get_youtube_id(episode.url)
        if vid is None:
            return 0

        url = WATCH_ENDPOINT + vid
        r = util.urlopen(url)
        if not r.ok:
            return 0

        ipr = get_ipr(r.text)
        if ipr is None:
            url = get_gdpr_consent_url(r.text)
            r = util.urlopen(url)
            if not r.ok:
                return 0

            ipr = get_ipr(r.text)
            if ipr is None:
                return 0

        player_response = json.loads(ipr.group(1))
        return int(player_response['videoDetails']['lengthSeconds'])  # 0 if live
    except:
        return 0
def __init__(self, url, category):
    self.url = url
    self.category = category

    # TODO: Use proper caching of contents with support for
    # conditional GETs (If-Modified-Since, ETag, ...)
    self.data = minidom.parse(util.urlopen(url))
    self.playlist = self.data.getElementsByTagName('playlist')[0]
def opendata(url, stream):
    fp = util.urlopen(url)
    data = fp.read(1024 * 10)
    while data:
        stream.write(data)
        data = fp.read(1024 * 10)
    stream.close()
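
# Minimal usage sketch for opendata(): stream a remote file to disk in 10 KiB
# chunks. The URL and output filename are placeholders for the example.
with open('download.bin', 'wb') as out:
    opendata('https://example.com/file.bin', out)
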
def get_real_download_url(url):
    quality = 'sd'
    codecs = 'H264,VP8,VP6'

    video_id = get_vimeo_id(url)

    if video_id is None:
        return url

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read()
    data_config_frag = DATA_CONFIG_RE.search(web_data)

    if data_config_frag is None:
        raise VimeoError('Cannot get data config from Vimeo')

    data_config_url = data_config_frag.group(1).replace('&amp;', '&')

    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in data_config['request']['files'].values():
            if not isinstance(fileinfo, dict):
                continue

            for fileformat, keys in fileinfo.items():
                if not isinstance(keys, dict):
                    continue

                yield (fileformat, keys['url'])

    for quality, url in get_urls(data_config_url):
        return url
def get_data_from_url(self, url):
    try:
        response = util.urlopen(url).read()
    except Exception as e:
        logger.warn("subtitle url returned error %s", e)
        return ''
    return response
def get_real_download_url(url):
    video_id = get_escapist_id(url)
    if video_id is None:
        return url

    web_data = get_escapist_web(video_id)

    data_config_frag = DATA_CONFIG_RE.search(web_data)
    data_config_url = get_escapist_config_url(data_config_frag.group(1))

    if data_config_url is None:
        raise EscapistError('Cannot parse configuration from the site')

    logger.debug('Config URL: %s', data_config_url)

    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')

    # TODO: This second argument should get a real name
    real_url = get_escapist_real_url(data_config_data, data_config_frag.group(1))

    if real_url is None:
        raise EscapistError('Cannot get MP4 URL from The Escapist')
    elif "sales-marketing/" in real_url:
        raise EscapistError('Oops, seems The Escapist blocked this IP. Wait a few days/weeks to get it unblocked')
    else:
        return real_url
def get_real_download_url(url):
    video_id = get_escapist_id(url)
    if video_id is None:
        return url

    web_data = get_escapist_web(video_id)

    data_config_frag = DATA_CONFIG_RE.search(web_data)
    if data_config_frag is None:
        raise EscapistError('Cannot get flashvars URL from The Escapist')
    data_config_url = data_config_frag.group(1)

    logger.debug('Config URL: %s', data_config_url)

    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
    data_config_data_frag = DATA_CONFIG_DATA_RE.search(data_config_data)
    if data_config_data_frag is None:
        raise EscapistError('Cannot get configuration JS from The Escapist')
    real_url = data_config_data_frag.group(0)

    if real_url is None:
        raise EscapistError('Cannot get MP4 URL from The Escapist')
    elif "-ad-rotation/" in real_url:
        raise EscapistError('Oops, seems The Escapist blocked this IP. Wait a few days/weeks to get it unblocked')
    else:
        return real_url
def __init__(self, channel, max_episodes):
    url = channel.authenticate_url(channel.url)

    logger.info('Parsing via podcastparser: %s', url)

    headers = {}
    if channel.http_etag:
        headers['If-None-Match'] = channel.http_etag
    if channel.http_last_modified:
        headers['If-Modified-Since'] = channel.http_last_modified

    try:
        stream = util.urlopen(url, headers)
        self.status = 200
        info = stream.info()
        self.etag = info.get('etag')
        self.modified = info.get('last-modified')
        self.parsed = podcastparser.parse(url, stream, max_episodes)
    except urllib.error.HTTPError as error:
        self.status = error.code
        if error.code == 304:
            logger.info('Not modified')
        else:
            logger.warn('Feed update failed: %s', error)
            raise error

        self.etag = None
        self.modified = None
        self.parsed = None
def vimeo_resolve_download_url(episode, config):
    url = episode.url
    video_id = get_vimeo_id(url)

    if video_id is None:
        return None

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read().decode('utf-8')
    data_config_frag = DATA_CONFIG_RE.search(web_data)

    if data_config_frag is None:
        raise VimeoError('Cannot get data config from Vimeo')

    data_config_url = data_config_frag.group(1).replace('&amp;', '&')

    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in data_config['request']['files'].values():
            if not isinstance(fileinfo, dict):
                continue

            for fileformat, keys in fileinfo.items():
                if not isinstance(keys, dict):
                    continue

                yield (fileformat, keys['url'])

    for quality, url in get_urls(data_config_url):
        return url
def get_real_download_url(url):
    quality = 'sd'
    codecs = 'H264,VP8,VP6'

    video_id = get_vimeo_id(url)

    if video_id is None:
        return url

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read()
    sig_pair = SIGNATURE_RE.search(web_data)

    if sig_pair is None:
        raise VimeoError('Cannot get signature pair from Vimeo')

    signature, timestamp = sig_pair.groups()
    params = '&'.join('%s=%s' % i for i in [
        ('clip_id', video_id),
        ('sig', signature),
        ('time', timestamp),
        ('quality', quality),
        ('codecs', codecs),
        ('type', 'moogaloop_local'),
        ('embed_location', ''),
    ])
    player_url = 'http://player.vimeo.com/play_redirect?%s' % params

    return player_url
def get_channel_id_url(url, feed_data=None):
    if 'youtube.com' in url:
        try:
            if feed_data is None:
                r = util.urlopen(url)
                if not r.ok:
                    raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
            else:
                r = feed_data
            # video page may contain corrupt HTML/XML, search for tag to avoid exception
            m = re.search(r'<meta itemprop="channelId" content="([^"]+)">', r.text)
            if m:
                channel_id = m.group(1)
            else:
                raw_xml_data = io.BytesIO(r.content)
                xml_data = xml.etree.ElementTree.parse(raw_xml_data)
                channel_id = xml_data.find(
                    "{http://www.youtube.com/xml/schemas/2015}channelId").text
            channel_url = 'https://www.youtube.com/channel/{}'.format(channel_id)
            return channel_url
        except Exception:
            logger.warning('Could not retrieve youtube channel id.', exc_info=True)

    raise Exception('Could not retrieve youtube channel id.')
def _handle_paged_feed(self, max_episodes):
    page = 2
    remaining_episodes = max_episodes - len(self.parsed['episodes'])
    while ('paged_feed_next' in self.parsed
            and page < self.PAGED_FEED_MAX_PAGES
            and remaining_episodes > 0):
        # Take the next page from the paged feed
        url = self.parsed['paged_feed_next']
        del self.parsed['paged_feed_next']

        if not url:
            break

        try:
            logger.debug('Downloading page %d from %s', page, url)
            stream = util.urlopen(url)
            parsed = podcastparser.parse(url, stream, remaining_episodes)
            added_episodes = len(parsed['episodes'])
            remaining_episodes -= added_episodes
            logger.debug('Page %d contains %d additional episodes', page, added_episodes)
            self.parsed['episodes'].extend(parsed['episodes'])

            # Next iteration if we still have a next page
            if 'paged_feed_next' in parsed:
                self.parsed['paged_feed_next'] = parsed['paged_feed_next']
        except Exception as e:
            logger.warn('Error while fetching feed page %d from %s: %s', page, url, e)
            # Give up, don't try to download additional pages here
            break

        page += 1
def get_channel_desc(url):
    if 'youtube.com' in url:

        class YouTubeHTMLDesc(HTMLParser):
            """This custom html parser searches for the YouTube channel description."""

            def __init__(self):
                super().__init__()
                self.description = ''

            def handle_starttag(self, tag, attributes):
                attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}

                # Get YouTube channel description.
                if tag == 'meta' \
                        and 'name' in attribute_dict \
                        and attribute_dict['name'] == "description":
                    self.description = attribute_dict['content']

        try:
            channel_url = get_channel_id_url(url)
            html_data = util.urlopen(channel_url).read().decode('utf-8')
            parser = YouTubeHTMLDesc()
            parser.feed(html_data)
            if parser.description:
                logger.debug('YouTube description for %s is: %s', url, parser.description)
                return parser.description
            else:
                logger.debug('YouTube description for %s is not provided.', url)
                return _('No description available')
        except Exception:
            logger.warning('Could not retrieve YouTube channel description.', exc_info=True)
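
# Quick self-contained check of the HTMLParser approach used above: feed a
# static HTML snippet and read back the description attribute. The class name
# and the HTML string are made up for this example.
from html.parser import HTMLParser

class MetaDescriptionParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.description = ''

    def handle_starttag(self, tag, attributes):
        attrs = dict(attributes)
        if tag == 'meta' and attrs.get('name') == 'description':
            self.description = attrs.get('content', '')

p = MetaDescriptionParser()
p.feed('<html><head><meta name="description" content="Channel blurb"></head></html>')
print(p.description)  # -> Channel blurb
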
def get_cover(url):
    if 'youtube.com' in url:

        class YouTubeHTMLCoverParser(HTMLParser):
            """This custom html parser searches for the youtube channel thumbnail/avatar"""

            def __init__(self):
                super().__init__()
                self.url = []

            def handle_starttag(self, tag, attributes):
                attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}

                # Look for 900x900px image first.
                if tag == 'link' \
                        and 'rel' in attribute_dict \
                        and attribute_dict['rel'] == 'image_src':
                    self.url.append(attribute_dict['href'])

                # Fallback to image that may only be 100x100px.
                elif tag == 'img' \
                        and 'class' in attribute_dict \
                        and attribute_dict['class'] == "channel-header-profile-image":
                    self.url.append(attribute_dict['src'])

        try:
            channel_url = get_channel_id_url(url)
            html_data = util.urlopen(channel_url).read().decode('utf-8')
            parser = YouTubeHTMLCoverParser()
            parser.feed(html_data)
            if parser.url:
                logger.debug('Youtube cover art for {} is: {}'.format(url, parser.url))
                return parser.url[0]
        except Exception:
            logger.warning('Could not retrieve cover art', exc_info=True)
def __init__(self, channel, max_episodes):
    url = channel.authenticate_url(channel.url)

    logger.info('Parsing via podcastparser: %s', url)

    headers = {}
    if channel.http_etag:
        headers['If-None-Match'] = channel.http_etag
    if channel.http_last_modified:
        headers['If-Modified-Since'] = channel.http_last_modified

    try:
        stream = util.urlopen(url, headers)
        self.status = 200
        info = stream.info()
        self.etag = info.get('etag')
        self.modified = info.get('last-modified')
        self.parsed = podcastparser.parse(url, stream, max_episodes)
        self._handle_paged_feed(max_episodes)
    except urllib.error.HTTPError as error:
        self.status = error.code
        if error.code == 304:
            logger.info('Not modified')
        else:
            logger.warn('Feed update failed: %s', error)
            raise error

        self.etag = None
        self.modified = None
        self.parsed = None
def find_youtube_channels(string):
    url = 'http://gdata.youtube.com/feeds/api/videos?alt=json&q=%s' % urllib.quote(string, '')
    data = json.load(util.urlopen(url))

    class FakeImporter(object):
        def __init__(self):
            self.items = []

    result = FakeImporter()

    seen_users = set()
    for entry in data['feed']['entry']:
        user = os.path.basename(entry['author'][0]['uri']['$t'])
        title = entry['title']['$t']
        url = 'http://www.youtube.com/rss/user/%s/videos.rss' % user
        if user not in seen_users:
            result.items.append({
                'title': user,
                'url': url,
                'description': title
            })
            seen_users.add(user)

    return result
def get_real_download_url(url):
    quality = 'sd'
    codecs = 'H264,VP8,VP6'

    video_id = get_vimeo_id(url)

    if video_id is None:
        return url

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read()
    sig_pair = SIGNATURE_RE.search(web_data)

    if sig_pair is None:
        raise VimeoError('Cannot get signature pair from Vimeo')

    timestamp, signature = sig_pair.groups()
    params = '&'.join('%s=%s' % i for i in [
        ('clip_id', video_id),
        ('sig', signature),
        ('time', timestamp),
        ('quality', quality),
        ('codecs', codecs),
        ('type', 'moogaloop_local'),
        ('embed_location', ''),
    ])
    player_url = 'http://player.vimeo.com/play_redirect?%s' % params

    return player_url
def get_escapist_web(video_id):
    if video_id is None:
        return None

    # FIXME: must check if it's utf-8
    web_url = 'http://www.escapistmagazine.com/videos/view/%s' % video_id
    return util.urlopen(web_url).read()
def find_youtube_channels(string):
    # FIXME: Make proper use of the YouTube API instead
    # of screen-scraping the YouTube website
    url = 'http://www.youtube.com/results?search_query=' + urllib.quote(string, '') + '&search_type=search_users&aq=f'

    r = re.compile(r'>\s+<')
    data = r.sub('><', util.urlopen(url).read())
    r1 = re.compile('<a href="/user/([^"?]+)[^"]+"[^>]*>([^<]+)</a>')
    m1 = r1.findall(data)
    r2 = re.compile(r'\s+')

    class FakeImporter(object):
        def __init__(self):
            self.items = []

    result = FakeImporter()

    found_users = []
    for name, title in m1:
        if name not in found_users:
            found_users.append(name)
            link = 'http://www.youtube.com/rss/user/' + name + '/videos.rss'
            result.items.append({'title': name, 'url': link, 'description': title})

    return result
def get_cover(self, filename, cover_url, feed_url, title,
              username=None, password=None, download=False):
    # Detection of "all episodes" podcast
    if filename == self.ALL_EPISODES_ID:
        return self.get_cover_all_episodes()

    # Return already existing files
    for extension in self.EXTENSIONS:
        if os.path.exists(filename + extension):
            return filename + extension

    # If allowed to download files, do so here
    if download:
        # YouTube-specific cover art image resolver
        youtube_cover_url = youtube.get_real_cover(feed_url)
        if youtube_cover_url is not None:
            cover_url = youtube_cover_url

        if not cover_url:
            return self._fallback_filename(title)

        # We have to add username/password, because password-protected
        # feeds might keep their cover art also protected (bug 1521)
        if username is not None and password is not None:
            cover_url = util.url_add_authentication(cover_url, username, password)

        try:
            logger.info('Downloading cover art: %s', cover_url)
            data = util.urlopen(cover_url, timeout=self.TIMEOUT).read()
        except Exception as e:
            logger.warn('Cover art download failed: %s', e)
            return self._fallback_filename(title)

        try:
            extension = None

            for filetype, check in self.SUPPORTED_EXTENSIONS.items():
                if check(data):
                    extension = filetype
                    break

            if extension is None:
                msg = 'Unknown file type: %s (%r)' % (cover_url, data[:6])
                raise ValueError(msg)

            # Successfully downloaded the cover art - save it!
            fp = open(filename + extension, 'wb')
            fp.write(data)
            fp.close()

            return filename + extension
        except Exception as e:
            logger.warn('Cannot save cover art', exc_info=True)
def itunes_feed_handler(channel, max_episodes, config):
    m = re.match(r'https?://itunes.apple.com/(?:[^/]*/)?podcast/.+$', channel.url, re.I)
    if m is None:
        return None

    logger.debug('Detected iTunes feed.')

    version = ITUNES_DEFAULT_VERSION
    headers = {'User-agent': 'iTunes/{}'.format(version)}
    try:
        data = util.urlopen(channel.url, headers).read().decode('utf-8')
        m = re.search(ITUNES_FEEDURL_RE[version], data)
        if m is None:
            raise ITunesFeedException('Could not resolve real feed URL from iTunes feed.')

        url = m.group(1)
        logger.info('Resolved iTunes feed URL: {} -> {}'.format(channel.url, url))
        channel.url = url

        # Delegate further processing of the feed to the normal podcast parser
        # by returning None (will try the next handler in the resolver chain)
        return None
    except Exception as ex:
        logger.warn('Cannot resolve iTunes feed: {}'.format(str(ex)))
        raise
def get_real_download_url(url):
    video_id = get_escapist_id(url)
    if video_id is None:
        return url

    web_data = get_escapist_web(video_id)

    data_config_frag = DATA_CONFIG_RE.search(web_data)
    data_config_url = get_escapist_config_url(data_config_frag.group(1))

    if data_config_url is None:
        raise EscapistError('Cannot parse configuration from the site')

    logger.debug('Config URL: %s', data_config_url)

    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')

    # TODO: This second argument should get a real name
    real_url = get_escapist_real_url(data_config_data, data_config_frag.group(1))

    if real_url is None:
        raise EscapistError('Cannot get MP4 URL from The Escapist')
    elif "sales-marketing/" in real_url:
        raise EscapistError(
            'Oops, seems The Escapist blocked this IP. Wait a few days/weeks to get it unblocked')
    else:
        return real_url
def fetch(self, url, etag=None, modified=None, autodiscovery=True, **kwargs):
    """
    use kwargs to pass extra data to parse_feed in Fetcher subclasses
    """
    # handle local file first
    if url.startswith('file://'):
        url = url[len('file://'):]
        stream = open(url)
        return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs)

    # remote feed
    headers = {}
    if modified is not None:
        headers['If-Modified-Since'] = modified
    if etag is not None:
        headers['If-None-Match'] = etag

    stream = util.urlopen(url, headers)

    responses = stream.history + [stream]
    for i, resp in enumerate(responses):
        if resp.is_permanent_redirect:
            # there should always be a next response when a redirect is encountered
            # If max redirects is reached, TooManyRedirects is raised
            # TODO: since we've got the end contents anyway, modify model.py
            # to accept contents on NEW_LOCATION
            return Result(NEW_LOCATION, responses[i + 1].url)

    res = self._check_statuscode(stream.status_code, stream.url)
    if res == NOT_MODIFIED:
        return Result(NOT_MODIFIED, stream.url)

    if autodiscovery and stream.headers.get('content-type', '').startswith('text/html'):
        ad = FeedAutodiscovery(url)
        # response_text() will assume utf-8 if no charset specified
        ad.feed(util.response_text(stream))
        if ad._resolved_url and ad._resolved_url != url:
            try:
                self.fetch(ad._resolved_url, etag=None, modified=None,
                           autodiscovery=False, **kwargs)
                return Result(NEW_LOCATION, ad._resolved_url)
            except Exception as e:
                logger.warn('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

    # xml documents specify the encoding inline so better pass encoded body.
    # Especially since requests will use ISO-8859-1 for content-type 'text/xml'
    # if the server doesn't specify a charset.
    return self.parse_feed(url, BytesIO(stream.content), stream.headers, UPDATED_FEED, **kwargs)
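
# Sketch of the permanent-redirect check used in fetch() above, isolated with
# the requests library. The URL below is a placeholder; the helper name is
# made up for this example.
import requests

def final_url_if_moved(url):
    r = requests.get(url, allow_redirects=True, timeout=10)
    responses = r.history + [r]
    for i, resp in enumerate(responses):
        if resp.is_permanent_redirect:
            # the next response in the chain carries the new location
            return responses[i + 1].url
    return None

# final_url_if_moved('https://example.org/old-feed.xml')
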
def get_urls(data_config_url):
    data_config = util.urlopen(data_config_url).json()
    for fileinfo in list(data_config['request']['files'].values()):
        if not isinstance(fileinfo, list):
            continue

        for item in fileinfo:
            yield (item['quality'], item['url'])
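
# Minimal usage sketch for the get_urls() generator above: prefer an 'hd'
# stream if present, otherwise fall back to the first URL yielded. The helper
# name and the 'hd' default are assumptions for this example.
def pick_stream(data_config_url, preferred='hd'):
    first = None
    for quality, stream_url in get_urls(data_config_url):
        if first is None:
            first = stream_url
        if quality == preferred:
            return stream_url
    return first
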
def get_tracks(self, feed):
    """Get a generator of tracks from a SC user

    The generator will give you a dictionary for every
    track it can find for its user."""
    global CONSUMER_KEY
    try:
        json_url = ('https://api.soundcloud.com/users/%(user)s/%(feed)s.json'
                    '?filter=downloadable&consumer_key=%(consumer_key)s&limit=200'
                    % {"user": self.get_user_id(),
                       "feed": feed,
                       "consumer_key": CONSUMER_KEY})
        logger.debug("loading %s", json_url)

        json_tracks = json.loads(util.urlopen(json_url).read().decode('utf-8'))
        tracks = [track for track in json_tracks if track['downloadable']]
        total_count = len(tracks) + len([track for track in json_tracks
                                         if not track['downloadable']])

        if len(tracks) == 0 and total_count > 0:
            logger.warn("Download of all %i %s of user %s is disabled"
                        % (total_count, feed, self.username))
        else:
            logger.info("%i/%i downloadable tracks for user %s %s feed"
                        % (len(tracks), total_count, self.username, feed))

        for track in tracks:
            # Prefer stream URL (MP3), fallback to download URL
            url = track.get('stream_url', track['download_url']) + \
                '?consumer_key=%(consumer_key)s' % {'consumer_key': CONSUMER_KEY}
            if url not in self.cache:
                try:
                    self.cache[url] = get_metadata(url)
                except:
                    continue
            filesize, filetype, filename = self.cache[url]

            yield {
                'title': track.get('title', track.get('permalink')) or _('Unknown track'),
                'link': track.get('permalink_url') or 'https://soundcloud.com/' + self.username,
                'description': track.get('description') or _('No description available'),
                'url': url,
                'file_size': int(filesize),
                'mime_type': filetype,
                'guid': track.get('permalink', track.get('id')),
                'published': soundcloud_parsedate(track.get('created_at', None)),
            }
    finally:
        self.commit_cache()
def get_channels_for_user(username, api_key_v3):
    stream = util.urlopen('{0}/channels?forUsername={1}&part=id&key={2}'.format(
        V3_API_ENDPOINT, username, api_key_v3))
    data = json.loads(stream.read().decode('utf-8'))
    return ['{0}?channel_id={1}'.format(CHANNEL_VIDEOS_XML, item['id'])
            for item in data['items']]
def get_urls(data_config_url):
    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
    data_config = json.loads(data_config_data)
    for fileinfo in list(data_config['request']['files'].values()):
        if not isinstance(fileinfo, list):
            continue

        for item in fileinfo:
            yield (item['quality'], item['url'])
def youtube_get_old_endpoint(vid):
    # TODO: changing 'detailpage' to 'embedded' allows age-restricted content
    url = 'https://www.youtube.com/get_video_info?html5=1&c=TVHTML5&cver=6.20180913&el=detailpage&video_id=' + vid
    r = util.urlopen(url)
    if not r.ok:
        raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
    else:
        return r.text, None
def get_urls(data_config_url):
    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
    data_config = json.loads(data_config_data)
    for fileinfo in data_config['request']['files'].values():
        if not isinstance(fileinfo, list):
            continue

        for item in fileinfo:
            yield (item['quality'], item['url'])
def _parse_feed(self, url, etag, modified, autodiscovery=True):
    headers = {}
    if modified is not None:
        headers['If-Modified-Since'] = modified
    if etag is not None:
        headers['If-None-Match'] = etag

    if url.startswith('file://'):
        is_local = True
        url = url[len('file://'):]
        stream = open(url)
    else:
        is_local = False
        try:
            stream = util.urlopen(url, headers)
        except HTTPError as e:
            return self._check_statuscode(e, e.geturl())

    data = stream
    if autodiscovery and not is_local and stream.headers.get(
            'content-type', '').startswith('text/html'):
        # Not very robust attempt to detect encoding: http://stackoverflow.com/a/1495675/1072626
        charset = stream.headers.get_param('charset')
        if charset is None:
            charset = 'utf-8'  # utf-8 appears hard-coded elsewhere in this codebase

        # We use StringIO in case the stream needs to be read again
        data = StringIO(stream.read().decode(charset))
        ad = FeedAutodiscovery(url)
        ad.feed(data.getvalue())
        if ad._resolved_url:
            try:
                self._parse_feed(ad._resolved_url, None, None, False)
                return Result(NEW_LOCATION, ad._resolved_url)
            except Exception as e:
                logger.warn('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            url = self._resolve_url(url)
            if url:
                return Result(NEW_LOCATION, url)

        # Reset the stream so podcastparser can give it a go
        data.seek(0)

    try:
        feed = podcastparser.parse(url, data)
    except ValueError as e:
        raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

    if is_local:
        feed['headers'] = {}
        return Result(UPDATED_FEED, feed)
    else:
        feed['headers'] = stream.headers
        return self._check_statuscode(stream, feed)
def get_cover(self, filename, cover_url, feed_url, title, username=None, password=None, download=False): # Detection of "all episodes" podcast if filename == self.ALL_EPISODES_ID: return self.get_cover_all_episodes() # Return already existing files for extension in self.EXTENSIONS: if os.path.exists(filename + extension): return filename + extension # If allowed to download files, do so here if download: # YouTube-specific cover art image resolver youtube_cover_url = youtube.get_cover(feed_url) if youtube_cover_url is not None: cover_url = youtube_cover_url if not cover_url: return self._fallback_filename(title) # We have to add username/password, because password-protected # feeds might keep their cover art also protected (bug 1521) if username is not None and password is not None: cover_url = util.url_add_authentication(cover_url, username, password) try: logger.info('Downloading cover art: %s', cover_url) data = util.urlopen(cover_url, timeout=self.TIMEOUT).read() except Exception as e: logger.warn('Cover art download failed: %s', e) return self._fallback_filename(title) try: extension = None for filetype, check in list(self.SUPPORTED_EXTENSIONS.items()): if check(data): extension = filetype break if extension is None: msg = 'Unknown file type: %s (%r)' % (cover_url, data[:6]) raise ValueError(msg) # Successfully downloaded the cover art - save it! fp = open(filename + extension, 'wb') fp.write(data) fp.close() return filename + extension except Exception as e: logger.warn('Cannot save cover art', exc_info=True) # Fallback to cover art based on the podcast title return self._fallback_filename(title)
def return_user_cover(url, channel):
    api_url = 'http://gdata.youtube.com/feeds/api/users/{}?v=2'.format(channel)
    data = util.urlopen(api_url).read()
    m = re.search('<media:thumbnail url=[\'"]([^\'"]+)[\'"]/>', data)
    if m is not None:
        logger.debug('YouTube userpic for %s is: %s', url, m.group(1))
        return m.group(1)

    return None
def get_cover(url): if 'youtube.com' in url: class YouTubeHTMLCoverParser(HTMLParser): """This custom html parser searches for the youtube channel thumbnail/avatar""" def __init__(self): super().__init__() self.url = "" def handle_starttag(self, tag, attributes): attribute_dict = { attribute[0]: attribute[1] for attribute in attributes } # Look for 900x900px image first. if tag == 'link' \ and 'rel' in attribute_dict \ and attribute_dict['rel'] == 'image_src': self.url = attribute_dict['href'] # Fallback to image that may only be 100x100px. elif tag == 'img' \ and 'class' in attribute_dict \ and attribute_dict['class'] == "channel-header-profile-image": self.url = attribute_dict['src'] try: raw_xml_data = util.urlopen(url).read().decode('utf-8') xml_data = xml.etree.ElementTree.fromstring(raw_xml_data) channel_id = xml_data.find( "{http://www.youtube.com/xml/schemas/2015}channelId").text channel_url = 'https://www.youtube.com/channel/{}'.format( channel_id) html_data = util.urlopen(channel_url).read().decode('utf-8') parser = YouTubeHTMLCoverParser() parser.feed(html_data) if parser.url: logger.debug('Youtube cover art for {} is: {}'.format( url, parser.url)) return parser.url except Exception: logger.warning('Could not retrieve cover art', exc_info=True)
def _parse_feed(self, url, etag, modified, autodiscovery=True):
    headers = {}
    if modified is not None:
        headers['If-Modified-Since'] = modified
    if etag is not None:
        headers['If-None-Match'] = etag

    if url.startswith('file://'):
        is_local = True
        url = url[len('file://'):]
        stream = open(url)
    else:
        is_local = False
        try:
            stream = util.urlopen(url, headers)
        except HTTPError as e:
            return self._check_statuscode(e, e.geturl())

    data = stream
    if autodiscovery and not is_local and stream.headers.get('content-type', '').startswith('text/html'):
        # Not very robust attempt to detect encoding: http://stackoverflow.com/a/1495675/1072626
        charset = stream.headers.get_param('charset')
        if charset is None:
            charset = 'utf-8'  # utf-8 appears hard-coded elsewhere in this codebase

        # We use StringIO in case the stream needs to be read again
        data = StringIO(stream.read().decode(charset))
        ad = FeedAutodiscovery(url)
        ad.feed(data.getvalue())
        if ad._resolved_url:
            try:
                self._parse_feed(ad._resolved_url, None, None, False)
                return Result(NEW_LOCATION, ad._resolved_url)
            except Exception as e:
                logger.warn('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            url = self._resolve_url(url)
            if url:
                return Result(NEW_LOCATION, url)

        # Reset the stream so podcastparser can give it a go
        data.seek(0)

    try:
        feed = podcastparser.parse(url, data)
    except ValueError as e:
        raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

    if is_local:
        feed['headers'] = {}
        return Result(UPDATED_FEED, feed)
    else:
        feed['headers'] = stream.headers
        return self._check_statuscode(stream, feed)
def get_user_info(self):
    global CONSUMER_KEY
    key = ':'.join((self.username, 'user_info'))

    if key not in self.cache:
        json_url = 'https://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
        logger.debug('get_user_info url: %s', json_url)
        user_info = json.loads(util.urlopen(json_url).read().decode('utf-8'))
        self.cache[key] = user_info

    return self.cache[key]
def get_channel_id_url(url):
    if 'youtube.com' in url:
        try:
            channel_url = ''
            raw_xml_data = util.urlopen(url).read().decode('utf-8')
            xml_data = xml.etree.ElementTree.fromstring(raw_xml_data)
            channel_id = xml_data.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
            channel_url = 'https://www.youtube.com/channel/{}'.format(channel_id)
            return channel_url
        except Exception:
            logger.warning('Could not retrieve youtube channel id.', exc_info=True)
def get_urls(data_config_url):
    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
    data_config = json.loads(data_config_data)
    for fileinfo in data_config['request']['files'].values():
        if not isinstance(fileinfo, dict):
            continue

        for fileformat, keys in fileinfo.items():
            if not isinstance(keys, dict):
                continue

            yield (fileformat, keys['url'])
def get_real_cover(url):
    rss_url = get_real_channel_url(url)
    if rss_url is None:
        return None

    rss_data = util.urlopen(rss_url).read()
    rss_data_frag = DATA_COVERART_RE.search(rss_data)

    if rss_data_frag is None:
        return None

    return rss_data_frag.group(1)
def on_search(self, query):
    search = util.urlopen(
        f'https://api-v4.audionow.de/api/v4/search.json?q=*{query}*&page=1'
    ).json()

    to_return = []
    for result in search["data"]:
        to_return.append(directory.DirectoryEntry(
            result["title"],
            f"https://audionow.de/podcast/{result['uid']}",
            description=result["subtitle"]))

    return to_return
def get_cover(self, podcast, download=False):
    filename = podcast.cover_file
    cover_url = podcast.cover_url

    # Return already existing files
    for extension in self.EXTENSIONS:
        if os.path.exists(filename + extension):
            return filename + extension

    # If allowed to download files, do so here
    if download:
        # YouTube-specific cover art image resolver
        youtube_cover_url = youtube.get_real_cover(podcast.url)
        if youtube_cover_url is not None:
            cover_url = youtube_cover_url

        if not cover_url:
            return None

        # We have to add username/password, because password-protected
        # feeds might keep their cover art also protected (bug 1521)
        cover_url = util.url_add_authentication(cover_url,
                                                podcast.auth_username,
                                                podcast.auth_password)

        try:
            logger.info('Downloading cover art: %s', cover_url)
            data = util.urlopen(cover_url, timeout=self.TIMEOUT).read()
        except Exception as e:
            logger.warn('Cover art download failed: %s', e)
            return None

        try:
            extension = None

            for filetype, check in list(self.SUPPORTED_EXTENSIONS.items()):
                if check(data):
                    extension = filetype
                    break

            if extension is None:
                msg = 'Unknown file type: %s (%r)' % (cover_url, data[:6])
                raise ValueError(msg)

            # Successfully downloaded the cover art - save it!
            fp = open(filename + extension, 'wb')
            fp.write(data)
            fp.close()

            return filename + extension
        except Exception as e:
            logger.warn('Cannot save cover art', exc_info=True)

    return None
def get_real_cover(url):
    rss_url = get_real_channel_url(url)
    if rss_url is None:
        return None

    # FIXME: can I be sure to decode it as utf-8?
    rss_data = util.urlopen(rss_url).read()
    rss_data_frag = DATA_COVERART_RE.search(rss_data)

    if rss_data_frag is None:
        return None

    return rss_data_frag.group(1)
def get_tracks(self, feed):
    """Get a generator of tracks from a SC user

    The generator will give you a dictionary for every
    track it can find for its user."""
    global CONSUMER_KEY
    try:
        json_url = ('https://api.soundcloud.com/users/%(user)s/%(feed)s.'
                    'json?filter=downloadable&consumer_key=%'
                    '(consumer_key)s&limit=200'
                    % {"user": self.get_user_id(),
                       "feed": feed,
                       "consumer_key": CONSUMER_KEY})
        logger.debug("loading %s", json_url)

        json_tracks = json.loads(util.urlopen(json_url).read().decode('utf-8'))
        tracks = [track for track in json_tracks if track['downloadable']]
        total_count = len(tracks) + len([track for track in json_tracks
                                         if not track['downloadable']])

        if len(tracks) == 0 and total_count > 0:
            logger.warn("Download of all %i %s of user %s is disabled"
                        % (total_count, feed, self.username))
        else:
            logger.info("%i/%i downloadable tracks for user %s %s feed"
                        % (len(tracks), total_count, self.username, feed))

        for track in tracks:
            # Prefer stream URL (MP3), fallback to download URL
            url = track.get('stream_url', track['download_url']) + \
                '?consumer_key=%(consumer_key)s' % {'consumer_key': CONSUMER_KEY}
            if url not in self.cache:
                try:
                    self.cache[url] = get_metadata(url)
                except:
                    continue
            filesize, filetype, filename = self.cache[url]

            yield {
                'title': track.get('title', track.get('permalink')) or _('Unknown track'),
                'link': track.get('permalink_url') or 'https://soundcloud.com/' + self.username,
                'description': track.get('description') or _('No description available'),
                'url': url,
                'file_size': int(filesize),
                'mime_type': filetype,
                'guid': track.get('permalink', track.get('id')),
                'published': soundcloud_parsedate(track.get('created_at', None)),
            }
    finally:
        self.commit_cache()
def youtube_resolve_cover_art(podcast):
    url = podcast.url
    r = re.compile(r'http://www\.youtube\.com/rss/user/([^/]+)/videos\.rss', re.IGNORECASE)
    m = r.match(url)

    if m is not None:
        username = m.group(1)
        api_url = 'http://gdata.youtube.com/feeds/api/users/%s?v=2' % username
        data = util.urlopen(api_url).read().decode('utf-8', 'ignore')
        match = re.search('<media:thumbnail url=[\'"]([^\'"]+)[\'"]/>', data)
        if match is not None:
            return match.group(1)

    return None
def return_user_cover(url, channel):
    try:
        api_url = 'https://www.youtube.com/channel/{0}'.format(channel)
        data = util.urlopen(api_url).read()
        m = re.search('<img class="channel-header-profile-image"[^>]* src=[\'"]([^\'"]+)[\'"][^>]*>', data)
        if m is not None:
            logger.debug('YouTube userpic for %s is: %s', url, m.group(1))
            return m.group(1)
    except Exception as e:
        logger.warn('Could not retrieve cover art', exc_info=True)
        return None

    return None
def get_user_info(self):
    global CONSUMER_KEY
    key = ':'.join((self.username, 'user_info'))

    if key in self.cache:
        return self.cache[key]

    try:
        json_url = 'https://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
        user_info = json.loads(util.urlopen(json_url).read().decode('utf-8'))
        self.cache[key] = user_info
    finally:
        self.commit_cache()

    return user_info
def get_real_cover(url):
    r = re.compile(r'http://www\.youtube\.com/rss/user/([^/]+)/videos\.rss', re.IGNORECASE)
    m = r.match(url)

    if m is not None:
        username = m.group(1)
        api_url = 'http://gdata.youtube.com/feeds/api/users/%s?v=2' % username
        data = util.urlopen(api_url).read().decode('utf-8', 'ignore')
        match = re.search('<media:thumbnail url=[\'"]([^\'"]+)[\'"]/>', data)
        if match is not None:
            logger.debug('YouTube userpic for %s is: %s', url, match.group(1))
            return match.group(1)

    return None
def get_metadata(url):
    """Get file download metadata

    Returns a (size, type, name) from the given download
    URL. Will use the network connection to determine the
    metadata via the HTTP header fields.
    """
    track_fp = util.urlopen(url)
    headers = track_fp.info()
    filesize = headers['content-length'] or '0'
    filetype = headers['content-type'] or 'application/octet-stream'
    headers_s = '\n'.join('%s:%s' % (k, v) for k, v in headers.items())
    filename = get_param(headers_s) or os.path.basename(os.path.dirname(url))
    track_fp.close()
    return filesize, filetype, filename
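
# Related sketch: read the same header fields with a HEAD request via the
# requests library, so the body is never downloaded. The helper name is made
# up for this example and the URL would be a real download URL.
import requests

def head_metadata(url):
    r = requests.head(url, allow_redirects=True, timeout=10)
    filesize = r.headers.get('content-length', '0')
    filetype = r.headers.get('content-type', 'application/octet-stream')
    # Content-Disposition may carry a filename="..." parameter
    disposition = r.headers.get('content-disposition', '')
    return filesize, filetype, disposition
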
def on_search(self, query):
    url = 'http://gdata.youtube.com/feeds/api/videos?alt=json&q=%s' % urllib.quote(query)
    data = json.load(util.urlopen(url))

    result = []

    seen_users = set()
    for entry in data['feed']['entry']:
        user = os.path.basename(entry['author'][0]['uri']['$t'])
        title = entry['title']['$t']
        url = 'http://www.youtube.com/rss/user/%s/videos.rss' % user
        if user not in seen_users:
            result.append(DirectoryEntry(user, url))
            seen_users.add(user)

    return result
def request(self, url, data=None):
    headers = {'Content-Type': 'application/json'}

    if url == self.OAUTH_TOKEN_URL:
        # Inject username and password into the request URL
        url = util.url_add_authentication(url, self.KEY, self.SECRET)
    elif self._config.token:
        headers['Authorization'] = 'Bearer ' + self._config.token

    if data is not None:
        data = json.dumps(data)

    try:
        response = util.urlopen(url, headers, data)
    except urllib2.HTTPError as error:
        return {'_gpodder_statuscode': error.getcode()}
def get_coverart(self):
    global CONSUMER_KEY
    key = ':'.join((self.username, 'avatar_url'))
    if key in self.cache:
        return self.cache[key]

    image = None
    try:
        json_url = 'http://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
        user_info = json.load(util.urlopen(json_url))
        image = user_info.get('avatar_url', None)
        self.cache[key] = image
    finally:
        self.commit_cache()

    return image
def __init__(self, url):
    """
    Parses the OPML feed from the given URL into
    a local data structure containing channel metadata.
    """
    self.items = []
    try:
        if os.path.exists(url):
            doc = xml.dom.minidom.parse(url)
        else:
            # FIXME: is it ok to pass bytes to parseString?
            doc = xml.dom.minidom.parseString(util.urlopen(url).read())

        for outline in doc.getElementsByTagName('outline'):
            # Make sure we are dealing with a valid link type (ignore case)
            otl_type = outline.getAttribute('type')
            if otl_type is None or otl_type.lower() not in self.VALID_TYPES:
                continue

            if outline.getAttribute('xmlUrl') or outline.getAttribute('url'):
                channel = {
                    'url': outline.getAttribute('xmlUrl') or outline.getAttribute('url'),
                    'title': (outline.getAttribute('title')
                              or outline.getAttribute('text')
                              or outline.getAttribute('xmlUrl')
                              or outline.getAttribute('url')),
                    'description': (outline.getAttribute('text')
                                    or outline.getAttribute('xmlUrl')
                                    or outline.getAttribute('url')),
                }

                if channel['description'] == channel['title']:
                    channel['description'] = channel['url']

                for attr in ('url', 'title', 'description'):
                    channel[attr] = channel[attr].strip()

                self.items.append(channel)

        if not len(self.items):
            logger.info('OPML import finished, but no items found: %s', url)
    except:
        logger.error('Cannot import OPML from URL: %s', url, exc_info=True)
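
# Tiny standalone check of the minidom outline extraction used above, run on
# an inline OPML document. The feed title and URL are placeholders.
import xml.dom.minidom

opml = '''<opml version="2.0"><body>
  <outline type="rss" text="Example cast" xmlUrl="https://example.org/feed.xml"/>
</body></opml>'''

doc = xml.dom.minidom.parseString(opml)
for outline in doc.getElementsByTagName('outline'):
    print(outline.getAttribute('text'), outline.getAttribute('xmlUrl'))
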
def return_user_cover(url, channel):
    try:
        api_url = 'https://www.youtube.com/channel/{0}'.format(channel)
        data = util.urlopen(api_url).read()

        # Look for 900x900px image first.
        m = re.search('<link rel="image_src"[^>]* href=[\'"]([^\'"]+)[\'"][^>]*>', data)
        if m is None:
            # Fallback to image that may only be 100x100px.
            m = re.search('<img class="channel-header-profile-image"[^>]* src=[\'"]([^\'"]+)[\'"][^>]*>', data)
        if m is not None:
            logger.debug('YouTube userpic for %s is: %s', url, m.group(1))
            return m.group(1)
    except Exception as e:
        logger.warn('Could not retrieve cover art', exc_info=True)
        return None

    return None