Example 1
def youtube_get_new_endpoint(vid):
    url = WATCH_ENDPOINT + vid
    r = util.urlopen(url)
    if not r.ok:
        raise YouTubeError('Youtube "%s": %d %s' %
                           (url, r.status_code, r.reason))

    ipr = get_ipr(r.text)
    if ipr is None:
        try:
            url = get_gdpr_consent_url(r.text)
        except YouTubeError as e:
            raise YouTubeError(
                'Youtube "%s": No ytInitialPlayerResponse found and %s' %
                (url, str(e)))
        r = util.urlopen(url)
        if not r.ok:
            raise YouTubeError('Youtube "%s": %d %s' %
                               (url, r.status_code, r.reason))

        ipr = get_ipr(r.text)
        if ipr is None:
            raise YouTubeError(
                'Youtube "%s": No ytInitialPlayerResponse found' % url)

    return None, ipr.group(1)
Example 2
    def get_show(self):
        content_media = util.urlopen(
            'https://api-v4.audionow.de/api/v4/media/{}.json'.format(
                self.uuid)).json()
        tagged_episodes = []
        page_number = 1
        while True:
            content_episodes = util.urlopen(
                f'https://api-v4.audionow.de/api/v4/podcast/{self.uuid}/episodes.json?page={page_number}'
            ).json()
            for episode in content_episodes['data']:
                tagged_episodes.append(
                    Episode(
                        episode["mediaURL"], episode["description"],
                        episode["title"], episode["uid"],
                        datetime.datetime.fromisoformat(
                            episode["publicationDate"]).timestamp(),
                        episode["duration"], episode["fileSize"]))
            if content_episodes['meta']['pagination'][
                    "total_pages"] <= page_number:
                break
            page_number += 1
        image_resolution = max(
            map(int, content_media["imageInfo"]["variantSourceWidths"]))
        image_url = content_media['imageInfo']["optimizedImageUrls"][str(
            image_resolution)]
        return Show(tagged_episodes, content_media["description"],
                    content_media["title"], image_url)
Example 3
def get_channels_for_user(username, api_key_v3):
    # already a channel ID: return videos.xml.
    # Can't rely on automatic discovery, see #371
    if username.startswith('UC'):
        try:
            url = '{0}?channel_id={1}'.format(CHANNEL_VIDEOS_XML, username)
            stream = util.urlopen(url)
            return [url]
        except urllib.error.HTTPError as e:
            logger.debug(
                "get_channels_for_user(%s) not a channel id (got %i response code)",
                username, e.code)
        except:
            logger.error(
                "get_channels_for_user(%s) not a channel id (got unexpected exception)",
                username)

    # try username to channel ID conversion
    stream = util.urlopen(
        '{0}/channels?forUsername={1}&part=id&key={2}'.format(
            V3_API_ENDPOINT, username, api_key_v3))
    data = json.load(stream)
    return [
        '{0}?channel_id={1}'.format(CHANNEL_VIDEOS_XML, item['id'])
        for item in data['items']
    ]
Example 4
def get_total_time(episode):
    try:
        vid = get_youtube_id(episode.url)
        if vid is None:
            return 0

        url = WATCH_ENDPOINT + vid
        r = util.urlopen(url)
        if not r.ok:
            return 0

        ipr = get_ipr(r.text)
        if ipr is None:
            url = get_gdpr_consent_url(r.text)
            r = util.urlopen(url)
            if not r.ok:
                return 0

            ipr = get_ipr(r.text)
            if ipr is None:
                return 0

        player_response = json.loads(ipr.group(1))
        return int(
            player_response['videoDetails']['lengthSeconds'])  # 0 if live
    except:
        return 0
Example 5
    def __init__(self, url, category):
        self.url = url
        self.category = category
        # TODO: Use proper caching of contents with support for
        #       conditional GETs (If-Modified-Since, ETag, ...)
        self.data = minidom.parse(util.urlopen(url))
        self.playlist = self.data.getElementsByTagName('playlist')[0]
Example 6
    def opendata(url, stream):
        fp = util.urlopen(url)
        data = fp.read(1024 * 10)
        while data != '':
            stream.write(data)
            data = fp.read(1024 * 10)
        stream.close()
Example 7
def get_real_download_url(url):
    quality = 'sd'
    codecs = 'H264,VP8,VP6'

    video_id = get_vimeo_id(url)

    if video_id is None:
        return url

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read()
    data_config_frag = DATA_CONFIG_RE.search(web_data)

    if data_config_frag is None:
        raise VimeoError('Cannot get data config from Vimeo')

    data_config_url = data_config_frag.group(1).replace('&amp;', '&')

    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in data_config['request']['files'].values():
            if not isinstance(fileinfo, dict):
                continue

            for fileformat, keys in fileinfo.items():
                if not isinstance(keys, dict):
                    continue

                yield (fileformat, keys['url'])

    for quality, url in get_urls(data_config_url):
        return url
Example 8
    def get_data_from_url(self, url):
        try:
            response = util.urlopen(url).read()
        except Exception as e:
            logger.warn("subtitle url returned error %s", e)
            return ''
        return response
Example 9
def get_real_download_url(url):
    video_id = get_escapist_id(url)
    if video_id is None:
        return url

    web_data = get_escapist_web(video_id)

    data_config_frag = DATA_CONFIG_RE.search(web_data)

    data_config_url = get_escapist_config_url(data_config_frag.group(1))

    if data_config_url is None:
        raise EscapistError('Cannot parse configuration from the site')

    logger.debug('Config URL: %s', data_config_url)

    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')

    # TODO: This second argument should get a real name
    real_url = get_escapist_real_url(data_config_data, data_config_frag.group(1))

    if real_url is None:
        raise EscapistError('Cannot get MP4 URL from The Escapist')
    elif "sales-marketing/" in real_url:
        raise EscapistError('Oops, seems The Escapist blocked this IP. Wait a few days/weeks to get it unblocked')
    else:
        return real_url
Example 10
def get_real_download_url(url):
    video_id = get_escapist_id(url)
    if video_id is None:
        return url

    web_data = get_escapist_web(video_id)

    data_config_frag = DATA_CONFIG_RE.search(web_data)

    if data_config_frag is None:
        raise EscapistError('Cannot get flashvars URL from The Escapist')

    data_config_url = data_config_frag.group(1)

    logger.debug('Config URL: %s', data_config_url)

    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
    data_config_data_frag = DATA_CONFIG_DATA_RE.search(data_config_data)
    if data_config_data_frag is None:
        raise EscapistError('Cannot get configuration JS from The Escapist')
    real_url = data_config_data_frag.group(0)
    if real_url is None:
        raise EscapistError('Cannot get MP4 URL from The Escapist')
    elif "-ad-rotation/" in real_url:
        raise EscapistError('Oops, seems The Escapist blocked this IP. Wait a few days/weeks to get it unblocked')
    else:
        return real_url
Example 11
    def __init__(self, channel, max_episodes):
        url = channel.authenticate_url(channel.url)

        logger.info('Parsing via podcastparser: %s', url)

        headers = {}
        if channel.http_etag:
            headers['If-None-Match'] = channel.http_etag
        if channel.http_last_modified:
            headers['If-Modified-Since'] = channel.http_last_modified

        try:
            stream = util.urlopen(url, headers)
            self.status = 200
            info = stream.info()
            self.etag = info.get('etag')
            self.modified = info.get('last-modified')
            self.parsed = podcastparser.parse(url, stream, max_episodes)
        except urllib.error.HTTPError as error:
            self.status = error.code
            if error.code == 304:
                logger.info('Not modified')
            else:
                logger.warn('Feed update failed: %s', error)
                raise error

            self.etag = None
            self.modified = None
            self.parsed = None
Example 12
def vimeo_resolve_download_url(episode, config):
    url = episode.url

    video_id = get_vimeo_id(url)

    if video_id is None:
        return None

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read().decode('utf-8')
    data_config_frag = DATA_CONFIG_RE.search(web_data)

    if data_config_frag is None:
        raise VimeoError('Cannot get data config from Vimeo')

    data_config_url = data_config_frag.group(1).replace('&amp;', '&')

    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in data_config['request']['files'].values():
            if not isinstance(fileinfo, dict):
                continue

            for fileformat, keys in fileinfo.items():
                if not isinstance(keys, dict):
                    continue

                yield (fileformat, keys['url'])

    for quality, url in get_urls(data_config_url):
        return url
Example 13
def get_real_download_url(url):
    quality = 'sd'
    codecs = 'H264,VP8,VP6'

    video_id = get_vimeo_id(url)

    if video_id is None:
        return url

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read()
    sig_pair = SIGNATURE_RE.search(web_data)

    if sig_pair is None:
        raise VimeoError('Cannot get signature pair from Vimeo')

    signature, timestamp = sig_pair.groups()
    params = '&'.join('%s=%s' % i for i in [
        ('clip_id', video_id),
        ('sig', signature),
        ('time', timestamp),
        ('quality', quality),
        ('codecs', codecs),
        ('type', 'moogaloop_local'),
        ('embed_location', ''),
    ])
    player_url = 'http://player.vimeo.com/play_redirect?%s' % params
    return player_url
Example 14
def get_channel_id_url(url, feed_data=None):
    if 'youtube.com' in url:
        try:
            if feed_data is None:
                r = util.urlopen(url)
                if not r.ok:
                    raise YouTubeError('Youtube "%s": %d %s' %
                                       (url, r.status_code, r.reason))
            else:
                r = feed_data
            # video page may contain corrupt HTML/XML, search for tag to avoid exception
            m = re.search(r'<meta itemprop="channelId" content="([^"]+)">',
                          r.text)
            if m:
                channel_id = m.group(1)
            else:
                raw_xml_data = io.BytesIO(r.content)
                xml_data = xml.etree.ElementTree.parse(raw_xml_data)
                channel_id = xml_data.find(
                    "{http://www.youtube.com/xml/schemas/2015}channelId").text
            channel_url = 'https://www.youtube.com/channel/{}'.format(
                channel_id)
            return channel_url

        except Exception:
            logger.warning('Could not retrieve youtube channel id.',
                           exc_info=True)

    raise Exception('Could not retrieve youtube channel id.')
Example 15
    def get_data_from_url(self, url):
        try:
            response = util.urlopen(url).read()
        except Exception as e:
            logger.warn("subtitle url returned error %s", e)
            return ''
        return response
Example 16
    def _handle_paged_feed(self, max_episodes):
        page = 2
        remaining_episodes = max_episodes - len(self.parsed['episodes'])
        while ('paged_feed_next' in self.parsed and
                page < self.PAGED_FEED_MAX_PAGES and
                remaining_episodes > 0):
            # Take the next page from the paged feed
            url = self.parsed['paged_feed_next']
            del self.parsed['paged_feed_next']

            if not url:
                break

            try:
                logger.debug('Downloading page %d from %s', page, url)
                stream = util.urlopen(url)
                parsed = podcastparser.parse(url, stream, remaining_episodes)
                added_episodes = len(parsed['episodes'])
                remaining_episodes -= added_episodes
                logger.debug('Page %d contains %d additional episodes', page,
                             added_episodes)
                self.parsed['episodes'].extend(parsed['episodes'])

                # Next iteration if we still have a next page
                if 'paged_feed_next' in parsed:
                    self.parsed['paged_feed_next'] = parsed['paged_feed_next']
            except Exception as e:
                logger.warn('Error while fetching feed page %d from %s: %s', page, url, e)
                # Give up, don't try to download additional pages here
                break

            page += 1
Example 17
def get_channel_desc(url):
    if 'youtube.com' in url:

        class YouTubeHTMLDesc(HTMLParser):
            """This custom html parser searches for the YouTube channel description."""
            def __init__(self):
                super().__init__()
                self.description = ''

            def handle_starttag(self, tag, attributes):
                attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}

                # Get YouTube channel description.
                if tag == 'meta' \
                        and 'name' in attribute_dict \
                        and attribute_dict['name'] == "description":
                    self.description = attribute_dict['content']

        try:
            channel_url = get_channel_id_url(url)
            html_data = util.urlopen(channel_url).read().decode('utf-8')
            parser = YouTubeHTMLDesc()
            parser.feed(html_data)
            if parser.description:
                logger.debug('YouTube description for %s is: %s', url, parser.description)
                return parser.description
            else:
                logger.debug('YouTube description for %s is not provided.', url)
                return _('No description available')

        except Exception:
            logger.warning('Could not retrieve YouTube channel description.', exc_info=True)
Example 18
def get_cover(url):
    if 'youtube.com' in url:

        class YouTubeHTMLCoverParser(HTMLParser):
            """This custom html parser searches for the youtube channel thumbnail/avatar"""
            def __init__(self):
                super().__init__()
                self.url = []

            def handle_starttag(self, tag, attributes):
                attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}

                # Look for 900x900px image first.
                if tag == 'link' \
                        and 'rel' in attribute_dict \
                        and attribute_dict['rel'] == 'image_src':
                    self.url.append(attribute_dict['href'])

                # Fallback to image that may only be 100x100px.
                elif tag == 'img' \
                        and 'class' in attribute_dict \
                        and attribute_dict['class'] == "channel-header-profile-image":
                    self.url.append(attribute_dict['src'])

        try:
            channel_url = get_channel_id_url(url)
            html_data = util.urlopen(channel_url).read().decode('utf-8')
            parser = YouTubeHTMLCoverParser()
            parser.feed(html_data)
            if parser.url:
                logger.debug('Youtube cover art for {} is: {}'.format(url, parser.url))
                return parser.url[0]

        except Exception:
            logger.warning('Could not retrieve cover art', exc_info=True)
Example 19
    def __init__(self, channel, max_episodes):
        url = channel.authenticate_url(channel.url)

        logger.info('Parsing via podcastparser: %s', url)

        headers = {}
        if channel.http_etag:
            headers['If-None-Match'] = channel.http_etag
        if channel.http_last_modified:
            headers['If-Modified-Since'] = channel.http_last_modified

        try:
            stream = util.urlopen(url, headers)
            self.status = 200
            info = stream.info()
            self.etag = info.get('etag')
            self.modified = info.get('last-modified')
            self.parsed = podcastparser.parse(url, stream, max_episodes)
            self._handle_paged_feed(max_episodes)
        except urllib.error.HTTPError as error:
            self.status = error.code
            if error.code == 304:
                logger.info('Not modified')
            else:
                logger.warn('Feed update failed: %s', error)
                raise error

            self.etag = None
            self.modified = None
            self.parsed = None
Example 20
def find_youtube_channels(string):
    url = 'http://gdata.youtube.com/feeds/api/videos?alt=json&q=%s' % urllib.quote(
        string, '')
    data = json.load(util.urlopen(url))

    class FakeImporter(object):
        def __init__(self):
            self.items = []

    result = FakeImporter()

    seen_users = set()
    for entry in data['feed']['entry']:
        user = os.path.basename(entry['author'][0]['uri']['$t'])
        title = entry['title']['$t']
        url = 'http://www.youtube.com/rss/user/%s/videos.rss' % user
        if user not in seen_users:
            result.items.append({
                'title': user,
                'url': url,
                'description': title
            })
            seen_users.add(user)

    return result
Example 21
    def __init__(self, url, category):
        self.url = url
        self.category = category
        # TODO: Use proper caching of contents with support for
        #       conditional GETs (If-Modified-Since, ETag, ...)
        self.data = minidom.parse(util.urlopen(url))
        self.playlist = self.data.getElementsByTagName('playlist')[0]
Example 22
    def _handle_paged_feed(self, max_episodes):
        page = 2
        remaining_episodes = max_episodes - len(self.parsed['episodes'])
        while ('paged_feed_next' in self.parsed
               and page < self.PAGED_FEED_MAX_PAGES
               and remaining_episodes > 0):
            # Take the next page from the paged feed
            url = self.parsed['paged_feed_next']
            del self.parsed['paged_feed_next']

            if not url:
                break

            try:
                logger.debug('Downloading page %d from %s', page, url)
                stream = util.urlopen(url)
                parsed = podcastparser.parse(url, stream, remaining_episodes)
                added_episodes = len(parsed['episodes'])
                remaining_episodes -= added_episodes
                logger.debug('Page %d contains %d additional episodes', page,
                             added_episodes)
                self.parsed['episodes'].extend(parsed['episodes'])

                # Next iteration if we still have a next page
                if 'paged_feed_next' in parsed:
                    self.parsed['paged_feed_next'] = parsed['paged_feed_next']
            except Exception as e:
                logger.warn('Error while fetching feed page %d from %s: %s',
                            page, url, e)
                # Give up, don't try to download additional pages here
                break

            page += 1
Example 23
def get_real_download_url(url):
    quality = 'sd'
    codecs = 'H264,VP8,VP6'

    video_id = get_vimeo_id(url)

    if video_id is None:
        return url

    web_url = 'http://vimeo.com/%s' % video_id
    web_data = util.urlopen(web_url).read()
    sig_pair = SIGNATURE_RE.search(web_data)

    if sig_pair is None:
        raise VimeoError('Cannot get signature pair from Vimeo')

    timestamp, signature = sig_pair.groups()
    params = '&'.join('%s=%s' % i for i in [
        ('clip_id', video_id),
        ('sig', signature),
        ('time', timestamp),
        ('quality', quality),
        ('codecs', codecs),
        ('type', 'moogaloop_local'),
        ('embed_location', ''),
    ])
    player_url = 'http://player.vimeo.com/play_redirect?%s' % params
    return player_url
Example 24
def get_escapist_web(video_id):
    if video_id is None:
        return None

    # FIXME: must check if it's utf-8
    web_url = 'http://www.escapistmagazine.com/videos/view/%s' % video_id
    return util.urlopen(web_url).read()
Example 25
def find_youtube_channels(string):
    # FIXME: Make proper use of the YouTube API instead
    # of screen-scraping the YouTube website
    url = 'http://www.youtube.com/results?search_query='+ urllib.quote(string, '') +'&search_type=search_users&aq=f'

    r = re.compile(r'>\s+<')
    data = r.sub('><', util.urlopen(url).read())

    r1 = re.compile('<a href="/user/([^"?]+)[^"]+"[^>]*>([^<]+)</a>')
    m1 = r1.findall(data)

    r2 = re.compile(r'\s+')

    class FakeImporter(object):
        def __init__(self):
            self.items = []

    result = FakeImporter()
    found_users = []
    for name, title in m1:
        if name not in found_users:
            found_users.append(name)
            link = 'http://www.youtube.com/rss/user/'+ name +'/videos.rss'
            result.items.append({'title': name, 'url': link, 'description': title})

    return result
Example 26
    def get_cover(self,
                  filename,
                  cover_url,
                  feed_url,
                  title,
                  username=None,
                  password=None,
                  download=False):
        # Detection of "all episodes" podcast
        if filename == self.ALL_EPISODES_ID:
            return self.get_cover_all_episodes()

        # Return already existing files
        for extension in self.EXTENSIONS:
            if os.path.exists(filename + extension):
                return filename + extension

        # If allowed to download files, do so here
        if download:
            # YouTube-specific cover art image resolver
            youtube_cover_url = youtube.get_real_cover(feed_url)
            if youtube_cover_url is not None:
                cover_url = youtube_cover_url

            if not cover_url:
                return self._fallback_filename(title)

            # We have to add username/password, because password-protected
            # feeds might keep their cover art also protected (bug 1521)
            if username is not None and password is not None:
                cover_url = util.url_add_authentication(
                    cover_url, username, password)

            try:
                logger.info('Downloading cover art: %s', cover_url)
                data = util.urlopen(cover_url, timeout=self.TIMEOUT).read()
            except Exception, e:
                logger.warn('Cover art download failed: %s', e)
                return self._fallback_filename(title)

            try:
                extension = None

                for filetype, check in self.SUPPORTED_EXTENSIONS.items():
                    if check(data):
                        extension = filetype
                        break

                if extension is None:
                    msg = 'Unknown file type: %s (%r)' % (cover_url, data[:6])
                    raise ValueError(msg)

                # Successfully downloaded the cover art - save it!
                fp = open(filename + extension, 'wb')
                fp.write(data)
                fp.close()

                return filename + extension
            except Exception, e:
                logger.warn('Cannot save cover art', exc_info=True)
Example 27
def itunes_feed_handler(channel, max_episodes, config):
    m = re.match(r'https?://itunes.apple.com/(?:[^/]*/)?podcast/.+$',
                 channel.url, re.I)
    if m is None:
        return None

    logger.debug('Detected iTunes feed.')
    version = ITUNES_DEFAULT_VERSION
    headers = {'User-agent': 'iTunes/{}'.format(version)}
    try:
        data = util.urlopen(channel.url, headers).read().decode('utf-8')
        m = re.search(ITUNES_FEEDURL_RE[version], data)
        if m is None:
            raise ITunesFeedException(
                'Could not resolve real feed URL from iTunes feed.')

        url = m.group(1)
        logger.info('Resolved iTunes feed URL: {} -> {}'.format(
            channel.url, url))
        channel.url = url

        # Delegate further processing of the feed to the normal podcast parser
        # by returning None (will try the next handler in the resolver chain)
        return None
    except Exception as ex:
        logger.warn('Cannot resolve iTunes feed: {}'.format(str(ex)))
        raise
Example 28
def get_escapist_web(video_id):
    if video_id is None:
        return None

    # FIXME: must check if it's utf-8
    web_url = 'http://www.escapistmagazine.com/videos/view/%s' % video_id
    return util.urlopen(web_url).read()
Example 29
def get_real_download_url(url):
    video_id = get_escapist_id(url)
    if video_id is None:
        return url

    web_data = get_escapist_web(video_id)

    data_config_frag = DATA_CONFIG_RE.search(web_data)

    data_config_url = get_escapist_config_url(data_config_frag.group(1))

    if data_config_url is None:
        raise EscapistError('Cannot parse configuration from the site')

    logger.debug('Config URL: %s', data_config_url)

    data_config_data = util.urlopen(data_config_url).read().decode('utf-8')

    # TODO: This second argument should get a real name
    real_url = get_escapist_real_url(data_config_data,
                                     data_config_frag.group(1))

    if real_url is None:
        raise EscapistError('Cannot get MP4 URL from The Escapist')
    elif "sales-marketing/" in real_url:
        raise EscapistError(
            'Oops, seems The Escapist blocked this IP. Wait a few days/weeks to get it unblocked'
        )
    else:
        return real_url
Example 30
    def fetch(self,
              url,
              etag=None,
              modified=None,
              autodiscovery=True,
              **kwargs):
        """ use kwargs to pass extra data to parse_feed in Fetcher subclasses """
        # handle local file first
        if url.startswith('file://'):
            url = url[len('file://'):]
            stream = open(url)
            return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs)

        # remote feed
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        stream = util.urlopen(url, headers)

        responses = stream.history + [stream]
        for i, resp in enumerate(responses):
            if resp.is_permanent_redirect:
                # there should always be a next response when a redirect is encountered
                # If max redirects is reached, TooManyRedirects is raised
                # TODO: since we've got the end contents anyway, modify model.py to accept contents on NEW_LOCATION
                return Result(NEW_LOCATION, responses[i + 1].url)
        res = self._check_statuscode(stream.status_code, stream.url)
        if res == NOT_MODIFIED:
            return Result(NOT_MODIFIED, stream.url)

        if autodiscovery and stream.headers.get('content-type',
                                                '').startswith('text/html'):
            ad = FeedAutodiscovery(url)
            # response_text() will assume utf-8 if no charset specified
            ad.feed(util.response_text(stream))
            if ad._resolved_url and ad._resolved_url != url:
                try:
                    self.fetch(ad._resolved_url,
                               etag=None,
                               modified=None,
                               autodiscovery=False,
                               **kwargs)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception as e:
                    logger.warn('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

        # xml documents specify the encoding inline so better pass encoded body.
        # Especially since requests will use ISO-8859-1 for content-type 'text/xml'
        # if the server doesn't specify a charset.
        return self.parse_feed(url, BytesIO(stream.content), stream.headers,
                               UPDATED_FEED, **kwargs)
Example 31
    def get_urls(data_config_url):
        data_config = util.urlopen(data_config_url).json()
        for fileinfo in list(data_config['request']['files'].values()):
            if not isinstance(fileinfo, list):
                continue

            for item in fileinfo:
                yield (item['quality'], item['url'])
Example 32
    def get_tracks(self, feed):
        """Get a generator of tracks from a SC user

        The generator will give you a dictionary for every
        track it can find for its user."""
        global CONSUMER_KEY
        try:
            json_url = 'https://api.soundcloud.com/users/%(user)s/%(feed)s.json?filter=downloadable&consumer_key=%(consumer_key)s&limit=200' \
                    % { "user":self.get_user_id(), "feed":feed, "consumer_key": CONSUMER_KEY }
            logger.debug("loading %s", json_url)

            json_tracks = json.loads(
                util.urlopen(json_url).read().decode('utf-8'))
            tracks = [track for track in json_tracks if track['downloadable']]
            total_count = len(tracks) + len(
                [track for track in json_tracks if not track['downloadable']])

            if len(tracks) == 0 and total_count > 0:
                logger.warn("Download of all %i %s of user %s is disabled" %
                            (total_count, feed, self.username))
            else:
                logger.info("%i/%i downloadable tracks for user %s %s feed" %
                            (len(tracks), total_count, self.username, feed))

            for track in tracks:
                # Prefer stream URL (MP3), fallback to download URL
                url = track.get('stream_url', track['download_url']) + \
                    '?consumer_key=%(consumer_key)s' \
                    % { 'consumer_key': CONSUMER_KEY }
                if url not in self.cache:
                    try:
                        self.cache[url] = get_metadata(url)
                    except:
                        continue
                filesize, filetype, filename = self.cache[url]

                yield {
                    'title':
                    track.get('title', track.get('permalink'))
                    or _('Unknown track'),
                    'link':
                    track.get('permalink_url')
                    or 'https://soundcloud.com/' + self.username,
                    'description':
                    track.get('description') or _('No description available'),
                    'url':
                    url,
                    'file_size':
                    int(filesize),
                    'mime_type':
                    filetype,
                    'guid':
                    track.get('permalink', track.get('id')),
                    'published':
                    soundcloud_parsedate(track.get('created_at', None)),
                }
        finally:
            self.commit_cache()
Example 33
def get_channels_for_user(username, api_key_v3):
    stream = util.urlopen(
        '{0}/channels?forUsername={1}&part=id&key={2}'.format(
            V3_API_ENDPOINT, username, api_key_v3))
    data = json.loads(stream.read().decode('utf-8'))
    return [
        '{0}?channel_id={1}'.format(CHANNEL_VIDEOS_XML, item['id'])
        for item in data['items']
    ]
Example 34
    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in list(data_config['request']['files'].values()):
            if not isinstance(fileinfo, list):
                continue

            for item in fileinfo:
                yield (item['quality'], item['url'])
Example 35
def youtube_get_old_endpoint(vid):
    # TODO: changing 'detailpage' to 'embedded' allows age-restricted content
    url = 'https://www.youtube.com/get_video_info?html5=1&c=TVHTML5&cver=6.20180913&el=detailpage&video_id=' + vid
    r = util.urlopen(url)
    if not r.ok:
        raise YouTubeError('Youtube "%s": %d %s' %
                           (url, r.status_code, r.reason))
    else:
        return r.text, None
Example 36
    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in data_config['request']['files'].values():
            if not isinstance(fileinfo, list):
                continue

            for item in fileinfo:
                yield (item['quality'], item['url'])
Example 37
    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        if url.startswith('file://'):
            is_local = True
            url = url[len('file://'):]
            stream = open(url)
        else:
            is_local = False
            try:
                stream = util.urlopen(url, headers)
            except HTTPError as e:
                return self._check_statuscode(e, e.geturl())

        data = stream
        if autodiscovery and not is_local and stream.headers.get(
                'content-type', '').startswith('text/html'):
            # Not very robust attempt to detect encoding: http://stackoverflow.com/a/1495675/1072626
            charset = stream.headers.get_param('charset')
            if charset is None:
                charset = 'utf-8'  # utf-8 appears hard-coded elsewhere in this codebase

            # We use StringIO in case the stream needs to be read again
            data = StringIO(stream.read().decode(charset))
            ad = FeedAutodiscovery(url)

            ad.feed(data.getvalue())
            if ad._resolved_url:
                try:
                    self._parse_feed(ad._resolved_url, None, None, False)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception as e:
                    logger.warn('Feed autodiscovery failed', exc_info=True)

                # Second, try to resolve the URL
                url = self._resolve_url(url)
                if url:
                    return Result(NEW_LOCATION, url)

            # Reset the stream so podcastparser can give it a go
            data.seek(0)

        try:
            feed = podcastparser.parse(url, data)
        except ValueError as e:
            raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

        if is_local:
            feed['headers'] = {}
            return Result(UPDATED_FEED, feed)
        else:
            feed['headers'] = stream.headers
            return self._check_statuscode(stream, feed)
Example 38
    def get_cover(self, filename, cover_url, feed_url, title,
            username=None, password=None, download=False):
        # Detection of "all episodes" podcast
        if filename == self.ALL_EPISODES_ID:
            return self.get_cover_all_episodes()

        # Return already existing files
        for extension in self.EXTENSIONS:
            if os.path.exists(filename + extension):
                return filename + extension

        # If allowed to download files, do so here
        if download:
            # YouTube-specific cover art image resolver
            youtube_cover_url = youtube.get_cover(feed_url)
            if youtube_cover_url is not None:
                cover_url = youtube_cover_url

            if not cover_url:
                return self._fallback_filename(title)

            # We have to add username/password, because password-protected
            # feeds might keep their cover art also protected (bug 1521)
            if username is not None and password is not None:
                cover_url = util.url_add_authentication(cover_url,
                        username, password)

            try:
                logger.info('Downloading cover art: %s', cover_url)
                data = util.urlopen(cover_url, timeout=self.TIMEOUT).read()
            except Exception as e:
                logger.warn('Cover art download failed: %s', e)
                return self._fallback_filename(title)

            try:
                extension = None

                for filetype, check in list(self.SUPPORTED_EXTENSIONS.items()):
                    if check(data):
                        extension = filetype
                        break

                if extension is None:
                    msg = 'Unknown file type: %s (%r)' % (cover_url, data[:6])
                    raise ValueError(msg)

                # Successfully downloaded the cover art - save it!
                fp = open(filename + extension, 'wb')
                fp.write(data)
                fp.close()

                return filename + extension
            except Exception as e:
                logger.warn('Cannot save cover art', exc_info=True)

        # Fallback to cover art based on the podcast title
        return self._fallback_filename(title)
Example 39
    def return_user_cover(url, channel):
        api_url = 'http://gdata.youtube.com/feeds/api/users/{}?v=2'.format(channel)
        data = util.urlopen(api_url).read()
        m = re.search('<media:thumbnail url=[\'"]([^\'"]+)[\'"]/>', data)
        if m is not None:
            logger.debug('YouTube userpic for %s is: %s', url, m.group(1))
            return m.group(1)

        return None
Example 40
def get_cover(url):
    if 'youtube.com' in url:

        class YouTubeHTMLCoverParser(HTMLParser):
            """This custom html parser searches for the youtube channel thumbnail/avatar"""
            def __init__(self):
                super().__init__()
                self.url = ""

            def handle_starttag(self, tag, attributes):
                attribute_dict = {
                    attribute[0]: attribute[1]
                    for attribute in attributes
                }

                # Look for 900x900px image first.
                if tag == 'link' \
                        and 'rel' in attribute_dict \
                        and attribute_dict['rel'] == 'image_src':
                    self.url = attribute_dict['href']

                # Fallback to image that may only be 100x100px.
                elif tag == 'img' \
                        and 'class' in attribute_dict \
                        and attribute_dict['class'] == "channel-header-profile-image":
                    self.url = attribute_dict['src']

        try:
            raw_xml_data = util.urlopen(url).read().decode('utf-8')
            xml_data = xml.etree.ElementTree.fromstring(raw_xml_data)
            channel_id = xml_data.find(
                "{http://www.youtube.com/xml/schemas/2015}channelId").text
            channel_url = 'https://www.youtube.com/channel/{}'.format(
                channel_id)
            html_data = util.urlopen(channel_url).read().decode('utf-8')
            parser = YouTubeHTMLCoverParser()
            parser.feed(html_data)
            if parser.url:
                logger.debug('Youtube cover art for {} is: {}'.format(
                    url, parser.url))
                return parser.url

        except Exception:
            logger.warning('Could not retrieve cover art', exc_info=True)
Example 41
    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        if url.startswith('file://'):
            is_local = True
            url = url[len('file://'):]
            stream = open(url)
        else:
            is_local = False
            try:
                stream = util.urlopen(url, headers)
            except HTTPError as e:
                return self._check_statuscode(e, e.geturl())

        data = stream
        if autodiscovery and not is_local and stream.headers.get('content-type', '').startswith('text/html'):
            # Not very robust attempt to detect encoding: http://stackoverflow.com/a/1495675/1072626
            charset = stream.headers.get_param('charset')
            if charset is None:
                charset = 'utf-8'  # utf-8 appears hard-coded elsewhere in this codebase

            # We use StringIO in case the stream needs to be read again
            data = StringIO(stream.read().decode(charset))
            ad = FeedAutodiscovery(url)

            ad.feed(data.getvalue())
            if ad._resolved_url:
                try:
                    self._parse_feed(ad._resolved_url, None, None, False)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception as e:
                    logger.warn('Feed autodiscovery failed', exc_info=True)

                # Second, try to resolve the URL
                url = self._resolve_url(url)
                if url:
                    return Result(NEW_LOCATION, url)

            # Reset the stream so podcastparser can give it a go
            data.seek(0)

        try:
            feed = podcastparser.parse(url, data)
        except ValueError as e:
            raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

        if is_local:
            feed['headers'] = {}
            return Result(UPDATED_FEED, feed)
        else:
            feed['headers'] = stream.headers
            return self._check_statuscode(stream, feed)
Example 42
    def get_user_info(self):
        global CONSUMER_KEY
        key = ':'.join((self.username, 'user_info'))

        if key not in self.cache:
            json_url = 'https://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
            logger.debug('get_user_info url: %s', json_url)
            user_info = json.loads(util.urlopen(json_url).read().decode('utf-8'))
            self.cache[key] = user_info

        return self.cache[key]
Example 43
def get_channel_id_url(url):
    if 'youtube.com' in url:
        try:
            channel_url = ''
            raw_xml_data = util.urlopen(url).read().decode('utf-8')
            xml_data = xml.etree.ElementTree.fromstring(raw_xml_data)
            channel_id = xml_data.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
            channel_url = 'https://www.youtube.com/channel/{}'.format(channel_id)
            return channel_url

        except Exception:
            logger.warning('Could not retrieve youtube channel id.', exc_info=True)
Example 44
    def get_urls(data_config_url):
        data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
        data_config = json.loads(data_config_data)
        for fileinfo in data_config['request']['files'].values():
            if not isinstance(fileinfo, dict):
                continue

            for fileformat, keys in fileinfo.items():
                if not isinstance(keys, dict):
                    continue

                yield (fileformat, keys['url'])
Example 45
def get_real_cover(url):
    rss_url = get_real_channel_url(url)
    if rss_url is None:
        return None

    rss_data = util.urlopen(rss_url).read()
    rss_data_frag = DATA_COVERART_RE.search(rss_data)

    if rss_data_frag is None:
        return None

    return rss_data_frag.group(1)
Example 46
    def on_search(self, query):
        search = util.urlopen(
            f'https://api-v4.audionow.de/api/v4/search.json?q=*{query}*&page=1'
        ).json()
        to_return = []
        for result in search["data"]:
            to_return.append(
                directory.DirectoryEntry(
                    result["title"],
                    f"https://audionow.de/podcast/{result['uid']}",
                    description=result["subtitle"]))
        return to_return
Example 47
    def get_cover(self, podcast, download=False):
        filename = podcast.cover_file
        cover_url = podcast.cover_url

        # Return already existing files
        for extension in self.EXTENSIONS:
            if os.path.exists(filename + extension):
                return filename + extension

        # If allowed to download files, do so here
        if download:
            # YouTube-specific cover art image resolver
            youtube_cover_url = youtube.get_real_cover(podcast.url)
            if youtube_cover_url is not None:
                cover_url = youtube_cover_url

            if not cover_url:
                return None

            # We have to add username/password, because password-protected
            # feeds might keep their cover art also protected (bug 1521)
            cover_url = util.url_add_authentication(cover_url,
                    podcast.auth_username, podcast.auth_password)

            try:
                logger.info('Downloading cover art: %s', cover_url)
                data = util.urlopen(cover_url, timeout=self.TIMEOUT).read()
            except Exception as e:
                logger.warn('Cover art download failed: %s', e)
                return None

            try:
                extension = None

                for filetype, check in list(self.SUPPORTED_EXTENSIONS.items()):
                    if check(data):
                        extension = filetype
                        break

                if extension is None:
                    msg = 'Unknown file type: %s (%r)' % (cover_url, data[:6])
                    raise ValueError(msg)

                # Successfully downloaded the cover art - save it!
                fp = open(filename + extension, 'wb')
                fp.write(data)
                fp.close()

                return filename + extension
            except Exception as e:
                logger.warn('Cannot save cover art', exc_info=True)

        return None
Example 48
def get_real_cover(url):
    rss_url = get_real_channel_url(url)
    if rss_url is None:
        return None

    # FIXME: can I be sure to decode it as utf-8?
    rss_data = util.urlopen(rss_url).read()
    rss_data_frag = DATA_COVERART_RE.search(rss_data)

    if rss_data_frag is None:
        return None

    return rss_data_frag.group(1)
Example 49
    def get_tracks(self, feed):
        """Get a generator of tracks from a SC user

        The generator will give you a dictionary for every
        track it can find for its user."""
        global CONSUMER_KEY
        try:
            json_url = ('https://api.soundcloud.com/users/%(user)s/%(feed)s.'
                        'json?filter=downloadable&consumer_key=%'
                        '(consumer_key)s&limit=200'
                        % {"user": self.get_user_id(),
                           "feed": feed,
                           "consumer_key": CONSUMER_KEY})
            logger.debug("loading %s", json_url)

            json_tracks = json.loads(util.urlopen(json_url).read().decode('utf-8'))
            tracks = [track for track in json_tracks if track['downloadable']]
            total_count = len(tracks) + len([track for track in json_tracks
                                             if not track['downloadable']])

            if len(tracks) == 0 and total_count > 0:
                logger.warn("Download of all %i %s of user %s is disabled" %
                            (total_count, feed, self.username))
            else:
                logger.info("%i/%i downloadable tracks for user %s %s feed" %
                            (len(tracks), total_count, self.username, feed))

            for track in tracks:
                # Prefer stream URL (MP3), fallback to download URL
                url = track.get('stream_url', track['download_url']) + \
                    '?consumer_key=%(consumer_key)s' \
                    % {'consumer_key': CONSUMER_KEY}
                if url not in self.cache:
                    try:
                        self.cache[url] = get_metadata(url)
                    except:
                        continue
                filesize, filetype, filename = self.cache[url]

                yield {
                    'title': track.get('title', track.get('permalink')) or _('Unknown track'),
                    'link': track.get('permalink_url') or 'https://soundcloud.com/' + self.username,
                    'description': track.get('description') or _('No description available'),
                    'url': url,
                    'file_size': int(filesize),
                    'mime_type': filetype,
                    'guid': track.get('permalink', track.get('id')),
                    'published': soundcloud_parsedate(track.get('created_at', None)),
                }
        finally:
            self.commit_cache()
Example 50
def youtube_resolve_cover_art(podcast):
    url = podcast.url
    r = re.compile(r'http://www\.youtube\.com/rss/user/([^/]+)/videos\.rss', re.IGNORECASE)
    m = r.match(url)

    if m is not None:
        username = m.group(1)
        api_url = 'http://gdata.youtube.com/feeds/api/users/%s?v=2' % username
        data = util.urlopen(api_url).read().decode('utf-8', 'ignore')
        match = re.search('<media:thumbnail url=[\'"]([^\'"]+)[\'"]/>', data)
        if match is not None:
            return match.group(1)

    return None
Example 51
    def return_user_cover(url, channel):
        try:
            api_url = 'https://www.youtube.com/channel/{0}'.format(channel)
            data = util.urlopen(api_url).read()
            m = re.search('<img class="channel-header-profile-image"[^>]* src=[\'"]([^\'"]+)[\'"][^>]*>', data)
            if m is not None:
                logger.debug('YouTube userpic for %s is: %s', url, m.group(1))
                return m.group(1)
        except Exception as e:
            logger.warn('Could not retrieve cover art', exc_info=True)
            return None


        return None
Example 52
    def get_user_info(self):
        global CONSUMER_KEY
        key = ':'.join((self.username, 'user_info'))
        if key in self.cache:
            return self.cache[key]

        try:
            json_url = 'https://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
            user_info = json.loads(util.urlopen(json_url).read().decode('utf-8'))
            self.cache[key] = user_info
        finally:
            self.commit_cache()

        return user_info
Example 53
def get_real_cover(url):
    r = re.compile(r'http://www\.youtube\.com/rss/user/([^/]+)/videos\.rss', re.IGNORECASE)
    m = r.match(url)

    if m is not None:
        username = m.group(1)
        api_url = 'http://gdata.youtube.com/feeds/api/users/%s?v=2' % username
        data = util.urlopen(api_url).read().decode('utf-8', 'ignore')
        match = re.search('<media:thumbnail url=[\'"]([^\'"]+)[\'"]/>', data)
        if match is not None:
            logger.debug('YouTube userpic for %s is: %s', url, match.group(1))
            return match.group(1)

    return None
Example 54
def get_metadata(url):
    """Get file download metadata

    Returns a (size, type, name) from the given download
    URL. Will use the network connection to determine the
    metadata via the HTTP header fields.
    """
    track_fp = util.urlopen(url)
    headers = track_fp.info()
    filesize = headers['content-length'] or '0'
    filetype = headers['content-type'] or 'application/octet-stream'
    headers_s = '\n'.join('%s:%s'%(k,v) for k, v in headers.items())
    filename = get_param(headers_s) or os.path.basename(os.path.dirname(url))
    track_fp.close()
    return filesize, filetype, filename
Example 55
    def on_search(self, query):
        url = 'http://gdata.youtube.com/feeds/api/videos?alt=json&q=%s' % urllib.quote(query)
        data = json.load(util.urlopen(url))

        result = []

        seen_users = set()
        for entry in data['feed']['entry']:
            user = os.path.basename(entry['author'][0]['uri']['$t'])
            title = entry['title']['$t']
            url = 'http://www.youtube.com/rss/user/%s/videos.rss' % user
            if user not in seen_users:
                result.append(DirectoryEntry(user, url))
                seen_users.add(user)

        return result
Example 56
    def request(self, url, data=None):
        headers = {'Content-Type': 'application/json'}

        if url == self.OAUTH_TOKEN_URL:
            # Inject username and password into the request URL
            url = util.url_add_authentication(url, self.KEY, self.SECRET)
        elif self._config.token:
            headers['Authorization'] = 'Bearer ' + self._config.token

        if data is not None:
            data = json.dumps(data)

        try:
            response = util.urlopen(url, headers, data)
        except urllib2.HTTPError, error:
            return {'_gpodder_statuscode': error.getcode()}
Example 57
    def get_coverart(self):
        global CONSUMER_KEY
        key = ':'.join((self.username, 'avatar_url'))
        if key in self.cache:
            return self.cache[key]

        image = None
        try:
            json_url = 'http://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
            user_info = json.load(util.urlopen(json_url))
            image = user_info.get('avatar_url', None)
            self.cache[key] = image
        finally:
            self.commit_cache()

        return image
Example 58
    def __init__(self, url):
        """
        Parses the OPML feed from the given URL into
        a local data structure containing channel metadata.
        """
        self.items = []
        try:
            if os.path.exists(url):
                doc = xml.dom.minidom.parse(url)
            else:
                # FIXME: is it ok to pass bytes to parseString?
                doc = xml.dom.minidom.parseString(util.urlopen(url).read())

            for outline in doc.getElementsByTagName('outline'):
                # Make sure we are dealing with a valid link type (ignore case)
                otl_type = outline.getAttribute('type')
                if otl_type is None or otl_type.lower() not in self.VALID_TYPES:
                    continue

                if outline.getAttribute('xmlUrl') or outline.getAttribute('url'):
                    channel = {
                        'url':
                            outline.getAttribute('xmlUrl')
                            or outline.getAttribute('url'),
                        'title':
                            outline.getAttribute('title')
                            or outline.getAttribute('text')
                            or outline.getAttribute('xmlUrl')
                            or outline.getAttribute('url'),
                        'description':
                            outline.getAttribute('text')
                            or outline.getAttribute('xmlUrl')
                            or outline.getAttribute('url'),
                    }

                    if channel['description'] == channel['title']:
                        channel['description'] = channel['url']

                    for attr in ('url', 'title', 'description'):
                        channel[attr] = channel[attr].strip()

                    self.items.append(channel)
            if not len(self.items):
                logger.info('OPML import finished, but no items found: %s', url)
        except:
            logger.error('Cannot import OPML from URL: %s', url, exc_info=True)
Example 59
    def return_user_cover(url, channel):
        try:
            api_url = 'https://www.youtube.com/channel/{0}'.format(channel)
            data = util.urlopen(api_url).read()
            # Look for 900x900px image first.
            m = re.search('<link rel="image_src"[^>]* href=[\'"]([^\'"]+)[\'"][^>]*>', data)
            if m is None:
                # Fallback to image that may only be 100x100px.
                m = re.search('<img class="channel-header-profile-image"[^>]* src=[\'"]([^\'"]+)[\'"][^>]*>', data)
            if m is not None:
                logger.debug('YouTube userpic for %s is: %s', url, m.group(1))
                return m.group(1)
        except Exception as e:
            logger.warn('Could not retrieve cover art', exc_info=True)
            return None

        return None