# Example #1
    def fetch(self,
              url,
              etag=None,
              modified=None,
              autodiscovery=True,
              **kwargs):
        """Fetch the feed at *url* and hand it to parse_feed().

        Extra keyword arguments are forwarded to parse_feed() so Fetcher
        subclasses can receive additional data.

        etag/modified are used for HTTP conditional GET; when the server
        answers 304, a NOT_MODIFIED Result is returned instead of re-parsing.
        When *autodiscovery* is true and the response is HTML, feed
        autodiscovery is attempted and a NEW_LOCATION Result may be returned.
        """
        # Local files are parsed directly; no caching headers apply.
        if url.startswith('file://'):
            url = url[len('file://'):]
            # NOTE(review): this handle is passed to parse_feed() and never
            # closed here — presumably parse_feed consumes it; confirm.
            stream = open(url)
            return self.parse_feed(url, stream, {}, UPDATED_FEED, **kwargs)

        # Remote feed: send conditional-GET headers when we have them.
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        stream = util.urlopen(url, headers)

        # A permanent redirect anywhere in the chain is reported as a new
        # location so the caller can update the stored feed URL.
        responses = stream.history + [stream]
        for i, resp in enumerate(responses):
            if resp.is_permanent_redirect:
                # there should always be a next response when a redirect is encountered
                # If max redirects is reached, TooManyRedirects is raised
                # TODO: since we've got the end contents anyway, modify model.py to accept contents on NEW_LOCATION
                return Result(NEW_LOCATION, responses[i + 1].url)

        res = self._check_statuscode(stream.status_code, stream.url)
        if res == NOT_MODIFIED:
            return Result(NOT_MODIFIED, stream.url)

        # An HTML response is not a feed: try autodiscovery, then URL
        # resolution, before falling through to parsing it anyway.
        if autodiscovery and stream.headers.get('content-type',
                                                '').startswith('text/html'):
            ad = FeedAutodiscovery(url)
            # response_text() will assume utf-8 if no charset specified
            ad.feed(util.response_text(stream))
            if ad._resolved_url and ad._resolved_url != url:
                try:
                    # Verify the discovered URL actually fetches before
                    # announcing it; the fetched result is discarded.
                    self.fetch(ad._resolved_url,
                               etag=None,
                               modified=None,
                               autodiscovery=False,
                               **kwargs)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception:
                    # logger.warn() is a deprecated alias of warning()
                    logger.warning('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

        # xml documents specify the encoding inline so better pass encoded body.
        # Especially since requests will use ISO-8859-1 for content-type 'text/xml'
        # if the server doesn't specify a charset.
        return self.parse_feed(url, BytesIO(stream.content), stream.headers,
                               UPDATED_FEED, **kwargs)
# Example #2
def get_channel_desc(url, feed_data=None):
    """Return the description of the YouTube channel behind *url*.

    Returns the text of the channel page's <meta name="description"> tag,
    a translated 'No description available' placeholder when the page has
    none, or None when *url* is not a youtube.com URL or retrieval fails
    (the failure is logged, not raised).
    """
    if 'youtube.com' in url:

        class YouTubeHTMLDesc(HTMLParser):
            """This custom html parser searches for the YouTube channel description."""
            def __init__(self):
                super().__init__()
                self.description = ''

            def handle_starttag(self, tag, attributes):
                attribute_dict = {
                    attribute[0]: attribute[1]
                    for attribute in attributes
                }

                # Get YouTube channel description.
                if tag == 'meta' \
                        and 'name' in attribute_dict \
                        and attribute_dict['name'] == "description":
                    self.description = attribute_dict['content']

        try:
            channel_url = get_channel_id_url(url, feed_data)
            r = util.urlopen(channel_url)
            if not r.ok:
                raise YouTubeError('Youtube "%s": %d %s' %
                                   (url, r.status_code, r.reason))
            html_data = util.response_text(r)
            parser = YouTubeHTMLDesc()
            parser.feed(html_data)
            if parser.description:
                logger.debug('YouTube description for %s is: %s', url,
                             parser.description)
                return parser.description
            else:
                logger.debug('YouTube description for %s is not provided.',
                             url)
                return _('No description available')

        except Exception:
            # Lazy %-args: the message is only built if the record is emitted.
            logger.warning(
                'Could not retrieve YouTube channel description for %s.',
                url, exc_info=True)
# Example #3
def get_cover(url, feed_data=None):
    """Return a cover-art URL for the YouTube channel behind *url*.

    Prefers the page's 'image_src' link (large thumbnail) and falls back to
    the channel-header profile image. Returns None when *url* is not a
    youtube.com URL, no image is found, or retrieval fails (the failure is
    logged, not raised).
    """
    if 'youtube.com' in url:

        class YouTubeHTMLCoverParser(HTMLParser):
            """This custom html parser searches for the youtube channel thumbnail/avatar"""
            def __init__(self):
                super().__init__()
                self.url = []

            def handle_starttag(self, tag, attributes):
                attribute_dict = {
                    attribute[0]: attribute[1]
                    for attribute in attributes
                }

                # Look for 900x900px image first.
                if tag == 'link' \
                        and 'rel' in attribute_dict \
                        and attribute_dict['rel'] == 'image_src':
                    self.url.append(attribute_dict['href'])

                # Fallback to image that may only be 100x100px.
                elif tag == 'img' \
                        and 'class' in attribute_dict \
                        and attribute_dict['class'] == "channel-header-profile-image":
                    self.url.append(attribute_dict['src'])

        try:
            channel_url = get_channel_id_url(url, feed_data)
            r = util.urlopen(channel_url)
            if not r.ok:
                raise YouTubeError('Youtube "%s": %d %s' %
                                   (url, r.status_code, r.reason))
            html_data = util.response_text(r)
            parser = YouTubeHTMLCoverParser()
            parser.feed(html_data)
            if parser.url:
                # Lazy %-args instead of eager str.format().
                logger.debug('Youtube cover art for %s is: %s', url,
                             parser.url)
                # Candidates are appended in preference order; take the best.
                return parser.url[0]

        except Exception:
            # Include the URL for parity with get_channel_desc().
            logger.warning('Could not retrieve cover art for %s.', url,
                           exc_info=True)