Example #1
    def __init__(self, *args, **kwargs):
        super(AdvancedFeed, self).__init__(*args, **kwargs)
        if not self.is_available():
            if oauth_hook is None:
                raise UnhandledFeed(u"{0} requires requests-oauth.".format(
                                    self.__class__.__name__))
            else:
                raise UnhandledFeed(u"{0} requires API keys.".format(
                                    self.__class__.__name__))
Example #2
    def __init__(self, *args, **kwargs):
        # SimpleFeed's __init__ just warns us that it only returns 60
        # results; we don't need that warning, so binding super() to
        # SimpleFeed itself skips over SimpleFeed's own __init__.
        super(SimpleFeed, self).__init__(*args, **kwargs)
        if not self.is_available():
            clsname = self.__class__.__name__
            if oauth_hook is None:
                msg = u"{0} requires requests-oauth.".format(clsname)
                warnings.warn(msg)
                raise UnhandledFeed(msg)
            else:
                raise UnhandledFeed(u"{0} requires API keys.".format(clsname))
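Examples #1 and #2 follow the same pattern: the feed's constructor checks availability up front and raises UnhandledFeed so a caller (or a suite registry) can fall back to something else. A minimal, self-contained sketch of that pattern with illustrative names; in vidscraper itself, oauth_hook comes from an optional requests-oauth import:

class UnhandledFeed(Exception):
    pass

oauth_hook = None  # simulates requests-oauth not being installed


class ExampleFeed(object):
    def __init__(self, url):
        self.url = url
        if not self.is_available():
            if oauth_hook is None:
                raise UnhandledFeed(u"{0} requires requests-oauth.".format(
                                    self.__class__.__name__))
            raise UnhandledFeed(u"{0} requires API keys.".format(
                                self.__class__.__name__))

    def is_available(self):
        # Stand-in availability check; the real feeds test API keys.
        return oauth_hook is not None


try:
    ExampleFeed('http://example.com/feed')
except UnhandledFeed as exc:
    print(exc)  # ExampleFeed requires requests-oauth.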
Example #3
    def get_feed(self,
                 url,
                 last_modified=None,
                 etag=None,
                 start_index=1,
                 max_results=None,
                 video_fields=None,
                 api_keys=None):
        """
        For each registered :mod:`suite <vidscraper.suites>`, calls
        :meth:`~BaseSuite.get_feed` with the given parameters, until a suite
        returns a feed instance.

        :returns: An instance of a specific suite's
                  :attr:`~.BaseSuite.feed_class` with no data loaded.

        :raises: :exc:`.UnhandledFeed` if no registered suites know how
                 to handle this url.

        """
        for suite in self.suites:
            try:
                return suite.get_feed(url,
                                      last_modified=last_modified,
                                      etag=etag,
                                      start_index=start_index,
                                      max_results=max_results,
                                      video_fields=video_fields,
                                      api_keys=api_keys)
            except UnhandledFeed:
                pass
        raise UnhandledFeed(url)
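The loop above is the core dispatch mechanism: each registered suite either returns a feed or raises UnhandledFeed, and the first suite that succeeds wins. A condensed, runnable sketch; the suite names and their URL checks are illustrative, not vidscraper's own:

class UnhandledFeed(Exception):
    pass


class BlipSuite(object):
    def get_feed(self, url, **kwargs):
        if 'blip.tv' not in url:
            raise UnhandledFeed(url)
        return ('blip feed', url)


class VimeoSuite(object):
    def get_feed(self, url, **kwargs):
        if 'vimeo.com' not in url:
            raise UnhandledFeed(url)
        return ('vimeo feed', url)


def get_feed(suites, url, **kwargs):
    for suite in suites:
        try:
            return suite.get_feed(url, **kwargs)
        except UnhandledFeed:
            pass  # try the next suite
    raise UnhandledFeed(url)


suites = [BlipSuite(), VimeoSuite()]
print(get_feed(suites, 'http://vimeo.com/channels/staffpicks'))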
Example #4
    def get_url_data(self, url):
        """
        Parses the url into data which can be used to construct page urls.

        :raises: :exc:`.UnhandledFeed` if the url isn't handled by this feed.

        """
        raise UnhandledFeed(url)
Example #5
    def get_url_data(self, url):
        parsed_url = urlparse.urlsplit(url)
        if parsed_url.scheme in ('http', 'https'):
            if parsed_url.netloc == 'blip.tv':
                match = self.path_re.match(parsed_url.path)
                if match:
                    return match.groupdict()

        raise UnhandledFeed(url)
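Most of the remaining examples are variations on this shape: split the URL, check the scheme and host, then let a compiled regex with named groups pull the interesting pieces out of the path. A standalone sketch; path_re here is an assumed pattern, since the excerpt doesn't show blip.tv's real one:

import re
try:
    import urlparse  # Python 2, as in the examples
except ImportError:
    from urllib import parse as urlparse  # Python 3

# Assumed pattern, matching paths like /<show>/rss.
path_re = re.compile(r'^/(?P<show>[\w-]+)/rss/?$')


def get_url_data(url):
    parsed_url = urlparse.urlsplit(url)
    if parsed_url.scheme in ('http', 'https'):
        if parsed_url.netloc == 'blip.tv':
            match = path_re.match(parsed_url.path)
            if match:
                # Named groups become a dict of page-building data.
                return match.groupdict()
    raise ValueError(url)  # stands in for UnhandledFeed


print(get_url_data('http://blip.tv/some-show/rss'))  # {'show': 'some-show'}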
Example #6
    def get_feed(self, url, *args, **kwargs):
        """
        Returns an instance of :attr:`feed_class`, which should be a subclass
        of :class:`.BaseFeed`.

        :raises: :exc:`.UnhandledFeed` if :attr:`feed_class` is None.

        """
        if self.feed_class is None:
            raise UnhandledFeed(url)
        return self.feed_class(url, *args, **kwargs)
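Example #6 shows the other half of the contract: a suite that supports feeds points feed_class at a BaseFeed subclass, while one that doesn't leaves it as None, so every get_feed call falls through to UnhandledFeed. A small sketch with illustrative names:

class UnhandledFeed(Exception):
    pass


class BaseFeed(object):
    def __init__(self, url):
        self.url = url


class FeedlessSuite(object):
    feed_class = None

    def get_feed(self, url, *args, **kwargs):
        if self.feed_class is None:
            raise UnhandledFeed(url)
        return self.feed_class(url, *args, **kwargs)


class FeedSuite(FeedlessSuite):
    feed_class = BaseFeed


print(FeedSuite().get_feed('http://example.com/feed').url)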
Example #7
    def get_url_data(self, url):
        parsed_url = urlparse.urlsplit(url)
        if parsed_url.scheme in ('http', 'https'):
            if parsed_url.netloc in ('vimeo.com', 'www.vimeo.com'):
                match = self.path_re.match(parsed_url.path)
                if not match:
                    # Only use the api regex as a fallback, since such
                    # URLs are less commonly seen.
                    match = self.api_re.match(parsed_url.path)

                if match:
                    return match.groupdict()
        raise UnhandledFeed(url)
Example #8
    def get_url_data(self, url):
        parsed_url = urlparse.urlsplit(url)
        if (parsed_url.scheme in self.schemes and
                parsed_url.netloc in self.netlocs and
                parsed_url.path == self.path):
            parsed_qs = urlparse.parse_qs(parsed_url.query)
            try:
                return {
                    'partner_id': parsed_qs['partner_id'][0],
                    'subp_id': parsed_qs['subp_id'][0],
                    'playlist_id': parsed_qs['playlist_id'][0],
                }
            except (KeyError, IndexError):
                pass

        raise UnhandledFeed(url)
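The [0] indexing and the (KeyError, IndexError) guard in Example #8 exist because urlparse.parse_qs maps every query parameter to a list of values, and a missing parameter raises KeyError, which the method converts into UnhandledFeed. (The IndexError guard is defensive; by default parse_qs omits blank values, so the lists are never empty.) For instance:

try:
    import urlparse  # Python 2
except ImportError:
    from urllib import parse as urlparse  # Python 3

parsed_qs = urlparse.parse_qs('partner_id=123&subp_id=456&partner_id=999')
print(parsed_qs['partner_id'])     # ['123', '999'] - always a list
print(parsed_qs['partner_id'][0])  # '123'
print('playlist_id' in parsed_qs)  # False -> KeyError when indexed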
Example #9
    def get_url_data(self, url):
        parsed_url = urlparse.urlsplit(url)
        if parsed_url.scheme in ('http', 'https'):
            if parsed_url.netloc in ('youtube.com', 'www.youtube.com'):
                match = self.path_re.match(parsed_url.path)
                if (match and match.group('username')
                        not in self.invalid_usernames):
                    return match.groupdict()

                match = self.old_path_re.match(parsed_url.path)
                if match:
                    parsed_qs = urlparse.parse_qs(parsed_url.query)
                    if 'user' in parsed_qs:
                        username = parsed_qs['user'][0]
                        if username not in self.invalid_usernames:
                            return {'username': username}
            elif parsed_url.netloc == 'gdata.youtube.com':
                match = self.gdata_re.match(parsed_url.path)
                if match:
                    return match.groupdict()

        raise UnhandledFeed(url)
Example #10
    def get_url_data(self, url):
        parsed_url = urlparse.urlsplit(url)
        if parsed_url.scheme in ('http', 'https'):
            if parsed_url.netloc in ('youtube.com', 'www.youtube.com'):
                match = self.path_re.match(parsed_url.path)
                if match:
                    groupdict = match.groupdict()
                    if groupdict['username'] not in self.invalid_usernames:
                        if groupdict['user'] is None:
                            # Some URLs at root are a kind of vanity URL that
                            # doesn't correspond to a username. The only way
                            # to be sure is to actually fetch the page and
                            # check for a canonical url.
                            response = requests.get(url)
                            if response.status_code == 200:
                                strainer = SoupStrainer('link',
                                                        rel='canonical')
                                soup = BeautifulSoup(response.content,
                                                     parse_only=strainer)
                                if soup.link is not None:
                                    return self.get_url_data(soup.link['href'])
                        else:
                            return {'username': groupdict['username']}

                match = self.old_path_re.match(parsed_url.path)
                if match:
                    parsed_qs = urlparse.parse_qs(parsed_url.query)
                    if 'user' in parsed_qs:
                        username = parsed_qs['user'][0]
                        if username not in self.invalid_usernames:
                            return {'username': username}
            elif parsed_url.netloc == 'gdata.youtube.com':
                match = self.gdata_re.match(parsed_url.path)
                if match:
                    return match.groupdict()

        raise UnhandledFeed(url)
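Example #10 adds a network round-trip to Example #9: when the path matches but names no explicit user, it fetches the page and recurses on the canonical link. The SoupStrainer keeps BeautifulSoup from building anything except <link rel="canonical"> tags. An offline sketch of just that lookup, assuming bs4; the HTML string below is a stand-in for a fetched vanity page:

from bs4 import BeautifulSoup, SoupStrainer

html = ('<html><head>'
        '<link rel="canonical" href="http://www.youtube.com/user/example">'
        '</head><body></body></html>')

# Parse only <link rel="canonical"> tags, ignoring the rest of the page.
strainer = SoupStrainer('link', rel='canonical')
soup = BeautifulSoup(html, 'html.parser', parse_only=strainer)
if soup.link is not None:
    print(soup.link['href'])  # http://www.youtube.com/user/example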