Python UrlCanonicalizer Examples

Programming Language: Python

Namespace/Package Name: util

Method/Function: UrlCanonicalizer

Examples at hotexamples.com: 20

Python UrlCanonicalizer - 20 examples found. These are the top rated real world Python examples of util.UrlCanonicalizer extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: instagram.py Project: sheyril/bridgy

class Instagram(browser.BrowserSource):
  """An Instagram account.

  The key name is the username. Instagram usernames may have ASCII letters (case
  insensitive), numbers, periods, and underscores:
  https://stackoverflow.com/questions/15470180
  """
  GR_CLASS = gr_instagram.Instagram
  SHORT_NAME = 'instagram'
  OAUTH_START_HANDLER = oauth_instagram.StartHandler
  URL_CANONICALIZER = util.UrlCanonicalizer(
    domain=GR_CLASS.DOMAIN,
    subdomain='www',
    approve=r'https://www.instagram.com/p/[^/?]+/$',
    trailing_slash=True,
    headers=util.REQUEST_HEADERS)
    # no reject regexp; non-private Instagram post URLs just 404

  # blank granary Instagram object, shared across all instances
  gr_source = gr_instagram.Instagram()

  @classmethod
  def key_id_from_actor(cls, actor):
    """Returns the actor's username field to be used as this entity's key id."""
    return actor['username']

  def silo_url(self):
    """Returns the Instagram account URL, e.g. https://instagram.com/foo."""
    return self.gr_source.user_url(self.key.id())

  def label_name(self):
    """Returns the username."""
    return self.key_id()

Example #2

Show file

    def finish(self, auth_entity, state=None):
        if auth_entity:
            user_json = json.loads(auth_entity.user_json)

            # find instagram profile URL
            urls = user_json.get('rel-me', [])
            logging.info('rel-mes: %s', urls)
            for url in util.trim_nulls(urls):
                if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
                    username = urllib.parse.urlparse(url).path.strip('/')
                    break
            else:
                self.messages.add(
                    'No Instagram profile found. Please <a href="https://indieauth.com/setup">add an Instagram rel-me link</a>, then try again.'
                )
                return self.redirect('/')

            # check that instagram profile links to web site
            try:
                actor = gr_instagram.Instagram(scrape=True).get_actor(
                    username, ignore_rate_limit=True)
            except Exception as e:
                code, _ = util.interpret_http_exception(e)
                if code in Instagram.RATE_LIMIT_HTTP_CODES:
                    self.messages.add(
                        '<a href="https://github.com/snarfed/bridgy/issues/665#issuecomment-524977427">Apologies, Instagram is temporarily blocking us.</a> Please try again later!'
                    )
                    return self.redirect('/')
                else:
                    raise

            if not actor:
                self.messages.add(
                    "Couldn't find Instagram user '%s'. Please check your site's rel-me link and your Instagram account."
                    % username)
                return self.redirect('/')

            canonicalize = util.UrlCanonicalizer(redirects=False)
            website = canonicalize(auth_entity.key.id())
            urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
            logging.info('Looking for %s in %s', website, urls)
            if website not in urls:
                self.messages.add(
                    "Please add %s to your Instagram profile's website or bio field and try again."
                    % website)
                return self.redirect('/')

            # check that the instagram account is public
            if not gr_source.Source.is_public(actor):
                self.messages.add(
                    'Your Instagram account is private. Bridgy only supports public accounts.'
                )
                return self.redirect('/')

        self.maybe_add_or_delete_source(Instagram,
                                        auth_entity,
                                        state,
                                        actor=actor)

Example #3

Show file

class GitHub(Source):
    """A GitHub user.

  The key name is the GitHub username.
  """
    GR_CLASS = gr_github.GitHub
    OAUTH_START_HANDLER = oauth_github.StartHandler
    SHORT_NAME = 'github'
    TYPE_LABELS = {
        'post': 'issue',
        'like': 'star',
    }
    BACKFEED_REQUIRES_SYNDICATION_LINK = True
    DISABLE_HTTP_CODES = Source.DISABLE_HTTP_CODES + ('403', )
    CAN_PUBLISH = True
    URL_CANONICALIZER = util.UrlCanonicalizer(domain=GR_CLASS.DOMAIN,
                                              headers=util.REQUEST_HEADERS,
                                              fragment=True)
    # This makes us backfeed issue/PR comments to previous comments on the same
    # issue/PR.
    IGNORE_SYNDICATION_LINK_FRAGMENTS = True

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a :class:`GitHub` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.github.GitHubAuth`
      kwargs: property values
    """
        user = json_loads(auth_entity.user_json)
        gr_source = gr_github.GitHub(access_token=auth_entity.access_token())
        actor = gr_source.user_to_actor(user)
        return GitHub(id=auth_entity.key_id(),
                      auth_entity=auth_entity.key,
                      name=actor.get('displayName'),
                      picture=actor.get('image', {}).get('url'),
                      url=actor.get('url'),
                      **kwargs)

    def silo_url(self):
        """Returns the GitHub account URL, e.g. https://github.com/foo."""
        return self.gr_source.user_url(self.key_id())

    def label_name(self):
        """Returns the username."""
        return self.key_id()

    def get_activities_response(self, *args, **kwargs):
        """Drop kwargs that granary doesn't currently support for github."""
        kwargs.update({
            'fetch_shares': None,
            'fetch_mentions': None,
        })
        return self.gr_source.get_activities_response(*args, **kwargs)

Example #4

Show file

class FakeSource(Source):
  GR_CLASS = FakeGrSource
  OAUTH_START_HANDLER = OAuthStartHandler
  SHORT_NAME = 'fake'
  TYPE_LABELS = {'post': 'FakeSource post label'}
  RATE_LIMITED_POLL = datetime.timedelta(hours=30)
  URL_CANONICALIZER = util.UrlCanonicalizer(
    domain=GR_CLASS.DOMAIN,
    headers=util.REQUEST_HEADERS)
  PATH_BLACKLIST = (re.compile('^/blocklisted/.*'),)
  HAS_BLOCKS = True

  string_id_counter = 1
  gr_source = FakeGrSource()
  username = ndb.StringProperty()
  is_saved = False

  def is_beta_user(self):
    return True

  def silo_url(self):
    return 'http://fa.ke/profile/url'

  def feed_url(self):
    return 'fake feed url'

  def search_for_links(self):
    return copy.deepcopy(FakeGrSource.search_results)

  @classmethod
  def new(cls, handler, **props):
    id = None
    if 'url' not in props:
      props['url'] = 'http://fake/url'
    auth_entity = props.get('auth_entity')
    if auth_entity:
      props['auth_entity'] = auth_entity.key
      if auth_entity.user_json:
        user_obj = json_loads(auth_entity.user_json)
        if 'name' not in props:
          props['name'] = user_obj.get('name')
        id = user_obj.get('id')
    if not props.get('name'):
      props['name'] = 'fake'
    if not id:
      id = cls.string_id_counter
      cls.string_id_counter += 1
    return cls(id=str(id), **props)

  def put(self, **kwargs):
    self.is_saved = True
    return super(FakeSource, self).put(**kwargs)

  @classmethod
  def next_key(cls):
    return ndb.Key(cls, str(cls.string_id_counter))

Example #5

Show file

File: github.py Project: EdwardHinkle/bridgy

class GitHub(models.Source):
    """A GitHub user.

  The key name is the GitHub username.
  """
    GR_CLASS = gr_github.GitHub
    SHORT_NAME = 'github'
    TYPE_LABELS = {
        'post': 'issue',
        'like': 'star',
    }

    URL_CANONICALIZER = util.UrlCanonicalizer(domain=GR_CLASS.DOMAIN,
                                              headers=util.REQUEST_HEADERS)

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a :class:`GitHub` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.github.GitHubAuth`
      kwargs: property values
    """
        user = json.loads(auth_entity.user_json)
        gr_source = gr_github.GitHub(access_token=auth_entity.access_token())
        actor = gr_source.user_to_actor(user)

        # temporary!
        util.email_me(subject='New Bridgy GitHub user!',
                      body=json.dumps(auth_entity.user_json, indent=2))

        return GitHub(id=auth_entity.key.id(),
                      auth_entity=auth_entity.key,
                      name=actor.get('displayName'),
                      picture=actor.get('image', {}).get('url'),
                      url=actor.get('url'),
                      **kwargs)

    def silo_url(self):
        """Returns the GitHub account URL, e.g. https://github.com/foo."""
        return self.gr_source.user_url(self.key.id())

    def label_name(self):
        """Returns the username."""
        return self.key.id()

    def get_activities_response(self, *args, **kwargs):
        """Drop kwargs that granary doesn't currently support for github."""
        kwargs.update({
            'fetch_shares': None,
            'fetch_mentions': None,
        })
        return self.gr_source.get_activities_response(*args, **kwargs)

Example #6

Show file

File: testutil.py Project: dougbeal/bridgy

class FakeSource(Source):
    GR_CLASS = FakeGrSource
    SHORT_NAME = 'fake'
    TYPE_LABELS = {'post': 'FakeSource post label'}
    RATE_LIMITED_POLL = datetime.timedelta(hours=30)
    URL_CANONICALIZER = util.UrlCanonicalizer(domain=GR_CLASS.DOMAIN,
                                              headers=util.REQUEST_HEADERS)

    string_id_counter = 1
    gr_source = FakeGrSource()
    username = ndb.StringProperty()
    is_saved = False

    def is_beta_user(self):
        return True

    def silo_url(self):
        return 'http://fa.ke/profile/url'

    def feed_url(self):
        return 'fake feed url'

    def poll_period(self):
        return (self.RATE_LIMITED_POLL if self.rate_limited else super(
            FakeSource, self).poll_period())

    def search_for_links(self):
        return copy.deepcopy(FakeGrSource.search_results)

    @classmethod
    def new(cls, handler, **props):
        id = None
        if 'url' not in props:
            props['url'] = 'http://fake/url'
        auth_entity = props.get('auth_entity')
        if auth_entity:
            props['auth_entity'] = auth_entity.key
            if auth_entity.user_json:
                user_obj = json.loads(auth_entity.user_json)
                if 'name' not in props:
                    props['name'] = user_obj.get('name')
                id = user_obj.get('id')
        if not props.get('name'):
            props['name'] = 'fake'
        if not id:
            id = str(cls.string_id_counter)
            cls.string_id_counter += 1
        return cls(id=id, **props)

    def put(self, **kwargs):
        self.is_saved = True
        return super(FakeSource, self).put(**kwargs)

Example #7

Show file

File: instagram.py Project: v1cker/bridgy

    def finish(self, auth_entity, state=None):
        if auth_entity:
            user_json = json.loads(auth_entity.user_json)

            # find instagram profile URL
            urls = user_json.get('rel-me', [])
            logging.info('rel-mes: %s', urls)
            for url in util.trim_nulls(urls):
                if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
                    username = urlparse.urlparse(url).path.strip('/')
                    break
            else:
                self.messages.add(
                    'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
                    'add an Instagram rel-me link</a>, then try again.')
                return self.redirect('/')

            # check that instagram profile links to web site
            actor = gr_instagram.Instagram(scrape=True).get_actor(
                username, ignore_rate_limit=True)
            if not actor:
                self.messages.add(
                    "Couldn't find Instagram user '%s'. Please check your site's rel-me "
                    "link and your Instagram account." % username)
                return self.redirect('/')

            canonicalize = util.UrlCanonicalizer(redirects=False)
            website = canonicalize(auth_entity.key.id())
            urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
            logging.info('Looking for %s in %s', website, urls)
            if website not in urls:
                self.messages.add(
                    "Please add %s to your Instagram profile's website or "
                    'bio field and try again.' % website)
                return self.redirect('/')

            # check that the instagram account is public
            if not gr_source.Source.is_public(actor):
                self.messages.add('Your Instagram account is private. '
                                  'Bridgy only supports public accounts.')
                return self.redirect('/')

        self.maybe_add_or_delete_source(Instagram,
                                        auth_entity,
                                        state,
                                        actor=actor)

Example #8

Show file

File: meetup.py Project: stedn/bridgy

class Meetup(Source):
    GR_CLASS = gr_meetup.Meetup
    OAUTH_START_HANDLER = oauth_meetup.StartHandler
    SHORT_NAME = 'meetup'
    BACKFEED_REQUIRES_SYNDICATION_LINK = True
    CAN_LISTEN = False
    CAN_PUBLISH = True
    URL_CANONICALIZER = util.UrlCanonicalizer(domain=GR_CLASS.DOMAIN,
                                              headers=util.REQUEST_HEADERS)

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a :class:`Meetup` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.meetup.MeetupAuth`
      kwargs: property values
    """
        user = json_loads(auth_entity.user_json)
        gr_source = gr_meetup.Meetup(access_token=auth_entity.access_token())
        actor = gr_source.user_to_actor(user)
        return Meetup(id=auth_entity.key.id(),
                      auth_entity=auth_entity.key,
                      name=actor.get('displayName'),
                      picture=actor.get('image', {}).get('url'),
                      url=actor.get('url'),
                      **kwargs)

    def silo_url(self):
        """Returns the Meetup account URL, e.g. https://meetup.com/members/...."""
        return self.gr_source.user_url(self.key.id())

    def label_name(self):
        """Returns the username."""
        return self.name

Example #9

Show file

File: mastodon.py Project: jamietanna/bridgy

 def URL_CANONICALIZER(self):
     """Generate URL_CANONICALIZER dynamically to use the instance's domain."""
     return util.UrlCanonicalizer(domain=self.gr_source.DOMAIN,
                                  headers=util.REQUEST_HEADERS)

Example #10

Show file

File: flickr.py Project: sheyril/bridgy

class Flickr(models.Source):
    """A Flickr account.

  The key name is the nsid.
  """
    # Fetching comments and likes is extremely request-intensive, so let's dial
    # back the frequency for now.
    FAST_POLL = datetime.timedelta(minutes=60)
    GR_CLASS = gr_flickr.Flickr
    OAUTH_START_HANDLER = oauth_flickr.StartHandler
    SHORT_NAME = 'flickr'
    TRANSIENT_ERROR_HTTP_CODES = ('400', )
    CAN_PUBLISH = True
    URL_CANONICALIZER = util.UrlCanonicalizer(
        domain=GR_CLASS.DOMAIN,
        approve=r'https://www\.flickr\.com/(photos|people)/[^/?]+/([^/?]+/)?$',
        reject=r'https://login\.yahoo\.com/.*',
        subdomain='www',
        trailing_slash=True,
        headers=util.REQUEST_HEADERS)

    # unique name optionally used in URLs instead of nsid (e.g.,
    # flickr.com/photos/username)
    username = ndb.StringProperty()

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a :class:`Flickr` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.flickr.FlickrAuth`
    """
        person = json_loads(auth_entity.user_json).get('person', {})
        return Flickr(
            id=person.get('nsid'),
            auth_entity=auth_entity.key,
            name=person.get('realname', {}).get('_content'),
            # path_alias, if it exists, is the actual thing that shows up in the url.
            # I think this is an artifact of the conversion to Yahoo.
            username=(person.get('path_alias')
                      or person.get('username', {}).get('_content')),
            picture='https://farm{}.staticflickr.com/{}/buddyicons/{}.jpg'.
            format(person.get('iconfarm'), person.get('iconserver'),
                   person.get('nsid')),
            url=person.get('profileurl', {}).get('_content'),
            **kwargs)

    def silo_url(self):
        """Returns the Flickr account URL, e.g. https://www.flickr.com/people/foo/."""
        return self.url

    def user_tag_id(self):
        """Returns the tag URI for this source, e.g. 'tag:flickr.com:123456'."""
        return self.gr_source.tag_uri(self.username)

    def label_name(self):
        """Human-readable name, username, or id for this source."""
        return self.name or self.username or self.key_id()

    def get_activities_response(self, *args, **kwargs):
        """Discard min_id because we still want new comments/likes on old photos."""
        kwargs.setdefault('group_id', SELF)
        if 'min_id' in kwargs:
            del kwargs['min_id']
        return self.gr_source.get_activities_response(*args, **kwargs)

    def canonicalize_url(self, url, activity=None, **kwargs):
        if not url.endswith('/'):
            url = url + '/'
        if self.username:
            url = url.replace('flickr.com/photos/%s/' % self.username,
                              'flickr.com/photos/%s/' % self.key_id())
            url = url.replace('flickr.com/people/%s/' % self.username,
                              'flickr.com/people/%s/' % self.key_id())
        return super(Flickr, self).canonicalize_url(url, **kwargs)

Example #11

Show file

File: models.py Project: mblaney/bridgy

class Source(StringIdModel):
    """A silo account, e.g. a Facebook or Google+ account.

  Each concrete silo class should subclass this class.
  """
    __metaclass__ = SourceMeta

    # Turn off NDB instance and memcache caching.
    # https://developers.google.com/appengine/docs/python/ndb/cache
    # https://github.com/snarfed/bridgy/issues/558
    # https://github.com/snarfed/bridgy/issues/68
    _use_cache = False

    STATUSES = ('enabled', 'disabled', 'error')  # 'error' is deprecated
    POLL_STATUSES = ('ok', 'error', 'polling')
    FEATURES = ('listen', 'publish', 'webmention')

    # short name for this site type. used in URLs, etc.
    SHORT_NAME = None
    # the corresponding granary class
    GR_CLASS = None

    # how often to poll for responses
    FAST_POLL = datetime.timedelta(minutes=30)
    # how often to poll sources that have never sent a webmention
    SLOW_POLL = datetime.timedelta(days=1)
    # how often to poll sources that are currently rate limited by their silo
    RATE_LIMITED_POLL = SLOW_POLL
    # how long to wait after signup for a successful webmention before dropping to
    # the lower frequency poll
    FAST_POLL_GRACE_PERIOD = datetime.timedelta(days=7)
    # how often refetch author url to look for updated syndication links
    FAST_REFETCH = datetime.timedelta(hours=6)
    # refetch less often (this often) if it's been >2w since the last synd link
    SLOW_REFETCH = datetime.timedelta(days=2)

    # Maps Publish.type (e.g. 'like') to source-specific human readable type label
    # (e.g. 'favorite'). Subclasses should override this.
    TYPE_LABELS = {}

    # subclasses should override this
    URL_CANONICALIZER = util.UrlCanonicalizer(headers=util.REQUEST_HEADERS)

    created = ndb.DateTimeProperty(auto_now_add=True, required=True)
    url = ndb.StringProperty()
    status = ndb.StringProperty(choices=STATUSES, default='enabled')
    poll_status = ndb.StringProperty(choices=POLL_STATUSES, default='ok')
    rate_limited = ndb.BooleanProperty(default=False)
    name = ndb.StringProperty()  # full human-readable name
    picture = ndb.StringProperty()
    domains = ndb.StringProperty(repeated=True)
    domain_urls = ndb.StringProperty(repeated=True)
    features = ndb.StringProperty(repeated=True, choices=FEATURES)
    superfeedr_secret = ndb.StringProperty()
    webmention_endpoint = ndb.StringProperty()

    # points to an oauth-dropins auth entity. The model class should be a subclass
    # of oauth_dropins.BaseAuth.
    # the token should be generated with the offline_access scope so that it
    # doesn't expire. details: http://developers.facebook.com/docs/authentication/
    auth_entity = ndb.KeyProperty()

    #
    # listen-only properties
    #
    last_polled = ndb.DateTimeProperty(default=util.EPOCH)
    last_poll_attempt = ndb.DateTimeProperty(default=util.EPOCH)
    last_webmention_sent = ndb.DateTimeProperty()
    last_public_post = ndb.DateTimeProperty()
    recent_private_posts = ndb.IntegerProperty()

    # the last time we re-fetched the author's url looking for updated
    # syndication links
    last_hfeed_refetch = ndb.DateTimeProperty(default=util.EPOCH)

    # the last time we've seen a rel=syndication link for this Source.
    # we won't spend the time to re-fetch and look for updates if there's
    # never been one
    last_syndication_url = ndb.DateTimeProperty()
    # the last time we saw a syndication link in an h-feed, as opposed to just on
    # permalinks. background: https://github.com/snarfed/bridgy/issues/624
    last_feed_syndication_url = ndb.DateTimeProperty()

    last_activity_id = ndb.StringProperty()
    last_activities_etag = ndb.StringProperty()
    last_activities_cache_json = ndb.TextProperty()
    seen_responses_cache_json = ndb.TextProperty(compressed=True)

    # maps updated property names to values that put_updates() writes back to the
    # datastore transactionally. set this to {} before beginning.
    updates = None

    # gr_source is *not* set to None by default here, since it needs to be unset
    # for __getattr__ to run when it's accessed.

    @classmethod
    def new(cls, handler, **kwargs):
        """Factory method. Creates and returns a new instance for the current user.

    To be implemented by subclasses.
    """
        raise NotImplementedError()

    def __getattr__(self, name):
        """Lazily load the auth entity and instantiate :attr:`self.gr_source`.

    Once :attr:`self.gr_source` is set, this method will *not* be called;
    :attr:`gr_source` will be returned normally.
    """
        if name == 'gr_source' and self.auth_entity:
            auth_entity = self.auth_entity.get()
            token = auth_entity.access_token()
            if not isinstance(token, tuple):
                token = (token, )

            kwargs = {}
            if self.key.kind(
            ) == 'FacebookPage' and auth_entity.type == 'user':
                kwargs = {'user_id': self.key.id()}
            elif self.key.kind() == 'Instagram':
                kwargs = {'scrape': True}
            elif self.key.kind() == 'Twitter':
                kwargs = {'username': self.key.id()}

            self.gr_source = self.GR_CLASS(*token, **kwargs)
            return self.gr_source

        return getattr(super(Source, self), name)

    @classmethod
    def lookup(cls, id):
        """Returns the entity with the given id.

    By default, interprets id as just the key id. Subclasses may extend this to
    support usernames, etc.
    """
        return ndb.Key(cls, id).get()

    def user_tag_id(self):
        """Returns the tag URI for this source, e.g. 'tag:plus.google.com:123456'."""
        return self.gr_source.tag_uri(self.key.id())

    def bridgy_path(self):
        """Returns the Bridgy page URL path for this source."""
        return '/%s/%s' % (self.SHORT_NAME, self.key.string_id())

    def bridgy_url(self, handler):
        """Returns the Bridgy page URL for this source."""
        return handler.request.host_url + self.bridgy_path()

    def silo_url(self, handler):
        """Returns the silo account URL, e.g. https://twitter.com/foo."""
        raise NotImplementedError()

    def label(self):
        """Human-readable label for this source."""
        return '%s (%s)' % (self.label_name(), self.GR_CLASS.NAME)

    def label_name(self):
        """Human-readable name or username for this source, whichever is preferred."""
        return self.name

    @classmethod
    @ndb.transactional
    def put_updates(cls, source):
        """Writes source.updates to the datastore transactionally.

    Returns:
      source: :class:`Source`

    Returns:
      the updated :class:`Source`
    """
        if not source.updates:
            return source

        logging.info(
            'Updating %s %s : %r', source.label(), source.bridgy_path(), {
                k: v
                for k, v in source.updates.items() if not k.endswith('_json')
            })

        updates = source.updates
        source = source.key.get()
        source.updates = updates  # because FacebookPage._pre_put_hook uses it
        for name, val in updates.items():
            setattr(source, name, val)

        if source.status == 'error':  # deprecated
            logging.warning('Resetting status from error to enabled')
            source.status = 'enabled'

        source.put()
        return source

    def poll_period(self):
        """Returns the poll frequency for this source, as a :class:`datetime.timedelta`.

    Defaults to ~15m, depending on silo. If we've never sent a webmention for
    this source, or the last one we sent was over a month ago, we drop them down
    to ~1d after a week long grace period.
    """
        now = datetime.datetime.now()
        if self.rate_limited:
            return self.RATE_LIMITED_POLL
        elif now < self.created + self.FAST_POLL_GRACE_PERIOD:
            return self.FAST_POLL
        elif not self.last_webmention_sent:
            return self.SLOW_POLL
        elif self.last_webmention_sent > now - datetime.timedelta(days=7):
            return self.FAST_POLL
        elif self.last_webmention_sent > now - datetime.timedelta(days=30):
            return self.FAST_POLL * 10
        else:
            return self.SLOW_POLL

    def should_refetch(self):
        """Returns True if we should run OPD refetch on this source now."""
        now = datetime.datetime.now()
        if self.last_hfeed_refetch == REFETCH_HFEED_TRIGGER:
            return True
        elif not self.last_syndication_url:
            return False

        period = (self.FAST_REFETCH if self.last_syndication_url > now -
                  datetime.timedelta(days=14) else self.SLOW_REFETCH)
        return self.last_poll_attempt >= self.last_hfeed_refetch + period

    @classmethod
    def bridgy_webmention_endpoint(cls, domain='brid.gy'):
        """Returns the Bridgy webmention endpoint for this source type."""
        return 'https://%s/webmention/%s' % (domain, cls.SHORT_NAME)

    def has_bridgy_webmention_endpoint(self):
        """Returns True if this source uses Bridgy's webmention endpoint."""
        return self.webmention_endpoint in (self.bridgy_webmention_endpoint(),
                                            self.bridgy_webmention_endpoint(
                                                domain='www.brid.gy'))

    def get_author_urls(self):
        """Determine the author urls for a particular source.

    In debug mode, replace test domains with localhost.

    Return:
      a list of string URLs, possibly empty
    """
        return [
            util.replace_test_domains_with_localhost(u)
            for u in self.domain_urls
        ]

    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    https://github.com/snarfed/bridgy/issues/456
    https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
        return []

    def get_activities_response(self, **kwargs):
        """Returns recent posts and embedded comments for this source.

    May be overridden by subclasses.
    """
        kwargs.setdefault('group_id', gr_source.SELF)
        resp = self.gr_source.get_activities_response(**kwargs)
        for activity in resp['items']:
            self._inject_user_urls(activity)
        return resp

    def get_activities(self, **kwargs):
        return self.get_activities_response(**kwargs)['items']

    def get_comment(self, comment_id, **kwargs):
        """Returns a comment from this source.

    Passes through to granary by default. May be overridden by subclasses.

    Args:
      comment_id: string, site-specific comment id
      kwargs: passed to :meth:`granary.source.Source.get_comment`

    Returns:
      dict, decoded ActivityStreams comment object, or None
    """
        comment = self.gr_source.get_comment(comment_id, **kwargs)
        if comment:
            self._inject_user_urls(comment)
        return comment

    def get_like(self, activity_user_id, activity_id, like_user_id, **kwargs):
        """Returns an ActivityStreams 'like' activity object.

    Passes through to granary by default. May be overridden
    by subclasses.

    Args:
      activity_user_id: string id of the user who posted the original activity
      activity_id: string activity id
      like_user_id: string id of the user who liked the activity
      kwargs: passed to granary.Source.get_comment
    """
        return self.gr_source.get_like(activity_user_id, activity_id,
                                       like_user_id, **kwargs)

    def _inject_user_urls(self, activity):
        """Adds this user's web site URLs to their user mentions (in tags), in place."""
        obj = activity.get('object') or activity
        user_tag_id = self.user_tag_id()
        for tag in obj.get('tags', []):
            if tag.get('id') == user_tag_id:
                tag.setdefault('urls', []).extend([{
                    'value': u
                } for u in self.domain_urls])

    def create_comment(self, post_url, author_name, author_url, content):
        """Creates a new comment in the source silo.

    Must be implemented by subclasses.

    Args:
      post_url: string
      author_name: string
      author_url: string
      content: string

    Returns:
      response dict with at least 'id' field
    """
        raise NotImplementedError()

    def feed_url(self):
        """Returns the RSS or Atom (or similar) feed URL for this source.

    Must be implemented by subclasses. Currently only implemented by
    :mod:`blogger`, :mod:`medium`, :mod:`tumblr`, and :mod:`wordpress_rest`.

    Returns:
      string URL
    """
        raise NotImplementedError()

    def edit_template_url(self):
        """Returns the URL for editing this blog's template HTML.

    Must be implemented by subclasses. Currently only implemented by
    :mod:`blogger`, :mod:`medium`, :mod:`tumblr`, and :mod:`wordpress_rest`.

    Returns:
      string URL
    """
        raise NotImplementedError()

    @classmethod
    def create_new(cls, handler, user_url=None, **kwargs):
        """Creates and saves a new :class:`Source` and adds a poll task for it.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      user_url: a string, optional. if provided, supersedes other urls when
        determining the author_url
      **kwargs: passed to :meth:`new()`
    """
        source = cls.new(handler, **kwargs)
        if source is None:
            return None

        new_features = source.features or ['listen']
        if not source.domain_urls:  # defer to the source if it already set this
            auth_entity = kwargs.get('auth_entity')
            if auth_entity and hasattr(auth_entity, 'user_json'):
                source.domain_urls, source.domains = source._urls_and_domains(
                    auth_entity, user_url)
                logging.debug('URLs/domains: %s %s', source.domain_urls,
                              source.domains)
                if ('publish' in new_features
                        and (not source.domain_urls or not source.domains)):
                    handler.messages = {
                        'No valid web sites found in your %s profile. '
                        'Please update it and try again!' % cls.GR_CLASS.NAME
                    }
                    return None

        # check if this source already exists
        existing = source.key.get()
        if existing:
            # merge some fields
            source.features = set(source.features + existing.features)
            source.populate(**existing.to_dict(
                include=('created', 'last_hfeed_refetch', 'last_poll_attempt',
                         'last_polled', 'last_syndication_url',
                         'last_webmention_sent', 'superfeedr_secret')))
            verb = 'Updated'
        else:
            verb = 'Added'

        author_urls = source.get_author_urls()
        link = ('http://indiewebify.me/send-webmentions/?url=' + author_urls[0]
                if author_urls else 'http://indiewebify.me/#send-webmentions')
        blurb = '%s %s. %s' % (
            verb, source.label(), {
                'listen': "Refresh in a minute to see what we've found!",
                'publish': 'Try previewing a post from your web site!',
                'webmention': '<a href="%s">Try a webmention!</a>' % link,
            }.get(new_features[0], ''))
        logging.info('%s %s', blurb, source.bridgy_url(handler))
        # uncomment to send email notification for each new user
        # if not existing:
        #   util.email_me(subject=blurb, body=source.bridgy_url(handler))

        source.verify()
        if source.verified():
            handler.messages = {blurb}

        # TODO: ugh, *all* of this should be transactional
        source.put()

        if 'webmention' in source.features:
            superfeedr.subscribe(source, handler)

        if 'listen' in source.features:
            util.add_poll_task(source, now=True)
            util.add_poll_task(source,
                               countdown=source.poll_period().total_seconds())

        return source

    def verified(self):
        """Returns True if this source is ready to be used, false otherwise.

    See :meth:`verify()` for details. May be overridden by subclasses, e.g.
    :class:`tumblr.Tumblr`.
    """
        if not self.domains or not self.domain_urls:
            return False
        if 'webmention' in self.features and not self.webmention_endpoint:
            return False
        if ('listen' in self.features and
                not (self.webmention_endpoint or self.last_webmention_sent)):
            return False
        return True

    def verify(self, force=False):
        """Checks that this source is ready to be used.

    For blog and listen sources, this fetches their front page HTML and
    discovers their webmention endpoint. For publish sources, this checks that
    they have a domain.

    May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

    Args:
      force: if True, fully verifies (e.g. re-fetches the blog's HTML and
        performs webmention discovery) even we already think this source is
        verified.
    """
        author_urls = self.get_author_urls()
        if ((self.verified() and not force) or self.status == 'disabled'
                or not self.features or not author_urls):
            return

        author_url = author_urls[0]
        logging.info('Attempting to discover webmention endpoint on %s',
                     author_url)
        mention = send.WebmentionSend('https://brid.gy/', author_url)
        mention.requests_kwargs = {
            'timeout': HTTP_TIMEOUT,
            'headers': util.REQUEST_HEADERS
        }
        try:
            mention._discoverEndpoint()
        except BaseException:
            logging.info('Error discovering webmention endpoint',
                         exc_info=True)
            mention.error = {'code': 'EXCEPTION'}

        self._fetched_html = getattr(mention, 'html', None)
        error = getattr(mention, 'error', None)
        endpoint = getattr(mention, 'receiver_endpoint', None)
        if error or not endpoint:
            logging.info("No webmention endpoint found: %s %r", error,
                         endpoint)
            self.webmention_endpoint = None
        else:
            logging.info("Discovered webmention endpoint %s", endpoint)
            self.webmention_endpoint = endpoint

        self.put()

    def _urls_and_domains(self, auth_entity, user_url):
        """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
        actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
        logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

        candidates = util.trim_nulls(
            util.uniquify([user_url] + microformats2.object_urls(actor)))

        if len(candidates) > MAX_AUTHOR_URLS:
            logging.info(
                'Too many profile links! Only resolving the first %s: %s',
                MAX_AUTHOR_URLS, candidates)

        urls = []
        for i, url in enumerate(candidates):
            final, domain, ok = util.get_webmention_target(
                url, resolve=i < MAX_AUTHOR_URLS)
            if ok:
                final = final.lower()
                if util.schemeless(final).startswith(
                        util.schemeless(url.lower())):
                    # redirected to a deeper path. use the original higher level URL. #652
                    final = url
                # If final has a path segment check if root has a matching rel=me.
                match = re.match(r'^(https?://[^/]+)/.+', final)
                if match and i < MAX_AUTHOR_URLS:
                    root = match.group(1)
                    resp = util.requests_get(root)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, root)
                    me_urls = data.get('rels', {}).get('me', [])
                    if final in me_urls:
                        final = root
                urls.append(final)

        urls = util.dedupe_urls(urls)  # normalizes domains to lower case
        domains = [util.domain_from_link(url) for url in urls]
        return urls, domains

    def canonicalize_url(self, url, activity=None, **kwargs):
        """Canonicalizes a post or object URL.

    Wraps :class:`oauth_dropins.webutil.util.UrlCanonicalizer`.
    """
        return self.URL_CANONICALIZER(
            url, **kwargs) if self.URL_CANONICALIZER else url

    def infer_profile_url(self, url):
        """Given an arbitrary URL representing a person, try to find their
    profile URL for *this* service.

    Queries Bridgy's registered accounts for users with a particular
    domain in their silo profile.

    Args:
      url: string, a person's URL

    Return:
      a string URL for their profile on this service (or None)
    """
        domain = util.domain_from_link(url)
        if domain == self.gr_source.DOMAIN:
            return url
        user = self.__class__.query(self.__class__.domains == domain).get()
        if user:
            return self.gr_source.user_url(user.key.id())

    def preprocess_for_publish(self, obj):
        """Preprocess an object before trying to publish it.

    By default this tries to massage person tags so that the tag's
    "url" points to the person's profile on this service (as opposed
    to a person's homepage).

    The object is modified in place.

    Args:
      obj: ActivityStreams activity or object dict
    """
        for tag in obj.get('tags', []):
            if tag.get('objectType') == 'person':
                silo_url = None
                for url in microformats2.object_urls(tag):
                    silo_url = url and self.infer_profile_url(url)
                    if silo_url:
                        break
                if silo_url:
                    tag['url'] = silo_url

        # recurse on contained object(s)
        for obj in util.get_list(obj, 'object'):
            self.preprocess_for_publish(obj)

    def on_new_syndicated_post(self, syndpost):
        """Called when a new :class:`SyndicatedPost` is stored for this source.

    Args:
      syndpost: :class:`SyndicatedPost`
    """
        pass

    def is_private(self):
        """Returns True if this source is private aka protected.

    ...ie their posts are not public.
    """
        return False

    def is_activity_public(self, activity):
        """Returns True if the given activity is public, False otherwise.

    Just wraps :meth:`granary.source.Source.is_public`. Subclasses may override.
    """
        return gr_source.Source.is_public(activity)

    def is_beta_user(self):
        """Returns True if this is a "beta" user opted into new features.

    Beta users come from beta_users.txt.
    """
        return self.bridgy_path() in util.BETA_USER_PATHS

    def is_blocked(self, obj):
        """Returns True if an object's author is being blocked.

    ...ie they're in this user's block list.
    """
        return False

Example #12

Show file

File: instagram.py Project: v1cker/bridgy

class Instagram(Source):
    """An Instagram account.

  The key name is the username. Instagram usernames may have ASCII letters (case
  insensitive), numbers, periods, and underscores:
  https://stackoverflow.com/questions/15470180
  """

    GR_CLASS = gr_instagram.Instagram
    SHORT_NAME = 'instagram'
    FAST_POLL = datetime.timedelta(minutes=120)
    RATE_LIMITED_POLL = Source.SLOW_POLL
    RATE_HTTP_LIMIT_CODES = Source.RATE_LIMIT_HTTP_CODES + ('503', )

    URL_CANONICALIZER = util.UrlCanonicalizer(
        domain=GR_CLASS.DOMAIN,
        subdomain='www',
        approve=r'https://www.instagram.com/p/[^/?]+/$',
        trailing_slash=True,
        headers=util.REQUEST_HEADERS)
    # no reject regexp; non-private Instagram post URLs just 404

    @staticmethod
    def new(handler, auth_entity=None, actor=None, **kwargs):
        """Creates and returns an :class:`Instagram` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.instagram.InstagramAuth`
    """
        user = json.loads(auth_entity.user_json)
        user['actor'] = actor
        auth_entity.user_json = json.dumps(user)
        auth_entity.put()

        username = actor['username']
        if not kwargs.get('features'):
            kwargs['features'] = ['listen']
        urls = microformats2.object_urls(actor)
        return Instagram(id=username,
                         auth_entity=auth_entity.key,
                         name=actor.get('displayName'),
                         picture=actor.get('image', {}).get('url'),
                         url=gr_instagram.Instagram.user_url(username),
                         domain_urls=urls,
                         domains=[util.domain_from_link(url) for url in urls],
                         **kwargs)

    def silo_url(self):
        """Returns the Instagram account URL, e.g. https://instagram.com/foo."""
        return self.url

    def user_tag_id(self):
        """Returns the tag URI for this source, e.g. 'tag:instagram.com:123456'."""
        user = json.loads(self.auth_entity.get().user_json)
        return (user.get('actor', {}).get('id')
                or self.gr_source.tag_uri(user.get('id') or self.key.id()))

    def label_name(self):
        """Returns the username."""
        return self.key.id()

    def get_activities_response(self, *args, **kwargs):
        """Set user_id because scraping requires it."""
        kwargs.setdefault('group_id', gr_source.SELF)
        kwargs.setdefault('user_id', self.key.id())
        return self.gr_source.get_activities_response(*args, **kwargs)

Example #13

Show file

File: facebook.py Project: v1cker/bridgy

class FacebookPage(models.Source):
  """A Facebook profile or page.

  The key name is the Facebook id.
  """

  GR_CLASS = gr_facebook.Facebook
  SHORT_NAME = 'facebook'

  URL_CANONICALIZER = util.UrlCanonicalizer(
    domain=GR_CLASS.DOMAIN,
    subdomain='www',
    query=True,
    approve=r'https://www\.facebook\.com/[^/?]+/posts/[^/?]+$',
    headers=util.REQUEST_HEADERS)
    # no reject regexp; non-private FB post URLs just 404

  # unique name used in FB URLs, e.g. facebook.com/[username]
  username = ndb.StringProperty()
  # inferred from syndication URLs if username isn't available
  inferred_username = ndb.StringProperty()
  # inferred application-specific user IDs (from other applications)
  inferred_user_ids = ndb.StringProperty(repeated=True)

  # maps string FB post id to string FB object id or None. background:
  # https://github.com/snarfed/bridgy/pull/513#issuecomment-149312879
  resolved_object_ids_json = ndb.TextProperty(compressed=True)
  # maps string FB post id to True or False for whether the post is public
  # or private. only contains posts with *known* privacy. background:
  # https://github.com/snarfed/bridgy/issues/633#issuecomment-198806909
  post_publics_json = ndb.TextProperty(compressed=True)

  @staticmethod
  def new(handler, auth_entity=None, **kwargs):
    """Creates and returns a :class:`FacebookPage` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.facebook.FacebookAuth`
      kwargs: property values
    """
    user = json.loads(auth_entity.user_json)
    gr_source = gr_facebook.Facebook(access_token=auth_entity.access_token())
    actor = gr_source.user_to_actor(user)
    return FacebookPage(id=user['id'],
                        auth_entity=auth_entity.key,
                        name=actor.get('displayName'),
                        username=actor.get('username'),
                        picture=actor.get('image', {}).get('url'),
                        url=actor.get('url'),
                        **kwargs)

  @classmethod
  def lookup(cls, id):
    """Returns the entity with the given id or username."""
    return ndb.Key(cls, id).get() or cls.query(cls.username == id).get()

  def silo_url(self):
    """Returns the Facebook account URL, e.g. https://facebook.com/foo.

    Facebook profile URLS with app-scoped user ids (eg www.facebook.com/ID) no
    longer work as of April 2018, so if that's all we have, return None instead.
    https://developers.facebook.com/blog/post/2018/04/19/facebook-login-changes-address-abuse/
    """
    if self.username or self.inferred_username:
      return self.gr_source.user_url(self.username or self.inferred_username)

    for id in [self.key.id()] + self.inferred_user_ids:
      if util.is_int(id) and int(id) < MIN_APP_SCOPED_ID:
        return self.gr_source.user_url(id)

  def get_activities_response(self, **kwargs):
    type = self.auth_entity.get().type
    kwargs.setdefault('fetch_events', True)
    kwargs.setdefault('fetch_news', type == 'user')
    kwargs.setdefault('event_owner_id', self.key.id())

    try:
      activities = super(FacebookPage, self).get_activities_response(**kwargs)
    except urllib2.HTTPError as e:
      code, body = util.interpret_http_exception(e)
      # use a function so any new exceptions (JSON decoding, missing keys) don't
      # clobber the original exception so we can re-raise it below.
      def dead_token():
        try:
          err = json.loads(body)['error']
          return (err.get('code') in DEAD_TOKEN_ERROR_CODES or
                  err.get('error_subcode') in DEAD_TOKEN_ERROR_SUBCODES or
                  err.get('message') in DEAD_TOKEN_ERROR_MESSAGES)
        except:
          logging.warning("Couldn't determine whether token is still valid", exc_info=True)
          return False

      if code == '401':
        if not dead_token() and type == 'user':
          # ask the user to reauthenticate. if this API call fails, it will raise
          # urllib2.HTTPError instead of DisableSource, so that we don't disable
          # the source without notifying.
          #
          # TODO: for pages, fetch the owners/admins and notify them.
          self.gr_source.create_notification(
            self.key.id(),
            "Bridgy's access to your account has expired. Click here to renew it now!",
            'https://brid.gy/facebook/start')
        raise models.DisableSource()

      raise

    # update the resolved_object_ids and post_publics caches
    def parsed_post_id(id):
      parsed = gr_facebook.Facebook.parse_id(id)
      return parsed.post if parsed.post else id

    resolved = self._load_cache('resolved_object_ids')
    for activity in activities['items']:
      obj = activity.get('object', {})
      obj_id = parsed_post_id(obj.get('fb_id'))
      ids = obj.get('fb_object_for_ids')
      if obj_id and ids:
        resolved[obj_id] = obj_id
        for id in ids:
          resolved[parsed_post_id(id)] = obj_id

    for activity in activities['items']:
      self.is_activity_public(activity)

    return activities

  def canonicalize_url(self, url, activity=None, **kwargs):
    """Facebook-specific standardization of syndicated urls.

    Canonical form is https://www.facebook.com/USERID/posts/POSTID

    Args:
      url: a string, the url of the syndicated content
      activity: the activity this URL came from. If it has an fb_object_id,
        we'll use that instead of fetching the post from Facebook
      kwargs: unused

    Return:
      a string, the canonical form of the syndication url
    """
    if util.domain_from_link(url) != self.gr_source.DOMAIN:
      return None

    def post_url(id):
      return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

    parsed = urlparse.urlparse(url)
    params = urlparse.parse_qs(parsed.query)
    url_id = self.gr_source.post_id(url)

    ids = params.get('story_fbid') or params.get('fbid')
    if ids:
      url = post_url(ids[0])
    elif url_id:
      if parsed.path.startswith('/notes/'):
        url = post_url(url_id)
      else:
        object_id = self.cached_resolve_object_id(url_id, activity=activity)
        if object_id:
          url = post_url(object_id)

    for alternate_id in util.trim_nulls(itertools.chain(
       (self.username or self.inferred_username,), self.inferred_user_ids)):
      url = url.replace('facebook.com/%s/' % alternate_id,
                        'facebook.com/%s/' % self.key.id())

    return super(FacebookPage, self).canonicalize_url(url)

  def cached_resolve_object_id(self, post_id, activity=None):
    """Resolve a post id to its Facebook object id, if any.

    Wraps :meth:`granary.facebook.Facebook.resolve_object_id()` and uses
    self.resolved_object_ids_json as a cache.

    Args:
      post_id: string Facebook post id
      activity: optional AS activity representation of Facebook post

    Returns:
      string Facebook object id or None
    """
    parsed = gr_facebook.Facebook.parse_id(post_id)
    if parsed.post:
      post_id = parsed.post

    resolved = self._load_cache('resolved_object_ids')
    if post_id not in resolved:
      resolved[post_id] = self.gr_source.resolve_object_id(
        self.key.id(), post_id, activity=activity)

    return resolved[post_id]

  def is_activity_public(self, activity):
    """Returns True if the given activity is public, False otherwise.

    Uses the :attr:`post_publics_json` cache if we can't tell otherwise.
    """
    obj = activity.get('object', {})
    fb_id = activity.get('fb_id') or obj.get('fb_id')
    if fb_id and gr_source.object_type(activity) not in ('comment', 'like', 'share'):
      fb_id = self.cached_resolve_object_id(fb_id, activity=activity)

    post_publics = self._load_cache('post_publics')
    public = gr_source.Source.is_public(activity)

    if not fb_id:
      return public
    elif public is not None:
      post_publics[fb_id] = public    # write cache
      return public
    else:
      return post_publics.get(fb_id)  # read cache

  def _load_cache(self, name):
    """Loads resolved_object_ids_json or post_publics_json into self.updates."""
    assert name in ('resolved_object_ids', 'post_publics')
    field = getattr(self, name + '_json')

    if self.updates is None:
      self.updates = {}
    loaded = self.updates.setdefault(name, {})

    if not loaded and field:
      loaded = self.updates[name] = json.loads(field)
    return loaded

  def _save_cache(self, name):
    """Writes resolved_object_ids or post_publics from self.updates to _json."""
    if self.updates is None:
      return

    assert name in ('resolved_object_ids', 'post_publics')
    max = globals()['MAX_' + name.upper()]
    val = self.updates.get(name)
    if val:
      keep = heapq.nlargest(max,
        (int(id) if util.is_int(id) else str(id) for id in val.keys()))
      setattr(self, name + '_json',
              json.dumps({str(id): val[str(id)] for id in keep}))

  def _pre_put_hook(self):
    """Encode the resolved_object_ids and post_publics fields from updates.

    ...and cap them at MAX_RESOLVED_OBJECT_IDS and MAX_POST_PUBLICS. Tries to
    keep the latest ones by assuming that ids are roughly monotonically
    increasing.
    """
    self._save_cache('resolved_object_ids')
    self._save_cache('post_publics')

  def infer_profile_url(self, url):
    """Find a Facebook profile URL (ideally the one with the user's numeric ID)

    Looks up existing sources by username, inferred username, and domain.

    Args:
      url: string, a person's URL

    Return:
      a string URL for their Facebook profile (or None)
    """
    domain = util.domain_from_link(url)
    if domain == self.gr_source.DOMAIN:
      username = urlparse.urlparse(url).path.strip('/')
      if '/' not in username:
        user = FacebookPage.query(ndb.OR(
          FacebookPage.username == username,
          FacebookPage.inferred_username == username)).get()
        if user:
          return self.gr_source.user_url(user.key.id())
    return super(FacebookPage, self).infer_profile_url(url)

  @ndb.transactional
  def on_new_syndicated_post(self, syndpost):
    """If this source has no username, try to infer one from a syndication URL.

    Args:
      syndpost: :class:`models.SyndicatedPost`
    """
    url = syndpost.syndication
    if self.username or not url:
      return

    # FB usernames only have letters, numbers, and periods:
    # https://www.facebook.com/help/105399436216001
    author_id = self.gr_source.base_object({'object': {'url': url}})\
                              .get('author', {}).get('id')
    if author_id:
      if author_id != self.inferred_username and not util.is_int(author_id):
        logging.info('Inferring username %s from syndication url %s', author_id, url)
        self.inferred_username = author_id
        self.put()
        syndpost.syndication = self.canonicalize_url(syndpost.syndication)
      elif author_id != self.key.id() and author_id not in self.inferred_user_ids:
        logging.info('Inferring app-scoped user id %s from syndication url %s', author_id, url)
        self.inferred_user_ids = util.uniquify(self.inferred_user_ids + [author_id])
        self.put()
        syndpost.syndication = self.canonicalize_url(syndpost.syndication)

Example #14

Show file

File: github.py Project: snarfed/bridgy

class GitHub(Source):
    """A GitHub user.

  The key name is the GitHub username.
  """
    GR_CLASS = gr_github.GitHub
    OAUTH_START = oauth_github.Start
    SHORT_NAME = 'github'
    TYPE_LABELS = {
        'post': 'issue',
        'like': 'star',
    }
    BACKFEED_REQUIRES_SYNDICATION_LINK = True
    DISABLE_HTTP_CODES = Source.DISABLE_HTTP_CODES + ('403', )
    CAN_PUBLISH = True
    URL_CANONICALIZER = util.UrlCanonicalizer(domain=GR_CLASS.DOMAIN,
                                              fragment=True)
    # This makes us backfeed issue/PR comments to previous comments on the same
    # issue/PR.
    IGNORE_SYNDICATION_LINK_FRAGMENTS = True
    USERNAME_KEY_ID = True

    @staticmethod
    def new(auth_entity=None, **kwargs):
        """Creates and returns a :class:`GitHub` for the logged in user.

    Args:
      auth_entity: :class:`oauth_dropins.github.GitHubAuth`
      kwargs: property values
    """
        assert 'username' not in kwargs
        assert 'id' not in kwargs
        user = json_loads(auth_entity.user_json)
        gr_source = gr_github.GitHub(access_token=auth_entity.access_token())
        actor = gr_source.user_to_actor(user)
        return GitHub(username=auth_entity.key_id(),
                      auth_entity=auth_entity.key,
                      name=actor.get('displayName'),
                      picture=actor.get('image', {}).get('url'),
                      url=actor.get('url'),
                      **kwargs)

    def silo_url(self):
        """Returns the GitHub account URL, e.g. https://github.com/foo."""
        return self.gr_source.user_url(self.username)

    def label_name(self):
        """Returns the username."""
        return self.username

    def user_tag_id(self):
        """Returns this user's tag URI, eg 'tag:github.com:2013,MDQ6VXNlcjc3OD='."""
        id = json_loads(self.auth_entity.get().user_json)['id']
        return self.gr_source.tag_uri(id)

    def get_activities_response(self, *args, **kwargs):
        """Override/drop a few kwargs."""
        kwargs.update({
            'fetch_shares': None,
            'fetch_mentions': None,
            'count': min(10, kwargs.get('count', 0)),
        })
        return self.gr_source.get_activities_response(*args, **kwargs)

Example #15

Show file

class GooglePlusPage(models.Source):
    """A Google+ profile or page.

  The key name is the user id.
  """

    GR_CLASS = gr_googleplus.GooglePlus
    SHORT_NAME = 'googleplus'

    URL_CANONICALIZER = util.UrlCanonicalizer(
        domain=GR_CLASS.DOMAIN,
        approve=r'https://plus\.google\.com/[^/?]+/posts/[^/?]+$',
        headers=util.REQUEST_HEADERS)
    # no reject regexp; non-private G+ post URLs just 404

    # We're currently close to the G+ API's daily limit of 10k requests per day.
    # So low! :/ Usage history:
    # QPS: https://cloud.google.com/console/project/1029605954231
    # Today's quota usage: https://code.google.com/apis/console/b/0/?noredirect#project:1029605954231:quotas
    # Daily total usage: https://code.google.com/apis/console/b/0/?pli=1#project:1029605954231:stats

    # API quotas are refilled daily. Use 30h to make sure we're over a day even
    # after the randomized task ETA.
    RATE_LIMITED_POLL = datetime.timedelta(hours=30)

    type = ndb.StringProperty(choices=('user', 'page'))

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a :class:`GooglePlusPage` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.googleplus.GooglePlusAuth`
    """
        # Google+ Person resource
        # https://developers.google.com/+/api/latest/people#resource
        user = json.loads(auth_entity.user_json)
        type = 'user' if user.get('objectType',
                                  'person') == 'person' else 'page'

        # override the sz param to ask for a 128x128 image. if there's an existing
        # sz query param (there usually is), the new one will come afterward and
        # override it.
        picture = user.get('image', {}).get('url')
        picture = util.add_query_params(picture, {'sz': '128'})

        return GooglePlusPage(id=user['id'],
                              auth_entity=auth_entity.key,
                              url=user.get('url'),
                              name=user.get('displayName'),
                              picture=picture,
                              type=type,
                              **kwargs)

    def silo_url(self):
        """Returns the Google+ account URL, e.g. https://plus.google.com/+Foo."""
        return self.url

    def __getattr__(self, name):
        """Overridden to pass auth_entity to :class:`granary.googleplus.GooglePlus`."""
        if name == 'gr_source' and self.auth_entity:
            self.gr_source = gr_googleplus.GooglePlus(
                auth_entity=self.auth_entity.get())
            return self.gr_source

        return getattr(super(GooglePlusPage, self), name)

    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    Only searches for root domain web site URLs! Skips URLs with paths; they
    tend to generate false positive results in G+'s search. Not sure why yet.

    G+ search supports OR:
    https://developers.google.com/+/api/latest/activities/search

    Returns:
      sequence of ActivityStreams activity dicts
    """
        urls = [
            '"%s"' % util.fragmentless(url) for url in self.domain_urls
            if not util.in_webmention_blacklist(util.domain_from_link(url))
            and urlparse.urlparse(url).path in ('', '/')
        ][:models.MAX_AUTHOR_URLS]

        if urls:
            return self.get_activities(search_query=' OR '.join(urls),
                                       group_id=gr_source.SEARCH,
                                       etag=self.last_activities_etag,
                                       fetch_replies=False,
                                       fetch_likes=False,
                                       fetch_shares=False,
                                       count=50)

        return []

Example #16

Show file

class Instagram(models.Source):
  """An Instagram account.

  The key name is the username.
  """

  GR_CLASS = gr_instagram.Instagram
  SHORT_NAME = 'instagram'
  FAST_POLL = datetime.timedelta(minutes=60)

  URL_CANONICALIZER = util.UrlCanonicalizer(
    domain=GR_CLASS.DOMAIN,
    subdomain='www',
    approve=r'https://www.instagram.com/p/[^/?]+/',
    trailing_slash=True,
    headers=util.USER_AGENT_HEADER)
    # no reject regexp; non-private Instagram post URLs just 404

  @staticmethod
  def new(handler, auth_entity=None, actor=None, **kwargs):
    """Creates and returns a InstagramPage for the logged in user.

    Args:
      handler: the current RequestHandler
      auth_entity: oauth_dropins.instagram.InstagramAuth
    """
    user = json.loads(auth_entity.user_json)
    user['actor'] = actor
    auth_entity.user_json = json.dumps(user)
    auth_entity.put()

    username = actor['username']
    if not kwargs.get('features'):
      kwargs['features'] = ['listen']
    urls = microformats2.object_urls(actor)
    return Instagram(id=username,
                     auth_entity=auth_entity.key,
                     name=actor.get('displayName'),
                     picture=actor.get('image', {}).get('url'),
                     url=gr_instagram.Instagram.user_url(username),
                     domain_urls=urls,
                     domains=[util.domain_from_link(url) for url in urls],
                     **kwargs)

  def silo_url(self):
    """Returns the Instagram account URL, e.g. https://instagram.com/foo."""
    return self.url

  def user_tag_id(self):
    """Returns the tag URI for this source, e.g. 'tag:instagram.com:123456'."""
    user = json.loads(self.auth_entity.get().user_json)
    return self.gr_source.tag_uri(user.get('id') or self.key.id())

  def label_name(self):
    """Returns the username."""
    return self.key.id()

  def get_activities_response(self, *args, **kwargs):
    """Set user_id because scraping requires it."""
    kwargs.setdefault('group_id', gr_source.SELF)
    kwargs.setdefault('user_id', self.key.id())
    return self.gr_source.get_activities_response(*args, **kwargs)

Example #17

Show file

class Source(StringIdModel, metaclass=SourceMeta):
  """A silo account, e.g. a Facebook or Google+ account.

  Each concrete silo class should subclass this class.
  """

  # Turn off NDB instance and memcache caching.
  # https://developers.google.com/appengine/docs/python/ndb/cache
  # https://github.com/snarfed/bridgy/issues/558
  # https://github.com/snarfed/bridgy/issues/68
  _use_cache = False

  STATUSES = ('enabled', 'disabled')
  POLL_STATUSES = ('ok', 'error', 'polling')
  FEATURES = ('listen', 'publish', 'webmention', 'email')

  # short name for this site type. used in URLs, etc.
  SHORT_NAME = None
  # the corresponding granary class
  GR_CLASS = None
  # oauth-dropins Start class
  OAUTH_START = None
  # whether Bridgy supports listen for this silo - this is unlikely, so we default to True
  CAN_LISTEN = True
  # whether Bridgy supports publish for this silo
  CAN_PUBLISH = None
  # whether this source should poll automatically, or only when triggered
  # (eg Instagram)
  AUTO_POLL = True
  # how often to poll for responses
  FAST_POLL = timedelta(minutes=30)
  # how often to poll sources that have never sent a webmention
  SLOW_POLL = timedelta(days=1)
  # how often to poll sources that are currently rate limited by their silo
  RATE_LIMITED_POLL = SLOW_POLL
  # how long to wait after signup for a successful webmention before dropping to
  # the lower frequency poll
  FAST_POLL_GRACE_PERIOD = timedelta(days=7)
  # how often refetch author url to look for updated syndication links
  FAST_REFETCH = timedelta(hours=6)
  # refetch less often (this often) if it's been >2w since the last synd link
  SLOW_REFETCH = timedelta(days=2)
  # rate limiting HTTP status codes returned by this silo. e.g. twitter returns
  # 429, instagram 503, google+ 403.
  RATE_LIMIT_HTTP_CODES = ('429',)
  DISABLE_HTTP_CODES = ('401',)
  TRANSIENT_ERROR_HTTP_CODES = ()
  # whether granary supports fetching block lists
  HAS_BLOCKS = False
  # whether to require a u-syndication link for backfeed
  BACKFEED_REQUIRES_SYNDICATION_LINK = False
  # ignore fragments when comparing syndication links in OPD
  IGNORE_SYNDICATION_LINK_FRAGMENTS = False
  # convert username to all lower case to use as key name
  USERNAME_KEY_ID = False

  # Maps Publish.type (e.g. 'like') to source-specific human readable type label
  # (e.g. 'favorite'). Subclasses should override this.
  TYPE_LABELS = {}

  # subclasses should override this
  URL_CANONICALIZER = util.UrlCanonicalizer()

  # Regexps for URL paths that don't accept incoming webmentions. Currently used
  # by Blogger.
  PATH_BLOCKLIST = ()

  created = ndb.DateTimeProperty(auto_now_add=True, required=True, tzinfo=timezone.utc)
  url = ndb.StringProperty()
  username = ndb.StringProperty()
  status = ndb.StringProperty(choices=STATUSES, default='enabled')
  poll_status = ndb.StringProperty(choices=POLL_STATUSES, default='ok')
  rate_limited = ndb.BooleanProperty(default=False)
  name = ndb.StringProperty()  # full human-readable name
  picture = ndb.StringProperty()
  domains = ndb.StringProperty(repeated=True)
  domain_urls = ndb.StringProperty(repeated=True)
  features = ndb.StringProperty(repeated=True, choices=FEATURES)
  superfeedr_secret = ndb.StringProperty()
  webmention_endpoint = ndb.StringProperty()

  # points to an oauth-dropins auth entity. The model class should be a subclass
  # of oauth_dropins.BaseAuth. the token should be generated with the
  # offline_access scope so that it doesn't expire.
  auth_entity = ndb.KeyProperty()

  #
  # listen-only properties
  #
  last_polled = ndb.DateTimeProperty(default=util.EPOCH, tzinfo=timezone.utc)
  last_poll_attempt = ndb.DateTimeProperty(default=util.EPOCH, tzinfo=timezone.utc)
  last_webmention_sent = ndb.DateTimeProperty(tzinfo=timezone.utc)
  last_public_post = ndb.DateTimeProperty(tzinfo=timezone.utc)
  recent_private_posts = ndb.IntegerProperty(default=0)

  # the last time we re-fetched the author's url looking for updated
  # syndication links
  last_hfeed_refetch = ndb.DateTimeProperty(default=util.EPOCH, tzinfo=timezone.utc)

  # the last time we've seen a rel=syndication link for this Source.
  # we won't spend the time to re-fetch and look for updates if there's
  # never been one
  last_syndication_url = ndb.DateTimeProperty(tzinfo=timezone.utc)
  # the last time we saw a syndication link in an h-feed, as opposed to just on
  # permalinks. background: https://github.com/snarfed/bridgy/issues/624
  last_feed_syndication_url = ndb.DateTimeProperty(tzinfo=timezone.utc)

  last_activity_id = ndb.StringProperty()
  last_activities_etag = ndb.StringProperty()
  last_activities_cache_json = ndb.TextProperty()
  seen_responses_cache_json = ndb.TextProperty(compressed=True)

  # populated in Poll.poll(), used by handlers
  blocked_ids = ndb.JsonProperty(compressed=True)

  # maps updated property names to values that put_updates() writes back to the
  # datastore transactionally. set this to {} before beginning.
  updates = None

  # gr_source is *not* set to None by default here, since it needs to be unset
  # for __getattr__ to run when it's accessed.

  def __init__(self, *args, id=None, **kwargs):
    """Constructor. Escapes the key string id if it starts with `__`."""
    username = kwargs.get('username')
    if self.USERNAME_KEY_ID and username and not id:
      id = username.lower()
    if id and id.startswith('__'):
      id = '\\' + id
    super().__init__(*args, id=id, **kwargs)

  def key_id(self):
    """Returns the key's unescaped string id."""
    id = self.key.id()
    return id[1:] if id[0] == '\\' else id

  @classmethod
  def new(cls, **kwargs):
    """Factory method. Creates and returns a new instance for the current user.

    To be implemented by subclasses.
    """
    raise NotImplementedError()

  def __getattr__(self, name):
    """Lazily load the auth entity and instantiate :attr:`self.gr_source`.

    Once :attr:`self.gr_source` is set, this method will *not* be called;
    :attr:`gr_source` will be returned normally.
    """
    if name != 'gr_source':
      return getattr(super(), name)

    super_attr = getattr(super(), name, None)
    if super_attr:
      return super_attr
    elif not self.auth_entity:
      return None

    auth_entity = self.auth_entity.get()
    try:
      refresh_token = auth_entity.refresh_token
      self.gr_source = self.GR_CLASS(refresh_token)
      return self.gr_source
    except AttributeError:
      logger.info('no refresh_token')
    args = auth_entity.access_token()
    if not isinstance(args, tuple):
      args = (args,)

    kwargs = {}
    if self.key.kind() == 'FacebookPage' and auth_entity.type == 'user':
      kwargs = {'user_id': self.key_id()}
    elif self.key.kind() == 'Instagram':
      kwargs = {'scrape': True, 'cookie': INSTAGRAM_SESSIONID_COOKIE}
    elif self.key.kind() == 'Mastodon':
      args = (auth_entity.instance(),) + args
      inst = auth_entity.app.get().instance_info
      kwargs = {
        'user_id': json_loads(auth_entity.user_json).get('id'),
        # https://docs-develop.pleroma.social/backend/API/differences_in_mastoapi_responses/#instance
        'truncate_text_length':
          json_loads(inst).get('max_toot_chars') if inst else None,
      }
    elif self.key.kind() == 'Twitter':
      kwargs = {'username': self.key_id(), 'scrape_headers': TWITTER_SCRAPE_HEADERS}

    self.gr_source = self.GR_CLASS(*args, **kwargs)
    return self.gr_source

  @classmethod
  def lookup(cls, id):
    """Returns the entity with the given id.

    By default, interprets id as just the key id. Subclasses may extend this to
    support usernames, etc.

    TODO: if USERNAME_KEY_ID, normalize to lower case before looking up. Need to
    wait until we've backfilled all existing entities with upper case key ids.
    """
    if id and id.startswith('__'):
      id = '\\' + id
    return ndb.Key(cls, id).get()

  def user_tag_id(self):
    """Returns the tag URI for this source, e.g. 'tag:plus.google.com:123456'."""
    return self.gr_source.tag_uri(self.key_id())

  def bridgy_path(self):
    """Returns the Bridgy page URL path for this source."""
    return f'/{self.SHORT_NAME}/{self.key_id()}'

  def bridgy_url(self):
    """Returns the Bridgy page URL for this source."""
    return util.host_url(self.bridgy_path())

  def silo_url(self, handler):
    """Returns the silo account URL, e.g. https://twitter.com/foo."""
    raise NotImplementedError()

  def label(self):
    """Human-readable label for this source."""
    return f'{self.label_name()} ({self.GR_CLASS.NAME})'

  def label_name(self):
    """Human-readable name or username for this source, whichever is preferred."""
    return self.name or self.key_id()

  @classmethod
  @ndb.transactional()
  def put_updates(cls, source):
    """Writes source.updates to the datastore transactionally.

    Returns:
      source: :class:`Source`

    Returns:
      the updated :class:`Source`
    """
    if not source.updates:
      return source

    to_log = {k: v for k, v in source.updates.items() if not k.endswith('_json')}
    logger.info(f'Updating {source.label()} {source.bridgy_path()} : {to_log!r}')

    updates = source.updates
    source = source.key.get()
    source.updates = updates
    for name, val in updates.items():
      setattr(source, name, val)

    source.put()
    return source

  def poll_period(self):
    """Returns the poll frequency for this source, as a :class:`datetime.timedelta`.

    Defaults to ~15m, depending on silo. If we've never sent a webmention for
    this source, or the last one we sent was over a month ago, we drop them down
    to ~1d after a week long grace period.
    """
    now = util.now_fn()
    if self.rate_limited:
      return self.RATE_LIMITED_POLL
    elif now < self.created + self.FAST_POLL_GRACE_PERIOD:
      return self.FAST_POLL
    elif not self.last_webmention_sent:
      return self.SLOW_POLL
    elif self.last_webmention_sent > now - timedelta(days=7):
      return self.FAST_POLL
    elif self.last_webmention_sent > now - timedelta(days=30):
      return self.FAST_POLL * 10
    else:
      return self.SLOW_POLL

  def should_refetch(self):
    """Returns True if we should run OPD refetch on this source now."""
    now = util.now_fn()
    if self.last_hfeed_refetch == REFETCH_HFEED_TRIGGER:
      return True
    elif not self.last_syndication_url:
      return False

    period = (self.FAST_REFETCH
              if self.last_syndication_url > now - timedelta(days=14)
              else self.SLOW_REFETCH)
    return self.last_poll_attempt >= self.last_hfeed_refetch + period

  @classmethod
  def bridgy_webmention_endpoint(cls, domain='brid.gy'):
    """Returns the Bridgy webmention endpoint for this source type."""
    return f'https://{domain}/webmention/{cls.SHORT_NAME}'

  def has_bridgy_webmention_endpoint(self):
    """Returns True if this source uses Bridgy's webmention endpoint."""
    return self.webmention_endpoint in (
      self.bridgy_webmention_endpoint(),
      self.bridgy_webmention_endpoint(domain='www.brid.gy'))

  def get_author_urls(self):
    """Determine the author urls for a particular source.

    In debug mode, replace test domains with localhost.

    Return:
      a list of string URLs, possibly empty
    """
    return [util.replace_test_domains_with_localhost(u) for u in self.domain_urls]

  def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    https://github.com/snarfed/bridgy/issues/456
    https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
    return []

  def get_activities_response(self, **kwargs):
    """Returns recent posts and embedded comments for this source.

    May be overridden by subclasses.
    """
    kwargs.setdefault('group_id', gr_source.SELF)
    resp = self.gr_source.get_activities_response(**kwargs)
    for activity in resp['items']:
      self._inject_user_urls(activity)
    return resp

  def get_activities(self, **kwargs):
    return self.get_activities_response(**kwargs)['items']

  def get_comment(self, comment_id, **kwargs):
    """Returns a comment from this source.

    Passes through to granary by default. May be overridden by subclasses.

    Args:
      comment_id: string, site-specific comment id
      kwargs: passed to :meth:`granary.source.Source.get_comment`

    Returns:
      dict, decoded ActivityStreams comment object, or None
    """
    comment = self.gr_source.get_comment(comment_id, **kwargs)
    if comment:
      self._inject_user_urls(comment)
    return comment

  def get_like(self, activity_user_id, activity_id, like_user_id, **kwargs):
    """Returns an ActivityStreams 'like' activity object.

    Passes through to granary by default. May be overridden
    by subclasses.

    Args:
      activity_user_id: string id of the user who posted the original activity
      activity_id: string activity id
      like_user_id: string id of the user who liked the activity
      kwargs: passed to granary.Source.get_comment
    """
    return self.gr_source.get_like(activity_user_id, activity_id, like_user_id,
                                   **kwargs)

  def _inject_user_urls(self, activity):
    """Adds this user's web site URLs to their user mentions (in tags), in place."""
    obj = activity.get('object') or activity
    user_tag_id = self.user_tag_id()
    for tag in obj.get('tags', []):
      if tag.get('id') == user_tag_id:
        tag.setdefault('urls', []).extend([{'value': u} for u in self.domain_urls])

  def create_comment(self, post_url, author_name, author_url, content):
    """Creates a new comment in the source silo.

    Must be implemented by subclasses.

    Args:
      post_url: string
      author_name: string
      author_url: string
      content: string

    Returns:
      response dict with at least 'id' field
    """
    raise NotImplementedError()

  def feed_url(self):
    """Returns the RSS or Atom (or similar) feed URL for this source.

    Must be implemented by subclasses. Currently only implemented by
    :mod:`blogger`, :mod:`medium`, :mod:`tumblr`, and :mod:`wordpress_rest`.

    Returns:
      string URL
    """
    raise NotImplementedError()

  def edit_template_url(self):
    """Returns the URL for editing this blog's template HTML.

    Must be implemented by subclasses. Currently only implemented by
    :mod:`blogger`, :mod:`medium`, :mod:`tumblr`, and :mod:`wordpress_rest`.

    Returns:
      string URL
    """
    raise NotImplementedError()

  @classmethod
  def button_html(cls, feature, **kwargs):
    """Returns an HTML string with a login form and button for this site.

    Mostly just passes through to
    :meth:`oauth_dropins.handlers.Start.button_html`.

    Returns: string, HTML
    """
    assert set(feature.split(',')) <= set(cls.FEATURES)
    form_extra = (kwargs.pop('form_extra', '') +
                  f'<input name="feature" type="hidden" value="{feature}" />')

    source = kwargs.pop('source', None)
    if source:
      form_extra += f'\n<input name="id" type="hidden" value="{source.key_id()}" />'

    if cls.OAUTH_START:
      return cls.OAUTH_START.button_html(
        f'/{cls.SHORT_NAME}/start',
        form_extra=form_extra,
        image_prefix='/oauth_dropins_static/',
        **kwargs)

    return ''

  @classmethod
  @ndb.transactional()
  def create_new(cls, user_url=None, **kwargs):
    """Creates and saves a new :class:`Source` and adds a poll task for it.

    Args:
      user_url: a string, optional. if provided, supersedes other urls when
        determining the author_url
      **kwargs: passed to :meth:`new()`

    Returns: newly created :class:`Source`
    """
    source = cls.new(**kwargs)
    if source is None:
      return None

    if not source.domain_urls:  # defer to the source if it already set this
      auth_entity = kwargs.get('auth_entity')
      if auth_entity and hasattr(auth_entity, 'user_json'):
        source.domain_urls, source.domains = source.urls_and_domains(
          auth_entity, user_url)
    logger.debug(f'URLs/domains: {source.domain_urls} {source.domains}')

    # check if this source already exists
    existing = source.key.get()
    if existing:
      # merge some fields
      source.features = set(source.features + existing.features)
      source.populate(**existing.to_dict(include=(
            'created', 'last_hfeed_refetch', 'last_poll_attempt', 'last_polled',
            'last_syndication_url', 'last_webmention_sent', 'superfeedr_secret',
            'webmention_endpoint')))
      verb = 'Updated'
    else:
      verb = 'Added'

    author_urls = source.get_author_urls()
    link = ('http://indiewebify.me/send-webmentions/?url=' + author_urls[0]
            if author_urls else 'http://indiewebify.me/#send-webmentions')
    feature = source.features[0] if source.features else 'listen'
    blurb = '%s %s. %s' % (
      verb, source.label(),
      'Try previewing a post from your web site!' if feature == 'publish'
      else '<a href="%s">Try a webmention!</a>' % link if feature == 'webmention'
      else "Refresh in a minute to see what we've found!")
    logger.info(f'{blurb} {source.bridgy_url()}')

    source.verify()
    if source.verified():
      flash(blurb)

    source.put()

    if 'webmention' in source.features:
      superfeedr.subscribe(source)

    if 'listen' in source.features and source.AUTO_POLL:
      util.add_poll_task(source, now=True)
      util.add_poll_task(source)

    return source

  def verified(self):
    """Returns True if this source is ready to be used, false otherwise.

    See :meth:`verify()` for details. May be overridden by subclasses, e.g.
    :class:`tumblr.Tumblr`.
    """
    if not self.domains or not self.domain_urls:
      return False
    if 'webmention' in self.features and not self.webmention_endpoint:
      return False
    if ('listen' in self.features and
        not (self.webmention_endpoint or self.last_webmention_sent)):
      return False
    return True

  def verify(self, force=False):
    """Checks that this source is ready to be used.

    For blog and listen sources, this fetches their front page HTML and
    discovers their webmention endpoint. For publish sources, this checks that
    they have a domain.

    May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

    Args:
      force: if True, fully verifies (e.g. re-fetches the blog's HTML and
        performs webmention discovery) even we already think this source is
        verified.
    """
    author_urls = [u for u, d in zip(self.get_author_urls(), self.domains)
                   if not util.in_webmention_blocklist(d)]
    if ((self.verified() and not force) or self.status == 'disabled' or
        not self.features or not author_urls):
      return

    author_url = author_urls[0]
    try:
      got = webmention.discover(author_url, timeout=util.HTTP_TIMEOUT)
      self.webmention_endpoint = got.endpoint
      self._fetched_html = got.response.text
    except BaseException as e:
      logger.info('Error discovering webmention endpoint', exc_info=e)
      self.webmention_endpoint = None

    self.put()

  def urls_and_domains(self, auth_entity, user_url, actor=None,
                       resolve_source_domain=True):
    """Returns this user's valid (not webmention-blocklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing
      actor: dict, optional AS actor for the user. If provided, overrides
        auth_entity
      resolve_source_domain: boolean, whether to follow redirects on URLs on
        this source's domain

    Returns:
      ([string url, ...], [string domain, ...])
    """
    if not actor:
      actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
    logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

    candidates = util.trim_nulls(util.uniquify(
        [user_url] + microformats2.object_urls(actor)))

    if len(candidates) > MAX_AUTHOR_URLS:
      logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

    urls = []
    for i, url in enumerate(candidates):
      on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
      resolve = ((resolve_source_domain or not on_source_domain) and
                 i < MAX_AUTHOR_URLS)
      resolved = self.resolve_profile_url(url, resolve=resolve)
      if resolved:
        urls.append(resolved)

    final_urls = []
    domains = []
    for url in util.dedupe_urls(urls):  # normalizes domains to lower case
      # skip links on this source's domain itself. only currently needed for
      # Mastodon; the other silo domains are in the webmention blocklist.
      domain = util.domain_from_link(url)
      if domain != self.gr_source.DOMAIN:
        final_urls.append(url)
        domains.append(domain)

    return final_urls, domains

  @staticmethod
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        mf2 = util.fetch_mf2(root)
        me_urls = mf2['rels'].get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logger.warning(f"Couldn't fetch {root}, preserving path in {final}", exc_info=True)

    return final

  def canonicalize_url(self, url, activity=None, **kwargs):
    """Canonicalizes a post or object URL.

    Wraps :class:`oauth_dropins.webutil.util.UrlCanonicalizer`.
    """
    return self.URL_CANONICALIZER(url, **kwargs) if self.URL_CANONICALIZER else url

  def infer_profile_url(self, url):
    """Given an arbitrary URL representing a person, try to find their
    profile URL for *this* service.

    Queries Bridgy's registered accounts for users with a particular
    domain in their silo profile.

    Args:
      url: string, a person's URL

    Return:
      a string URL for their profile on this service (or None)
    """
    domain = util.domain_from_link(url)
    if domain == self.gr_source.DOMAIN:
      return url
    user = self.__class__.query(self.__class__.domains == domain).get()
    if user:
      return self.gr_source.user_url(user.key_id())

  def preprocess_for_publish(self, obj):
    """Preprocess an object before trying to publish it.

    By default this tries to massage person tags so that the tag's
    "url" points to the person's profile on this service (as opposed
    to a person's homepage).

    The object is modified in place.

    Args:
      obj: ActivityStreams activity or object dict
    """
    for tag in obj.get('tags', []):
      if tag.get('objectType') == 'person':
        silo_url = None
        for url in microformats2.object_urls(tag):
          silo_url = url and self.infer_profile_url(url)
          if silo_url:
            break
        if silo_url:
          tag['url'] = silo_url

    # recurse on contained object(s)
    for obj in util.get_list(obj, 'object'):
      self.preprocess_for_publish(obj)

  def on_new_syndicated_post(self, syndpost):
    """Called when a new :class:`SyndicatedPost` is stored for this source.

    Args:
      syndpost: :class:`SyndicatedPost`
    """
    pass

  def is_private(self):
    """Returns True if this source is private aka protected.

    ...ie their posts are not public.
    """
    return False

  def is_activity_public(self, activity):
    """Returns True if the given activity is public, False otherwise.

    Just wraps :meth:`granary.source.Source.is_public`. Subclasses may override.
    """
    return gr_source.Source.is_public(activity)

  def is_beta_user(self):
    """Returns True if this is a "beta" user opted into new features.

    Beta users come from beta_users.txt.
    """
    return self.bridgy_path() in util.BETA_USER_PATHS

  def load_blocklist(self):
    """Fetches this user's blocklist, if supported, and stores it in the entity."""
    if not self.HAS_BLOCKS:
      return

    try:
      ids = self.gr_source.get_blocklist_ids()
    except gr_source.RateLimited as e:
      ids = e.partial or []

    self.blocked_ids = ids[:BLOCKLIST_MAX_IDS]
    self.put()

  def is_blocked(self, obj):
    """Returns True if an object's author is being blocked.

    ...ie they're in this user's block list.

    Note that this method is tested in test_twitter.py, not test_models.py, for
    historical reasons.
    """
    if not self.blocked_ids:
      return False

    for o in [obj] + util.get_list(obj, 'object'):
      for field in 'author', 'actor':
        if o.get(field, {}).get('numeric_id') in self.blocked_ids:
          return True

Example #18

Show file

class Twitter(models.Source):
    """A Twitter account.

  The key name is the username.
  """

    GR_CLASS = gr_twitter.Twitter
    SHORT_NAME = 'twitter'
    TYPE_LABELS = {
        'post': 'tweet',
        'comment': '@-reply',
        'repost': 'retweet',
        'like': 'favorite',
    }

    URL_CANONICALIZER = util.UrlCanonicalizer(
        domain=GR_CLASS.DOMAIN,
        approve=r'https://twitter\.com/[^/?]+/status/[^/?]+',
        reject=r'https://twitter\.com/.+\?protected_redirect=true',
        headers=util.USER_AGENT_HEADER)

    # Twitter's rate limiting window is currently 15m. A normal poll with nothing
    # new hits /statuses/user_timeline and /search/tweets once each. Both
    # allow 180 calls per window before they're rate limited.
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a Twitter entity.

    Args:
      handler: the current RequestHandler
      auth_entity: oauth-dropins.twitter.TwitterAuth
      kwargs: property values
    """
        user = json.loads(auth_entity.user_json)
        gr_source = gr_twitter.Twitter(*auth_entity.access_token())
        actor = gr_source.user_to_actor(user)
        return Twitter(id=user['screen_name'],
                       auth_entity=auth_entity.key,
                       url=actor.get('url'),
                       name=actor.get('displayName'),
                       picture=actor.get('image', {}).get('url'),
                       **kwargs)

    def silo_url(self):
        """Returns the Twitter account URL, e.g. https://twitter.com/foo."""
        return self.gr_source.user_url(self.key.id())

    def label_name(self):
        """Returns the username."""
        return self.key.id()

    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    Twitter search supports OR:
    https://dev.twitter.com/rest/public/search

    ...but it only returns complete(ish) results if we strip scheme from URLs,
    ie search for example.com instead of http://example.com/, and that also
    returns false positivies, so we check that the returned tweets actually have
    matching links. https://github.com/snarfed/bridgy/issues/565

    Returns: sequence of ActivityStreams activity dicts
    """
        urls = set(
            util.fragmentless(url) for url in self.domain_urls
            if not util.in_webmention_blacklist(util.domain_from_link(url)))
        if not urls:
            return []

        query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                            for url in urls)
        candidates = self.get_activities(search_query=query,
                                         group_id=gr_source.SEARCH,
                                         etag=self.last_activities_etag,
                                         fetch_replies=False,
                                         fetch_likes=False,
                                         fetch_shares=False,
                                         count=50)

        # filter out retweets and search false positives that don't actually link to us
        results = []
        for candidate in candidates:
            if candidate.get('verb') == 'share':
                continue
            obj = candidate['object']
            tags = obj.get('tags', [])
            atts = obj.get('attachments', [])
            for url in urls:
                if (url in obj.get('content', '') or any(
                        t.get('url', '').startswith(url)
                        for t in tags + atts)):
                    id = candidate['id']
                    results.append(candidate)
                    break

        return results

    def get_like(self, activity_user_id, activity_id, like_user_id, **kwargs):
        """Returns an ActivityStreams 'like' activity object for a favorite.

    We get Twitter favorites by scraping HTML, and we only get the first page,
    which only has 25. So, use a Response in the datastore first, if we have
    one, and only re-scrape HTML as a fallback.

    Args:
      activity_user_id: string id of the user who posted the original activity
      activity_id: string activity id
      like_user_id: string id of the user who liked the activity
      kwargs: passed to granary.Source.get_comment
    """
        id = self.gr_source.tag_uri('%s_favorited_by_%s' %
                                    (activity_id, like_user_id))
        resp = models.Response.get_by_id(id)
        if resp:
            return json.loads(resp.response_json)
        else:
            return super(Twitter, self).get_like(activity_user_id, activity_id,
                                                 like_user_id, **kwargs)

    def is_private(self):
        """Returns True if this Twitter account is protected.

    https://dev.twitter.com/rest/reference/get/users/show#highlighter_25173
    https://support.twitter.com/articles/14016
    https://support.twitter.com/articles/20169886
    """
        return json.loads(self.auth_entity.get().user_json).get('protected')

    def canonicalize_url(self, url, activity=None, **kwargs):
        """Normalize /statuses/ to /status/.

    https://github.com/snarfed/bridgy/issues/618
    """
        url = url.replace('/statuses/', '/status/')
        return super(Twitter, self).canonicalize_url(url, **kwargs)

Example #19

Show file

class GitHub(Source):
    """A GitHub user.

  WARNING: technically we should override URL_CANONICALIZER here and pass it
  fragment=True, since comment permalinks have meaningful fragments, eg
  #issuecomment=123. Right now, when we see a comment syndication URL, we strip
  its fragment and store just the issue URL as the synd URL, which is obviously
  wrong.

  ...HOWEVER, that has the nice side effect of enabling backfeed to comments as
  well as issues, since we think comment OPs are the issue itself.

  This is obviously not ideal. The fix is to extend
  original_post_discovery.discover() to allow silo-specific synd URL
  comparisons, so that a comment on an issue can match along with the issue
  itself. I'm lazy, though, so I'm leaving this as is for now.

  The key name is the GitHub username.
  """
    GR_CLASS = gr_github.GitHub
    OAUTH_START_HANDLER = oauth_github.StartHandler
    SHORT_NAME = 'github'
    TYPE_LABELS = {
        'post': 'issue',
        'like': 'star',
    }
    BACKFEED_REQUIRES_SYNDICATION_LINK = True
    DISABLE_HTTP_CODES = Source.DISABLE_HTTP_CODES + ('403', )
    CAN_PUBLISH = True
    # WARNING: see docstring
    URL_CANONICALIZER = util.UrlCanonicalizer(domain=GR_CLASS.DOMAIN,
                                              headers=util.REQUEST_HEADERS)

    @staticmethod
    def new(handler, auth_entity=None, **kwargs):
        """Creates and returns a :class:`GitHub` for the logged in user.

    Args:
      handler: the current :class:`webapp2.RequestHandler`
      auth_entity: :class:`oauth_dropins.github.GitHubAuth`
      kwargs: property values
    """
        user = json_loads(auth_entity.user_json)
        gr_source = gr_github.GitHub(access_token=auth_entity.access_token())
        actor = gr_source.user_to_actor(user)
        return GitHub(id=auth_entity.key.id(),
                      auth_entity=auth_entity.key,
                      name=actor.get('displayName'),
                      picture=actor.get('image', {}).get('url'),
                      url=actor.get('url'),
                      **kwargs)

    def silo_url(self):
        """Returns the GitHub account URL, e.g. https://github.com/foo."""
        return self.gr_source.user_url(self.key.id())

    def label_name(self):
        """Returns the username."""
        return self.key.id()

    def get_activities_response(self, *args, **kwargs):
        """Drop kwargs that granary doesn't currently support for github."""
        kwargs.update({
            'fetch_shares': None,
            'fetch_mentions': None,
        })
        return self.gr_source.get_activities_response(*args, **kwargs)

Example #20

Show file

File: facebook.py Project: sheyril/bridgy

class Facebook(browser.BrowserSource):
    """A Facebook account.

  The key name is the Facebook global user id.
  """
    GR_CLASS = gr_facebook.Facebook
    SHORT_NAME = 'facebook'
    OAUTH_START_HANDLER = oauth_facebook.StartHandler
    URL_CANONICALIZER = util.UrlCanonicalizer(
        domain=GR_CLASS.DOMAIN,
        subdomain='www',
        query=True,
        approve=r'https://www\.facebook\.com/[^/?]+/posts/[^/?]+$',
        headers=util.REQUEST_HEADERS)
    # no reject regexp; non-private FB post URLs just 404

    # blank granary Facebook object, shared across all instances
    gr_source = gr_facebook.Facebook()

    # unique name used in FB URLs, e.g. facebook.com/[username]
    username = ndb.StringProperty()

    @classmethod
    def new(cls, handler, auth_entity=None, actor=None, **kwargs):
        """Creates and returns an entity based on an AS1 actor."""
        src = super().new(handler, auth_entity=None, actor=actor, **kwargs)
        src.username = actor.get('username')
        return src

    @classmethod
    def key_id_from_actor(cls, actor):
        """Returns the actor's numeric_id field to use as this entity's key id.

    numeric_id is the Facebook global user id.
    """
        return actor['numeric_id']

    @classmethod
    def lookup(cls, id):
        """Returns the entity with the given id or username."""
        return ndb.Key(cls, id).get() or cls.query(cls.username == id).get()

    def silo_url(self):
        """Returns the Facebook profile URL, e.g. https://facebook.com/foo.

    Facebook profile URLS with app-scoped user ids (eg www.facebook.com/ID) no
    longer work as of April 2018, so if that's all we have, return None instead.
    https://developers.facebook.com/blog/post/2018/04/19/facebook-login-changes-address-abuse/
    """
        if self.username:
            return self.gr_source.user_url(self.username)

        user_id = self.key.id()
        if util.is_int(id) and int(id) < MIN_APP_SCOPED_ID:
            return self.gr_source.user_url(user_id)

    @classmethod
    def button_html(cls, feature, **kwargs):
        return super(cls, cls).button_html(feature,
                                           form_method='get',
                                           **kwargs)
        return oauth_instagram.StartHandler.button_html(
            '/about#browser-extension',
            form_method='get',
            image_prefix='/oauth_dropins/static/')

    def canonicalize_url(self, url, **kwargs):
        """Facebook-specific standardization of syndicated urls.

    Canonical form is https://www.facebook.com/USERID/posts/POSTID

    Args:
      url: a string, the url of the syndicated content
      kwargs: unused

    Return:
      a string, the canonical form of the syndication url
    """
        if util.domain_from_link(url) != self.gr_source.DOMAIN:
            return None

        def post_url(id):
            return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

        parsed = urllib.parse.urlparse(url)
        params = urllib.parse.parse_qs(parsed.query)
        path = parsed.path.strip('/').split('/')
        url_id = self.gr_source.post_id(url)
        ids = params.get('story_fbid') or params.get('fbid')

        post_id = ids[0] if ids else url_id
        if post_id:
            url = post_url(post_id)

        url = url.replace('facebook.com/%s/' % self.username,
                          'facebook.com/%s/' % self.key.id())

        return super(Facebook, self).canonicalize_url(url)