def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  urls = []
  for url in util.trim_nulls(util.uniquify(
      [user_url] + [actor.get('url')] +
      [u.get('value') for u in actor.get('urls', [])])):
    domain = util.domain_from_link(url)
    if domain and not util.in_webmention_blacklist(domain.lower()):
      urls.append(url)

  urls = util.dedupe_urls(urls)
  domains = [util.domain_from_link(url).lower() for url in urls]
  return urls, domains
def test_domain_from_link(self):
  self.assertEqual('localhost', util.domain_from_link('http://localhost/foo'))
  self.assertEqual('a.b.c.d', util.domain_from_link('http://a.b.c.d/foo'))
  for good_link in ('asdf.com', 'www.asdf.com', 'https://asdf.com/',
                    'asdf.com/foo?bar#baz'):
    actual = util.domain_from_link(good_link)
    self.assertEqual('asdf.com', actual, '%s returned %s' % (good_link, actual))

  self.assertEqual('asdf.com.', util.domain_from_link('http://asdf.com./x'))

  for bad_link in '', ' ', 'a&b.com', 'http://', 'file:///':
    self.assertEqual(None, util.domain_from_link(bad_link))
def test_domain_from_link(self):
  self.assertEqual('localhost', util.domain_from_link('http://localhost/foo'))
  self.assertEqual('a.b.c.d', util.domain_from_link('http://a.b.c.d/foo'))
  for good_link in ('asdf.com', 'www.asdf.com', 'https://asdf.com/',
                    'asdf.com/foo?bar#baz', 'm.asdf.com', 'asdf.com:1234',
                    'mobile.asdf.com/foo?bar#baz', '//asdf.com/foo/bar',
                    'https://m.asdf.com/foo?bar#baz'):
    actual = util.domain_from_link(good_link)
    self.assertEqual('asdf.com', actual, '%s returned %s' % (good_link, actual))

  self.assertEqual('asdf.com.', util.domain_from_link('http://asdf.com./x'))

  for bad_link in '', ' ', 'a&b.com', 'http://', 'file:///':
    self.assertEqual(None, util.domain_from_link(bad_link))
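# The two tests above pin down domain_from_link()'s observable behavior:
# scheme-optional parsing, stripping www./m./mobile. prefixes, ignoring ports,
# paths, queries, and fragments, preserving a trailing dot, and returning None
# for unparseable input. A rough self-contained sketch of that contract (not
# Bridgy's real implementation, which lives in oauth-dropins' webutil):

import re

def domain_from_link_sketch(url):
  """Extracts the host from a URL-ish string; returns None if unparseable."""
  if not url:
    return None
  url = url.strip()
  # split off an explicit scheme, if any; only http(s) is supported
  match = re.match(r'^([A-Za-z][A-Za-z0-9+.-]*):(?=//)', url)
  if match:
    if match.group(1) not in ('http', 'https'):
      return None  # e.g. file:///
    url = url[match.end():]
  if url.startswith('//'):  # scheme-relative
    url = url[2:]
  # host is everything up to the first path/query/fragment/port delimiter
  host = url.split('/')[0].split('?')[0].split('#')[0].split(':')[0]
  for prefix in ('www.', 'm.', 'mobile.'):
    if host.startswith(prefix):
      host = host[len(prefix):]
  if host and re.match(r'^[A-Za-z0-9.-]+$', host):
    return host
  return None

assert domain_from_link_sketch('https://m.asdf.com/foo?bar#baz') == 'asdf.com'
assert domain_from_link_sketch('file:///') is None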
def urls_and_domains(self, auth_entity, user_url, actor=None,
                     resolve_source_domain=True):
  """Returns this user's valid (not webmention-blocklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing
    actor: dict, optional AS actor for the user. If provided, overrides
      auth_entity
    resolve_source_domain: boolean, whether to follow redirects on URLs on
      this source's domain

  Returns:
    ([string url, ...], [string domain, ...])
  """
  if not actor:
    actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
  logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

  urls = []
  for i, url in enumerate(candidates):
    on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
    resolve = ((resolve_source_domain or not on_source_domain)
               and i < MAX_AUTHOR_URLS)
    resolved = self.resolve_profile_url(url, resolve=resolve)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blocklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
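# The candidate pipeline above leans on a couple of webutil helpers. As a
# mental model, simplified sketches (the real versions also handle dicts,
# fragments, and www/scheme variants when deduping):

def uniquify_sketch(seq):
  """Drops duplicates, preserving order."""
  seen = set()
  return [x for x in seq if not (x in seen or seen.add(x))]

def trim_nulls_sketch(seq):
  """Drops None and empty values."""
  return [x for x in seq if x]

candidates = trim_nulls_sketch(uniquify_sketch(
  ['https://a.com/', None, 'https://a.com/', '', 'https://b.org/']))
assert candidates == ['https://a.com/', 'https://b.org/']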
def check_token_for_actor(self, actor):
  """Checks that the given actor is public and matches the request's token.

  Raises: :class:`HTTPException` with HTTP 400
  """
  if not actor:
    self.abort(400, 'Missing actor!')

  if not gr_source.Source.is_public(actor):
    self.abort(
      400,
      f'Your {self.gr_source().NAME} account is private. Bridgy only supports public accounts.')

  token = util.get_required_param(self, 'token')
  domains = set(
    util.domain_from_link(util.replace_test_domains_with_localhost(u))
    for u in microformats2.object_urls(actor))
  domains.discard(self.source_class().GR_CLASS.DOMAIN)

  logging.info(f'Checking token against domains {domains}')
  for domain in ndb.get_multi(ndb.Key(Domain, d) for d in domains):
    if domain and token in domain.tokens:
      return

  self.abort(403, f'Token {token} is not authorized for any of: {domains}')
def finish(self, auth_entity, state=None):
  if not auth_entity:
    util.maybe_add_or_delete_source(Tumblr, auth_entity, state)
    return

  vars = {
    'action': '/tumblr/add',
    'state': state,
    'auth_entity_key': auth_entity.key.urlsafe().decode(),
    'blogs': [
      {'id': b['name'],
       'title': b.get('title', ''),
       'domain': util.domain_from_link(b['url'])}
      # user_json is the user/info response:
      # http://www.tumblr.com/docs/en/api/v2#user-methods
      for b in json_loads(auth_entity.user_json)['user']['blogs']
      if b.get('name') and b.get('url')],
  }
  logger.info(f'Rendering choose_blog.html with {vars}')
  return render_template('choose_blog.html', **vars)
def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new SyndicatedPost in the db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    if util.domain_from_link(syndication_url) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=syndication_url, original=permalink)
      results.setdefault(syndication_url, []).append(relationship)

  return results
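# To make the matching step concrete: after redirects and canonicalization,
# only URLs whose domain equals the source silo's domain are stored. A toy
# illustration with a stubbed source (hypothetical names, reusing the
# domain_from_link_sketch() above):

class FakeTwitterSource:
  class AS_CLASS:
    DOMAIN = 'twitter.com'

  def canonicalize_syndication_url(self, url):
    return url.replace('//mobile.twitter.com/', '//twitter.com/')

source = FakeTwitterSource()
candidates = ['https://mobile.twitter.com/alice/status/123',
              'https://example.com/not-a-silo-post']
matches = [u for u in map(source.canonicalize_syndication_url, candidates)
           if domain_from_link_sketch(u) == source.AS_CLASS.DOMAIN]
assert matches == ['https://twitter.com/alice/status/123']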
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def post(self):
  # load source
  try:
    source = ndb.Key(urlsafe=util.get_required_param(self, 'source_key')).get()
    if not source:
      self.abort(400, 'Source key not found')
  except ProtocolBufferDecodeError:
    logging.exception('Bad value for source_key')
    self.abort(400, 'Bad value for source_key')

  # validate URL, find silo post
  url = util.get_required_param(self, 'url')
  domain = util.domain_from_link(url)
  msg = 'Discovering now. Refresh in a minute to see the results!'

  if domain == source.GR_CLASS.DOMAIN:
    post_id = source.GR_CLASS.post_id(url)
    util.add_discover_task(source, post_id)
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, source.GR_CLASS.post_id(link))
    else:
      msg = 'Failed to fetch %s or find a %s syndication link.' % (
        util.pretty_link(url), source.GR_CLASS.NAME)
  else:
    msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

  self.messages.add(msg)
  self.redirect(source.bridgy_url(self))
def post(self):
  source = self.load_source()

  # validate URL, find silo post
  url = util.get_required_param(self, 'url')
  domain = util.domain_from_link(url)
  path = urllib.parse.urlparse(url).path
  msg = 'Discovering now. Refresh in a minute to see the results!'

  if domain == source.GR_CLASS.DOMAIN:
    post_id = source.GR_CLASS.post_id(url)
    if post_id:
      type = 'event' if path.startswith('/events/') else None
      util.add_discover_task(source, post_id, type=type)
    else:
      msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(
      source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, source.GR_CLASS.post_id(link))
      source.updates = {'last_syndication_url': util.now_fn()}
      models.Source.put_updates(source)
    else:
      msg = 'Failed to fetch %s or find a %s syndication link.' % (
        util.pretty_link(url), source.GR_CLASS.NAME)
  else:
    msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

  self.messages.add(msg)
  self.redirect(source.bridgy_url(self))
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Only searches for root domain web site URLs! Skips URLs with paths; they
  tend to generate false positive results in G+'s search. Not sure why yet.

  G+ search supports OR:
  https://developers.google.com/+/api/latest/activities/search

  Returns: sequence of ActivityStreams activity dicts
  """
  urls = ['"%s"' % util.fragmentless(url) for url in self.domain_urls
          if not util.in_webmention_blacklist(util.domain_from_link(url))
          and urlparse.urlparse(url).path in ('', '/')
          ][:models.MAX_AUTHOR_URLS]
  if urls:
    return self.get_activities(
      search_query=' OR '.join(urls), group_id=gr_source.SEARCH,
      etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
      fetch_shares=False, count=50)

  return []
def discover():
  source = util.load_source()

  # validate URL, find silo post
  url = request.form['url']
  domain = util.domain_from_link(url)
  path = urllib.parse.urlparse(url).path
  msg = 'Discovering now. Refresh in a minute to see the results!'

  gr_source = source.gr_source
  if domain == gr_source.DOMAIN:
    post_id = gr_source.post_id(url)
    if post_id:
      type = 'event' if path.startswith('/events/') else None
      util.add_discover_task(source, post_id, type=type)
    else:
      msg = f"Sorry, that doesn't look like a {gr_source.NAME} post URL."
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(
      source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, gr_source.post_id(link))
      source.updates = {'last_syndication_url': util.now_fn()}
      models.Source.put_updates(source)
    else:
      msg = f'Failed to fetch {util.pretty_link(url)} or find a {gr_source.NAME} syndication link.'
  else:
    msg = f'Please enter a URL on either your web site or {gr_source.NAME}.'

  flash(msg)
  return redirect(source.bridgy_url())
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  # Fetch blog's site info
  auth_domain = auth_entity.key.id()
  site_info = json.loads(auth_entity.urlopen(
    API_SITE_URL % auth_entity.blog_id).read())

  site_url = site_info.get('URL')
  if site_url:
    domains = [util.domain_from_link(site_url), auth_domain]
    urls = [site_url, auth_entity.blog_url]
  else:
    domains = [auth_domain]
    urls = [auth_entity.blog_url]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def canonicalize_url(self, url, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urllib.parse.urlparse(url)
  params = urllib.parse.parse_qs(parsed.query)
  path = parsed.path.strip('/').split('/')
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  post_id = ids[0] if ids else url_id
  if post_id:
    url = post_url(post_id)

  url = url.replace('facebook.com/%s/' % self.username,
                    'facebook.com/%s/' % self.key.id())

  return super(Facebook, self).canonicalize_url(url)
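# For intuition, the kinds of rewrites canonicalize_url() performs
# (illustrative examples only; 212038 is a made-up numeric user id):
#
#   photo permalink with an fbid query param:
#     https://www.facebook.com/photo.php?fbid=314159
#     -> https://www.facebook.com/212038/posts/314159
#   story permalink with a story_fbid query param:
#     https://www.facebook.com/story.php?story_fbid=271828&id=212038
#     -> https://www.facebook.com/212038/posts/271828
#   username swapped for the numeric id the source is keyed by:
#     https://www.facebook.com/snarfed.org/posts/314159
#     -> https://www.facebook.com/212038/posts/314159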
def post(self):
  source = self.load_source()

  # validate URL, find silo post
  url = util.get_required_param(self, 'url')
  domain = util.domain_from_link(url)
  path = urlparse.urlparse(url).path
  msg = 'Discovering now. Refresh in a minute to see the results!'

  if domain == source.GR_CLASS.DOMAIN:
    post_id = source.GR_CLASS.post_id(url)
    if post_id:
      type = 'event' if path.startswith('/events/') else None
      util.add_discover_task(source, post_id, type=type)
    else:
      msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, source.GR_CLASS.post_id(link))
      source.updates = {'last_syndication_url': util.now_fn()}
      models.Source.put_updates(source)
    else:
      msg = 'Failed to fetch %s or find a %s syndication link.' % (
        util.pretty_link(url), source.GR_CLASS.NAME)
  else:
    msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

  self.messages.add(msg)
  self.redirect(source.bridgy_url(self))
def finish(self, auth_entity, state=None):
  if not auth_entity:
    self.maybe_add_or_delete_source(Tumblr, auth_entity, state)
    return

  vars = {
    'action': '/tumblr/add',
    'state': state,
    'auth_entity_key': auth_entity.key.urlsafe().decode(),
    'blogs': [
      {'id': b['name'],
       'title': b.get('title', ''),
       'domain': util.domain_from_link(b['url'])}
      # user_json is the user/info response:
      # http://www.tumblr.com/docs/en/api/v2#user-methods
      for b in json_loads(auth_entity.user_json)['user']['blogs']
      if b.get('name') and b.get('url')],
  }
  logging.info('Rendering choose_blog.html with %s', vars)

  self.response.headers['Content-Type'] = 'text/html'
  self.response.out.write(
    JINJA_ENV.get_template('choose_blog.html').render(**vars))
def post(self):
  source = self.load_source()
  redirect_url = '%s?%s' % (self.request.path, urllib.parse.urlencode({
    'source_key': source.key.urlsafe().decode(),
  }))

  add = self.request.get('add')
  delete = self.request.get('delete')
  if (add and delete) or (not add and not delete):
    self.abort(400, 'Either add or delete param (but not both) required')

  link = util.pretty_link(add or delete)

  if add:
    resolved = Source.resolve_profile_url(add)
    if resolved:
      if resolved in source.domain_urls:
        self.messages.add('%s already exists.' % link)
      else:
        source.domain_urls.append(resolved)
        domain = util.domain_from_link(resolved)
        source.domains.append(domain)
        source.put()
        self.messages.add('Added %s.' % link)
    else:
      self.messages.add(
        "%s doesn't look like your web site. Try again?" % link)

  else:
    assert delete
    try:
      source.domain_urls.remove(delete)
    except ValueError:
      self.abort(400, "%s not found in %s's current web sites" % (
        delete, source.label()))

    domain = util.domain_from_link(delete)
    if domain not in set(util.domain_from_link(url)
                         for url in source.domain_urls):
      source.domains.remove(domain)
    source.put()
    self.messages.add('Removed %s.' % link)

  self.redirect(redirect_url)
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost
  tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow
    # redirects and fetch link contents, and this handler should be small and
    # fast and try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    logging.info('Found links: %s', links)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=links)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=links)

    bp.get_or_save()
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost
  tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow
    # redirects and fetch link contents, and this handler should be small and
    # fast and try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s',
                     _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)

    bp.get_or_save()
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urllib.parse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please '
        '<a href="https://indieauth.com/setup">add an Instagram rel-me '
        'link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    try:
      actor = gr_instagram.Instagram(scrape=True).get_actor(
        username, ignore_rate_limit=True)
    except Exception as e:
      code, _ = util.interpret_http_exception(e)
      if code in Instagram.RATE_LIMIT_HTTP_CODES:
        self.messages.add(
          '<a href="https://github.com/snarfed/bridgy/issues/665#issuecomment-524977427">'
          'Apologies, Instagram is temporarily blocking us.</a> '
          'Please try again later!')
        return self.redirect('/')
      else:
        raise

    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or bio field "
        'and try again.' % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def add_or_update_domain():
  domain = Domain.get_or_insert(util.domain_from_link(
    util.replace_test_domains_with_localhost(auth_entity.key.id())))
  domain.auth = auth_entity.key
  if state not in domain.tokens:
    domain.tokens.append(state)
  domain.put()

  flash(f'Authorized you for {domain.key.id()}.')
def edit_websites_post():
  source = util.load_source()
  redirect_url = f'{request.path}?{urllib.parse.urlencode({"source_key": source.key.urlsafe().decode()})}'

  add = request.values.get('add')
  delete = request.values.get('delete')
  if (add and delete) or (not add and not delete):
    error('Either add or delete param (but not both) required')

  link = util.pretty_link(add or delete)

  if add:
    resolved = Source.resolve_profile_url(add)
    if resolved:
      if resolved in source.domain_urls:
        flash(f'{link} already exists.')
      else:
        source.domain_urls.append(resolved)
        domain = util.domain_from_link(resolved)
        source.domains.append(domain)
        source.put()
        flash(f'Added {link}.')
    else:
      flash(f"{link} doesn't look like your web site. Try again?")

  else:
    assert delete
    try:
      source.domain_urls.remove(delete)
    except ValueError:
      error(f"{delete} not found in {source.label()}'s current web sites")

    domain = util.domain_from_link(delete)
    if domain not in {util.domain_from_link(url)
                      for url in source.domain_urls}:
      source.domains.remove(domain)
    source.put()
    flash(f'Removed {link}.')

  return redirect(redirect_url)
def post(self):
  source = self.load_source()
  redirect_url = '%s?%s' % (self.request.path, urllib.urlencode({
    'source_key': source.key.urlsafe(),
  }))

  add = self.request.get('add')
  delete = self.request.get('delete')
  if (add and delete) or (not add and not delete):
    self.abort(400, 'Either add or delete param (but not both) required')

  link = util.pretty_link(add or delete)

  if add:
    resolved = Source.resolve_profile_url(add)
    if resolved:
      if resolved in source.domain_urls:
        self.messages.add('%s already exists.' % link)
      else:
        source.domain_urls.append(resolved)
        domain = util.domain_from_link(resolved)
        source.domains.append(domain)
        source.put()
        self.messages.add('Added %s.' % link)
    else:
      self.messages.add("%s doesn't look like your web site. Try again?" % link)

  else:
    assert delete
    try:
      source.domain_urls.remove(delete)
    except ValueError:
      self.abort(400, "%s not found in %s's current web sites" % (
        delete, source.label()))

    domain = util.domain_from_link(delete)
    if domain not in set(util.domain_from_link(url)
                         for url in source.domain_urls):
      source.domains.remove(domain)
    source.put()
    self.messages.add('Removed %s.' % link)

  self.redirect(redirect_url)
def post(self):
  logging.debug('Params: %s', self.request.params)
  if self.lease(ndb.Key(urlsafe=self.request.params['key'])):
    # skip "self" links to this blog's domain
    source_domains = self.entity.source.get().domains
    to_send = set()
    for url in self.entity.unsent:
      link_domain = util.domain_from_link(url)
      if link_domain and link_domain not in source_domains:
        to_send.add(url)

    self.entity.unsent = list(to_send)
    self.send_webmentions()
def finish(self, auth_entity, state=None):
  if not auth_entity:
    return

  assert state

  domain = Domain.get_or_insert(util.domain_from_link(auth_entity.key.id()))
  domain.auth = auth_entity.key
  if state not in domain.tokens:
    domain.tokens.append(state)
  domain.put()

  self.messages.add(f'Authorized you for {domain.key.id()}.')
  self.redirect('/')
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  G+ search supports OR:
  https://developers.google.com/+/api/latest/activities/search

  Returns: sequence of ActivityStreams activity dicts
  """
  query = ' OR '.join(
    '"%s"' % util.fragmentless(url) for url in self.domain_urls
    if not util.in_webmention_blacklist(util.domain_from_link(url)))
  return self.get_activities(
    search_query=query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def _url_and_domain(auth_entity, blog_name=None):
  """Returns the blog URL and domain.

  Args:
    auth_entity: oauth_dropins.tumblr.TumblrAuth
    blog_name: which blog. optional. matches the 'name' field for one of the
      blogs in auth_entity.user_json['user']['blogs'].

  Returns:
    (string url, string domain, boolean ok)
  """
  for blog in json.loads(auth_entity.user_json).get('user', {}).get('blogs', []):
    if ((blog_name and blog_name == blog.get('name')) or
        (not blog_name and blog.get('primary'))):
      return blog['url'], util.domain_from_link(blog['url']), True

  return None, None, False
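# The relevant shape of auth_entity.user_json here, abridged from Tumblr's
# user/info API response (only the fields this function reads):

user_json = {
  'user': {
    'blogs': [
      {'name': 'primaryblog', 'primary': True,
       'title': 'My Blog', 'url': 'http://primaryblog.tumblr.com/'},
      {'name': 'sideblog', 'url': 'http://sideblog.tumblr.com/'},
    ],
  },
}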
def _urls_and_domains(auth_entity, blog_name=None):
  """Returns this blog's URL and domain.

  Args:
    auth_entity: oauth_dropins.tumblr.TumblrAuth
    blog_name: which blog. optional. matches the 'name' field for one of the
      blogs in auth_entity.user_json['user']['blogs'].

  Returns:
    ([string url], [string domain])
  """
  for blog in json.loads(auth_entity.user_json).get('user', {}).get('blogs', []):
    if ((blog_name and blog_name == blog.get('name')) or
        (not blog_name and blog.get('primary'))):
      return [blog['url']], [util.domain_from_link(blog['url']).lower()]

  return [], []
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Twitter search supports OR:
  https://dev.twitter.com/rest/public/search

  ...but it only returns complete(ish) results if we strip scheme from URLs,
  ie search for example.com instead of http://example.com/, and that also
  returns false positives, so we check that the returned tweets actually have
  matching links. https://github.com/snarfed/bridgy/issues/565

  Returns: sequence of ActivityStreams activity dicts
  """
  urls = set(util.fragmentless(url) for url in self.domain_urls
             if not util.in_webmention_blacklist(util.domain_from_link(url)))
  if not urls:
    return []

  query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                      for url in urls)
  candidates = self.get_activities(
    search_query=query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)

  # filter out retweets and search false positives that don't actually link to us
  results = []
  for candidate in candidates:
    if candidate.get('verb') == 'share':
      continue
    obj = candidate['object']
    tags = obj.get('tags', [])
    atts = obj.get('attachments', [])
    for url in urls:
      if (url in obj.get('content', '') or
          any(t.get('url', '').startswith(url) for t in tags + atts)):
        id = candidate['id']
        results.append(candidate)
        break

  return results
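# Sketch of the query the Twitter version builds: fragmentless() strips
# #fragments, and schemeless(url, slashes=False) drops the scheme and
# trailing slash, so one quoted term matches both http and https links.
# For example (stand-in for schemeless, just for illustration):

urls = {'https://example.com/', 'http://foo.bar/baz'}
query = ' OR '.join('"%s"' % u.split('://')[1].rstrip('/') for u in sorted(urls))
assert query == '"foo.bar/baz" OR "example.com"'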
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates BlogPost entities and adds propagate-blogpost tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.warning('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.warning("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    source.preprocess_superfeedr_item(item)
    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow
    # redirects and fetch link contents, and this handler should be small and
    # fast and try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [l for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    logging.info('Found links: %s', links)
    models.BlogPost(id=url,
                    source=source.key,
                    feed_item=item,
                    unsent=links,
                    ).get_or_save()
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  user = json_loads(auth_entity.user_json)
  actor = (user.get('actor')  # for Instagram; its user_json is IndieAuth
           or self.gr_source.user_to_actor(user))
  logging.debug('Extracting URLs and domains from actor: %s',
                json_dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    resolved = self.resolve_profile_url(url, resolve=i < MAX_AUTHOR_URLS)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blacklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Returns: sequence of ActivityStreams activity dicts
  """
  urls = {util.schemeless(util.fragmentless(url), slashes=False)
          for url in self.domain_urls
          if not util.in_webmention_blocklist(util.domain_from_link(url))}
  if not urls:
    return []

  # Search syntax: https://www.reddit.com/wiki/search
  url_query = ' OR '.join(f'site:"{u}" OR selftext:"{u}"' for u in urls)
  return self.get_activities(
    search_query=url_query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)
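# The resulting Reddit search string for the same two example sites would
# look like this (sketch; actual ordering depends on set iteration):
#
#   site:"example.com" OR selftext:"example.com" OR
#   site:"foo.bar/baz" OR selftext:"foo.bar/baz"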
def canonicalize_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urllib.parse.urlparse(url)
  params = urllib.parse.parse_qs(parsed.query)
  path = parsed.path.strip('/').split('/')
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if path and path[0] == 'notes':
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)
      elif path and len(path) > 1 and path[1] == 'posts':
        url = post_url(url_id)

  for alternate_id in util.trim_nulls(itertools.chain(
      (self.username or self.inferred_username,), self.inferred_user_ids)):
    url = url.replace('facebook.com/%s/' % alternate_id,
                      'facebook.com/%s/' % self.key.id())

  return super(FacebookPage, self).canonicalize_url(url)
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(
      username, ignore_rate_limit=True)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or "
        'bio field and try again.' % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def _urls_and_domains(auth_entity, blog_name=None):
  """Returns this blog's URL and domain.

  Args:
    auth_entity: :class:`oauth_dropins.tumblr.TumblrAuth`
    blog_name: which blog. optional. matches the 'name' field for one of the
      blogs in auth_entity.user_json['user']['blogs'].

  Returns:
    ([string url], [string domain])
  """
  for blog in json_loads(auth_entity.user_json).get('user', {}).get('blogs', []):
    if ((blog_name and blog_name == blog.get('name')) or
        (not blog_name and blog.get('primary'))):
      return [blog['url']], [util.domain_from_link(blog['url']).lower()]

  return [], []
def source_url(self, target_url):
  # determine which activity to use
  try:
    activity = self.activities[0]
    if self.entity.urls_to_activity:
      urls_to_activity = json_loads(self.entity.urls_to_activity)
      if urls_to_activity:
        activity = self.activities[urls_to_activity[target_url]]
  except (KeyError, IndexError):
    logging.warning("""\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, self.entity.urls_to_activity, self.activities)
    self.abort(util.ERROR_HTTP_RETURN_CODE)

  # generate source URL
  id = activity['id']
  parsed = util.parse_tag_uri(id)
  post_id = parsed[1] if parsed else id
  # prefer brid-gy.appspot.com to brid.gy because non-browsers (ie OpenSSL)
  # currently have problems with brid.gy's SSL cert. details:
  # https://github.com/snarfed/bridgy/issues/20
  host_url = self.request.host_url
  domain = util.domain_from_link(host_url)
  if domain == util.PRIMARY_DOMAIN or domain in util.OTHER_DOMAINS:
    host_url = 'https://brid-gy.appspot.com'

  path = [host_url, self.entity.type, self.entity.source.get().SHORT_NAME,
          self.entity.source.string_id(), post_id]

  if self.entity.type != 'post':
    # parse and add response id. (we know Response key ids are always tag URIs)
    _, response_id = util.parse_tag_uri(self.entity.key.string_id())
    reaction_id = response_id
    if self.entity.type in ('like', 'react', 'repost', 'rsvp'):
      response_id = response_id.split('_')[-1]  # extract responder user id
    path.append(response_id)
    if self.entity.type == 'react':
      path.append(reaction_id)

  return '/'.join(path)
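# Shape of the generated URLs, inferred from the path construction above
# (usernames and ids are made up). A Twitter 'like' Response keyed
# tag:twitter.com,2013:123_favorited_by_456 on tweet tag:twitter.com,2013:123
# by source user 'alice' would render as:
#
#   https://brid-gy.appspot.com/like/twitter/alice/123/456
#
# where 456 is the responder user id extracted by response_id.split('_')[-1].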
def authorize(self):
  """Check for a backlink to brid.gy/publish/SILO."""
  bases = set()
  if util.domain_from_link(self.request.host_url) == 'brid.gy':
    bases.add('brid.gy')
    bases.add('www.brid.gy')  # also accept www
  else:
    bases.add(self.request.host_url)

  expected = ['%s/publish/%s' % (base, self.source.SHORT_NAME)
              for base in bases]

  if self.entity.html:
    for url in expected:
      if url in self.entity.html or urllib.quote(url, safe='') in self.entity.html:
        return True

  self.error("Couldn't find link to %s" % expected[0])
  return False
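# In other words, on production the published post's HTML must contain the
# expected backlink substring, either plain or percent-encoded. A minimal
# check mirroring what authorize() looks for:

html = 'posted via <a href="https://brid.gy/publish/twitter"></a>'
assert 'brid.gy/publish/twitter' in html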
def record_source_webmention(self, mention):
  """Sets this source's last_webmention_sent and maybe webmention_endpoint.

  Args:
    mention: webmentiontools.send.WebmentionSend
  """
  self.source = self.source.key.get()
  logging.info('Setting last_webmention_sent')
  self.source.last_webmention_sent = util.now_fn()

  if (mention.receiver_endpoint != self.source.webmention_endpoint and
      util.domain_from_link(mention.target_url) in self.source.domains):
    logging.info('Also setting webmention_endpoint to %s (discovered in %s; was %s)',
                 mention.receiver_endpoint, mention.target_url,
                 self.source.webmention_endpoint)
    self.source.webmention_endpoint = mention.receiver_endpoint

  self.source.put()
def record_source_webmention(self, endpoint, target):
  """Sets this source's last_webmention_sent and maybe webmention_endpoint.

  Args:
    endpoint: str, URL
    target: str, URL
  """
  self.source = self.source.key.get()
  logging.info('Setting last_webmention_sent')
  self.source.last_webmention_sent = util.now_fn()

  if (endpoint != self.source.webmention_endpoint and
      util.domain_from_link(target) in self.source.domains):
    logging.info(
      'Also setting webmention_endpoint to %s (discovered in %s; was %s)',
      endpoint, target, self.source.webmention_endpoint)
    self.source.webmention_endpoint = endpoint

  self.source.put()
def infer_profile_url(self, url):
  """Given an arbitrary URL representing a person, try to find their
  profile URL for *this* service.

  Queries Bridgy's registered accounts for users with a particular
  domain in their silo profile.

  Args:
    url: string, a person's URL

  Return:
    a string URL for their profile on this service (or None)
  """
  domain = util.domain_from_link(url)
  if domain == self.gr_source.DOMAIN:
    return url
  user = self.__class__.query(self.__class__.domains == domain).get()
  if user:
    return self.gr_source.user_url(user.key.id())
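# Example, assuming this mixin is on the Twitter source class and a user with
# domain 'alice.example' is registered under the Twitter id 'alice':
#
#   infer_profile_url('https://alice.example/about')
#   -> 'https://twitter.com/alice'   # via self.gr_source.user_url('alice')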
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect_home_or_user_page(state)

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(username)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect_home_or_user_page(state)

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add("Please add %s to your Instagram profile's website or "
                        'bio field and try again.' % website)
      return self.redirect_home_or_user_page(state)

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect_home_or_user_page(state)

    source = self.maybe_add_or_delete_source(Instagram, auth_entity, state,
                                             actor=actor)
def canonicalize_syndication_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return url

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urlparse.urlparse(url)
  params = urlparse.parse_qs(parsed.query)
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if parsed.path.startswith('/notes/'):
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)

  username = self.username or self.inferred_username
  if username:
    url = url.replace('facebook.com/%s/' % username,
                      'facebook.com/%s/' % self.key.id())

  # facebook always uses https and www
  return super(FacebookPage, self).canonicalize_syndication_url(
    url, scheme='https', subdomain='www.')
def finish(self, auth_entity, state=None):
  if not auth_entity:
    self.maybe_add_or_delete_source(Tumblr, auth_entity, state)
    return

  vars = {
    "action": "/tumblr/add",
    "state": state,
    "auth_entity_key": auth_entity.key.urlsafe(),
    "blogs": [
      {"id": b["name"],
       "title": b.get("title", ""),
       "domain": util.domain_from_link(b["url"])}
      # user_json is the user/info response:
      # http://www.tumblr.com/docs/en/api/v2#user-methods
      for b in json.loads(auth_entity.user_json)["user"]["blogs"]
      if b.get("name") and b.get("url")],
  }
  logging.info("Rendering choose_blog.html with %s", vars)

  self.response.headers["Content-Type"] = "text/html"
  self.response.out.write(template.render("templates/choose_blog.html", vars))
def infer_profile_url(self, url):
  """Find a Facebook profile URL (ideally the one with the user's numeric ID).

  Looks up existing sources by username, inferred username, and domain.

  Args:
    url: string, a person's URL

  Return:
    a string URL for their Facebook profile (or None)
  """
  domain = util.domain_from_link(url)
  if domain == self.gr_source.DOMAIN:
    username = urlparse.urlparse(url).path.strip('/')
    if '/' not in username:
      user = FacebookPage.query(ndb.OR(
        FacebookPage.username == username,
        FacebookPage.inferred_username == username)).get()
      if user:
        return self.gr_source.user_url(user.key.id())

  return super(FacebookPage, self).infer_profile_url(url)
def new(handler, auth_entity=None, actor=None, **kwargs):
  """Creates and returns an Instagram for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.instagram.InstagramAuth
  """
  user = json.loads(auth_entity.user_json)
  user['actor'] = actor
  auth_entity.user_json = json.dumps(user)
  auth_entity.put()

  username = actor['username']
  if not kwargs.get('features'):
    kwargs['features'] = ['listen']
  urls = microformats2.object_urls(actor)
  return Instagram(id=username,
                   auth_entity=auth_entity.key,
                   name=actor.get('displayName'),
                   picture=actor.get('image', {}).get('url'),
                   url=gr_instagram.Instagram.user_url(username),
                   domain_urls=urls,
                   domains=[util.domain_from_link(url) for url in urls],
                   **kwargs)
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warning('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(source, permalink, entry, refetch,
                                 preexisting.get(permalink, []),
                                 store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
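
# Minimal standalone sketch of the rel-feed discovery loop above. The HTML
# is invented for illustration; assumes BeautifulSoup 4 and the stdlib
# urlparse module (Python 2).
import urlparse
from bs4 import BeautifulSoup

html = """
<html><head><link rel="feed" href="/updates.html"></head>
<body><a rel="feed" href="http://feeds.example.com/all">feed</a></body></html>
"""
dom = BeautifulSoup(html)
author_url = 'http://example.com/'
for node in dom.find_all('link', rel='feed') + dom.find_all('a', rel='feed'):
  href = node.get('href')
  if href:
    # relative hrefs resolve against the author page, as in _process_author
    print urlparse.urljoin(author_url, href)
# http://example.com/updates.html
# http://feeds.example.com/all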
def post(self, source_short_name):
  logging.info('Params: %s', self.request.params.items())
  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception, e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source!')
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=False)
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise
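
# first_value() isn't defined in this excerpt. A minimal sketch of what it
# presumably does, inferred from how it's called above: microformats2
# property values are lists, and the handler wants the first one, or None.
def first_value(props, name):
  return next(iter(props.get(name, [])), None)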
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source: a subclass of models.Source
    permalink: url of the unprocessed post
    refetch_blanks: boolean, whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Returns:
    a dict from syndicated url to new models.SyndicatedPost
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except BaseException:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  if parsed:
    relsynd = parsed.get('rels', {}).get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed['items']
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source. TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    parsed_url = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed_url.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)
  return results
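
# Illustrative sketch of the two places syndication links show up in an
# mf2py parse, using an invented permalink page. Assumes mf2py is installed;
# the URLs are made up.
import mf2py

html = """
<link rel="syndication" href="https://instagram.com/p/123">
<article class="h-entry">
  <a class="u-syndication" href="https://instagram.com/p/123">on IG</a>
</article>
"""
parsed = mf2py.Parser(doc=html).to_dict()
print parsed.get('rels', {}).get('syndication', [])
# ['https://instagram.com/p/123']
for item in parsed['items']:
  if 'h-entry' in item['type']:
    print item.get('properties', {}).get('syndication', [])
# ['https://instagram.com/p/123']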
def do_send_webmentions(self):
  unsent = set()
  for url in self.entity.unsent + self.entity.error:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(url)
    if ok:
      # When debugging locally, redirect our own webmentions to localhost
      if appengine_config.DEBUG and domain in util.LOCALHOST_TEST_DOMAINS:
        url = url.replace(domain, 'localhost')
      unsent.add(url)
  self.entity.unsent = sorted(unsent)
  self.entity.error = []

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    domain = util.domain_from_link(target)
    cache_key = 'W ' + domain
    cached = memcache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint for %s: %s',
                   domain, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999):
          error = mention.error
      except BaseException:
        logging.warning('', exc_info=True)
        error = getattr(mention, 'error', None)
        if not error:
          error = {'code': 'EXCEPTION'}

    if error is None:
      logging.info('Sent! %s', mention.response)
      if not self.entity.sent:
        self.set_last_webmention_sent()
      self.entity.sent.append(target)
      memcache.set(cache_key, mention.receiver_endpoint,
                   time=WEBMENTION_DISCOVERY_CACHE_TIME)
    else:
      if error['code'] == 'NO_ENDPOINT':
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
        memcache.set(cache_key, error, time=WEBMENTION_DISCOVERY_CACHE_TIME)
      elif (error['code'] == 'BAD_TARGET_URL' and
            error['http_status'] / 100 == 4):
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.warning('Propagate task failed')
    self.release('error')
  else:
    self.complete()
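
# Sketch of the per-domain discovery cache contract used above, with
# invented values: the cached value is either an endpoint URL string
# (discovery succeeded) or a WebmentionSend-style error dict (it failed
# (semi-)permanently). Assumes the App Engine memcache client and the
# WEBMENTION_DISCOVERY_CACHE_TIME constant from this module.
from google.appengine.api import memcache

cache_key = 'W ' + util.domain_from_link('http://example.com/post')
# success case: remember the discovered endpoint
memcache.set(cache_key, 'http://example.com/webmention',
             time=WEBMENTION_DISCOVERY_CACHE_TIME)
# failure case: remember that discovery found no endpoint
memcache.set(cache_key, {'code': 'NO_ENDPOINT'},
             time=WEBMENTION_DISCOVERY_CACHE_TIME)
cached = memcache.get(cache_key)
if isinstance(cached, dict):
  pass  # (semi-)permanent failure: skip sending to this domain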