Example #1
  def test_follow_redirects(self):
    self.expect_requests_head('http://will/redirect',
                              redirected_url='http://final/url')
    self.mox.ReplayAll()
    self.assert_equals('http://final/url',
                       util.follow_redirects('http://will/redirect').url)

    # the result should now be in memcache, so we shouldn't fetch the URL again
    self.assert_equals('http://final/url',
                       util.follow_redirects('http://will/redirect').url)
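For orientation, a minimal sketch of the behavior this test exercises. Bridgy versions differ on caching (memcache here, an explicit CacheDict in Examples #5 and #7); this sketch follows the explicit-cache variant, and the function name, body, and timeout are assumptions rather than Bridgy's actual util.follow_redirects:

import requests

def follow_redirects_sketch(url, cache=None, headers=None):
  """HEAD the url, let requests chase any 3xx chain, memoize the result."""
  key = 'R ' + url  # cache-key prefix as seen in Example #7
  if cache is not None and key in cache:
    return cache[key]  # cached: a repeat call doesn't refetch
  resp = requests.head(url, allow_redirects=True, headers=headers, timeout=15)
  if cache is not None:
    cache[key] = resp
  return resp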
Example #2
  def canonicalize_syndication_url(self, url, **kwargs):
    """Follow redirects to find and use profile nicknames instead of ids.

    ...e.g. +RyanBarrett in https://plus.google.com/+RyanBarrett/posts/JPpA8mApAv2.
    """
    return super(GooglePlusPage, self).canonicalize_syndication_url(
      util.follow_redirects(url).url)
Example #3
def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source.  If one is found, stores a new SyndicatedPost in the
  db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication_urls
  """

  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    if util.domain_from_link(syndication_url) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=syndication_url, original=permalink)
      results.setdefault(syndication_url, []).append(relationship)
  return results
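The domain check at the end relies on util.domain_from_link, Bridgy's URL-to-domain helper. A rough standard-library equivalent, for readers without the codebase (an assumption; the real helper likely handles more cases, e.g. schemeless URLs):

from urllib.parse import urlparse

def domain_from_link_sketch(url):
  """Hostname of a URL, lowercased, with any port stripped."""
  return urlparse(url).netloc.split(':')[0].lower()

assert domain_from_link_sketch('https://plus.google.com/+RyanBarrett') == 'plus.google.com'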
Example #4
  def test_follow_redirects_with_refresh_header(self):
    self.expect_requests_head('http://will/redirect',
                              response_headers={'refresh': '0; url=http://refresh'})
    self.expect_requests_head('http://refresh', redirected_url='http://final')

    self.mox.ReplayAll()
    self.assert_equals('http://final',
                       util.follow_redirects('http://will/redirect').url)
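This test covers the HTTP refresh response header ('0; url=...'), which some servers send in place of a 3xx redirect, so the follower has to parse it and issue the follow-up request itself. A hedged sketch of just that step (helper name and structure assumed, not Bridgy's code):

import requests

def resolve_refresh_sketch(resp, headers=None):
  """If resp carries a refresh header, HEAD its url= target instead."""
  for part in resp.headers.get('refresh', '').split(';'):
    part = part.strip()
    if part.lower().startswith('url='):
      return requests.head(part[4:], allow_redirects=True,
                           headers=headers, timeout=15)
  return resp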
Example #5
  def test_follow_redirects_with_refresh_header(self):
    headers = {'x': 'y'}
    self.expect_requests_head('http://will/redirect', headers=headers,
                              response_headers={'refresh': '0; url=http://refresh'})
    self.expect_requests_head('http://refresh', headers=headers,
                              redirected_url='http://final')

    self.mox.ReplayAll()
    cache = util.CacheDict()
    self.assert_equals('http://final',
                       util.follow_redirects('http://will/redirect', cache=cache,
                                             headers=headers).url)
Example #6
def discover(source, activity, fetch_hfeed=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously
  found SyndicatedPosts but will not do posse-post-discovery to find
  new ones.

  Args:
    source: models.Source subclass. (Immutable! At least mostly. Changes to
      property values will *not* automatically be stored back in the datastore.
      last_syndication_url is special-cased in tasks.Poll.)
    activity: activity dict
    fetch_hfeed: boolean

  Return:
    the activity, updated with original post urls if any are found
  """
  as_source.Source.original_post_discovery(activity)

  # TODO possible optimization: if we've discovered a backlink to a
  # post on the author's domain (i.e., it included a link or
  # citation), then skip the rest of this.

  # Use source.domain_urls for now; it seems more reliable than the
  # activity.actor.url (which depends on getting the right data back from
  # various APIs). Consider using the actor's url, with domain_urls as the
  # fallback in the future to support content from non-Bridgy users.
  #
  # author_url = activity.get('actor', {}).get('url')
  obj = activity.get('object') or activity
  author_url = source.get_author_url()
  syndication_url = obj.get('url')

  if not author_url:
    logging.debug('no author url, cannot find h-feed %s', author_url)
    return activity

  if not syndication_url:
    logging.debug('no syndication url, cannot process h-entries %s',
                  syndication_url)
    return activity

  # use the canonical syndication url on both sides, so that we have
  # the best chance of finding a match. Some silos allow several
  # different permalink formats to point to the same place (e.g.,
  # facebook user id instead of user name)
  syndication_url = source.canonicalize_syndication_url(
    util.follow_redirects(syndication_url).url)

  return _posse_post_discovery(source, activity,
                               author_url, syndication_url,
                               fetch_hfeed)
Example #7
  def test_follow_redirects(self):
    for i in range(2):
      self.expect_requests_head('http://will/redirect',
                                redirected_url='http://final/url')
    self.mox.ReplayAll()

    cache = util.CacheDict()
    self.assert_equals(
      'http://final/url',
      util.follow_redirects('http://will/redirect', cache=cache).url)

    self.assertEquals('http://final/url', cache['R http://will/redirect'].url)

    # another call without cache should refetch
    self.assert_equals(
      'http://final/url',
      util.follow_redirects('http://will/redirect').url)

    # another call with cache shouldn't refetch
    self.assert_equals(
      'http://final/url',
      util.follow_redirects('http://will/redirect', cache=cache).url)
Example #8
def discover(source, activity, fetch_hfeed=True):
    """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously
  found SyndicatedPosts but will not do posse-post-discovery to find
  new ones.

  Args:
    source: models.Source subclass. (Immutable! At least mostly. Changes to
      property values will *not* automatically be stored back in the datastore.
      last_syndication_url is special-cased in tasks.Poll.)
    activity: activity dict
    fetch_hfeed: boolean

  Return:
    the activity, updated with original post urls if any are found
  """
    gr_source.Source.original_post_discovery(activity)

    # TODO possible optimization: if we've discovered a backlink to a
    # post on the author's domain (i.e., it included a link or
    # citation), then skip the rest of this.
    obj = activity.get("object") or activity
    syndication_url = obj.get("url")

    if not source.get_author_urls():
        logging.debug("no author url(s), cannot find h-feed")
        return activity

    if not syndication_url:
        logging.debug("no syndication url, cannot process h-entries %s", syndication_url)
        return activity

    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_syndication_url(util.follow_redirects(syndication_url).url)

    return _posse_post_discovery(source, activity, syndication_url, fetch_hfeed)
Example #9
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously
  found SyndicatedPosts but will not do posse-post-discovery to find
  new ones.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs

  Returns: (set(string original post URLs), set(string mention URLs)) tuple

  """
  if not source.updates:
    source.updates = {}

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.USER_AGENT_HEADER)

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return originals, mentions

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_syndication_url(
      util.follow_redirects(syndication_url).url)
    originals.update(_posse_post_discovery(
      source, activity, syndication_url, fetch_hfeed))
    originals = set(util.dedupe_urls(originals))
  else:
    logging.debug('no syndication url, cannot process h-entries')

  return originals, mentions
Example #10
  def post(self, source_short_name):
    logging.info('Params: %s', self.request.params.items())
    # strip fragments from source and target url
    self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
    self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      return self.error('Could not parse target URL %s' % self.target_url)

    # look up source by domain
    source_cls = models.sources[source_short_name]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      return self.error(
        'Could not find %s account for %s. Is it registered with Bridgy?' %
        (source_cls.GR_CLASS.NAME, domain))

    if urlparse.urlparse(self.target_url).path in ('', '/'):
      return self.error('Home page webmentions are not currently supported.')

    # create BlogWebmention entity
    id = u'%s %s' % (self.source_url, self.target_url)
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      self.response.write(self.entity.published)
      return
    logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

    # fetch source page
    resp = self.fetch_mf2(self.source_url)
    if not resp:
      return
    self.fetched, data = resp

    item = self.find_mention_item(data)
    if not item:
      return self.error('Could not find target URL %s in source page %s' %
                        (self.target_url, self.fetched.url),
                        data=data, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = 'http://%s/' % domain

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = first_value(props, 'author')
    if author:
      if isinstance(author, basestring):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = first_value(author_props, 'name')
        author_url = first_value(author_props, 'url')

    # if present, u-url overrides source url
    u_url = first_value(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += ' <br /> <a href="%s">via %s</a>' % (
      source_url, util.domain_from_link(source_url))

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception, e:
      code, body = util.interpret_http_exception(e)
      msg = 'Error: %s %s; %s' % (code, e, body)
      if code == '401':
        logging.warning('Disabling source!')
        self.source.status = 'disabled'
        self.source.put()
        return self.error(msg, status=code, mail=False)
      elif code == '404':
        # post is gone
        return self.error(msg, status=code, mail=False)
      elif code or body:
        return self.error(msg, status=code, mail=True)
      else:
        raise
Example #11
    def post(self, source_short_name):
        logging.info('Params: %s', list(self.request.params.items()))
        # strip fragments from source and target url
        self.source_url = urllib.parse.urldefrag(
            util.get_required_param(self, 'source'))[0]
        self.target_url = urllib.parse.urldefrag(
            util.get_required_param(self, 'target'))[0]

        # follow target url through any redirects, strip utm_* query params
        resp = util.follow_redirects(self.target_url)
        redirected_target_urls = [r.url for r in resp.history]
        self.target_url = util.clean_url(resp.url)

        # parse and validate target URL
        domain = util.domain_from_link(self.target_url)
        if not domain:
            return self.error('Could not parse target URL %s' %
                              self.target_url)

        # look up source by domain
        source_cls = models.sources[source_short_name]
        domain = domain.lower()
        self.source = (source_cls.query().filter(
            source_cls.domains == domain).filter(
                source_cls.features == 'webmention').filter(
                    source_cls.status == 'enabled').get())
        if not self.source:
            # check for a rel-canonical link. Blogger uses these when it serves a post
            # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
            # epeus.blogspot.com.
            # https://github.com/snarfed/bridgy/issues/805
            mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
            if not mf2:
                # fetch_mf2() already wrote the error response
                return
            domains = util.dedupe_urls(
                util.domain_from_link(url)
                for url in mf2[1]['rels'].get('canonical', []))
            if domains:
                self.source = (source_cls.query().filter(
                    source_cls.domains.IN(domains)).filter(
                        source_cls.features == 'webmention').filter(
                            source_cls.status == 'enabled').get())

        if not self.source:
            return self.error(
                'Could not find %s account for %s. Is it registered with Bridgy?'
                % (source_cls.GR_CLASS.NAME, domain))

        # check that the target URL path is supported
        target_path = urllib.parse.urlparse(self.target_url).path
        if target_path in ('', '/'):
            return self.error(
                'Home page webmentions are not currently supported.',
                status=202)
        for pattern in self.source.PATH_BLOCKLIST:
            if pattern.match(target_path):
                return self.error(
                    '%s webmentions are not supported for URL path: %s' %
                    (self.source.GR_CLASS.NAME, target_path),
                    status=202)

        # create BlogWebmention entity
        id = '%s %s' % (self.source_url, self.target_url)
        self.entity = BlogWebmention.get_or_insert(
            id,
            source=self.source.key,
            redirected_target_urls=redirected_target_urls)
        if self.entity.status == 'complete':
            # TODO: response message saying update isn't supported
            self.response.write(self.entity.published)
            return
        logging.debug("BlogWebmention entity: '%s'",
                      self.entity.key.urlsafe().decode())

        # fetch source page
        fetched = self.fetch_mf2(self.source_url)
        if not fetched:
            return
        resp, mf2 = fetched

        item = self.find_mention_item(mf2.get('items', []))
        if not item:
            return self.error(
                'Could not find target URL %s in source page %s' %
                (self.target_url, resp.url),
                data=mf2,
                log_exception=False)

        # default author to target domain
        author_name = domain
        author_url = 'http://%s/' % domain

        # extract author name and URL from h-card, if any
        props = item['properties']
        author = first_value(props, 'author')
        if author:
            if isinstance(author, str):
                author_name = author
            else:
                author_props = author.get('properties', {})
                author_name = first_value(author_props, 'name')
                author_url = first_value(author_props, 'url')

        # if present, u-url overrides source url
        u_url = first_value(props, 'url')
        if u_url:
            self.entity.u_url = u_url

        # generate content
        content = props['content'][
            0]  # find_mention_item() guaranteed this is here
        text = (content.get('html') or content.get('value')).strip()
        source_url = self.entity.source_url()
        text += ' <br /> <a href="%s">via %s</a>' % (
            source_url, util.domain_from_link(source_url))

        # write comment
        try:
            self.entity.published = self.source.create_comment(
                self.target_url, author_name, author_url, text)
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            msg = 'Error: %s %s; %s' % (code, e, body)
            if code == '401':
                logging.warning('Disabling source due to: %s' % e,
                                stack_info=True)
                self.source.status = 'disabled'
                self.source.put()
                return self.error(msg,
                                  status=code,
                                  report=self.source.is_beta_user())
            elif code == '404':
                # post is gone
                return self.error(msg, status=code, report=False)
            elif util.is_connection_failure(e) or (code
                                                   and int(code) // 100 == 5):
                return self.error(msg,
                                  status=util.ERROR_HTTP_RETURN_CODE,
                                  report=False)
            elif code or body:
                return self.error(msg, status=code, report=True)
            else:
                raise

        # write results to datastore
        self.entity.status = 'complete'
        self.entity.put()
        self.response.write(json_dumps(self.entity.published))
Example #12
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: a models.Source subclass
    permalink: url of the unprocessed post
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Return:
    a dict from syndicated url to new models.SyndicatedPosts
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except BaseException:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  if parsed:
    relsynd = parsed.get('rels').get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed['items']
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    parsed = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
          source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)

  return results
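The middle of this example gathers syndication candidates from both the page-level rel-syndication links and each h-entry's u-syndication property. Isolated from the fetching and datastore logic, that extraction looks roughly like this (a sketch assuming already-fetched HTML; mf2py.parse is the modern spelling of the mf2py.Parser(...).to_dict() call above):

import mf2py

def syndication_links_sketch(html, url):
  """Collect rel-syndication and h-entry u-syndication links from a page."""
  parsed = mf2py.parse(doc=html, url=url)
  links = set(parsed['rels'].get('syndication', []))
  for item in parsed['items']:
    if 'h-entry' in item['type']:  # usually one h-entry, but check all
      links.update(item.get('properties', {}).get('syndication', []))
  return links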
Example #13
  def dispatch_request(self, site):
    logger.info(f'Params: {list(request.values.items())}')
    # strip fragments from source and target url
    self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
    self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      self.error(f'Could not parse target URL {self.target_url}')

    # look up source by domain
    source_cls = models.sources[site]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      # check for a rel-canonical link. Blogger uses these when it serves a post
      # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
      # epeus.blogspot.com.
      # https://github.com/snarfed/bridgy/issues/805
      mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
      if not mf2:
        # fetch_mf2() already wrote the error response
        return
      domains = util.dedupe_urls(
        util.domain_from_link(url)
        for url in mf2[1]['rels'].get('canonical', []))
      if domains:
        self.source = (source_cls.query()
                       .filter(source_cls.domains.IN(domains))
                       .filter(source_cls.features == 'webmention')
                       .filter(source_cls.status == 'enabled')
                       .get())

    if not self.source:
      self.error(
        f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. Is it registered with Bridgy?')

    # check that the target URL path is supported
    target_path = urllib.parse.urlparse(self.target_url).path
    if target_path in ('', '/'):
      msg = 'Home page webmentions are not currently supported.'
      logger.info(msg)
      return {'error': msg}, 202
    for pattern in self.source.PATH_BLOCKLIST:
      if pattern.match(target_path):
        msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
        logger.info(msg)
        return {'error': msg}, 202

    # create BlogWebmention entity
    id = f'{self.source_url} {self.target_url}'
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      return self.entity.published
    logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

    # fetch source page
    fetched = self.fetch_mf2(self.source_url)
    if not fetched:
      return
    resp, mf2 = fetched

    item = self.find_mention_item(mf2.get('items', []))
    if not item:
      self.error(f'Could not find target URL {self.target_url} in source page {resp.url}', data=mf2, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = f'http://{domain}/'

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = get_first(props, 'author')
    if author:
      if isinstance(author, str):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = get_first(author_props, 'name')
        author_url = get_first(author_props, 'url')

    # if present, u-url overrides source url
    u_url = get_first(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = f'Error: {code}: {e}; {body}'
      if code == '401':
        logger.warning(f'Disabling source due to: {e}', exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        self.error(msg, status=code, report=self.source.is_beta_user())
      elif code == '404':
        # post is gone
        self.error(msg, status=code, report=False)
      elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
        self.error(msg, status=502, report=False)
      elif code or body:
        self.error(msg, status=code, report=True)
      else:
        raise

    # write results to datastore
    self.entity.status = 'complete'
    self.entity.put()

    return self.entity.published
Example #14
  def post(self, source_short_name):
    logging.info('Params: %s', self.request.params.items())
    # strip fragments from source and target url
    self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
    self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      return self.error('Could not parse target URL %s' % self.target_url)

    # look up source by domain
    source_cls = models.sources[source_short_name]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      # check for a rel-canonical link. Blogger uses these when it serves a post
      # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
      # epeus.blogspot.com.
      # https://github.com/snarfed/bridgy/issues/805
      mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
      if not mf2:
        # fetch_mf2() already wrote the error response
        return
      domains = util.dedupe_urls(
        util.domain_from_link(url)
        for url in mf2[1].get('rels', {}).get('canonical', []))
      if domains:
        self.source = (source_cls.query()
                       .filter(source_cls.domains.IN(domains))
                       .filter(source_cls.features == 'webmention')
                       .filter(source_cls.status == 'enabled')
                       .get())

    if not self.source:
      return self.error(
        'Could not find %s account for %s. Is it registered with Bridgy?' %
        (source_cls.GR_CLASS.NAME, domain))

    # check that the target URL path is supported
    target_path = urlparse.urlparse(self.target_url).path
    if target_path in ('', '/'):
      return self.error('Home page webmentions are not currently supported.',
                        status=202)
    for pattern in self.source.PATH_BLACKLIST:
      if pattern.match(target_path):
        return self.error('%s webmentions are not supported for URL path: %s' %
                          (self.source.GR_CLASS.NAME, target_path), status=202)

    # create BlogWebmention entity
    id = '%s %s' % (self.source_url, self.target_url)
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      self.response.write(self.entity.published)
      return
    logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

    # fetch source page
    resp = self.fetch_mf2(self.source_url)
    if not resp:
      return
    self.fetched, data = resp

    item = self.find_mention_item(data.get('items', []))
    if not item:
      return self.error('Could not find target URL %s in source page %s' %
                        (self.target_url, self.fetched.url),
                        data=data, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = 'http://%s/' % domain

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = first_value(props, 'author')
    if author:
      if isinstance(author, basestring):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = first_value(author_props, 'name')
        author_url = first_value(author_props, 'url')

    # if present, u-url overrides source url
    u_url = first_value(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += ' <br /> <a href="%s">via %s</a>' % (
      source_url, util.domain_from_link(source_url))

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = 'Error: %s %s; %s' % (code, e, body)
      if code == '401':
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        return self.error(msg, status=code, mail=self.source.is_beta_user())
      elif code == '404':
        # post is gone
        return self.error(msg, status=code, mail=False)
      elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
        return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
      elif code or body:
        return self.error(msg, status=code, mail=True)
      else:
        raise

    # write results to datastore
    self.entity.status = 'complete'
    self.entity.put()
    self.response.write(json.dumps(self.entity.published))
Example #15
  def test_follow_redirects_defaults_scheme_to_http(self):
    self.expect_requests_head('http://foo/bar', redirected_url='http://final')
    self.mox.ReplayAll()
    self.assert_equals('http://final', util.follow_redirects('foo/bar').url)
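This pins down that follow_redirects prepends http:// when a URL arrives without a scheme. The normalization could be as small as the following (an assumption, not the actual util code):

from urllib.parse import urlparse

def add_default_scheme_sketch(url):
  """Default schemeless URLs like 'foo/bar' to http://."""
  return url if urlparse(url).scheme else 'http://' + url

assert add_default_scheme_sketch('foo/bar') == 'http://foo/bar'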
Example #16
    def post(self, source_short_name):
        logging.info('Params: %s', self.request.params.items())
        # strip fragments from source and target url
        self.source_url = urlparse.urldefrag(
            util.get_required_param(self, 'source'))[0]
        self.target_url = urlparse.urldefrag(
            util.get_required_param(self, 'target'))[0]

        # follow target url through any redirects, strip utm_* query params
        resp = util.follow_redirects(self.target_url)
        redirected_target_urls = [r.url for r in resp.history]
        self.target_url = util.clean_url(resp.url)

        # parse and validate target URL
        domain = util.domain_from_link(self.target_url)
        if not domain:
            return self.error('Could not parse target URL %s' %
                              self.target_url)

        # look up source by domain
        source_cls = models.sources[source_short_name]
        domain = domain.lower()
        self.source = (source_cls.query().filter(
            source_cls.domains == domain).filter(
                source_cls.features == 'webmention').filter(
                    source_cls.status == 'enabled').get())
        if not self.source:
            return self.error(
                'Could not find %s account for %s. Is it registered with Bridgy?'
                % (source_cls.GR_CLASS.NAME, domain))

        if urlparse.urlparse(self.target_url).path in ('', '/'):
            return self.error(
                'Home page webmentions are not currently supported.')

        # create BlogWebmention entity
        id = u'%s %s' % (self.source_url, self.target_url)
        self.entity = BlogWebmention.get_or_insert(
            id,
            source=self.source.key,
            redirected_target_urls=redirected_target_urls)
        if self.entity.status == 'complete':
            # TODO: response message saying update isn't supported
            self.response.write(self.entity.published)
            return
        logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

        # fetch source page
        resp = self.fetch_mf2(self.source_url)
        if not resp:
            return
        self.fetched, data = resp

        item = self.find_mention_item(data)
        if not item:
            return self.error(
                'Could not find target URL %s in source page %s' %
                (self.target_url, self.fetched.url),
                data=data,
                log_exception=False)

        # default author to target domain
        author_name = domain
        author_url = 'http://%s/' % domain

        # extract author name and URL from h-card, if any
        props = item['properties']
        author = first_value(props, 'author')
        if author:
            if isinstance(author, basestring):
                author_name = author
            else:
                author_props = author.get('properties', {})
                author_name = first_value(author_props, 'name')
                author_url = first_value(author_props, 'url')

        # if present, u-url overrides source url
        u_url = first_value(props, 'url')
        if u_url:
            self.entity.u_url = u_url

        # generate content
        content = props['content'][
            0]  # find_mention_item() guaranteed this is here
        text = (content.get('html') or content.get('value')).strip()
        source_url = self.entity.source_url()
        text += ' <br /> <a href="%s">via %s</a>' % (
            source_url, util.domain_from_link(source_url))

        # write comment
        try:
            self.entity.published = self.source.create_comment(
                self.target_url, author_name, author_url, text)
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            msg = 'Error: %s %s; %s' % (code, e, body)
            if code == '401':
                logging.warning('Disabling source due to: %s' % e,
                                exc_info=True)
                self.source.status = 'disabled'
                self.source.put()
                return self.error(msg,
                                  status=code,
                                  mail=self.source.is_beta_user())
            elif code == '404':
                # post is gone
                return self.error(msg, status=code, mail=False)
            elif util.is_connection_failure(e) or (code
                                                   and int(code) // 100 == 5):
                return self.error(msg,
                                  status=util.ERROR_HTTP_RETURN_CODE,
                                  mail=False)
            elif code or body:
                return self.error(msg, status=code, mail=True)
            else:
                raise

        # write results to datastore
        self.entity.status = 'complete'
        self.entity.put()
        self.response.write(json.dumps(self.entity.published))