def add_original_post_urls(self, post_id, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post_id: string post id
    obj: ActivityStreams post object
    prop: string property name in obj to add the original post URLs to
  """
  post = None
  try:
    post = self.source.get_post(post_id)
  # FIX: was a bare `except:`, which also swallows SystemExit and
  # KeyboardInterrupt. A fetch failure is non-fatal here, so log and bail.
  except Exception:
    logging.warning('Error fetching source post %s', post_id, exc_info=True)
    return
  if not post:
    logging.warning('Source post %s not found', post_id)
    return

  original_post_discovery.discover(self.source, post, fetch_hfeed=False)
  # only 'article' tags with a URL are candidate original post links
  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  # normalize obj[prop] to a list so we can append to it below
  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]
  if upstreams:
    # upstreamDuplicates take precedence: they become the original post URLs
    # and the post's own article tags are demoted to mentions.
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  for url_list in obj[prop], obj.get('tags', []):
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        # appending while iterating is deliberate: the loop also visits
        # the appended entry, but it's already in `seen`.
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  logging.info('After original post discovery, urls are: %s', seen)
def post(self, source_short_name):
  """Handles an incoming webmention to a blog source.

  Looks up the registered source by the target URL's domain, fetches and
  parses the source page, finds the mention of the target, and publishes
  it as a comment via the source's API.

  Args:
    source_short_name: string, short name key into models.sources for the
      blog platform (e.g. 'wordpress')
  """
  # FIX: format string was 'Params: %self' — only the '%s' is substituted,
  # so the log line ended with a stray literal 'elf'.
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_webmention_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?'
      % (source_cls.GR_CLASS.NAME, domain))

  # only webmentions to individual posts are supported, not the home page
  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  # (renamed from `id` to avoid shadowing the builtin)
  entity_id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    entity_id, source=self.source.key,
    redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  # modernized from `except Exception, e` (the `as` form works on py2.6+)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if code or body:
      return self.error('Error: %s %s; %s' % (code, e, body),
                        status=code, mail=True)
    else:
      # not an HTTP-style error we can report back; let it propagate
      raise
def add_original_post_urls(self, post, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post: ActivityStreams post object to get original post URLs from
    obj: ActivityStreams post object to add original post URLs to
    prop: string property name in obj to add the original post URLs to
  """
  original_post_discovery.discover(self.source, post, fetch_hfeed=False)
  # only 'article' tags with a URL are candidate original post links
  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  # normalize obj[prop] to a list so we can append to it below
  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]
  if upstreams:
    # upstreamDuplicates take precedence: they become the original post URLs
    # and the post's own article tags are demoted to mentions.
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  tags = obj.get('tags', [])
  for url_list in obj[prop], tags:
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        # appending while iterating is deliberate: the loop also visits
        # the appended entry, but it's already in `seen`.
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  # if the http version of a link is in upstreams but the https one is just a
  # mention, or vice versa, promote them both to upstream.
  # https://github.com/snarfed/bridgy/issues/290
  #
  # TODO: for links that came from resolving redirects above, this doesn't
  # also catch the initial pre-redirect link. ah well.
  prop_schemeful = set(tag['url'] for tag in obj[prop] if tag.get('url'))
  prop_schemeless = set(util.schemeless(url) for url in prop_schemeful)

  # iterate over a copy because we remove matching entries from tags
  for url_obj in copy.copy(tags):
    url = url_obj.get('url', '')
    schemeless = util.schemeless(url)
    if schemeless in prop_schemeless and url not in prop_schemeful:
      obj[prop].append(url_obj)
      tags.remove(url_obj)
      prop_schemeful.add(url)

  logging.info('After original post discovery, urls are: %s', seen)