Beispiel #1
0
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logging.warning("Couldn't fetch %s, preserving path in %s",
                        root, final, exc_info=True)

    return final
Beispiel #2
0
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logging.warning("Couldn't fetch %s, preserving path in %s",
                        root, final, exc_info=True)

    return final
Beispiel #3
0
    def _find_source(self, source_cls, url, domain):
        """Returns the source that should publish a post URL, or None if not found.

    Args:
      source_cls: :class:`models.Source` subclass for this silo
      url: string
      domain: string, url's domain

    Returns: :class:`models.Source`
    """
        domain = domain.lower()
        sources = source_cls.query().filter(
            source_cls.domains == domain).fetch(100)
        if not sources:
            self.error(
                "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again."
                % {
                    'type': source_cls.GR_CLASS.NAME,
                    'domain': domain
                })
            return

        current_url = ''
        sources_ready = []
        best_match = None
        for source in sources:
            logging.info('Source: %s , features %s, status %s, poll status %s',
                         source.bridgy_url(self), source.features,
                         source.status, source.poll_status)
            if source.status != 'disabled' and 'publish' in source.features:
                # use a source that has a domain_url matching the url provided,
                # including path. find the source with the closest match.
                sources_ready.append(source)
                schemeless_url = util.schemeless(url.lower()).strip('/')
                for domain_url in source.domain_urls:
                    schemeless_domain_url = util.schemeless(
                        domain_url.lower()).strip('/')
                    if (schemeless_url.startswith(schemeless_domain_url)
                            and len(domain_url) > len(current_url)):
                        current_url = domain_url
                        best_match = source

        if best_match:
            return best_match
        elif sources_ready:
            self.error(
                'No account found that matches %s. Check that <a href="%s/about#profile-link">the web site URL is in your silo profile</a>, then <a href="%s/">sign up again</a>.'
                % (self.request.host_url, util.pretty_link(url),
                   self.request.host_url))
        else:
            self.error(
                'Publish is not enabled for your account. <a href="%s/">Try signing up!</a>'
                % self.request.host_url)
Beispiel #4
0
  def test_schemeless(self):
    for expected, url in (
        ('', ''),
        ('/path', '/path'),
        ('//foo', '//foo'),
        ('//foo', 'http://foo'),
        ('//foo.bar/baz', 'http://foo.bar/baz'),
        ('//foo.bar/baz', 'https://foo.bar/baz'),
      ):
      self.assertEqual(expected, util.schemeless(url))

    self.assertEqual('foo', util.schemeless('http://foo/', slashes=False))
    self.assertEqual('foo/bar', util.schemeless('http://foo/bar/', slashes=False))
Beispiel #5
0
    def _urls_and_domains(self, auth_entity, user_url):
        """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
        actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
        logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

        candidates = util.trim_nulls(
            util.uniquify([user_url] + microformats2.object_urls(actor)))

        if len(candidates) > MAX_AUTHOR_URLS:
            logging.info(
                'Too many profile links! Only resolving the first %s: %s',
                MAX_AUTHOR_URLS, candidates)

        urls = []
        for i, url in enumerate(candidates):
            final, domain, ok = util.get_webmention_target(
                url, resolve=i < MAX_AUTHOR_URLS)
            if ok:
                final = final.lower()
                if util.schemeless(final).startswith(
                        util.schemeless(url.lower())):
                    # redirected to a deeper path. use the original higher level URL. #652
                    final = url
                # If final has a path segment check if root has a matching rel=me.
                match = re.match(r'^(https?://[^/]+)/.+', final)
                if match and i < MAX_AUTHOR_URLS:
                    root = match.group(1)
                    resp = util.requests_get(root)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, root)
                    me_urls = data.get('rels', {}).get('me', [])
                    if final in me_urls:
                        final = root
                urls.append(final)

        urls = util.dedupe_urls(urls)  # normalizes domains to lower case
        domains = [util.domain_from_link(url) for url in urls]
        return urls, domains
Beispiel #6
0
    def _find_source(self, source_cls, url, domain):
        """Returns the source that should publish a post URL, or None if not found.

    Args:
      source_cls: :class:`models.Source` subclass for this silo
      url: string
      domain: string, url's domain

    Returns: :class:`models.Source`
    """
        domain = domain.lower()
        if util.domain_or_parent_in(domain, util.DOMAINS):
            return self.error(
                f'Source URL should be on your own site, not {domain}')

        sources = source_cls.query().filter(
            source_cls.domains == domain).fetch(100)
        if not sources:
            msg = f'Could not find <b>{source_cls.GR_CLASS.NAME}</b> account for <b>{domain}</b>. Check that your {source_cls.GR_CLASS.NAME} profile has {domain} in its <em>web site</em> or <em>link</em> field, then try signing up again.'
            return self.error(msg, html=msg)

        current_url = ''
        sources_ready = []
        best_match = None
        for source in sources:
            logging.info(
                f'Source: {source.bridgy_url()} , features {source.features}, status {source.status}, poll status {source.poll_status}'
            )
            if source.status != 'disabled' and 'publish' in source.features:
                # use a source that has a domain_url matching the url provided,
                # including path. find the source with the closest match.
                sources_ready.append(source)
                schemeless_url = util.schemeless(url.lower()).strip('/')
                for domain_url in source.domain_urls:
                    schemeless_domain_url = util.schemeless(
                        domain_url.lower()).strip('/')
                    if (schemeless_url.startswith(schemeless_domain_url)
                            and len(domain_url) > len(current_url)):
                        current_url = domain_url
                        best_match = source

        if best_match:
            return best_match

        if sources_ready:
            msg = f'No account found that matches {util.pretty_link(url)}. Check that <a href="{util.host_url("/about#profile-link")}">the web site URL is in your silo profile</a>, then <a href="{request.host_url}">sign up again</a>.'
        else:
            msg = f'Publish is not enabled for your account. <a href="{request.host_url}">Try signing up!</a>'
        self.error(msg, html=msg)
Beispiel #7
0
    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    Twitter search supports OR:
    https://dev.twitter.com/rest/public/search

    ...but it only returns complete(ish) results if we strip scheme from URLs,
    ie search for example.com instead of http://example.com/, and that also
    returns false positivies, so we check that the returned tweets actually have
    matching links. https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
        urls = set(
            util.schemeless(util.fragmentless(url), slashes=False)
            for url in self.domain_urls
            if not util.in_webmention_blacklist(util.domain_from_link(url)))
        if not urls:
            return []

        query = ' OR '.join(urls)
        candidates = self.get_activities(search_query=query,
                                         group_id=gr_source.SEARCH,
                                         etag=self.last_activities_etag,
                                         fetch_replies=False,
                                         fetch_likes=False,
                                         fetch_shares=False,
                                         count=50)

        # filter out retweets and search false positives that don't actually link to us
        results = []
        for candidate in candidates:
            if candidate.get('verb') == 'share':
                continue
            obj = candidate['object']
            tags = obj.get('tags', [])
            atts = obj.get('attachments', [])
            for url in urls:
                if (any(
                        util.schemeless(t.get('url', ''),
                                        slashes=False).startswith(url)
                        for t in tags + atts)):
                    results.append(candidate)
                    break

        return results
Beispiel #8
0
  def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    candidates = util.trim_nulls(util.uniquify(
        [user_url] + microformats2.object_urls(actor)))

    if len(candidates) > MAX_AUTHOR_URLS:
      logging.info('Too many profile links! Only resolving the first %s: %s',
                   MAX_AUTHOR_URLS, candidates)

    urls = []
    for i, url in enumerate(candidates):
      final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
      if ok:
        final = final.lower()
        if util.schemeless(final).startswith(util.schemeless(url.lower())):
          # redirected to a deeper path. use the original higher level URL. #652
          final = url
        # If final has a path segment check if root has a matching rel=me.
        match = re.match(r'^(https?://[^/]+)/.+', final)
        if match and i < MAX_AUTHOR_URLS:
          root = match.group(1)
          resp = util.requests_get(root)
          resp.raise_for_status()
          data = util.mf2py_parse(resp.text, root)
          me_urls = data.get('rels', {}).get('me', [])
          if final in me_urls:
            final = root
        urls.append(final)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(url) for url in urls]
    return urls, domains
Beispiel #9
0
  def _find_source(self, source_cls, url, domain):
    """Returns the source that should publish a post URL, or None if not found.

    Args:
      source_cls: :class:`models.Source` subclass for this silo
      url: string
      domain: string, url's domain

    Returns: :class:`models.Source`
    """
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})
      return

    current_url = ''
    sources_ready = []
    best_match = None
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        # use a source that has a domain_url matching the url provided,
        # including path. find the source with the closest match.
        sources_ready.append(source)
        schemeless_url = util.schemeless(url.lower()).strip('/')
        for domain_url in source.domain_urls:
          schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
          if (schemeless_url.startswith(schemeless_domain_url) and
              len(domain_url) > len(current_url)):
            current_url = domain_url
            best_match = source

    if best_match:
      return best_match
    elif sources_ready:
      self.error(
        'No account found that matches %s. Check that <a href="/about#profile-link">the web site URL is in your silo profile</a>, then <a href="/">sign up again</a>.' %
        util.pretty_link(url))
    else:
      self.error('Publish is not enabled for your account. <a href="/">Try signing up!</a>')
Beispiel #10
0
 def test_schemeless(self):
   for expected, url in (
       ('', util.schemeless('')),
       ('/path', util.schemeless('/path')),
       ('//foo', util.schemeless('//foo')),
       ('//foo', util.schemeless('http://foo')),
       ('//foo.bar/baz', util.schemeless('http://foo.bar/baz')),
       ('//foo.bar/baz', util.schemeless('https://foo.bar/baz'))):
     self.assertEqual(expected, util.schemeless(url))
Beispiel #11
0
  def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    Returns:
      sequence of ActivityStreams activity dicts
    """
    urls = {util.schemeless(util.fragmentless(url), slashes=False)
            for url in self.domain_urls
            if not util.in_webmention_blocklist(util.domain_from_link(url))}
    if not urls:
      return []

    # Search syntax: https://www.reddit.com/wiki/search
    url_query = ' OR '.join(f'site:"{u}" OR selftext:"{u}"' for u in urls)
    return self.get_activities(
      search_query=url_query, group_id=gr_source.SEARCH, etag=self.last_activities_etag,
      fetch_replies=False, fetch_likes=False, fetch_shares=False, count=50)
Beispiel #12
0
    def search_for_links(self):
        """Searches for activities with links to any of this source's web sites.

    Returns:
      sequence of ActivityStreams activity dicts
    """
        urls = set(
            util.schemeless(util.fragmentless(url), slashes=False)
            for url in self.domain_urls
            if not util.in_webmention_blocklist(util.domain_from_link(url)))
        if not urls:
            return []

        url_query = ' OR '.join([f'"{u}"' for u in urls])
        return self.get_activities(search_query=url_query,
                                   group_id=gr_source.SEARCH,
                                   etag=self.last_activities_etag,
                                   fetch_replies=True,
                                   fetch_likes=False,
                                   fetch_shares=False,
                                   count=50)
Beispiel #13
0
  def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    Twitter search supports OR:
    https://dev.twitter.com/rest/public/search

    ...but it only returns complete(ish) results if we strip scheme from URLs,
    ie search for example.com instead of http://example.com/, and that also
    returns false positivies, so we check that the returned tweets actually have
    matching links. https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
    urls = set(util.fragmentless(url) for url in self.domain_urls
               if not util.in_webmention_blacklist(util.domain_from_link(url)))
    if not urls:
      return []

    query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False) for url in urls)
    candidates = self.get_activities(
      search_query=query, group_id=gr_source.SEARCH, etag=self.last_activities_etag,
      fetch_replies=False, fetch_likes=False, fetch_shares=False, count=50)

    # filter out retweets and search false positives that don't actually link to us
    results = []
    for candidate in candidates:
      if candidate.get('verb') == 'share':
        continue
      obj = candidate['object']
      tags = obj.get('tags', [])
      atts = obj.get('attachments', [])
      for url in urls:
        if (url in obj.get('content', '') or
            any(t.get('url', '').startswith(url) for t in tags + atts)):
          id = candidate['id']
          results.append(candidate)
          break

    return results
Beispiel #14
0
  def _run(self):
    """Returns CreationResult on success, None otherwise."""
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

    current_url = ''
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        # use a source that has a domain_url matching the url provided.
        # look through each source to find the one with the closest match.
        schemeless_url = util.schemeless(url.lower()).strip('/')
        for domain_url in source.domain_urls:
          schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
          if (schemeless_url.startswith(schemeless_domain_url) and
              len(domain_url) > len(current_url)):
            self.source = source
            current_url = domain_url

    if not self.source:
      return self.error(
        'Publish is not enabled for your account. Please visit https://brid.gy and sign up!')

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      for source_url in url, self.source_url():
        parts = urlparse.urlparse(source_url)
        if (parts.netloc == domain_url_parts.netloc and
            parts.path.strip('/') == domain_url_parts.path.strip('/') and
            not parts.query):
          return self.error(
            "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        types = types.union(item_types)
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException, e:
        code, body = util.interpret_http_exception(e)
        if not code:
          raise
        msg = 'Error from %s API or your site: %s %s' % (
          self.source.GR_CLASS.NAME, body or '', e)
        return self.error(msg, status=code, mail=code not in ('502', '503', '504'))
Beispiel #15
0
  def _run(self):
    """Returns CreationResult on success, None otherwise."""
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

    current_url = ''
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        # use a source that has a domain_url matching the url provided.
        # look through each source to find the one with the closest match.
        schemeless_url = util.schemeless(url.lower()).strip('/')
        for domain_url in source.domain_urls:
          schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
          if (schemeless_url.startswith(schemeless_domain_url) and
              len(domain_url) > len(current_url)):
            self.source = source
            current_url = domain_url

    if not self.source:
      return self.error(
        'Publish is not enabled for your account. Please visit https://brid.gy and sign up!')

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      source_url_parts = urlparse.urlparse(self.source_url())
      if (source_url_parts.netloc == domain_url_parts.netloc and
          source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not source_url_parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException, e:
        code, body = util.interpret_http_exception(e)
        mail = True
        if (not code or code == 500) and util.is_connection_failure(e):
          code = 502
          mail=False
        msg = '%s API error: %s %s' % (self.source.GR_CLASS.NAME, body or '', e)
        return self.error(msg, status=code or 500, mail=mail)
Beispiel #16
0
  def add_original_post_urls(self, post, obj, prop):
    """Extracts original post URLs and adds them to an object, in place.

    If the post object has upstreamDuplicates, *only* they are considered
    original post URLs and added as tags with objectType 'article', and the
    post's own links and 'article' tags are added with objectType 'mention'.

    Args:
      post: ActivityStreams post object to get original post URLs from
      obj: ActivityStreams post object to add original post URLs to
      prop: string property name in obj to add the original post URLs to
    """
    original_post_discovery.discover(self.source, post, fetch_hfeed=False)
    tags = [tag for tag in post['object'].get('tags', [])
            if 'url' in tag and tag['objectType'] == 'article']
    upstreams = post['object'].get('upstreamDuplicates', [])

    if not isinstance(obj.setdefault(prop, []), list):
      obj[prop] = [obj[prop]]
    if upstreams:
      obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
      obj.setdefault('tags', []).extend(
        [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
    else:
      obj[prop] += tags

    # check for redirects, and if there are any follow them and add final urls
    # in addition to the initial urls.
    seen = set()
    tags = obj.get('tags', [])
    for url_list in obj[prop], tags:
      for url_obj in url_list:
        url = util.clean_webmention_url(url_obj.get('url', ''))
        if not url or url in seen:
          continue
        seen.add(url)
        # when debugging locally, replace my (snarfed.org) URLs with localhost
        url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
        resolved, _, send = util.get_webmention_target(url)
        if send and resolved != url and resolved not in seen:
          seen.add(resolved)
          url_list.append({'url': resolved, 'objectType': url_obj.get('objectType')})

    # if the http version of a link is in upstreams but the https one is just a
    # mention, or vice versa, promote them both to upstream.
    # https://github.com/snarfed/bridgy/issues/290
    #
    # TODO: for links that came from resolving redirects above, this doesn't
    # also catch the initial pre-redirect link. ah well.
    prop_schemeful = set(tag['url'] for tag in obj[prop] if tag.get('url'))
    prop_schemeless = set(util.schemeless(url) for url in prop_schemeful)

    for url_obj in copy.copy(tags):
      url = url_obj.get('url', '')
      schemeless = util.schemeless(url)
      if schemeless in prop_schemeless and url not in prop_schemeful:
        obj[prop].append(url_obj)
        tags.remove(url_obj)
        prop_schemeful.add(url)

    logging.info('After original post discovery, urls are: %s', seen)