Beispiel #1
0
 def test_get_webmention_target_blacklisted_urls(self):
   """Blacklisted URLs must not be sendable; an ordinary URL must be.

   Only the third element of the tuple returned by
   util.get_webmention_target() is checked, once with resolve=True and once
   with resolve=False.
   """
   blacklisted = (
     'http://facebook.com/x',
     'https://www.facebook.com/y',
     'http://sub.dom.ain.facebook.com/z',
   )
   for resolve in (True, False):
     good = util.get_webmention_target('http://good.com/a', resolve=resolve)
     self.assertTrue(good[2])
     for bad_url in blacklisted:
       result = util.get_webmention_target(bad_url, resolve=resolve)
       # pass the URL as the failure message so a failing case is identifiable
       self.assertFalse(result[2], bad_url)
Beispiel #2
0
 def test_get_webmention_target_blocklisted_urls(self):
   """Blocklisted URLs must not be sendable; an ordinary URL must be.

   Only the third element of the tuple returned by
   util.get_webmention_target() is checked, once with resolve=True and once
   with resolve=False.
   """
   blocklisted = (
     'http://facebook.com/x',
     'https://www.facebook.com/y',
     'http://sub.dom.ain.facebook.com/z',
   )
   for resolve in (True, False):
     allowed = util.get_webmention_target('http://good.com/a', resolve=resolve)
     self.assertTrue(allowed[2])
     for blocked_url in blocklisted:
       outcome = util.get_webmention_target(blocked_url, resolve=resolve)
       # pass the URL as the failure message so a failing case is identifiable
       self.assertFalse(outcome[2], blocked_url)
Beispiel #3
0
def get_webmention_targets(source, activity):
  """Collects the target URLs that webmentions should be attempted for.

  Side effects: runs original post discovery on the activity, which adds the
  discovered URLs to it as tags, and then rewrites the 'url' of every
  'article' tag in place with the value returned by
  util.get_webmention_target().

  Args:
   source: models.Source subclass
   activity: activity dict

  Returns:
   set of string URLs, taken from the object's 'article' tags and its
   'upstreamDuplicates', for which util.get_webmention_target() said to send
  """
  original_post_discovery.discover(source, activity)

  # the tags live on the inner object if there is one, else on the activity
  obj = activity.get('object') or activity
  targets = set()

  for tag in obj.get('tags', []):
    tag_url = tag.get('url')
    if not tag_url or tag.get('objectType') != 'article':
      continue
    resolved, _, send = util.get_webmention_target(tag_url)
    # the tag is updated even when we end up not sending to it
    tag['url'] = resolved
    if send:
      targets.add(resolved)

  for upstream in obj.get('upstreamDuplicates', []):
    resolved, _, send = util.get_webmention_target(upstream)
    if send:
      targets.add(resolved)

  return targets
Beispiel #4
0
  def test_get_webmention_target_blacklisted_urls(self):
    """Blacklisted URLs must not be sendable; an ordinary URL must be.

    Only the third element of the tuple returned by
    util.get_webmention_target() is checked.
    """
    # short alias; previously assigned but never used
    gwt = util.get_webmention_target
    for bad in ('http://facebook.com/x', 'https://www.facebook.com/y',
                'http://sub.dom.ain.facebook.com/z'):
      self.assertFalse(gwt(bad)[2], bad)

    self.assertTrue(gwt('http://good.com/a')[2])
Beispiel #5
0
  def test_get_webmention_cleans_redirected_urls(self):
    """The utm_source query param is stripped from a redirect destination.

    With resolve=True the redirect is followed and the final URL is returned
    without its query; with resolve=False the original URL comes back as is.
    """
    self.expect_requests_head('http://foo/bar',
                              redirected_url='http://final?utm_source=x')
    self.mox.ReplayAll()

    followed = util.get_webmention_target('http://foo/bar', resolve=True)
    self.assert_equals(('http://final', 'final', True), followed)

    untouched = util.get_webmention_target('http://foo/bar', resolve=False)
    self.assert_equals(('http://foo/bar', 'foo', True), untouched)
Beispiel #6
0
  def test_get_webmention_cleans_redirected_urls(self):
    """The utm_source query param is stripped from a redirect destination.

    With resolve=True the redirect is followed and the final URL is returned
    without its query; with resolve=False the original URL comes back as is.
    """
    source_url = 'http://foo/bar'
    self.expect_requests_head(source_url,
                              redirected_url='http://final?utm_source=x')
    self.mox.ReplayAll()

    with_resolve = util.get_webmention_target(source_url, resolve=True)
    self.assert_equals(('http://final', 'final', True), with_resolve)

    without_resolve = util.get_webmention_target(source_url, resolve=False)
    self.assert_equals(('http://foo/bar', 'foo', True), without_resolve)
Beispiel #7
0
 def test_get_webmention_second_redirect_not_text_html(self):
   """A redirect chain ending in a PDF resolves to the end URL, not sendable."""
   redirect_chain = ['http://middle', 'https://end']
   self.expect_requests_head('http://orig',
                             redirected_url=redirect_chain,
                             content_type='application/pdf')
   self.mox.ReplayAll()

   actual = util.get_webmention_target('http://orig', resolve=True)
   self.assert_equals(('https://end', 'end', False), actual)
Beispiel #8
0
 def test_get_webmention_target_too_big(self):
   """A Content-Length over util.MAX_HTTP_RESPONSE_SIZE makes it not sendable."""
   oversized = str(util.MAX_HTTP_RESPONSE_SIZE + 1)
   self.expect_requests_head('http://orig',
                             response_headers={'Content-Length': oversized})
   self.mox.ReplayAll()

   actual = util.get_webmention_target('http://orig')
   self.assert_equals(('http://orig', 'orig', False), actual)
Beispiel #9
0
 def test_get_webmention_second_redirect_not_text_html(self):
   """A redirect chain ending in a PDF resolves to the end URL, not sendable."""
   hops = ['http://middle', 'https://end']
   self.expect_requests_head('http://orig', redirected_url=hops,
                             content_type='application/pdf')
   self.mox.ReplayAll()

   expected = ('https://end', 'end', False)
   self.assert_equals(expected,
                      util.get_webmention_target('http://orig', resolve=True))
Beispiel #10
0
 def test_get_webmention_target_too_big(self):
   """A Content-Length over util.MAX_HTTP_RESPONSE_SIZE makes it not sendable."""
   headers = {'Content-Length': str(util.MAX_HTTP_RESPONSE_SIZE + 1)}
   self.expect_requests_head('http://orig', response_headers=headers)
   self.mox.ReplayAll()

   expected = ('http://orig', 'orig', False)
   self.assert_equals(expected, util.get_webmention_target('http://orig'))
Beispiel #11
0
  def resolve_profile_url(url, resolve=True):
    """Normalizes a profile URL before it is added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None if util.get_webmention_target()
      rejects the URL
    """
    resolved, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    resolved = resolved.lower()
    if util.schemeless(resolved).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      resolved = url

    # Only URLs with a path segment are candidates for collapsing to the root,
    # and only when we're allowed to make HTTP requests.
    with_path = re.match(r'^(https?://[^/]+)/.+', resolved)
    if not (with_path and resolve):
      return resolved

    root = with_path.group(1)
    try:
      resp = util.requests_get(root)
      resp.raise_for_status()
      rels = util.mf2py_parse(resp.text, root).get('rels', {})
      # if the root page has a rel=me link to this URL, prefer the root
      if resolved in rels.get('me', []):
        return root
    except requests.RequestException:
      logging.warning("Couldn't fetch %s, preserving path in %s",
                      root, resolved, exc_info=True)

    return resolved
Beispiel #12
0
  def resolve_profile_url(url, resolve=True):
    """Normalizes a profile URL before it is added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None if util.get_webmention_target()
      rejects the URL
    """
    candidate, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    candidate = candidate.lower()
    original_prefix = util.schemeless(url.lower())
    if util.schemeless(candidate).startswith(original_prefix):
      # redirected to a deeper path. use the original higher level URL. #652
      candidate = url

    # Only URLs with a path segment are candidates for collapsing to the root,
    # and only when we're allowed to make HTTP requests.
    path_match = re.match(r'^(https?://[^/]+)/.+', candidate)
    if not path_match or not resolve:
      return candidate

    root = path_match.group(1)
    try:
      resp = util.requests_get(root)
      resp.raise_for_status()
      parsed = util.mf2py_parse(resp.text, root)
      # if the root page has a rel=me link to this URL, prefer the root
      if candidate in parsed.get('rels', {}).get('me', []):
        candidate = root
    except requests.RequestException:
      logging.warning("Couldn't fetch %s, preserving path in %s",
                      root, candidate, exc_info=True)

    return candidate
Beispiel #13
0
    def _urls_and_domains(self, auth_entity, user_url):
        """Collects this user's sendable profile URLs and their domains.

        The auth entity's user_json is converted to an ActivityStreams actor
        with self.gr_source.user_to_actor(); user_url plus the actor's URLs
        are the candidates. Only the first MAX_AUTHOR_URLS candidates are
        resolved; the rest are checked with resolve=False. Candidates that
        util.get_webmention_target() flags as not sendable are dropped. May be
        overridden by subclasses.

        Args:
          auth_entity: :class:`oauth_dropins.models.BaseAuth`
          user_url: string, optional URL passed in when authorizing

        Returns:
          ([string url, ...], [string domain, ...])
        """
        actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
        logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

        all_urls = [user_url] + microformats2.object_urls(actor)
        candidates = util.trim_nulls(util.uniquify(all_urls))

        if len(candidates) > MAX_AUTHOR_URLS:
            logging.warning(
                'Too many profile links! Only resolving the first %s: %s',
                MAX_AUTHOR_URLS, candidates)

        urls = []
        for index, candidate in enumerate(candidates):
            should_resolve = index < MAX_AUTHOR_URLS
            resolved, _, send = util.get_webmention_target(
                candidate, resolve=should_resolve)
            if send:
                urls.append(resolved)

        urls = util.dedupe_urls(urls)  # normalizes domains to lower case
        domains = [util.domain_from_link(deduped) for deduped in urls]
        return urls, domains
Beispiel #14
0
  def _urls_and_domains(self, auth_entity, user_url):
    """Collects this user's sendable profile URLs and their domains.

    The auth entity's user_json is converted to an ActivityStreams actor with
    self.gr_source.user_to_actor(); user_url plus the actor's URLs are the
    candidates. Only the first MAX_AUTHOR_URLS candidates are resolved; the
    rest are checked with resolve=False. Candidates that
    util.get_webmention_target() flags as not sendable are dropped. May be
    overridden by subclasses.

    Args:
      auth_entity: oauth_dropins.models.BaseAuth
      user_url: string, optional URL passed in when authorizing

    Returns: ([string url, ...], [string domain, ...])
    """
    user = json.loads(auth_entity.user_json)
    actor = self.gr_source.user_to_actor(user)
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    all_urls = [user_url] + microformats2.object_urls(actor)
    candidates = util.trim_nulls(util.uniquify(all_urls))

    if len(candidates) > MAX_AUTHOR_URLS:
      logging.warning('Too many profile links! Only resolving the first %s: %s',
                      MAX_AUTHOR_URLS, candidates)

    urls = []
    for position, candidate in enumerate(candidates):
      resolved, _, send = util.get_webmention_target(
        candidate, resolve=position < MAX_AUTHOR_URLS)
      if not send:
        continue
      urls.append(resolved)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(deduped) for deduped in urls]
    return urls, domains
Beispiel #15
0
  def _url_and_domain(self, auth_entity):
    """Picks this source's URL and domain from the auth entity.

    Candidates are the actor's 'url' followed by the 'value' of each entry in
    the user_json 'urls' list. Each candidate may hold several
    whitespace-separated URLs. The first one util.get_webmention_target()
    accepts wins. May be overridden by subclasses.

    Args:
      auth_entity: oauth_dropins.models.BaseAuth

    Returns: (string url, string domain, boolean ok) tuple. If no candidate is
      accepted, ok is False and url/domain are those of the first rejected
      candidate (or None if there were no candidates).
    """
    user_json = json.loads(auth_entity.user_json)
    actor = self.as_source.user_to_actor(user_json)
    # also look at G+'s urls field
    extra_urls = [u.get('value') for u in user_json.get('urls', [])]
    candidates = util.trim_nulls([actor.get('url')] + extra_urls)

    fallback_url = fallback_domain = None
    for candidate in candidates:
      # TODO: fully support multiple urls
      for piece in candidate.split():
        resolved, domain, ok = util.get_webmention_target(piece)
        if ok:
          return resolved, domain.lower(), True
        if not fallback_url:
          # remember the first rejected URL so the caller has something to show
          fallback_url, fallback_domain = resolved, domain

    return fallback_url, fallback_domain, False
Beispiel #16
0
    def test_get_webmention_middle_redirect_blacklisted(self):
        """A blacklisted domain mid-redirect must not block the final target.

        ...e.g. Google's redirector https://www.google.com/url?...
        """
        hops = ["https://www.google.com/url?xyz", "https://end"]
        self.expect_requests_head("http://orig", redirected_url=hops)
        self.mox.ReplayAll()

        target = util.get_webmention_target("http://orig", resolve=True)
        self.assert_equals(("https://end", "end", True), target)
 def resolve(urls):
   """Returns the set of sendable target URLs for the given URLs.

   Each URL is passed through util.get_webmention_target(); URLs it rejects
   are dropped. When include_redirect_sources (a name from the enclosing
   scope, not visible here) is truthy, the pre-resolution URL is kept too.
   """
   targets = set()
   for original in urls:
     final, _, send = util.get_webmention_target(original)
     if not send:
       continue
     targets.add(final)
     if include_redirect_sources:
       targets.add(original)
   return targets
 def resolve(urls):
     """Returns the set of sendable target URLs for the given URLs.

     Each URL is passed through util.get_webmention_target(); URLs it rejects
     are dropped. When include_redirect_sources (a name from the enclosing
     scope, not visible here) is truthy, the pre-resolution URL is kept too.
     """
     kept = set()
     for candidate in urls:
         final, _, send = util.get_webmention_target(candidate)
         if not send:
             continue
         kept.add(final)
         if include_redirect_sources:
             kept.add(candidate)
     return kept
Beispiel #19
0
 def resolve(urls):
     """Returns the set of sendable, non-silo target URLs for the given URLs.

     Each URL is passed through util.get_webmention_target(); URLs it rejects,
     or whose domain equals source.gr_source.DOMAIN, are dropped. When
     include_redirect_sources is truthy, the pre-resolution URL is kept too.
     Both `source` and `include_redirect_sources` come from the enclosing
     scope, which is not visible here.
     """
     kept = set()
     for candidate in urls:
         final, domain, send = util.get_webmention_target(candidate)
         if not send or domain == source.gr_source.DOMAIN:
             continue
         kept.add(final)
         if include_redirect_sources:
             kept.add(candidate)
     return kept
Beispiel #20
0
  def test_get_webmention_middle_redirect_blocklisted(self):
    """A blocklisted domain mid-redirect must not block the final target.

    ...e.g. Google's redirector https://www.google.com/url?...
    """
    hops = ['https://www.google.com/url?xyz', 'https://end']
    self.expect_requests_head('http://orig', redirected_url=hops)
    self.mox.ReplayAll()

    target = util.get_webmention_target('http://orig', resolve=True)
    self.assert_equals(('https://end', 'end', True), target)
Beispiel #21
0
  def add_original_post_urls(self, post_id, obj, prop):
    """Extracts original post URLs and adds them to an object, in place.

    If the post object has upstreamDuplicates, *only* they are considered
    original post URLs and added as tags with objectType 'article', and the
    post's own links and 'article' tags are added with objectType 'mention'.

    Fetching the post is best effort: if it fails or the post isn't found, a
    warning is logged and obj is left untouched.

    Args:
      post_id: string post id
      obj: ActivityStreams post object
      prop: string property name in obj to add the original post URLs to
    """
    try:
      post = self.source.get_post(post_id)
    except Exception:
      # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
      # still propagate instead of being logged and swallowed.
      logging.warning('Error fetching source post %s', post_id, exc_info=True)
      return
    if not post:
      logging.warning('Source post %s not found', post_id)
      return

    original_post_discovery.discover(self.source, post, fetch_hfeed=False)
    # .get() so that a tag without an objectType is skipped, not a KeyError
    tags = [tag for tag in post['object'].get('tags', [])
            if 'url' in tag and tag.get('objectType') == 'article']
    upstreams = post['object'].get('upstreamDuplicates', [])

    # make sure obj[prop] is a list so we can append to it
    if not isinstance(obj.setdefault(prop, []), list):
      obj[prop] = [obj[prop]]
    if upstreams:
      obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
      obj.setdefault('tags', []).extend(
        [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
    else:
      obj[prop] += tags

    # check for redirects, and if there are any follow them and add final urls
    # in addition to the initial urls.
    #
    # NOTE: url_list is appended to while it is being iterated, so the added
    # entries are visited too. Their URLs are already in `seen`, so they are
    # skipped unless clean_webmention_url() changes them.
    seen = set()
    for url_list in obj[prop], obj.get('tags', []):
      for url_obj in url_list:
        url = util.clean_webmention_url(url_obj.get('url', ''))
        if not url or url in seen:
          continue
        seen.add(url)
        # when debugging locally, replace my (snarfed.org) URLs with localhost
        url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
        resolved, _, send = util.get_webmention_target(url)
        if send and resolved != url and resolved not in seen:
          seen.add(resolved)
          url_list.append({'url': resolved, 'objectType': url_obj.get('objectType')})

    logging.info('After original post discovery, urls are: %s', seen)
Beispiel #22
0
  def post(self):
    """Re-checks this entity's unsent URLs and sends webmentions to them.

    Does nothing beyond logging if the entity named by the 'key' request
    param can't be leased.
    """
    logging.debug('Params: %s', self.request.params)

    entity_key = ndb.Key(urlsafe=self.request.params['key'])
    if not self.lease(entity_key):
      return

    source_domains = self.entity.source.get().domains
    to_send = set()
    for unsent_url in self.entity.unsent:
      resolved, domain, ok = util.get_webmention_target(unsent_url)
      # skip "self" links to this blog's domain
      if ok and domain not in source_domains:
        to_send.add(resolved)

    self.entity.unsent = list(to_send)
    self.send_webmentions()
Beispiel #23
0
    def post(self):
        """Re-checks this entity's unsent URLs and sends webmentions to them.

        Does nothing beyond logging if the entity named by the 'key' request
        param can't be leased.
        """
        logging.debug('Params: %s', self.request.params)

        entity_key = ndb.Key(urlsafe=self.request.params['key'])
        if not self.lease(entity_key):
            return

        own_domains = self.entity.source.get().domains
        targets = set()
        for unsent_url in self.entity.unsent:
            resolved, domain, ok = util.get_webmention_target(unsent_url)
            # skip "self" links to this blog's domain
            if ok and domain not in own_domains:
                targets.add(resolved)

        self.entity.unsent = list(targets)
        self.send_webmentions()
Beispiel #24
0
    def _urls_and_domains(self, auth_entity, user_url):
        """Collects this user's sendable profile URLs and their domains.

        The auth entity's user_json is converted to an ActivityStreams actor
        with self.gr_source.user_to_actor(); user_url plus the actor's URLs
        are the candidates. Only the first MAX_AUTHOR_URLS candidates trigger
        HTTP requests (redirect resolution and the rel=me check below). May
        be overridden by subclasses.

        NOTE(review): the root page fetch is not wrapped in a try/except, so
        an HTTP error there propagates to the caller.

        Args:
          auth_entity: :class:`oauth_dropins.models.BaseAuth`
          user_url: string, optional URL passed in when authorizing

        Returns:
          ([string url, ...], [string domain, ...])
        """
        actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
        logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

        all_urls = [user_url] + microformats2.object_urls(actor)
        candidates = util.trim_nulls(util.uniquify(all_urls))

        if len(candidates) > MAX_AUTHOR_URLS:
            logging.info(
                'Too many profile links! Only resolving the first %s: %s',
                MAX_AUTHOR_URLS, candidates)

        urls = []
        for i, url in enumerate(candidates):
            may_fetch = i < MAX_AUTHOR_URLS
            final, _, ok = util.get_webmention_target(url, resolve=may_fetch)
            if not ok:
                continue

            final = final.lower()
            if util.schemeless(final).startswith(util.schemeless(url.lower())):
                # redirected to a deeper path. use the original higher level URL. #652
                final = url

            # If final has a path segment check if root has a matching rel=me.
            with_path = re.match(r'^(https?://[^/]+)/.+', final)
            if with_path and may_fetch:
                root = with_path.group(1)
                resp = util.requests_get(root)
                resp.raise_for_status()
                rels = util.mf2py_parse(resp.text, root).get('rels', {})
                if final in rels.get('me', []):
                    final = root

            urls.append(final)

        urls = util.dedupe_urls(urls)  # normalizes domains to lower case
        domains = [util.domain_from_link(deduped) for deduped in urls]
        return urls, domains
Beispiel #25
0
  def _urls_and_domains(self, auth_entity, user_url):
    """Collects this user's sendable profile URLs and their domains.

    The auth entity's user_json is converted to an ActivityStreams actor with
    self.gr_source.user_to_actor(); user_url plus the actor's URLs are the
    candidates. Only the first MAX_AUTHOR_URLS candidates trigger HTTP
    requests (redirect resolution and the rel=me check below). May be
    overridden by subclasses.

    NOTE(review): the root page fetch is not wrapped in a try/except, so an
    HTTP error there propagates to the caller.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    user = json.loads(auth_entity.user_json)
    actor = self.gr_source.user_to_actor(user)
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    all_urls = [user_url] + microformats2.object_urls(actor)
    candidates = util.trim_nulls(util.uniquify(all_urls))

    if len(candidates) > MAX_AUTHOR_URLS:
      logging.info('Too many profile links! Only resolving the first %s: %s',
                   MAX_AUTHOR_URLS, candidates)

    urls = []
    for position, candidate in enumerate(candidates):
      within_limit = position < MAX_AUTHOR_URLS
      final, _, ok = util.get_webmention_target(candidate, resolve=within_limit)
      if not ok:
        continue

      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(candidate.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = candidate

      # If final has a path segment check if root has a matching rel=me.
      path_match = re.match(r'^(https?://[^/]+)/.+', final)
      if path_match and within_limit:
        root = path_match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        parsed = util.mf2py_parse(resp.text, root)
        if final in parsed.get('rels', {}).get('me', []):
          final = root

      urls.append(final)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(deduped) for deduped in urls]
    return urls, domains
Beispiel #26
0
    def dispatch_request(self):
        """Re-checks this entity's unsent URLs and sends webmentions to them.

        The sending step is skipped if the entity named by the 'key' request
        value can't be leased.

        Returns:
          ('', ERROR_HTTP_RETURN_CODE) if g.failed is set and truthy,
          otherwise 'OK'
        """
        logger.debug(f'Params: {list(request.values.items())}')

        if self.lease(ndb.Key(urlsafe=request.values['key'])):
            checked = (util.get_webmention_target(unsent_url)
                       for unsent_url in self.entity.unsent)
            # skip "self" links to this blog's domain
            to_send = {resolved for resolved, domain, ok in checked
                       if ok and domain not in g.source.domains}

            self.entity.unsent = list(to_send)
            self.send_webmentions()

        if getattr(g, 'failed', None):
            return ('', ERROR_HTTP_RETURN_CODE)
        return 'OK'
Beispiel #27
0
    def do_send_webmentions(self):
        """Sends webmentions to every pending target of self.entity.

        The entity's unsent, error, and failed URLs are all re-checked with
        util.get_webmention_target() and retried. Each target ends up in one
        of the entity's sent, skipped, failed, or error lists. Webmention
        endpoint discovery results are cached in memcache.
        """
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info("Webmention from %s to %s", source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, a
            # WebmentionSend error dict if it failed (semi-)permanently, or None.
            cache_key = util.webmention_endpoint_cache_key(target)
            cached = memcache.get(cache_key)
            if cached:
                logging.info("Using cached webmention endpoint %r: %s", cache_key, cached)

            # send! and handle response or error
            error = None
            if isinstance(cached, dict):
                error = cached
            else:
                mention = send.WebmentionSend(source_url, target, endpoint=cached)
                logging.info("Sending...")
                try:
                    if not mention.send(timeout=999, headers=util.REQUEST_HEADERS):
                        error = mention.error
                # "as" form: valid on Python 2.6+ and 3, unlike the old
                # "except X, e" spelling, which is a SyntaxError on Python 3.
                except BaseException as e:
                    logging.warning("", exc_info=True)
                    # default to None so a missing attribute can't raise from
                    # inside this handler
                    error = getattr(mention, "error", None)
                    if not error:
                        error = (
                            {"code": "BAD_TARGET_URL", "http_status": 499}
                            if "DNS lookup failed for URL:" in str(e)
                            else {"code": "EXCEPTION"}
                        )

            error_code = error["code"] if error else None
            # only cache fresh discovery results, and never for a bad target URL
            if error_code != "BAD_TARGET_URL" and not cached:
                val = error if error_code == "NO_ENDPOINT" else mention.receiver_endpoint
                memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

            if error is None:
                logging.info("Sent! %s", mention.response)
                self.record_source_webmention(mention)
                self.entity.sent.append(target)
            else:
                status = error.get("http_status", 0)
                if error_code == "NO_ENDPOINT" or (error_code == "BAD_TARGET_URL" and status == 204):  # No Content
                    logging.info("Giving up this target. %s", error)
                    self.entity.skipped.append(target)
                elif status // 100 == 4:
                    # Give up on 4XX errors; we don't expect later retries to succeed.
                    logging.info("Giving up this target. %s", error)
                    self.entity.failed.append(target)
                else:
                    self.fail("Error sending to endpoint: %s" % error)
                    self.entity.error.append(target)

            if target in self.entity.unsent:
                self.entity.unsent.remove(target)
Beispiel #28
0
  def expand_target_urls(self, activity):
    """Adds syndication URLs to an activity's inReplyTo and object fields.

    For each target in those fields that has a non-home-page URL accepted by
    util.get_webmention_target(), fetches the page, collects its
    rel=syndication and u-syndication URLs, and appends them to the field as
    {'url': ...} dicts. A field that is a single dict becomes a list.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
      # microformats2.json_to_object de-dupes, no need to do it here
      targets = activity.get(field)
      if not targets:
        continue
      if isinstance(targets, dict):
        targets = [targets]

      expanded = list(targets)
      for target in targets:
        url = target.get('url')
        # skip targets without a URL, and home pages.
        # https://github.com/snarfed/bridgy/issues/760
        if not url or urllib.parse.urlparse(url).path in ('', '/'):
          continue

        # get_webmention_target weeds out silos and non-HTML targets
        # that we wouldn't want to download and parse
        url, _, ok = util.get_webmention_target(url)
        if not ok:
          continue

        logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
        try:
          mf2 = util.fetch_mf2(url)
        except AssertionError:
          raise  # for unit tests
        except BaseException:
          # it's not a big deal if we can't fetch an in-reply-to url
          logging.info('expand_target_urls could not fetch field=%s, url=%s',
                       field, url, stack_info=True)
          continue

        synd_urls = mf2['rels'].get('syndication', [])

        # walk every item, descending into h-feeds that aren't also h-entries,
        # and collect each item's syndication property
        pending = collections.deque(mf2.get('items', []))
        while pending:
          item = pending.popleft()
          item_types = set(item.get('type', []))
          if 'h-feed' in item_types and 'h-entry' not in item_types:
            pending.extend(item.get('children', []))
            continue

          # these can be urls or h-cites
          synd_urls += microformats2.get_string_urls(
            item.get('properties', {}).get('syndication', []))

        logging.debug('expand_target_urls found rel=syndication for url=%s: %r', url, synd_urls)
        expanded += [{'url': synd_url} for synd_url in synd_urls]

      activity[field] = expanded
Beispiel #29
0
  def _run(self):
    """Validates a publish request, fetches the source page, and publishes it.

    In order: validates the target URL, resolves and validates the source
    URL, looks up the source, rejects home pages, fetches the source page's
    mf2, then tries each publishable mf2 item (descending into children) until
    one is published or previewed.

    Returns:
      CreationResult on success. On failure, whatever self.error() or
      self.delete() returned, or None.
    """
    logging.info('Params: %s', list(self.request.params.items()))
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urllib.parse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in util.DOMAINS or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{flickr,github,mastodon,twitter}')
    elif source_cls == Instagram:
      return self.error('Sorry, %s is not supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    self.source = self._find_source(source_cls, url, domain)
    if not self.source:
      return  # _find_source rendered the error

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urllib.parse.urlparse(domain_url)
      for source_url in url, self.source_url():
        parts = urllib.parse.urlparse(source_url)
        if (parts.netloc == domain_url_parts.netloc and
            parts.path.strip('/') == domain_url_parts.path.strip('/') and
            not parts.query):
          return self.error(
            "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    self.entity = self.get_or_add_publish_entity(url)
    try:
      resp = self.fetch_mf2(url, raise_errors=True)
    except BaseException as e:
      status, body = util.interpret_http_exception(e)
      if status == '410':
        return self.delete(url)
      return self.error('Could not fetch source URL %s' % url)

    if not resp:
      return
    self.fetched, mf2 = resp

    # refuse to re-publish a page that's already been published for real
    # (previews and local development are exempt)
    if (self.entity.status == 'complete' and self.entity.type != 'preview' and
        not self.PREVIEW and not appengine_info.LOCAL):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't support updating existing posts. Details: https://github.com/snarfed/bridgy/issues/84",
                        extra_json={'original': self.entity.published})

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    shortlinks = mf2['rels'].get('shortlink')
    if shortlinks:
      self.shortlink = urllib.parse.urljoin(url, shortlinks[0])

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(mf2.get('items', []))
    while queue:
      item = queue.popleft()
      # default to [] so an item without a 'type' is skipped as unpublishable
      # instead of raising TypeError from set(None)
      item_types = set(item.get('type', []))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        types = types.union(item_types)
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if code in self.source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
          # the user deauthorized the bridgy app, or the token expired, so
          # disable this source.
          logging.warning('Disabling source due to: %s', e, stack_info=True)
          self.source.status = 'disabled'
          self.source.put()
          # util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
          #               body=body)
        if isinstance(e, (NotImplementedError, ValueError, urllib.error.URLError)):
          code = '400'
        elif not code:
          raise
        msg = 'Error: %s %s' % (body or '', e)
        return self.error(msg, status=code, report=code not in ('400', '404', '502', '503', '504'))

    if not self.entity.published:  # tried all the items
      types.discard('h-entry')
      types.discard('h-note')
      if types:
        msg = ("%s doesn't support type(s) %s, or no content was found." %
               (source_cls.GR_CLASS.NAME, ' + '.join(types)))
      else:
        msg = 'Could not find content in <a href="http://microformats.org/wiki/h-entry">h-entry</a> or any other element!'
      return self.error(msg, data=mf2)

    # write results to datastore, but don't overwrite a previous publish with a
    # preview.
    if not (self.PREVIEW and self.entity.type != 'preview'):
      self.entity.status = 'complete'
      self.entity.put()

    return result
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving new SyndicatedPosts to the DB.

  Syndication links are first looked for in the h-feed copy of the entry; the
  permalink page itself is only fetched when that yields nothing.

  Args:
    source: the source being processed; passed through to
      _process_syndication_urls and SyndicatedPost.insert_original_blank
      (presumably a models.Source subclass -- TODO confirm)
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink. Entries whose relationship has disappeared are
      deleted from the datastore and removed from this list in place.
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # without refetch there is nothing new to discover for a known permalink
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      # a failed fetch must not be mistaken for "syndication links removed"
      success = False

    if parsed:
      syndication_urls = set()
      # tolerate a parse result without a 'rels' key instead of crashing on
      # None.get(...)
      relsynd = parsed.get('rels', {}).get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    # materialize into a list: a bare itertools.chain iterator is consumed by
    # the first `in` test, which would make every later preexisting post look
    # like it had disappeared and get deleted.
    result_syndposts = list(itertools.chain.from_iterable(results.values()))
    # iterate over a copy since we remove from preexisting while looping
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.items():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
Beispiel #31
0
    def test_get_webmention_cleans_redirected_urls(self):
        """Check the target tuple with and without redirect resolution.

        With resolve=True the expected URL is the redirect destination minus
        its utm_source query parameter; with resolve=False the input URL is
        expected back unchanged.
        """
        start_url = "http://foo/bar"
        self.expect_requests_head(start_url,
                                  redirected_url="http://final?utm_source=x")
        self.mox.ReplayAll()

        resolved = util.get_webmention_target(start_url, resolve=True)
        self.assert_equals(("http://final", "final", True), resolved)

        unresolved = util.get_webmention_target(start_url, resolve=False)
        self.assert_equals(("http://foo/bar", "foo", True), unresolved)
Beispiel #32
0
    def expand_target_urls(self, activity):
        """Add syndication URLs of reply/object targets to an activity, in place.

        For each of the ``inReplyTo`` and ``object`` fields, every entry with a
        ``url`` that util.get_webmention_target accepts is fetched and parsed
        as microformats2. Each syndication URL found (rel=syndication links
        plus the ``syndication`` property of the parsed items) is appended to
        the field as a new ``{'url': ...}`` dict. A non-empty field is always
        written back as a list, even if it originally held a single dict.

        Args:
          activity: an ActivityStreams dict of the activity being published
        """
        for field in ('inReplyTo', 'object'):
            # microformats2.json_to_object de-dupes, no need to do it here
            targets = activity.get(field)
            if not targets:
                continue

            if isinstance(targets, dict):
                targets = [targets]

            expanded = list(targets)
            for target in targets:
                url = target.get('url')
                if not url:
                    continue

                # get_webmention_target weeds out silos and non-HTML targets
                # that we wouldn't want to download and parse
                url, _, ok = util.get_webmention_target(url)
                if not ok:
                    continue

                # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
                # easier to just grab this ourselves than add a bunch of
                # special-cases to that method
                logging.debug('expand_target_urls fetching field=%s, url=%s',
                              field, url)
                try:
                    resp = util.requests_get(url)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, url)
                except AssertionError:
                    raise  # for unit tests
                except BaseException:
                    # it's not a big deal if we can't fetch an in-reply-to url
                    logging.warning(
                        'expand_target_urls could not fetch field=%s, url=%s',
                        field, url, exc_info=True)
                    continue

                synd_urls = data.get('rels', {}).get('syndication', [])

                # breadth-first walk over the parsed items: pure h-feeds are
                # only descended into, everything else contributes its
                # syndication property
                pending = collections.deque(data.get('items', []))
                while pending:
                    item = pending.popleft()
                    types = set(item.get('type', []))
                    if 'h-entry' in types or 'h-feed' not in types:
                        # these can be urls or h-cites
                        props = item.get('properties', {})
                        synd_urls += microformats2.get_string_urls(
                            props.get('syndication', []))
                    else:
                        pending.extend(item.get('children', []))

                logging.debug(
                    'expand_target_urls found rel=syndication for url=%s: %r',
                    url, synd_urls)
                expanded.extend({'url': u} for u in synd_urls)

            activity[field] = expanded
Beispiel #33
0
 def test_get_webmention_target_not_http_https(self):
     """A chrome:// URL is expected back unchanged with a False send flag."""
     url = 'chrome://flags'
     expected = (url, 'flags', False)
     self.assert_equals(expected, util.get_webmention_target(url))
Beispiel #34
0
def _process_author(source, author_url, refetch=False, store_blanks=True):
    r"""Fetch the author's domain URL, and look for syndicated posts.

    The feed items found on the author page are merged with those of any
    rel=feed / alternate feeds it links to, sorted newest first, capped, and
    each h-entry permalink is then handed to process_entry.

    Side effects: may append newly seen feed domains to
    ``source.updates['domains']`` and sets
    ``source.updates['last_syndication_url']`` when results were found.

    Args:
      source: a subclass of :class:`models.Source`
      author_url: the author's homepage URL
      refetch: boolean, whether to refetch and process entries we've seen before
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost`\ s when we don't find a relationship

    Return:
      a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
    """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    logger.debug(f'fetching author url {author_url}')
    try:
        author_mf2 = util.fetch_mf2(author_url)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logger.info(f'Could not fetch author url {author_url}', exc_info=True)
        return {}

    feeditems = _find_feed_items(author_mf2)

    # try rel=feeds and rel=alternates
    feed_urls = set()
    candidates = (author_mf2['rels'].get('feed', []) + [
        a.get('url') for a in author_mf2.get('alternates', [])
        if a.get('type') == MF2_HTML_MIME_TYPE
    ])
    for feed_url in candidates:
        # check that it's html, not too big, etc
        feed_url, _, feed_ok = util.get_webmention_target(feed_url)
        if feed_url == author_url:
            logger.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logger.debug("skipping feed since it's not HTML or otherwise bad")
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logger.debug(f"fetching author's rel-feed {feed_url}")
            feed_mf2 = util.fetch_mf2(feed_url)
            feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logger.info(
                        f'rel-feed found new domain {domain}! adding to source'
                    )
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            # a broken feed is skipped; the remaining feeds are still tried
            logger.info(f'Could not fetch h-feed url {feed_url}.',
                        exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        # fall back to '' so items without either date still compare
        return props.get('updated') or props.get('published') or ''

    feeditems.sort(key=updated_or_published, reverse=True)

    # ordered so that the newest permalinks are processed first
    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logger.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, str):
                    permalink_to_entry[permalink] = child
                else:
                    logger.warning(
                        f'unexpected non-string "url" property: {permalink}')

        # named max_fetches rather than max to avoid shadowing the builtin
        max_fetches = (MAX_PERMALINK_FETCHES_BETA
                       if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max_fetches:
            logger.info(f'Hit cap of {max_fetches} permalinks. Stopping.')
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    # group the known SyndicatedPosts by their original permalink
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.items():
        logger.debug(f'processing permalink: {permalink}')
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.items():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
def _process_author(source, author_url, refetch_blanks=False):
  """Discover syndicated posts starting from an author's homepage.

  Fetches the author page, switches to the first usable rel=feed page if there
  is one, collects the permalinks of the h-entries found there and runs
  _process_entry on each of them. When anything was found,
  source.last_syndication_url is set to the current time.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch_blanks: boolean, if true, refetch SyndicatedPosts that have
      previously been marked as not having a rel=syndication link

  Return:
    a dict of syndicated_url to models.SyndicatedPost
  """
  # whether the url is a valid webmention target doubles, for now, as the
  # test for whether it is worth searching at all.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author domain %s', author_url)
    author_resp = requests.get(author_url, timeout=HTTP_TIMEOUT)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  author_dom = BeautifulSoup(author_resp.text)
  author_parsed = mf2py.Parser(url=author_url, doc=author_dom).to_dict()

  # look for canonical feed url (if it isn't this one) using
  # rel='feed', type='text/html'
  feed_nodes = author_dom.find_all('link', rel='feed')
  feed_nodes = feed_nodes + author_dom.find_all('a', rel='feed')
  for node in feed_nodes:
    href = node.get('href')
    if not href:
      continue

    feed_url = urlparse.urljoin(author_url, href)
    feed_type = node.get('type')
    if feed_type:
      feed_type_ok = feed_type == 'text/html'
    else:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      # the page we already parsed is the feed; stop looking
      logging.debug('author url is the feed url, proceeding')
      break
    if not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
      continue

    try:
      logging.debug("fetching author's h-feed %s", feed_url)
      feed_resp = requests.get(feed_url, timeout=HTTP_TIMEOUT)
      feed_resp.raise_for_status()
      logging.debug("author's h-feed fetched successfully %s", feed_url)
      author_parsed = mf2py.Parser(
        url=feed_url, doc=feed_resp.text).to_dict()
      break
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      # fall through to the next rel=feed candidate
      logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # use the children of the first h-feed, or the top-level items if none
  feeditems = author_parsed['items']
  hfeed = None
  for item in feeditems:
    if 'h-feed' in item['type']:
      hfeed = item
      break

  if hfeed:
    feeditems = hfeed.get('children', [])
  else:
    logging.info('No h-feed found, fallback to top-level h-entrys.')

  permalinks = set()
  for child in feeditems:
    if 'h-entry' not in child['type']:
      continue
    # TODO if this h-entry in the h-feed has u-syndication links, we
    # can just use it without fetching its permalink page
    # TODO maybe limit to first ~30 entries? (do that here rather than,
    # below because we want the *first* n entries)
    permalinks.update(child['properties'].get('url', []))

  # query all preexisting permalinks at once, instead of once per link
  preexisting = {}
  for known in SyndicatedPost.query_by_originals(source, permalinks):
    preexisting[known.original] = known

  results = {}
  for permalink in permalinks:
    logging.debug('processing permalink: %s', permalink)
    discovered = _process_entry(source, permalink, refetch_blanks, preexisting)
    results.update(discovered)

  if results:
    # remember when we last saw rel=syndication urls for this author; it
    # helps decide whether to refetch periodically and look for updates.
    # Source will be saved at the end of each round of polling
    now = now_fn()
    logging.debug('updating source.last_syndication_url %s', now)
    source.last_syndication_url = now

  return results
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch a post permalink and store the syndication relationships found.

  Syndication URLs are gathered from the page's rel=syndication links and from
  the syndication property of its h-entries. Each one is resolved through
  redirects and canonicalized by the source; those on the source's domain are
  saved as SyndicatedPosts. If none match, a blank SyndicatedPost is saved for
  the permalink.

  Args:
    source: provides canonicalize_syndication_url(), AS_CLASS.DOMAIN, label()
      and key (presumably a models.Source subclass -- TODO confirm)
    permalink: url of the unprocessed post
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Return:
    a dict from syndicated url to new models.SyndicatedPosts
  """
  # this method only returns *newly* discovered relationships, so a post that
  # was already processed is skipped, unless it was blank and we've been
  # asked to retry blanks.
  known = preexisting.get(permalink)
  if known:
    retry_blank = refetch_blanks and not known.syndication
    if not retry_blank:
      return {}
    logging.debug('ignoring blank relationship for original %s', permalink)

  mf2 = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      mf2 = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except BaseException:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  candidates = set()
  if mf2:
    relsynd = mf2.get('rels').get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    candidates.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for item in mf2['items']:
      if 'h-entry' not in item['type']:
        continue
      usynd = item.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      candidates.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  results = {}
  for candidate in candidates:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    resolved = util.follow_redirects(candidate).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    canonical = source.canonicalize_syndication_url(resolved)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    netloc = urlparse.urlparse(canonical).netloc
    if util.domain_from_link(netloc) != source.AS_CLASS.DOMAIN:
      continue

    logging.debug('saving discovered relationship %s -> %s',
                  canonical, permalink)
    results[canonical] = SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication=canonical, original=permalink)

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    blank = SyndicatedPost(parent=source.key, original=permalink,
                           syndication=None)
    blank.put()

  logging.debug('discovered relationships %s', results)

  return results
def _process_author(source, author_url, refetch=False, store_blanks=True):
  r"""Fetch the author's domain URL, and look for syndicated posts.

  The feed items found on the author page are merged with those of every
  usable rel=feed page it links to, sorted newest first, capped, and each
  h-entry permalink is then handed to process_entry.

  Side effects: may append newly seen feed domains to
  ``source.updates['domains']`` and sets
  ``source.updates['last_syndication_url']`` when results were found.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      # a broken feed is skipped; the remaining feeds are still tried
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    # fall back to '' so that items without either date still compare
    # against dated ones instead of yielding a None sort key
    return props.get('updated') or props.get('published') or ''

  feeditems.sort(key=updated_or_published, reverse=True)

  # ordered so that the newest permalinks are processed first
  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          # logging.warn is a deprecated alias of logging.warning
          logging.warning('unexpected non-string "url" property: %s', permalink)

    # named max_fetches rather than max to avoid shadowing the builtin
    max_fetches = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
                   else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max_fetches:
      logging.info('Hit cap of %d permalinks. Stopping.', max_fetches)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  # group the known SyndicatedPosts by their original permalink
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
Beispiel #38
0
  def expand_target_urls(self, activity):
    """Augment an activity's inReplyTo and object fields with syndication URLs.

    Each entry of those fields that has a ``url`` accepted by
    util.get_webmention_target is fetched and parsed with mf2py. Every
    syndication URL found there (rel=syndication links plus the
    ``syndication`` property of the parsed items) is appended to the field as
    a ``{'url': ...}`` dict. The dict is modified in place, and a non-empty
    field is always written back as a list.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
      # microformats2.json_to_object de-dupes, no need to do it here
      originals = activity.get(field)
      if not originals:
        continue

      if isinstance(originals, dict):
        originals = [originals]

      augmented = list(originals)
      for original in originals:
        url = original.get('url')
        if not url:
          continue

        # get_webmention_target weeds out silos and non-HTML targets
        # that we wouldn't want to download and parse
        url, _, ok = util.get_webmention_target(url)
        if not ok:
          continue

        # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
        # easier to just grab this ourselves than add a bunch of
        # special-cases to that method
        logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
        try:
          resp = util.requests_get(url)
          resp.raise_for_status()
          parser = mf2py.Parser(url=url, doc=resp.text)
          data = parser.to_dict()
        except AssertionError:
          raise  # for unit tests
        except BaseException:
          # it's not a big deal if we can't fetch an in-reply-to url
          logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                          field, url, exc_info=True)
          continue

        synd_urls = data.get('rels', {}).get('syndication', [])

        # breadth-first walk over the parsed items: pure h-feeds are only
        # descended into, every other item contributes its syndication
        # property
        todo = collections.deque(data.get('items', []))
        while todo:
          item = todo.popleft()
          kinds = set(item.get('type', []))
          if 'h-entry' in kinds or 'h-feed' not in kinds:
            # these can be urls or h-cites
            props = item.get('properties', {})
            synd_urls += microformats2.get_string_urls(
              props.get('syndication', []))
          else:
            todo.extend(item.get('children', []))

        logging.debug('expand_target_urls found rel=syndication for url=%s: %r', url, synd_urls)
        augmented.extend({'url': u} for u in synd_urls)

      activity[field] = augmented
Beispiel #39
0
 def test_get_webmention_text_mf2_html(self):
     """A HEAD response typed text/mf2+html is expected to yield a True send flag."""
     url = 'http://orig'
     self.expect_requests_head(url, content_type='text/mf2+html')
     self.mox.ReplayAll()

     target = util.get_webmention_target(url)
     self.assert_equals(('http://orig', 'orig', True), target)
def _process_entry(source,
                   permalink,
                   feed_entry,
                   refetch,
                   preexisting,
                   store_blanks=True):
    """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s',
                            permalink,
                            exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = itertools.chain(*results.values())
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug(
                'saving empty relationship so that %s will not be '
                'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
Beispiel #41
0
  def _run(self):
    """Returns CreationResult on success, None otherwise.

    Validates the target and source URLs, looks up the matching source,
    fetches the source page, and then tries to preview/create each
    publishable mf2 item on it in turn, stopping at the first one that is
    published.

    NOTE(review): in this excerpt the method ends inside the item loop, so
    `result` and `types` are never consumed here; presumably the code that
    reports unsupported types and returns the result is not shown. Confirm
    against the full file.
    """
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,github,twitter}')
    elif source_cls == Instagram:
      return self.error('Sorry, %s is not supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    self.source = self._find_source(source_cls, url, domain)
    if not self.source:
      return  # _find_source rendered the error

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page.
    # both the resolved url and the url as originally given are checked.
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      for source_url in url, self.source_url():
        parts = urlparse.urlparse(source_url)
        if (parts.netloc == domain_url_parts.netloc and
            parts.path.strip('/') == domain_url_parts.path.strip('/') and
            not parts.query):
          return self.error(
            "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    self.entity = self.get_or_add_publish_entity(url)
    try:
      resp = self.fetch_mf2(url, raise_errors=True)
    except BaseException as e:
      status, body = util.interpret_http_exception(e)
      # a 410 for the source page is handed to self.delete()
      if status == '410':
        return self.delete(url)
      return self.error('Could not fetch source URL %s' % url)

    if not resp:
      return
    self.fetched, data = resp

    # refuse to publish a page again once a real (non-preview) publish has
    # completed, unless this is a preview or we're running in debug mode.
    if (self.entity.status == 'complete' and self.entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84")

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      # default to no types so an item without a 'type' key is skipped below
      # instead of raising TypeError
      item_types = set(item.get('type', []))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        types = types.union(item_types)
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if code in self.source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
          # the user deauthorized the bridgy app, or the token expired, so
          # disable this source.
          logging.warning('Disabling source due to: %s', e, exc_info=True)
          self.source.status = 'disabled'
          self.source.put()
          # TODO: eventually drop this to just if source.is_beta_user(). leaving
          # for everyone right now for initial monitoring.
          util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
                        body=body)
        if isinstance(e, (NotImplementedError, ValueError, urllib2.URLError)):
          code = '400'
        elif not code:
          # not an HTTP error we know how to interpret; let it propagate
          raise
        msg = 'Error: %s %s' % (body or '', e)
        return self.error(msg, status=code, mail=code not in ('400', '404', '502', '503', '504'))
Beispiel #42
0
    def do_send_webmentions(self):
        """Sends webmentions to every pending target url on self.entity.

        Targets are gathered from the entity's unsent, error and failed lists
        and re-validated. Each one is then sent and filed into the entity's
        sent, skipped, failed or error list according to the outcome. Ends by
        calling self.complete(), or self.release('error') if any target ended
        up in the error list.
        """
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    logging.info('Giving up on target URL over %s chars! %s',
                                 _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info('Webmention from %s to %s', source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, a
            # WebmentionSend error dict if it failed (semi-)permanently, or None.
            cache_key = util.webmention_endpoint_cache_key(target)
            cached = util.webmention_endpoint_cache.get(cache_key)
            if cached:
                logging.info('Using cached webmention endpoint %r: %s',
                             cache_key, cached)

            # send! and handle response or error
            error = None
            if isinstance(cached, dict):
                # a cached error dict means discovery already failed; reuse it
                # instead of sending again
                error = cached
            else:
                mention = send.WebmentionSend(source_url,
                                              target,
                                              endpoint=cached)
                headers = util.request_headers(source=self.source)
                logging.info('Sending...')
                try:
                    if not mention.send(timeout=999, headers=headers):
                        error = mention.error
                except BaseException as e:
                    logging.info('', stack_info=True)
                    # default to None so a mention object without an error
                    # attribute falls through to the synthesized error below
                    error = getattr(mention, 'error', None)
                    if not error:
                        error = ({
                            'code': 'BAD_TARGET_URL',
                            'http_status': 499
                        } if 'DNS lookup failed for URL:' in str(e) else {
                            'code': 'EXCEPTION'
                        })

            error_code = error['code'] if error else None
            # `not cached` guarantees `mention` was created above, since a
            # cached error dict is truthy.
            if error_code != 'BAD_TARGET_URL' and not cached:
                val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
                with util.webmention_endpoint_cache_lock:
                    util.webmention_endpoint_cache[cache_key] = val

            if error is None:
                logging.info('Sent! %s', mention.response)
                self.record_source_webmention(mention)
                self.entity.sent.append(target)
            else:
                status = error.get('http_status', 0)
                if (error_code == 'NO_ENDPOINT'
                        or (error_code == 'BAD_TARGET_URL'
                            and status == 204)):  # No Content
                    logging.info('Giving up this target. %s', error)
                    self.entity.skipped.append(target)
                elif status // 100 == 4:
                    # Give up on 4XX errors; we don't expect later retries to succeed.
                    logging.info('Giving up this target. %s', error)
                    self.entity.failed.append(target)
                else:
                    self.fail('Error sending to endpoint: %s' % error,
                              level=logging.INFO)
                    self.entity.error.append(target)

            if target in self.entity.unsent:
                self.entity.unsent.remove(target)

        if self.entity.error:
            logging.info('Propagate task failed')
            self.release('error')
        else:
            self.complete()
Beispiel #43
0
    def _run(self):
        """Returns CreationResult on success, None otherwise.

        Validates the target and source URLs, looks up the matching source,
        fetches the source page, and then tries to preview/create each
        publishable mf2 item on it in turn, stopping at the first one that
        is published.

        NOTE(review): in this excerpt the method ends inside the item loop,
        so `result` and `types` are never consumed here; presumably the code
        that reports unsupported types and returns the result is not shown.
        Confirm against the full file.
        """
        logging.info('Params: %s', self.request.params.items())
        assert self.PREVIEW in (True, False)

        # parse and validate target URL
        try:
            parsed = urlparse.urlparse(self.target_url())
        except BaseException:
            return self.error('Could not parse target URL %s' %
                              self.target_url())

        domain = parsed.netloc
        path_parts = parsed.path.rsplit('/', 1)
        source_cls = SOURCE_NAMES.get(path_parts[-1])
        if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
                or len(path_parts) != 2 or path_parts[0] != '/publish'
                or not source_cls):
            return self.error(
                'Target must be brid.gy/publish/{facebook,flickr,github,twitter}'
            )
        elif source_cls in (GooglePlusPage, Instagram):
            return self.error('Sorry, %s is not yet supported.' %
                              source_cls.GR_CLASS.NAME)

        # resolve source URL
        url, domain, ok = util.get_webmention_target(
            self.source_url(), replace_test_domains=False)
        # show nice error message if they're trying to publish a silo post
        if domain in SOURCE_DOMAINS:
            return self.error(
                "Looks like that's a %s URL. Try one from your web site instead!"
                % SOURCE_DOMAINS[domain].GR_CLASS.NAME)
        elif not ok:
            return self.error('Unsupported source URL %s' % url)
        elif not domain:
            return self.error('Could not parse source URL %s' % url)

        # look up source by domain
        self.source = self._find_source(source_cls, url, domain)
        if not self.source:
            return  # _find_source rendered the error

        content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
        if content_param in self.request.params:
            return self.error('The %s parameter is not supported' %
                              content_param)

        # show nice error message if they're trying to publish their home page.
        # both the resolved url and the url as originally given are checked.
        for domain_url in self.source.domain_urls:
            domain_url_parts = urlparse.urlparse(domain_url)
            for source_url in url, self.source_url():
                parts = urlparse.urlparse(source_url)
                if (parts.netloc == domain_url_parts.netloc
                        and parts.path.strip('/')
                        == domain_url_parts.path.strip('/')
                        and not parts.query):
                    return self.error(
                        "Looks like that's your home page. Try one of your posts instead!"
                    )

        # done with the sanity checks, ready to fetch the source url. create the
        # Publish entity so we can store the result.
        entity = self.get_or_add_publish_entity(url)
        # refuse to publish a page again once a real (non-preview) publish has
        # completed, unless this is a preview or we're running in debug mode.
        if (entity.status == 'complete' and entity.type != 'preview'
                and not self.PREVIEW and not appengine_config.DEBUG):
            return self.error(
                "Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84"
            )
        self.entity = entity

        # fetch source page
        resp = self.fetch_mf2(url)
        if not resp:
            return
        self.fetched, data = resp

        # find rel-shortlink, if any
        # http://microformats.org/wiki/rel-shortlink
        # https://github.com/snarfed/bridgy/issues/173
        soup = util.beautifulsoup_parse(self.fetched.text)
        shortlinks = (soup.find_all('link', rel='shortlink') +
                      soup.find_all('a', rel='shortlink') +
                      soup.find_all('a', class_='shortlink'))
        if shortlinks:
            self.shortlink = shortlinks[0]['href']

        # loop through each item and its children and try to preview/create it. if
        # it fails, try the next one. break after the first one that works.
        result = None
        types = set()
        queue = collections.deque(data.get('items', []))
        while queue:
            item = queue.popleft()
            # default to no types so an item without a 'type' key is skipped
            # below instead of raising TypeError
            item_types = set(item.get('type', []))
            if 'h-feed' in item_types and 'h-entry' not in item_types:
                queue.extend(item.get('children', []))
                continue
            elif not item_types & PUBLISHABLE_TYPES:
                types = types.union(item_types)
                continue

            try:
                result = self.attempt_single_item(item)
                if self.entity.published:
                    break
                if result.abort:
                    if result.error_plain:
                        self.error(result.error_plain,
                                   html=result.error_html,
                                   data=item)
                    return
                # try the next item
                for embedded in ('rsvp', 'invitee', 'repost', 'repost-of',
                                 'like', 'like-of', 'in-reply-to'):
                    if embedded in item.get('properties', []):
                        item_types.add(embedded)
                logging.info(
                    'Object type(s) %s not supported; error=%s; trying next.',
                    item_types, result.error_plain)
                types = types.union(item_types)
                queue.extend(item.get('children', []))
            except BaseException as e:
                code, body = util.interpret_http_exception(e)
                if not code:
                    # not an HTTP error we know how to interpret; propagate
                    raise
                msg = 'Error from %s API or your site: %s %s' % (
                    self.source.GR_CLASS.NAME, body or '', e)
                return self.error(msg,
                                  status=code,
                                  mail=code not in ('502', '503', '504'))
Beispiel #44
0
    def _run(self):
        """Returns CreationResult on success, None otherwise.

        Validates the target and source URLs, picks the enabled source with
        the publish feature whose domain url is the longest prefix of the
        source url, fetches the source page, and then tries to
        preview/create each publishable mf2 item on it in turn, stopping at
        the first one that is published.

        NOTE(review): in this excerpt the method ends inside the item loop,
        so `result` and `types` are never consumed here; presumably the code
        that reports unsupported types and returns the result is not shown.
        Confirm against the full file.
        """
        logging.info('Params: %s', self.request.params.items())
        assert self.PREVIEW in (True, False)

        # parse and validate target URL
        try:
            parsed = urlparse.urlparse(self.target_url())
        except BaseException:
            return self.error('Could not parse target URL %s' %
                              self.target_url())

        domain = parsed.netloc
        path_parts = parsed.path.rsplit('/', 1)
        source_cls = SOURCE_NAMES.get(path_parts[-1])
        if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
                or len(path_parts) != 2 or path_parts[0] != '/publish'
                or not source_cls):
            return self.error(
                'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}'
            )
        elif source_cls == GooglePlusPage:
            return self.error('Sorry, %s is not yet supported.' %
                              source_cls.GR_CLASS.NAME)

        # resolve source URL
        url, domain, ok = util.get_webmention_target(
            self.source_url(), replace_test_domains=False)
        # show nice error message if they're trying to publish a silo post
        if domain in SOURCE_DOMAINS:
            return self.error(
                "Looks like that's a %s URL. Try one from your web site instead!"
                % SOURCE_DOMAINS[domain].GR_CLASS.NAME)
        elif not ok:
            return self.error('Unsupported source URL %s' % url)
        elif not domain:
            return self.error('Could not parse source URL %s' % url)

        # look up source by domain
        domain = domain.lower()
        sources = source_cls.query().filter(
            source_cls.domains == domain).fetch(100)
        if not sources:
            return self.error(
                "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again."
                % {
                    'type': source_cls.GR_CLASS.NAME,
                    'domain': domain
                })

        # longest matching domain url seen so far
        current_url = ''
        for source in sources:
            logging.info('Source: %s , features %s, status %s, poll status %s',
                         source.bridgy_url(self), source.features,
                         source.status, source.poll_status)
            if source.status != 'disabled' and 'publish' in source.features:
                # use a source that has a domain_url matching the url provided.
                # look through each source to find the one with the closest match.
                for domain_url in source.domain_urls:
                    if (url.lower().startswith(domain_url.lower().strip('/'))
                            and len(domain_url) > len(current_url)):
                        self.source = source
                        current_url = domain_url

        if not self.source:
            return self.error(
                'Publish is not enabled for your account. Please visit https://brid.gy and sign up!'
            )

        content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
        if content_param in self.request.params:
            return self.error('The %s parameter is not supported' %
                              content_param)

        # show nice error message if they're trying to publish their home page
        for domain_url in self.source.domain_urls:
            domain_url_parts = urlparse.urlparse(domain_url)
            source_url_parts = urlparse.urlparse(self.source_url())
            if (source_url_parts.netloc == domain_url_parts.netloc
                    and source_url_parts.path.strip('/')
                    == domain_url_parts.path.strip('/')
                    and not source_url_parts.query):
                return self.error(
                    "Looks like that's your home page. Try one of your posts instead!"
                )

        # done with the sanity checks, ready to fetch the source url. create the
        # Publish entity so we can store the result.
        entity = self.get_or_add_publish_entity(url)
        # refuse to publish a page again once a real (non-preview) publish has
        # completed, unless this is a preview or we're running in debug mode.
        if (entity.status == 'complete' and entity.type != 'preview'
                and not self.PREVIEW and not appengine_config.DEBUG):
            return self.error(
                "Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!"
            )
        self.entity = entity

        # fetch source page
        resp = self.fetch_mf2(url)
        if not resp:
            return
        self.fetched, data = resp

        # find rel-shortlink, if any
        # http://microformats.org/wiki/rel-shortlink
        # https://github.com/snarfed/bridgy/issues/173
        soup = util.beautifulsoup_parse(self.fetched.text)
        shortlinks = (soup.find_all('link', rel='shortlink') +
                      soup.find_all('a', rel='shortlink') +
                      soup.find_all('a', class_='shortlink'))
        if shortlinks:
            self.shortlink = shortlinks[0]['href']

        # loop through each item and its children and try to preview/create it. if
        # it fails, try the next one. break after the first one that works.
        resp = None
        types = set()
        queue = collections.deque(data.get('items', []))
        while queue:
            item = queue.popleft()
            # default to no types so an item without a 'type' key is skipped
            # below instead of raising TypeError
            item_types = set(item.get('type', []))
            if 'h-feed' in item_types and 'h-entry' not in item_types:
                queue.extend(item.get('children', []))
                continue
            elif not item_types & PUBLISHABLE_TYPES:
                continue

            try:
                result = self.attempt_single_item(item)
                if self.entity.published:
                    break
                if result.abort:
                    if result.error_plain:
                        self.error(result.error_plain,
                                   html=result.error_html,
                                   data=item)
                    return
                # try the next item
                for embedded in ('rsvp', 'invitee', 'repost', 'repost-of',
                                 'like', 'like-of', 'in-reply-to'):
                    if embedded in item.get('properties', []):
                        item_types.add(embedded)
                logging.info(
                    'Object type(s) %s not supported; error=%s; trying next.',
                    item_types, result.error_plain)
                types = types.union(item_types)
                queue.extend(item.get('children', []))
            except BaseException as e:
                # any failure while attempting an item is reported to the
                # user; status falls back to 500 when the exception carries
                # no HTTP code
                code, body = util.interpret_http_exception(e)
                return self.error('Error: %s %s' % (body or '', e),
                                  status=code or 500,
                                  mail=True)
Beispiel #45
0
  def do_send_webmentions(self):
    """Sends webmentions to every pending target url on self.entity.

    Targets are gathered from the entity's unsent and error lists and
    re-validated. Each one is then sent and filed into the entity's sent,
    skipped, failed or error list according to the outcome. Ends by calling
    self.complete(), or self.release('error') if any target ended up in the
    error list.
    """
    unsent = set()
    for url in self.entity.unsent + self.entity.error:
      # recheck the url here since the checks may have failed during the poll
      # or streaming add.
      url, domain, ok = util.get_webmention_target(url)
      if ok:
        # When debugging locally, redirect our own webmentions to localhost
        if appengine_config.DEBUG and domain in util.LOCALHOST_TEST_DOMAINS:
          url = url.replace(domain, 'localhost')
        unsent.add(url)
    self.entity.unsent = sorted(unsent)
    self.entity.error = []

    while self.entity.unsent:
      target = self.entity.unsent.pop(0)
      source_url = self.source_url(target)
      logging.info('Webmention from %s to %s', source_url, target)

      # see if we've cached webmention discovery for this domain. the cache
      # value is a string URL endpoint if discovery succeeded, a
      # WebmentionSend error dict if it failed (semi-)permanently, or None.
      domain = util.domain_from_link(target)
      cache_key = 'W ' + domain
      cached = memcache.get(cache_key)
      if cached:
        logging.info('Using cached webmention endpoint for %s: %s',
                     domain, cached)

      # send! and handle response or error
      error = None
      if isinstance(cached, dict):
        # a cached error dict means discovery already failed; reuse it
        # instead of sending again
        error = cached
      else:
        mention = send.WebmentionSend(source_url, target, endpoint=cached)
        logging.info('Sending...')
        try:
          if not mention.send(timeout=999):
            error = mention.error
        except BaseException:
          # deliberately broad: any failure while sending is recorded as an
          # error for this target rather than aborting the whole task
          logging.warning('', exc_info=True)
          error = getattr(mention, 'error', None)
          if not error:
            error = {'code': 'EXCEPTION'}

      if error is None:
        logging.info('Sent! %s', mention.response)
        if not self.entity.sent:
          self.set_last_webmention_sent()
        self.entity.sent.append(target)
        memcache.set(cache_key, mention.receiver_endpoint,
                     time=WEBMENTION_DISCOVERY_CACHE_TIME)
      else:
        if error['code'] == 'NO_ENDPOINT':
          logging.info('Giving up this target. %s', error)
          self.entity.skipped.append(target)
          memcache.set(cache_key, error, time=WEBMENTION_DISCOVERY_CACHE_TIME)
        elif (error['code'] == 'BAD_TARGET_URL' and
              error.get('http_status', 0) // 100 == 4):
          # Give up on 4XX errors; we don't expect later retries to succeed.
          # Floor division keeps this check correct whether or not `/` is
          # true division, and .get() tolerates an error without a status.
          logging.info('Giving up this target. %s', error)
          self.entity.failed.append(target)
        else:
          self.fail('Error sending to endpoint: %s' % error)
          self.entity.error.append(target)

      if target in self.entity.unsent:
        self.entity.unsent.remove(target)

    if self.entity.error:
      logging.warning('Propagate task failed')
      self.release('error')
    else:
      self.complete()
Beispiel #46
0
  def do_send_webmentions(self):
    """Sends webmentions for all of this entity's pending target URLs.

    Collects the entity's unsent, error and failed URLs, re-validates each one
    with util.get_webmention_target(), and attempts to send a webmention to
    every URL that passes. Afterward each attempted target is in one of the
    entity's sent, skipped, failed or error lists. Endpoint discovery results
    are cached in memcache under util.webmention_endpoint_cache_key(target).
    """
    urls = self.entity.unsent + self.entity.error + self.entity.failed
    unsent = set()
    self.entity.error = []
    self.entity.failed = []

    for orig_url in urls:
      # recheck the url here since the checks may have failed during the poll
      # or streaming add.
      url, domain, ok = util.get_webmention_target(orig_url)
      if ok:
        if len(url) <= _MAX_STRING_LENGTH:
          unsent.add(url)
        else:
          logging.warning('Giving up on target URL over %s chars! %s',
                          _MAX_STRING_LENGTH, url)
          self.entity.failed.append(orig_url)
    self.entity.unsent = sorted(unsent)

    while self.entity.unsent:
      target = self.entity.unsent.pop(0)
      source_url = self.source_url(target)
      logging.info('Webmention from %s to %s', source_url, target)

      # see if we've cached webmention discovery for this domain. the cache
      # value is a string URL endpoint if discovery succeeded, a
      # WebmentionSend error dict if it failed (semi-)permanently, or None.
      cache_key = util.webmention_endpoint_cache_key(target)
      cached = memcache.get(cache_key)
      if cached:
        logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

      # send! and handle response or error
      error = None
      if isinstance(cached, dict):
        # a cached error dict means discovery already failed; don't retry it.
        error = cached
      else:
        mention = send.WebmentionSend(source_url, target, endpoint=cached)
        logging.info('Sending...')
        try:
          if not mention.send(timeout=999, headers=util.USER_AGENT_HEADER):
            error = mention.error
        except BaseException as e:
          logging.warning('', exc_info=True)
          # default to None so that a mention object without an error
          # attribute falls through to the fallbacks below instead of raising
          # AttributeError from inside this handler.
          error = getattr(mention, 'error', None)
          if not error:
            error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                     if 'DNS lookup failed for URL:' in str(e)
                     else {'code': 'EXCEPTION'})

      if not cached:
        # nothing was cached, so mention is always defined here. cache either
        # the (semi-)permanent discovery error or the discovered endpoint.
        val = (error if error and error['code'] in ('NO_ENDPOINT', 'BAD_TARGET_URL')
               else mention.receiver_endpoint)
        memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

      if error is None:
        logging.info('Sent! %s', mention.response)
        self.record_source_webmention(mention)
        self.entity.sent.append(target)
      else:
        code = error['code']
        status = error.get('http_status', 0)
        if (code == 'NO_ENDPOINT' or
            (code == 'BAD_TARGET_URL' and status == 204)):  # 204 is No Content
          logging.info('Giving up this target. %s', error)
          self.entity.skipped.append(target)
        elif status // 100 == 4:
          # Give up on 4XX errors; we don't expect later retries to succeed.
          logging.info('Giving up this target. %s', error)
          self.entity.failed.append(target)
        else:
          self.fail('Error sending to endpoint: %s' % error)
          self.entity.error.append(target)

      if target in self.entity.unsent:
        self.entity.unsent.remove(target)
Beispiel #47
0
  def add_original_post_urls(self, post, obj, prop):
    """Extracts original post URLs and adds them to an object, in place.

    If the post object has upstreamDuplicates, *only* they are considered
    original post URLs and added as tags with objectType 'article', and the
    post's own links and 'article' tags are added with objectType 'mention'.

    Args:
      post: ActivityStreams post object to get original post URLs from
      obj: ActivityStreams post object to add original post URLs to
      prop: string property name in obj to add the original post URLs to
    """
    original_post_discovery.discover(self.source, post, fetch_hfeed=False)
    # use .get() for objectType so that a tag without one is just filtered out
    # instead of raising KeyError.
    tags = [tag for tag in post['object'].get('tags', [])
            if 'url' in tag and tag.get('objectType') == 'article']
    upstreams = post['object'].get('upstreamDuplicates', [])

    # normalize obj[prop] to a list so that we can append to it below
    if not isinstance(obj.setdefault(prop, []), list):
      obj[prop] = [obj[prop]]
    if upstreams:
      obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
      obj.setdefault('tags', []).extend(
        [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
    else:
      obj[prop] += tags

    # check for redirects, and if there are any follow them and add final urls
    # in addition to the initial urls.
    seen = set()
    tags = obj.get('tags', [])
    for url_list in obj[prop], tags:
      # entries appended below are visited by this same loop; they're already
      # in seen, so they're skipped unless cleaning changes the URL.
      for url_obj in url_list:
        url = util.clean_webmention_url(url_obj.get('url', ''))
        if not url or url in seen:
          continue
        seen.add(url)
        # when debugging locally, replace my (snarfed.org) URLs with localhost
        url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
        resolved, _, send = util.get_webmention_target(url)
        if send and resolved != url and resolved not in seen:
          seen.add(resolved)
          url_list.append({'url': resolved, 'objectType': url_obj.get('objectType')})

    # if the http version of a link is in upstreams but the https one is just a
    # mention, or vice versa, promote them both to upstream.
    # https://github.com/snarfed/bridgy/issues/290
    #
    # TODO: for links that came from resolving redirects above, this doesn't
    # also catch the initial pre-redirect link. ah well.
    prop_schemeful = set(tag['url'] for tag in obj[prop] if tag.get('url'))
    prop_schemeless = set(util.schemeless(url) for url in prop_schemeful)

    # iterate over a copy since promoted entries are removed from tags
    for url_obj in copy.copy(tags):
      url = url_obj.get('url', '')
      schemeless = util.schemeless(url)
      if schemeless in prop_schemeless and url not in prop_schemeful:
        obj[prop].append(url_obj)
        tags.remove(url_obj)
        prop_schemeful.add(url)

    logging.info('After original post discovery, urls are: %s', seen)
Beispiel #48
0
 def test_get_webmention_second_redirect_not_text_html(self):
     # http://orig redirects twice and ends at a PDF, so the resolved URL and
     # domain come back but the "send" flag is False.
     redirect_chain = ["http://middle", "https://end"]
     self.expect_requests_head("http://orig",
                               redirected_url=redirect_chain,
                               content_type="application/pdf")
     self.mox.ReplayAll()

     expected = ("https://end", "end", False)
     actual = util.get_webmention_target("http://orig", resolve=True)
     self.assert_equals(expected, actual)
Beispiel #49
0
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

    Collects h-feed items from the author page and from any rel=feed pages it
    links to, then runs process_entry() on each h-entry permalink, newest
    first, up to a per-source cap.

    Args:
      source: a subclass of :class:`models.Source`
      author_url: the author's homepage URL
      refetch: boolean, whether to refetch and process entries we've seen before
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost` instances when we don't find a
        relationship

    Returns:
      a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`
      instances. Empty if the author URL isn't a valid webmention target or
      can't be fetched.
    """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s',
                     author_url,
                     exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                          author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s",
                          feed_url)
            feeditems = _merge_hfeeds(
                feeditems, _find_feed_items(feed_url, feed_resp.text))

            # remember the feed's domain on the source if it's new
            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info(
                        'rel-feed found new domain %s! adding to source',
                        domain)
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.',
                         feed_url,
                         exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    # the cap doesn't change while we iterate, so compute it once up front.
    # (named max_permalinks so that it doesn't shadow the max() builtin.)
    max_permalinks = (MAX_PERMALINK_FETCHES_BETA
                      if source.is_beta_user() else MAX_PERMALINK_FETCHES)

    # ordered so that entries are processed newest first
    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warning('unexpected non-string "url" property: %s',
                                    permalink)

        if len(permalink_to_entry) >= max_permalinks:
            logging.info('Hit cap of %d permalinks. Stopping.', max_permalinks)
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
Beispiel #50
0
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

    Args:
      source: the source whose syndicated posts we're looking for
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen before
      preexisting: list of previously discovered
        :class:`models.SyndicatedPost` instances for this permalink. Modified
        in place: relationships that have disappeared are removed from it.
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost` instances when we don't find a
        relationship

    Returns:
      a dict from syndicated url to a list of new
      :class:`models.SyndicatedPost` instances
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logger.debug(
                f'previously found relationship(s) for original {permalink}: {synds}'
            )

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    usynd_urls = {url for url in usynd if isinstance(url, str)}
    if usynd_urls:
        logger.debug(
            f'u-syndication links on the h-feed h-entry: {usynd_urls}')
    results = _process_syndication_urls(source, permalink, usynd_urls,
                                        preexisting)
    success = True

    if results:
        # source.updates may be unset (None), in which case there's nowhere to
        # record the timestamp; other code that writes to it checks this too.
        if source.updates is not None:
            source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        mf2 = None
        try:
            if type_ok:
                logger.debug(f'fetching post permalink {permalink}')
                mf2 = util.fetch_mf2(permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logger.info(f'Could not fetch permalink {permalink}',
                        exc_info=True)
            success = False

        if mf2:
            syndication_urls = set()
            relsynd = mf2['rels'].get('syndication', [])
            if relsynd:
                logger.debug(f'rel-syndication links: {relsynd}')
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, str))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in mf2['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logger.debug(f'u-syndication links: {usynd}')
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, str))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        # iterate over a copy: removing from the list we're iterating over
        # would make the loop skip the element after each removed one.
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logger.info(
                    f'deleting relationship that disappeared: {syndpost}')
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logger.debug(
            f'no syndication links from {permalink} to current source {source.label()}.'
        )
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logger.debug(
                f'saving empty relationship so that {permalink} will not be searched again'
            )
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.items():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logger.debug(f'discovered relationships {new_results}')
    return new_results
Beispiel #51
0
  def post(self):
    """Handles a publish request.

    Validates the source and target params, looks up the source account for
    the source URL's domain, fetches the source page, and tries to preview or
    create each of its microformats2 items until one works.

    Returns:
      the result of self.error() if any check or attempt fails
    """
    logging.info('Params: %s', self.request.params.items())
    self.source_url = util.get_required_param(self, 'source')
    self.target_url = util.get_required_param(self, 'target')
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url)
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url)

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error('Target must be brid.gy/publish/{facebook,twitter}')
    elif source_cls in (Instagram, GooglePlusPage):
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.AS_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(self.source_url)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].AS_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # When debugging locally, use snarfed.org for localhost webmentions
    if appengine_config.DEBUG and domain == 'localhost':
      domain = 'snarfed.org'

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.AS_CLASS.NAME, 'domain': domain})

    # use the first matching source that has publish enabled
    for source in sources:
      logging.info('Source: %s , features %s, status %s',
                   source.bridgy_url(self), source.features, source.status)
      if source.status == 'enabled' and 'publish' in source.features:
        self.source = source
        break
    else:
      return self.error(
        'Publish is not enabled for your account(s). Please visit %s and sign up!' %
        ' or '.join(s.bridgy_url(self) for s in sources))

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      source_url_parts = urlparse.urlparse(self.source_url)
      if (source_url_parts.netloc == domain_url_parts.netloc and
          source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not source_url_parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    resp = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        # an h-feed isn't publishable itself; descend into its children
        queue.extend(item.get('children', []))
        continue

      try:
        resp = self.attempt_single_item(item)
        if self.entity.published:
          break
        if resp.abort:
          return self.error(resp.error_plain, html=resp.error_html, data=item)
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.error(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, resp.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = handlers.interpret_http_exception(e)
        return self.error('Error: %s %s' % (body or '', e), status=code or 500,
                          mail=True)
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Looks for syndicated posts starting from the author's home page.

  Fetches the author URL, gathers h-feed items from it and from any rel=feed
  pages it links to, then runs _process_entry() on every h-entry permalink.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # a URL that isn't a valid webmention target is treated as not worth
  # searching, so reuse that check here.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, searchable = util.get_webmention_target(author_url)
  if not searchable:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # collect the other feeds the author page points to with rel='feed',
  # keeping only the ones that are text/html
  feed_urls = set()
  link_nodes = author_dom.find_all('link', rel='feed')
  anchor_nodes = author_dom.find_all('a', rel='feed')
  for node in link_nodes + anchor_nodes:
    href = node.get('href')
    if not href:
      continue

    candidate = urlparse.urljoin(author_url, href)
    declared_type = node.get('type')
    if declared_type:
      html_feed = declared_type == 'text/html'
    else:
      # type is not specified, use this to confirm that it's text/html
      candidate, _, html_feed = util.get_webmention_target(candidate)

    if candidate == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif html_feed:
      feed_urls.add(candidate)
    else:
      logging.debug('skipping feed of type %s', declared_type)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      extra_items = _find_feed_items(feed_url, feed_resp.text)
      feeditems = _merge_hfeeds(feeditems, extra_items)

      # record the feed's domain on the source if we haven't seen it yet
      feed_domain = util.domain_from_link(feed_url)
      if source.updates is not None and feed_domain not in source.domains:
        known_domains = source.updates.setdefault('domains', source.domains)
        if feed_domain not in known_domains:
          logging.info('rel-feed found new domain %s! adding to source',
                       feed_domain)
          known_domains.append(feed_domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url,
                      exc_info=True)

  # map every string u-url of every h-entry to the entry it came from
  permalink_to_entry = {}
  for item in feeditems:
    if 'h-entry' not in item['type']:
      continue
    # TODO maybe limit to first ~30 entries? (do that here rather than,
    # below because we want the *first* n entries)
    for link in item['properties'].get('url', []):
      if not isinstance(link, basestring):
        logging.warn('unexpected non-string "url" property: %s', link)
        continue
      permalink_to_entry[link] = item

  # look up existing SyndicatedPosts in batches instead of once per link,
  # MAX_ALLOWABLE_QUERIES (currently 30) permalinks at a time
  all_permalinks = list(permalink_to_entry.keys())
  preexisting = {}
  for start in xrange(0, len(all_permalinks), MAX_ALLOWABLE_QUERIES):
    batch = all_permalinks[start:start + MAX_ALLOWABLE_QUERIES]
    query = SyndicatedPost.query(SyndicatedPost.original.IN(batch),
                                 ancestor=source.key)
    for synd_post in query:
      preexisting.setdefault(synd_post.original, []).append(synd_post)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    already_known = preexisting.get(permalink, [])
    found = _process_entry(source, permalink, entry, refetch, already_known,
                           store_blanks=store_blanks)
    for synd_url, synd_posts in found.iteritems():
      results.setdefault(synd_url, []).extend(synd_posts)

  if source.updates is None or not results:
    return results

  # keep track of the last time we've seen rel=syndication urls for
  # this author. this helps us decide whether to refetch periodically
  # and look for updates.
  # Source will be saved at the end of each round of polling
  now = util.now_fn()
  logging.debug('updating source last_syndication_url %s', now)
  source.updates['last_syndication_url'] = now
  return results
Beispiel #53
0
    def do_send_webmentions(self):
        """Attempts a webmention for each of the entity's pending targets.

        Pending targets are the entity's unsent, error and failed URLs. Each
        is re-validated first; afterward every attempted target is in one of
        the entity's sent, skipped, failed or error lists. The task is then
        released with 'error' if any target errored, otherwise completed.
        """
        pending = self.entity.unsent + self.entity.error + self.entity.failed
        self.entity.error = []
        self.entity.failed = []

        targets = set()
        for pending_url in pending:
            # recheck the url here since the checks may have failed during the
            # poll or streaming add.
            resolved, _, sendable = util.get_webmention_target(pending_url)
            if not sendable:
                continue
            if len(resolved) > _MAX_STRING_LENGTH:
                logging.info('Giving up on target URL over %s chars! %s',
                             _MAX_STRING_LENGTH, resolved)
                self.entity.failed.append(pending_url)
                continue
            targets.add(resolved)
        self.entity.unsent = sorted(targets)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info('Webmention from %s to %s', source_url, target)

            # see if we've cached webmention discovery for this domain. the
            # cache value is a string URL endpoint if discovery succeeded,
            # NO_ENDPOINT if no endpoint was found.
            cache_key = util.webmention_endpoint_cache_key(target)
            endpoint = util.webmention_endpoint_cache.get(cache_key)
            if endpoint:
                logging.info('Using cached webmention endpoint %r: %s',
                             cache_key, endpoint)

            # send! and handle response or error
            resp = None
            try:
                headers = util.request_headers(source=self.source)
                if not endpoint:
                    # nothing cached: run discovery and remember the outcome
                    endpoint, resp = webmention.discover(target,
                                                         headers=headers)
                    with util.webmention_endpoint_cache_lock:
                        util.webmention_endpoint_cache[cache_key] = (
                            endpoint or NO_ENDPOINT)

                if not endpoint or endpoint == NO_ENDPOINT:
                    logging.info('Giving up this target.')
                    self.entity.skipped.append(target)
                else:
                    logging.info('Sending...')
                    resp = webmention.send(endpoint, source_url, target,
                                           timeout=999, headers=headers)
                    logging.info('Sent! %s', resp)
                    self.record_source_webmention(endpoint, target)
                    self.entity.sent.append(target)

            except ValueError:
                logging.info('Bad URL; giving up this target.')
                self.entity.skipped.append(target)

            except BaseException as e:
                logging.info('', exc_info=True)
                # Give up on 4XX and DNS errors; we don't expect retries to
                # succeed.
                code, _ = util.interpret_http_exception(e)
                permanent = (bool(code and code.startswith('4')) or
                             'DNS lookup failed' in str(e))
                if permanent:
                    logging.info('Giving up this target.')
                    self.entity.failed.append(target)
                else:
                    self.fail(f'Error sending to endpoint: {resp}',
                              level=logging.INFO)
                    self.entity.error.append(target)

            if target in self.entity.unsent:
                self.entity.unsent.remove(target)

        if not self.entity.error:
            self.complete()
        else:
            logging.info('Propagate task failed')
            self.release('error')
Beispiel #54
0
  def _run(self):
    """Validates a publish request, then tries to preview/create its source page.

    Steps, in order (each failed check returns early via self.error()):
      1. The target URL must be brid.gy/publish/<name>, where <name> is a key
         of SOURCE_NAMES. GooglePlusPage is explicitly rejected.
      2. The source URL is resolved with util.get_webmention_target(). Silo
         domains (SOURCE_DOMAINS), unsupported and unparseable URLs are
         rejected.
      3. A source entity is looked up by the source URL's domain. The first
         one that is not disabled and has the 'publish' feature is stored in
         self.source.
      4. The bridgy_<SHORT_NAME>_content query parameter and home page URLs
         are rejected.
      5. The Publish entity is loaded or created and stored in self.entity,
         the source page is fetched and parsed into self.fetched, and the
         first rel/class shortlink (if any) is stored in self.shortlink.
      6. The parsed mf2 items are walked breadth first, calling
         attempt_single_item() on each publishable one until one publishes
         or aborts.

    Returns:
      Per the original contract, a CreationResult on success and None
      otherwise. Failed checks return whatever self.error() returns.
      NOTE(review): the code visible here never returns a CreationResult
      explicitly; confirm where the success result is produced.
    """
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    # the last path segment names the silo to publish to, e.g. /publish/twitter
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

    # use the first matching source that is enabled for publish. the for/else
    # branch only runs if no source qualified.
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        self.source = source
        break
    else:
      return self.error(
        'Publish is not enabled for your account(s). Please visit %s and sign up!' %
        ' or '.join(s.bridgy_url(self) for s in sources))

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      source_url_parts = urlparse.urlparse(self.source_url())
      if (source_url_parts.netloc == domain_url_parts.netloc and
          source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not source_url_parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = BeautifulSoup(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    resp = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      # an item without a 'type' key is treated as having no types (and so is
      # skipped below) instead of crashing on set(None).
      item_types = set(item.get('type') or [])
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        # a pure feed isn't publishable itself; look at its entries instead
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        # remember every type we tried and couldn't publish
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        # 'as' form is valid on Python 2.6+ and 3, unlike 'except X, e'
        code, body = util.interpret_http_exception(e)
        return self.error('Error: %s %s' % (body or '', e), status=code or 500,
                          mail=True)
def _process_entry(source, permalink, feed_entry, refetch_blanks, preexisting):
  """Looks for syndication links for a single h-entry and records the result.

  The h-entry from the h-feed is checked first. Only if it yields no
  relationships is the permalink itself fetched and parsed, collecting both
  rel-syndication links and u-syndication properties of its h-entries. The
  collected URLs are handed to _process_syndication_urls(). If nothing is
  found and there were no preexisting relationships, a blank SyndicatedPost
  is inserted so the permalink isn't searched again.

  Args:
    source: the source entity these relationships belong to (presumably a
      models.Source subclass; it must provide label())
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts.
    Empty if the post was already processed or nothing was discovered.
  """
  results = {}

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching blanks and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if refetch_blanks and all(not p.syndication for p in preexisting):
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)))

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      # from here on, permalink is whatever get_webmention_target() returned
      permalink, _, type_ok = util.get_webmention_target(permalink)
      if type_ok:
        resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except BaseException:
      # best effort: a failed fetch or parse is logged and treated as
      # "nothing found" rather than aborting.
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

    if parsed:
      syndication_urls = set()
      # tolerate a missing or null 'rels' instead of raising AttributeError
      relsynd = (parsed.get('rels') or {}).get('syndication', [])
      logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case. missing 'items' or 'type' keys
      # are treated as empty rather than raising KeyError.
      for hentry in (item for item in parsed.get('items', [])
                     if 'h-entry' in item.get('type', [])):
        usynd = hentry.get('properties', {}).get('syndication', [])
        logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(source, permalink,
                                          syndication_urls)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    if not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that it %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  logging.debug('discovered relationships %s', results)
  return results