Example 1
    def test_no_accept_header(self):
        self.assertEqual({}, util.request_headers(url='http://foo/bar'))
        self.assertEqual({},
                         util.request_headers(source=Twitter(id='not-rhiaro')))

        self.expect_requests_get('http://foo/bar', '')
        self.mox.ReplayAll()
        util.requests_get('http://foo/bar')
Example 2
  def test_no_accept_header(self):
    self.assertEquals(util.REQUEST_HEADERS,
                      util.request_headers(url='http://foo/bar'))
    self.assertEquals(util.REQUEST_HEADERS,
                      util.request_headers(source=Twitter(id='not-rhiaro')))

    self.expect_requests_get('http://foo/bar', '', headers=util.REQUEST_HEADERS)
    self.mox.ReplayAll()
    util.requests_get('http://foo/bar')
Example 3
  def test_rhiaro_accept_header(self):
    """Only send Accept header to rhiaro.co.uk right now.
    https://github.com/snarfed/bridgy/issues/713
    """
    self.assertEqual(util.REQUEST_HEADERS_CONNEG,
                     util.request_headers(url='http://rhiaro.co.uk/'))
    self.assertEqual(util.REQUEST_HEADERS_CONNEG,
                     util.request_headers(source=Twitter(id='rhiaro')))

    self.expect_requests_get('http://rhiaro.co.uk/', '',
                             headers=util.REQUEST_HEADERS_CONNEG)
    self.mox.ReplayAll()
    util.requests_get('http://rhiaro.co.uk/')
Example 4
  def test_rhiaro_accept_header(self):
    """Only send Accept header to rhiaro.co.uk right now.
    https://github.com/snarfed/bridgy/issues/713
    """
    self.assertEquals(util.REQUEST_HEADERS_CONNEG,
                      util.request_headers(url='http://rhiaro.co.uk/'))
    self.assertEquals(util.REQUEST_HEADERS_CONNEG,
                      util.request_headers(source=Twitter(id='rhiaro')))

    self.expect_requests_get('http://rhiaro.co.uk/', '',
                             headers=util.REQUEST_HEADERS_CONNEG)
    self.mox.ReplayAll()
    util.requests_get('http://rhiaro.co.uk/')
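
For orientation, here is a minimal sketch of the behavior these tests pin down, assuming the newer variant (Examples 2-4) where non-rhiaro requests get the plain default headers; in the older variant (Example 1) they got no extra headers at all. The header values, the domain check, and the use of the source entity's key id are assumptions inferred from the assertions, not Bridgy's actual util implementation.

from urllib.parse import urlparse

# Assumed values; the real constants live in Bridgy's util module.
REQUEST_HEADERS = {'User-Agent': 'Bridgy (https://brid.gy/about)'}
REQUEST_HEADERS_CONNEG = dict(REQUEST_HEADERS,
                              Accept='application/mf2+json, text/html; q=0.5')

def request_headers(url=None, source=None):
  """Returns the HTTP headers to send, keyed off the target URL or source.

  Only rhiaro.co.uk gets the content negotiation Accept header right now
  (https://github.com/snarfed/bridgy/issues/713).
  """
  if ((url and urlparse(url).netloc.endswith('rhiaro.co.uk')) or
      (source and source.key.id() == 'rhiaro')):
    return REQUEST_HEADERS_CONNEG
  return REQUEST_HEADERS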
Example 5
def discover(source,
             activity,
             fetch_hfeed=True,
             include_redirect_sources=True,
             already_fetched_hfeeds=None):
    """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to find
  new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
    label = activity.get('url') or activity.get('id')
    logger.debug(f'discovering original posts for: {label}')

    if not source.updates:
        source.updates = {}

    if already_fetched_hfeeds is None:
        already_fetched_hfeeds = set()

    originals, mentions = gr_source.Source.original_post_discovery(
        activity,
        domains=source.domains,
        include_redirect_sources=include_redirect_sources,
        include_reserved_hosts=DEBUG,
        max_redirect_fetches=MAX_ORIGINAL_CANDIDATES,
        headers=util.request_headers(source=source))

    # only include mentions of the author themselves.
    # (mostly just for Mastodon; other silos' domains are all in the blocklist, so
    # their mention URLs get dropped later anyway.)
    # (these are originally added in Source._inject_user_urls() and in poll step 2.)
    obj = activity.get('object', {})
    other_user_mentions = set(
        t.get('url') for t in obj.get('tags', [])
        if t.get('objectType') == 'person'
        and t.get('url') not in source.domain_urls)
    originals -= other_user_mentions
    mentions -= other_user_mentions

    # original posts are only from the author themselves
    obj_author = obj.get('author', {})
    activity_author = activity.get('actor', {})
    author_id = obj_author.get('id') or activity_author.get('id')
    author_username = obj_author.get('username') or activity_author.get(
        'username')
    if (author_id and author_id != source.user_tag_id()
            and author_username != source.key.id()):
        logger.info(
            f"Demoting original post links because user {source.user_tag_id()} doesn't match author id {author_id} username {author_username}"
        )
        # this is someone else's post, so all links must be mentions
        mentions.update(originals)
        originals = set()

    # look for original URL of attachments (e.g. quote tweets)
    for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article')
                and att.get('author', {}).get('id') == source.user_tag_id()):
            logger.debug(
                f"running original post discovery on attachment: {att.get('id')}"
            )
            att_origs, _ = discover(
                source, att, include_redirect_sources=include_redirect_sources)
            logger.debug(
                f'original post discovery found originals for attachment, {att_origs}'
            )
            mentions.update(att_origs)

    if len(originals) > MAX_ORIGINAL_CANDIDATES:
        logging.info(
            f'{len(originals)} originals, pruning down to {MAX_ORIGINAL_CANDIDATES}'
        )
        originals = sorted(originals)[:MAX_ORIGINAL_CANDIDATES]
    if len(mentions) > MAX_MENTION_CANDIDATES:
        logging.info(
            f'{len(mentions)} mentions, pruning down to {MAX_MENTION_CANDIDATES}'
        )
        mentions = sorted(mentions)[:MAX_MENTION_CANDIDATES]

    def resolve(urls):
        resolved = set()
        for url in urls:
            final, domain, send = util.get_webmention_target(url)
            if send and domain != source.gr_source.DOMAIN:
                resolved.add(final)
                if include_redirect_sources:
                    resolved.add(url)
        return resolved

    originals = resolve(originals)
    mentions = resolve(mentions)

    if not source.get_author_urls():
        logger.debug('no author url(s), cannot find h-feed')
        return ((originals, mentions)
                if not source.BACKFEED_REQUIRES_SYNDICATION_LINK else
                (set(), set()))

    # TODO possible optimization: if we've discovered a backlink to a post on the
    # author's domain (i.e., it included a link or citation), then skip the rest
    # of this.
    syndicated = []
    syndication_url = obj.get('url') or activity.get('url')
    if syndication_url:
        # use the canonical syndication url on both sides, so that we have
        # the best chance of finding a match. Some silos allow several
        # different permalink formats to point to the same place.
        syndication_url = source.canonicalize_url(syndication_url)
        if syndication_url:
            syndicated = _posse_post_discovery(source, activity,
                                               syndication_url, fetch_hfeed,
                                               already_fetched_hfeeds)
            originals.update(syndicated)
        originals = set(util.dedupe_urls(originals))

    if not syndication_url:
        logger.debug(
            f'no {source.SHORT_NAME} syndication url, cannot process h-entries'
        )

    return ((originals,
             mentions) if not source.BACKFEED_REQUIRES_SYNDICATION_LINK else
            (set(syndicated), set()))
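
A hypothetical call site for the function above; `source` is assumed to be a stored models.Source subclass (e.g. Twitter) and `activity` an ActivityStreams activity dict from the poll task, neither of which is constructed here.

originals, mentions = discover(source, activity, fetch_hfeed=True)

# originals: resolved links on the author's own domain(s);
# mentions: other resolved webmention targets found in the post.
for target in sorted(originals | mentions):
  print(target)

# discover() stages property changes (e.g. last_syndication_url) in
# source.updates rather than writing them, so the caller is expected to
# persist them transactionally afterwards, per the docstring.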
Example 6
    def do_send_webmentions(self):
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    logging.info('Giving up on target URL over %s chars! %s',
                                 _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info('Webmention from %s to %s', source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, a
            # WebmentionSend error dict if it failed (semi-)permanently, or None.
            cache_key = util.webmention_endpoint_cache_key(target)
            cached = util.webmention_endpoint_cache.get(cache_key)
            if cached:
                logging.info('Using cached webmention endpoint %r: %s',
                             cache_key, cached)

            # send! and handle response or error
            error = None
            if isinstance(cached, dict):
                error = cached
            else:
                mention = send.WebmentionSend(source_url,
                                              target,
                                              endpoint=cached)
                headers = util.request_headers(source=self.source)
                logging.info('Sending...')
                try:
                    if not mention.send(timeout=999, headers=headers):
                        error = mention.error
                except BaseException as e:
                    logging.info('', stack_info=True)
                    error = getattr(mention, 'error')
                    if not error:
                        error = ({
                            'code': 'BAD_TARGET_URL',
                            'http_status': 499
                        } if 'DNS lookup failed for URL:' in str(e) else {
                            'code': 'EXCEPTION'
                        })

            error_code = error['code'] if error else None
            if error_code != 'BAD_TARGET_URL' and not cached:
                val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
                with util.webmention_endpoint_cache_lock:
                    util.webmention_endpoint_cache[cache_key] = val

            if error is None:
                logging.info('Sent! %s', mention.response)
                self.record_source_webmention(mention)
                self.entity.sent.append(target)
            else:
                status = error.get('http_status', 0)
                if (error_code == 'NO_ENDPOINT'
                        or (error_code == 'BAD_TARGET_URL'
                            and status == 204)):  # No Content
                    logging.info('Giving up this target. %s', error)
                    self.entity.skipped.append(target)
                elif status // 100 == 4:
                    # Give up on 4XX errors; we don't expect later retries to succeed.
                    logging.info('Giving up this target. %s', error)
                    self.entity.failed.append(target)
                else:
                    self.fail('Error sending to endpoint: %s' % error,
                              level=logging.INFO)
                    self.entity.error.append(target)

            if target in self.entity.unsent:
                self.entity.unsent.remove(target)

        if self.entity.error:
            logging.info('Propagate task failed')
            self.release('error')
        else:
            self.complete()
Example 7
def discover(source,
             activity,
             fetch_hfeed=True,
             include_redirect_sources=True,
             already_fetched_hfeeds=None):
    """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to find new
  ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
    logging.debug('discovering original posts for: %s',
                  activity.get('url') or activity.get('id'))

    if not source.updates:
        source.updates = {}

    if already_fetched_hfeeds is None:
        already_fetched_hfeeds = set()

    originals, mentions = gr_source.Source.original_post_discovery(
        activity,
        domains=source.domains,
        cache=memcache,
        include_redirect_sources=include_redirect_sources,
        headers=util.request_headers(source=source))

    obj = activity.get('object', {})
    author_id = obj.get('author', {}).get('id') or activity.get('author',
                                                                {}).get('id')
    if author_id and author_id != source.user_tag_id():
        logging.info(
            "Demoting original post links because user %s doesn't match author %s",
            source.user_tag_id(), author_id)
        # this is someone else's post, so all links must be mentions
        mentions.update(originals)
        originals = set()

    # look for original URL of attachments (e.g. quote tweets)
    for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article')
                and att.get('author', {}).get('id') == source.user_tag_id()):
            logging.debug('running original post discovery on attachment: %s',
                          att.get('id'))
            att_origs, _ = discover(
                source, att, include_redirect_sources=include_redirect_sources)
            logging.debug(
                'original post discovery found originals for attachment, %s',
                att_origs)
            mentions.update(att_origs)

    def resolve(urls):
        resolved = set()
        for url in urls:
            final, _, send = util.get_webmention_target(url)
            if send:
                resolved.add(final)
                if include_redirect_sources:
                    resolved.add(url)
        return resolved

    originals = resolve(originals)
    mentions = resolve(mentions)

    if not source.get_author_urls():
        logging.debug('no author url(s), cannot find h-feed')
        return ((originals, mentions)
                if not source.BACKFEED_REQUIRES_SYNDICATION_LINK else
                (set(), set()))

    # TODO possible optimization: if we've discovered a backlink to a post on the
    # author's domain (i.e., it included a link or citation), then skip the rest
    # of this.
    syndicated = []
    syndication_url = obj.get('url') or activity.get('url')
    if syndication_url:
        # use the canonical syndication url on both sides, so that we have
        # the best chance of finding a match. Some silos allow several
        # different permalink formats to point to the same place (e.g.,
        # facebook user id instead of user name)
        syndication_url = source.canonicalize_url(syndication_url)
        if syndication_url:
            syndicated = _posse_post_discovery(source, activity,
                                               syndication_url, fetch_hfeed,
                                               already_fetched_hfeeds)
            originals.update(syndicated)
        originals = set(util.dedupe_urls(originals))

    if not syndication_url:
        logging.debug('no %s syndication url, cannot process h-entries',
                      source.SHORT_NAME)

    return ((originals,
             mentions) if not source.BACKFEED_REQUIRES_SYNDICATION_LINK else
            (set(syndicated), set()))
Example 8
  def do_send_webmentions(self):
    urls = self.entity.unsent + self.entity.error + self.entity.failed
    unsent = set()
    self.entity.error = []
    self.entity.failed = []

    for orig_url in urls:
      # recheck the url here since the checks may have failed during the poll
      # or streaming add.
      url, domain, ok = util.get_webmention_target(orig_url)
      if ok:
        if len(url) <= _MAX_STRING_LENGTH:
          unsent.add(url)
        else:
          logging.info('Giving up on target URL over %s chars! %s',
                       _MAX_STRING_LENGTH, url)
          self.entity.failed.append(orig_url)
    self.entity.unsent = sorted(unsent)

    while self.entity.unsent:
      target = self.entity.unsent.pop(0)
      source_url = self.source_url(target)
      logging.info('Webmention from %s to %s', source_url, target)

      # see if we've cached webmention discovery for this domain. the cache
      # value is a string URL endpoint if discovery succeeded, a
      # WebmentionSend error dict if it failed (semi-)permanently, or None.
      cache_key = util.webmention_endpoint_cache_key(target)
      cached = memcache.get(cache_key)
      if cached:
        logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

      # send! and handle response or error
      error = None
      if isinstance(cached, dict):
        error = cached
      else:
        mention = send.WebmentionSend(source_url, target, endpoint=cached)
        headers = util.request_headers(source=self.source)
        logging.info('Sending...')
        try:
          if not mention.send(timeout=999, headers=headers):
            error = mention.error
        except BaseException as e:
          logging.info('', exc_info=True)
          error = getattr(mention, 'error')
          if not error:
            error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                     if 'DNS lookup failed for URL:' in str(e)
                     else {'code': 'EXCEPTION'})

      error_code = error['code'] if error else None
      if error_code != 'BAD_TARGET_URL' and not cached:
        val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
        memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

      if error is None:
        logging.info('Sent! %s', mention.response)
        self.record_source_webmention(mention)
        self.entity.sent.append(target)
      else:
        status = error.get('http_status', 0)
        if (error_code == 'NO_ENDPOINT' or
            (error_code == 'BAD_TARGET_URL' and status == 204)):  # No Content
          logging.info('Giving up this target. %s', error)
          self.entity.skipped.append(target)
        elif status // 100 == 4:
          # Give up on 4XX errors; we don't expect later retries to succeed.
          logging.info('Giving up this target. %s', error)
          self.entity.failed.append(target)
        else:
          self.fail('Error sending to endpoint: %s' % error, level=logging.INFO)
          self.entity.error.append(target)

      if target in self.entity.unsent:
        self.entity.unsent.remove(target)
Example 9
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to find new
  ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.request_headers(source=source))

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions) if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logging.debug('no %s syndication url, cannot process h-entries', source.SHORT_NAME)

  return ((originals, mentions) if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
Example 10
    def do_send_webmentions(self):
        urls = self.entity.unsent + self.entity.error + self.entity.failed
        unsent = set()
        self.entity.error = []
        self.entity.failed = []

        for orig_url in urls:
            # recheck the url here since the checks may have failed during the poll
            # or streaming add.
            url, domain, ok = util.get_webmention_target(orig_url)
            if ok:
                if len(url) <= _MAX_STRING_LENGTH:
                    unsent.add(url)
                else:
                    logging.info('Giving up on target URL over %s chars! %s',
                                 _MAX_STRING_LENGTH, url)
                    self.entity.failed.append(orig_url)
        self.entity.unsent = sorted(unsent)

        while self.entity.unsent:
            target = self.entity.unsent.pop(0)
            source_url = self.source_url(target)
            logging.info('Webmention from %s to %s', source_url, target)

            # see if we've cached webmention discovery for this domain. the cache
            # value is a string URL endpoint if discovery succeeded, NO_ENDPOINT if
            # no endpoint was found.
            cache_key = util.webmention_endpoint_cache_key(target)
            endpoint = util.webmention_endpoint_cache.get(cache_key)
            if endpoint:
                logging.info('Using cached webmention endpoint %r: %s',
                             cache_key, endpoint)

            # send! and handle response or error
            try:
                resp = None
                headers = util.request_headers(source=self.source)
                if not endpoint:
                    endpoint, resp = webmention.discover(target,
                                                         headers=headers)
                    with util.webmention_endpoint_cache_lock:
                        util.webmention_endpoint_cache[
                            cache_key] = endpoint or NO_ENDPOINT

                if endpoint and endpoint != NO_ENDPOINT:
                    logging.info('Sending...')
                    resp = webmention.send(endpoint,
                                           source_url,
                                           target,
                                           timeout=999,
                                           headers=headers)
                    logging.info('Sent! %s', resp)
                    self.record_source_webmention(endpoint, target)
                    self.entity.sent.append(target)
                else:
                    logging.info('Giving up this target.')
                    self.entity.skipped.append(target)

            except ValueError:
                logging.info('Bad URL; giving up this target.')
                self.entity.skipped.append(target)

            except BaseException as e:
                logging.info('', exc_info=True)
                # Give up on 4XX and DNS errors; we don't expect retries to succeed.
                code, _ = util.interpret_http_exception(e)
                if (code and
                        code.startswith('4')) or 'DNS lookup failed' in str(e):
                    logging.info('Giving up this target.')
                    self.entity.failed.append(target)
                else:
                    self.fail(f'Error sending to endpoint: {resp}',
                              level=logging.INFO)
                    self.entity.error.append(target)

            if target in self.entity.unsent:
                self.entity.unsent.remove(target)

        if self.entity.error:
            logging.info('Propagate task failed')
            self.release('error')
        else:
            self.complete()
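
The last snippet leans on cache plumbing defined elsewhere in Bridgy's util module (webmention_endpoint_cache, its lock, the key helper, and NO_ENDPOINT). A rough sketch of what it assumes is below; the TTLCache choice, the size and TTL, and the key format are illustrative assumptions, not the real configuration.

import threading
from urllib.parse import urlparse

from cachetools import TTLCache

NO_ENDPOINT = 'NO_ENDPOINT'  # sentinel cached when discovery finds no endpoint

# Shared in-process cache of webmention discovery results, guarded by a lock
# since propagate tasks may run concurrently (assumed size and TTL).
webmention_endpoint_cache = TTLCache(maxsize=5000, ttl=2 * 60 * 60)
webmention_endpoint_cache_lock = threading.Lock()

def webmention_endpoint_cache_key(url):
  """One cache entry per scheme and host, so discovery runs once per site (assumed)."""
  parsed = urlparse(url)
  return f'W {parsed.scheme} {parsed.netloc}'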