def test_no_accept_header(self):
  self.assertEqual({}, util.request_headers(url='http://foo/bar'))
  self.assertEqual({}, util.request_headers(source=Twitter(id='not-rhiaro')))

  self.expect_requests_get('http://foo/bar', '')
  self.mox.ReplayAll()
  util.requests_get('http://foo/bar')
def test_no_accept_header(self):
  self.assertEquals(util.REQUEST_HEADERS,
                    util.request_headers(url='http://foo/bar'))
  self.assertEquals(util.REQUEST_HEADERS,
                    util.request_headers(source=Twitter(id='not-rhiaro')))

  self.expect_requests_get('http://foo/bar', '', headers=util.REQUEST_HEADERS)
  self.mox.ReplayAll()
  util.requests_get('http://foo/bar')
def test_rhiaro_accept_header(self):
  """Only send Accept header to rhiaro.co.uk right now.

  https://github.com/snarfed/bridgy/issues/713
  """
  self.assertEqual(util.REQUEST_HEADERS_CONNEG,
                   util.request_headers(url='http://rhiaro.co.uk/'))
  self.assertEqual(util.REQUEST_HEADERS_CONNEG,
                   util.request_headers(source=Twitter(id='rhiaro')))

  self.expect_requests_get('http://rhiaro.co.uk/', '',
                           headers=util.REQUEST_HEADERS_CONNEG)
  self.mox.ReplayAll()
  util.requests_get('http://rhiaro.co.uk/')
def test_rhiaro_accept_header(self):
  """Only send Accept header to rhiaro.co.uk right now.

  https://github.com/snarfed/bridgy/issues/713
  """
  self.assertEquals(util.REQUEST_HEADERS_CONNEG,
                    util.request_headers(url='http://rhiaro.co.uk/'))
  self.assertEquals(util.REQUEST_HEADERS_CONNEG,
                    util.request_headers(source=Twitter(id='rhiaro')))

  self.expect_requests_get('http://rhiaro.co.uk/', '',
                           headers=util.REQUEST_HEADERS_CONNEG)
  self.mox.ReplayAll()
  util.requests_get('http://rhiaro.co.uk/')
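# The tests above pin down the contract of util.request_headers(): ordinary
# fetches get no extra headers (or the default REQUEST_HEADERS in the older
# variant), while anything touching rhiaro.co.uk gets the content-negotiation
# headers. The sketch below is NOT Bridgy's implementation, just a minimal
# helper consistent with the newer tests; the header value and the key-based
# source check are assumptions.
from urllib.parse import urlparse

REQUEST_HEADERS_CONNEG = {'Accept': 'text/html, application/mf2+json'}  # assumed value

def request_headers_sketch(url=None, source=None):
  """Return the extra HTTP headers to use when fetching url for source."""
  domain = urlparse(url).netloc if url else None
  source_id = source.key.id() if source else None  # assumes an ndb-style key
  if domain == 'rhiaro.co.uk' or source_id == 'rhiaro':
    return REQUEST_HEADERS_CONNEG
  return {}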
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a reverse
  lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  label = activity.get('url') or activity.get('id')
  logger.debug(f'discovering original posts for: {label}')

  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains,
    include_redirect_sources=include_redirect_sources,
    include_reserved_hosts=DEBUG,
    max_redirect_fetches=MAX_ORIGINAL_CANDIDATES,
    headers=util.request_headers(source=source))

  # only include mentions of the author themselves.
  # (mostly just for Mastodon; other silos' domains are all in the blocklist, so
  # their mention URLs get dropped later anyway.)
  # (these are originally added in Source._inject_user_urls() and in poll step 2.)
  obj = activity.get('object', {})
  other_user_mentions = set(
    t.get('url') for t in obj.get('tags', [])
    if t.get('objectType') == 'person' and t.get('url') not in source.domain_urls)
  originals -= other_user_mentions
  mentions -= other_user_mentions

  # original posts are only from the author themselves
  obj_author = obj.get('author', {})
  activity_author = activity.get('actor', {})
  author_id = obj_author.get('id') or activity_author.get('id')
  author_username = obj_author.get('username') or activity_author.get('username')
  if (author_id and author_id != source.user_tag_id()
      and author_username != source.key.id()):
    logger.info(
      f"Demoting original post links because user {source.user_tag_id()} doesn't match author id {author_id} username {author_username}")
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logger.debug(f"running original post discovery on attachment: {att.get('id')}")
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logger.debug(f'original post discovery found originals for attachment, {att_origs}')
      mentions.update(att_origs)

  if len(originals) > MAX_ORIGINAL_CANDIDATES:
    logging.info(f'{len(originals)} originals, pruning down to {MAX_ORIGINAL_CANDIDATES}')
    originals = sorted(originals)[:MAX_ORIGINAL_CANDIDATES]
  if len(mentions) > MAX_MENTION_CANDIDATES:
    logging.info(f'{len(mentions)} mentions, pruning down to {MAX_MENTION_CANDIDATES}')
    mentions = sorted(mentions)[:MAX_MENTION_CANDIDATES]

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, domain, send = util.get_webmention_target(url)
      if send and domain != source.gr_source.DOMAIN:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logger.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have the
    # best chance of finding a match. Some silos allow several different
    # permalink formats to point to the same place.
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logger.debug(f'no {source.SHORT_NAME} syndication url, cannot process h-entries')

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
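# Hypothetical caller, for illustration only: roughly how a poll-style task
# could feed fetched activities through discover() above and accumulate
# webmention targets. The helper name and the accumulation strategy are made
# up; only discover()'s signature and return value come from the code above.
def collect_webmention_targets(source, activities):
  """Returns the union of original post and mention URLs for activities."""
  targets = set()
  fetched_hfeeds = set()  # shared so each author h-feed is only fetched once
  for activity in activities:
    originals, mentions = discover(source, activity,
                                   already_fetched_hfeeds=fetched_hfeeds)
    targets |= originals | mentions
  return targets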
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.info('Giving up on target URL over %s chars! %s',
                     _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    cache_key = util.webmention_endpoint_cache_key(target)
    cached = util.webmention_endpoint_cache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      headers = util.request_headers(source=self.source)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999, headers=headers):
          error = mention.error
      except BaseException as e:
        logging.info('', stack_info=True)
        error = getattr(mention, 'error')
        if not error:
          error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                   if 'DNS lookup failed for URL:' in str(e)
                   else {'code': 'EXCEPTION'})

    error_code = error['code'] if error else None
    if error_code != 'BAD_TARGET_URL' and not cached:
      val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
      with util.webmention_endpoint_cache_lock:
        util.webmention_endpoint_cache[cache_key] = val

    if error is None:
      logging.info('Sent! %s', mention.response)
      self.record_source_webmention(mention)
      self.entity.sent.append(target)
    else:
      status = error.get('http_status', 0)
      if (error_code == 'NO_ENDPOINT' or
          (error_code == 'BAD_TARGET_URL' and status == 204)):  # No Content
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
      elif status // 100 == 4:
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error, level=logging.INFO)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.info('Propagate task failed')
    self.release('error')
  else:
    self.complete()
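# For reference, the endpoint cache used above holds one of three shapes per
# key: a string endpoint URL when discovery succeeded, an error dict when it
# failed (semi-)permanently, or nothing at all. A toy illustration of how the
# lookup branches; the dict and its keys are made up (real keys come from
# util.webmention_endpoint_cache_key), this is not Bridgy's cache.
toy_cache = {
  'example.com': 'https://example.com/webmention',  # discovery succeeded
  'dead.example': {'code': 'NO_ENDPOINT'},          # failed permanently
}

def classify_cached(cached):
  """Describe what the send loop does with a cached value."""
  if cached is None:
    return 'run discovery, then send'
  if isinstance(cached, dict):
    return 'skip sending, reuse the cached error'
  return f'send directly to {cached}'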
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a reverse
  lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  logging.debug('discovering original posts for: %s',
                activity.get('url') or activity.get('id'))

  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.request_headers(source=source))

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have the
    # best chance of finding a match. Some silos allow several different
    # permalink formats to point to the same place (e.g., facebook user id
    # instead of user name)
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logging.debug('no %s syndication url, cannot process h-entries',
                  source.SHORT_NAME)

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.info('Giving up on target URL over %s chars! %s',
                     _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    cache_key = util.webmention_endpoint_cache_key(target)
    cached = memcache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      headers = util.request_headers(source=self.source)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999, headers=headers):
          error = mention.error
      except BaseException as e:
        logging.info('', exc_info=True)
        error = getattr(mention, 'error')
        if not error:
          error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                   if 'DNS lookup failed for URL:' in str(e)
                   else {'code': 'EXCEPTION'})

    error_code = error['code'] if error else None
    if error_code != 'BAD_TARGET_URL' and not cached:
      val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
      memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

    if error is None:
      logging.info('Sent! %s', mention.response)
      self.record_source_webmention(mention)
      self.entity.sent.append(target)
    else:
      status = error.get('http_status', 0)
      if (error_code == 'NO_ENDPOINT' or
          (error_code == 'BAD_TARGET_URL' and status == 204)):  # No Content
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
      elif status // 100 == 4:
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error, level=logging.INFO)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True,
             already_fetched_hfeeds=None):
  """Augments the standard original_post_discovery algorithm with a reverse
  lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  :class:`models.SyndicatedPost`\ s but will not do posse-post-discovery to
  find new ones.

  Args:
    source: :class:`models.Source` subclass. Changes to property values (e.g.
      domains, domain_urls, last_syndication_url) are stored in source.updates;
      they should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    already_fetched_hfeeds: set, URLs that we have already fetched and run
      posse-post-discovery on, so we can avoid running it multiple times

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  if not source.updates:
    source.updates = {}

  if already_fetched_hfeeds is None:
    already_fetched_hfeeds = set()

  originals, mentions = gr_source.Source.original_post_discovery(
    activity, domains=source.domains, cache=memcache,
    include_redirect_sources=include_redirect_sources,
    headers=util.request_headers(source=source))

  obj = activity.get('object', {})
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
      "Demoting original post links because user %s doesn't match author %s",
      source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      att_origs, _ = discover(
        source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return ((originals, mentions)
            if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
            else (set(), set()))

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndicated = []
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have the
    # best chance of finding a match. Some silos allow several different
    # permalink formats to point to the same place (e.g., facebook user id
    # instead of user name)
    syndication_url = source.canonicalize_url(syndication_url)
    if syndication_url:
      syndicated = _posse_post_discovery(source, activity, syndication_url,
                                         fetch_hfeed, already_fetched_hfeeds)
      originals.update(syndicated)
    originals = set(util.dedupe_urls(originals))

  if not syndication_url:
    logging.debug('no %s syndication url, cannot process h-entries',
                  source.SHORT_NAME)

  return ((originals, mentions)
          if not source.BACKFEED_REQUIRES_SYNDICATION_LINK
          else (set(syndicated), set()))
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.info('Giving up on target URL over %s chars! %s',
                     _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, NO_ENDPOINT if
    # no endpoint was found.
    cache_key = util.webmention_endpoint_cache_key(target)
    endpoint = util.webmention_endpoint_cache.get(cache_key)
    if endpoint:
      logging.info('Using cached webmention endpoint %r: %s', cache_key, endpoint)

    # send! and handle response or error
    try:
      resp = None
      headers = util.request_headers(source=self.source)
      if not endpoint:
        endpoint, resp = webmention.discover(target, headers=headers)
        with util.webmention_endpoint_cache_lock:
          util.webmention_endpoint_cache[cache_key] = endpoint or NO_ENDPOINT

      if endpoint and endpoint != NO_ENDPOINT:
        logging.info('Sending...')
        resp = webmention.send(endpoint, source_url, target, timeout=999,
                               headers=headers)
        logging.info('Sent! %s', resp)
        self.record_source_webmention(endpoint, target)
        self.entity.sent.append(target)
      else:
        logging.info('Giving up this target.')
        self.entity.skipped.append(target)

    except ValueError:
      logging.info('Bad URL; giving up this target.')
      self.entity.skipped.append(target)

    except BaseException as e:
      logging.info('', exc_info=True)
      # Give up on 4XX and DNS errors; we don't expect retries to succeed.
      code, _ = util.interpret_http_exception(e)
      if (code and code.startswith('4')) or 'DNS lookup failed' in str(e):
        logging.info('Giving up this target.')
        self.entity.failed.append(target)
      else:
        self.fail(f'Error sending to endpoint: {resp}', level=logging.INFO)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.info('Propagate task failed')
    self.release('error')
  else:
    self.complete()
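# A stripped-down sketch of the send path above, handy for exercising a single
# target by hand. It mirrors the same discover-then-send calls used in the
# function; the helper name and the bare-bones error handling (exceptions are
# simply left to the caller) are assumptions, not part of Bridgy.
def send_one_webmention(source_url, target, headers=None):
  """Discover target's webmention endpoint and send one webmention to it."""
  endpoint, _ = webmention.discover(target, headers=headers)
  if not endpoint:
    return None  # nothing to send to
  return webmention.send(endpoint, source_url, target, timeout=999,
                         headers=headers)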