def search_for_links(self):
    """Searches G+ for activities linking to this source's web sites.

    Only searches for root domain web site URLs! Skips URLs with paths;
    they tend to generate false positive results in G+'s search. Not sure
    why yet.

    G+ search supports OR:
    https://developers.google.com/+/api/latest/activities/search

    Returns:
      sequence of ActivityStreams activity dicts
    """
    quoted = []
    for url in self.domain_urls:
        # skip blacklisted domains and any URL that isn't a bare root
        if util.in_webmention_blacklist(util.domain_from_link(url)):
            continue
        if urlparse.urlparse(url).path not in ('', '/'):
            continue
        quoted.append('"%s"' % util.fragmentless(url))
    quoted = quoted[:models.MAX_AUTHOR_URLS]

    if not quoted:
        return []

    return self.get_activities(
        search_query=' OR '.join(quoted), group_id=gr_source.SEARCH,
        etag=self.last_activities_etag, fetch_replies=False,
        fetch_likes=False, fetch_shares=False, count=50)
def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and
    uses its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: oauth_dropins.models.BaseAuth
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    # candidate URLs, in priority order: the explicitly-provided URL, then
    # the actor's primary URL, then its other profile URLs
    candidates = util.trim_nulls(util.uniquify(
        [user_url, actor.get('url')] +
        [u.get('value') for u in actor.get('urls', [])]))

    urls = []
    for candidate in candidates:
        domain = util.domain_from_link(candidate)
        if domain and not util.in_webmention_blacklist(domain.lower()):
            urls.append(candidate)

    urls = util.dedupe_urls(urls)
    domains = [util.domain_from_link(u).lower() for u in urls]
    return urls, domains
def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    G+ search supports OR:
    https://developers.google.com/+/api/latest/activities/search

    Returns:
      sequence of ActivityStreams activity dicts
    """
    urls = ['"%s"' % util.fragmentless(url) for url in self.domain_urls
            if not util.in_webmention_blacklist(util.domain_from_link(url))]
    # Bail out early instead of issuing a search with an empty query when
    # there are no eligible URLs. (The other search_for_links
    # implementations in this file already do this.)
    if not urls:
        return []

    return self.get_activities(
        search_query=' OR '.join(urls), group_id=gr_source.SEARCH,
        etag=self.last_activities_etag, fetch_replies=False,
        fetch_likes=False, fetch_shares=False, count=50)
def verify(self, force=False):
    """Checks that this source is ready to be used.

    For blog and listen sources, this fetches their front page HTML and
    discovers their webmention endpoint. For publish sources, this checks
    that they have a domain.

    May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

    Args:
      force: if True, fully verifies (e.g. re-fetches the blog's HTML and
        performs webmention discovery) even we already think this source is
        verified.
    """
    author_urls = [url for url, domain
                   in zip(self.get_author_urls(), self.domains)
                   if not util.in_webmention_blacklist(domain)]
    already_done = self.verified() and not force
    if (already_done or self.status == 'disabled' or not self.features or
            not author_urls):
        return

    url = author_urls[0]
    logging.info('Attempting to discover webmention endpoint on %s', url)
    mention = send.WebmentionSend('https://brid.gy/', url)
    mention.requests_kwargs = {'timeout': HTTP_TIMEOUT,
                               'headers': util.REQUEST_HEADERS}
    try:
        mention._discoverEndpoint()
    except BaseException:
        # best effort: log and record the failure instead of propagating
        logging.info('Error discovering webmention endpoint', exc_info=True)
        mention.error = {'code': 'EXCEPTION'}

    self._fetched_html = getattr(mention, 'html', None)
    error = getattr(mention, 'error', None)
    endpoint = getattr(mention, 'receiver_endpoint', None)
    if error or not endpoint:
        logging.info("No webmention endpoint found: %s %r", error, endpoint)
        self.webmention_endpoint = None
    else:
        logging.info("Discovered webmention endpoint %s", endpoint)
        self.webmention_endpoint = endpoint

    self.put()
def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    Twitter search supports OR:
    https://dev.twitter.com/rest/public/search

    ...but it only returns complete(ish) results if we strip scheme from
    URLs, ie search for example.com instead of http://example.com/, and
    that also returns false positivies, so we check that the returned
    tweets actually have matching links.
    https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
    urls = set(util.fragmentless(url) for url in self.domain_urls
               if not util.in_webmention_blacklist(util.domain_from_link(url)))
    if not urls:
        return []

    query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                        for url in urls)
    candidates = self.get_activities(
        search_query=query, group_id=gr_source.SEARCH,
        etag=self.last_activities_etag, fetch_replies=False,
        fetch_likes=False, fetch_shares=False, count=50)

    # filter out retweets and search false positives that don't actually
    # link to us
    results = []
    for candidate in candidates:
        if candidate.get('verb') == 'share':
            continue
        obj = candidate['object']
        tags = obj.get('tags', [])
        atts = obj.get('attachments', [])
        for url in urls:
            # a match is either a URL in the tweet text or a tag/attachment
            # whose URL starts with one of ours
            if (url in obj.get('content', '') or
                    any(t.get('url', '').startswith(url)
                        for t in tags + atts)):
                # NOTE: removed dead `id = candidate['id']` assignment that
                # shadowed the `id` builtin and was never read.
                results.append(candidate)
                break
    return results
def verify(self, force=False):
    """Checks that this source is ready to be used.

    For blog and listen sources, this fetches their front page HTML and
    discovers their webmention endpoint. For publish sources, this checks
    that they have a domain.

    May be overridden by subclasses, e.g. :class:`tumblr.Tumblr`.

    Args:
      force: if True, fully verifies (e.g. re-fetches the blog's HTML and
        performs webmention discovery) even we already think this source is
        verified.
    """
    # candidate URLs to probe: author URLs whose domain isn't blacklisted
    author_urls = [u for u, d in zip(self.get_author_urls(), self.domains)
                   if not util.in_webmention_blacklist(d)]
    # nothing to do if already verified (unless forced), disabled,
    # feature-less, or there's no usable author URL
    if ((self.verified() and not force) or self.status == 'disabled' or
        not self.features or not author_urls):
        return

    author_url = author_urls[0]
    logging.info('Attempting to discover webmention endpoint on %s', author_url)
    mention = send.WebmentionSend('https://brid.gy/', author_url)
    mention.requests_kwargs = {'timeout': HTTP_TIMEOUT,
                               'headers': util.REQUEST_HEADERS}
    try:
        # calls into WebmentionSend's underscore-prefixed (private) API:
        # only performs endpoint discovery, doesn't send a webmention
        mention._discoverEndpoint()
    except BaseException:
        # best effort: record the failure and fall through to the
        # "no endpoint" branch below instead of propagating
        logging.info('Error discovering webmention endpoint', exc_info=True)
        mention.error = {'code': 'EXCEPTION'}

    # use getattr with defaults since these attributes may not be set when
    # discovery failed early
    self._fetched_html = getattr(mention, 'html', None)
    error = getattr(mention, 'error', None)
    endpoint = getattr(mention, 'receiver_endpoint', None)
    if error or not endpoint:
        logging.info("No webmention endpoint found: %s %r", error, endpoint)
        self.webmention_endpoint = None
    else:
        logging.info("Discovered webmention endpoint %s", endpoint)
        self.webmention_endpoint = endpoint

    self.put()
def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    Twitter search supports OR:
    https://dev.twitter.com/rest/public/search

    ...but it only returns complete(ish) results if we strip scheme from
    URLs, ie search for example.com instead of http://example.com/, and
    that also returns false positivies, so we check that the returned
    tweets actually have matching links.
    https://github.com/snarfed/bridgy/issues/565

    Returns:
      sequence of ActivityStreams activity dicts
    """
    urls = set(util.fragmentless(url) for url in self.domain_urls
               if not util.in_webmention_blacklist(util.domain_from_link(url)))
    if not urls:
        return []

    query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                        for url in urls)
    candidates = self.get_activities(
        search_query=query, group_id=gr_source.SEARCH,
        etag=self.last_activities_etag, fetch_replies=False,
        fetch_likes=False, fetch_shares=False, count=50)

    # filter out retweets and search false positives that don't actually
    # link to us. (dropped the unused `id = candidate['id']` assignment,
    # which shadowed the `id` builtin.)
    results = []
    for candidate in candidates:
        if candidate.get('verb') == 'share':
            continue
        obj = candidate['object']
        tags = obj.get('tags', [])
        atts = obj.get('attachments', [])
        for url in urls:
            if (url in obj.get('content', '') or
                    any(t.get('url', '').startswith(url)
                        for t in tags + atts)):
                results.append(candidate)
                break
    return results
def search_for_links(self):
    """Searches for activities with links to any of this source's web sites.

    Only searches for root domain web site URLs! Skips URLs with paths;
    they tend to generate false positive results in G+'s search. Not sure
    why yet.

    G+ search supports OR:
    https://developers.google.com/+/api/latest/activities/search

    Returns:
      sequence of ActivityStreams activity dicts
    """
    def eligible(url):
        # non-blacklisted domain with a bare root path only
        return (not util.in_webmention_blacklist(util.domain_from_link(url))
                and urlparse.urlparse(url).path in ('', '/'))

    terms = ['"%s"' % util.fragmentless(u)
             for u in self.domain_urls if eligible(u)]
    terms = terms[:models.MAX_AUTHOR_URLS]
    if not terms:
        return []

    return self.get_activities(search_query=' OR '.join(terms),
                               group_id=gr_source.SEARCH,
                               etag=self.last_activities_etag,
                               fetch_replies=False, fetch_likes=False,
                               fetch_shares=False, count=50)
def test_in_webmention_blacklist(self):
    """Blacklisted domains (and their subdomains) match; others don't."""
    blacklisted = ('t.co', 'x.t.co', 'x.y.t.co', 'abc.onion')
    for domain in blacklisted:
        self.assertTrue(util.in_webmention_blacklist(domain), domain)

    allowed = ('snarfed.org', 'www.snarfed.org', 't.co.com')
    for domain in allowed:
        self.assertFalse(util.in_webmention_blacklist(domain), domain)
def test_in_webmention_blacklist(self):
    """Spot-checks util.in_webmention_blacklist matching."""
    # blacklisted domains match, including subdomains of a blacklisted
    # domain (x.t.co, x.y.t.co) and .onion hosts
    for bad in "t.co", "x.t.co", "x.y.t.co", "abc.onion":
        self.assertTrue(util.in_webmention_blacklist(bad), bad)
    # non-blacklisted domains don't match; t.co.com shows that a
    # blacklisted name embedded mid-domain is not treated as a match
    for good in "snarfed.org", "www.snarfed.org", "t.co.com":
        self.assertFalse(util.in_webmention_blacklist(good), good)