def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  urls = []
  for url in util.trim_nulls(util.uniquify(
      [user_url] + [actor.get('url')] +
      [u.get('value') for u in actor.get('urls', [])])):
    domain = util.domain_from_link(url)
    if domain and not util.in_webmention_blacklist(domain.lower()):
      urls.append(url)

  urls = util.dedupe_urls(urls)
  domains = [util.domain_from_link(url).lower() for url in urls]
  return urls, domains
def test_domain_from_link(self):
  self.assertEqual('localhost', util.domain_from_link('http://localhost/foo'))
  self.assertEqual('a.b.c.d', util.domain_from_link('http://a.b.c.d/foo'))
  for good_link in ('asdf.com', 'www.asdf.com', 'https://asdf.com/',
                    'asdf.com/foo?bar#baz'):
    actual = util.domain_from_link(good_link)
    self.assertEqual('asdf.com', actual, '%s returned %s' % (good_link, actual))

  self.assertEqual('asdf.com.', util.domain_from_link('http://asdf.com./x'))

  for bad_link in '', ' ', 'a&b.com', 'http://', 'file:///':
    self.assertEqual(None, util.domain_from_link(bad_link))
def test_domain_from_link(self):
  self.assertEqual('localhost', util.domain_from_link('http://localhost/foo'))
  self.assertEqual('a.b.c.d', util.domain_from_link('http://a.b.c.d/foo'))
  for good_link in ('asdf.com', 'www.asdf.com', 'https://asdf.com/',
                    'asdf.com/foo?bar#baz', 'm.asdf.com', 'asdf.com:1234',
                    'mobile.asdf.com/foo?bar#baz', '//asdf.com/foo/bar',
                    'https://m.asdf.com/foo?bar#baz'):
    actual = util.domain_from_link(good_link)
    self.assertEqual('asdf.com', actual, '%s returned %s' % (good_link, actual))

  self.assertEqual('asdf.com.', util.domain_from_link('http://asdf.com./x'))

  for bad_link in '', ' ', 'a&b.com', 'http://', 'file:///':
    self.assertEqual(None, util.domain_from_link(bad_link))
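# The two tests above pin down domain_from_link()'s observable behavior:
# scheme-optional parsing, stripping www./m./mobile. prefixes, ignoring ports,
# paths, queries, and fragments, preserving a trailing dot, and returning None
# for unparseable input. A rough self-contained sketch of that contract (not
# Bridgy's real implementation, which lives in oauth-dropins' webutil):

import re

def domain_from_link_sketch(url):
  """Extracts the host from a URL-ish string; returns None if unparseable."""
  if not url:
    return None
  url = url.strip()
  # split off an explicit scheme, if any; only http(s) is supported
  match = re.match(r'^([A-Za-z][A-Za-z0-9+.-]*):(?=//)', url)
  if match:
    if match.group(1) not in ('http', 'https'):
      return None  # e.g. file:///
    url = url[match.end():]
  if url.startswith('//'):  # scheme-relative
    url = url[2:]
  # host is everything up to the first path/query/fragment/port delimiter
  host = url.split('/')[0].split('?')[0].split('#')[0].split(':')[0]
  for prefix in ('www.', 'm.', 'mobile.'):
    if host.startswith(prefix):
      host = host[len(prefix):]
  if host and re.match(r'^[A-Za-z0-9.-]+$', host):
    return host
  return None

assert domain_from_link_sketch('https://m.asdf.com/foo?bar#baz') == 'asdf.com'
assert domain_from_link_sketch('file:///') is None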
def urls_and_domains(self, auth_entity, user_url, actor=None,
                     resolve_source_domain=True):
  """Returns this user's valid (not webmention-blocklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing
    actor: dict, optional AS actor for the user. If provided, overrides
      auth_entity
    resolve_source_domain: boolean, whether to follow redirects on URLs on
      this source's domain

  Returns:
    ([string url, ...], [string domain, ...])
  """
  if not actor:
    actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
  logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

  urls = []
  for i, url in enumerate(candidates):
    on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
    resolve = ((resolve_source_domain or not on_source_domain)
               and i < MAX_AUTHOR_URLS)
    resolved = self.resolve_profile_url(url, resolve=resolve)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blocklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
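# The candidate pipeline above leans on a couple of webutil helpers. As a
# mental model, simplified sketches (the real versions also handle dicts,
# fragments, and www/scheme variants when deduping):

def uniquify_sketch(seq):
  """Drops duplicates, preserving order."""
  seen = set()
  return [x for x in seq if not (x in seen or seen.add(x))]

def trim_nulls_sketch(seq):
  """Drops None and empty values."""
  return [x for x in seq if x]

candidates = trim_nulls_sketch(uniquify_sketch(
  ['https://a.com/', None, 'https://a.com/', '', 'https://b.org/']))
assert candidates == ['https://a.com/', 'https://b.org/']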
def check_token_for_actor(self, actor):
  """Checks that the given actor is public and matches the request's token.

  Raises: :class:`HTTPException` with HTTP 400
  """
  if not actor:
    self.abort(400, 'Missing actor!')

  if not gr_source.Source.is_public(actor):
    self.abort(
      400,
      f'Your {self.gr_source().NAME} account is private. Bridgy only supports public accounts.')

  token = util.get_required_param(self, 'token')
  domains = set(
    util.domain_from_link(util.replace_test_domains_with_localhost(u))
    for u in microformats2.object_urls(actor))
  domains.discard(self.source_class().GR_CLASS.DOMAIN)

  logging.info(f'Checking token against domains {domains}')
  for domain in ndb.get_multi(ndb.Key(Domain, d) for d in domains):
    if domain and token in domain.tokens:
      return

  self.abort(403, f'Token {token} is not authorized for any of: {domains}')
def finish(self, auth_entity, state=None):
  if not auth_entity:
    util.maybe_add_or_delete_source(Tumblr, auth_entity, state)
    return

  vars = {
    'action': '/tumblr/add',
    'state': state,
    'auth_entity_key': auth_entity.key.urlsafe().decode(),
    'blogs': [
      {'id': b['name'],
       'title': b.get('title', ''),
       'domain': util.domain_from_link(b['url'])}
      # user_json is the user/info response:
      # http://www.tumblr.com/docs/en/api/v2#user-methods
      for b in json_loads(auth_entity.user_json)['user']['blogs']
      if b.get('name') and b.get('url')],
  }
  logger.info(f'Rendering choose_blog.html with {vars}')
  return render_template('choose_blog.html', **vars)
def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new SyndicatedPost in the db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    if util.domain_from_link(syndication_url) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=syndication_url, original=permalink)
      results.setdefault(syndication_url, []).append(relationship)

  return results
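# To make the matching step concrete: after redirects and canonicalization,
# only URLs whose domain equals the source silo's domain are stored. A toy
# illustration with a stubbed source (hypothetical names, reusing the
# domain_from_link_sketch() above):

class FakeTwitterSource:
  class AS_CLASS:
    DOMAIN = 'twitter.com'

  def canonicalize_syndication_url(self, url):
    return url.replace('//mobile.twitter.com/', '//twitter.com/')

source = FakeTwitterSource()
candidates = ['https://mobile.twitter.com/alice/status/123',
              'https://example.com/not-a-silo-post']
matches = [u for u in map(source.canonicalize_syndication_url, candidates)
           if domain_from_link_sketch(u) == source.AS_CLASS.DOMAIN]
assert matches == ['https://twitter.com/alice/status/123']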
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def post(self):
  # load source
  try:
    source = ndb.Key(urlsafe=util.get_required_param(self, 'source_key')).get()
    if not source:
      self.abort(400, 'Source key not found')
  except ProtocolBufferDecodeError:
    logging.exception('Bad value for source_key')
    self.abort(400, 'Bad value for source_key')

  # validate URL, find silo post
  url = util.get_required_param(self, 'url')
  domain = util.domain_from_link(url)
  msg = 'Discovering now. Refresh in a minute to see the results!'

  if domain == source.GR_CLASS.DOMAIN:
    post_id = source.GR_CLASS.post_id(url)
    util.add_discover_task(source, post_id)
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, source.GR_CLASS.post_id(link))
    else:
      msg = 'Failed to fetch %s or find a %s syndication link.' % (
        util.pretty_link(url), source.GR_CLASS.NAME)
  else:
    msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

  self.messages.add(msg)
  self.redirect(source.bridgy_url(self))
def post(self):
  source = self.load_source()

  # validate URL, find silo post
  url = util.get_required_param(self, 'url')
  domain = util.domain_from_link(url)
  path = urllib.parse.urlparse(url).path
  msg = 'Discovering now. Refresh in a minute to see the results!'

  if domain == source.GR_CLASS.DOMAIN:
    post_id = source.GR_CLASS.post_id(url)
    if post_id:
      type = 'event' if path.startswith('/events/') else None
      util.add_discover_task(source, post_id, type=type)
    else:
      msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(
      source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, source.GR_CLASS.post_id(link))
      source.updates = {'last_syndication_url': util.now_fn()}
      models.Source.put_updates(source)
    else:
      msg = 'Failed to fetch %s or find a %s syndication link.' % (
        util.pretty_link(url), source.GR_CLASS.NAME)
  else:
    msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

  self.messages.add(msg)
  self.redirect(source.bridgy_url(self))
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Only searches for root domain web site URLs! Skips URLs with paths; they
  tend to generate false positive results in G+'s search. Not sure why yet.

  G+ search supports OR:
  https://developers.google.com/+/api/latest/activities/search

  Returns: sequence of ActivityStreams activity dicts
  """
  urls = ['"%s"' % util.fragmentless(url) for url in self.domain_urls
          if not util.in_webmention_blacklist(util.domain_from_link(url))
          and urlparse.urlparse(url).path in ('', '/')
          ][:models.MAX_AUTHOR_URLS]
  if urls:
    return self.get_activities(
      search_query=' OR '.join(urls), group_id=gr_source.SEARCH,
      etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
      fetch_shares=False, count=50)

  return []
def discover():
  source = util.load_source()

  # validate URL, find silo post
  url = request.form['url']
  domain = util.domain_from_link(url)
  path = urllib.parse.urlparse(url).path
  msg = 'Discovering now. Refresh in a minute to see the results!'

  gr_source = source.gr_source
  if domain == gr_source.DOMAIN:
    post_id = gr_source.post_id(url)
    if post_id:
      type = 'event' if path.startswith('/events/') else None
      util.add_discover_task(source, post_id, type=type)
    else:
      msg = f"Sorry, that doesn't look like a {gr_source.NAME} post URL."
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(
      source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, gr_source.post_id(link))
      source.updates = {'last_syndication_url': util.now_fn()}
      models.Source.put_updates(source)
    else:
      msg = f'Failed to fetch {util.pretty_link(url)} or find a {gr_source.NAME} syndication link.'
  else:
    msg = f'Please enter a URL on either your web site or {gr_source.NAME}.'

  flash(msg)
  return redirect(source.bridgy_url())
def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  # Fetch blog's site info
  auth_domain = auth_entity.key.id()
  site_info = json.loads(auth_entity.urlopen(
    API_SITE_URL % auth_entity.blog_id).read())

  site_url = site_info.get('URL')
  if site_url:
    domains = [util.domain_from_link(site_url), auth_domain]
    urls = [site_url, auth_entity.blog_url]
  else:
    domains = [auth_domain]
    urls = [auth_entity.blog_url]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def canonicalize_url(self, url, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urllib.parse.urlparse(url)
  params = urllib.parse.parse_qs(parsed.query)
  path = parsed.path.strip('/').split('/')
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  post_id = ids[0] if ids else url_id
  if post_id:
    url = post_url(post_id)

  url = url.replace('facebook.com/%s/' % self.username,
                    'facebook.com/%s/' % self.key.id())

  return super(Facebook, self).canonicalize_url(url)
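# For intuition, the kinds of rewrites canonicalize_url() performs
# (illustrative examples only; 212038 is a made-up numeric user id):
#
#   photo permalink with an fbid query param:
#     https://www.facebook.com/photo.php?fbid=314159
#     -> https://www.facebook.com/212038/posts/314159
#   story permalink with a story_fbid query param:
#     https://www.facebook.com/story.php?story_fbid=271828&id=212038
#     -> https://www.facebook.com/212038/posts/271828
#   username swapped for the numeric id the source is keyed by:
#     https://www.facebook.com/snarfed.org/posts/314159
#     -> https://www.facebook.com/212038/posts/314159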
def post(self):
  source = self.load_source()

  # validate URL, find silo post
  url = util.get_required_param(self, 'url')
  domain = util.domain_from_link(url)
  path = urlparse.urlparse(url).path
  msg = 'Discovering now. Refresh in a minute to see the results!'

  if domain == source.GR_CLASS.DOMAIN:
    post_id = source.GR_CLASS.post_id(url)
    if post_id:
      type = 'event' if path.startswith('/events/') else None
      util.add_discover_task(source, post_id, type=type)
    else:
      msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME
  elif util.domain_or_parent_in(domain, source.domains):
    synd_links = original_post_discovery.process_entry(source, url, {}, False, [])
    if synd_links:
      for link in synd_links:
        util.add_discover_task(source, source.GR_CLASS.post_id(link))
      source.updates = {'last_syndication_url': util.now_fn()}
      models.Source.put_updates(source)
    else:
      msg = 'Failed to fetch %s or find a %s syndication link.' % (
        util.pretty_link(url), source.GR_CLASS.NAME)
  else:
    msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

  self.messages.add(msg)
  self.redirect(source.bridgy_url(self))
def finish(self, auth_entity, state=None):
  if not auth_entity:
    self.maybe_add_or_delete_source(Tumblr, auth_entity, state)
    return

  vars = {
    'action': '/tumblr/add',
    'state': state,
    'auth_entity_key': auth_entity.key.urlsafe().decode(),
    'blogs': [
      {'id': b['name'],
       'title': b.get('title', ''),
       'domain': util.domain_from_link(b['url'])}
      # user_json is the user/info response:
      # http://www.tumblr.com/docs/en/api/v2#user-methods
      for b in json_loads(auth_entity.user_json)['user']['blogs']
      if b.get('name') and b.get('url')],
  }
  logging.info('Rendering choose_blog.html with %s', vars)

  self.response.headers['Content-Type'] = 'text/html'
  self.response.out.write(
    JINJA_ENV.get_template('choose_blog.html').render(**vars))
def post(self):
  source = self.load_source()
  redirect_url = '%s?%s' % (self.request.path, urllib.parse.urlencode({
    'source_key': source.key.urlsafe().decode(),
  }))

  add = self.request.get('add')
  delete = self.request.get('delete')
  if (add and delete) or (not add and not delete):
    self.abort(400, 'Either add or delete param (but not both) required')

  link = util.pretty_link(add or delete)

  if add:
    resolved = Source.resolve_profile_url(add)
    if resolved:
      if resolved in source.domain_urls:
        self.messages.add('%s already exists.' % link)
      else:
        source.domain_urls.append(resolved)
        domain = util.domain_from_link(resolved)
        source.domains.append(domain)
        source.put()
        self.messages.add('Added %s.' % link)
    else:
      self.messages.add(
        "%s doesn't look like your web site. Try again?" % link)

  else:
    assert delete
    try:
      source.domain_urls.remove(delete)
    except ValueError:
      self.abort(400, "%s not found in %s's current web sites" % (
        delete, source.label()))

    domain = util.domain_from_link(delete)
    if domain not in set(util.domain_from_link(url)
                         for url in source.domain_urls):
      source.domains.remove(domain)
    source.put()
    self.messages.add('Removed %s.' % link)

  self.redirect(redirect_url)
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost
  tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow
    # redirects and fetch link contents, and this handler should be small and
    # fast and try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    logging.info('Found links: %s', links)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=links)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=links)

    bp.get_or_save()
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost
  tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow
    # redirects and fetch link contents, and this handler should be small and
    # fast and try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s',
                     _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)

    bp.get_or_save()
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urllib.parse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please '
        '<a href="https://indieauth.com/setup">add an Instagram rel-me '
        'link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    try:
      actor = gr_instagram.Instagram(scrape=True).get_actor(
        username, ignore_rate_limit=True)
    except Exception as e:
      code, _ = util.interpret_http_exception(e)
      if code in Instagram.RATE_LIMIT_HTTP_CODES:
        self.messages.add(
          '<a href="https://github.com/snarfed/bridgy/issues/665#issuecomment-524977427">'
          'Apologies, Instagram is temporarily blocking us.</a> '
          'Please try again later!')
        return self.redirect('/')
      else:
        raise

    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or bio field "
        'and try again.' % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def add_or_update_domain():
  domain = Domain.get_or_insert(util.domain_from_link(
    util.replace_test_domains_with_localhost(auth_entity.key.id())))
  domain.auth = auth_entity.key
  if state not in domain.tokens:
    domain.tokens.append(state)
  domain.put()

  flash(f'Authorized you for {domain.key.id()}.')
def edit_websites_post():
  source = util.load_source()
  redirect_url = f'{request.path}?{urllib.parse.urlencode({"source_key": source.key.urlsafe().decode()})}'

  add = request.values.get('add')
  delete = request.values.get('delete')
  if (add and delete) or (not add and not delete):
    error('Either add or delete param (but not both) required')

  link = util.pretty_link(add or delete)

  if add:
    resolved = Source.resolve_profile_url(add)
    if resolved:
      if resolved in source.domain_urls:
        flash(f'{link} already exists.')
      else:
        source.domain_urls.append(resolved)
        domain = util.domain_from_link(resolved)
        source.domains.append(domain)
        source.put()
        flash(f'Added {link}.')
    else:
      flash(f"{link} doesn't look like your web site. Try again?")

  else:
    assert delete
    try:
      source.domain_urls.remove(delete)
    except ValueError:
      error(f"{delete} not found in {source.label()}'s current web sites")

    domain = util.domain_from_link(delete)
    if domain not in {util.domain_from_link(url)
                      for url in source.domain_urls}:
      source.domains.remove(domain)
    source.put()
    flash(f'Removed {link}.')

  return redirect(redirect_url)
def post(self):
  source = self.load_source()
  redirect_url = '%s?%s' % (self.request.path, urllib.urlencode({
    'source_key': source.key.urlsafe(),
  }))

  add = self.request.get('add')
  delete = self.request.get('delete')
  if (add and delete) or (not add and not delete):
    self.abort(400, 'Either add or delete param (but not both) required')

  link = util.pretty_link(add or delete)

  if add:
    resolved = Source.resolve_profile_url(add)
    if resolved:
      if resolved in source.domain_urls:
        self.messages.add('%s already exists.' % link)
      else:
        source.domain_urls.append(resolved)
        domain = util.domain_from_link(resolved)
        source.domains.append(domain)
        source.put()
        self.messages.add('Added %s.' % link)
    else:
      self.messages.add("%s doesn't look like your web site. Try again?" % link)

  else:
    assert delete
    try:
      source.domain_urls.remove(delete)
    except ValueError:
      self.abort(400, "%s not found in %s's current web sites" % (
        delete, source.label()))

    domain = util.domain_from_link(delete)
    if domain not in set(util.domain_from_link(url)
                         for url in source.domain_urls):
      source.domains.remove(domain)
    source.put()
    self.messages.add('Removed %s.' % link)

  self.redirect(redirect_url)
def post(self):
  logging.debug('Params: %s', self.request.params)
  if self.lease(ndb.Key(urlsafe=self.request.params['key'])):
    # skip "self" links to this blog's domain
    source_domains = self.entity.source.get().domains
    to_send = set()
    for url in self.entity.unsent:
      link_domain = util.domain_from_link(url)
      if link_domain and link_domain not in source_domains:
        to_send.add(url)

    self.entity.unsent = list(to_send)
    self.send_webmentions()
def finish(self, auth_entity, state=None):
  if not auth_entity:
    return

  assert state

  domain = Domain.get_or_insert(util.domain_from_link(auth_entity.key.id()))
  domain.auth = auth_entity.key
  if state not in domain.tokens:
    domain.tokens.append(state)
  domain.put()

  self.messages.add(f'Authorized you for {domain.key.id()}.')
  self.redirect('/')
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  G+ search supports OR:
  https://developers.google.com/+/api/latest/activities/search

  Returns: sequence of ActivityStreams activity dicts
  """
  query = ' OR '.join(
    '"%s"' % util.fragmentless(url) for url in self.domain_urls
    if not util.in_webmention_blacklist(util.domain_from_link(url)))
  return self.get_activities(
    search_query=query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def _url_and_domain(auth_entity, blog_name=None):
  """Returns the blog URL and domain.

  Args:
    auth_entity: oauth_dropins.tumblr.TumblrAuth
    blog_name: which blog. optional. matches the 'name' field for one of the
      blogs in auth_entity.user_json['user']['blogs'].

  Returns:
    (string url, string domain, boolean ok)
  """
  for blog in json.loads(auth_entity.user_json).get('user', {}).get('blogs', []):
    if ((blog_name and blog_name == blog.get('name')) or
        (not blog_name and blog.get('primary'))):
      return blog['url'], util.domain_from_link(blog['url']), True

  return None, None, False
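# The relevant shape of auth_entity.user_json here, abridged from Tumblr's
# user/info API response (only the fields this function reads):

user_json = {
  'user': {
    'blogs': [
      {'name': 'primaryblog', 'primary': True,
       'title': 'My Blog', 'url': 'http://primaryblog.tumblr.com/'},
      {'name': 'sideblog', 'url': 'http://sideblog.tumblr.com/'},
    ],
  },
}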
def _urls_and_domains(auth_entity, blog_name=None):
  """Returns this blog's URL and domain.

  Args:
    auth_entity: oauth_dropins.tumblr.TumblrAuth
    blog_name: which blog. optional. matches the 'name' field for one of the
      blogs in auth_entity.user_json['user']['blogs'].

  Returns:
    ([string url], [string domain])
  """
  for blog in json.loads(auth_entity.user_json).get('user', {}).get('blogs', []):
    if ((blog_name and blog_name == blog.get('name')) or
        (not blog_name and blog.get('primary'))):
      return [blog['url']], [util.domain_from_link(blog['url']).lower()]

  return [], []
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Twitter search supports OR:
  https://dev.twitter.com/rest/public/search

  ...but it only returns complete(ish) results if we strip scheme from URLs,
  ie search for example.com instead of http://example.com/, and that also
  returns false positives, so we check that the returned tweets actually have
  matching links. https://github.com/snarfed/bridgy/issues/565

  Returns: sequence of ActivityStreams activity dicts
  """
  urls = set(util.fragmentless(url) for url in self.domain_urls
             if not util.in_webmention_blacklist(util.domain_from_link(url)))
  if not urls:
    return []

  query = ' OR '.join('"%s"' % util.schemeless(url, slashes=False)
                      for url in urls)
  candidates = self.get_activities(
    search_query=query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)

  # filter out retweets and search false positives that don't actually link to us
  results = []
  for candidate in candidates:
    if candidate.get('verb') == 'share':
      continue
    obj = candidate['object']
    tags = obj.get('tags', [])
    atts = obj.get('attachments', [])
    for url in urls:
      if (url in obj.get('content', '') or
          any(t.get('url', '').startswith(url) for t in tags + atts)):
        id = candidate['id']
        results.append(candidate)
        break

  return results
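# Sketch of the query the Twitter version builds: fragmentless() strips
# #fragments, and schemeless(url, slashes=False) drops the scheme and
# trailing slash, so one quoted term matches both http and https links.
# For example (stand-in for schemeless, just for illustration):

urls = {'https://example.com/', 'http://foo.bar/baz'}
query = ' OR '.join('"%s"' % u.split('://')[1].rstrip('/') for u in sorted(urls))
assert query == '"foo.bar/baz" OR "example.com"'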
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates BlogPost entities and adds propagate-blogpost tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.warning('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.warning("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    source.preprocess_superfeedr_item(item)
    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow
    # redirects and fetch link contents, and this handler should be small and
    # fast and try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [l for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    logging.info('Found links: %s', links)
    models.BlogPost(id=url,
                    source=source.key,
                    feed_item=item,
                    unsent=links,
                    ).get_or_save()
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  user = json_loads(auth_entity.user_json)
  actor = (user.get('actor')  # for Instagram; its user_json is IndieAuth
           or self.gr_source.user_to_actor(user))
  logging.debug('Extracting URLs and domains from actor: %s',
                json_dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    resolved = self.resolve_profile_url(url, resolve=i < MAX_AUTHOR_URLS)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blacklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def search_for_links(self):
  """Searches for activities with links to any of this source's web sites.

  Returns: sequence of ActivityStreams activity dicts
  """
  urls = {util.schemeless(util.fragmentless(url), slashes=False)
          for url in self.domain_urls
          if not util.in_webmention_blocklist(util.domain_from_link(url))}
  if not urls:
    return []

  # Search syntax: https://www.reddit.com/wiki/search
  url_query = ' OR '.join(f'site:"{u}" OR selftext:"{u}"' for u in urls)
  return self.get_activities(
    search_query=url_query, group_id=gr_source.SEARCH,
    etag=self.last_activities_etag, fetch_replies=False, fetch_likes=False,
    fetch_shares=False, count=50)
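# The resulting Reddit search string for the same two example sites would
# look like this (sketch; actual ordering depends on set iteration):
#
#   site:"example.com" OR selftext:"example.com" OR
#   site:"foo.bar/baz" OR selftext:"foo.bar/baz"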
def canonicalize_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urllib.parse.urlparse(url)
  params = urllib.parse.parse_qs(parsed.query)
  path = parsed.path.strip('/').split('/')
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if path and path[0] == 'notes':
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)
      elif path and len(path) > 1 and path[1] == 'posts':
        url = post_url(url_id)

  for alternate_id in util.trim_nulls(itertools.chain(
      (self.username or self.inferred_username,), self.inferred_user_ids)):
    url = url.replace('facebook.com/%s/' % alternate_id,
                      'facebook.com/%s/' % self.key.id())

  return super(FacebookPage, self).canonicalize_url(url)
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(
      username, ignore_rate_limit=True)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or "
        'bio field and try again.' % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def _urls_and_domains(auth_entity, blog_name=None):
  """Returns this blog's URL and domain.

  Args:
    auth_entity: :class:`oauth_dropins.tumblr.TumblrAuth`
    blog_name: which blog. optional. matches the 'name' field for one of the
      blogs in auth_entity.user_json['user']['blogs'].

  Returns:
    ([string url], [string domain])
  """
  for blog in json_loads(auth_entity.user_json).get('user', {}).get('blogs', []):
    if ((blog_name and blog_name == blog.get('name')) or
        (not blog_name and blog.get('primary'))):
      return [blog['url']], [util.domain_from_link(blog['url']).lower()]

  return [], []
def source_url(self, target_url):
  # determine which activity to use
  try:
    activity = self.activities[0]
    if self.entity.urls_to_activity:
      urls_to_activity = json_loads(self.entity.urls_to_activity)
      if urls_to_activity:
        activity = self.activities[urls_to_activity[target_url]]
  except (KeyError, IndexError):
    logging.warning("""\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, self.entity.urls_to_activity, self.activities)
    self.abort(util.ERROR_HTTP_RETURN_CODE)

  # generate source URL
  id = activity['id']
  parsed = util.parse_tag_uri(id)
  post_id = parsed[1] if parsed else id
  # prefer brid-gy.appspot.com to brid.gy because non-browsers (ie OpenSSL)
  # currently have problems with brid.gy's SSL cert. details:
  # https://github.com/snarfed/bridgy/issues/20
  host_url = self.request.host_url
  domain = util.domain_from_link(host_url)
  if domain == util.PRIMARY_DOMAIN or domain in util.OTHER_DOMAINS:
    host_url = 'https://brid-gy.appspot.com'

  path = [host_url, self.entity.type, self.entity.source.get().SHORT_NAME,
          self.entity.source.string_id(), post_id]

  if self.entity.type != 'post':
    # parse and add response id. (we know Response key ids are always tag URIs)
    _, response_id = util.parse_tag_uri(self.entity.key.string_id())
    reaction_id = response_id
    if self.entity.type in ('like', 'react', 'repost', 'rsvp'):
      response_id = response_id.split('_')[-1]  # extract responder user id
    path.append(response_id)
    if self.entity.type == 'react':
      path.append(reaction_id)

  return '/'.join(path)
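# Shape of the generated URLs, inferred from the path construction above
# (usernames and ids are made up). A Twitter 'like' Response keyed
# tag:twitter.com,2013:123_favorited_by_456 on tweet tag:twitter.com,2013:123
# by source user 'alice' would render as:
#
#   https://brid-gy.appspot.com/like/twitter/alice/123/456
#
# where 456 is the responder user id extracted by response_id.split('_')[-1].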
def authorize(self):
  """Check for a backlink to brid.gy/publish/SILO."""
  bases = set()
  if util.domain_from_link(self.request.host_url) == 'brid.gy':
    bases.add('brid.gy')
    bases.add('www.brid.gy')  # also accept www
  else:
    bases.add(self.request.host_url)

  expected = ['%s/publish/%s' % (base, self.source.SHORT_NAME)
              for base in bases]

  if self.entity.html:
    for url in expected:
      if url in self.entity.html or urllib.quote(url, safe='') in self.entity.html:
        return True

  self.error("Couldn't find link to %s" % expected[0])
  return False
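# In other words, on production the published post's HTML must contain the
# expected backlink substring, either plain or percent-encoded. A minimal
# check mirroring what authorize() looks for:

html = 'posted via <a href="https://brid.gy/publish/twitter"></a>'
assert 'brid.gy/publish/twitter' in html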
def record_source_webmention(self, mention):
  """Sets this source's last_webmention_sent and maybe webmention_endpoint.

  Args:
    mention: webmentiontools.send.WebmentionSend
  """
  self.source = self.source.key.get()
  logging.info('Setting last_webmention_sent')
  self.source.last_webmention_sent = util.now_fn()

  if (mention.receiver_endpoint != self.source.webmention_endpoint and
      util.domain_from_link(mention.target_url) in self.source.domains):
    logging.info('Also setting webmention_endpoint to %s (discovered in %s; was %s)',
                 mention.receiver_endpoint, mention.target_url,
                 self.source.webmention_endpoint)
    self.source.webmention_endpoint = mention.receiver_endpoint

  self.source.put()
def record_source_webmention(self, endpoint, target):
  """Sets this source's last_webmention_sent and maybe webmention_endpoint.

  Args:
    endpoint: str, URL
    target: str, URL
  """
  self.source = self.source.key.get()
  logging.info('Setting last_webmention_sent')
  self.source.last_webmention_sent = util.now_fn()

  if (endpoint != self.source.webmention_endpoint and
      util.domain_from_link(target) in self.source.domains):
    logging.info(
      'Also setting webmention_endpoint to %s (discovered in %s; was %s)',
      endpoint, target, self.source.webmention_endpoint)
    self.source.webmention_endpoint = endpoint

  self.source.put()
def infer_profile_url(self, url):
  """Given an arbitrary URL representing a person, try to find their
  profile URL for *this* service.

  Queries Bridgy's registered accounts for users with a particular
  domain in their silo profile.

  Args:
    url: string, a person's URL

  Return:
    a string URL for their profile on this service (or None)
  """
  domain = util.domain_from_link(url)
  if domain == self.gr_source.DOMAIN:
    return url
  user = self.__class__.query(self.__class__.domains == domain).get()
  if user:
    return self.gr_source.user_url(user.key.id())
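# Example, assuming this mixin is on the Twitter source class and a user with
# domain 'alice.example' is registered under the Twitter id 'alice':
#
#   infer_profile_url('https://alice.example/about')
#   -> 'https://twitter.com/alice'   # via self.gr_source.user_url('alice')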
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect_home_or_user_page(state)

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(username)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect_home_or_user_page(state)

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add("Please add %s to your Instagram profile's website or "
                        'bio field and try again.' % website)
      return self.redirect_home_or_user_page(state)

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect_home_or_user_page(state)

    source = self.maybe_add_or_delete_source(Instagram, auth_entity, state,
                                             actor=actor)
def canonicalize_syndication_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return url

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urlparse.urlparse(url)
  params = urlparse.parse_qs(parsed.query)
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if parsed.path.startswith('/notes/'):
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)

  username = self.username or self.inferred_username
  if username:
    url = url.replace('facebook.com/%s/' % username,
                      'facebook.com/%s/' % self.key.id())

  # facebook always uses https and www
  return super(FacebookPage, self).canonicalize_syndication_url(
    url, scheme='https', subdomain='www.')
def finish(self, auth_entity, state=None):
  if not auth_entity:
    self.maybe_add_or_delete_source(Tumblr, auth_entity, state)
    return

  vars = {
    "action": "/tumblr/add",
    "state": state,
    "auth_entity_key": auth_entity.key.urlsafe(),
    "blogs": [
      {"id": b["name"],
       "title": b.get("title", ""),
       "domain": util.domain_from_link(b["url"])}
      # user_json is the user/info response:
      # http://www.tumblr.com/docs/en/api/v2#user-methods
      for b in json.loads(auth_entity.user_json)["user"]["blogs"]
      if b.get("name") and b.get("url")],
  }
  logging.info("Rendering choose_blog.html with %s", vars)

  self.response.headers["Content-Type"] = "text/html"
  self.response.out.write(template.render("templates/choose_blog.html", vars))
def infer_profile_url(self, url):
  """Find a Facebook profile URL (ideally the one with the user's numeric ID).

  Looks up existing sources by username, inferred username, and domain.

  Args:
    url: string, a person's URL

  Return:
    a string URL for their Facebook profile (or None)
  """
  domain = util.domain_from_link(url)
  if domain == self.gr_source.DOMAIN:
    username = urlparse.urlparse(url).path.strip('/')
    if '/' not in username:
      user = FacebookPage.query(ndb.OR(
        FacebookPage.username == username,
        FacebookPage.inferred_username == username)).get()
      if user:
        return self.gr_source.user_url(user.key.id())

  return super(FacebookPage, self).infer_profile_url(url)
def new(handler, auth_entity=None, actor=None, **kwargs):
  """Creates and returns an Instagram for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.instagram.InstagramAuth
  """
  user = json.loads(auth_entity.user_json)
  user['actor'] = actor
  auth_entity.user_json = json.dumps(user)
  auth_entity.put()

  username = actor['username']
  if not kwargs.get('features'):
    kwargs['features'] = ['listen']
  urls = microformats2.object_urls(actor)
  return Instagram(id=username,
                   auth_entity=auth_entity.key,
                   name=actor.get('displayName'),
                   picture=actor.get('image', {}).get('url'),
                   url=gr_instagram.Instagram.user_url(username),
                   domain_urls=urls,
                   domains=[util.domain_from_link(url) for url in urls],
                   **kwargs)
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warning('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(source, permalink, entry, refetch,
                                 preexisting.get(permalink, []),
                                 store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
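
# Minimal standalone sketch of the rel-feed discovery loop above. The HTML
# is invented for illustration; assumes BeautifulSoup 4 and the stdlib
# urlparse module (Python 2).
import urlparse
from bs4 import BeautifulSoup

html = """
<html><head><link rel="feed" href="/updates.html"></head>
<body><a rel="feed" href="http://feeds.example.com/all">feed</a></body></html>
"""
dom = BeautifulSoup(html)
author_url = 'http://example.com/'
for node in dom.find_all('link', rel='feed') + dom.find_all('a', rel='feed'):
  href = node.get('href')
  if href:
    # relative hrefs resolve against the author page, as in _process_author
    print urlparse.urljoin(author_url, href)
# http://example.com/updates.html
# http://feeds.example.com/all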
def post(self, source_short_name):
  logging.info('Params: %s', self.request.params.items())
  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception, e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source!')
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=False)
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise
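
# first_value() isn't defined in this excerpt. A minimal sketch of what it
# presumably does, inferred from how it's called above: microformats2
# property values are lists, and the handler wants the first one, or None.
def first_value(props, name):
  return next(iter(props.get(name, [])), None)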
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source: a subclass of models.Source
    permalink: url of the unprocessed post
    refetch_blanks: boolean, whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Returns:
    a dict from syndicated url to new models.SyndicatedPost
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except BaseException:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  if parsed:
    relsynd = parsed.get('rels', {}).get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed['items']
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source. TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    parsed_url = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed_url.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)
  return results
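
# Illustrative sketch of the two places syndication links show up in an
# mf2py parse, using an invented permalink page. Assumes mf2py is installed;
# the URLs are made up.
import mf2py

html = """
<link rel="syndication" href="https://instagram.com/p/123">
<article class="h-entry">
  <a class="u-syndication" href="https://instagram.com/p/123">on IG</a>
</article>
"""
parsed = mf2py.Parser(doc=html).to_dict()
print parsed.get('rels', {}).get('syndication', [])
# ['https://instagram.com/p/123']
for item in parsed['items']:
  if 'h-entry' in item['type']:
    print item.get('properties', {}).get('syndication', [])
# ['https://instagram.com/p/123']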
def do_send_webmentions(self):
  unsent = set()
  for url in self.entity.unsent + self.entity.error:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(url)
    if ok:
      # When debugging locally, redirect our own webmentions to localhost
      if appengine_config.DEBUG and domain in util.LOCALHOST_TEST_DOMAINS:
        url = url.replace(domain, 'localhost')
      unsent.add(url)
  self.entity.unsent = sorted(unsent)
  self.entity.error = []

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    domain = util.domain_from_link(target)
    cache_key = 'W ' + domain
    cached = memcache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint for %s: %s',
                   domain, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999):
          error = mention.error
      except BaseException:
        logging.warning('', exc_info=True)
        error = getattr(mention, 'error', None)
        if not error:
          error = {'code': 'EXCEPTION'}

    if error is None:
      logging.info('Sent! %s', mention.response)
      if not self.entity.sent:
        self.set_last_webmention_sent()
      self.entity.sent.append(target)
      memcache.set(cache_key, mention.receiver_endpoint,
                   time=WEBMENTION_DISCOVERY_CACHE_TIME)
    else:
      if error['code'] == 'NO_ENDPOINT':
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
        memcache.set(cache_key, error, time=WEBMENTION_DISCOVERY_CACHE_TIME)
      elif (error['code'] == 'BAD_TARGET_URL' and
            error['http_status'] / 100 == 4):
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.warning('Propagate task failed')
    self.release('error')
  else:
    self.complete()
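
# Sketch of the per-domain discovery cache contract used above, with
# invented values: the cached value is either an endpoint URL string
# (discovery succeeded) or a WebmentionSend-style error dict (it failed
# (semi-)permanently). Assumes the App Engine memcache client and the
# WEBMENTION_DISCOVERY_CACHE_TIME constant from this module.
from google.appengine.api import memcache

cache_key = 'W ' + util.domain_from_link('http://example.com/post')
# success case: remember the discovered endpoint
memcache.set(cache_key, 'http://example.com/webmention',
             time=WEBMENTION_DISCOVERY_CACHE_TIME)
# failure case: remember that discovery found no endpoint
memcache.set(cache_key, {'code': 'NO_ENDPOINT'},
             time=WEBMENTION_DISCOVERY_CACHE_TIME)
cached = memcache.get(cache_key)
if isinstance(cached, dict):
  pass  # (semi-)permanent failure: skip sending to this domain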