def html_to_atom(html, url=None, fetch_author=False, reader=True):
    """Converts microformats2 HTML to an Atom feed.

    Args:
      html: unicode string
      url: string URL html came from, optional
      fetch_author: boolean, whether to make HTTP request to fetch rel-author link
      reader: boolean, whether the output will be rendered in a feed reader.
        Currently just includes location if True, not otherwise.

    Returns:
      unicode string with Atom XML
    """
    if fetch_author:
        # find_author below may fetch the rel-author page, which needs a base URL
        assert url, 'fetch_author=True requires url!'

    mf2_doc = util.parse_mf2(html, url=url)
    author = microformats2.find_author(mf2_doc, fetch_mf2_func=util.fetch_mf2)

    activities = microformats2.html_to_activities(html, url, author)
    return activities_to_atom(
        activities,
        author,
        title=microformats2.get_title(mf2_doc),
        xml_base=util.base_url(url),
        host_url=url,
        reader=reader)
def html_to_activities(html, url=None, actor=None, id=None):
    """Converts a microformats2 HTML h-feed to ActivityStreams activities.

    Args:
      html: unicode string HTML or :class:`requests.Response`
      url: optional string URL that HTML came from
      actor: optional author AS actor object for all activities. usually comes
        from a rel="author" link.
      id: string, optional id of specific element to extract and parse. defaults
        to the whole page.

    Returns:
      list of ActivityStreams activity dicts
    """
    parsed = util.parse_mf2(html, url=url, id=id)

    # prefer the children of an explicit h-feed; otherwise use all top-level items
    feed = mf2util.find_first_entry(parsed, ['h-feed'])
    if feed:
        candidates = feed.get('children', [])
    else:
        candidates = parsed.get('items', [])

    wanted_types = ('h-entry', 'h-event', 'h-cite')
    activities = []
    for candidate in candidates:
        item_types = candidate.get('type', [])
        if any(t in item_types for t in wanted_types):
            obj = json_to_object(candidate, actor=actor)
            obj['content_is_html'] = True
            activities.append({'object': obj})

    return activities
def post(self): logging.info('Params: %s', list(self.request.params.items())) # fetch source page source = util.get_required_param(self, 'source') source_resp = common.requests_get(source) self.source_url = source_resp.url or source self.source_domain = urllib.parse.urlparse(self.source_url).netloc.split(':')[0] self.source_mf2 = util.parse_mf2(source_resp) # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json_dumps(self.source_mf2 indent=2)) # check for backlink to bridgy fed (for webmention spec and to confirm # source's intent to federate to mastodon) if (self.request.host_url not in source_resp.text and urllib.parse.quote(self.request.host_url, safe='') not in source_resp.text): self.error("Couldn't find link to %s" % self.request.host_url) # convert source page to ActivityStreams entry = mf2util.find_first_entry(self.source_mf2, ['h-entry']) if not entry: self.error('No microformats2 found on %s' % self.source_url) logging.info('First entry: %s', json_dumps(entry, indent=2)) # make sure it has url, since we use that for AS2 id, which is required # for ActivityPub. props = entry.setdefault('properties', {}) if not props.get('url'): props['url'] = [self.source_url] self.source_obj = microformats2.json_to_object(entry, fetch_mf2=True) logging.info('Converted to AS1: %s', json_dumps(self.source_obj, indent=2)) self.try_activitypub() or self.try_salmon()
def dispatch_request(self):
    """Handles an inbound webmention (Flask handler).

    Fetches the webmention source page, parses its microformats2, verifies it
    links back to this service, converts the first h-entry to an AS1 object,
    then attempts delivery via ActivityPub, falling back to Salmon.

    Returns:
      Flask response from the first delivery method that succeeds, or ''

    Side effects: sets self.source_url, self.source_domain, self.source_mf2,
    and self.source_obj. error() raises, aborting the request.
    """
    logging.info(f'Params: {list(request.form.items())}')

    # fetch source page
    source = flask_util.get_required_param('source')
    source_resp = common.requests_get(source)
    # requests may have followed redirects; prefer the final URL
    self.source_url = source_resp.url or source
    # netloc may include a port; keep just the hostname
    self.source_domain = urllib.parse.urlparse(
        self.source_url).netloc.split(':')[0]
    self.source_mf2 = util.parse_mf2(source_resp)

    # check for backlink to bridgy fed (for webmention spec and to confirm
    # source's intent to federate to mastodon). also accept a URL-escaped form
    # of the backlink.
    if (request.host_url not in source_resp.text and
            urllib.parse.quote(request.host_url, safe='')
            not in source_resp.text):
        # bug fix: this literal was missing the f prefix, so users saw the
        # raw text "{request.host_url}" instead of the actual URL
        error(f"Couldn't find link to {request.host_url}")

    # convert source page to ActivityStreams
    entry = mf2util.find_first_entry(self.source_mf2, ['h-entry'])
    if not entry:
        error(f'No microformats2 found on {self.source_url}')

    logging.info(f'First entry: {json_dumps(entry, indent=2)}')
    # make sure it has url, since we use that for AS2 id, which is required
    # for ActivityPub.
    props = entry.setdefault('properties', {})
    if not props.get('url'):
        props['url'] = [self.source_url]

    self.source_obj = microformats2.json_to_object(entry, fetch_mf2=True)
    logging.info(
        f'Converted to AS1: {json_dumps(self.source_obj, indent=2)}')

    # try ActivityPub first; only fall back to Salmon if it returns falsy
    for method in self.try_activitypub, self.try_salmon:
        ret = method()
        if ret:
            return ret

    return ''
def get(self):
    """Converts a fetched URL between feed/social formats.

    Reads 'input' and 'url' query params, fetches the URL, decodes it as the
    given input format (AS1/AS2 JSON, mf2 JSON, JSON Feed, Atom, or HTML), and
    writes the resulting activities in the response.

    Raises:
      exc.HTTPBadRequest: on invalid input type, undecodable body, bad
        fragment, or parse failure.
    """
    # NOTE: renamed from `input` to avoid shadowing the builtin
    input_type = util.get_required_param(self, 'input')
    if input_type not in INPUTS:
        raise exc.HTTPBadRequest(
            'Invalid input: %s, expected one of %r' % (input_type, INPUTS))

    orig_url = util.get_required_param(self, 'url')
    fragment = urllib.parse.urlparse(orig_url).fragment
    if fragment and input_type != 'html':
        raise exc.HTTPBadRequest(
            'URL fragments only supported with input=html.')

    resp = util.requests_get(orig_url, gateway=True)
    final_url = resp.url

    # decode data for the JSON-based input formats
    if input_type in ('activitystreams', 'as1', 'as2', 'mf2-json',
                      'json-mf2', 'jsonfeed'):
        try:
            body_json = json_loads(resp.text)
            body_items = (body_json if isinstance(body_json, list)
                          else body_json.get('items') or [body_json])
        except (TypeError, ValueError):
            raise exc.HTTPBadRequest('Could not decode %s as JSON' % final_url)

    mf2 = None
    if input_type == 'html':
        mf2 = util.parse_mf2(resp, id=fragment)
        # bug fix: this was `if id and not mf2` -- `id` is the always-truthy
        # builtin, so the error fired even when no fragment was requested
        if fragment and not mf2:
            raise exc.HTTPBadRequest(
                'Got fragment %s but no element found with that id.' % fragment)
    elif input_type in ('mf2-json', 'json-mf2'):
        mf2 = body_json
        if not hasattr(mf2, 'get'):
            raise exc.HTTPBadRequest(
                'Expected microformats2 JSON input to be dict, got %s' %
                mf2.__class__.__name__)
        mf2.setdefault('rels', {})  # mf2util expects rels

    actor = None
    title = None
    hfeed = None
    if mf2:
        def fetch_mf2_func(url):
            # don't fetch author pages on big silos; synthesize a minimal
            # h-card instead
            if util.domain_or_parent_in(
                    urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
                return {
                    'items': [{
                        'type': ['h-card'],
                        'properties': {'url': [url]},
                    }],
                }
            return util.fetch_mf2(url, gateway=True)

        try:
            actor = microformats2.find_author(
                mf2, fetch_mf2_func=fetch_mf2_func)
            title = microformats2.get_title(mf2)
            hfeed = mf2util.find_first_entry(mf2, ['h-feed'])
        except (KeyError, ValueError) as e:
            raise exc.HTTPBadRequest(
                'Could not parse %s as %s: %s' % (final_url, input_type, e))

    # convert the decoded input to AS1 activities
    try:
        if input_type in ('as1', 'activitystreams'):
            activities = body_items
        elif input_type == 'as2':
            activities = [as2.to_as1(obj) for obj in body_items]
        elif input_type == 'atom':
            try:
                activities = atom.atom_to_activities(resp.text)
            except ElementTree.ParseError as e:
                raise exc.HTTPBadRequest(
                    'Could not parse %s as XML: %s' % (final_url, e))
            except ValueError as e:
                raise exc.HTTPBadRequest(
                    'Could not parse %s as Atom: %s' % (final_url, e))
        elif input_type == 'html':
            activities = microformats2.html_to_activities(
                resp, url=final_url, id=fragment, actor=actor)
        elif input_type in ('mf2-json', 'json-mf2'):
            activities = [microformats2.json_to_object(item, actor=actor)
                          for item in mf2.get('items', [])]
        elif input_type == 'jsonfeed':
            activities, actor = jsonfeed.jsonfeed_to_activities(body_json)
    except ValueError as e:
        logging.warning('parsing input failed', stack_info=True)
        self.abort(400, 'Could not parse %s as %s: %s' %
                   (final_url, input_type, str(e)))

    self.write_response(
        source.Source.make_activities_base_response(activities),
        url=final_url, actor=actor, title=title, hfeed=hfeed)
def setUp(self):
    """Builds shared webmention test fixtures.

    Creates canned HTTP responses (via requests_response) for: the original
    post in HTML/Atom/AS2 flavors, a reply, a repost, a like, a follow, a
    plain create, an actor profile, and a non-fediverse page, plus the
    expected AS2 translations of several of them.

    NOTE(review): string-literal line breaks below are reconstructed from a
    whitespace-mangled source -- verify against the original file.
    """
    super(WebmentionTest, self).setUp()
    self.key = MagicKey.get_or_create('a')

    # original post, HTML with both Atom and AS2 rel-alternate links
    self.orig_html_as2 = requests_response("""\
<html>
<meta>
<link href='http://orig/atom' rel='alternate' type='application/atom+xml'>
<link href='http://orig/as2' rel='alternate' type='application/activity+json'>
</meta>
</html>
""", url='http://orig/post', content_type=CONTENT_TYPE_HTML)
    # original post, HTML with only an Atom rel-alternate link
    self.orig_html_atom = requests_response("""\
<html>
<meta>
<link href='http://orig/atom' rel='alternate' type='application/atom+xml'>
</meta>
</html>
""", url='http://orig/post', content_type=CONTENT_TYPE_HTML)
    # original post as Atom, including the salmon endpoint link
    self.orig_atom = requests_response("""\
<?xml version="1.0"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:fed.brid.gy,2017-08-22:orig-post</id>
<link rel="salmon" href="http://orig/salmon"/>
<content type="html">baz ☕ baj</content>
</entry>
""", content_type=CONTENT_TYPE_ATOM)
    # original post as AS2
    self.orig_as2_data = {
        '@context': ['https://www.w3.org/ns/activitystreams'],
        'type': 'Article',
        'id': 'tag:orig,2017:as2',
        'content': 'Lots of ☕ words...',
        'actor': {'url': 'http://orig/author'},
        'to': ['http://orig/recipient'],
        'cc': ['http://orig/bystander', AS2_PUBLIC_AUDIENCE],
    }
    self.orig_as2 = requests_response(
        self.orig_as2_data, url='http://orig/as2',
        content_type=CONTENT_TYPE_AS2 + '; charset=utf-8')

    # reply h-entry pointing at the original post
    self.reply_html = """\
<html>
<body>
<div class="h-entry">
<a class="u-url" href="http://a/reply"></a>
<p class="e-content p-name">
<a class="u-in-reply-to" href="http://not/fediverse"></a>
<a class="u-in-reply-to" href="http://orig/post">foo ☕ bar</a>
<a href="http://localhost/"></a>
</p>
<a class="p-author h-card" href="http://orig">Ms. ☕ Baz</a>
</div>
</body>
</html>
"""
    self.reply = requests_response(
        self.reply_html, content_type=CONTENT_TYPE_HTML)
    self.reply_mf2 = util.parse_mf2(self.reply_html, url='http://a/reply')

    # repost fixtures come from module-level constants
    self.repost_html = REPOST_HTML
    self.repost = requests_response(
        self.repost_html, content_type=CONTENT_TYPE_HTML)
    self.repost_mf2 = util.parse_mf2(self.repost_html, url='http://a/repost')
    self.repost_as2 = REPOST_AS2

    # like h-entry; note the u-like-of has no p-name (one is commented out)
    self.like_html = """\
<html>
<body class="h-entry">
<a class="u-url" href="http://a/like"></a>
<a class="u-like-of" href="http://orig/post"></a>
<!--<a class="u-like-of p-name" href="http://orig/post">liked!</a>-->
<a class="p-author h-card" href="http://orig">Ms. ☕ Baz</a>
<a href="http://localhost/"></a>
</body>
</html>
"""
    self.like = requests_response(
        self.like_html, content_type=CONTENT_TYPE_HTML)
    self.like_mf2 = util.parse_mf2(self.like_html, url='http://a/like')

    # remote actor profile, AS1-shaped, with an ActivityPub inbox
    self.actor = requests_response({
        'objectType': 'person',
        'displayName': 'Mrs. ☕ Foo',
        'url': 'https://foo.com/about-me',
        'inbox': 'https://foo.com/inbox',
    }, content_type=CONTENT_TYPE_AS2)

    # expected AS2 Create for the reply above
    self.as2_create = {
        '@context': 'https://www.w3.org/ns/activitystreams',
        'type': 'Create',
        'object': {
            '@context': 'https://www.w3.org/ns/activitystreams',
            'type': 'Note',
            'id': 'http://localhost/r/http://a/reply',
            'url': 'http://localhost/r/http://a/reply',
            'name': 'foo ☕ bar',
            'content': """\
<a class="u-in-reply-to" href="http://not/fediverse"></a>
<a class="u-in-reply-to" href="http://orig/post">foo ☕ bar</a>
<a href="http://localhost/"></a>""",
            'inReplyTo': 'tag:orig,2017:as2',
            'cc': [
                AS2_PUBLIC_AUDIENCE,
                'http://orig/author',
                'http://orig/recipient',
                'http://orig/bystander',
            ],
            'attributedTo': [{
                'type': 'Person',
                'id': 'http://localhost/orig',
                'url': 'http://localhost/r/http://orig',
                'preferredUsername': '******',
                'name': 'Ms. ☕ Baz',
            }],
            'tag': [{
                'type': 'Mention',
                'href': 'http://orig/author',
            }],
        },
    }
    # the Update variant differs only in its type
    self.as2_update = copy.deepcopy(self.as2_create)
    self.as2_update['type'] = 'Update'

    # follow h-entry and its expected AS2 Follow
    self.follow_html = """\
<html>
<body class="h-entry">
<a class="u-url" href="http://a/follow"></a>
<a class="u-follow-of" href="http://followee"></a>
<a class="p-author h-card" href="https://orig">Ms. ☕ Baz</a>
<a href="http://localhost/"></a>
</body>
</html>
"""
    self.follow = requests_response(
        self.follow_html, content_type=CONTENT_TYPE_HTML)
    self.follow_mf2 = util.parse_mf2(self.follow_html, url='http://a/follow')
    self.follow_as2 = {
        '@context': 'https://www.w3.org/ns/activitystreams',
        'type': 'Follow',
        'id': 'http://localhost/r/http://a/follow',
        'url': 'http://localhost/r/http://a/follow',
        'object': 'http://followee',
        'actor': {
            'id': 'http://localhost/orig',
            'name': 'Ms. ☕ Baz',
            'preferredUsername': '******',
            'type': 'Person',
            'url': 'http://localhost/r/https://orig',
        },
        'cc': ['https://www.w3.org/ns/activitystreams#Public'],
    }

    # plain create (a new post) and its expected AS2 Create
    self.create_html = """\
<html>
<body class="h-entry">
<a class="u-url" href="http://orig/post"></a>
<p class="e-content p-name">hello i am a post</p>
<a class="p-author h-card" href="https://orig">Ms. ☕ Baz</a>
<a href="http://localhost/"></a>
</body>
</html>
"""
    self.create = requests_response(
        self.create_html, content_type=CONTENT_TYPE_HTML)
    self.create_mf2 = util.parse_mf2(self.create_html, url='http://a/create')
    self.create_as2 = {
        '@context': 'https://www.w3.org/ns/activitystreams',
        'type': 'Create',
        'object': {
            '@context': 'https://www.w3.org/ns/activitystreams',
            'type': 'Note',
            'id': 'http://localhost/r/http://orig/post',
            'url': 'http://localhost/r/http://orig/post',
            'name': 'hello i am a post',
            'content': 'hello i am a post',
            'attributedTo': [{
                'type': 'Person',
                'id': 'http://localhost/orig',
                'url': 'http://localhost/r/https://orig',
                'name': 'Ms. ☕ Baz',
                'preferredUsername': '******',
            }],
            'cc': ['https://www.w3.org/ns/activitystreams#Public'],
        },
    }

    # page with no fediverse markup at all
    self.not_fediverse = requests_response("""\
<html>
<body>foo</body>
</html>
""", url='http://not/fediverse', content_type=CONTENT_TYPE_HTML)

    # canned GET sequence used by ActivityPub delivery tests
    self.activitypub_gets = [
        self.reply, self.not_fediverse, self.orig_as2, self.actor,
    ]
def template_vars(self, domain, url=None):
    """Generates WebFinger JRD data for a domain.

    Finds the domain's representative h-card, derives the account name and
    canonical URL from it, discovers the site's Atom feed and PuSH hub, and
    assembles the WebFinger links (profile pages, avatar, ActivityPub actor
    and inbox, OStatus/salmon endpoints, magic public key).

    Args:
      domain: string domain to generate data for (required)
      url: optional string URL to try for the h-card before the home page

    Returns:
      dict, WebFinger JRD data
    """
    assert domain
    if domain.split('.')[-1] in NON_TLDS:
        self.error("%s doesn't look like a domain" % domain, status=404)

    # find representative h-card. try url, then url's home page, then domain
    urls = ['http://%s/' % domain]
    if url:
        urls = [url, urllib.parse.urljoin(url, '/')] + urls

    for candidate in urls:
        resp = common.requests_get(candidate)
        parsed = util.parse_html(resp)
        mf2 = util.parse_mf2(parsed, url=resp.url)
        # logging.debug('Parsed mf2 for %s: %s', resp.url, json_dumps(mf2, indent=2))
        hcard = mf2util.representative_hcard(mf2, resp.url)
        if hcard:
            logging.info('Representative h-card: %s',
                         json_dumps(hcard, indent=2))
            break
    else:
        self.error("""\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)

    logging.info('Generating WebFinger data for %s', domain)
    key = models.MagicKey.get_or_create(domain)
    props = hcard.get('properties', {})
    urls = util.dedupe_urls(props.get('url', []) + [resp.url])
    canonical_url = urls[0]

    # prefer an explicit acct: URL on this domain for the account name
    acct = '%s@%s' % (domain, domain)
    for url in urls:
        if url.startswith('acct:'):
            urluser, urldomain = util.parse_acct_uri(url)
            if urldomain == domain:
                acct = '%s@%s' % (urluser, domain)
                logging.info('Found custom username: acct:%s', acct)
                break

    # discover atom feed, if any; fall back to a granary conversion URL
    atom = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
    if atom and atom['href']:
        atom = urllib.parse.urljoin(resp.url, atom['href'])
    else:
        atom = 'https://granary.io/url?' + urllib.parse.urlencode({
            'input': 'html',
            'output': 'atom',
            'url': resp.url,
            'hub': resp.url,
        })

    # discover PuSH hub, if any.
    # bug fix: the old loop reset hub to the default in an else branch on
    # every non-matching Link entry, so a matching hub link was forgotten if
    # any later entry didn't match. now the first match wins.
    hub = 'https://bridgy-fed.superfeedr.com/'
    for link in resp.headers.get('Link', '').split(','):
        match = common.LINK_HEADER_RE.match(link)
        if match and match.group(2) == 'hub':
            hub = match.group(1)
            break

    # generate webfinger content
    data = util.trim_nulls({
        'subject': 'acct:' + acct,
        'aliases': urls,
        'magic_keys': [{'value': key.href()}],
        'links': sum(([{
            'rel': 'http://webfinger.net/rel/profile-page',
            'type': 'text/html',
            'href': url,
        }] for url in urls if url.startswith("http")), []) + [{
            'rel': 'http://webfinger.net/rel/avatar',
            'href': url,
        } for url in props.get('photo', [])] + [
            {
                'rel': 'canonical_uri',
                'type': 'text/html',
                'href': canonical_url,
            },

            # ActivityPub
            {
                'rel': 'self',
                'type': common.CONTENT_TYPE_AS2,
                # WARNING: in python 2 sometimes request.host_url lost port,
                # http://localhost:8080 would become just http://localhost. no
                # clue how or why. pay attention here if that happens again.
                'href': '%s/%s' % (self.request.host_url, domain),
            },
            {
                'rel': 'inbox',
                'type': common.CONTENT_TYPE_AS2,
                'href': '%s/%s/inbox' % (self.request.host_url, domain),
            },

            # OStatus
            {
                'rel': 'http://schemas.google.com/g/2010#updates-from',
                'type': common.CONTENT_TYPE_ATOM,
                'href': atom,
            },
            {
                'rel': 'hub',
                'href': hub,
            },
            {
                'rel': 'magic-public-key',
                'href': key.href(),
            },
            {
                'rel': 'salmon',
                'href': '%s/%s/salmon' % (self.request.host_url, domain),
            },
        ],
    })
    logging.info('Returning WebFinger data: %s', json_dumps(data, indent=2))
    return data