import mf2util

def test_one_matching_url():
    p = {
        'rels': {},
        'items': [
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://tilde.club/~foobar'],
                    'name': ['Bad'],
                },
            },
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://foo.com/bar', 'http://tilde.club/~foobar'],
                    'name': ['Good'],
                },
            },
        ],
    }
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert hcard
    assert hcard['properties']['name'][0] == 'Good'

    # a second h-card whose url also matches the source URL makes the result
    # ambiguous, so no representative h-card is found
    p['items'].append({
        'type': ['h-card'],
        'properties': {
            'url': ['http://foo.com/bar', 'http://flickr.com/photos/foobar'],
            'name': ['Too Many Cooks'],
        },
    })
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert not hcard
import mf2util

def test_url_matches_uid():
    p = {
        'rels': {},
        'items': [
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://foo.com/bar', 'http://tilde.club/~foobar'],
                    'name': ['Bad'],
                },
            },
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://foo.com/bar', 'http://tilde.club/~foobar'],
                    'uid': ['http://foo.com/bar'],
                    'name': ['Good'],
                },
            },
        ],
    }
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert hcard
    assert hcard['properties']['name'][0] == 'Good'

    # removing the uid should prevent us from finding the h-card
    del p['items'][1]['properties']['uid']
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert not hcard
import mf2util

def test_url_matches_rel_me():
    # rel-me points to identity hosted on about.me
    p = {
        'rels': {
            'me': ['http://about.me/foobar'],
        },
        'items': [
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://tilde.club/~foobar'],
                    'name': ['Bad'],
                },
            },
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://about.me/foobar', 'http://tilde.club/~foobar'],
                    'name': ['Good'],
                },
            },
        ],
    }
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert hcard
    assert hcard['properties']['name'][0] == 'Good'
import mf2util

def test_nested_hcard():
    p = {
        'rels': {},
        'items': [
            {
                'type': ['h-card'],
                'properties': {
                    'url': ['http://foo.com/bar', 'http://tilde.club/~foobar'],
                    'name': ['Bad'],
                },
            },
            {
                'type': ['h-entry'],
                'children': [
                    {
                        'type': ['h-card'],
                        'properties': {
                            'url': ['http://foo.com/bar', 'http://tilde.club/~foobar'],
                            'uid': ['http://foo.com/bar'],
                            'name': ['Good'],
                        },
                    },
                ],
            },
        ],
    }
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert hcard
    assert hcard['properties']['name'][0] == 'Good'
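The fixtures above are hand-written copies of what mf2py produces for a page. As a minimal sketch of the same flow against a live URL (the helper name is illustrative, not from any of the projects below):

import mf2py
import mf2util

def find_representative_hcard(page_url):
    """Fetch and parse page_url, then return its representative h-card, or None."""
    parsed = mf2py.parse(url=page_url)  # {'items': [...], 'rels': {...}, ...}
    return mf2util.representative_hcard(parsed, page_url)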
def get(self, domain):
    url = 'http://%s/' % domain
    resp = common.requests_get(url)
    mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
    # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))

    hcard = mf2util.representative_hcard(mf2, resp.url)
    logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
    if not hcard:
        common.error(self, """\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)

    key = MagicKey.get_or_create(domain)
    obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),
                                 key=key)
    obj.update({
        'inbox': '%s/%s/inbox' % (appengine_config.HOST_URL, domain),
        'outbox': '%s/%s/outbox' % (appengine_config.HOST_URL, domain),
        'following': '%s/%s/following' % (appengine_config.HOST_URL, domain),
        'followers': '%s/%s/followers' % (appengine_config.HOST_URL, domain),
    })
    logging.info('Returning: %s', json.dumps(obj, indent=2))

    self.response.headers.update({
        'Content-Type': common.CONTENT_TYPE_AS2,
        'Access-Control-Allow-Origin': '*',
    })
    self.response.write(json.dumps(obj, indent=2))
def build_user_json(me, resp=None):
    """user_json contains an h-card, rel-me links, and "me"

    Args:
      me: string, URL of the user
      resp: :class:`requests.Response` (optional), re-use response if it's
        already been fetched

    Return:
      dict, with 'me', the URL for this person; 'h-card', the representative
      h-card for this page; 'rel-me', a list of rel-me URLs found at this page
    """
    user_json = {'me': me}
    resp = resp or util.requests_get(me)
    if resp.status_code // 100 != 2:
        logging.warning('could not fetch user url "%s". got response code: %d',
                        me, resp.status_code)
        return user_json

    mf2 = util.parse_mf2(resp, resp.url)
    user_json['rel-me'] = mf2['rels'].get('me')
    user_json['h-card'] = mf2util.representative_hcard(mf2, me)
    logging.debug('built user-json %r', user_json)
    return util.trim_nulls(user_json)
def build_user_json(me, resp=None):
    """user_json contains an h-card, rel-me links, and "me"

    Args:
      me: string, URL of the user
      resp: requests.Response (optional), re-use response if it's
        already been fetched

    Return:
      dict, with 'me', the URL for this person; 'h-card', the representative
      h-card for this page; 'rel-me', a list of rel-me URLs found at this page
    """
    user_json = {'me': me}
    resp = resp or util.requests_get(me)
    if resp.status_code // 100 != 2:
        logging.warning('could not fetch user url "%s". got response code: %d',
                        me, resp.status_code)
        return user_json

    # Requests doesn't look at the HTML body to find <meta charset>
    # tags, so if the character encoding isn't given in a header, then
    # we pass on the raw bytes and let BS4 deal with it.
    p = mf2py.parse(doc=resp.text if 'charset' in resp.headers.get('content-type', '')
                    else resp.content,
                    url=me)
    user_json['rel-me'] = p.get('rels', {}).get('me')
    user_json['h-card'] = mf2util.representative_hcard(p, me)
    logging.debug('built user-json %r', user_json)
    return util.trim_nulls(user_json)
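Hypothetical usage of build_user_json, assuming https://example.com/ serves a profile page with an h-card and rel-me links (the URL and the sketched output are illustrative):

user = build_user_json('https://example.com/')
# e.g. {'me': 'https://example.com/',
#       'h-card': {'type': ['h-card'], 'properties': {...}},
#       'rel-me': ['https://github.com/example']}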
def fetch(self):
    # fetch the website and parse for microformats
    try:
        parser = mf2py.Parser(url=self.url)
    except Exception:
        return None

    # identify the representative h-card
    parsed = parser.to_dict()
    hcard = mf2util.representative_hcard(parsed, self.url)
    if not hcard:
        # fall back to the first h-card on the page, if any
        hcards = parser.to_dict(filter_by_type='h-card')
        if len(hcards):
            hcard = hcards[0]
    if hcard:
        self.name = hcard['properties'].get('name', [None])[0]
        self.nicknames = hcard['properties'].get('nickname', None)

    # identify rel-me links as pseudonyms
    matches = {}
    for url in parser.to_dict()['rels'].get('me', []):
        match = Pseudonym.identify_url(url, self)
        if not match:
            continue
        if match.target not in self.pseudonyms:
            self.pseudonyms[match.target] = match

    # remember the last time I fetched
    self.timestamp = time.time()

    # save to the database
    self.save()
def actor(domain):
    """Serves /[DOMAIN], fetches its mf2, converts to AS Actor, and serves it."""
    tld = domain.split('.')[-1]
    if tld in common.TLD_BLOCKLIST:
        error('', status=404)

    mf2 = util.fetch_mf2(f'http://{domain}/', gateway=True,
                         headers=common.HEADERS)

    hcard = mf2util.representative_hcard(mf2, mf2['url'])
    logging.info(f'Representative h-card: {json_dumps(hcard, indent=2)}')
    if not hcard:
        error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {mf2['url']}")

    key = MagicKey.get_or_create(domain)
    obj = common.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),
                                 key=key)
    obj.update({
        'preferredUsername': domain,
        'inbox': f'{request.host_url}{domain}/inbox',
        'outbox': f'{request.host_url}{domain}/outbox',
        'following': f'{request.host_url}{domain}/following',
        'followers': f'{request.host_url}{domain}/followers',
    })
    logging.info(f'Returning: {json_dumps(obj, indent=2)}')

    return (obj, {
        'Content-Type': common.CONTENT_TYPE_AS2,
        'Access-Control-Allow-Origin': '*',
    })
def get(self, domain):
    tld = domain.split('.')[-1]
    if tld in common.TLD_BLOCKLIST:
        self.error('', status=404)

    mf2 = util.fetch_mf2('http://%s/' % domain, gateway=True,
                         headers=common.HEADERS)
    # logging.info('Parsed mf2 for %s: %s', resp.url, json_dumps(mf2, indent=2))

    hcard = mf2util.representative_hcard(mf2, mf2['url'])
    logging.info('Representative h-card: %s', json_dumps(hcard, indent=2))
    if not hcard:
        self.error("""\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % mf2['url'])

    key = MagicKey.get_or_create(domain)
    obj = self.postprocess_as2(as2.from_as1(microformats2.json_to_object(hcard)),
                               key=key)
    obj.update({
        'inbox': '%s/%s/inbox' % (self.request.host_url, domain),
        'outbox': '%s/%s/outbox' % (self.request.host_url, domain),
        'following': '%s/%s/following' % (self.request.host_url, domain),
        'followers': '%s/%s/followers' % (self.request.host_url, domain),
    })
    logging.info('Returning: %s', json_dumps(obj, indent=2))

    self.response.headers.update({
        'Content-Type': common.CONTENT_TYPE_AS2,
        'Access-Control-Allow-Origin': '*',
    })
    self.response.write(json_dumps(obj, indent=2))
import mf2util

def test_hcard_as_a_property():
    """h-card is the p-author of the primary h-feed"""
    p = {
        'rels': {},
        'items': [
            {
                'type': ['h-feed'],
                'properties': {
                    'author': [
                        {
                            'type': ['h-card'],
                            'properties': {
                                'name': ['Elliot Alderson'],
                                'url': ['http://foo.com/bar'],
                            },
                        },
                    ],
                },
            },
        ],
    }
    hcard = mf2util.representative_hcard(p, 'http://foo.com/bar')
    assert hcard
    assert hcard['properties']['name'][0] == 'Elliot Alderson'
def login_callback():
    current_app.logger.debug('callback fields: %s', request.args)

    state = request.args.get('state')
    next_url = state or url_for('views.index')

    # TODO rediscover these endpoints based on 'me'. Assuming
    # they are the same is not totally safe.
    auth_url, token_url, micropub_url = session['endpoints']
    if not auth_url:
        flash('Login failed: No authorization URL in session')
        return redirect(next_url)

    code = request.args.get('code')
    client_id = get_settings().site_url
    redirect_uri = url_for('.login_callback', _external=True)

    current_app.logger.debug('callback with auth endpoint %s', auth_url)
    response = requests.post(auth_url, data={
        'code': code,
        'client_id': client_id,
        'redirect_uri': redirect_uri,
        'state': state,
    })

    rdata = urllib.parse.parse_qs(response.text)
    if response.status_code != 200:
        current_app.logger.debug('call to auth endpoint failed %s', response)
        flash('Login failed {}: {}'.format(rdata.get('error'),
                                           rdata.get('error_description')))
        return redirect(next_url)

    current_app.logger.debug('verify response %s', response.text)
    if 'me' not in rdata:
        current_app.logger.debug('Verify response missing required "me" field')
        flash('Verify response missing required "me" field {}'.format(
            response.text))
        return redirect(next_url)

    me = rdata.get('me')[0]
    scopes = rdata.get('scope')
    try_micropub_config(token_url, micropub_url, scopes, code, me,
                        redirect_uri, client_id, state)

    cred = Credential.query.get(('indieauth', me))
    if not cred:
        cred = Credential(type='indieauth', value=me, display=me)
        db.session.add(cred)
        db.session.commit()

    # offer to associate credential with existing user or create a new user
    p = mf2py.parse(url=me)
    hcard = mf2util.representative_hcard(p, me)
    author = hcard and mf2util.parse_author(hcard)
    return do_login(cred, author and author.get('name'), next_url)
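A condensed sketch of the profile-resolution step at the end of login_callback: resolve a verified "me" URL to a display name via the representative h-card, falling back to the URL itself. The helper name is illustrative; mf2util.parse_author accepts an h-card dict and returns a dict that may contain 'name', 'url', and 'photo'.

import mf2py
import mf2util

def display_name_for(me):
    """Return a human-readable name for the profile at `me`, or `me` itself."""
    parsed = mf2py.parse(url=me)
    hcard = mf2util.representative_hcard(parsed, me)
    author = hcard and mf2util.parse_author(hcard)
    return (author or {}).get('name') or me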
def template_vars(self, domain=None, url=None):
    logging.debug(f'Headers: {list(request.headers.items())}')

    if domain.split('.')[-1] in NON_TLDS:
        error(f"{domain} doesn't look like a domain", status=404)

    # find representative h-card. try url, then url's home page, then domain
    urls = [f'http://{domain}/']
    if url:
        urls = [url, urllib.parse.urljoin(url, '/')] + urls

    for candidate in urls:
        resp = common.requests_get(candidate)
        parsed = util.parse_html(resp)
        mf2 = util.parse_mf2(parsed, url=resp.url)
        # logging.debug(f'Parsed mf2 for {resp.url}: {json_dumps(mf2, indent=2)}')
        hcard = mf2util.representative_hcard(mf2, resp.url)
        if hcard:
            logging.info(f'Representative h-card: {json_dumps(hcard, indent=2)}')
            break
    else:
        error(f"didn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {resp.url}")

    logging.info(f'Generating WebFinger data for {domain}')
    key = models.MagicKey.get_or_create(domain)
    props = hcard.get('properties', {})
    urls = util.dedupe_urls(props.get('url', []) + [resp.url])
    canonical_url = urls[0]

    acct = f'{domain}@{domain}'
    for url in urls:
        if url.startswith('acct:'):
            urluser, urldomain = util.parse_acct_uri(url)
            if urldomain == domain:
                acct = f'{urluser}@{domain}'
                logging.info(f'Found custom username: acct:{acct}')
                break

    # discover atom feed, if any
    atom = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
    if atom and atom['href']:
        atom = urllib.parse.urljoin(resp.url, atom['href'])
    else:
        atom = 'https://granary.io/url?' + urllib.parse.urlencode({
            'input': 'html',
            'output': 'atom',
            'url': resp.url,
            'hub': resp.url,
        })

    # discover PuSH, if any; fall back to the default hub
    hub = 'https://bridgy-fed.superfeedr.com/'
    for link in resp.headers.get('Link', '').split(','):
        match = common.LINK_HEADER_RE.match(link)
        if match and match.group(2) == 'hub':
            hub = match.group(1)
            break

    # generate webfinger content
    data = util.trim_nulls({
        'subject': 'acct:' + acct,
        'aliases': urls,
        'magic_keys': [{'value': key.href()}],
        'links': sum(([{
            'rel': 'http://webfinger.net/rel/profile-page',
            'type': 'text/html',
            'href': url,
        }] for url in urls if url.startswith("http")), []) + [{
            'rel': 'http://webfinger.net/rel/avatar',
            'href': get_text(url),
        } for url in props.get('photo', [])] + [{
            'rel': 'canonical_uri',
            'type': 'text/html',
            'href': canonical_url,
        },

        # ActivityPub
        {
            'rel': 'self',
            'type': common.CONTENT_TYPE_AS2,
            # WARNING: in python 2 sometimes request.host_url lost port,
            # http://localhost:8080 would become just http://localhost. no
            # clue how or why. pay attention here if that happens again.
            'href': f'{request.host_url}{domain}',
        }, {
            'rel': 'inbox',
            'type': common.CONTENT_TYPE_AS2,
            'href': f'{request.host_url}{domain}/inbox',
        },

        # OStatus
        {
            'rel': 'http://schemas.google.com/g/2010#updates-from',
            'type': common.CONTENT_TYPE_ATOM,
            'href': atom,
        }, {
            'rel': 'hub',
            'href': hub,
        }, {
            'rel': 'magic-public-key',
            'href': key.href(),
        }, {
            'rel': 'salmon',
            'href': f'{request.host_url}{domain}/salmon',
        }],
    })

    logging.info(f'Returning WebFinger data: {json_dumps(data, indent=2)}')
    return data
def template_vars(self, domain, url=None):
    assert domain
    if domain.split('.')[-1] in NON_TLDS:
        common.error(self, "%s doesn't look like a domain" % domain, status=404)

    # find representative h-card. try url, then url's home page, then domain
    urls = ['http://%s/' % domain]
    if url:
        urls = [url, urlparse.urljoin(url, '/')] + urls

    for candidate in urls:
        resp = common.requests_get(candidate)
        parsed = common.beautifulsoup_parse(resp.content,
                                            from_encoding=resp.encoding)
        mf2 = mf2py.parse(parsed, url=resp.url, img_with_alt=True)
        # logging.debug('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))
        hcard = mf2util.representative_hcard(mf2, resp.url)
        if hcard:
            logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
            break
    else:
        common.error(self, """\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)

    logging.info('Generating WebFinger data for %s', domain)
    key = models.MagicKey.get_or_create(domain)
    props = hcard.get('properties', {})
    urls = util.dedupe_urls(props.get('url', []) + [resp.url])
    canonical_url = urls[0]

    acct = '%s@%s' % (domain, domain)
    for url in urls:
        if url.startswith('acct:'):
            urluser, urldomain = util.parse_acct_uri(url)
            if urldomain == domain:
                acct = '%s@%s' % (urluser, domain)
                logging.info('Found custom username: acct:%s', acct)
                break

    # discover atom feed, if any
    atom = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
    if atom and atom['href']:
        atom = urlparse.urljoin(resp.url, atom['href'])
    else:
        atom = 'https://granary.io/url?' + urllib.urlencode({
            'input': 'html',
            'output': 'atom',
            'url': resp.url,
            'hub': resp.url,
        })

    # discover PuSH, if any; fall back to the default hub
    hub = 'https://bridgy-fed.superfeedr.com/'
    for link in resp.headers.get('Link', '').split(','):
        match = common.LINK_HEADER_RE.match(link)
        if match and match.group(2) == 'hub':
            hub = match.group(1)
            break

    # generate webfinger content
    data = util.trim_nulls({
        'subject': 'acct:' + acct,
        'aliases': urls,
        'magic_keys': [{'value': key.href()}],
        'links': sum(([{
            'rel': 'http://webfinger.net/rel/profile-page',
            'type': 'text/html',
            'href': url,
        }] for url in urls if url.startswith("http")), []) + [{
            'rel': 'http://webfinger.net/rel/avatar',
            'href': url,
        } for url in props.get('photo', [])] + [{
            'rel': 'canonical_uri',
            'type': 'text/html',
            'href': canonical_url,
        },

        # ActivityPub
        {
            'rel': 'self',
            'type': 'application/activity+json',
            # use HOST_URL instead of e.g. request.host_url because it
            # sometimes lost port, e.g. http://localhost:8080 would become
            # just http://localhost. no clue how or why.
            'href': '%s/%s' % (appengine_config.HOST_URL, domain),
        }, {
            'rel': 'inbox',
            'type': 'application/activity+json',
            'href': '%s/%s/inbox' % (appengine_config.HOST_URL, domain),
        },

        # OStatus
        {
            'rel': 'http://schemas.google.com/g/2010#updates-from',
            'type': common.CONTENT_TYPE_ATOM,
            'href': atom,
        }, {
            'rel': 'hub',
            'href': hub,
        }, {
            'rel': 'magic-public-key',
            'href': key.href(),
        }, {
            'rel': 'salmon',
            'href': '%s/%s/salmon' % (appengine_config.HOST_URL, domain),
        }],
    })

    logging.info('Returning WebFinger data: %s', json.dumps(data, indent=2))
    return data
def generate(domain):
    try:
        resp = requests.get('http://' + domain, timeout=60, verify=False)
        resp.raise_for_status()
    except Exception as e:
        print(str(e), file=sys.stderr)
        return

    fetch_time = datetime.datetime.now()
    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    # extract these from:
    # * mf2 representative h-card
    # * HTML head and meta tags
    # * Open Graph tags
    # * Twitter card tags
    # * Clearbit's Enrichment and Logo APIs
    urls = FieldSet()
    names = FieldSet()
    descriptions = FieldSet()
    pictures = FieldSet()

    mf2 = mf2py.parse(url=resp.url, doc=soup)
    hcard = mf2util.representative_hcard(mf2, resp.url)
    if hcard:
        names.update(get_texts(hcard, 'name'))
        urls.update(get_texts(hcard, 'url'))
        pictures.update(get_texts(hcard, 'photo'))
        for prop in 'note', 'label', 'description':
            descriptions.update(get_texts(hcard, prop))

    # HTML head/meta tags
    rels = mf2.get('rels', {})
    urls.update(rels.get('canonical', []))
    names.add(soup.title)
    descriptions.add_metas(soup, attrs={'name': 'description'})
    pictures.update(rels.get('icon', []))

    # Open Graph tags, http://ogp.me/
    urls.add_metas(soup, property='og:url')
    descriptions.add_metas(soup, property='og:description')
    names.add_metas(soup, property=('og:title', 'og:site_name'))
    pictures.add_metas(
        soup, property=('og:image', 'og:image:url', 'og:image:secure_url'))

    # Twitter card tags, https://dev.twitter.com/cards/overview
    urls.add_metas(soup, attrs={'name': 'twitter:url'})
    names.add_metas(soup, attrs={'name': 'twitter:title'})
    descriptions.add_metas(soup, attrs={'name': 'twitter:description'})
    pictures.add_metas(soup, attrs={'name': 'twitter:image'})

    # Clearbit:
    # https://dashboard.clearbit.com/docs#enrichment-api
    # https://logo.clearbit.com/snarfed.org
    # https://person.clearbit.com/v2/combined/find?domain=snarfed.org
    # (needs account and oauth token)

    if not urls:
        urls = [u'http://{}/'.format(domain)]
    if not names:
        names = [domain]

    return {
        'domain': domain,
        'fetch_time': fetch_time.isoformat('T'),
        'urls': list(urls),
        'names': list(names),
        'descriptions': list(descriptions),
        'pictures': list(pictures),
        'hcard': json.dumps(hcard, sort_keys=True),
        'rel_mes': rels.get('me', []),
        'mf2': json.dumps(mf2, sort_keys=True),
        'html': resp.text,
    }
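A hypothetical driver for generate, to show how it is meant to be called; the domain list and the line-per-record output are illustrative, not part of the project above:

import json
import sys

for domain in ('snarfed.org', 'example.com'):
    record = generate(domain)
    if record:  # generate() returns None when the fetch fails
        json.dump(record, sys.stdout, sort_keys=True)
        print()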