def test_location_geo_url():
    """Test the location algorithm with a u-geo geo: URL.

    http://microformats.org/wiki/microformats2#h-card
    https://tools.ietf.org/html/rfc5870
    """
    parsed = {'items': [{
        'type': ['h-entry'],
        'properties': {
            'geo': [u'geo:48.2010,16.3695,183;crs=wgs84;u=40'],
        },
    }]}

    result = mf2util.interpret(parsed, 'http://example.com/')
    assert result['location'] == {
        'altitude': '183',
        'latitude': '48.2010',
        'longitude': '16.3695',
    }

    parsed['items'][0]['properties']['geo'] = ['geo:48.2010,16.3695']
    result = mf2util.interpret(parsed, 'http://example.com/')
    assert result['location'] == {
        'latitude': '48.2010',
        'longitude': '16.3695',
    }
def fetch_reply_contexts(reply_pairs, now, fetch_mf2_func):
    old_contexts = {}
    in_reply_tos = [url for _, url in reply_pairs]
    if in_reply_tos:
        for entry in (Entry.query
                      .join(Entry.feed)
                      .filter(Entry.permalink.in_(in_reply_tos),
                              Feed.type == 'html')):
            old_contexts[entry.permalink] = entry

    for entry, in_reply_to in reply_pairs:
        context = old_contexts.get(in_reply_to)
        if not context:
            current_app.logger.info('fetching in-reply-to: %s', in_reply_to)
            try:
                proxied_reply_url = proxy_url(in_reply_to)
                parsed = mf2util.interpret(
                    mf2py.parse(url=proxied_reply_url),
                    in_reply_to,
                    fetch_mf2_func=fetch_mf2_func)
                if parsed:
                    context = hentry_to_entry(parsed, None, False, now)
            except requests.exceptions.RequestException as err:
                current_app.logger.warn(
                    '%s fetching reply context: %s for entry: %s',
                    type(err).__name__, proxied_reply_url, entry.permalink)

        if context:
            db.session.add(context)
            entry.reply_context.append(context)
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    op = request.args.get('op')
    if url:
        try:
            d = mf2py.parse(url=url)

            if op == 'post-type-discovery':
                entry = mf2util.find_first_entry(d, ['h-entry', 'h-event'])
                return jsonify({'type': mf2util.post_type_discovery(entry)})

            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)

            return jsonify(dates_to_string(json))
        except:
            current_app.logger.exception('running mf2util service')
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
def create_context(url):
    for context in hooks.fire('create-context', url):
        if context:
            return context

    context = None
    response = None
    try:
        response = util.fetch_html(url)
        response.raise_for_status()

        context = Context.query.filter_by(url=url).first()
        app.logger.debug('checked for pre-existing context for this url: %s',
                         context)

        blob = mf2py.Parser(doc=response.text, url=url).to_dict()
        if blob:
            app.logger.debug('parsed successfully by mf2py: %s', url)
            entry = mf2util.interpret(blob, url)
            if entry:
                app.logger.debug('parsed successfully by mf2util: %s', url)
                published = entry.get('published')
                content = util.clean_foreign_html(entry.get('content', ''))
                content_plain = util.format_as_text(
                    content, link_fn=lambda a: a)

                title = entry.get('name')
                author_name = entry.get('author', {}).get('name', '')
                author_image = entry.get('author', {}).get('photo')

                permalink = entry.get('url')
                if not permalink or not isinstance(permalink, str):
                    permalink = url

                context = Context()
                context.url = url
                context.permalink = permalink
                context.author_name = author_name
                context.author_url = entry.get('author', {}).get('url', '')
                context.author_image = author_image
                context.content = content
                context.content_plain = content_plain
                context.published = published
                context.title = title
    except:
        app.logger.exception(
            'Could not fetch context for url %s, received response %s',
            url, response)

    if not context:
        app.logger.debug('Generating default context: %s', url)
        context = Context()
        context.url = context.permalink = url
        if response:
            html = response.text
            soup = bs4.BeautifulSoup(html)
            if soup.title:
                app.logger.debug('Found title: %s', soup.title.string)
                context.title = soup.title.string

    return context
def test_location_top_level():
    """Test the location algorithm with top level properties."""
    parsed = load_test('location_top_level')
    result = mf2util.interpret(parsed, 'http://example.com/')
    assert result['location'] == {
        'latitude': '37.83',
        'longitude': '-122.25',
    }
def test_article_naive_datetime():
    parsed = load_test('article_naive_datetime')
    result = mf2util.interpret(
        parsed,
        'http://tantek.com/2014/120/b1/markup-people-focused-mobile-communication')
    assert result['type'] == 'entry'
    assert result['name'] == 'Markup For People Focused Mobile Communication'
    assert '<h2>Action labels not app names</h2>' in result['content']
    assert result['published'] == datetime(2014, 4, 30, 12, 11)
    assert result['updated'] == datetime(2014, 4, 30, 12, 11)
def test_location_geo():
    """Test the location algorithm with an h-geo."""
    parsed = load_test('location_h-geo')
    result = mf2util.interpret(parsed, 'http://example.com/')
    assert result['location'] == {
        'altitude': '123.0',
        'latitude': '37.83',
        'longitude': '-122.25',
    }
def test_location_adr():
    """Test the location algorithm with an h-adr."""
    parsed = load_test('location_h-adr')
    result = mf2util.interpret(parsed, 'http://example.com/')
    assert result['location'] == {
        'street-address': '17 Austerstræti',
        'locality': 'Reykjavík',
        'country-name': 'Iceland',
        'postal-code': '107',
        'name': '17 Austerstræti Reykjavík Iceland 107',
    }
def test_location_hcard():
    """Test the location algorithm with an h-card.

    https://indieweb.org/location#How_to_determine_the_location_of_a_microformat
    """
    parsed = load_test('location_h-card')
    result = mf2util.interpret(parsed, 'http://example.com/')
    assert result['location'] == {
        'name': 'Timeless Coffee Roasters',
        'latitude': '37.83',
        'longitude': '-122.25',
    }
def test_article_two_published_dates():
    """Test for a case that was throwing exceptions. Could not interpret
    datetime on posts with two dt-published dates because I was concatenating
    them. Should just take the first instead.
    """
    parsed = load_test('article_two_published_dates')
    result = mf2util.interpret(parsed, 'article.html')
    assert result['type'] == 'entry'
    assert result['name'] == 'Test Article with Two Published Dates'
    assert result['published'].replace(tzinfo=None) == datetime(2014, 4, 30, 12, 11, 00)
    assert result['published'].utcoffset() == timedelta(hours=-8)
def test_event():
    # HWC event from werd.io
    parsed = load_test('hwc-event')
    result = mf2util.interpret(
        parsed, 'http://werd.io/2014/homebrew-website-club-4')
    assert result['type'] == 'event'
    assert result['name'] == 'Homebrew Website Club'
    assert 'Are you building your own website?' in result['content']
    assert result['start'].replace(tzinfo=None) == datetime(2014, 5, 7, 18, 30)
    assert result['start'].utcoffset() == timedelta(hours=0)
    assert result['end'].replace(tzinfo=None) == datetime(2014, 5, 7, 19, 30)
    assert result['end'].utcoffset() == timedelta(hours=0)
def extract_mf2_context(context, doc, url):
    """Gets Microformats2 data from the given document."""
    cached_mf2 = {}  # used by authorship algorithm

    def fetch_mf2(url):
        if url in cached_mf2:
            return cached_mf2[url]
        p = mf2py.parse(url=url)
        cached_mf2[url] = p
        return p

    blob = mf2py.parse(doc=doc, url=url)
    cached_mf2[url] = blob
    if blob:
        current_app.logger.debug('parsed successfully by mf2py: %s', url)
        entry = mf2util.interpret(blob, url, fetch_mf2_func=fetch_mf2)
        if entry:
            current_app.logger.debug('parsed successfully by mf2util: %s', url)
            published = entry.get('published')
            content = util.clean_foreign_html(entry.get('content', ''))
            content_plain = util.format_as_text(content, link_fn=lambda a: a)

            title = entry.get('name')
            if title and len(title) > 512:
                # FIXME is there a db setting to do this automatically?
                title = title[:512]

            author_name = entry.get('author', {}).get('name', '')
            author_image = entry.get('author', {}).get('photo')

            permalink = entry.get('url')
            if not permalink or not isinstance(permalink, str):
                permalink = url

            context.url = url
            context.permalink = permalink
            context.author_name = author_name
            context.author_url = entry.get('author', {}).get('url', '')
            context.author_image = author_image
            context.content = content
            context.content_plain = content_plain
            context.published = published
            context.title = title

    return context
def test_p_content():
    """make sure p-content (instead of the usual e-content) doesn't cause us
    to throw an exception
    """
    parsed = {
        "items": [{
            "properties": {
                "author": [{
                    "properties": {
                        "name": ["Kyle"],
                        "url": ["https://kylewm.com"],
                    },
                    "type": ["h-card"],
                    "value": "Kyle",
                }],
                "content": ["Thanks for hosting!"],
                "in-reply-to": ["https://snarfed.org/2014-06-16_homebrew-website-club-at-quip"],
                "name": ["I'm attending\n Homebrew Website Club at Quip\n Thanks for hosting!\n Kyle"],
                "rsvp": ["yes"],
            },
            "type": ["h-entry"],
        }],
        "rel-urls": {},
        "rels": {},
    }
    result = mf2util.interpret(parsed, 'http://kylewm.com/test/rsvp.html')
    assert 'Thanks for hosting!' == result.get('content')
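# Note (not part of the original test): in mf2py output a p-content property
# typically parses to a plain string, while e-content parses to a dict with
# 'value' and 'html' keys. The test above covers the string case; below is a
# minimal sketch contrasting the two shapes, assuming mf2util.interpret as
# used throughout this section. The example values are made up.
import mf2util

plain = {'items': [{'type': ['h-entry'],
                    'properties': {'content': ['just plain text']}}]}  # p-content
rich = {'items': [{'type': ['h-entry'],
                   'properties': {'content': [{'value': 'hello',
                                               'html': '<b>hello</b>'}]}}]}  # e-content

print(mf2util.interpret(plain, 'http://example.com/').get('content'))  # expected: 'just plain text'
print(mf2util.interpret(rich, 'http://example.com/').get('content'))   # expected: '<b>hello</b>'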
def test_event():
    # HWC event from werd.io
    parsed = load_test('hwc-event')
    result = mf2util.interpret(
        parsed, 'http://werd.io/2014/homebrew-website-club-4')
    assert result['type'] == 'event'
    assert result['name'] == 'Homebrew Website Club'
    assert 'Are you building your own website?' in result['content']
    assert result['start'].replace(tzinfo=None) == datetime(2014, 5, 7, 18, 30)
    assert result['start'].utcoffset() == timedelta(hours=0)
    assert result['end'].replace(tzinfo=None) == datetime(2014, 5, 7, 19, 30)
    assert result['end'].utcoffset() == timedelta(hours=0)
    assert result['location'] == {
        'name': 'Mozilla SF, 1st floor, 2 Harrison st. (at Embarcadero), San Francisco, CA ',
    }
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    if url:
        d = mf2py.Parser(url=url).to_dict()
        if mf2util.find_first_entry(d, ['h-feed']):
            json = mf2util.interpret_feed(d, url)
        else:
            json = mf2util.interpret(d, url)
        return jsonify(dates_to_string(json))

    return """
def fetch_reply_context(entry_id, in_reply_to, now):
    with flask_app():
        entry = Entry.query.get(entry_id)
        context = Entry.query\
            .join(Entry.feed)\
            .filter(Entry.permalink == in_reply_to, Feed.type == 'html')\
            .first()

        if not context:
            current_app.logger.info('fetching in-reply-to url: %s', in_reply_to)
            parsed = mf2util.interpret(
                mf2py.parse(url=proxy_url(in_reply_to)), in_reply_to)
            if parsed:
                context = hentry_to_entry(parsed, in_reply_to, False, now)

        if context:
            entry.reply_context.append(context)
            db.session.commit()
def fetch_context():
    url = request.args.get('url')
    if not url:
        return make_response(jsonify({
            'error': 'missing_url',
            'message': "Missing 'url' query parameter",
        }), 400)

    # TODO cache everything. check newer urls more frequently than
    # older urls. be careful not to overwrite previous good responses
    # with failure.
    url = maybe_proxy(url)
    resp = fetch(url)

    if resp.status_code // 100 != 2:
        return make_response(jsonify({
            'error': 'fetch_failed',
            'message': 'Failed to fetch resource at ' + url,
            'response': resp.text,
            'code': resp.status_code,
        }), resp.status_code)

    parsed = mf2py.parse(
        doc=resp.text if 'content-type' in resp.headers else resp.content,
        url=url)
    entry = mf2util.interpret(parsed, url, want_json=True)
    blob = {}
    if entry:
        blob['data'] = entry

    cb = request.args.get('callback')
    if cb:  # jsonp
        resp = make_response('{}({})'.format(cb, json.dumps(blob)))
        resp.headers['content-type'] = 'application/javascript; charset=utf-8'
        return resp

    return jsonify(blob)
def test_comment_and_like():
    parsed = load_test('note_with_comment_and_like')
    result = mf2util.interpret(
        parsed,
        'https://kylewm.com/2015/10/big-thing-missing-from-my-indieweb-experience-is')

    assert result['type'] == 'entry'
    assert len(result['comment']) == 1
    assert result['comment'][0]['type'] == 'cite'
    assert result['comment'][0]['author'] == {
        'name': 'Aaron Parecki',
        'photo': 'https://twitter.com/aaronpk/profile_image?size=original',
        'url': 'http://aaronparecki.com',
    }
    assert result['comment'][0]['content'] == '<a href="https://twitter.com/kylewmahan">@kylewmahan</a> I usually click through a couple levels up looking to see if any of the URLs up the chain show comments <a href="https://twitter.com/search?q=%23indieweb">#indieweb</a>'

    assert len(result['like']) == 1
    assert result['like'][0]['type'] == 'cite'
    assert result['like'][0]['author'] == {
        'name': '',
        'url': 'https://twitter.com/benwerd',
        'photo': 'https://kylewm.com/imageproxy?url=https%3A%2F%2Ftwitter.com%2Fbenwerd%2Fprofile_image%3Fsize%3Doriginal&size=48&sig=fde7ce5635f5ea132a2545ff5c7d3d33',
    }
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    if url:
        try:
            d = mf2py.parse(url=url)
            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)
            return jsonify(dates_to_string(json))
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
def test_no_p_name():
    parsed = load_test('article_no_p-name')
    result = mf2util.interpret(parsed, 'http://example.com')
    assert 'Give me crayons and I will draw a rocketship.' in result['content']
    assert 'name' not in result
def test_convert_relative_paths():
    parsed = load_test('relative_paths')
    result = mf2util.interpret(parsed, 'http://example.com/blog/', base_href='../')
    assert result['content'] == 'This is an <img alt="alt text" title="the title" src="http://example.com/static/img.jpg"/> example document with <a href="http://example.com/relative_paths.html">relative paths</a>.'
def test_convert_relative_paths():
    parsed = load_test('relative_paths')
    result = mf2util.interpret(parsed, 'http://example.com')
    assert result['content'] == 'This is an <img alt="alt text" title="the title" src="http://example.com/static/img.jpg"/> example document with <a href="http://example.com/relative_paths.html">relative paths</a>.'
def test_unusual_properties():
    parsed = load_test('unusual_properties')
    result = mf2util.interpret(parsed, 'https://example.com/')
    assert 'Rocky Raccoon' == result.get('name')
    assert 'https://foo.bar/' == result.get('url')
    assert 'https://foo.bar/' == result.get('uid')
import mf2py
import mf2util
import pprint

source_url = r'https://brid.gy/comment/twitter/desmondrivet/1117876830478852096/1118148721034891264'
target_url = r'https://desmondrivet.com/2019/04/15/20190415154611'

parsed = mf2py.Parser(url=source_url).to_dict()
comment = mf2util.interpret_comment(parsed, source_url, [target_url])
general = mf2util.interpret(parsed, source_url)

pprint.pprint(parsed)
print('-----\n')
pprint.pprint(comment)
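# Follow-up sketch (not part of the original script): pulling a few common
# fields out of an interpreted entry. The keys used here ('url', 'name',
# 'published', 'author', 'content') are the ones exercised elsewhere in this
# section; `general` and `pprint` come from the script above. `summarize` is
# a hypothetical helper name.
def summarize(entry):
    author = entry.get('author', {}) or {}
    return {
        'url': entry.get('url'),
        'title': entry.get('name'),
        'published': entry.get('published'),  # datetime object unless want_json=True
        'author': author.get('name', ''),
        'content': entry.get('content', ''),  # HTML string
    }

pprint.pprint(summarize(general))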
def json_to_object(mf2, actor=None):
  """Converts microformats2 JSON to an ActivityStreams object.

  Args:
    mf2: dict, decoded JSON microformats2 object
    actor: optional author AS actor object. usually comes from a rel="author"
      link. if mf2 has its own author, that will override this.

  Returns:
    dict, ActivityStreams object
  """
  if not mf2 or not isinstance(mf2, dict):
    return {}

  mf2 = copy.copy(mf2)
  props = mf2.setdefault('properties', {})
  prop = first_props(props)
  rsvp = prop.get('rsvp')
  rsvp_verb = 'rsvp-%s' % rsvp if rsvp else None
  author = json_to_object(prop['author']) if prop.get('author') else actor

  # maps mf2 type to ActivityStreams objectType and optional verb.
  mf2_type_to_as_type = {
    'rsvp': ('activity', rsvp_verb),
    'invite': ('activity', 'invite'),
    'repost': ('activity', 'share'),
    'like': ('activity', 'like'),
    'reply': ('comment', None),
    'person': ('person', None),
    'location': ('place', None),
    'note': ('note', None),
    'article': ('article', None),
  }

  mf2_types = mf2.get('type') or []
  if 'h-geo' in mf2_types or 'p-location' in mf2_types:
    mf2_type = 'location'
  else:
    # mf2 'photo' type is a note or article *with* a photo, but AS 'photo' type
    # *is* a photo. so, special case photo type to fall through to underlying
    # mf2 type without photo.
    # https://github.com/snarfed/bridgy/issues/702
    without_photo = copy.deepcopy(mf2)
    without_photo.get('properties', {}).pop('photo', None)
    mf2_type = mf2util.post_type_discovery(without_photo)

  as_type, as_verb = mf2_type_to_as_type.get(mf2_type, (None, None))

  def absolute_urls(prop):
    return [{'url': url} for url in get_string_urls(props.get(prop, []))
            # filter out relative and invalid URLs (mf2py gives absolute urls)
            if urlparse.urlparse(url).netloc]

  urls = props.get('url') and get_string_urls(props.get('url'))

  obj = {
    'id': prop.get('uid'),
    'objectType': as_type,
    'verb': as_verb,
    'published': prop.get('published', ''),
    'updated': prop.get('updated', ''),
    'displayName': get_text(prop.get('name')),
    'summary': get_text(prop.get('summary')),
    'content': get_html(prop.get('content')),
    'url': urls[0] if urls else None,
    'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
    'image': absolute_urls('photo'),
    'stream': absolute_urls('video'),
    'location': json_to_object(prop.get('location')),
    'replies': {'items': [json_to_object(c) for c in props.get('comment', [])]},
    'tags': [{'objectType': 'hashtag', 'displayName': cat}
             if isinstance(cat, basestring)
             else json_to_object(cat)
             for cat in props.get('category', [])],
  }

  # mf2util uses the indieweb/mf2 location algorithm to collect location properties.
  interpreted = mf2util.interpret({'items': [mf2]}, None)
  if interpreted:
    loc = interpreted.get('location')
    if loc:
      obj['location']['objectType'] = 'place'
      lat, lng = loc.get('latitude'), loc.get('longitude')
      if lat and lng:
        try:
          obj['location']['latitude'] = float(lat)
          obj['location']['longitude'] = float(lng)
          # TODO fill in 'position', maybe using Source.postprocess_object?
        except ValueError:
          logging.warn(
            'Could not convert latitude/longitude (%s, %s) to decimal',
            lat, lng)

  if as_type == 'activity':
    objects = []
    for target in itertools.chain.from_iterable(
        props.get(field, []) for field in (
          'like', 'like-of', 'repost', 'repost-of', 'in-reply-to', 'invitee')):
      t = json_to_object(target) if isinstance(target, dict) else {'url': target}
      # eliminate duplicates from redundant backcompat properties
      if t not in objects:
        objects.append(t)
    obj.update({
      'object': objects[0] if len(objects) == 1 else objects,
      'actor': author,
    })
  else:
    obj.update({
      'inReplyTo': [{'url': url} for url in get_string_urls(props.get('in-reply-to', []))],
      'author': author,
    })

  return util.trim_nulls(obj)
def json_to_object(mf2, actor=None, fetch_mf2=False):
  """Converts microformats2 JSON to an ActivityStreams object.

  Args:
    mf2: dict, decoded JSON microformats2 object
    actor: optional author AS actor object. usually comes from a rel="author"
      link. if mf2 has its own author, that will override this.
    fetch_mf2: boolean, whether to fetch additional pages via HTTP if necessary,
      e.g. to determine authorship: https://indieweb.org/authorship

  Returns:
    dict, ActivityStreams object
  """
  if not mf2 or not isinstance(mf2, dict):
    return {}

  mf2 = copy.copy(mf2)
  props = mf2.setdefault('properties', {})
  prop = first_props(props)
  rsvp = prop.get('rsvp')

  # convert author
  mf2_author = prop.get('author')
  if mf2_author and isinstance(mf2_author, dict):
    author = json_to_object(mf2_author)
  else:
    # the author h-card may be on another page. run full authorship algorithm:
    # https://indieweb.org/authorship
    def fetch(url):
      return mf2py.parse(util.requests_get(url).text, url=url)
    author = mf2util.find_author(
      {'items': [mf2]}, hentry=mf2,
      fetch_mf2_func=fetch if fetch_mf2 else None)
    if author:
      author = {
        'objectType': 'person',
        'url': author.get('url'),
        'displayName': author.get('name'),
        'image': [{'url': author.get('photo')}],
      }

  if not author:
    author = actor

  mf2_types = mf2.get('type') or []
  if 'h-geo' in mf2_types or 'p-location' in mf2_types:
    mf2_type = 'location'
  else:
    # mf2 'photo' type is a note or article *with* a photo, but AS 'photo' type
    # *is* a photo. so, special case photo type to fall through to underlying
    # mf2 type without photo.
    # https://github.com/snarfed/bridgy/issues/702
    without_photo = copy.deepcopy(mf2)
    without_photo.get('properties', {}).pop('photo', None)
    mf2_type = mf2util.post_type_discovery(without_photo)

  as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
  if rsvp:
    as_verb = 'rsvp-%s' % rsvp

  # special case GitHub issues that are in-reply-to the repo or its issues URL
  in_reply_tos = get_string_urls(props.get('in-reply-to', []))
  for url in in_reply_tos:
    if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
      as_type = 'issue'

  def absolute_urls(prop):
    return [url for url in get_string_urls(props.get(prop, []))
            # filter out relative and invalid URLs (mf2py gives absolute urls)
            if urllib.parse.urlparse(url).netloc]

  urls = props.get('url') and get_string_urls(props.get('url'))

  # quotations: https://indieweb.org/quotation#How_to_markup
  attachments = [
    json_to_object(quote)
    for quote in mf2.get('children', []) + props.get('quotation-of', [])
    if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

  # audio and video
  for type in 'audio', 'video':
    attachments.extend({'objectType': type, 'stream': {'url': url}}
                       for url in get_string_urls(props.get(type, [])))

  obj = {
    'id': prop.get('uid'),
    'objectType': as_type,
    'verb': as_verb,
    'published': prop.get('published', ''),
    'updated': prop.get('updated', ''),
    'startTime': prop.get('start'),
    'endTime': prop.get('end'),
    'displayName': get_text(prop.get('name')),
    'summary': get_text(prop.get('summary')),
    'content': get_html(prop.get('content')),
    'url': urls[0] if urls else None,
    'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
    'image': [{'url': url} for url in dedupe_urls(
      absolute_urls('photo') + absolute_urls('featured'))],
    'stream': [{'url': url} for url in absolute_urls('video')],
    'location': json_to_object(prop.get('location')),
    'replies': {'items': [json_to_object(c) for c in props.get('comment', [])]},
    'tags': [{'objectType': 'hashtag', 'displayName': cat}
             if isinstance(cat, basestring)
             else json_to_object(cat)
             for cat in props.get('category', [])],
    'attachments': attachments,
  }

  # mf2util uses the indieweb/mf2 location algorithm to collect location properties.
  interpreted = mf2util.interpret({'items': [mf2]}, None)
  if interpreted:
    loc = interpreted.get('location')
    if loc:
      obj['location']['objectType'] = 'place'
      lat, lng = loc.get('latitude'), loc.get('longitude')
      if lat and lng:
        try:
          obj['location'].update({
            'latitude': float(lat),
            'longitude': float(lng),
          })
        except ValueError:
          logging.warn(
            'Could not convert latitude/longitude (%s, %s) to decimal',
            lat, lng)

  if as_type == 'activity':
    objects = []
    for target in itertools.chain.from_iterable(
        props.get(field, []) for field in (
          'like', 'like-of', 'repost', 'repost-of', 'in-reply-to', 'invitee')):
      t = json_to_object(target) if isinstance(target, dict) else {'url': target}
      # eliminate duplicates from redundant backcompat properties
      if t not in objects:
        objects.append(t)
    obj.update({
      'object': objects[0] if len(objects) == 1 else objects,
      'actor': author,
    })
  else:
    obj.update({
      'inReplyTo': [{'url': url} for url in in_reply_tos],
      'author': author,
    })

  return source.Source.postprocess_object(obj)
def json_to_object(mf2, actor=None, fetch_mf2=False):
  """Converts a single microformats2 JSON item to an ActivityStreams object.

  Supports h-entry, h-event, h-card, and other single item times. Does *not*
  yet support h-feed.

  Args:
    mf2: dict, decoded JSON microformats2 object
    actor: optional author AS actor object. usually comes from a rel="author"
      link. if mf2 has its own author, that will override this.
    fetch_mf2: boolean, whether to fetch additional pages via HTTP if necessary,
      e.g. to determine authorship: https://indieweb.org/authorship

  Returns:
    dict, ActivityStreams object
  """
  if not mf2 or not isinstance(mf2, dict):
    return {}

  mf2 = copy.copy(mf2)
  props = mf2.setdefault('properties', {})
  prop = first_props(props)
  rsvp = prop.get('rsvp')

  # convert author
  mf2_author = prop.get('author')
  if mf2_author and isinstance(mf2_author, dict):
    author = json_to_object(mf2_author)
  else:
    # the author h-card may be on another page. run full authorship algorithm:
    # https://indieweb.org/authorship
    author = mf2util.find_author(
      {'items': [mf2]}, hentry=mf2,
      fetch_mf2_func=util.fetch_mf2 if fetch_mf2 else None)
    if author:
      author = {
        'objectType': 'person',
        'url': author.get('url'),
        'displayName': author.get('name'),
        'image': [{'url': author.get('photo')}],
      }

  if not author:
    author = actor

  mf2_types = mf2.get('type') or []
  if 'h-geo' in mf2_types or 'p-location' in mf2_types:
    mf2_type = 'location'
  elif 'tag-of' in props:
    # TODO: remove once this is in mf2util
    # https://github.com/kylewm/mf2util/issues/18
    mf2_type = 'tag'
  elif 'follow-of' in props:  # ditto
    mf2_type = 'follow'
  else:
    # mf2 'photo' type is a note or article *with* a photo, but AS 'photo' type
    # *is* a photo. so, special case photo type to fall through to underlying
    # mf2 type without photo.
    # https://github.com/snarfed/bridgy/issues/702
    without_photo = copy.deepcopy(mf2)
    without_photo.get('properties', {}).pop('photo', None)
    mf2_type = mf2util.post_type_discovery(without_photo)

  as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
  if rsvp:
    as_verb = 'rsvp-%s' % rsvp

  # special case GitHub issues that are in-reply-to the repo or its issues URL
  in_reply_tos = get_string_urls(props.get('in-reply-to', []))
  for url in in_reply_tos:
    if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
      as_type = 'issue'

  def is_absolute(url):
    """Filter out relative and invalid URLs (mf2py gives absolute urls)."""
    return urllib.parse.urlparse(url).netloc

  urls = props.get('url') and get_string_urls(props.get('url'))

  # quotations: https://indieweb.org/quotation#How_to_markup
  attachments = [
    json_to_object(quote)
    for quote in mf2.get('children', []) + props.get('quotation-of', [])
    if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

  # audio and video
  #
  # the duration mf2 property is still emerging. examples in the wild use both
  # integer seconds and ISO 8601 durations.
  # https://indieweb.org/duration
  # https://en.wikipedia.org/wiki/ISO_8601#Durations
  duration = prop.get('duration') or prop.get('length')
  if duration:
    if util.is_int(duration):
      duration = int(duration)
    else:
      parsed = util.parse_iso8601_duration(duration)
      if parsed:
        duration = int(parsed.total_seconds())
      else:
        logging.debug('Unknown format for length or duration %r', duration)
        duration = None

  stream = None
  bytes = size_to_bytes(prop.get('size'))
  for type in 'audio', 'video':
    atts = [{
      'objectType': type,
      'stream': {
        'url': url,
        # integer seconds: http://activitystrea.ms/specs/json/1.0/#media-link
        'duration': duration,
        # file size in bytes. nonstandard, not in AS1 or AS2
        'size': bytes,
      },
    } for url in get_string_urls(props.get(type, []))]
    attachments.extend(atts)
    if atts:
      stream = atts[0]['stream']

  obj = {
    'id': prop.get('uid'),
    'objectType': as_type,
    'verb': as_verb,
    'published': prop.get('published', ''),
    'updated': prop.get('updated', ''),
    'startTime': prop.get('start'),
    'endTime': prop.get('end'),
    'displayName': get_text(prop.get('name')),
    'summary': get_text(prop.get('summary')),
    'content': get_html(prop.get('content')),
    'url': urls[0] if urls else None,
    'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
    # image is special cased below, to handle alt
    'stream': [stream],
    'location': json_to_object(prop.get('location')),
    'replies': {'items': [json_to_object(c) for c in props.get('comment', [])]},
    'tags': [{'objectType': 'hashtag', 'displayName': cat}
             if isinstance(cat, str)
             else json_to_object(cat)
             for cat in props.get('category', [])],
    'attachments': attachments,
  }

  # images, including alt text
  photo_urls = set()
  obj['image'] = []
  for photo in props.get('photo', []) + props.get('featured', []):
    url = photo
    alt = None
    if isinstance(photo, dict):
      photo = photo.get('properties') or photo
      url = get_first(photo, 'value') or get_first(photo, 'url')
      alt = get_first(photo, 'alt')
    if url and url not in photo_urls and is_absolute(url):
      photo_urls.add(url)
      obj['image'].append({'url': url, 'displayName': alt})

  # mf2util uses the indieweb/mf2 location algorithm to collect location properties.
  interpreted = mf2util.interpret({'items': [mf2]}, None)
  if interpreted:
    loc = interpreted.get('location')
    if loc:
      obj['location']['objectType'] = 'place'
      lat, lng = loc.get('latitude'), loc.get('longitude')
      if lat and lng:
        try:
          obj['location'].update({
            'latitude': float(lat),
            'longitude': float(lng),
          })
        except ValueError:
          logging.debug(
            'Could not convert latitude/longitude (%s, %s) to decimal',
            lat, lng)

  if as_type == 'activity':
    objects = []
    for target in itertools.chain.from_iterable(
        props.get(field, []) for field in (
          'follow-of', 'like', 'like-of', 'repost', 'repost-of',
          'in-reply-to', 'invitee')):
      t = json_to_object(target) if isinstance(target, dict) else {'url': target}
      # eliminate duplicates from redundant backcompat properties
      if t not in objects:
        objects.append(t)
    obj.update({
      'object': objects[0] if len(objects) == 1 else objects,
      'actor': author,
    })
    if as_verb == 'tag':
      obj['target'] = {'url': prop['tag-of']}
      if obj.get('object'):
        raise NotImplementedError(
          'Combined in-reply-to and tag-of is not yet supported.')
      obj['object'] = obj.pop('tags')
  else:
    obj.update({
      'inReplyTo': [{'url': url} for url in in_reply_tos],
      'author': author,
    })

  return source.Source.postprocess_object(obj)
def json_to_object(mf2, actor=None, fetch_mf2=False):
  """Converts a single microformats2 JSON item to an ActivityStreams object.

  Supports h-entry, h-event, h-card, and other single item times. Does *not*
  yet support h-feed.

  Args:
    mf2: dict, decoded JSON microformats2 object
    actor: optional author AS actor object. usually comes from a rel="author"
      link. if mf2 has its own author, that will override this.
    fetch_mf2: boolean, whether to fetch additional pages via HTTP if necessary,
      e.g. to determine authorship: https://indieweb.org/authorship

  Returns:
    dict, ActivityStreams object
  """
  if not mf2 or not isinstance(mf2, dict):
    return {}

  mf2 = copy.copy(mf2)
  props = mf2.setdefault('properties', {})
  prop = first_props(props)
  rsvp = prop.get('rsvp')

  # convert author
  mf2_author = prop.get('author')
  if mf2_author and isinstance(mf2_author, dict):
    author = json_to_object(mf2_author)
  else:
    # the author h-card may be on another page. run full authorship algorithm:
    # https://indieweb.org/authorship
    def fetch(url):
      return mf2py.parse(util.requests_get(url).text, url=url, img_with_alt=True)
    author = mf2util.find_author(
      {'items': [mf2]}, hentry=mf2,
      fetch_mf2_func=fetch if fetch_mf2 else None)
    if author:
      author = {
        'objectType': 'person',
        'url': author.get('url'),
        'displayName': author.get('name'),
        'image': [{'url': author.get('photo')}],
      }

  if not author:
    author = actor

  mf2_types = mf2.get('type') or []
  if 'h-geo' in mf2_types or 'p-location' in mf2_types:
    mf2_type = 'location'
  elif 'tag-of' in props:
    # TODO: remove once this is in mf2util
    # https://github.com/kylewm/mf2util/issues/18
    mf2_type = 'tag'
  elif 'follow-of' in props:  # ditto
    mf2_type = 'follow'
  else:
    # mf2 'photo' type is a note or article *with* a photo, but AS 'photo' type
    # *is* a photo. so, special case photo type to fall through to underlying
    # mf2 type without photo.
    # https://github.com/snarfed/bridgy/issues/702
    without_photo = copy.deepcopy(mf2)
    without_photo.get('properties', {}).pop('photo', None)
    mf2_type = mf2util.post_type_discovery(without_photo)

  as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
  if rsvp:
    as_verb = 'rsvp-%s' % rsvp

  # special case GitHub issues that are in-reply-to the repo or its issues URL
  in_reply_tos = get_string_urls(props.get('in-reply-to', []))
  for url in in_reply_tos:
    if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
      as_type = 'issue'

  def is_absolute(url):
    """Filter out relative and invalid URLs (mf2py gives absolute urls)."""
    return urllib.parse.urlparse(url).netloc

  urls = props.get('url') and get_string_urls(props.get('url'))

  # quotations: https://indieweb.org/quotation#How_to_markup
  attachments = [
    json_to_object(quote)
    for quote in mf2.get('children', []) + props.get('quotation-of', [])
    if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

  # audio and video
  for type in 'audio', 'video':
    attachments.extend({'objectType': type, 'stream': {'url': url}}
                       for url in get_string_urls(props.get(type, [])))

  obj = {
    'id': prop.get('uid'),
    'objectType': as_type,
    'verb': as_verb,
    'published': prop.get('published', ''),
    'updated': prop.get('updated', ''),
    'startTime': prop.get('start'),
    'endTime': prop.get('end'),
    'displayName': get_text(prop.get('name')),
    'summary': get_text(prop.get('summary')),
    'content': get_html(prop.get('content')),
    'url': urls[0] if urls else None,
    'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
    # image is special cased below, to handle alt
    'stream': [{'url': url} for url in get_string_urls(props.get('video', []))],
    'location': json_to_object(prop.get('location')),
    'replies': {'items': [json_to_object(c) for c in props.get('comment', [])]},
    'tags': [{'objectType': 'hashtag', 'displayName': cat}
             if isinstance(cat, basestring)
             else json_to_object(cat)
             for cat in props.get('category', [])],
    'attachments': attachments,
  }

  # images, including alt text
  photo_urls = set()
  obj['image'] = []
  for photo in props.get('photo', []) + props.get('featured', []):
    url = photo
    alt = None
    if isinstance(photo, dict):
      photo = photo.get('properties') or photo
      url = get_first(photo, 'value') or get_first(photo, 'url')
      alt = get_first(photo, 'alt')
    if url and url not in photo_urls and is_absolute(url):
      photo_urls.add(url)
      obj['image'].append({'url': url, 'displayName': alt})

  # mf2util uses the indieweb/mf2 location algorithm to collect location properties.
  interpreted = mf2util.interpret({'items': [mf2]}, None)
  if interpreted:
    loc = interpreted.get('location')
    if loc:
      obj['location']['objectType'] = 'place'
      lat, lng = loc.get('latitude'), loc.get('longitude')
      if lat and lng:
        try:
          obj['location'].update({
            'latitude': float(lat),
            'longitude': float(lng),
          })
        except ValueError:
          logging.warn(
            'Could not convert latitude/longitude (%s, %s) to decimal',
            lat, lng)

  if as_type == 'activity':
    objects = []
    for target in itertools.chain.from_iterable(
        props.get(field, []) for field in (
          'follow-of', 'like', 'like-of', 'repost', 'repost-of',
          'in-reply-to', 'invitee')):
      t = json_to_object(target) if isinstance(target, dict) else {'url': target}
      # eliminate duplicates from redundant backcompat properties
      if t not in objects:
        objects.append(t)
    obj.update({
      'object': objects[0] if len(objects) == 1 else objects,
      'actor': author,
    })
    if as_verb == 'tag':
      obj['target'] = {'url': prop['tag-of']}
      if obj.get('object'):
        raise NotImplementedError(
          'Combined in-reply-to and tag-of is not yet supported.')
      obj['object'] = obj.pop('tags')
  else:
    obj.update({
      'inReplyTo': [{'url': url} for url in in_reply_tos],
      'author': author,
    })

  return source.Source.postprocess_object(obj)
def create_dcontext(url):
    repost_preview = None

    # youtube embeds
    m = YOUTUBE_RE.match(url)
    if m:
        repost_preview = (
            """<iframe width="560" height="315" """
            """src="//www.youtube.com/embed/{}" frameborder="0" """
            """allowfullscreen></iframe>"""
            .format(m.group(1)))

    # instagram embeds
    m = INSTAGRAM_RE.match(url)
    if m:
        repost_preview = (
            """<iframe src="//instagram.com/p/{}/embed/" """
            """width="400" height="500" frameborder="0" scrolling="no" """
            """allowtransparency="true"></iframe>"""
            .format(m.group(1)))

    blob = archiver.load_json_from_archive(url)
    if blob:
        try:
            entry = mf2util.interpret(blob, url)
            pub_date = entry.get('published')

            content = entry.get('content', '')
            content_plain = format_as_text(content)

            if len(content_plain) < 512:
                content = bleach.clean(autolink(content), strip=True)
            else:
                content = (
                    jinja2.filters.do_truncate(content_plain, 512) +
                    ' <a class="u-url" href="{}">continued</a>'.format(url))

            title = entry.get('name', 'a post')
            if len(title) > 256:
                title = jinja2.filters.do_truncate(title, 256)

            author_name = bleach.clean(entry.get('author', {}).get('name', ''))
            author_image = entry.get('author', {}).get('photo')
            if author_image:
                author_image = local_mirror_resource(author_image)

            return DContext(
                url=url,
                permalink=entry.get('url', url),
                author_name=author_name,
                author_url=entry.get('author', {}).get('url', ''),
                author_image=author_image or url_for(
                    'static', filename=AUTHOR_PLACEHOLDER),
                content=content,
                repost_preview=repost_preview,
                pub_date=pub_date,
                pub_date_iso=isotime_filter(pub_date),
                pub_date_human=human_time(pub_date),
                title=title,
                deleted=False,
            )
        except:
            app.logger.exception('error interpreting %s', url)

    return DContext(
        url=url,
        permalink=url,
        author_name=None,
        author_url=None,
        author_image=None,
        content=None,
        repost_preview=repost_preview,
        pub_date=None,
        pub_date_iso=None,
        pub_date_human=None,
        title='a post',
        deleted=False,
    )
def interpret_entry(
    parsed,
    source_url,
    base_href=None,
    hentry=None,
    use_rel_syndication=True,
    want_json=False,
    fetch_mf2_func=None,
):
    """Given a document containing an h-entry, return a dictionary.

        {'type': 'entry',
         'url': permalink of the document (may be different than source_url),
         'published': datetime or date,
         'updated': datetime or date,
         'name': title of the entry,
         'content': body of entry (contains HTML),
         'author': {'name': author name,
                    'url': author url,
                    'photo': author photo},
         'syndication': ['syndication url', ...],
         'in-reply-to': [...],
         'like-of': [...],
         'repost-of': [...]}

    :param dict parsed: the result of parsing a document containing mf2 markup
    :param str source_url: the URL of the parsed document, used by the
      authorship algorithm
    :param str base_href: (optional) the href value of the base tag
    :param dict hentry: (optional) the item in the above document representing
      the h-entry. if provided, we can avoid a redundant call to
      find_first_entry
    :param boolean use_rel_syndication: (optional, default True) Whether to
      include rel=syndication in the list of syndication sources. Sometimes
      useful to set this to False when parsing h-feeds that erroneously include
      rel=syndication on each entry.
    :param boolean want_json: (optional, default False) if true, the result
      will be pure json with datetimes as strings instead of python objects
    :param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
      output for a given URL.
    :return: a dict with some or all of the described properties
    """
    # find the h-entry if it wasn't provided
    if not hentry:
        hentry = util.find_first_entry(parsed, ["h-entry"])
        if not hentry:
            return {}

    result = _interpret_common_properties(
        parsed,
        source_url,
        base_href,
        hentry,
        use_rel_syndication,
        want_json,
        fetch_mf2_func,
    )

    if "h-cite" in hentry.get("type", []):
        result["type"] = "cite"
    else:
        result["type"] = "entry"

    # NOTE patch start
    if "category" in hentry["properties"]:
        result["category"] = hentry["properties"]["category"]

    if "pubkey" in hentry["properties"]:
        result["pubkey"] = hentry["properties"]["pubkey"]

    if "vote" in hentry["properties"]:
        result["vote"] = hentry["properties"]["vote"]
    # NOTE patch end

    title = util.get_plain_text(hentry["properties"].get("name"))
    if title and util.is_name_a_title(title, result.get("content-plain")):
        result["name"] = title

    for prop in (
        "in-reply-to",
        "like-of",
        "repost-of",
        "bookmark-of",
        "vote-on",
        "comment",
        "like",
        "repost",
    ):  # NOTE added vote-on
        for url_val in hentry["properties"].get(prop, []):
            if isinstance(url_val, dict):
                result.setdefault(prop, []).append(
                    util.interpret(
                        parsed,
                        source_url,
                        base_href,
                        url_val,
                        use_rel_syndication=False,
                        want_json=want_json,
                        fetch_mf2_func=fetch_mf2_func,
                    )
                )
            else:
                result.setdefault(prop, []).append(
                    {
                        "url": url_val,
                    }
                )

    return result
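# Hypothetical usage of the patched interpret_entry above (a sketch, not part
# of the original snippet): it assumes mf2py is installed, that `util` refers
# to mf2util's util module as in the snippet, and uses a placeholder URL.
import mf2py

parsed = mf2py.parse(url='https://example.com/some-post')
entry = interpret_entry(parsed, 'https://example.com/some-post')

print(entry.get('type'))      # 'entry' or 'cite'
print(entry.get('vote'))      # surfaced by the patch when the h-entry has a vote property
print(entry.get('category'))  # raw category values, also surfaced by the patch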