def test_p_author_string():
    """An author given as a plain string comes back as a dict holding only a name."""
    entry = {'type': ['h-entry'], 'properties': {'author': ['John Doe']}}
    parsed = {'items': [entry]}
    expected = {'name': 'John Doe'}
    assert mf2util.find_author(parsed) == expected
def test_p_author_string():
    """find_author() turns a bare author string into a dict with just a name."""
    properties = {'author': ['John Doe']}
    item = {'type': ['h-entry'], 'properties': properties}
    result = mf2util.find_author({'items': [item]})
    assert result == {'name': 'John Doe'}
def find_author(parsed, **kwargs):
    """Returns the author of a page as an ActivityStreams actor dict.

    Args:
      parsed: return value from mf2py.parse()
      kwargs: passed through to mf2util.find_author()

    Returns:
      dict with 'displayName', 'url' and 'image' keys, or None if
      mf2util.find_author() does not return an author
    """
    author = mf2util.find_author(parsed, 'http://123', **kwargs)
    if not author:
        return None

    # photo may be a plain URL string or a dict (e.g. an image parsed together
    # with its alt text); in the latter case pull the URL out of the dict.
    photo = author.get('photo')
    if isinstance(photo, dict):
        photo = photo.get('url') or photo.get('value')

    return {
        'displayName': author.get('name'),
        'url': author.get('url'),
        'image': {'url': photo},
    }
def load_test(testname, hentry_func=None):
    """Runs mf2util.find_author() against a fixture in tests/authorship/.

    Args:
      testname: fixture file name, relative to tests/authorship/
      hentry_func: optional callable. It is given the parsed fixture and its
        return value is passed to find_author() as hentry.

    Returns:
      whatever mf2util.find_author() returns
    """
    base = 'http://example.com/'

    def fetch_mf2(url):
        # map a fake example.com URL back onto the local fixture file
        name = url[len(base):] if url.startswith(base) else url
        with open('tests/authorship/' + name) as f:
            return mf2py.parse(url=url, doc=f.read())

    url = base + testname
    parsed = fetch_mf2(url)

    hentry = hentry_func
    if hentry_func:
        hentry = hentry_func(parsed)

    return mf2util.find_author(
        parsed, url, hentry=hentry, fetch_mf2_func=fetch_mf2)
def find_author(parsed, **kwargs):
    """Looks up a page's author and reshapes it as an ActivityStreams actor.

    Args:
      parsed: dict, parsed mf2 object (ie return value from mf2py.parse())
      kwargs: passed through to mf2util.find_author()

    Returns:
      dict with 'displayName', 'url' and 'image' keys, or None if
      mf2util.find_author() does not return an author
    """
    author = mf2util.find_author(parsed, 'http://123', **kwargs)
    if not author:
        return None

    # the photo is either a URL or a dict carrying the URL under url/value
    image_url = author.get('photo')
    if isinstance(image_url, dict):
        image_url = image_url.get('url') or image_url.get('value')

    name = author.get('name')
    profile_url = author.get('url')
    return {
        'displayName': name,
        'url': profile_url,
        'image': {'url': image_url},
    }
def load_test(testname, hentry_func=None):
    """Parses an authorship fixture and returns the author found in it.

    Args:
      testname: name of a file inside tests/authorship/
      hentry_func: optional callable applied to the parsed fixture; its result
        is forwarded to mf2util.find_author() as hentry

    Returns:
      the return value of mf2util.find_author()
    """
    url_prefix = 'http://example.com/'

    def fetch_mf2(url):
        # fixtures are addressed by fake example.com URLs; strip the prefix to
        # get the file name on disk
        filename = url
        if filename.startswith(url_prefix):
            filename = filename[len(url_prefix):]
        path = 'tests/authorship/' + filename
        with open(path) as fixture:
            contents = fixture.read()
        return mf2py.parse(url=url, doc=contents)

    page_url = url_prefix + testname
    parsed = fetch_mf2(page_url)
    hentry = hentry_func and hentry_func(parsed)
    return mf2util.find_author(parsed, page_url, hentry=hentry,
                               fetch_mf2_func=fetch_mf2)
def json_to_object(mf2, actor=None, fetch_mf2=False):
    """Converts microformats2 JSON to an ActivityStreams object.

    Args:
      mf2: dict, decoded JSON microformats2 object
      actor: optional author AS actor object. usually comes from a rel="author"
        link. if mf2 has its own author, that will override this.
      fetch_mf2: boolean, whether to fetch additional pages via HTTP if
        necessary, e.g. to determine authorship:
        https://indieweb.org/authorship

    Returns:
      dict, ActivityStreams object. Empty dict if mf2 is empty or not a dict.
    """
    if not mf2 or not isinstance(mf2, dict):
        return {}

    # shallow copy so that adding a missing 'properties' key doesn't touch the
    # caller's dict
    mf2 = copy.copy(mf2)
    props = mf2.setdefault('properties', {})
    prop = first_props(props)
    rsvp = prop.get('rsvp')

    # convert author
    mf2_author = prop.get('author')
    if mf2_author and isinstance(mf2_author, dict):
        author = json_to_object(mf2_author)
    else:
        # the author h-card may be on another page. run full authorship
        # algorithm: https://indieweb.org/authorship
        def fetch(url):
            return mf2py.parse(util.requests_get(url).text, url=url)

        author = mf2util.find_author(
            {'items': [mf2]}, hentry=mf2,
            fetch_mf2_func=fetch if fetch_mf2 else None)
        if author:
            author = {
                'objectType': 'person',
                'url': author.get('url'),
                'displayName': author.get('name'),
                'image': [{'url': author.get('photo')}],
            }

    if not author:
        author = actor

    mf2_types = mf2.get('type') or []
    if 'h-geo' in mf2_types or 'p-location' in mf2_types:
        mf2_type = 'location'
    else:
        # mf2 'photo' type is a note or article *with* a photo, but AS 'photo'
        # type *is* a photo. so, special case photo type to fall through to
        # underlying mf2 type without photo.
        # https://github.com/snarfed/bridgy/issues/702
        without_photo = copy.deepcopy(mf2)
        without_photo.get('properties', {}).pop('photo', None)
        mf2_type = mf2util.post_type_discovery(without_photo)

    as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
    if rsvp:
        as_verb = 'rsvp-%s' % rsvp

    # special case GitHub issues that are in-reply-to the repo or its issues URL
    in_reply_tos = get_string_urls(props.get('in-reply-to', []))
    for url in in_reply_tos:
        if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
            as_type = 'issue'

    def absolute_urls(name):
        # filter out relative and invalid URLs (mf2py gives absolute urls)
        return [url for url in get_string_urls(props.get(name, []))
                if urllib.parse.urlparse(url).netloc]

    urls = props.get('url') and get_string_urls(props.get('url'))

    # quotations: https://indieweb.org/quotation#How_to_markup
    attachments = [
        json_to_object(quote)
        for quote in mf2.get('children', []) + props.get('quotation-of', [])
        if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

    # audio and video
    for media_type in 'audio', 'video':
        attachments.extend(
            {'objectType': media_type, 'stream': {'url': url}}
            for url in get_string_urls(props.get(media_type, [])))

    obj = {
        'id': prop.get('uid'),
        'objectType': as_type,
        'verb': as_verb,
        'published': prop.get('published', ''),
        'updated': prop.get('updated', ''),
        'startTime': prop.get('start'),
        'endTime': prop.get('end'),
        'displayName': get_text(prop.get('name')),
        'summary': get_text(prop.get('summary')),
        'content': get_html(prop.get('content')),
        'url': urls[0] if urls else None,
        'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
        'image': [{'url': url} for url in dedupe_urls(
            absolute_urls('photo') + absolute_urls('featured'))],
        'stream': [{'url': url} for url in absolute_urls('video')],
        'location': json_to_object(prop.get('location')),
        'replies': {
            'items': [json_to_object(c) for c in props.get('comment', [])],
        },
        # NOTE(review): basestring is presumably supplied by a py2/3
        # compatibility import elsewhere in this file - confirm.
        'tags': [{'objectType': 'hashtag', 'displayName': cat}
                 if isinstance(cat, basestring)
                 else json_to_object(cat)
                 for cat in props.get('category', [])],
        'attachments': attachments,
    }

    # mf2util uses the indieweb/mf2 location algorithm to collect location
    # properties.
    interpreted = mf2util.interpret({'items': [mf2]}, None)
    if interpreted:
        loc = interpreted.get('location')
        if loc:
            obj['location']['objectType'] = 'place'
            lat, lng = loc.get('latitude'), loc.get('longitude')
            if lat and lng:
                try:
                    obj['location'].update({
                        'latitude': float(lat),
                        'longitude': float(lng),
                    })
                except ValueError:
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning(
                        'Could not convert latitude/longitude (%s, %s) to decimal',
                        lat, lng)

    if as_type == 'activity':
        objects = []
        for target in itertools.chain.from_iterable(
                props.get(field, []) for field in (
                    'like', 'like-of', 'repost', 'repost-of', 'in-reply-to',
                    'invitee')):
            t = (json_to_object(target) if isinstance(target, dict)
                 else {'url': target})
            # eliminate duplicates from redundant backcompat properties
            if t not in objects:
                objects.append(t)
        obj.update({
            'object': objects[0] if len(objects) == 1 else objects,
            'actor': author,
        })
    else:
        obj.update({
            'inReplyTo': [{'url': url} for url in in_reply_tos],
            'author': author,
        })

    return source.Source.postprocess_object(obj)
def json_to_object(mf2, actor=None, fetch_mf2=False):
    """Converts a single microformats2 JSON item to an ActivityStreams object.

    Supports h-entry, h-event, h-card, and other single item types. Does *not*
    yet support h-feed.

    Args:
      mf2: dict, decoded JSON microformats2 object
      actor: optional author AS actor object. usually comes from a rel="author"
        link. if mf2 has its own author, that will override this.
      fetch_mf2: boolean, whether to fetch additional pages via HTTP if
        necessary, e.g. to determine authorship:
        https://indieweb.org/authorship

    Returns:
      dict, ActivityStreams object. Empty dict if mf2 is empty or not a dict.

    Raises:
      NotImplementedError: if the item is a tag activity that also has another
        activity target, e.g. in-reply-to
    """
    if not mf2 or not isinstance(mf2, dict):
        return {}

    # shallow copy so that adding a missing 'properties' key doesn't touch the
    # caller's dict
    mf2 = copy.copy(mf2)
    props = mf2.setdefault('properties', {})
    prop = first_props(props)
    rsvp = prop.get('rsvp')

    # convert author
    mf2_author = prop.get('author')
    if mf2_author and isinstance(mf2_author, dict):
        author = json_to_object(mf2_author)
    else:
        # the author h-card may be on another page. run full authorship
        # algorithm: https://indieweb.org/authorship
        def fetch(url):
            return mf2py.parse(util.requests_get(url).text, url=url,
                               img_with_alt=True)

        author = mf2util.find_author(
            {'items': [mf2]}, hentry=mf2,
            fetch_mf2_func=fetch if fetch_mf2 else None)
        if author:
            author = {
                'objectType': 'person',
                'url': author.get('url'),
                'displayName': author.get('name'),
                'image': [{'url': author.get('photo')}],
            }

    if not author:
        author = actor

    mf2_types = mf2.get('type') or []
    if 'h-geo' in mf2_types or 'p-location' in mf2_types:
        mf2_type = 'location'
    elif 'tag-of' in props:
        # TODO: remove once this is in mf2util
        # https://github.com/kylewm/mf2util/issues/18
        mf2_type = 'tag'
    elif 'follow-of' in props:
        # ditto
        mf2_type = 'follow'
    else:
        # mf2 'photo' type is a note or article *with* a photo, but AS 'photo'
        # type *is* a photo. so, special case photo type to fall through to
        # underlying mf2 type without photo.
        # https://github.com/snarfed/bridgy/issues/702
        without_photo = copy.deepcopy(mf2)
        without_photo.get('properties', {}).pop('photo', None)
        mf2_type = mf2util.post_type_discovery(without_photo)

    as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
    if rsvp:
        as_verb = 'rsvp-%s' % rsvp

    # special case GitHub issues that are in-reply-to the repo or its issues URL
    in_reply_tos = get_string_urls(props.get('in-reply-to', []))
    for url in in_reply_tos:
        if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
            as_type = 'issue'

    def is_absolute(url):
        """Filter out relative and invalid URLs (mf2py gives absolute urls)."""
        return urllib.parse.urlparse(url).netloc

    urls = props.get('url') and get_string_urls(props.get('url'))

    # quotations: https://indieweb.org/quotation#How_to_markup
    attachments = [
        json_to_object(quote)
        for quote in mf2.get('children', []) + props.get('quotation-of', [])
        if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

    # audio and video
    for media_type in 'audio', 'video':
        attachments.extend(
            {'objectType': media_type, 'stream': {'url': url}}
            for url in get_string_urls(props.get(media_type, [])))

    obj = {
        'id': prop.get('uid'),
        'objectType': as_type,
        'verb': as_verb,
        'published': prop.get('published', ''),
        'updated': prop.get('updated', ''),
        'startTime': prop.get('start'),
        'endTime': prop.get('end'),
        'displayName': get_text(prop.get('name')),
        'summary': get_text(prop.get('summary')),
        'content': get_html(prop.get('content')),
        'url': urls[0] if urls else None,
        'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
        # image is special cased below, to handle alt
        'stream': [{'url': url}
                   for url in get_string_urls(props.get('video', []))],
        'location': json_to_object(prop.get('location')),
        'replies': {
            'items': [json_to_object(c) for c in props.get('comment', [])],
        },
        # NOTE(review): basestring is presumably supplied by a py2/3
        # compatibility import elsewhere in this file - confirm.
        'tags': [{'objectType': 'hashtag', 'displayName': cat}
                 if isinstance(cat, basestring)
                 else json_to_object(cat)
                 for cat in props.get('category', [])],
        'attachments': attachments,
    }

    # images, including alt text
    photo_urls = set()
    obj['image'] = []
    for photo in props.get('photo', []) + props.get('featured', []):
        url = photo
        alt = None
        if isinstance(photo, dict):
            # either a nested mf2 item (with 'properties') or a flat dict
            photo = photo.get('properties') or photo
            url = get_first(photo, 'value') or get_first(photo, 'url')
            alt = get_first(photo, 'alt')
        if url and url not in photo_urls and is_absolute(url):
            photo_urls.add(url)
            obj['image'].append({'url': url, 'displayName': alt})

    # mf2util uses the indieweb/mf2 location algorithm to collect location
    # properties.
    interpreted = mf2util.interpret({'items': [mf2]}, None)
    if interpreted:
        loc = interpreted.get('location')
        if loc:
            obj['location']['objectType'] = 'place'
            lat, lng = loc.get('latitude'), loc.get('longitude')
            if lat and lng:
                try:
                    obj['location'].update({
                        'latitude': float(lat),
                        'longitude': float(lng),
                    })
                except ValueError:
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning(
                        'Could not convert latitude/longitude (%s, %s) to decimal',
                        lat, lng)

    if as_type == 'activity':
        objects = []
        for target in itertools.chain.from_iterable(
                props.get(field, []) for field in (
                    'follow-of', 'like', 'like-of', 'repost', 'repost-of',
                    'in-reply-to', 'invitee')):
            t = (json_to_object(target) if isinstance(target, dict)
                 else {'url': target})
            # eliminate duplicates from redundant backcompat properties
            if t not in objects:
                objects.append(t)
        obj.update({
            'object': objects[0] if len(objects) == 1 else objects,
            'actor': author,
        })
        if as_verb == 'tag':
            # for a tag activity the tagged post is the target and the tags
            # themselves become the object
            obj['target'] = {'url': prop['tag-of']}
            if obj.get('object'):
                raise NotImplementedError(
                    'Combined in-reply-to and tag-of is not yet supported.')
            obj['object'] = obj.pop('tags')
    else:
        obj.update({
            'inReplyTo': [{'url': url} for url in in_reply_tos],
            'author': author,
        })

    return source.Source.postprocess_object(obj)
def load_test(testname):
    """Loads a JSON authorship fixture and returns the author found in it.

    Args:
      testname: fixture name; tests/authorship/<testname>.json is read and
        '<testname>.html' is passed to mf2util.find_author() as the second
        argument

    Returns:
      the return value of mf2util.find_author()
    """
    # use a context manager so the fixture file is closed deterministically
    with open('tests/authorship/%s.json' % testname) as f:
        parsed = json.load(f)
    return mf2util.find_author(parsed, '%s.html' % testname)
def _interpret_common_properties(
    parsed,
    source_url,
    base_href,
    hentry,
    use_rel_syndication,
    want_json,
    fetch_mf2_func,
):
    """Collects the properties of an mf2 item into a flat result dict.

    Args:
      parsed: dict, the parsed document. Passed to util.find_author(); its
        rels -> syndication list is read when use_rel_syndication is true.
      source_url: passed to util.find_author() and used, with base_href, to
        make relative paths in the content HTML absolute
      base_href: see source_url
      hentry: dict with a 'properties' key, the item to interpret
      use_rel_syndication: if true, merge the document's rel-syndication links
        into the item's own syndication values (deduplicated, unordered)
      want_json: if true, date properties are kept as strings under their own
        name. Otherwise the string is stored under '<prop>-str' and the parsed
        datetime, when parsing succeeds, under '<prop>'.
      fetch_mf2_func: passed through to util.find_author()

    Returns:
      dict of interpreted properties
    """
    result = {}
    props = hentry["properties"]

    # NOTE: "featured" and "logo" are two separate properties; without the
    # comma between them Python concatenates the literals into "featuredlogo".
    for prop in ("url", "uid", "photo", "featured", "logo"):
        value = util.get_plain_text(props.get(prop))
        if value:
            result[prop] = value

    for prop in ("start", "end", "published", "updated", "deleted"):
        date_str = util.get_plain_text(props.get(prop))
        if date_str:
            if want_json:
                result[prop] = date_str
            else:
                result[prop + "-str"] = date_str
                try:
                    date = util.parse_datetime(date_str)
                    if date:
                        result[prop] = date
                except ValueError:
                    # logging.warn is a deprecated alias of logging.warning
                    util.logging.warning(
                        "Failed to parse datetime %s", date_str)

    author = util.find_author(parsed, source_url, hentry, fetch_mf2_func)
    if author:
        result["author"] = author

    content_prop = props.get("content")
    content_value = None
    if content_prop:
        if isinstance(content_prop[0], dict):
            content_html = content_prop[0].get("html", "").strip()
            content_value = content_prop[0].get("value", "").strip()
        else:
            # plain string content serves as both the HTML and the plain text
            content_value = content_html = content_prop[0]
        result["content"] = util.convert_relative_paths_to_absolute(
            source_url, base_href, content_html
        )
        result["content-plain"] = content_value

    summary_prop = props.get("summary")
    if summary_prop:
        if isinstance(summary_prop[0], dict):
            result["summary"] = summary_prop[0]["value"]
        else:
            result["summary"] = summary_prop[0]

    # Collect location objects, then follow this algorithm to consolidate
    # their properties:
    # https://indieweb.org/location#How_to_determine_the_location_of_a_microformat
    location_stack = [props]

    for prop in "location", "adr":
        vals = props.get(prop)
        if vals:
            if isinstance(vals[0], util.string_type):
                location_stack.append({"name": vals})
            else:
                location_stack.append(vals[0].get("properties", {}))

    geo = props.get("geo")
    if geo:
        if isinstance(geo[0], dict):
            location_stack.append(geo[0].get("properties", {}))
        else:
            if geo[0].startswith("geo:"):
                # a geo: URL. try to parse it.
                # https://tools.ietf.org/html/rfc5870
                parts = geo[0][len("geo:"):].split(";")[0].split(",")
                if len(parts) >= 2:
                    location_stack.append(
                        {
                            "latitude": [parts[0]],
                            "longitude": [parts[1]],
                            "altitude": [parts[2]] if len(parts) >= 3 else [],
                        }
                    )

    # later entries in the stack overwrite earlier ones for the same property.
    # the entry's own name is skipped: it is not a location name.
    for prop in util.LOCATION_PROPERTIES:
        for obj in location_stack:
            if obj and obj.get(prop) and not (obj == props and prop == "name"):
                result.setdefault("location", {})[prop] = obj[prop][0]

    if use_rel_syndication:
        result["syndication"] = list(
            set(
                parsed.get("rels", {}).get("syndication", [])
                + hentry["properties"].get("syndication", [])
            )
        )
    else:
        result["syndication"] = hentry["properties"].get("syndication", [])

    # TODO patch start
    checkin_prop = props.get("checkin")
    if checkin_prop:
        if isinstance(checkin_prop[0], dict):
            # keep the checkin's properties in their own variable: rebinding
            # `props` here would make the category lookup below read from the
            # checkin h-card instead of the entry.
            checkin_props = checkin_prop[0]["properties"]
            result["checkin"] = {"name": checkin_props["name"][0]}
            # NOTE(review): latitude/longitude land at the top level of result
            # rather than inside result["checkin"] - confirm this is intended.
            try:
                result.update(
                    {
                        "latitude": checkin_props["latitude"][0],
                        "longitude": checkin_props["longitude"][0],
                    }
                )
            except KeyError:
                # coordinates are optional on a checkin
                pass
        else:
            result["checkin"] = checkin_prop[0]

    categories = props.get("category")
    if categories:
        result["category"] = categories
    # TODO patch end

    return result
def json_to_object(mf2, actor=None, fetch_mf2=False):
    """Converts a single microformats2 JSON item to an ActivityStreams object.

    Supports h-entry, h-event, h-card, and other single item types. Does *not*
    yet support h-feed.

    Args:
      mf2: dict, decoded JSON microformats2 object
      actor: optional author AS actor object. usually comes from a rel="author"
        link. if mf2 has its own author, that will override this.
      fetch_mf2: boolean, whether to fetch additional pages via HTTP if
        necessary, e.g. to determine authorship:
        https://indieweb.org/authorship

    Returns:
      dict, ActivityStreams object. Empty dict if mf2 is empty or not a dict.

    Raises:
      NotImplementedError: if the item is a tag activity that also has another
        activity target, e.g. in-reply-to
    """
    if not mf2 or not isinstance(mf2, dict):
        return {}

    # shallow copy so that adding a missing 'properties' key doesn't touch the
    # caller's dict
    mf2 = copy.copy(mf2)
    props = mf2.setdefault('properties', {})
    prop = first_props(props)
    rsvp = prop.get('rsvp')

    # convert author
    mf2_author = prop.get('author')
    if mf2_author and isinstance(mf2_author, dict):
        author = json_to_object(mf2_author)
    else:
        # the author h-card may be on another page. run full authorship
        # algorithm: https://indieweb.org/authorship
        author = mf2util.find_author(
            {'items': [mf2]}, hentry=mf2,
            fetch_mf2_func=util.fetch_mf2 if fetch_mf2 else None)
        if author:
            author = {
                'objectType': 'person',
                'url': author.get('url'),
                'displayName': author.get('name'),
                'image': [{'url': author.get('photo')}],
            }

    if not author:
        author = actor

    mf2_types = mf2.get('type') or []
    if 'h-geo' in mf2_types or 'p-location' in mf2_types:
        mf2_type = 'location'
    elif 'tag-of' in props:
        # TODO: remove once this is in mf2util
        # https://github.com/kylewm/mf2util/issues/18
        mf2_type = 'tag'
    elif 'follow-of' in props:
        # ditto
        mf2_type = 'follow'
    else:
        # mf2 'photo' type is a note or article *with* a photo, but AS 'photo'
        # type *is* a photo. so, special case photo type to fall through to
        # underlying mf2 type without photo.
        # https://github.com/snarfed/bridgy/issues/702
        without_photo = copy.deepcopy(mf2)
        without_photo.get('properties', {}).pop('photo', None)
        mf2_type = mf2util.post_type_discovery(without_photo)

    as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
    if rsvp:
        as_verb = 'rsvp-%s' % rsvp

    # special case GitHub issues that are in-reply-to the repo or its issues URL
    in_reply_tos = get_string_urls(props.get('in-reply-to', []))
    for url in in_reply_tos:
        if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
            as_type = 'issue'

    def is_absolute(url):
        """Filter out relative and invalid URLs (mf2py gives absolute urls)."""
        return urllib.parse.urlparse(url).netloc

    urls = props.get('url') and get_string_urls(props.get('url'))

    # quotations: https://indieweb.org/quotation#How_to_markup
    attachments = [
        json_to_object(quote)
        for quote in mf2.get('children', []) + props.get('quotation-of', [])
        if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

    # audio and video
    #
    # the duration mf2 property is still emerging. examples in the wild use both
    # integer seconds and ISO 8601 durations.
    # https://indieweb.org/duration
    # https://en.wikipedia.org/wiki/ISO_8601#Durations
    duration = prop.get('duration') or prop.get('length')
    if duration:
        if util.is_int(duration):
            duration = int(duration)
        else:
            parsed_duration = util.parse_iso8601_duration(duration)
            if parsed_duration:
                duration = int(parsed_duration.total_seconds())
            else:
                logging.debug('Unknown format for length or duration %r', duration)
                duration = None

    stream = None
    size_bytes = size_to_bytes(prop.get('size'))
    for media_type in 'audio', 'video':
        atts = [{
            'objectType': media_type,
            'stream': {
                'url': url,
                # integer seconds: http://activitystrea.ms/specs/json/1.0/#media-link
                'duration': duration,
                # file size in bytes. nonstandard, not in AS1 or AS2
                'size': size_bytes,
            },
        } for url in get_string_urls(props.get(media_type, []))]
        attachments.extend(atts)
        if atts:
            # video is processed last, so its first stream wins over audio's
            stream = atts[0]['stream']

    obj = {
        'id': prop.get('uid'),
        'objectType': as_type,
        'verb': as_verb,
        'published': prop.get('published', ''),
        'updated': prop.get('updated', ''),
        'startTime': prop.get('start'),
        'endTime': prop.get('end'),
        'displayName': get_text(prop.get('name')),
        'summary': get_text(prop.get('summary')),
        'content': get_html(prop.get('content')),
        'url': urls[0] if urls else None,
        'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
        # image is special cased below, to handle alt
        'stream': [stream],
        'location': json_to_object(prop.get('location')),
        'replies': {
            'items': [json_to_object(c) for c in props.get('comment', [])],
        },
        'tags': [{'objectType': 'hashtag', 'displayName': cat}
                 if isinstance(cat, str)
                 else json_to_object(cat)
                 for cat in props.get('category', [])],
        'attachments': attachments,
    }

    # images, including alt text
    photo_urls = set()
    obj['image'] = []
    for photo in props.get('photo', []) + props.get('featured', []):
        url = photo
        alt = None
        if isinstance(photo, dict):
            # either a nested mf2 item (with 'properties') or a flat dict
            photo = photo.get('properties') or photo
            url = get_first(photo, 'value') or get_first(photo, 'url')
            alt = get_first(photo, 'alt')
        if url and url not in photo_urls and is_absolute(url):
            photo_urls.add(url)
            obj['image'].append({'url': url, 'displayName': alt})

    # mf2util uses the indieweb/mf2 location algorithm to collect location
    # properties.
    interpreted = mf2util.interpret({'items': [mf2]}, None)
    if interpreted:
        loc = interpreted.get('location')
        if loc:
            obj['location']['objectType'] = 'place'
            lat, lng = loc.get('latitude'), loc.get('longitude')
            if lat and lng:
                try:
                    obj['location'].update({
                        'latitude': float(lat),
                        'longitude': float(lng),
                    })
                except ValueError:
                    logging.debug(
                        'Could not convert latitude/longitude (%s, %s) to decimal',
                        lat, lng)

    if as_type == 'activity':
        objects = []
        for target in itertools.chain.from_iterable(
                props.get(field, []) for field in (
                    'follow-of', 'like', 'like-of', 'repost', 'repost-of',
                    'in-reply-to', 'invitee')):
            t = (json_to_object(target) if isinstance(target, dict)
                 else {'url': target})
            # eliminate duplicates from redundant backcompat properties
            if t not in objects:
                objects.append(t)
        obj.update({
            'object': objects[0] if len(objects) == 1 else objects,
            'actor': author,
        })
        if as_verb == 'tag':
            # for a tag activity the tagged post is the target and the tags
            # themselves become the object
            obj['target'] = {'url': prop['tag-of']}
            if obj.get('object'):
                raise NotImplementedError(
                    'Combined in-reply-to and tag-of is not yet supported.')
            obj['object'] = obj.pop('tags')
    else:
        obj.update({
            'inReplyTo': [{'url': url} for url in in_reply_tos],
            'author': author,
        })

    return source.Source.postprocess_object(obj)