def _to_object(cls, input, repo_id=False):
    """Starts to convert a GitHub GraphQL or REST API object to ActivityStreams.

    Only fills in the fields read below (id, url, author, title, content,
    published, updated).

    Args:
      input: dict, decoded JSON GraphQL or REST object
      repo_id: boolean, whether to inject repo owner and name into id

    Returns:
      an ActivityStreams object dict, ready to be JSON-encoded. Empty if input
      is empty.
    """
    if not input:
        return {}

    obj_id = input.get('node_id') or input.get('id')
    number = input.get('number')
    url = input.get('html_url') or input.get('url') or ''

    if repo_id and obj_id and url:
        # inject repo owner and name, taken from the first two URL path segments
        path = urllib.parse.urlparse(url).path.strip('/').split('/')
        # a URL with fewer than two path segments has no owner/repo pair to
        # inject; keep the plain id instead of failing on the unpacking below.
        if len(path) >= 2:
            owner, repo = path[:2]
            # join with : because github allows ., _, and - in repo names. (see
            # REPO_NAME_RE.)
            obj_id = ':'.join((owner, repo, str(number or obj_id)))

    return {
        'id': cls.tag_uri(obj_id),
        'url': url,
        'author': cls.user_to_actor(input.get('author') or input.get('user')),
        'title': input.get('title'),
        # normalize Windows line endings
        'content': (input.get('body') or '').replace('\r\n', '\n'),
        # GraphQL uses camelCase field names, REST uses snake_case
        'published': util.maybe_iso8601_to_rfc3339(
            input.get('createdAt') or input.get('created_at')),
        'updated': util.maybe_iso8601_to_rfc3339(
            input.get('lastEditedAt') or input.get('updated_at')),
    }
def user_to_actor(self, user):
    """Builds an ActivityStreams actor from a Facebook user or page.

    Args:
      user: dict, a decoded JSON Facebook user or page

    Returns:
      an ActivityStreams actor dict, ready to be JSON-encoded. Empty if the
      user is empty or has neither a username nor an id.
    """
    if not user:
        return {}

    fb_id = user.get('id')
    username = user.get('username')
    handle = username or fb_id
    if not handle:
        return {}

    # FB only returns the type field if you fetch the object with ?metadata=1
    # https://developers.facebook.com/docs/graph-api/using-graph-api/v2.2#introspection
    object_type = 'page' if user.get('type') == 'page' else 'person'

    actor = {
        'objectType': object_type,
        'displayName': user.get('name') or username,
        'id': self.tag_uri(handle),
        'updated': util.maybe_iso8601_to_rfc3339(user.get('updated_time')),
        'username': username,
        'description': user.get('bio') or user.get('description'),
        'summary': user.get('about'),
    }

    if util.is_int(fb_id):
        # numeric_id is our own custom field that always has the source's
        # numeric user id, if available.
        actor['numeric_id'] = fb_id
        # facebook implements this as a 302 redirect
        actor['image'] = {
            'url': 'https://graph.facebook.com/v2.2/%s/picture?type=large' % fb_id,
        }

    # extract web site links. extract_links uniquifies and preserves order.
    # prefer the website field, then the link field, then the profile URL.
    links = (util.extract_links(user.get('website'))
             or util.extract_links(user.get('link'))
             or [self.user_url(handle)])
    actor['url'] = links[0]
    if len(links) > 1:
        actor['urls'] = [{'value': link} for link in links]

    place = user.get('location')
    if place:
        actor['location'] = {
            'id': place.get('id'),
            'displayName': place.get('name'),
        }

    return util.trim_nulls(actor)
def user_to_actor(self, user):
    """Converts a user to an actor.

    Args:
      user: dict, a decoded JSON Facebook user

    Returns:
      an ActivityStreams actor dict, ready to be JSON-encoded. Empty if the
      user is empty or has neither a username nor an id.
    """
    if not user:
        return {}

    fb_id = user.get('id')
    username = user.get('username')
    handle = username or fb_id
    if not handle:
        return {}

    # %-format instead of concatenating so that a non-string id (e.g. an int)
    # doesn't raise TypeError; this matches how image_url is built below.
    url = (user.get('website') or user.get('link') or
           'http://facebook.com/%s' % handle)

    # facebook implements this as a 302 redirect
    image_url = 'http://graph.facebook.com/%s/picture?type=large' % handle

    actor = {
        'displayName': user.get('name'),
        'image': {'url': image_url},
        'id': self.tag_uri(handle),
        # numeric_id is our own custom field that always has the source's
        # numeric user id, if available.
        'numeric_id': fb_id,
        'updated': util.maybe_iso8601_to_rfc3339(user.get('updated_time')),
        'url': url,
        'username': username,
        'description': user.get('bio'),
    }

    location = user.get('location')
    if location:
        actor['location'] = {
            'id': location.get('id'),
            'displayName': location.get('name'),
        }

    return util.trim_nulls(actor)
def _prepare_activity(a, reader=True):
    """Preprocesses an activity to prepare it to be rendered as Atom.

    Modifies a in place: sets a['title'], and on a['object'] sets
    rendered_content, rendered_children and normalized published/updated
    timestamps.

    Args:
      a: ActivityStreams 1 activity dict
      reader: boolean, whether the output will be rendered in a feed reader.
        Currently just includes location if True, not otherwise.
    """
    act_type = source.object_type(a)
    if not act_type or act_type == 'post':
        primary = a.get('object', {})
    else:
        primary = a
    obj = a.setdefault('object', {})

    # Render content as HTML; escape &s
    obj['rendered_content'] = _encode_ampersands(microformats2.render_content(
        primary, include_location=reader, render_attachments=True))

    # Make sure every activity has the title field, since Atom <entry> requires
    # the title element.
    if not a.get('title'):
        a['title'] = util.ellipsize(_encode_ampersands(
            a.get('displayName') or a.get('content') or obj.get('title') or
            obj.get('displayName') or obj.get('content') or 'Untitled'))

    # strip HTML tags. the Atom spec says title is plain text:
    # http://atomenabled.org/developers/syndication/#requiredEntryElements
    a['title'] = xml.sax.saxutils.escape(
        BeautifulSoup(a['title']).get_text(''))

    children = []
    image_urls_seen = set()
    image_atts = []

    # normalize attachments, render attached notes/articles
    attachments = a.get('attachments') or obj.get('attachments') or []
    for att in attachments:
        att['stream'] = util.get_first(att, 'stream')
        att_type = att.get('objectType')

        if att_type == 'image':
            # image attachments are rendered later, together with obj's images
            image_atts.append(util.get_first(att, 'image'))
            continue

        image_urls_seen.update(util.get_urls(att, 'image'))
        if att_type in ('note', 'article'):
            html = microformats2.render_content(
                att, include_location=reader, render_attachments=True)
            author = att.get('author')
            if author:
                # fall back to an empty dict, not a list, when the converted
                # author has no properties
                name = microformats2.maybe_linked_name(
                    microformats2.object_to_json(author).get('properties') or {})
                html = '%s: %s' % (name.strip(), html)
            children.append(html)

    # render image(s) that we haven't already seen
    for image in image_atts + util.get_list(obj, 'image'):
        if not image:
            continue
        url = image.get('url')
        # check the URL before parsing it: a missing URL has nothing to render,
        # and an already seen one doesn't need the regex built at all.
        if not url or url in image_urls_seen:
            continue

        # match the URL in an existing src attribute with or without its
        # scheme and host
        parsed = urllib.parse.urlparse(url)
        rest = urllib.parse.urlunparse(('', '') + parsed[2:])
        img_src_re = re.compile(
            r"""src *= *['"] *((https?:)?//%s)?%s *['"]""" %
            (re.escape(parsed.netloc), re.escape(rest)))
        if not img_src_re.search(obj['rendered_content']):
            children.append(microformats2.img(url))
            image_urls_seen.add(url)

    obj['rendered_children'] = [_encode_ampersands(child) for child in children]

    # make sure published and updated are strict RFC 3339 timestamps
    for prop in 'published', 'updated':
        val = obj.get(prop)
        if val:
            obj[prop] = util.maybe_iso8601_to_rfc3339(val)
            # Atom timestamps are even stricter than RFC 3339: they can't be
            # naive ie time zone unaware. They must have either an offset or
            # the Z suffix.
            # https://www.feedvalidator.org/docs/error/InvalidRFC3339Date.html
            if not util.TIMEZONE_OFFSET_RE.search(obj[prop]):
                obj[prop] += 'Z'
def post_to_object(self, post, remove_id_prefix=False):
    """Converts a post to an object.

    Args:
      post: dict, a decoded JSON post
      remove_id_prefix: boolean, not used by this implementation

    Returns:
      an ActivityStreams object dict, ready to be JSON-encoded. Empty if the
      post has no id.
    """
    fb_id = post.get('id')
    if not fb_id:
        return {}

    post_type = post.get('type')
    status_type = post.get('status_type')
    url = self.post_url(post)
    picture = post.get('picture')
    display_name = None

    message = (post.get('message') or post.get('story') or
               post.get('description') or post.get('name'))

    # an embedded object or song, if present, overrides the post's own
    # id, type, url and title
    data = post.get('data', {})
    for field in ('object', 'song'):
        embedded = data.get(field)
        if embedded:
            fb_id = embedded.get('id')
            post_type = embedded.get('type')
            url = embedded.get('url')
            display_name = embedded.get('title')

    object_type = OBJECT_TYPES.get(post_type)
    author = self.user_to_actor(post.get('from'))
    # `or ''` also covers an explicit null link, which would otherwise fail
    # on .startswith()
    link = post.get('link') or ''
    gift = link.startswith('/gifts/')

    if gift:
        object_type = 'product'
    if not object_type:
        object_type = 'image' if picture and not message else 'note'

    obj = {
        # FB post ids are of the form USERID_POSTID
        'id': self.tag_uri(str(fb_id)),
        'objectType': object_type,
        'published': util.maybe_iso8601_to_rfc3339(post.get('created_time')),
        'updated': util.maybe_iso8601_to_rfc3339(post.get('updated_time')),
        'author': author,
        'content': message,
        'url': url,
        'image': {'url': picture},
        'displayName': display_name,
        'fb_object_id': post.get('object_id'),
    }

    privacy = post.get('privacy', {})
    if isinstance(privacy, dict):
        privacy = privacy.get('value')
    if privacy is not None:
        # privacy value '' means it doesn't have an explicit audience set, so i
        # *think* it inherits from its parent. TODO: use that value as opposed
        # to defaulting to public.
        public = privacy.lower() in ('', 'everyone', 'open')
        obj['to'] = [{'objectType': 'group',
                      'alias': '@public' if public else '@private'}]

    # tags and likes
    tags = itertools.chain(post.get('to', {}).get('data', []),
                           post.get('with_tags', {}).get('data', []),
                           *post.get('message_tags', {}).values())
    obj['tags'] = [self.postprocess_object({
        'objectType': OBJECT_TYPES.get(t.get('type'), 'person'),
        'id': self.tag_uri(t.get('id')),
        'url': self.object_url(t.get('id')),
        'displayName': t.get('name'),
        'startIndex': t.get('offset'),
        'length': t.get('length'),
    }) for t in tags]

    obj['tags'] += [self.postprocess_object({
        'id': self.tag_uri('%s_liked_by_%s' % (fb_id, like.get('id'))),
        'url': url,
        'objectType': 'activity',
        'verb': 'like',
        'object': {'url': url},
        'author': self.user_to_actor(like),
        'content': 'likes this.',
    }) for like in post.get('likes', {}).get('data', [])]

    # "See Original" links
    post_actions = post.get('actions', [])
    see_orig_actions = (act for act in post_actions
                        if act.get('name', '').lower() in SEE_ORIGINAL_ACTIONS)
    obj['tags'] += [self.postprocess_object({
        'objectType': 'article',
        'url': act.get('link'),
        'displayName': act.get('name'),
    }) for act in see_orig_actions]

    # is there an attachment? prefer to represent it as a picture (ie image
    # object), but if not, fall back to a link.
    att = {
        'url': link if link else url,
        'image': {'url': picture},
        'displayName': post.get('name'),
        'summary': post.get('caption'),
        'content': post.get('description'),
    }

    if (picture and picture.endswith('_s.jpg') and
            (post_type == 'photo' or status_type == 'added_photos')):
        # a picture the user posted. get a larger size.
        att.update({
            'objectType': 'image',
            'image': {'url': picture[:-6] + '_o.jpg'},
        })
        obj['attachments'] = [att]
    elif link and not gift:
        att['objectType'] = 'article'
        obj['attachments'] = [att]

    # location
    place = post.get('place')
    if place:
        # kept in its own variable so the post id above isn't overwritten
        place_id = place.get('id')
        obj['location'] = {
            'displayName': place.get('name'),
            'id': place_id,
            'url': self.object_url(place_id),
        }
        location = place.get('location', None)
        if isinstance(location, dict):
            lat = location.get('latitude')
            lon = location.get('longitude')
            # compare against None: 0 is a valid latitude or longitude
            if lat is not None and lon is not None:
                obj['location'].update({
                    'latitude': lat,
                    'longitude': lon,
                    # ISO 6709 location string. details:
                    # http://en.wikipedia.org/wiki/ISO_6709
                    'position': '%+f%+f/' % (lat, lon),
                })
    elif 'location' in post:
        obj['location'] = {'displayName': post['location']}

    # comments go in the replies field, according to the "Responses for
    # Activity Streams" extension spec:
    # http://activitystrea.ms/specs/json/replies/1.0/
    comments = post.get('comments', {}).get('data')
    if comments:
        items = [self.comment_to_object(c) for c in comments]
        obj['replies'] = {
            'items': items,
            'totalItems': len(items),
        }

    return self.postprocess_object(obj)
def _prepare_activity(a, reader=True):
    """Preprocesses an activity to prepare it to be rendered as Atom.

    Modifies a in place: sets a['title'], normalizes actor and attachment
    images, and on the activity's object sets rendered_content,
    rendered_children and normalized published/updated timestamps.

    Args:
      a: ActivityStreams 1 activity dict
      reader: boolean, whether the output will be rendered in a feed reader.
        Currently just includes location if True, not otherwise.
    """
    act_type = source.object_type(a)
    # NOTE(review): when a has no 'object', obj is the default dict passed
    # here, which presumably isn't stored back into a - confirm that callers
    # always provide an object, or that this is intended.
    obj = util.get_first(a, 'object', default={})
    primary = obj if (not act_type or act_type == 'post') else a

    # Render content as HTML; escape &s
    obj['rendered_content'] = _encode_ampersands(microformats2.render_content(
        primary, include_location=reader, render_attachments=True))

    # Make sure every activity has the title field, since Atom <entry> requires
    # the title element.
    if not a.get('title'):
        a['title'] = util.ellipsize(_encode_ampersands(
            a.get('displayName') or a.get('content') or obj.get('title') or
            obj.get('displayName') or obj.get('content') or 'Untitled'))

    # strip HTML tags. the Atom spec says title is plain text:
    # http://atomenabled.org/developers/syndication/#requiredEntryElements
    a['title'] = xml.sax.saxutils.escape(
        BeautifulSoup(a['title']).get_text(''))

    children = []
    image_urls_seen = set()
    image_atts = []

    # normalize actor images
    for elem in a, obj:
        actor = elem.get('actor')
        if actor:
            actor['image'] = util.get_first(actor, 'image')

    # normalize attachments, render attached notes/articles
    attachments = a.get('attachments') or obj.get('attachments') or []
    for att in attachments:
        att['stream'] = util.get_first(att, 'stream')
        att_type = att.get('objectType')

        if att_type == 'image':
            # image attachments are rendered later, together with obj's images
            att['image'] = util.get_first(att, 'image')
            image_atts.append(att['image'])
            continue

        image_urls_seen.update(util.get_urls(att, 'image'))
        if att_type in ('note', 'article'):
            html = microformats2.render_content(
                att, include_location=reader, render_attachments=True)
            author = att.get('author')
            if author:
                name = microformats2.maybe_linked_name(
                    microformats2.object_to_json(author).get('properties') or {})
                html = '%s: %s' % (name.strip(), html)
            children.append(html)

    # render image(s) that we haven't already seen
    for image in image_atts + util.get_list(obj, 'image'):
        if not image:
            continue
        url = image.get('url')
        # check the URL before parsing it: a missing URL has nothing to render,
        # and an already seen one doesn't need the regex built at all.
        if not url or url in image_urls_seen:
            continue

        # match the URL in an existing src attribute with or without its
        # scheme and host
        parsed = urllib.parse.urlparse(url)
        rest = urllib.parse.urlunparse(('', '') + parsed[2:])
        img_src_re = re.compile(
            r"""src *= *['"] *((https?:)?//%s)?%s *['"]""" %
            (re.escape(parsed.netloc), re.escape(rest)))
        if not img_src_re.search(obj['rendered_content']):
            children.append(microformats2.img(url))
            image_urls_seen.add(url)

    obj['rendered_children'] = [_encode_ampersands(child) for child in children]

    # make sure published and updated are strict RFC 3339 timestamps
    for prop in 'published', 'updated':
        val = obj.get(prop)
        if val:
            obj[prop] = util.maybe_iso8601_to_rfc3339(val)
            # Atom timestamps are even stricter than RFC 3339: they can't be
            # naive ie time zone unaware. They must have either an offset or
            # the Z suffix.
            # https://www.feedvalidator.org/docs/error/InvalidRFC3339Date.html
            if not util.TIMEZONE_OFFSET_RE.search(obj[prop]):
                obj[prop] += 'Z'
def post_to_object(self, post):
    """Converts a post to an object.

    The post's message is HTML-escaped into the content field, and the
    startIndex of each tag is moved to match the escaped text.

    Args:
      post: dict, a decoded JSON post

    Returns:
      an ActivityStreams object dict, ready to be JSON-encoded. Empty if the
      post has no id.
    """
    # local import: html.escape is only needed for the message escaping below
    import html

    # FB post ids are of the form USERID_POSTID; strip any USERID_ prefix.
    # `or ''` also covers an explicit null id.
    fb_id = str(post.get('id') or '').split('_', 1)[-1]
    if not fb_id:
        return {}

    post_type = post.get('type')
    status_type = post.get('status_type')
    url = self.post_url(post)
    display_name = None

    message = (post.get('message') or post.get('story') or
               post.get('description') or post.get('name'))

    picture = post.get('picture')
    if isinstance(picture, dict):
        picture = picture.get('data', {}).get('url')

    # an embedded object or song, if present, overrides the post's own
    # id, type, url and title
    data = post.get('data', {})
    for field in ('object', 'song'):
        embedded = data.get(field)
        if embedded:
            fb_id = embedded.get('id')
            post_type = embedded.get('type')
            url = embedded.get('url')
            display_name = embedded.get('title')

    object_type = OBJECT_TYPES.get(post_type)
    author = self.user_to_actor(post.get('from'))
    link = post.get('link', '')
    gift = link.startswith('/gifts/')
    if link.startswith('/'):
        # make relative links absolute
        link = 'https://www.facebook.com' + link

    if gift:
        object_type = 'product'
    if not object_type:
        object_type = 'image' if picture and not message else 'note'

    obj = {
        'id': self.tag_uri(str(fb_id)),
        'objectType': object_type,
        'published': util.maybe_iso8601_to_rfc3339(post.get('created_time')),
        'updated': util.maybe_iso8601_to_rfc3339(post.get('updated_time')),
        'author': author,
        'url': url,
        'image': {'url': picture},
        'displayName': display_name,
        'fb_object_id': post.get('object_id'),
    }

    privacy = post.get('privacy', {})
    if isinstance(privacy, dict):
        privacy = privacy.get('value')
    if privacy is not None:
        # privacy value '' means it doesn't have an explicit audience set, so i
        # *think* it inherits from its parent. TODO: use that value as opposed
        # to defaulting to public.
        public = privacy.lower() in ('', 'everyone', 'open')
        obj['to'] = [{'objectType': 'group',
                      'alias': '@public' if public else '@private'}]

    # tags and likes
    tags = itertools.chain(post.get('to', {}).get('data', []),
                           post.get('with_tags', {}).get('data', []),
                           *post.get('message_tags', {}).values())
    obj['tags'] = [self.postprocess_object({
        'objectType': OBJECT_TYPES.get(t.get('type'), 'person'),
        'id': self.tag_uri(t.get('id')),
        'url': self.object_url(t.get('id')),
        'displayName': t.get('name'),
        'startIndex': t.get('offset'),
        'length': t.get('length'),
    }) for t in tags]

    obj['tags'] += [self.postprocess_object({
        'id': self.tag_uri('%s_liked_by_%s' % (fb_id, like.get('id'))),
        'url': url,
        'objectType': 'activity',
        'verb': 'like',
        'object': {'url': url},
        'author': self.user_to_actor(like),
    }) for like in post.get('likes', {}).get('data', [])]

    # Escape HTML characters: <, >, and the ampersand. This is done one
    # character at a time, instead of escaping the whole string at once, so
    # that each tag's startIndex can be moved by however much the text before
    # it has grown.
    if message:
        # tags at offset 0 (or without an offset) never move, so skip them
        pending = sorted((t for t in obj['tags'] if t.get('startIndex')),
                         key=lambda t: t['startIndex'])
        pieces = []
        growth = 0    # number of characters that escaping has added so far
        next_tag = 0  # index into pending of the first tag not yet moved
        for i, char in enumerate(message):
            # a loop, not a single check, so that several tags starting at the
            # same offset are all handled
            while (next_tag < len(pending) and
                   pending[next_tag]['startIndex'] <= i):
                pending[next_tag]['startIndex'] += growth
                next_tag += 1
            if char in '<>&':
                char = html.escape(char, quote=False)
                growth += len(char) - 1
            pieces.append(char)
        # tags that start at or past the end of the message still move by the
        # total growth; bad offsets no longer abort the whole conversion.
        for tag in pending[next_tag:]:
            tag['startIndex'] += growth
        obj['content'] = ''.join(pieces)

    # "See Original" links
    post_actions = post.get('actions', [])
    see_orig_actions = (act for act in post_actions
                        if act.get('name', '').lower() in SEE_ORIGINAL_ACTIONS)
    obj['tags'] += [self.postprocess_object({
        'objectType': 'article',
        'url': act.get('link'),
        'displayName': act.get('name'),
    }) for act in see_orig_actions]

    # is there an attachment? prefer to represent it as a picture (ie image
    # object), but if not, fall back to a link.
    att = {
        'url': link if link else url,
        'image': {'url': picture},
        'displayName': post.get('name'),
        'summary': post.get('caption'),
        'content': post.get('description'),
    }

    if (picture and picture.endswith('_s.jpg') and
            (post_type == 'photo' or status_type == 'added_photos')):
        # a picture the user posted. get a larger size.
        att.update({
            'objectType': 'image',
            'image': {'url': picture[:-6] + '_o.jpg'},
        })
        obj['attachments'] = [att]
    elif link and not gift:
        att['objectType'] = 'article'
        obj['attachments'] = [att]

    # location
    place = post.get('place')
    if place:
        place_id = place.get('id')
        obj['location'] = {
            'displayName': place.get('name'),
            'id': place_id,
            'url': self.object_url(place_id),
        }
        location = place.get('location', None)
        if isinstance(location, dict):
            lat = location.get('latitude')
            lon = location.get('longitude')
            # compare against None: 0 is a valid latitude or longitude
            if lat is not None and lon is not None:
                obj['location'].update({'latitude': lat, 'longitude': lon})
    elif 'location' in post:
        obj['location'] = {'displayName': post['location']}

    # comments go in the replies field, according to the "Responses for
    # Activity Streams" extension spec:
    # http://activitystrea.ms/specs/json/replies/1.0/
    comments = post.get('comments', {}).get('data')
    if comments:
        items = [self.comment_to_object(c, post_id=post['id'])
                 for c in comments]
        obj['replies'] = {
            'items': items,
            'totalItems': len(items),
        }

    return self.postprocess_object(obj)
def post_to_object(self, post, _type='post'):
    """Converts a post to an object.

    The post's message is HTML-escaped into the content field, and the
    startIndex of each tag is moved to match the escaped text.

    Args:
      post: dict, a decoded JSON post
      _type: either 'post' or 'comment'

    Returns:
      an ActivityStreams object dict, ready to be JSON-encoded. Empty if the
      post's id can't be parsed.

    Raises:
      AssertionError: if _type is neither 'post' nor 'comment'
    """
    # local import: html.escape is only needed for the message escaping below
    import html

    assert _type in ('post', 'comment')

    fb_id = post.get('id')
    post_type = post.get('type')
    status_type = post.get('status_type')
    url = self.post_url(post)
    display_name = None

    message = (post.get('message') or post.get('story') or
               post.get('description') or post.get('name'))

    picture = post.get('picture')
    if isinstance(picture, dict):
        picture = picture.get('data', {}).get('url')

    # an embedded object or song, if present, overrides the post's own
    # id, type, url and title
    data = post.get('data', {})
    for field in ('object', 'song'):
        embedded = data.get(field)
        if embedded:
            fb_id = embedded.get('id')
            post_type = embedded.get('type')
            url = embedded.get('url')
            display_name = embedded.get('title')

    # status_type takes precedence over type
    object_type = OBJECT_TYPES.get(status_type) or OBJECT_TYPES.get(post_type)
    author = self.user_to_actor(post.get('from'))
    link = post.get('link', '')
    gift = link.startswith('/gifts/')
    if link.startswith('/'):
        # make relative links absolute
        link = 'https://www.facebook.com' + link

    if gift:
        object_type = 'product'
    if not object_type:
        object_type = 'image' if picture and not message else 'note'

    # FB post ids are of the form USERID_POSTID
    parsed_id = self.parse_id(fb_id, _type)
    if not parsed_id:
        return {}

    obj = {
        'id': self.tag_uri(parsed_id.post),
        'fb_id': fb_id,
        'objectType': object_type,
        'published': util.maybe_iso8601_to_rfc3339(post.get('created_time')),
        'updated': util.maybe_iso8601_to_rfc3339(post.get('updated_time')),
        'author': author,
        'url': url,
        'image': {'url': picture},
        'displayName': display_name,
        'fb_object_id': post.get('object_id'),
    }

    privacy = post.get('privacy', {})
    if isinstance(privacy, dict):
        privacy = privacy.get('value')
    if privacy is not None:
        # privacy value '' means it doesn't have an explicit audience set, so i
        # *think* it inherits from its parent. TODO: use that value as opposed
        # to defaulting to public.
        public = privacy.lower() in ('', 'everyone', 'open')
        obj['to'] = [{'objectType': 'group',
                      'alias': '@public' if public else '@private'}]

    # message_tags is a dict in most post types, but a list in some other
    # object types, e.g. comments.
    message_tags = post.get('message_tags', [])
    if isinstance(message_tags, dict):
        # flatten the dict's values into one list
        message_tags = list(
            itertools.chain.from_iterable(message_tags.values()))
    elif not isinstance(message_tags, list):
        message_tags = list(message_tags)  # fingers crossed! :P

    # tags and likes
    tags = itertools.chain(post.get('to', {}).get('data', []),
                           post.get('with_tags', {}).get('data', []),
                           message_tags)
    obj['tags'] = [self.postprocess_object({
        'objectType': OBJECT_TYPES.get(t.get('type'), 'person'),
        'id': self.tag_uri(t.get('id')),
        'url': self.object_url(t.get('id')),
        'displayName': t.get('name'),
        'startIndex': t.get('offset'),
        'length': t.get('length'),
    }) for t in tags]

    obj['tags'] += [self.postprocess_object({
        'id': '%s_liked_by_%s' % (obj['id'], like.get('id')),
        'url': url,
        'objectType': 'activity',
        'verb': 'like',
        'object': {'url': url},
        'author': self.user_to_actor(like),
    }) for like in post.get('likes', {}).get('data', [])]

    # Escape HTML characters: <, >, and the ampersand. This is done one
    # character at a time, instead of escaping the whole string at once, so
    # that each tag's startIndex can be moved by however much the text before
    # it has grown.
    if message:
        # tags at offset 0 (or without an offset) never move, so skip them
        pending = sorted((t for t in obj['tags'] if t.get('startIndex')),
                         key=lambda t: t['startIndex'])
        pieces = []
        growth = 0    # number of characters that escaping has added so far
        next_tag = 0  # index into pending of the first tag not yet moved
        for i, char in enumerate(message):
            # a loop, not a single check, so that several tags starting at the
            # same offset are all handled
            while (next_tag < len(pending) and
                   pending[next_tag]['startIndex'] <= i):
                pending[next_tag]['startIndex'] += growth
                next_tag += 1
            if char in '<>&':
                char = html.escape(char, quote=False)
                growth += len(char) - 1
            pieces.append(char)
        # tags that start at or past the end of the message still move by the
        # total growth; bad offsets no longer abort the whole conversion.
        for tag in pending[next_tag:]:
            tag['startIndex'] += growth
        obj['content'] = ''.join(pieces)

    # "See Original" links. Built as a list, not with filter(): a filter
    # object is always truthy, so the emptiness check below would never fail.
    see_orig_links = [
        act.get('link') for act in post.get('actions', [])
        if act.get('link') and
        act.get('name', '').lower() in SEE_ORIGINAL_ACTIONS]
    if see_orig_links:
        obj.setdefault('upstreamDuplicates', []).extend(see_orig_links)

    # is there an attachment? prefer to represent it as a picture (ie image
    # object), but if not, fall back to a link.
    att = {
        'url': link if link else url,
        'image': {'url': picture},
        'displayName': post.get('name'),
        'summary': post.get('caption'),
        'content': post.get('description'),
    }

    if (picture and picture.endswith('_s.jpg') and
            (post_type == 'photo' or status_type == 'added_photos')):
        # a picture the user posted. get a larger size.
        att.update({
            'objectType': 'image',
            'image': {'url': picture[:-6] + '_o.jpg'},
        })
        obj['attachments'] = [att]
    elif link and not gift:
        att['objectType'] = 'article'
        obj['attachments'] = [att]

    # location
    place = post.get('place')
    if place:
        place_id = place.get('id')
        obj['location'] = {
            'displayName': place.get('name'),
            'id': place_id,
            'url': self.object_url(place_id),
        }
        location = place.get('location', None)
        if isinstance(location, dict):
            lat = location.get('latitude')
            lon = location.get('longitude')
            # compare against None: 0 is a valid latitude or longitude
            if lat is not None and lon is not None:
                obj['location'].update({'latitude': lat, 'longitude': lon})
    elif 'location' in post:
        obj['location'] = {'displayName': post['location']}

    # comments go in the replies field, according to the "Responses for
    # Activity Streams" extension spec:
    # http://activitystrea.ms/specs/json/replies/1.0/
    comments = post.get('comments', {}).get('data')
    if comments:
        items = util.trim_nulls(
            [self.comment_to_object(c, post_id=post['id']) for c in comments])
        obj['replies'] = {
            'items': items,
            'totalItems': len(items),
        }

    return self.postprocess_object(obj)