def get_as2(url):
  """Tries to fetch the given URL as ActivityStreams 2.

  Uses HTTP content negotiation via the Accept request header. If the url is
  HTML and it has a rel-alternate link with an AS2 content type, fetches and
  returns that URL instead.

  Args:
    url: string

  Returns:
    :class:`requests.Response`

  Raises:
    :class:`requests.HTTPError`, :class:`werkzeug.exceptions.HTTPException`

    If we raise a werkzeug HTTPException, it will have an additional
    requests_response attribute with the last requests.Response we received.
  """
  def _error(resp):
    msg = "Couldn't fetch %s as ActivityStreams 2" % url
    logging.warning(msg)
    err = BadGateway(msg)
    err.requests_response = resp
    raise err

  resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
  if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
    return resp

  # response is HTML; look for a rel-alternate link to an AS2 representation
  parsed = util.parse_html(resp)
  as2 = parsed.find('link', rel=('alternate', 'self'),
                    type=(CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
  if not (as2 and as2['href']):
    _error(resp)

  resp = requests_get(urllib.parse.urljoin(resp.url, as2['href']),
                      headers=CONNEG_HEADERS_AS2)
  if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
    return resp

  _error(resp)
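# A minimal usage sketch (the URL is hypothetical). Content negotiation either
# returns the AS2 response directly or follows the page's rel-alternate AS2
# link; on failure a BadGateway is raised, as documented above:
#
#   try:
#     resp = get_as2('https://example.com/users/alice')
#     actor = resp.json()  # AS2 JSON dict
#   except werkzeug.exceptions.BadGateway as e:
#     # e.requests_response is the last requests.Response we received
#     ...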
def _try_salmon(self, resp):
  """Attempts OStatus (Salmon) delivery of the source object to the target.

  Args:
    resp: Response

  Returns:
    True if the Salmon slap was sent.
  """
  # fetch target HTML page, extract Atom rel-alternate link
  target = resp.target()
  if not self.target_resp:
    self.target_resp = common.requests_get(target)

  parsed = util.parse_html(self.target_resp)
  atom_url = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
  if not atom_url or not atom_url.get('href'):
    self.error('Target post %s has no Atom link' % resp.target(), status=400)

  # fetch Atom target post, extract its id and inject it into the source object
  base_url = ''
  base = parsed.find('base')
  if base and base.get('href'):
    base_url = base['href']
  atom_link = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
  atom_url = urllib.parse.urljoin(
    resp.target(), urllib.parse.urljoin(base_url, atom_link['href']))

  feed = common.requests_get(atom_url).text
  parsed = feedparser.parse(feed)
  logging.info('Parsed: %s', json_dumps(parsed, indent=2))
  entry = parsed.entries[0]
  target_id = entry.id

  in_reply_to = self.source_obj.get('inReplyTo')
  source_obj_obj = self.source_obj.get('object')
  if in_reply_to:
    for elem in in_reply_to:
      if elem.get('url') == target:
        elem['id'] = target_id
  elif isinstance(source_obj_obj, dict):
    source_obj_obj['id'] = target_id

  # Mastodon (and maybe others?) requires a rel-mentioned link to the original
  # post's author to make it show up as a reply:
  #   app/services/process_interaction_service.rb
  # ...so add the author as a tag, which the Atom template renders as a
  # rel-mentioned link.
  authors = entry.get('authors', None)
  if authors:
    url = entry.authors[0].get('href')
    if url:
      self.source_obj.setdefault('tags', []).append({'url': url})

  # discover the Salmon endpoint, first from the Atom feed itself
  logging.info('Discovering Salmon endpoint in %s', atom_url)
  endpoint = django_salmon.discover_salmon_endpoint(feed)

  if not endpoint:
    # fall back to WebFinger
    parsed = urllib.parse.urlparse(resp.target())
    # TODO: test missing email
    author = entry.get('author_detail', {})
    email = author.get('email') or '@'.join(
      (author.get('name', ''), parsed.netloc))
    try:
      # TODO: always https?
      profile = common.requests_get(
        '%s://%s/.well-known/webfinger?resource=acct:%s' % (
          parsed.scheme, parsed.netloc, email), verify=False)
      endpoint = django_salmon.get_salmon_replies_link(profile.json())
    except requests.HTTPError:
      pass

  if not endpoint:
    self.error('No salmon endpoint found!', status=400)
  logging.info('Discovered Salmon endpoint %s', endpoint)

  # construct reply Atom object
  self.source_url = resp.source()
  activity = self.source_obj
  if self.source_obj.get('verb') not in source.VERBS_WITH_OBJECT:
    activity = {'object': self.source_obj}
  entry = atom.activity_to_atom(activity, xml_base=self.source_url)
  logging.info('Converted %s to Atom:\n%s', self.source_url, entry)

  # sign reply and wrap it in a magic envelope
  domain = urllib.parse.urlparse(self.source_url).netloc
  key = MagicKey.get_or_create(domain)
  logging.info('Using key for %s: %s', domain, key)
  magic_envelope = magicsigs.magic_envelope(
    entry, common.CONTENT_TYPE_ATOM, key).decode()

  logging.info('Sending Salmon slap to %s', endpoint)
  common.requests_post(
    endpoint, data=common.XML_UTF8 + magic_envelope,
    headers={'Content-Type': common.CONTENT_TYPE_MAGIC_ENVELOPE})
  return True
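# When the Atom feed itself has no Salmon link, the WebFinger fallback above
# builds a resource URL of this shape (hypothetical author and host, shown only
# to illustrate the format the code constructs):
#
#   https://example.com/.well-known/webfinger?resource=acct:alice@example.com
#
# and then pulls the Salmon replies endpoint out of the returned profile via
# django_salmon.get_salmon_replies_link().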
def scraped_to_activities(self, html, cookie=None, count=None,
                          fetch_extras=False):
  """Converts scraped Instagram HTML to ActivityStreams activities.

  The input HTML may be from:

  * a user's feed, eg https://www.instagram.com/ while logged in
  * a user's profile, eg https://www.instagram.com/snarfed/
  * a photo or video, eg https://www.instagram.com/p/BBWCSrfFZAk/

  Args:
    html: unicode string
    cookie: string, optional sessionid cookie to be used for subsequent HTTP
      fetches, if necessary.
    count: integer, number of activities to return, None for all
    fetch_extras: whether to make extra HTTP fetches to get likes, etc.

  Returns:
    tuple, ([ActivityStreams activities], ActivityStreams viewer actor)
  """
  cookie = cookie or self.cookie

  # extract JSON data blob
  # (can also get just this JSON by adding ?__a=1 to any IG URL.)
  matches = HTML_DATA_RE.findall(html)
  if not matches:
    # Instagram sometimes returns 200 with incomplete HTML. often it stops at
    # the end of one of the <style> tags inside <head>. not sure why.
    logging.warning('JSON script tag not found!')
    return [], None

  # find media
  medias = []
  profile_user = None
  viewer_user = None

  for match in matches:
    data = util.trim_nulls(json_loads(match[1]))
    entry_data = data.get('entry_data', {})

    # home page ie news feed
    for page in entry_data.get('FeedPage', []):
      edges = page.get('graphql', {}).get('user', {})\
                  .get('edge_web_feed_timeline', {}).get('edges', [])
      medias.extend(
        e.get('node') for e in edges
        if e.get('node', {}).get('__typename') not in (
          'GraphSuggestedUserFeedUnit',))

    if 'user' in data:
      edges = data['user'].get('edge_web_feed_timeline', {}).get('edges', [])
      medias.extend(e.get('node') for e in edges)

    # user profiles
    for page in entry_data.get('ProfilePage', []):
      profile_user = page.get('graphql', {}).get('user', {})
      medias.extend(edge['node'] for edge in profile_user.get(
        'edge_owner_to_timeline_media', {}).get('edges', [])
                    if edge.get('node'))

    if not viewer_user:
      viewer_user = data.get('config', {}).get('viewer')

    # individual photo/video permalinks
    for page in [data] + entry_data.get('PostPage', []):
      media = page.get('graphql', {}).get('shortcode_media')
      if media:
        medias.append(media)

  if not medias:
    # As of 2018-02-15, embedded JSON in logged in https://www.instagram.com/
    # no longer has any useful data. Need to do a second header link fetch.
    soup = util.parse_html(html)
    link = soup.find('link', href=HTML_PRELOAD_RE)
    if link:
      url = urllib.parse.urljoin(HTML_BASE_URL, link['href'])
      data = self._scrape_json(url, cookie=cookie)
      edges = data.get('data', {}).get('user', {})\
                  .get('edge_web_feed_timeline', {}).get('edges', [])
      medias = [e.get('node') for e in edges]

  if count:
    medias = medias[:count]

  activities = []
  for media in util.trim_nulls(medias):
    activity = self._json_media_node_to_activity(media, user=profile_user)

    # likes
    shortcode = media.get('code') or media.get('shortcode')
    likes = media.get('edge_media_preview_like') or {}
    if shortcode and fetch_extras and likes.get('count') and not likes.get('edges'):
      # extra GraphQL fetch to get likes, as of 8/2018
      likes_json = self._scrape_json(HTML_LIKES_URL % shortcode, cookie=cookie)
      self.merge_scraped_reactions(likes_json, activity)

    activities.append(util.trim_nulls(activity))

  actor = None
  user = self._json_user_to_user(viewer_user or profile_user)
  if user:
    actor = self.user_to_actor(user)

  return activities, actor
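# A minimal usage sketch (assumes an instance of this source class and a saved
# copy of an Instagram profile or feed page; the variable names and filename
# are hypothetical):
#
#   with open('instagram_profile.html') as f:
#     activities, viewer_actor = ig.scraped_to_activities(f.read(), count=10)
#   # activities: list of AS1 activity dicts; viewer_actor: AS1 actor or None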
def _content_for_create(self, obj, ignore_formatting=False, prefer_name=False,
                        strip_first_video_tag=False, strip_quotations=False):
  """Returns content text for :meth:`create()` and :meth:`preview_create()`.

  Returns summary if available, then content, then displayName.

  If using content, renders the HTML content to text using html2text so that
  whitespace is formatted like in the browser.

  Args:
    obj: dict, ActivityStreams object
    ignore_formatting: boolean, whether to use content text as is, instead of
      converting its HTML to plain text styling (newlines, etc.)
    prefer_name: boolean, whether to prefer displayName to content
    strip_first_video_tag: if true, removes the first <video> tag. useful when
      it will be uploaded and attached to the post natively in the silo.
    strip_quotations: if true, removes .u-quotation-of tags. useful when
      creating quote tweets.

  Returns:
    string, possibly empty
  """
  summary = obj.get('summary', '').strip()
  name = obj.get('displayName', '').strip()
  content = obj.get('content', '').strip()

  # note that unicode() on a BeautifulSoup object preserves HTML and
  # whitespace, even after modifying the DOM, which is important for
  # formatting.
  #
  # The catch is that it adds a '<html><head></head><body>' header and
  # '</body></html>' footer. ah well. harmless.
  soup = util.parse_html(content)
  if strip_first_video_tag:
    video = soup.video or soup.find(class_='u-video')
    if video:
      video.extract()
      content = str(soup)

  if strip_quotations:
    quotations = soup.find_all(class_='u-quotation-of')
    if quotations:
      for q in quotations:
        q.extract()
      content = str(soup)

  # compare to content with HTML tags stripped
  if summary == soup.get_text('').strip():
    # summary and content are the same; prefer content so that we can use its
    # HTML formatting.
    summary = None

  # sniff whether content is HTML or plain text. use html.parser instead of
  # the default html5lib since html.parser is stricter and expects actual
  # HTML tags.
  # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers
  is_html = (bool(BeautifulSoup(content, 'html.parser').find())
             or HTML_ENTITY_RE.search(content))
  if is_html and not ignore_formatting:
    content = html_to_text(content, **self.HTML2TEXT_OPTIONS)
  elif not is_html and ignore_formatting:
    content = re.sub(r'\s+', ' ', content)

  return summary or ((name or content) if prefer_name
                     else (content or name)) or ''
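# Illustration of the precedence rules above (hypothetical object; exact output
# depends on HTML2TEXT_OPTIONS):
#
#   obj = {'displayName': 'My post', 'content': '<p>Hello <em>world</em></p>'}
#   self._content_for_create(obj)                    # plain text rendering of the HTML
#   self._content_for_create(obj, prefer_name=True)  # 'My post'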
def from_activities(activities, actor=None, title=None, feed_url=None,
                    home_page_url=None, hfeed=None):
  """Converts ActivityStreams activities to an RSS 2.0 feed.

  Args:
    activities: sequence of ActivityStreams activity dicts
    actor: ActivityStreams actor dict, the author of the feed
    title: string, the feed title
    feed_url: string, the URL for this RSS feed
    home_page_url: string, the home page URL
    hfeed: dict, parsed mf2 h-feed, if available

  Returns:
    unicode string with RSS 2.0 XML
  """
  try:
    iter(activities)
  except TypeError:
    raise TypeError('activities must be iterable')

  if isinstance(activities, (dict, str)):
    raise TypeError('activities may not be a dict or string')

  assert feed_url
  fg = FeedGenerator()
  fg.id(feed_url)
  fg.link(href=feed_url, rel='self')
  if home_page_url:
    fg.link(href=home_page_url, rel='alternate')
  # TODO: parse language from lang attribute:
  # https://github.com/microformats/mf2py/issues/150
  fg.language('en')
  fg.generator('granary', uri='https://granary.io/')

  hfeed = hfeed or {}
  actor = actor or {}
  image = (util.get_url(hfeed.get('properties', {}), 'photo')
           or util.get_url(actor, 'image'))
  if image:
    fg.image(image)

  props = hfeed.get('properties') or {}
  content = microformats2.get_text(util.get_first(props, 'content', ''))
  summary = util.get_first(props, 'summary', '')
  desc = content or summary or '-'
  fg.description(desc)  # required
  fg.title(title or util.ellipsize(desc))  # required

  latest = None
  feed_has_enclosure = False
  for activity in activities:
    obj = activity.get('object') or activity
    if obj.get('objectType') == 'person':
      continue

    item = fg.add_entry()
    url = obj.get('url')
    id = obj.get('id') or url
    item.id(id)
    item.link(href=url)
    item.guid(url, permalink=True)

    # title (required)
    title = (obj.get('title') or obj.get('displayName')
             or util.ellipsize(obj.get('content', '-')))
    # strip HTML tags
    title = util.parse_html(title).get_text('').strip()
    item.title(title)

    content = microformats2.render_content(obj, include_location=True,
                                           render_attachments=True,
                                           render_image=True)
    if not content:
      content = obj.get('summary')
    if content:
      item.content(content, type='CDATA')

    categories = [
      {'term': t['displayName']} for t in obj.get('tags', [])
      if t.get('displayName')
      and t.get('verb') not in ('like', 'react', 'share')
      and t.get('objectType') not in ('article', 'person', 'mention')
    ]
    item.category(categories)

    author = obj.get('author', {})
    author = {
      'name': author.get('displayName') or author.get('username'),
      'uri': author.get('url'),
      'email': author.get('email') or '-',
    }
    item.author(author)

    published = obj.get('published') or obj.get('updated')
    if published and isinstance(published, str):
      try:
        dt = mf2util.parse_datetime(published)
        if not isinstance(dt, datetime):
          dt = datetime.combine(dt, time.min)
        if not dt.tzinfo:
          dt = dt.replace(tzinfo=util.UTC)
        item.published(dt)
        if not latest or dt > latest:
          latest = dt
      except ValueError:  # bad datetime string
        pass

    item_has_enclosure = False
    for att in obj.get('attachments', []):
      stream = util.get_first(att, 'stream') or att
      if not stream:
        continue

      url = stream.get('url') or ''
      mime = mimetypes.guess_type(url)[0] or ''
      if (att.get('objectType') in ENCLOSURE_TYPES or
          mime and mime.split('/')[0] in ENCLOSURE_TYPES):
        if item_has_enclosure:
          logging.warning(
            'Item %s already has an RSS enclosure, skipping additional enclosure %s',
            id, url)
          continue

        item_has_enclosure = feed_has_enclosure = True
        item.enclosure(url=url, type=mime, length=str(stream.get('size', '')))
        item.load_extension('podcast')
        duration = stream.get('duration')
        if duration:
          item.podcast.itunes_duration(duration)

  if feed_has_enclosure:
    fg.load_extension('podcast')
    fg.podcast.itunes_author(actor.get('displayName') or actor.get('username'))
    if summary:
      fg.podcast.itunes_summary(summary)
    fg.podcast.itunes_explicit('no')
    fg.podcast.itunes_block(False)
    name = author.get('name')
    if name:
      fg.podcast.itunes_author(name)
    if image:
      fg.podcast.itunes_image(image)
    fg.podcast.itunes_category(categories)

  if latest:
    fg.lastBuildDate(latest)

  return fg.rss_str(pretty=True).decode('utf-8')
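# A minimal usage sketch (assumes this module is importable as granary.rss; the
# URLs and activity below are illustrative only):
#
#   from granary import rss
#   xml = rss.from_activities(
#     [{'verb': 'post',
#       'object': {'objectType': 'note',
#                  'url': 'https://example.com/posts/1',
#                  'content': 'Hello world',
#                  'published': '2020-01-01T00:00:00Z'}}],
#     feed_url='https://example.com/feed.rss',
#     home_page_url='https://example.com/',
#     title='Example feed')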
def _prepare_activity(a, reader=True):
  """Preprocesses an activity to prepare it to be rendered as Atom.

  Modifies a in place.

  Args:
    a: ActivityStreams 1 activity dict
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.
  """
  act_type = source.object_type(a)
  obj = util.get_first(a, 'object', default={})
  primary = obj if (not act_type or act_type == 'post') else a

  # Render content as HTML; escape &s
  obj['rendered_content'] = _encode_ampersands(microformats2.render_content(
    primary, include_location=reader, render_attachments=True,
    # Readers often obey CSS white-space: pre strictly and don't even line wrap,
    # so don't use it.
    # https://forum.newsblur.com/t/android-cant-read-line-pre-formatted-lines/6116
    white_space_pre=False))

  # Make sure every activity has the title field, since Atom <entry> requires
  # the title element.
  if not a.get('title'):
    a['title'] = util.ellipsize(_encode_ampersands(
      a.get('displayName') or a.get('content') or obj.get('title') or
      obj.get('displayName') or obj.get('content') or 'Untitled'))

  # strip HTML tags. the Atom spec says title is plain text:
  # http://atomenabled.org/developers/syndication/#requiredEntryElements
  a['title'] = xml.sax.saxutils.escape(util.parse_html(a['title']).get_text(''))

  children = []
  image_urls_seen = set()
  image_atts = []

  # normalize actors
  for elem in a, obj:
    _prepare_actor(elem.get('actor'))

  # normalize attachments, render attached notes/articles
  attachments = a.get('attachments') or obj.get('attachments') or []
  for att in attachments:
    att['stream'] = util.get_first(att, 'stream')
    type = att.get('objectType')

    if type == 'image':
      att['image'] = util.get_first(att, 'image')
      image_atts.append(att['image'])
      continue

    image_urls_seen |= set(util.get_urls(att, 'image'))
    if type in ('note', 'article'):
      html = microformats2.render_content(
        att, include_location=reader, render_attachments=True,
        white_space_pre=False)
      author = att.get('author')
      if author:
        name = microformats2.maybe_linked_name(
          microformats2.object_to_json(author).get('properties') or {})
        html = '%s: %s' % (name.strip(), html)
      children.append(html)

  # render image(s) that we haven't already seen
  for image in image_atts + util.get_list(obj, 'image'):
    if not image:
      continue
    url = image.get('url')
    parsed = urllib.parse.urlparse(url)
    rest = urllib.parse.urlunparse(('', '') + parsed[2:])
    img_src_re = re.compile(r"""src *= *['"] *((https?:)?//%s)?%s *['"]""" %
                            (re.escape(parsed.netloc),
                             _encode_ampersands(re.escape(rest))))
    if (url and url not in image_urls_seen and
        not img_src_re.search(obj['rendered_content'])):
      children.append(microformats2.img(url))
      image_urls_seen.add(url)

  obj['rendered_children'] = [_encode_ampersands(child) for child in children]

  # make sure published and updated are strict RFC 3339 timestamps
  for prop in 'published', 'updated':
    val = obj.get(prop)
    if val:
      obj[prop] = util.maybe_iso8601_to_rfc3339(val)
      # Atom timestamps are even stricter than RFC 3339: they can't be naive ie
      # time zone unaware. They must have either an offset or the Z suffix.
      # https://www.feedvalidator.org/docs/error/InvalidRFC3339Date.html
      if not util.TIMEZONE_OFFSET_RE.search(obj[prop]):
        obj[prop] += 'Z'
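# After this runs, the activity carries the extra fields the Atom template
# consumes (a rough sketch of the shape, not exhaustive):
#
#   a['title']                          # plain text, XML-escaped
#   a['object']['rendered_content']     # rendered HTML with &s escaped
#   a['object']['rendered_children']    # HTML for attached notes/articles/images
#   a['object']['published'] / ['updated']  # RFC 3339, with tz offset or 'Z'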
def template_vars(self, domain, url=None):
  """Generates WebFinger data for a domain.

  Args:
    domain: string
    url: string, optional URL to try first for the representative h-card,
      before the domain's home page

  Returns:
    dict, WebFinger JRD-style data
  """
  assert domain

  if domain.split('.')[-1] in NON_TLDS:
    self.error("%s doesn't look like a domain" % domain, status=404)

  # find representative h-card. try url, then url's home page, then domain
  urls = ['http://%s/' % domain]
  if url:
    urls = [url, urllib.parse.urljoin(url, '/')] + urls

  for candidate in urls:
    resp = common.requests_get(candidate)
    parsed = util.parse_html(resp)
    mf2 = util.parse_mf2(parsed, url=resp.url)
    # logging.debug('Parsed mf2 for %s: %s', resp.url, json_dumps(mf2, indent=2))
    hcard = mf2util.representative_hcard(mf2, resp.url)
    if hcard:
      logging.info('Representative h-card: %s', json_dumps(hcard, indent=2))
      break
  else:
    self.error("""\
Couldn't find a representative h-card
(http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)

  logging.info('Generating WebFinger data for %s', domain)
  key = models.MagicKey.get_or_create(domain)
  props = hcard.get('properties', {})
  urls = util.dedupe_urls(props.get('url', []) + [resp.url])
  canonical_url = urls[0]

  acct = '%s@%s' % (domain, domain)
  for url in urls:
    if url.startswith('acct:'):
      urluser, urldomain = util.parse_acct_uri(url)
      if urldomain == domain:
        acct = '%s@%s' % (urluser, domain)
        logging.info('Found custom username: acct:%s', acct)
        break

  # discover atom feed, if any
  atom = parsed.find('link', rel='alternate', type=common.CONTENT_TYPE_ATOM)
  if atom and atom['href']:
    atom = urllib.parse.urljoin(resp.url, atom['href'])
  else:
    atom = 'https://granary.io/url?' + urllib.parse.urlencode({
      'input': 'html',
      'output': 'atom',
      'url': resp.url,
      'hub': resp.url,
    })

  # discover PuSH hub, if any; fall back to the default
  hub = 'https://bridgy-fed.superfeedr.com/'
  for link in resp.headers.get('Link', '').split(','):
    match = common.LINK_HEADER_RE.match(link)
    if match and match.group(2) == 'hub':
      hub = match.group(1)
      break

  # generate webfinger content
  data = util.trim_nulls({
    'subject': 'acct:' + acct,
    'aliases': urls,
    'magic_keys': [{'value': key.href()}],
    'links': sum(([{
      'rel': 'http://webfinger.net/rel/profile-page',
      'type': 'text/html',
      'href': url,
    }] for url in urls if url.startswith('http')), []) + [{
      'rel': 'http://webfinger.net/rel/avatar',
      'href': url,
    } for url in props.get('photo', [])] + [{
      'rel': 'canonical_uri',
      'type': 'text/html',
      'href': canonical_url,
    },

    # ActivityPub
    {
      'rel': 'self',
      'type': common.CONTENT_TYPE_AS2,
      # WARNING: in python 2 sometimes request.host_url lost port,
      # http://localhost:8080 would become just http://localhost. no clue how
      # or why. pay attention here if that happens again.
      'href': '%s/%s' % (self.request.host_url, domain),
    }, {
      'rel': 'inbox',
      'type': common.CONTENT_TYPE_AS2,
      'href': '%s/%s/inbox' % (self.request.host_url, domain),
    },

    # OStatus
    {
      'rel': 'http://schemas.google.com/g/2010#updates-from',
      'type': common.CONTENT_TYPE_ATOM,
      'href': atom,
    }, {
      'rel': 'hub',
      'href': hub,
    }, {
      'rel': 'magic-public-key',
      'href': key.href(),
    }, {
      'rel': 'salmon',
      'href': '%s/%s/salmon' % (self.request.host_url, domain),
    }],
  })

  logging.info('Returning WebFinger data: %s', json_dumps(data, indent=2))
  return data
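# Sketch of a client request and the resulting document for a hypothetical
# domain (abridged; the full 'links' list is assembled above):
#
#   GET /.well-known/webfinger?resource=acct:example.com@example.com
#
#   {
#     "subject": "acct:example.com@example.com",
#     "aliases": ["https://example.com/", ...],
#     "links": [
#       {"rel": "self", "type": <CONTENT_TYPE_AS2>, "href": ".../example.com"},
#       {"rel": "inbox", "type": <CONTENT_TYPE_AS2>, "href": ".../example.com/inbox"},
#       ...
#     ]
#   }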