def get(self):
  expected_inputs = ('activitystreams', 'html', 'json-mf2')
  input = util.get_required_param(self, 'input')
  if input not in expected_inputs:
    raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                             (input, expected_inputs))

  url = util.get_required_param(self, 'url')

  # check if request is cached
  cache = self.request.get('cache', '').lower() != 'false'
  cache_key = 'U %s' % url
  cached = memcache.get(cache_key) if cache else None

  if cached:
    logging.info('Serving cached response %r', cache_key)
    url = cached['url']
    body = cached['body']
  else:
    # fetch url
    try:
      resp = util.urlopen(url)
    except (ValueError, httplib.InvalidURL) as e:
      self.abort(400, str(e))
    # other exceptions are handled by webutil.handlers.handle_exception(),
    # which uses interpret_http_exception(), etc.

    if url != resp.geturl():
      url = resp.geturl()
      logging.info('Redirected to %s', url)
    body = resp.read()

    if cache:
      logging.info('Caching response in %r', cache_key)
      memcache.set(cache_key, {'url': url, 'body': body}, URL_CACHE_TIME)

  # decode data
  mf2 = None
  if input == 'html':
    mf2 = mf2py.parse(doc=body, url=url)
  elif input == 'json-mf2':
    mf2 = json.loads(body)
    mf2.setdefault('rels', {})  # mf2util expects rels

  actor = None
  title = None
  if mf2:
    actor = microformats2.find_author(
      mf2, fetch_mf2_func=lambda url: mf2py.parse(url=url))
    title = mf2util.interpret_feed(mf2, url).get('name')

  if input == 'activitystreams':
    activities = json.loads(body)
  elif input == 'html':
    activities = microformats2.html_to_activities(body, url, actor)
  elif input == 'json-mf2':
    activities = [microformats2.json_to_object(item, actor=actor)
                  for item in mf2.get('items', [])]

  self.write_response(source.Source.make_activities_base_response(activities),
                      url=url, actor=actor, title=title)
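# A minimal sketch (not from the handler above) of what the
# `mf2util.interpret_feed(mf2, url).get('name')` step produces: interpret_feed
# flattens a parsed mf2 page into a dict with optional 'name' and 'entries'
# keys. The HTML below is made up for illustration.
import mf2py
import mf2util

html = '<div class="h-feed"><h1 class="p-name">Example Feed</h1></div>'
parsed = mf2py.parse(doc=html, url='http://example.com/')
feed = mf2util.interpret_feed(parsed, 'http://example.com/')
print(feed.get('name'))  # expected: 'Example Feed'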
def html_to_atom(html, url=None, fetch_author=False, reader=True):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = mf2py.parse(doc=html, url=url)
  actor = microformats2.find_author(
    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=mf2util.interpret_feed(parsed, url).get('name'),
    xml_base=util.base_url(url),
    host_url=url,
    reader=reader)
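# A hedged usage sketch for html_to_atom() above, assuming it is exposed as
# granary's atom.html_to_atom (adjust the import to your setup). 'feed.html'
# is a hypothetical saved copy of an h-feed page.
from granary import atom

with open('feed.html') as f:
  xml = atom.html_to_atom(f.read(), url='http://example.com/',
                          fetch_author=True)
print(xml[:200])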
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, (datetime.date, datetime.datetime)):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    op = request.args.get('op')
    if url:
        try:
            d = mf2py.parse(url=url)
            if op == 'post-type-discovery':
                entry = mf2util.find_first_entry(d, ['h-entry', 'h-event'])
                return jsonify({'type': mf2util.post_type_discovery(entry)})

            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)
            return jsonify(dates_to_string(json))
        except:
            current_app.logger.exception('running mf2util service')
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
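# Why the dates_to_string() helper above exists: Flask's jsonify (via
# json.dumps) can't serialize the datetime objects that mf2util puts in
# fields like 'published' and 'updated', so they're converted to ISO 8601
# strings first. An illustrative sketch, assuming a module-level copy of the
# helper and made-up data:
import datetime

entry = {'name': 'Hello world',
         'published': datetime.datetime(2015, 6, 1, 12, 30)}
print(dates_to_string(entry))
# -> {'name': 'Hello world', 'published': '2015-06-01T12:30:00'}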
def get_title(mf2):
  """Returns an mf2 object's title, ie its name.

  Args:
    mf2: dict, parsed mf2 object (ie return value from mf2py.parse())

  Returns:
    string title, possibly ellipsized
  """
  lines = mf2util.interpret_feed(mf2, '').get('name', '').splitlines()
  if lines:
    return util.ellipsize(lines[0])
  return ''
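# Why get_title() above takes only the first line and ellipsizes: when a page
# has no short explicit feed name, the name mf2util reports can span long,
# multi-line content. A sketch; util.ellipsize is assumed to be webutil's
# truncation helper, and the input is made up.
from oauth_dropins.webutil import util

name = 'A long implied feed name\nthat spills onto more lines'
print(util.ellipsize(name.splitlines()[0]))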
def process_html_feed_for_new_entries(feed, content, backfill, now):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content)

    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed, doc=content), feed.feed)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry
def html_to_atom(html, url=None, **kwargs):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional

  Returns:
    unicode string with Atom XML
  """
  parsed = mf2py.parse(doc=html, url=url)
  return activities_to_atom(
    microformats2.html_to_activities(html, url),
    microformats2.find_author(parsed),
    title=mf2util.interpret_feed(parsed, url).get('name'),
    xml_base=util.base_url(url),
    host_url=url)
def get(self):
  expected_inputs = ('activitystreams', 'html', 'json-mf2', 'jsonfeed')
  input = util.get_required_param(self, 'input')
  if input not in expected_inputs:
    raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                             (input, expected_inputs))

  url, body = self._urlopen(util.get_required_param(self, 'url'))

  # decode data
  mf2 = None
  if input == 'html':
    mf2 = mf2py.parse(doc=body, url=url)
  elif input == 'json-mf2':
    mf2 = json.loads(body)
    mf2.setdefault('rels', {})  # mf2util expects rels

  actor = None
  title = None
  if mf2:
    def fetch_mf2_func(url):
      _, doc = self._urlopen(url)
      return mf2py.parse(doc=doc, url=url)

    actor = microformats2.find_author(mf2, fetch_mf2_func=fetch_mf2_func)
    title = mf2util.interpret_feed(mf2, url).get('name')

  if input == 'activitystreams':
    activities = json.loads(body)
  elif input == 'html':
    activities = microformats2.html_to_activities(body, url, actor)
  elif input == 'json-mf2':
    activities = [
      microformats2.json_to_object(item, actor=actor)
      for item in mf2.get('items', [])
    ]
  elif input == 'jsonfeed':
    activities, actor = jsonfeed.jsonfeed_to_activities(json.loads(body))

  self.write_response(
    source.Source.make_activities_base_response(activities),
    url=url, actor=actor, title=title)
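# The fetch_mf2_func hook above is just "URL in, parsed mf2 dict out"; it's
# what find_author() uses to fetch and parse the author page behind a
# rel-author link. A standalone sketch using requests, since self._urlopen is
# specific to this handler:
import mf2py
import requests

def fetch_mf2_func(url):
  resp = requests.get(url, timeout=15)
  resp.raise_for_status()
  return mf2py.parse(doc=resp.text, url=url)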
def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash("Error fetching source {}".format(repr(e)))
        flask.current_app.logger.warn(
            "Subscribe failed for %s with error %s", origin, repr(e))
        return None

    feeds = []
    xml_feed_types = [
        "application/rss+xml",
        "application/atom+xml",
        "application/rdf+xml",
        "application/xml",
        "text/xml",
    ]
    xml_mime_types = xml_feed_types + [
        "text/xml",
        "text/rss+xml",
        "text/atom+xml",
    ]

    content_type = resp.headers["content-type"]
    content_type = content_type.split(";", 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({"origin": origin, "feed": origin, "type": "xml",
                      "title": "untitled xml feed"})
    elif content_type == "text/html":
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get("entries"):
            ftitle = hfeed.get("name") or "untitled h-feed"
            feeds.append({"origin": origin, "feed": resp.url, "type": "html",
                          "title": ftitle[:140]})

        # look for link rel="feed"
        for furl in parsed.get("rels", {}).get("feed", []):
            fprops = parsed.get("rel-urls", {}).get(furl, {})
            if not fprops.get("type") or fprops.get("type") == "text/html":
                feeds.append({"origin": origin, "feed": furl, "type": "html",
                              "title": fprops.get("title")})

        # then look for link rel="alternate"
        for link in parsed.get("alternates", []):
            if link.get("type") in xml_feed_types:
                feeds.append({"origin": origin, "feed": link.get("url"),
                              "type": "xml", "title": link.get("title")})

    return feeds
def test_h_feed_excludes_rel_syndication():
    """Represents a feed that (incorrectly) includes page-scoped
    rel=syndication values in the feed itself. If we're not careful, these
    values will be slurped into every entry in the feed.
    """
    parsed = {
        "items": [{
            "type": ["h-entry"],
            "properties": {
                "name": ["First Post"],
                "url": ["http://example.com/first-post"],
                "content": [{
                    "html": "This is the body of the first post",
                    "value": "This is the body of the first post",
                }],
                "syndication": [
                    "https://twitter.com/example_com/123456",
                    "https://www.facebook.com/example.com/123456",
                ],
            },
        }, {
            "type": ["h-event"],
            "properties": {
                "name": ["Second Post"],
                "url": ["http://example.com/second-post"],
                "content": [{
                    "html": "This is the body of the second post",
                    "value": "This is the body of the second post",
                }],
                "syndication": [
                    "https://twitter.com/example_com/7891011",
                    "https://www.facebook.com/example.com/7891011",
                ],
            },
        }],
        "rels": {
            "syndication": [
                "https://twitter.com/example_com/123456",
                "https://twitter.com/example_com/7891011",
                "https://www.facebook.com/example.com/123456",
                "https://www.facebook.com/example.com/7891011",
            ],
        },
    }

    result = mf2util.interpret_feed(parsed, 'http://example.com')
    assert result['entries'][0]['syndication'] == [
        "https://twitter.com/example_com/123456",
        "https://www.facebook.com/example.com/123456",
    ]
    assert result['entries'][1]['syndication'] == [
        "https://twitter.com/example_com/7891011",
        "https://www.facebook.com/example.com/7891011",
    ]
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, (datetime.date, datetime.datetime)):
            return json.isoformat()
        return json

    url = request.args.get('url')
    if url:
        d = mf2py.Parser(url=url).to_dict()
        if mf2util.find_first_entry(d, ['h-feed']):
            json = mf2util.interpret_feed(d, url)
        else:
            json = mf2util.interpret(d, url)
        return jsonify(dates_to_string(json))

    return """
def process_html_feed_for_new_entries(feed, content, backfill, now,
                                      fetch_mf2_func):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)

    # look for a <base> element
    doc = bs4.BeautifulSoup(content, 'html5lib')
    base_el = doc.find('base')
    base_href = base_el.get('href') if base_el else None

    parsed = mf2util.interpret_feed(
        mf2py.parse(doc, feed.feed),
        source_url=feed.feed,
        base_href=base_href,
        fetch_mf2_func=fetch_mf2_func)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        current_app.logger.debug('building entry: %s', hentry.get('url'))
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry
def add_subscription(origin, feed_url, type, tags=None):
    feed = Feed.query.filter_by(feed=feed_url, type=type).first()
    if not feed:
        name = None
        if type == 'html':
            flask.current_app.logger.debug('mf2py parsing %s', feed_url)
            resp = util.requests_get(feed_url)
            feed_text = (resp.text
                         if 'charset' in resp.headers.get('content-type', '')
                         else resp.content)
            parsed = mf2util.interpret_feed(
                mf2py.parse(doc=feed_text, url=feed_url), feed_url)
            name = parsed.get('name')
        elif type == 'xml':
            flask.current_app.logger.debug('feedparser parsing %s', feed_url)
            parsed = feedparser.parse(feed_url, agent=util.USER_AGENT)
            if parsed.feed:
                name = parsed.feed.get('title')
        else:
            flask.current_app.logger.error('unknown feed type %s', type)
            flask.abort(400)

        if not name:
            p = urllib.parse.urlparse(origin)
            name = p.netloc + p.path

        feed = Feed(name=name[:140], origin=origin, feed=feed_url, type=type)

    if feed:
        db.session.add(feed)
        flask_login.current_user.subscriptions.append(
            Subscription(feed=feed, name=feed.name, tags=tags))
        db.session.commit()
        # go ahead and update the feed
        tasks.q.enqueue(tasks.update_feed, feed.id)

    return feed
def html_to_atom(html, url=None, fetch_author=False):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = mf2py.parse(doc=html, url=url)
  actor = microformats2.find_author(
    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=mf2util.interpret_feed(parsed, url).get('name'),
    xml_base=util.base_url(url),
    host_url=url)
def get(self):
  expected_inputs = ('activitystreams', 'html', 'json-mf2')
  input = util.get_required_param(self, 'input')
  if input not in expected_inputs:
    raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                             (input, expected_inputs))

  # fetch url
  url = util.get_required_param(self, 'url')
  resp = util.urlopen(url)
  if url != resp.geturl():
    url = resp.geturl()
    logging.info('Redirected to %s', url)
  body = resp.read()

  # decode data
  mf2 = None
  if input == 'activitystreams':
    activities = json.loads(body)
  elif input == 'html':
    activities = microformats2.html_to_activities(body, url)
    mf2 = mf2py.parse(doc=body, url=url)
  elif input == 'json-mf2':
    mf2 = json.loads(body)
    mf2.setdefault('rels', {})  # mf2util expects rels
    activities = [microformats2.json_to_object(item)
                  for item in mf2.get('items', [])]

  author = None
  title = None
  if mf2:
    author = microformats2.find_author(mf2)
    title = mf2util.interpret_feed(mf2, url).get('name')

  self.write_response(source.Source.make_activities_base_response(activities),
                      url=url, actor=author, title=title)
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, (datetime.date, datetime.datetime)):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    if url:
        try:
            d = mf2py.parse(url=url)
            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)
            return jsonify(dates_to_string(json))
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
def get(self):
  expected_inputs = ('activitystreams', 'html', 'json-mf2')
  input = util.get_required_param(self, 'input')
  if input not in expected_inputs:
    raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                             (input, expected_inputs))

  url = util.get_required_param(self, 'url')

  # check if request is cached
  cache = self.request.get('cache', '').lower() != 'false'
  cache_key = 'U %s' % url
  cached = memcache.get(cache_key) if cache else None

  if cached:
    logging.info('Serving cached response %r', cache_key)
    url = cached['url']
    body = cached['body']
  else:
    # fetch url
    try:
      resp = util.urlopen(url)
    except (ValueError, httplib.InvalidURL) as e:
      self.abort(400, str(e))
    except Exception as e:
      if util.is_connection_failure(e):
        # HTTP 504 Gateway Timeout
        self.abort(504, str(e))
      raise

    if url != resp.geturl():
      url = resp.geturl()
      logging.info('Redirected to %s', url)
    body = resp.read()

    if cache:
      logging.info('Caching response in %r', cache_key)
      memcache.set(cache_key, {'url': url, 'body': body}, URL_CACHE_TIME)

  # decode data
  mf2 = None
  if input == 'html':
    mf2 = mf2py.parse(doc=body, url=url)
  elif input == 'json-mf2':
    mf2 = json.loads(body)
    mf2.setdefault('rels', {})  # mf2util expects rels

  actor = None
  title = None
  if mf2:
    actor = microformats2.find_author(
      mf2, fetch_mf2_func=lambda url: mf2py.parse(url=url))
    title = mf2util.interpret_feed(mf2, url).get('name')

  if input == 'activitystreams':
    activities = json.loads(body)
  elif input == 'html':
    activities = microformats2.html_to_activities(body, url, actor)
  elif input == 'json-mf2':
    activities = [microformats2.json_to_object(item, actor=actor)
                  for item in mf2.get('items', [])]

  self.write_response(source.Source.make_activities_base_response(activities),
                      url=url, actor=actor, title=title)
def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash('Error fetching source {}'.format(repr(e)))
        flask.current_app.logger.warn(
            'Subscribe failed for %s with error %s', origin, repr(e))
        return None

    feeds = []
    xml_feed_types = [
        'application/rss+xml',
        'application/atom+xml',
        'application/rdf+xml',
        'application/xml',
        'text/xml',
    ]
    xml_mime_types = xml_feed_types + [
        'text/xml',
        'text/rss+xml',
        'text/atom+xml',
    ]
    html_feed_types = [
        'text/html',
        'application/xhtml+xml',
    ]

    content_type = resp.headers['content-type']
    content_type = content_type.split(';', 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({
            'origin': origin,
            'feed': origin,
            'type': 'xml',
            'title': 'untitled xml feed',
        })
    elif content_type in html_feed_types:
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get('entries'):
            ftitle = hfeed.get('name') or 'untitled h-feed'
            feeds.append({
                'origin': origin,
                'feed': resp.url,
                'type': 'html',
                'title': ftitle[:140],
            })

        # look for link rel="feed"
        for furl in parsed.get('rels', {}).get('feed', []):
            fprops = parsed.get('rel-urls', {}).get(furl, {})
            if not fprops.get('type') or fprops.get('type') in html_feed_types:
                feeds.append({
                    'origin': origin,
                    'feed': furl,
                    'type': 'html',
                    'title': fprops.get('title'),
                })

        # then look for link rel="alternate"
        for link in parsed.get('alternates', []):
            if link.get('type') in xml_feed_types:
                feeds.append({
                    'origin': origin,
                    'feed': link.get('url'),
                    'type': 'xml',
                    'title': link.get('title'),
                })

    return feeds
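# A sketch of the mf2py output keys find_possible_feeds() relies on, using
# made-up HTML. mf2py collects rel values under 'rels', per-URL rel metadata
# (type, title) under 'rel-urls', and rel="alternate" links under 'alternates'.
import mf2py

html = '''<link rel="alternate" type="application/atom+xml"
                href="/feed.atom" title="Atom feed">
          <a rel="feed" href="/articles" type="text/html">Articles</a>'''
parsed = mf2py.parse(doc=html, url='http://example.com/')
print(parsed['rels'].get('feed'))   # expected: ['http://example.com/articles']
print(parsed['rel-urls'])           # per-URL dicts with 'type', 'title', etc.
print(parsed.get('alternates'))     # dicts with 'url', 'type', 'title'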