def html_to_atom(html, url=None, fetch_author=False):
    """Converts microformats2 HTML to an Atom feed.

    Args:
      html: string
      url: string URL html came from, optional
      fetch_author: boolean, whether to make HTTP request to fetch rel-author link

    Returns:
      unicode string with Atom XML
    """
    if fetch_author:
        assert url, 'fetch_author=True requires url!'

    parsed = mf2py.parse(doc=html, url=url)
    actor = microformats2.find_author(
        parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

    return activities_to_atom(
        microformats2.html_to_activities(html, url, actor),
        actor,
        title=mf2util.interpret_feed(parsed, url).get('name'),
        xml_base=util.base_url(url),
        host_url=url)

def publish(source, target, endpoint, **kwargs):
    data = kwargs.get('data', {})
    data['_id'] = slugify(u'mention-{0}'.format(source))
    verified = data['verified'].get('state', False)

    if isinstance(target, list):
        real_target = target[-1]['url']
    else:
        real_target = data.pop('real_target', target)

    post_id = post_id_from_url(real_target)

    if verified:
        content = kwargs.get('body', None)
        if content is not None:
            mfdata = mf2py.parse(doc=content, html_parser="html5lib")
            #mentions = mention_from_doc(content)
        else:
            mfdata = mf2py.parse(url=source, html_parser="html5lib")
            #mentions = mention_from_url(source)

        mfdata['items'] = [bleachify(item) for item in mfdata['items']]

        data.update({'post_id': post_id, 'type': 'mention', 'format': 'mf2py'})
        data['data'] = mfdata

        res = update_record(endpoint.format(data['_id']), data)
        return res

def get(self):
    expected_inputs = ('activitystreams', 'html', 'json-mf2')
    input = util.get_required_param(self, 'input')
    if input not in expected_inputs:
        raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                                 (input, expected_inputs))
    url = util.get_required_param(self, 'url')

    # check if request is cached
    cache = self.request.get('cache', '').lower() != 'false'
    cache_key = 'U %s' % url
    cached = memcache.get(cache_key) if cache else None

    if cached:
        logging.info('Serving cached response %r', cache_key)
        url = cached['url']
        body = cached['body']
    else:
        # fetch url
        try:
            resp = util.urlopen(url)
        except (ValueError, httplib.InvalidURL) as e:
            self.abort(400, str(e))
        # other exceptions are handled by webutil.handlers.handle_exception(),
        # which uses interpret_http_exception(), etc.

        if url != resp.geturl():
            url = resp.geturl()
            logging.info('Redirected to %s', url)
        body = resp.read()

        if cache:
            logging.info('Caching response in %r', cache_key)
            memcache.set(cache_key, {'url': url, 'body': body}, URL_CACHE_TIME)

    # decode data
    mf2 = None
    if input == 'html':
        mf2 = mf2py.parse(doc=body, url=url)
    elif input == 'json-mf2':
        mf2 = json.loads(body)
        mf2.setdefault('rels', {})  # mf2util expects rels

    actor = None
    title = None
    if mf2:
        actor = microformats2.find_author(
            mf2, fetch_mf2_func=lambda url: mf2py.parse(url=url))
        title = mf2util.interpret_feed(mf2, url).get('name')

    if input == 'activitystreams':
        activities = json.loads(body)
    elif input == 'html':
        activities = microformats2.html_to_activities(body, url, actor)
    elif input == 'json-mf2':
        activities = [microformats2.json_to_object(item, actor=actor)
                      for item in mf2.get('items', [])]

    self.write_response(source.Source.make_activities_base_response(activities),
                        url=url, actor=actor, title=title)

def html_to_atom(html, url=None, fetch_author=False, reader=True):
    """Converts microformats2 HTML to an Atom feed.

    Args:
      html: string
      url: string URL html came from, optional
      fetch_author: boolean, whether to make HTTP request to fetch rel-author link
      reader: boolean, whether the output will be rendered in a feed reader.
        Currently just includes location if True, not otherwise.

    Returns:
      unicode string with Atom XML
    """
    if fetch_author:
        assert url, 'fetch_author=True requires url!'

    parsed = mf2py.parse(doc=html, url=url)
    actor = microformats2.find_author(
        parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

    return activities_to_atom(microformats2.html_to_activities(html, url, actor),
                              actor,
                              title=microformats2.get_title(parsed),
                              xml_base=util.base_url(url),
                              host_url=url,
                              reader=reader)

def html_to_atom(html, url=None, fetch_author=False, reader=True):
    """Converts microformats2 HTML to an Atom feed.

    Args:
      html: string
      url: string URL html came from, optional
      fetch_author: boolean, whether to make HTTP request to fetch rel-author link
      reader: boolean, whether the output will be rendered in a feed reader.
        Currently just includes location if True, not otherwise.

    Returns:
      unicode string with Atom XML
    """
    if fetch_author:
        assert url, 'fetch_author=True requires url!'

    parsed = mf2py.parse(doc=html, url=url, img_with_alt=True)
    actor = microformats2.find_author(
        parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url, img_with_alt=True))

    return activities_to_atom(
        microformats2.html_to_activities(html, url, actor),
        actor,
        title=microformats2.get_title(parsed),
        xml_base=util.base_url(url),
        host_url=url,
        reader=reader)

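# A minimal usage sketch (not from the original codebase) of the parse step the
# html_to_atom variants above rely on: mf2py.parse() accepts either a raw HTML
# string via doc= or a page via url=, and img_with_alt=True keeps <img alt> text
# alongside the photo URL. The HTML and URL below are invented for illustration.
import mf2py

sample_html = """
<article class="h-entry">
  <h1 class="p-name">Hello world</h1>
  <a class="u-url" href="/hello">permalink</a>
  <img class="u-photo" src="/me.jpg" alt="profile photo" />
</article>
"""
parsed = mf2py.parse(doc=sample_html, url='http://example.com/', img_with_alt=True)
assert parsed['items'][0]['type'] == ['h-entry']
# relative u-* URLs are resolved against url=, e.g. '/hello' -> 'http://example.com/hello'
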
def get_access_token(u, scopes):
    """Initiate an IndieAuth Authorization flow to get an access token
    (for talking to the Micropub endpoint)."""
    # Guess the identity from the URL
    me = urllib.parse.urlparse(u)._replace(path="/").geturl()

    # Fetch the 3 endpoints needed:
    # TODO(tsileo): clean error if missing
    dat = mf2py.parse(url=u)
    auth_endpoint = dat["rels"]["authorization_endpoint"][0]
    tok_endpoint = dat["rels"]["token_endpoint"][0]
    micropub_endpoint = dat["rels"]["micropub"][0]

    # Generate a random state
    state = binascii.hexlify(os.urandom(6)).decode()

    # Actually initiate the Authorization flow
    auth_url = (auth_endpoint + "?" + urllib.parse.urlencode({
        "me": me,
        "response_type": "code",
        "state": state,
        "redirect_uri": REDIRECT_URI,
        "scope": " ".join(scopes),
        "client_id": CLIENT_ID,
    }))

    # Open the URL in a tab
    webbrowser.open_new_tab(auth_url)

    click.echo("waiting for the IndieAuth callback...")
    tok = _wait_for_access_token(me, tok_endpoint)
    click.echo("success")

    # And wait for the callback via the redirect_uri
    return (me, micropub_endpoint, tok)

def do_whois(self, url):
    parsed = mf2py.parse(url=url)
    props = []

    for rel in 'authorization_endpoint', 'token_endpoint', 'micropub':
        for val in parsed['rels'].get(rel, []):
            props.append((rel, val))

    # top-level h-card first, then top-level h-* with .author
    hcard = None
    for item in parsed['items']:
        if 'h-card' in item['type']:
            hcard = item
            break

    if not hcard:
        for item in parsed['items']:
            if 'author' in item['properties']:
                hcard = item['properties']['author'][0]
                break

    if hcard:
        if isinstance(hcard, dict):
            for prop in 'name', 'photo', 'url':
                for val in hcard['properties'].get(prop, []):
                    props.append((prop, val))
        else:
            props.append(('name', hcard))

    return ("Here's everything I could find about %s\n " % url) + '\n '.join(
        "%s: %s" % (k, v) for k, v in props)

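# Hedged sketch of the rel-scraping pattern do_whois uses, run against inline
# HTML instead of a live page; the endpoint URLs below are placeholders.
import mf2py

whois_html = (
    '<link rel="authorization_endpoint" href="https://example.com/auth">'
    '<link rel="token_endpoint" href="https://example.com/token">'
    '<link rel="micropub" href="https://example.com/micropub">'
)
rels = mf2py.parse(doc=whois_html)['rels']
assert rels['micropub'] == ['https://example.com/micropub']
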
def post(self):
    logging.info('(Params: %s )', self.request.params.items())

    # fetch source page
    source = util.get_required_param(self, 'source')
    source_resp = common.requests_get(source)
    self.source_url = source_resp.url or source
    self.source_domain = urlparse.urlparse(self.source_url).netloc.split(':')[0]
    self.source_mf2 = mf2py.parse(source_resp.text, url=self.source_url,
                                  img_with_alt=True)
    # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))

    # check for backlink to bridgy fed (for webmention spec and to confirm
    # source's intent to federate to mastodon)
    if (self.request.host_url not in source_resp.text and
            urllib.quote(self.request.host_url, safe='') not in source_resp.text):
        common.error(self, "Couldn't find link to %s" % self.request.host_url)

    # convert source page to ActivityStreams
    entry = mf2util.find_first_entry(self.source_mf2, ['h-entry'])
    if not entry:
        common.error(self, 'No microformats2 found on %s' % self.source_url)

    logging.info('First entry: %s', json.dumps(entry, indent=2))
    # make sure it has url, since we use that for AS2 id, which is required
    # for ActivityPub.
    props = entry.setdefault('properties', {})
    if not props.get('url'):
        props['url'] = [self.source_url]

    self.source_obj = microformats2.json_to_object(entry, fetch_mf2=True)
    logging.info('Converted to AS1: %s', json.dumps(self.source_obj, indent=2))

    self.try_activitypub() or self.try_salmon()

def __init__(self, request: utils.RequestResult):
    """ Given a request object and retrieved text, parse out the feed """
    text = request.text

    md5 = hashlib.md5(text.encode('utf-8'))
    self.digest = md5.digest()

    self.url = str(request.url)
    self.caching = caching.make_headers(request.headers)

    self.feed = feedparser.parse(text)

    if 'bozo_exception' in self.feed:
        # feedparser couldn't handle this, so maybe it's mf2
        self.mf2 = mf2py.parse(text)
    else:
        self.mf2 = None

    self.status = request.status

    self.links: typing.DefaultDict[
        str, typing.Set[str]] = collections.defaultdict(set)
    try:
        for link in self.feed.feed.links:
            # conveniently this also contains the rel links from HTML
            # documents, so no need to handle the mf2 version (if any)
            href = link.get('href')
            rel = link.get('rel')
            if rel and href:
                self.links[rel].add(href)
    except (AttributeError, KeyError):
        pass

    self.schema = SCHEMA_VERSION

def parse_with_mf2py(url):
    result = mf2py.parse(url=url)
    if not result:
        return None
    if len(result.get('items', [])) == 0:
        return None

    item = result['items'][0]
    if not item['properties'].get('name'):
        return None
    if not item['properties'].get('content'):
        return None

    mf2 = {
        'type': ['h-entry'],
        'properties': {
            'name': item['properties']['name'],
            'content': item['properties']['content'],
            'syndication': [url],
            'url': [url]
        }
    }

    if item['properties'].get('author'):
        mf2['properties']['author'] = item['properties']['author']
    if item['properties'].get('published'):
        mf2['properties']['published'] = item['properties']['published']

    return mf2

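# Illustrative call to parse_with_mf2py above. The URL is a placeholder; the
# function fetches it and returns None unless the first top-level item carries
# both a p-name and e-content, so callers should handle that case.
mf2 = parse_with_mf2py('http://example.com/2020/01/some-post')
if mf2:
    print(mf2['properties']['name'], mf2['properties']['syndication'])
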
def convert_mf2():
    strip_rel_urls = request.args.get('strip_rel_urls') or request.form.get('strip_rel_urls')
    url = request.args.get('url') or request.form.get('url')
    doc = request.args.get('doc') or request.form.get('doc')
    doc = doc and doc.strip()

    if url and not doc:
        parsed = urllib.parse.urlparse(url)
        if parsed.fragment:
            r = requests.get(url)
            r.raise_for_status()
            doc = BeautifulSoup(
                r.text if 'charset' in r.headers.get('content-type', '')
                else r.content)
            doc = doc.find(id=parsed.fragment)

    if url or doc:
        try:
            json = mf2py.parse(url=url, doc=doc)
            if strip_rel_urls:
                json.pop('rel-urls', None)
            return jsonify(json)
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """

def build_user_json(me, resp=None):
    """user_json contains an h-card, rel-me links, and "me"

    Args:
      me: string, URL of the user, returned by
      resp: requests.Response (optional), re-use response if it's already been fetched

    Return:
      dict, with 'me', the URL for this person; 'h-card', the representative
        h-card for this page; 'rel-me', a list of rel-me URLs found at this page
    """
    user_json = {'me': me}

    resp = resp or util.requests_get(me)
    if resp.status_code // 100 != 2:
        logging.warning(
            'could not fetch user url "%s". got response code: %d',
            me, resp.status_code)
        return user_json

    # Requests doesn't look at the HTML body to find <meta charset>
    # tags, so if the character encoding isn't given in a header, then
    # we pass on the raw bytes and let BS4 deal with it.
    p = mf2py.parse(doc=resp.text if 'charset' in resp.headers.get('content-type', '')
                    else resp.content, url=me)
    user_json['rel-me'] = p.get('rels', {}).get('me')
    user_json['h-card'] = mf2util.representative_hcard(p, me)
    logging.debug('built user-json %r', user_json)
    return util.trim_nulls(user_json)

def fetch_reply_contexts(reply_pairs, now, fetch_mf2_func):
    old_contexts = {}
    in_reply_tos = [url for _, url in reply_pairs]
    if in_reply_tos:
        for entry in (Entry.query
                      .join(Entry.feed)
                      .filter(Entry.permalink.in_(in_reply_tos),
                              Feed.type == 'html')):
            old_contexts[entry.permalink] = entry

    for entry, in_reply_to in reply_pairs:
        context = old_contexts.get(in_reply_to)
        if not context:
            current_app.logger.info('fetching in-reply-to: %s', in_reply_to)
            try:
                proxied_reply_url = proxy_url(in_reply_to)
                parsed = mf2util.interpret(
                    mf2py.parse(url=proxied_reply_url),
                    in_reply_to,
                    fetch_mf2_func=fetch_mf2_func)
                if parsed:
                    context = hentry_to_entry(parsed, None, False, now)
            except requests.exceptions.RequestException as err:
                current_app.logger.warn(
                    '%s fetching reply context: %s for entry: %s',
                    type(err).__name__, proxied_reply_url, entry.permalink)

        if context:
            db.session.add(context)
            entry.reply_context.append(context)

def posse_post_discovery(original, regex):
    """Given an original URL and a permalink regex, looks for
    silo-specific syndication URLs. If the original is a silo url,
    that url is returned; otherwise we fetch the source and attempt
    to look for u-syndication URLs.
    """
    if not hasattr(regex, 'match'):
        regex = re.compile(regex)

    if regex.match(original):
        return original

    try:
        d = mf2py.parse(url=original)
        urls = d['rels'].get('syndication', [])
        for item in d['items']:
            if 'h-entry' in item['type']:
                urls += item['properties'].get('syndication', [])
        for url in urls:
            if regex.match(url):
                return url
    except HTTPError:
        current_app.logger.exception('Could not fetch original')
    except SSLError:
        current_app.logger.exception('SSL Error')
    except Exception as e:
        current_app.logger.exception('MF2 Parser error: %s', e)

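# Example of driving posse_post_discovery above with a permalink regex. The
# pattern is a rough guess at Twitter status URLs, not part of the original code.
import re

TWITTER_PERMALINK_RE = re.compile(r'https?://(?:www\.)?twitter\.com/\w+/status/\d+')
syndicated = posse_post_discovery('http://example.com/notes/42', TWITTER_PERMALINK_RE)
# returns the original if it already matches, otherwise the first matching
# u-syndication / rel-syndication URL found on the page, or None
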
def add_subscription(origin, feed_url, type, tags=None):
    feed = Feed.query.filter_by(feed=feed_url, type=type).first()
    if not feed:
        name = None
        if type == "html":
            flask.current_app.logger.debug("mf2py parsing %s", feed_url)
            resp = util.requests_get(feed_url)
            feed_text = resp.text if "charset" in resp.headers.get("content-type", "") else resp.content
            parsed = mf2util.interpret_feed(
                mf2py.parse(doc=feed_text, url=feed_url), feed_url)
            name = parsed.get("name")
        elif type == "xml":
            flask.current_app.logger.debug("feedparser parsing %s", feed_url)
            parsed = feedparser.parse(feed_url, agent=util.USER_AGENT)
            if parsed.feed:
                name = parsed.feed.get("title")
        else:
            flask.current_app.logger.error("unknown feed type %s", type)
            flask.abort(400)

        if not name:
            p = urllib.parse.urlparse(origin)
            name = p.netloc + p.path

        feed = Feed(name=name[:140], origin=origin, feed=feed_url, type=type)

    if feed:
        db.session.add(feed)
        flask_login.current_user.subscriptions.append(
            Subscription(feed=feed, name=feed.name, tags=tags))
        db.session.commit()
        # go ahead and update the feed
        tasks.q.enqueue(tasks.update_feed, feed.id)

    return feed

def get(self, domain):
    url = 'http://%s/' % domain
    resp = common.requests_get(url)
    mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
    # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))

    hcard = mf2util.representative_hcard(mf2, resp.url)
    logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
    if not hcard:
        common.error(self, """\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s""" % resp.url)

    key = MagicKey.get_or_create(domain)
    obj = common.postprocess_as2(as2.from_as1(
        microformats2.json_to_object(hcard)), key=key)
    obj.update({
        'inbox': '%s/%s/inbox' % (appengine_config.HOST_URL, domain),
        'outbox': '%s/%s/outbox' % (appengine_config.HOST_URL, domain),
        'following': '%s/%s/following' % (appengine_config.HOST_URL, domain),
        'followers': '%s/%s/followers' % (appengine_config.HOST_URL, domain),
    })
    logging.info('Returning: %s', json.dumps(obj, indent=2))

    self.response.headers.update({
        'Content-Type': common.CONTENT_TYPE_AS2,
        'Access-Control-Allow-Origin': '*',
    })
    self.response.write(json.dumps(obj, indent=2))

def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    op = request.args.get('op')
    if url:
        try:
            d = mf2py.parse(url=url)

            if op == 'post-type-discovery':
                entry = mf2util.find_first_entry(d, ['h-entry', 'h-event'])
                return jsonify({'type': mf2util.post_type_discovery(entry)})

            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)

            return jsonify(dates_to_string(json))
        except:
            current_app.logger.exception('running mf2util service')
            return jsonify({'error': str(sys.exc_info()[0])})

    return """

def fetch_mf2(url):
    testname = url
    prefix = 'http://example.com/'
    if testname.startswith(prefix):
        testname = testname[len(prefix):]
    with open('tests/authorship/' + testname) as f:
        return mf2py.parse(url=url, doc=f.read())

def mf2py_parse(input, url):
    """Uses mf2py to parse an input HTML string or BeautifulSoup input."""
    if isinstance(input, basestring):
        input = beautifulsoup_parse(input)

    # instrumenting, disabled for now:
    # with cache_time('mf2py', 1):
    return mf2py.parse(url=url, doc=input)

def mf2py_parse(input, url):
    """Uses mf2py to parse an input HTML string or BeautifulSoup input."""
    if isinstance(input, basestring):
        input = beautifulsoup_parse(input)

    # instrumenting, disabled for now:
    # with cache_time('mf2py', 1):
    return mf2py.parse(url=url, doc=input, img_with_alt=True)

def login_callback():
    current_app.logger.debug('callback fields: %s', request.args)

    state = request.args.get('state')
    next_url = state or url_for('views.index')

    # TODO rediscover these endpoints based on 'me'. Assuming
    # they are the same is not totally safe.
    auth_url, token_url, micropub_url = session['endpoints']
    if not auth_url:
        flash('Login failed: No authorization URL in session')
        return redirect(next_url)

    code = request.args.get('code')
    client_id = get_settings().site_url
    redirect_uri = url_for('.login_callback', _external=True)

    current_app.logger.debug('callback with auth endpoint %s', auth_url)
    response = requests.post(auth_url, data={
        'code': code,
        'client_id': client_id,
        'redirect_uri': redirect_uri,
        'state': state,
    })

    rdata = urllib.parse.parse_qs(response.text)
    if response.status_code != 200:
        current_app.logger.debug('call to auth endpoint failed %s', response)
        flash('Login failed {}: {}'.format(rdata.get('error'),
                                           rdata.get('error_description')))
        return redirect(next_url)

    current_app.logger.debug('verify response %s', response.text)
    if 'me' not in rdata:
        current_app.logger.debug('Verify response missing required "me" field')
        flash('Verify response missing required "me" field {}'.format(
            response.text))
        return redirect(next_url)

    me = rdata.get('me')[0]
    scopes = rdata.get('scope')
    try_micropub_config(token_url, micropub_url, scopes, code, me,
                        redirect_uri, client_id, state)

    cred = Credential.query.get(('indieauth', me))
    if not cred:
        cred = Credential(type='indieauth', value=me, display=me)
        db.session.add(cred)
        db.session.commit()

    # offer to associate credential with existing user or create a new user
    p = mf2py.parse(url=me)
    hcard = mf2util.representative_hcard(p, me)
    author = hcard and mf2util.parse_author(hcard)
    return do_login(cred, author and author.get('name'), next_url)

def test_posts_by_type(client, silly_posts):
    text = client.get('/likes/').get_data(as_text=True)
    p = mf2py.parse(doc=text)
    feed = p['items'][0]['children']
    for item, expected in zip(feed, [
            'https://buf.fy/summers/',
            'https://mal.colm/reynolds',
    ]):
        assert item['properties']['like-of'][0]['properties']['url'][0] == expected

def find_screen_name(url):
    try:
        print('fetching', url)
        r = requests.get(url, timeout=10)
        p = mf2py.parse(url=url)
        for me in p.get('rels', {}).get('me', []):
            m = re.match(r'https?://(?:www.)?twitter.com/@?([\w]+)/?', me)
            if m:
                return m.group(1)
    except:
        logging.error('problem fetching %s', url)

def test_find_author(self):
    self.assert_equals({
        'displayName': 'my name',
        'url': 'http://li/nk',
        'image': {'url': 'http://pic/ture'},
    }, microformats2.find_author(mf2py.parse(doc="""\
<body class="p-author h-card">
<a href="http://li/nk">my name</a>
<img class="u-photo" src="http://pic/ture" />
<div class="h-entry"></div>
</body>
""", url='http://123')))

def _get_mention_info_from_mf2(base_url, bs_html):
    import mf2py
    from urllib.parse import urljoin

    mf2 = mf2py.parse(bs_html)
    mf2_items = mf2.get('items')
    if not mf2_items:
        return None
    hentry = next(filter(
        lambda i: 'h-entry' in i['type'], mf2_items), None)
    if not hentry:
        return None

    info = {}
    hentry_props = hentry['properties']
    pnames = hentry_props.get('name')
    if pnames:
        info['name'] = pnames[0]

    urls = hentry_props.get('url')
    if urls:
        info['url'] = urljoin(base_url, urls[0])

    pubdates = hentry_props.get('published')
    if pubdates:
        info['published'] = pubdates[0]

    contents = hentry_props.get('content')
    if contents:
        info['content'] = contents[0]['html']

    authors = hentry_props.get('author')
    if authors:
        hcard = next(filter(
            lambda i: 'h-card' in i['type'], authors), None)
        if hcard:
            hcard_props = hcard['properties']
            hcard_names = hcard_props.get('name')
            if hcard_names:
                info['author_name'] = hcard_names[0]
            hcard_photos = hcard_props.get('photo')
            if hcard_photos:
                info['author_photo'] = urljoin(base_url, hcard_photos[0])
            hcard_urls = hcard_props.get('url')
            if hcard_urls:
                info['author_url'] = urljoin(base_url, hcard_urls[0])

    return info

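# Minimal sketch of _get_mention_info_from_mf2 above, fed inline HTML rather
# than a fetched page; the markup and URLs are made up for illustration.
sample_mention = """
<div class="h-entry">
  <a class="p-name u-url" href="/reply/1">A reply</a>
  <div class="e-content">Nice post!</div>
  <a class="p-author h-card" href="/alice">Alice</a>
</div>
"""
info = _get_mention_info_from_mf2('http://example.com/', sample_mention)
# expected keys: 'name', 'url' (absolutized against base_url), 'content',
# plus author_name/author_url pulled from the embedded h-card
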
def get_client_id_data(url):
    # FIXME(tsileo): ensure not localhost via `little_boxes.urlutils.is_url_valid`
    data = mf2py.parse(url=url)
    for item in data["items"]:
        if "h-x-app" in item["type"] or "h-app" in item["type"]:
            props = item.get("properties", {})
            print(props)
            return dict(
                logo=_get_prop(props, "logo"),
                name=_get_prop(props, "name"),
                url=_get_prop(props, "url"),
            )

    return dict(logo=None, name=url, url=url)

def fetch_mf2_func(url):
    if util.domain_or_parent_in(
            urlparse.urlparse(url).netloc, SILO_DOMAINS):
        return {
            'items': [{
                'type': ['h-card'],
                'properties': {'url': [url]},
            }],
        }
    _, doc = self._fetch(url)
    return mf2py.parse(doc=doc, url=url)

def test_mf2tests():
    allfiles = glob.glob(os.path.join('..', 'mf2tests', 'tests',
                                      'microformats-v2', 'h-card', '*.json'))
    for jsonfile in allfiles:
        htmlfile = jsonfile[:-4] + 'html'
        with open(htmlfile) as f:
            p = mf2py.parse(doc=f, url='http://example.com')
        yield check_unicode, htmlfile, p
        with open(jsonfile) as jsonf:
            try:
                s = json.load(jsonf)
            except:
                s = "bad file: " + jsonfile + str(sys.exc_info()[0])
        yield check_mf2, htmlfile, p, s

def handle_child(child):
    full_child = mf2py.parse(url=child['url'], html_parser='lxml')
    result = [
        item for item in full_child['items']
        if item['type'][0] == 'h-entry'
    ]
    if len(result):
        result = result[0]
        result['properties']['url'] = [child['url']]
        return result
    return None

def html_to_activities(html, url=None):
    """Converts a microformats2 HTML h-feed to ActivityStreams activities.

    Args:
      html: string HTML
      url: optional string URL that HTML came from

    Returns:
      list of ActivityStreams activity dicts
    """
    parsed = mf2py.parse(doc=html, url=url)
    hfeed = find_first_entry(parsed, ['h-feed'])
    items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
    return [{'object': json_to_object(item)} for item in items]

def html_to_activities(html, url=None):
    """Converts a microformats2 HTML h-feed to ActivityStreams activities.

    Args:
      html: string HTML
      url: optional string URL that HTML came from

    Returns:
      list of ActivityStreams activity dicts
    """
    parsed = mf2py.parse(doc=html, url=url)
    hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
    items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
    return [{'object': json_to_object(item)} for item in items]

def process_html_feed_for_new_entries(feed, content, backfill, now):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content)

    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed, doc=content), feed.feed)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry

def _extract(text, url):
    mf_doc = mf2py.parse(text)
    if mf_doc.get('items'):
        LOGGER.info("Found mf2 document")
        return [_extract_mf(item, url) for item in mf_doc['items']]

    # no valid mf2, so let's extract from DOM instead
    dom = BeautifulSoup(text, features='html.parser')
    articles = (dom.find_all('article')
                or dom.find_all(class_='entry')
                or dom.find_all(class_='article')
                or [dom])
    LOGGER.info("Attempting to extract from ad-hoc HTML")
    return [_extract_dom(item, dom, url) for item in articles]

def test_mf2tests():
    allfiles = glob.glob(
        os.path.join('.', 'testsuite', 'tests', '*', '*', '*.json'))
    for jsonfile in allfiles:
        htmlfile = jsonfile[:-4] + 'html'
        with open(htmlfile) as f:
            p = mf2py.parse(doc=f, url='http://example.com')
        yield check_unicode, htmlfile, p
        with open(jsonfile) as jsonf:
            try:
                s = json.load(jsonf)
            except:
                s = "bad file: " + jsonfile + str(sys.exc_info()[0])
        yield check_mf2, htmlfile, p, s

def test_complete(self, mock_get):
    actual = sites_to_bigquery.generate('orig.com')
    self.assertEqual({
        'domain': 'orig.com',
        'urls': [
            'http://foo.com/',
            'http://foo.com/canonical',
            'http://foo.com/ogp',
            'http://foo.com/twitter',
        ],
        'names': [
            'My Name',
            'A Title',
            'OGP title',
            'OGP site name',
            'Twitter title',
        ],
        'descriptions': [
            'About me',
            'A meta description',
            'An OGP description',
            'Twitter description',
        ],
        'pictures': [
            'http://foo.com/hcard.jpg',
            'http://foo.com/icon.jpg',
            'http://foo.com/ogp.jpg',
            'http://foo.com/ogp2.jpg',
            'https://foo.com/ogp.jpg',
            'http://foo.com/twitter.jpg',
        ],
        'hcard': json.dumps({
            'type': ['h-card'],
            'properties': {
                'name': ['My Name'],
                'url': ['http://foo.com/'],
                'note': [{
                    'html': 'About <br/>me ',
                    'value': 'About me ',
                }],
                'photo': ['http://foo.com/hcard.jpg'],
            },
        }, sort_keys=True),
        'mf2': json.dumps(mf2py.parse(doc=HTML, url='http://foo.com'),
                          sort_keys=True),
        'rel_mes': ['http://a/silo', 'http://b/silo'],
        'html': HTML,
        'fetch_time': actual['fetch_time'],
    }, actual)

def extract_mf2_context(context, doc, url):
    """ Gets Microformats2 data from the given document """
    cached_mf2 = {}  # used by authorship algorithm

    def fetch_mf2(url):
        if url in cached_mf2:
            return cached_mf2[url]
        p = mf2py.parse(url=url)
        cached_mf2[url] = p
        return p

    blob = mf2py.parse(doc=doc, url=url)
    cached_mf2[url] = blob

    if blob:
        current_app.logger.debug('parsed successfully by mf2py: %s', url)
        entry = mf2util.interpret(blob, url, fetch_mf2_func=fetch_mf2)
        if entry:
            current_app.logger.debug('parsed successfully by mf2util: %s', url)
            published = entry.get('published')
            content = util.clean_foreign_html(entry.get('content', ''))
            content_plain = util.format_as_text(content, link_fn=lambda a: a)

            title = entry.get('name')
            if title and len(title) > 512:
                # FIXME is there a db setting to do this automatically?
                title = title[:512]

            author_name = entry.get('author', {}).get('name', '')
            author_image = entry.get('author', {}).get('photo')

            permalink = entry.get('url')
            if not permalink or not isinstance(permalink, str):
                permalink = url

            context.url = url
            context.permalink = permalink
            context.author_name = author_name
            context.author_url = entry.get('author', {}).get('url', '')
            context.author_image = author_image
            context.content = content
            context.content_plain = content_plain
            context.published = published
            context.title = title

    return context

def user_to_actor(self, resp):
    """Convert a Flickr user dict into an ActivityStreams actor.
    """
    person = resp.get('person', {})
    username = person.get('username', {}).get('_content')
    obj = util.trim_nulls({
        'objectType': 'person',
        'displayName': person.get('realname', {}).get('_content') or username,
        'image': {
            'url': self.get_user_image(person.get('iconfarm'),
                                       person.get('iconserver'),
                                       person.get('nsid')),
        },
        'id': self.tag_uri(username),
        # numeric_id is our own custom field that always has the source's numeric
        # user id, if available.
        'numeric_id': person.get('nsid'),
        'location': {
            'displayName': person.get('location', {}).get('_content'),
        },
        'username': username,
        'description': person.get('description', {}).get('_content'),
    })

    # fetch profile page to get url(s)
    profile_url = person.get('profileurl', {}).get('_content')
    if profile_url:
        try:
            logging.debug('fetching flickr profile page %s', profile_url)
            resp = urllib2.urlopen(profile_url,
                                   timeout=appengine_config.HTTP_TIMEOUT)
            profile_json = mf2py.parse(doc=resp, url=profile_url)
            # personal site is likely the first non-flickr url
            urls = profile_json.get('rels', {}).get('me', [])
            obj['urls'] = [{'value': u} for u in urls]
            obj['url'] = next(
                (u for u in urls if not u.startswith('https://www.flickr.com/')),
                None)
        except urllib2.URLError, e:
            logging.warning('could not fetch user homepage %s', profile_url)

def html_to_atom(html, url=None, **kwargs):
    """Converts microformats2 HTML to an Atom feed.

    Args:
      html: string
      url: string URL html came from, optional

    Returns:
      unicode string with Atom XML
    """
    parsed = mf2py.parse(doc=html, url=url)
    return activities_to_atom(microformats2.html_to_activities(html, url),
                              microformats2.find_author(parsed),
                              title=mf2util.interpret_feed(parsed, url).get('name'),
                              xml_base=util.base_url(url),
                              host_url=url)

def convert_mf2():
    strip_rel_urls = request.args.get('strip_rel_urls') or request.form.get('strip_rel_urls')
    url = request.args.get('url') or request.form.get('url')
    doc = request.args.get('doc') or request.form.get('doc')
    doc = doc and doc.strip()

    if url or doc:
        try:
            json = mf2py.parse(url=url, doc=doc)
            if strip_rel_urls:
                json.pop('rel-urls', None)
            return jsonify(json)
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """

def get_client_id_data(url):
    data = mf2py.parse(url=url)
    for item in data['items']:
        if 'h-x-app' in item['type'] or 'h-app' in item['type']:
            props = item.get('properties', {})
            print(props)
            return dict(
                logo=_get_prop(props, 'logo'),
                name=_get_prop(props, 'name'),
                url=_get_prop(props, 'url'),
            )

    return dict(
        logo=None,
        name=url,
        url=url,
    )

def html_to_activities(html, url=None, actor=None):
    """Converts a microformats2 HTML h-feed to ActivityStreams activities.

    Args:
      html: string HTML
      url: optional string URL that HTML came from
      actor: optional author AS actor object for all activities. usually comes
        from a rel="author" link.

    Returns:
      list of ActivityStreams activity dicts
    """
    parsed = mf2py.parse(doc=html, url=url)
    hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
    items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
    return [{'object': json_to_object(item, actor=actor)} for item in items]

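# Illustrative call to the html_to_activities variant above; the h-feed markup
# is invented, and json_to_object/mf2util are assumed to be available from the
# surrounding module as in the original code.
feed_html = """
<div class="h-feed">
  <article class="h-entry"><p class="p-name">First post</p></article>
  <article class="h-entry"><p class="p-name">Second post</p></article>
</div>
"""
activities = html_to_activities(feed_html, url='http://example.com/')
# each element wraps one converted h-entry under its 'object' key
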
def lookup(url: str) -> ap.BaseActivity:
    """Try to find an AP object related to the given URL."""
    try:
        if url.startswith("@"):
            actor_url = get_actor_url(url)
            if actor_url:
                return ap.fetch_remote_activity(actor_url)
    except NotAnActivityError:
        pass
    except requests.HTTPError:
        # Some websites may return 404, 503 or others when they don't support
        # webfinger, and we're just taking a guess when performing the lookup.
        pass
    except requests.RequestException as err:
        raise RemoteServerUnavailableError(f"failed to fetch {url}: {err!r}")

    backend = ap.get_backend()
    try:
        resp = requests.head(
            url,
            timeout=10,
            allow_redirects=True,
            headers={"User-Agent": backend.user_agent()},
        )
    except requests.RequestException as err:
        raise RemoteServerUnavailableError(f"failed to GET {url}: {err!r}")

    try:
        resp.raise_for_status()
    except Exception:
        return ap.fetch_remote_activity(url)

    # If the page is HTML, maybe it contains an alternate link pointing to an AP object
    for alternate in mf2py.parse(resp.text).get("alternates", []):
        if alternate.get("type") == "application/activity+json":
            return ap.fetch_remote_activity(alternate["url"])

    try:
        # Maybe the page was JSON-LD?
        data = resp.json()
        return ap.parse_activity(data)
    except Exception:
        pass

    # Try content negotiation (retry with the AP Accept header)
    return ap.fetch_remote_activity(url)

def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash("Error fetching source {}".format(repr(e)))
        flask.current_app.logger.warn("Subscribe failed for %s with error %s",
                                      origin, repr(e))
        return None

    feeds = []
    xml_feed_types = [
        "application/rss+xml",
        "application/atom+xml",
        "application/rdf+xml",
        "application/xml",
        "text/xml",
    ]
    xml_mime_types = xml_feed_types + ["text/xml", "text/rss+xml", "text/atom+xml"]

    content_type = resp.headers["content-type"]
    content_type = content_type.split(";", 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({"origin": origin, "feed": origin, "type": "xml",
                      "title": "untitled xml feed"})
    elif content_type == "text/html":
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get("entries"):
            ftitle = hfeed.get("name") or "untitled h-feed"
            feeds.append({"origin": origin, "feed": resp.url, "type": "html",
                          "title": ftitle[:140]})
        # look for link="feed"
        for furl in parsed.get("rels", {}).get("feed", []):
            fprops = parsed.get("rel-urls", {}).get(furl, {})
            if not fprops.get("type") or fprops.get("type") == "text/html":
                feeds.append({"origin": origin, "feed": furl, "type": "html",
                              "title": fprops.get("title")})
        # then look for link rel="alternate"
        for link in parsed.get("alternates", []):
            if link.get("type") in xml_feed_types:
                feeds.append({"origin": origin, "feed": link.get("url"),
                              "type": "xml", "title": link.get("title")})

    return feeds

def callback(info):
    if info.error:
        flash('Micropub failure: {}'.format(info.error))
    else:
        flash('Micropub success! Authorized {}'.format(info.me))

        p = mf2py.parse(url=info.me)
        current_app.logger.debug('found author info %s', info.me)

        target = PosseTarget(
            uid=info.me,
            name=info.me,
            style='microblog',
            micropub_endpoint=info.micropub_endpoint,
            access_token=info.access_token)

        current_user.posse_targets.append(target)
        db.session.commit()

        return redirect(url_for('.edit', target_id=target.id))

def fetch_reply_context(entry_id, in_reply_to, now):
    with flask_app():
        entry = Entry.query.get(entry_id)
        context = Entry.query\
            .join(Entry.feed)\
            .filter(Entry.permalink == in_reply_to, Feed.type == 'html')\
            .first()

        if not context:
            current_app.logger.info('fetching in-reply-to url: %s', in_reply_to)
            parsed = mf2util.interpret(
                mf2py.parse(url=proxy_url(in_reply_to)), in_reply_to)
            if parsed:
                context = hentry_to_entry(parsed, in_reply_to, False, now)

        if context:
            entry.reply_context.append(context)
            db.session.commit()

def user_to_actor(self, resp):
    """Convert a Flickr user dict into an ActivityStreams actor.
    """
    person = resp.get('person', {})
    username = person.get('username', {}).get('_content')
    obj = util.trim_nulls({
        'objectType': 'person',
        'displayName': person.get('realname', {}).get('_content') or username,
        'image': {
            'url': self.get_user_image(person.get('iconfarm'),
                                       person.get('iconserver'),
                                       person.get('nsid')),
        },
        'id': self.tag_uri(username),
        # numeric_id is our own custom field that always has the source's numeric
        # user id, if available.
        'numeric_id': person.get('nsid'),
        'location': {
            'displayName': person.get('location', {}).get('_content'),
        },
        'username': username,
        'description': person.get('description', {}).get('_content'),
    })

    # fetch profile page to get url(s)
    profile_url = person.get('profileurl', {}).get('_content')
    if profile_url:
        try:
            resp = util.urlopen(profile_url)
            profile_json = mf2py.parse(doc=resp, url=profile_url,
                                       img_with_alt=True)
            urls = profile_json.get('rels', {}).get('me', [])
            if urls:
                obj['url'] = urls[0]
            if len(urls) > 1:
                obj['urls'] = [{'value': u} for u in urls]
        except urllib_error.URLError:
            logging.warning('could not fetch user homepage %s', profile_url)

    return self.postprocess_object(obj)

def process_html_feed_for_new_entries(feed, content, backfill, now, fetch_mf2_func):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)

    # look for a <base> element
    doc = bs4.BeautifulSoup(content, 'html5lib')
    base_el = doc.find('base')
    base_href = base_el.get('href') if base_el else None

    parsed = mf2util.interpret_feed(
        mf2py.parse(doc, feed.feed),
        source_url=feed.feed,
        base_href=base_href,
        fetch_mf2_func=fetch_mf2_func)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        current_app.logger.debug('building entry: %s', hentry.get('url'))
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry

def fetch_context():
    url = request.args.get('url')
    if not url:
        return make_response(jsonify({
            'error': 'missing_url',
            'message': "Missing 'url' query parameter",
        }), 400)

    # TODO cache everything. check newer urls more frequently than
    # older urls. be careful not to overwrite previous good responses
    # with failure.
    url = maybe_proxy(url)
    resp = fetch(url)

    if resp.status_code // 100 != 2:
        return make_response(jsonify({
            'error': 'fetch_failed',
            'message': 'Failed to fetch resource at ' + url,
            'response': resp.text,
            'code': resp.status_code,
        }), resp.status_code)

    parsed = mf2py.parse(
        doc=resp.text if 'content-type' in resp.headers else resp.content,
        url=url)
    entry = mf2util.interpret(parsed, url, want_json=True)

    blob = {}
    if entry:
        blob['data'] = entry

    cb = request.args.get('callback')
    if cb:  # jsonp
        resp = make_response('{}({})'.format(cb, json.dumps(blob)))
        resp.headers['content-type'] = 'application/javascript; charset=utf-8'
        return resp

    return jsonify(blob)

def _find_feed_items(feed_url, feed_doc):
    """Extract feed items from a given URL and document.

    If the top-level h-* item is an h-feed, return its children.
    Otherwise, returns the top-level items.

    Args:
      feed_url: a string. the URL passed to mf2py parser
      feed_doc: a string or BeautifulSoup object. document is passed to
        mf2py parser

    Returns:
      a list of dicts, each one representing an mf2 h-* item
    """
    parsed = mf2py.parse(url=feed_url, doc=feed_doc)
    feeditems = parsed['items']
    hfeed = mf2util.find_first_entry(parsed, ('h-feed',))
    if hfeed:
        feeditems = hfeed.get('children', [])
    else:
        logging.debug('No h-feed found, fallback to top-level h-entrys.')
    return feeditems

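# Hedged usage of _find_feed_items above: with a top-level h-feed, the parsed
# children (the h-entries) are returned rather than the feed wrapper itself.
# The document below is a made-up example.
feed_doc = """
<div class="h-feed">
  <article class="h-entry"><a class="u-url p-name" href="/1">One</a></article>
  <article class="h-entry"><a class="u-url p-name" href="/2">Two</a></article>
</div>
"""
items = _find_feed_items('http://example.com/', feed_doc)
# items[0]['type'] == ['h-entry']
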