def handle_root():
    source = request.form['source']
    target = request.form['target']

    if source == target:
        return Response(response='source same as target', status=400)
    if not discoverEndpoint(target)[1]:
        return Response(response='target does not support webmentions', status=400)

    # find mention in source
    result = findMentions(source, target)
    if result['status'] != 200:
        return Response(response='error fetching source', status=400)
    if not result['refs']:
        return Response(response='target not found in source', status=400)

    parsed = mf2py.Parser(url=source).to_dict()

    r = commit_file(webmention_path(source, target), yaml.dump(parsed))
    if r.status_code != 201:
        print('failed to post to github: ' + r.text)
        raise Exception('failed to post to github: ' + str(r))

    return Response(status=201)
def fetch(self):
    # fetch the website and parse for microformats
    try:
        parser = mf2py.Parser(url=self.url)
    except:
        return None

    # identify the representative h-card
    parsed = parser.to_dict()
    hcard = mf2util.representative_hcard(parsed, self.url)
    if not hcard:
        hcards = parser.to_dict(filter_by_type='h-card')
        if len(hcards):
            hcard = hcards[0]
    if hcard:
        self.name = hcard['properties'].get('name', [None])[0]
        self.nicknames = hcard['properties'].get('nickname', None)

    # identify rel-me links as pseudonyms
    matches = {}
    for url in parser.to_dict()['rels'].get('me', []):
        match = Pseudonym.identify_url(url, self)
        if not match:
            continue
        if match.target not in self.pseudonyms:
            self.pseudonyms[match.target] = match

    # remember the last time I fetched
    self.timestamp = time.time()

    # save to the database
    self.save()
def get_profile(id_url: str, server_profile: dict = None, links=None,
                content: BeautifulSoup = None, endpoints=None) -> dict:
    """ Given an identity URL, try to parse out an Authl profile

    :param str id_url: The profile page to parse
    :param dict server_profile: An IndieAuth response profile
    :param dict links: Profile response's links dictionary
    :param content: Pre-parsed page content
    :param dict endpoints: Pre-parsed page endpoints
    """
    if id_url in _PROFILE_CACHE:
        LOGGER.debug("Reusing %s profile from cache", id_url)
        profile = _PROFILE_CACHE[id_url].copy()
    else:
        profile = {}

    if not content and id_url not in _PROFILE_CACHE:
        LOGGER.debug("get_profile: Retrieving %s", id_url)
        request = utils.request_url(id_url)
        if request is not None:
            links = request.links
            content = BeautifulSoup(request.text, 'html.parser')

    if content:
        profile = {}
        h_cards = mf2py.Parser(doc=content).to_dict(filter_by_type="h-card")
        LOGGER.debug("get_profile(%s): found %d h-cards", id_url, len(h_cards))

        for card in h_cards:
            items = _parse_hcard(id_url, card)
            profile.update({k: v for k, v in items.items() if v and k not in profile})

        # Only stash the version without the IndieAuth server profile addons, in case
        # the user logs in again without the profile/email scopes
        LOGGER.debug("Stashing %s profile", id_url)
        _PROFILE_CACHE[id_url] = profile.copy()

    if server_profile:
        # The IndieAuth server also provided a profile, which should supersede the h-card
        for in_key, out_key in (('name', 'name'),
                                ('photo', 'avatar'),
                                ('url', 'homepage'),
                                ('email', 'email')):
            if in_key in server_profile:
                profile[out_key] = server_profile[in_key]

    if not endpoints:
        endpoints, _ = find_endpoints(id_url, links=links, content=content)
    if endpoints:
        profile['endpoints'] = endpoints

    return profile
def parse_mf2(url_or_path):
    import mf2py
    logger.debug("Fetching %s..." % url_or_path)
    if os.path.exists(url_or_path):
        obj = open(url_or_path, 'r', encoding='utf8')
        params = {'doc': obj}
    else:
        params = {'url': url_or_path}
    return mf2py.Parser(html_parser='html5lib', **params)
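# Hypothetical usage of parse_mf2 above; the file name is a placeholder, not taken from the
# source. A local path is handed to the parser's 'doc' parameter, anything else is fetched
# as a URL; either way a Parser instance comes back and can be converted to a dict.
result = parse_mf2('example-post.html')
print(result.to_dict().get('items', []))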
def get_profile(id_url: str, content: BeautifulSoup = None) -> dict:
    """ Given an identity URL, try to parse out an Authl profile """
    if content:
        h_cards = mf2py.Parser(doc=content).to_dict(filter_by_type="h-card")
    elif id_url in _PROFILE_CACHE:
        return _PROFILE_CACHE[id_url]
    else:
        h_cards = mf2py.Parser(url=id_url).to_dict(filter_by_type="h-card")

    profile = {}
    for card in h_cards:
        items = _parse_hcard(id_url, card)
        profile.update(
            {k: v for k, v in items.items() if v and k not in profile})

    _PROFILE_CACHE[id_url] = profile
    return profile
def from_soup(cls, soup: BeautifulSoup, save=False):
    """Create or update HCard(s) using data from a BeautifulSoup document.

    See https://github.com/microformats/mf2py"""
    parser = mf2py.Parser(doc=soup)
    parsed_data = parser.to_dict()
    for item in parsed_data.get("items", []):
        try:
            return _parse_hcard(item, save)
        except NotEnoughData as e:
            log.debug(e)
            continue
def fetch_profile():
    url = request.args.get('url')
    if not url:
        return """
            <html><body>
            <h1>Fetch Profile</h1>
            <form><label>URL to fetch: <input name="url"></label>
            <input type="Submit">
            </form></body></html>"""
    try:
        name = None
        image = None
        d = mf2py.Parser(url=url).to_dict()
        relmes = d['rels'].get('me', [])

        # check for h-feed
        hfeed = next((item for item in d['items']
                      if 'h-feed' in item['type']), None)
        if hfeed:
            authors = hfeed.get('properties', {}).get('author')
            images = hfeed.get('properties', {}).get('photo')
            if authors:
                if isinstance(authors[0], dict):
                    name = authors[0].get('properties', {}).get('name')
                    image = authors[0].get('properties', {}).get('photo')
                else:
                    name = authors[0]
            if images and not image:
                image = images[0]

        # check for top-level h-card
        for item in d['items']:
            if 'h-card' in item.get('type', []):
                if not name:
                    name = item.get('properties', {}).get('name')
                if not image:
                    image = item.get('properties', {}).get('photo')

        return jsonify({
            'name': name,
            'image': image,
            'social': relmes,
        })
    except BaseException as e:
        resp = jsonify({'error': str(e)})
        resp.status_code = 400
        return resp
def find_syndicated(original):
    if regex.match(original):
        return original
    try:
        d = mf2py.Parser(url=original).to_dict()
        urls = d['rels'].get('syndication', [])
        for item in d['items']:
            if 'h-entry' in item['type']:
                urls += item['properties'].get('syndication', [])
        for url in urls:
            if regex.match(url):
                return url
    except HTTPError:
        current_app.logger.exception('Could not fetch original')
    except SSLError:
        current_app.logger.exception('SSL Error')
    except Exception as e:
        current_app.logger.exception('MF2 Parser error: %s', e)
async def fetch_event(url: URL) -> Optional[Event]:
    """
    Extract an event from a page.

    We assume the event is represented as an h-event in the URL.
    """
    async with ClientSession() as session:
        async with session.get(url) as response:
            content = await response.text()

    parser = mf2py.Parser(content)
    hevents = parser.to_dict(filter_by_type="h-event")
    if not hevents:
        _logger.warning("No events found on %s", url)
        return None
    if len(hevents) > 1:
        _logger.warning("Multiple events found on %s", url)
        return None
    hevent = hevents[0]

    event = Event()
    event.add("summary", hevent["properties"]["name"][0])
    event.add("dtstart", dateutil.parser.parse(hevent["properties"]["start"][0]))
    event.add("dtend", dateutil.parser.parse(hevent["properties"]["end"][0]))
    event.add("dtstamp", datetime.now(timezone.utc))
    if "url" in hevent["properties"]:
        event.add("url", hevent["properties"]["url"][0])
    if "content" in hevent["properties"]:
        event.add("description", hevent["properties"]["content"][0]["value"])
    if "category" in hevent["properties"]:
        event.add("categories", hevent["properties"]["category"])
    if "featured" in hevent["properties"]:
        attachment_url = url.join(URL(hevent["properties"]["featured"][0]))
        event.add("attach", attachment_url)

    return event
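# A minimal usage sketch for fetch_event above. It assumes the snippet's URL and Event types
# come from yarl and icalendar respectively (consistent with the .join() and .add() calls
# shown) and that ClientSession is aiohttp's; the event URL is a placeholder.
import asyncio
from yarl import URL

ics_event = asyncio.run(fetch_event(URL("https://example.com/my-event")))
if ics_event is not None:
    print(ics_event.to_ical().decode())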
def fetch_mf2_result(url):
    parsed = mf2py.Parser(url=url).to_dict()
    if not parsed:
        return None

    result = {}
    result['mf2'] = True
    result['type'] = 'mf2:' + fetch_post_type(parsed)

    entry = mf2util.interpret_entry(parsed, url, want_json=True)
    if 'name' in entry:
        result['title'] = entry['name']
    if 'summary' in entry:
        result['description'] = convert2html(elide(entry['summary'], 500), True)
    elif 'content' in entry:
        result['description'] = convert2html(
            elide(entry['content-plain'], 500), True)
    if 'url' in entry:
        result['url'] = entry['url']
    if 'author' in entry:
        result['author'] = entry['author']
    if 'published' in entry:
        result['published'] = entry['published']
        date = dateutil.parser.parse(entry['published'])
        result['published_locale'] = date.strftime('%d %b, %Y %H:%M %p')
    if 'featured' in entry:
        result['image'] = entry['featured']
    elif 'photo' in entry:
        result['image'] = entry['photo']

    fetch_image_dimensions(result)

    if 'title' not in result and 'description' not in result:
        result = {}
    return result
def process_webmention(commit_url, source, target):
    # find mention in source
    result = findMentions(source, target)
    if result['status'] != 200:
        raise Exception('error fetching source')
    if not result['refs']:
        raise Exception('target not found in source')

    parsed = mf2py.Parser(url=source).to_dict()
    webmention = {
        'sourceUrl': source,
        'targetUrl': target,
        'parsedSource': parsed
    }

    r = commit_file(commit_url, yaml.dump(webmention))
    if r.status_code != 201:
        raise Exception('failed to post to github: ' + str(r.status_code) + ', ' + r.text)
def getSummary():
    """
    Extracts microformat data from the HTML passed in as the body of the POST data.

    Args:
        post data - contents of the HTML file from which to extract the microformat data.

    Returns:
        A JSON object

    Example:
        {
        }

    Raises:
        None
    """
    text = request.data.decode('utf-8')
    p = mf2py.Parser(doc=text)
    #parsedData = mf2py.parse(doc=text)
    return p.to_json()
    formated_date = datetime.strptime(inputDate, '%Y-%m-%d')
    # apply the modifier to the date
    modified_date = formated_date + timedelta(days=modifier)
    # convert it back to a string YYYY-mm-dd
    outputDate = modified_date.strftime('%Y-%m-%d')
    return outputDate


# read the wiki and copy the content into a list of strings
with urlopen(wiki_url) as f:
    wiki_html_list = f.read().splitlines()
    #wiki_html_list = wiki_html_list.encode('utf-8')

# read data from the OSM Wiki site and parse it
mf_obj = mf2py.Parser(url=wiki_url, html_parser="html5lib")

# convert the data to a JSON string and filter events / exclude all the HTML stuff
## create a JSON string
json_obj = mf_obj.to_json(filter_by_type="h-event")
## json.loads converts a string in JSON format,
## here json_obj, into Python objects containing dictionaries:
formated_json = json.loads(json_obj)

# we store the combined output data here
out_array = []
out_error = []

# just a separator for printing
end_str = ' ** '
import mf2py
import mf2util
import pprint

source_url = r'https://brid.gy/comment/twitter/desmondrivet/1117876830478852096/1118148721034891264'
target_url = r'https://desmondrivet.com/2019/04/15/20190415154611'

parsed = mf2py.Parser(url=source_url).to_dict()
comment = mf2util.interpret_comment(parsed, source_url, [target_url])
general = mf2util.interpret(parsed, source_url)

pprint.pprint(parsed)
print('-----\n')
pprint.pprint(comment)
def mf2parse(source):
    return mf2py.Parser(url=source).to_dict()
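# Hypothetical call to mf2parse above; the URL is a placeholder. The returned dict uses
# mf2py's standard top-level keys ('items', 'rels', 'rel-urls'), so the parsed microformats
# can be walked directly.
data = mf2parse('https://example.com/')
for item in data.get('items', []):
    print(item.get('type'), item.get('properties', {}).get('name'))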
def htmltomfjf(html, url, mf2=None):
    if not mf2:
        mf2 = mf2py.Parser(html, url).to_dict()
    jf2 = mf2tojf2.mf2tojf2(mf2)
    return mf2, jf2
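# A small sketch of calling htmltomfjf above with an inline h-entry; the markup and URL are
# illustrative only, not taken from the source. The function returns both the raw mf2 dict
# and its jf2 conversion.
html = '<article class="h-entry"><h1 class="p-name">Hello</h1></article>'
mf2_dict, jf2_dict = htmltomfjf(html, 'https://example.com/hello')
print(jf2_dict)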
import mf2py
import sys
import json

if len(sys.argv) < 2:
    print('usage: ' + sys.argv[0] + ' <inputfile> <outputfile>')
    sys.exit(1)

with open(sys.argv[1], 'r') as file:
    p = mf2py.Parser(doc=file, url='http://example.com/')
    res = json.loads(p.to_json())
    del res['debug']
    print(json.dumps(res))