def handle_root():
    source = request.form['source']
    target = request.form['target']

    if source == target:
        return Response(response='source same as target', status=400)
    if not discoverEndpoint(target)[1]:
        return Response(response='target does not support webmentions', status=400)

    # find mention in source
    result = findMentions(source, target)
    if result['status'] != 200:
        return Response(response='error fetching source', status=400)
    if not result['refs']:
        return Response(response='target not found in source', status=400)

    parsed = mf2py.Parser(url=source).to_dict()

    r = commit_file(webmention_path(source, target), yaml.dump(parsed))
    if r.status_code != 201:
        print('failed to post to github: ' + r.text)
        raise Exception('failed to post to github: ' + str(r))

    return Response(status=201)
def fetch(self):
    # fetch the website and parse for microformats
    try:
        parser = mf2py.Parser(url=self.url)
    except:
        return None

    # identify the representative h-card
    parsed = parser.to_dict()
    hcard = mf2util.representative_hcard(parsed, self.url)
    if not hcard:
        hcards = parser.to_dict(filter_by_type='h-card')
        if len(hcards):
            hcard = hcards[0]
    if hcard:
        self.name = hcard['properties'].get('name', [None])[0]
        self.nicknames = hcard['properties'].get('nickname', None)

    # identify rel-me links as pseudonyms
    matches = {}
    for url in parser.to_dict()['rels'].get('me', []):
        match = Pseudonym.identify_url(url, self)
        if not match:
            continue
        if match.target not in self.pseudonyms:
            self.pseudonyms[match.target] = match

    # remember the last time I fetched
    self.timestamp = time.time()

    # save to the database
    self.save()
def get_profile(id_url: str, server_profile: dict = None, links=None,
                content: BeautifulSoup = None, endpoints=None) -> dict:
    """ Given an identity URL, try to parse out an Authl profile

    :param str id_url: The profile page to parse
    :param dict server_profile: An IndieAuth response profile
    :param dict links: Profile response's links dictionary
    :param content: Pre-parsed page content
    :param dict endpoints: Pre-parsed page endpoints
    """
    if id_url in _PROFILE_CACHE:
        LOGGER.debug("Reusing %s profile from cache", id_url)
        profile = _PROFILE_CACHE[id_url].copy()
    else:
        profile = {}

    if not content and id_url not in _PROFILE_CACHE:
        LOGGER.debug("get_profile: Retrieving %s", id_url)
        request = utils.request_url(id_url)
        if request is not None:
            links = request.links
            content = BeautifulSoup(request.text, 'html.parser')

    if content:
        profile = {}
        h_cards = mf2py.Parser(doc=content).to_dict(filter_by_type="h-card")
        LOGGER.debug("get_profile(%s): found %d h-cards", id_url, len(h_cards))

        for card in h_cards:
            items = _parse_hcard(id_url, card)
            profile.update({k: v for k, v in items.items() if v and k not in profile})

        # Only stash the version without the IndieAuth server profile addons, in case
        # the user logs in again without the profile/email scopes
        LOGGER.debug("Stashing %s profile", id_url)
        _PROFILE_CACHE[id_url] = profile.copy()

    if server_profile:
        # The IndieAuth server also provided a profile, which should supersede the h-card
        for in_key, out_key in (('name', 'name'),
                                ('photo', 'avatar'),
                                ('url', 'homepage'),
                                ('email', 'email')):
            if in_key in server_profile:
                profile[out_key] = server_profile[in_key]

    if not endpoints:
        endpoints, _ = find_endpoints(id_url, links=links, content=content)
    if endpoints:
        profile['endpoints'] = endpoints

    return profile
def parse_mf2(url_or_path):
    import mf2py
    logger.debug("Fetching %s..." % url_or_path)
    if os.path.exists(url_or_path):
        obj = open(url_or_path, 'r', encoding='utf8')
        params = {'doc': obj}
    else:
        params = {'url': url_or_path}
    return mf2py.Parser(html_parser='html5lib', **params)
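# Hypothetical usage of parse_mf2 above; the file name is a placeholder, not taken from the
# source. A local path is handed to the parser's 'doc' parameter, anything else is fetched
# as a URL; either way a Parser instance comes back and can be converted to a dict.
result = parse_mf2('example-post.html')
print(result.to_dict().get('items', []))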
def get_profile(id_url: str, content: BeautifulSoup = None) -> dict:
    """ Given an identity URL, try to parse out an Authl profile """
    if content:
        h_cards = mf2py.Parser(doc=content).to_dict(filter_by_type="h-card")
    elif id_url in _PROFILE_CACHE:
        return _PROFILE_CACHE[id_url]
    else:
        h_cards = mf2py.Parser(url=id_url).to_dict(filter_by_type="h-card")

    profile = {}
    for card in h_cards:
        items = _parse_hcard(id_url, card)
        profile.update(
            {k: v for k, v in items.items() if v and k not in profile})

    _PROFILE_CACHE[id_url] = profile
    return profile
def from_soup(cls, soup: BeautifulSoup, save=False):
    """Create or update HCard(s) using data from a BeautifulSoup document.

    See https://github.com/microformats/mf2py"""
    parser = mf2py.Parser(doc=soup)
    parsed_data = parser.to_dict()
    for item in parsed_data.get("items", []):
        try:
            return _parse_hcard(item, save)
        except NotEnoughData as e:
            log.debug(e)
            continue
def fetch_profile():
    url = request.args.get('url')
    if not url:
        return """
            <html><body>
            <h1>Fetch Profile</h1>
            <form><label>URL to fetch: <input name="url"></label>
            <input type="Submit">
            </form></body></html>"""
    try:
        name = None
        image = None
        d = mf2py.Parser(url=url).to_dict()
        relmes = d['rels'].get('me', [])

        # check for h-feed
        hfeed = next((item for item in d['items']
                      if 'h-feed' in item['type']), None)
        if hfeed:
            authors = hfeed.get('properties', {}).get('author')
            images = hfeed.get('properties', {}).get('photo')
            if authors:
                if isinstance(authors[0], dict):
                    name = authors[0].get('properties', {}).get('name')
                    image = authors[0].get('properties', {}).get('photo')
                else:
                    name = authors[0]
            if images and not image:
                image = images[0]

        # check for top-level h-card
        for item in d['items']:
            if 'h-card' in item.get('type', []):
                if not name:
                    name = item.get('properties', {}).get('name')
                if not image:
                    image = item.get('properties', {}).get('photo')

        return jsonify({
            'name': name,
            'image': image,
            'social': relmes,
        })
    except BaseException as e:
        resp = jsonify({'error': str(e)})
        resp.status_code = 400
        return resp
def find_syndicated(original):
    if regex.match(original):
        return original
    try:
        d = mf2py.Parser(url=original).to_dict()
        urls = d['rels'].get('syndication', [])
        for item in d['items']:
            if 'h-entry' in item['type']:
                urls += item['properties'].get('syndication', [])
        for url in urls:
            if regex.match(url):
                return url
    except HTTPError:
        current_app.logger.exception('Could not fetch original')
    except SSLError:
        current_app.logger.exception('SSL Error')
    except Exception as e:
        current_app.logger.exception('MF2 Parser error: %s', e)
async def fetch_event(url: URL) -> Optional[Event]:
    """
    Extract an event from a page.

    We assume the event is represented as an h-event in the URL.
    """
    async with ClientSession() as session:
        async with session.get(url) as response:
            content = await response.text()

    parser = mf2py.Parser(content)
    hevents = parser.to_dict(filter_by_type="h-event")
    if not hevents:
        _logger.warning("No events found on %s", url)
        return None
    if len(hevents) > 1:
        _logger.warning("Multiple events found on %s", url)
        return None
    hevent = hevents[0]

    event = Event()
    event.add("summary", hevent["properties"]["name"][0])
    event.add("dtstart", dateutil.parser.parse(hevent["properties"]["start"][0]))
    event.add("dtend", dateutil.parser.parse(hevent["properties"]["end"][0]))
    event.add("dtstamp", datetime.now(timezone.utc))
    if "url" in hevent["properties"]:
        event.add("url", hevent["properties"]["url"][0])
    if "content" in hevent["properties"]:
        event.add("description", hevent["properties"]["content"][0]["value"])
    if "category" in hevent["properties"]:
        event.add("categories", hevent["properties"]["category"])
    if "featured" in hevent["properties"]:
        attachment_url = url.join(URL(hevent["properties"]["featured"][0]))
        event.add("attach", attachment_url)

    return event
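# A minimal usage sketch for fetch_event above. It assumes the snippet's URL and Event types
# come from yarl and icalendar respectively (consistent with the .join() and .add() calls
# shown) and that ClientSession is aiohttp's; the event URL is a placeholder.
import asyncio
from yarl import URL

ics_event = asyncio.run(fetch_event(URL("https://example.com/my-event")))
if ics_event is not None:
    print(ics_event.to_ical().decode())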
def fetch_mf2_result(url):
    parsed = mf2py.Parser(url=url).to_dict()
    if not parsed:
        return None

    result = {}
    result['mf2'] = True
    result['type'] = 'mf2:' + fetch_post_type(parsed)

    entry = mf2util.interpret_entry(parsed, url, want_json=True)
    if 'name' in entry:
        result['title'] = entry['name']
    if 'summary' in entry:
        result['description'] = convert2html(elide(entry['summary'], 500), True)
    elif 'content' in entry:
        result['description'] = convert2html(
            elide(entry['content-plain'], 500), True)
    if 'url' in entry:
        result['url'] = entry['url']
    if 'author' in entry:
        result['author'] = entry['author']
    if 'published' in entry:
        result['published'] = entry['published']
        date = dateutil.parser.parse(entry['published'])
        result['published_locale'] = date.strftime('%d %b, %Y %H:%M %p')
    if 'featured' in entry:
        result['image'] = entry['featured']
    elif 'photo' in entry:
        result['image'] = entry['photo']

    fetch_image_dimensions(result)

    if 'title' not in result and 'description' not in result:
        result = {}
    return result
def process_webmention(commit_url, source, target):
    # find mention in source
    result = findMentions(source, target)
    if result['status'] != 200:
        raise Exception('error fetching source')
    if not result['refs']:
        raise Exception('target not found in source')

    parsed = mf2py.Parser(url=source).to_dict()
    webmention = {
        'sourceUrl': source,
        'targetUrl': target,
        'parsedSource': parsed
    }

    r = commit_file(commit_url, yaml.dump(webmention))
    if r.status_code != 201:
        raise Exception('failed to post to github: ' + str(r.status_code) + ', ' + r.text)
def getSummary():
    """
    Extracts microformat data from the HTML passed in as the body of the POST data.

    Args:
        post data - contents of the HTML file from which to extract the microformat data.

    Returns:
        A JSON object

    Example:
        {
        }

    Raises:
        None
    """
    text = request.data.decode('utf-8')
    p = mf2py.Parser(doc=text)
    #parsedData = mf2py.parse(doc=text)
    return p.to_json()
    formated_date = datetime.strptime(inputDate, '%Y-%m-%d')
    # apply the modifier to the date
    modified_date = formated_date + timedelta(days=modifier)
    # convert it back to a string YYYY-mm-dd
    outputDate = modified_date.strftime('%Y-%m-%d')
    return outputDate


# read the wiki and copy the content into a list of strings
with urlopen(wiki_url) as f:
    wiki_html_list = f.read().splitlines()
    #wiki_html_list = wiki_html_list.encode('utf-8')

# read data from the OSM Wiki site and parse it
mf_obj = mf2py.Parser(url=wiki_url, html_parser="html5lib")

# convert the data to a JSON string and filter events / exclude all the HTML stuff
## create a JSON string
json_obj = mf_obj.to_json(filter_by_type="h-event")
## json.loads converts a string in JSON format,
## here json_obj, into Python objects containing dictionaries:
formated_json = json.loads(json_obj)

# we store the combined output data here
out_array = []
out_error = []

# just a separator for printing
end_str = ' ** '
import mf2py
import mf2util
import pprint

source_url = r'https://brid.gy/comment/twitter/desmondrivet/1117876830478852096/1118148721034891264'
target_url = r'https://desmondrivet.com/2019/04/15/20190415154611'

parsed = mf2py.Parser(url=source_url).to_dict()
comment = mf2util.interpret_comment(parsed, source_url, [target_url])
general = mf2util.interpret(parsed, source_url)

pprint.pprint(parsed)
print('-----\n')
pprint.pprint(comment)
def mf2parse(source):
    return mf2py.Parser(url=source).to_dict()
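# Hypothetical call to mf2parse above; the URL is a placeholder. The returned dict uses
# mf2py's standard top-level keys ('items', 'rels', 'rel-urls'), so the parsed microformats
# can be walked directly.
data = mf2parse('https://example.com/')
for item in data.get('items', []):
    print(item.get('type'), item.get('properties', {}).get('name'))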
def htmltomfjf(html, url, mf2=None):
    if not mf2:
        mf2 = mf2py.Parser(html, url).to_dict()
    jf2 = mf2tojf2.mf2tojf2(mf2)
    return mf2, jf2
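# A small sketch of calling htmltomfjf above with an inline h-entry; the markup and URL are
# illustrative only, not taken from the source. The function returns both the raw mf2 dict
# and its jf2 conversion.
html = '<article class="h-entry"><h1 class="p-name">Hello</h1></article>'
mf2_dict, jf2_dict = htmltomfjf(html, 'https://example.com/hello')
print(jf2_dict)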
import mf2py
import sys
import json

if len(sys.argv) < 2:
    print('usage: ' + sys.argv[0] + ' <inputfile> <outputfile>')
    sys.exit(1)

with open(sys.argv[1], 'r') as file:
    p = mf2py.Parser(doc=file, url='http://example.com/')
    res = json.loads(p.to_json())
    del res['debug']
    print(json.dumps(res))