Example #1
0
def qquery(xml_thing, xpath_thing, vars=None, funcs=None, force_root=True):
    '''
    Quick query. Convenience for using the MicroXPath engine.
    Give it some XML and an expression and it will yield the results. No fuss.

    xml_thing - bytes or string, or amara3.xml.tree node
    xpath_thing - string or parsed XPath expression
    vars - optional mapping of variables, name to value
    funcs - optional mapping of functions, name to function object
    force_root - passed through to the evaluation context

    >>> from amara3.uxml.uxpath import qquery
    >>> results = qquery(b'<a>1<b>2</b>3</a>', 'a/text()')
    >>> next(results).xml_value
    '1'
    >>> next(results).xml_value
    '3'
    '''
    root = None
    if isinstance(xml_thing, nodetype):
        root = xml_thing
    elif isinstance(xml_thing, str):
        tb = tree.treebuilder()
        root = tb.parse(xml_thing)
    elif isinstance(xml_thing, bytes):
        tb = tree.treebuilder()
        #Force UTF-8
        root = tb.parse(xml_thing.decode('utf-8'))
    #Unsupported input type: yield nothing. Use `is None` so a legitimately
    #falsy node (e.g. an empty element) is not silently dropped
    if root is None: return
    if isinstance(xpath_thing, str):
        parsed_expr = parse(xpath_thing)
    else:
        #Per the docstring contract, accept an already-parsed XPath expression
        parsed_expr = xpath_thing
    ctx = context(root, variables=vars, functions=funcs, force_root=force_root)
    result = parsed_expr.compute(ctx)
    yield from result
Example #2
0
def qquery(xml_thing, xpath_thing, vars=None, funcs=None):
    '''
    Quick query. Convenience for using the MicroXPath engine.
    Give it some XML and an expression and it will yield the results. No fuss.

    xml_thing - bytes or string, or amara3.xml.tree node
    xpath_thing - string or parsed XPath expression
    vars - optional mapping of variables, name to value
    funcs - optional mapping of functions, name to function object

    >>> from amara3.uxml.uxpath import qquery
    >>> results = qquery(b'<a>1<b>2</b>3</a>', 'a/text()')
    >>> next(results).xml_value
    '1'
    >>> next(results).xml_value
    '3'
    '''
    root = None
    if isinstance(xml_thing, nodetype):
        root = xml_thing
    elif isinstance(xml_thing, str):
        tb = tree.treebuilder()
        root = tb.parse(xml_thing)
    elif isinstance(xml_thing, bytes):
        tb = tree.treebuilder()
        #Force UTF-8
        root = tb.parse(xml_thing.decode('utf-8'))
    #Unsupported input type: yield nothing. Use `is None` so a legitimately
    #falsy node (e.g. an empty element) is not silently dropped
    if root is None: return
    if isinstance(xpath_thing, str):
        parsed_expr = parse(xpath_thing)
    else:
        #Per the docstring contract, accept an already-parsed XPath expression
        parsed_expr = xpath_thing
    ctx = context(root, variables=vars, functions=funcs)
    result = parsed_expr.compute(ctx)
    yield from result
Example #3
0
def test_basic_mutate(doc):
    '''Appending and inserting elements should land them at the expected positions'''
    builder = tree.treebuilder()
    root = builder.parse(doc)
    appended = element('dee', {'a': '1'})
    root.xml_append(appended)
    prepended = element('dum', {'a': '2'})
    root.xml_insert(prepended, 0)
    #Appended element ends up last; inserted-at-0 element ends up first
    assert root.xml_children[-1] == appended, (root.xml_children[-1], appended)
    assert root.xml_children[0] == prepended, (root.xml_children[0], prepended)
Example #4
0
def test_basic_nav(doc):
    '''Basic navigation: root has no parent; element children point back to root'''
    tb = tree.treebuilder()
    root = tb.parse(doc)
    #No parent
    assert root.xml_parent is None
    #Root is an element
    assert isinstance(root, element)
    #Fix: filter on each child (ch), not root — the original condition was a
    #no-op that let text nodes (which have no meaningful parent check) through
    child_elems = [ch for ch in root.xml_children if isinstance(ch, element)]
    for elem in child_elems:
        assert elem.xml_parent is root
Example #5
0
def test_basic_nav(doc):
    '''Basic navigation: root has no parent; element children point back to root'''
    tb = tree.treebuilder()
    root = tb.parse(doc)
    #No parent
    assert root.xml_parent is None
    #Root is an element
    assert isinstance(root, element)
    #Fix: filter on each child (ch), not root — the original condition was a
    #no-op that let text nodes through
    child_elems = [ ch for ch in root.xml_children if isinstance(ch, element) ]
    for elem in child_elems:
        assert elem.xml_parent is root
Example #6
0
def test_basic_mutate(doc):
    '''Appending and inserting elements should land them at the expected positions'''
    builder = tree.treebuilder()
    root = builder.parse(doc)
    tail_elem = element('dee', {'a': '1'})
    head_elem = element('dum', {'a': '2'})
    root.xml_append(tail_elem)
    root.xml_insert(head_elem, 0)
    #Check both ends of the child list after the mutations
    children = root.xml_children
    assert children[-1] == tail_elem, (children[-1], tail_elem)
    assert children[0] == head_elem, (children[0], head_elem)
Example #7
0
def main():
    '''Summarize each description element, first via processes, then via threads'''
    builder = tree.treebuilder()
    doc_root = builder.parse(MICRODOC)
    print('Processes\n')
    with concurrent.futures.ProcessPoolExecutor() as pool:
        for summary in pool.map(md_summary, select_name(doc_root, 'description')):
            print(summary)
    print()

    print('Threads\n')
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        for summary in pool.map(md_summary, select_name(doc_root, 'description')):
            print(summary)
Example #8
0
def main():
    '''Summarize each description element, first via processes, then via threads'''
    builder = tree.treebuilder()
    doc_root = builder.parse(MICRODOC)
    print('Processes\n')
    with concurrent.futures.ProcessPoolExecutor() as pool:
        descriptions = select_name(doc_root, 'description')
        for summary in pool.map(md_summary, descriptions):
            print(summary)
    print()

    print('Threads\n')
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        descriptions = select_name(doc_root, 'description')
        for summary in pool.map(md_summary, descriptions):
            print(summary)
Example #9
0
'''
py.test test/uxml/test_treegc.py
'''

import sys
import gc

import pytest
from amara3.uxml import tree
from amara3.uxml.tree import node, text, element
from amara3.uxml.uxpath import context, parse as uxpathparse

#from amara3.util import coroutine

#Shared treebuilder and a parse shortcut used to build all fixture trees below
TB = tree.treebuilder()
P = TB.parse

#Fixture documents: mixed text/element content and various nesting shapes
N1 = P(
    '<a>+1+<b i="1.1">+2+<x>1</x></b><c i="1.2"><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>'
)

N10 = P('<a><b>1</b><b>2</b><b>3</b></a>')
N11 = P('<a><b>1</b><c>2</c><d>3</d></a>')
N12 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x></a>')
N13 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')
#Fix: N14 was parsed twice with identical markup; the second assignment was
#redundant (it only discarded the first tree), so it is removed
N14 = P('<a><b><x>1</x></b><b><x>2</x></b><b><x>3</x></b><b><x>4</x></b></a>')

#XPath variable bindings used by the test cases
V1 = {'a': 1, 'b': 'x', 'a1': N1, 'a1.2': N10}
Example #10
0
File: md.py Project: uogbuji/versa
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)
    config -- optional dict of configuration options (autotype-h1/h2/h3, interpretations)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver import memory
    >>> from versa.reader.md import from_markdown
    >>> m = memory.connection()
    >>> from_markdown(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'): syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'): syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'): syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        #Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                #just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    #Prep ID generator, in case needed
    idg = idgen(None)

    #Parse the Markdown
    #Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    #Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md, safe_mode='escape', output_format='html5', extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = tb.parse(h)
    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    #Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(select_value(select_name(descendants(root), 'h1'), '@docheader'), element('empty', parent=root)) # //h1[.="@docheader"]
    sections = filter(lambda x: x.xml_value != '@docheader', select_name_pattern(descendants(root), HEADER_PAT)) # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties
        Some properties have attributes, expressed in markdown as a nested list. If present these attributes
        Are yielded as well, else None is yielded
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        sect_body_items = itertools.takewhile(lambda x: HEADER_PAT.match(x.xml_name) is None, select_elements(following_siblings(sect)))
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [ li for elem in select_name(sect_body_items, 'ul') for li in select_name(elem, 'li') ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(_('Syntax error in relationship expression: {0}'.format(pair)))
                #print matched.groups()
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = ''.join(itertools.takewhile(
                            lambda x: isinstance(x, text), li.xml_children
                            ))
                #main = li.xml_select('string(ul/preceding-sibling::node())')
                prop, val, typeindic = parse_li(main)
                subfield_list = [ parse_li(sli.xml_value) for e in select_name(li, 'ul') for sli in (
                                select_name(e, 'li')
                                ) ]
                subfield_list = [ (p, v, t) for (p, v, t) in subfield_list if p is not None ]
                #Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            #Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(li.xml_value)
                if prop: yield prop, val, typeindic, None

    iris = {}

    #Gather the document-level metadata from the @docheader section
    base = propbase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        #The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = propbase = rtbase = uri
                elif k == '@property':
                    propbase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        #The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            #Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, propbase))] = v
            setup_interpretations(interp)
        #Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        #If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, propbase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)


    #Default IRI prefixes if @iri/@base is set
    if not propbase: propbase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue #Not needed because excluded by ss
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(_('Syntax error in resource header: {0}'.format(sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype:
            rtype = I(iri.absolutize(rtype, base))

        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = next(idg)

        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)
        #Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                if atype == RES_VAL:
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[aprop] = URI_ABBR_PAT.sub(uri + '\\2\\3', aval)
                    else:
                        attrs[aprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[aprop] = aval
                elif atype == UNKNOWN_VAL:
                    attrs[aprop] = aval
                    if aprop in interpretations:
                        aval = interpretations[aprop](aval, rid=rid, fullprop=aprop, base=base, model=model)
                        if aval is not None: attrs[aprop] = aval
                    else:
                        attrs[aprop] = aval
            propmatch = URI_ABBR_PAT.match(prop)
            if propmatch:
                uri = iris[propmatch.group(1)]
                fullprop = URI_ABBR_PAT.sub(uri + '\\2\\3', prop)
            else:
                fullprop = I(iri.absolutize(prop, propbase))
            if typeindic == RES_VAL:
                #Fix: abbreviation-match against val (the resource value being
                #expanded here), not aval, which is a leftover loop variable from
                #the attribute loop above and is unbound when there are no
                #attribute subfields (NameError)
                valmatch = URI_ABBR_PAT.match(val)
                if valmatch:
                    uri = iris[valmatch.group(1)]
                    val = URI_ABBR_PAT.sub(uri + '\\2\\3', val)
                else:
                    val = I(iri.absolutize(val, rtbase))
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                if fullprop in interpretations:
                    val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                    if val is not None: model.add(rid, fullprop, val)
                else:
                    model.add(rid, fullprop, val, attrs)
            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
Example #11
0
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver.memory import newmodel
    >>> from versa.serial.literate import parse
    >>> m = newmodel()
    >>> parse(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        #Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                #just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    #Prep ID generator, in case needed
    idg = idgen(None)

    #Preprocess the Markdown to deal with IRI-valued property values
    def iri_ref_tool(m):
        body = m.group(1)
        lchar = '&lt;' if iri.matches_uri_ref_syntax(body) else '<'
        return lchar + m.group(1) + '>'

    md = IRIREF_CAND_PAT.sub(iri_ref_tool, md)

    #Parse the Markdown
    #Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    #Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md,
                          safe_mode='escape',
                          output_format='html5',
                          extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = html5.parse(h)
    #root = tb.parse(h)
    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    # Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(
        select_value(select_name(descendants(root), 'h1'), '@docheader'),
        element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(
        lambda x: x.xml_value != '@docheader',
        select_name_pattern(descendants(root), HEADER_PAT)
    )  # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties
        Some properties have attributes, expressed in markdown as a nested list. If present these attributes
        Are yielded as well, else None is yielded
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        try:
            sect_body_items = itertools.takewhile(
                lambda x: HEADER_PAT.match(x.xml_name) is None,
                select_elements(following_siblings(sect)))
        except StopIteration:
            return
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [
            li for elem in select_name(sect_body_items, 'ul')
            for li in select_name(elem, 'li')
        ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _('Syntax error in relationship expression: {0}'.
                          format(pair)))
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        def prep_li(li):
            '''
            Take care of Markdown parsing minutiae. Also, Exclude child uls

            * a/href embedded in the li means it was specified as <link_text>.
            Restore the angle brackets as expected by the li parser
            * Similar for cases where e.g. prop: <abc> gets turned into prop: <abc></abc>
            '''
            prepped = ''
            for ch in itertools.takewhile(
                    lambda x: not (isinstance(x, element) and x.xml_name ==
                                   'ul'), li.xml_children):
                if isinstance(ch, text):
                    prepped += ch
                elif isinstance(ch, element):
                    if ch.xml_name == 'a':
                        prepped += '<' + ch.xml_value + '>'
                    else:
                        prepped += '<' + ch.xml_name + '>'
            return prepped

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = prep_li(li)
                prop, val, typeindic = parse_li(main)
                subfield_list = [
                    parse_li(prep_li(sli)) for e in select_name(li, 'ul')
                    for sli in (select_name(e, 'li'))
                ]
                subfield_list = [(p, v, t) for (p, v, t) in subfield_list
                                 if p is not None]
                #Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            #Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(prep_li(li))
                if prop: yield prop, val, typeindic, None

    iris = {}

    # Gather the document-level metadata from the @docheader section
    base = schemabase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        #The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = schemabase = rtbase = uri
                # @property is legacy
                elif k == '@schema' or k == '@property':
                    schemabase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        #The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            #Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            setup_interpretations(interp)
        #Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        #If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val,
                                                rid=rid,
                                                fullprop=fullprop,
                                                base=base,
                                                model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    #Default IRI prefixes if @iri/@base is set
    if not schemabase: schemabase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue #Not needed because excluded by ss
        # --- Parse this section's header into a resource ID and optional type ---
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(
                _('Syntax error in resource header: {0}'.format(
                    sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        # Resolve the type against the schema base IRI, the ID against the doc base
        if rtype:
            rtype = I(iri.absolutize(rtype, schemabase))

        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            # Anonymous resource (forms 3/4): mint an ID from the generator
            rid = next(idg)

        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)

        def expand_iri(iri_in, base):
            # Expand an abbreviated/relative IRI reference to an absolute IRI:
            #   '@name'      -> resolved against VERSA_BASEIRI
            #   explicit IRI -> group(1) resolved against base
            #   'pfx:rest'   -> prefix substituted via the iris mapping
            #   otherwise    -> resolved against base
            if iri_in.startswith('@'):
                return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
            iri_match = URI_EXPLICIT_PAT.match(iri_in)
            if iri_match:
                return I(iri.absolutize(iri_match.group(1), base))
            iri_match = URI_ABBR_PAT.match(iri_in)
            if iri_match:
                uri = iris[iri_match.group(1)]
                # NOTE(review): result here is a plain str, unlike the I(...)-wrapped
                # returns in the other branches — confirm whether it should be I()
                fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
            else:
                fulliri = I(iri.absolutize(iri_in, base))
            return fulliri

        #Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            # First process any subfields (attributes on the main relationship)
            for (aprop, aval, atype) in subfield_list or ():
                fullaprop = expand_iri(aprop, schemabase)
                if atype == RES_VAL:
                    # NOTE(review): this assigns the expanded subfield value to `val`
                    # (the main property value), not `aval` — looks like it clobbers
                    # the outer value; confirm intent
                    val = expand_iri(aval, rtbase)
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[fullaprop] = URI_ABBR_PAT.sub(
                            uri + '\\2\\3', aval)
                    else:
                        attrs[fullaprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[fullaprop] = aval
                elif atype == UNKNOWN_VAL:
                    # Untyped: explicit IRIs become resources; otherwise a registered
                    # interpretation hook may transform (or drop, via None) the value
                    val_iri_match = URI_EXPLICIT_PAT.match(aval)
                    if val_iri_match:
                        aval = expand_iri(aval, rtbase)
                    elif fullaprop in interpretations:
                        aval = interpretations[fullaprop](aval,
                                                          rid=rid,
                                                          fullprop=fullaprop,
                                                          base=base,
                                                          model=model)
                    if aval is not None:
                        attrs[fullaprop] = aval

            # Now add the main relationship itself, typed per its indicator
            fullprop = expand_iri(prop, schemabase)
            if typeindic == RES_VAL:
                val = expand_iri(val, rtbase)
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                # Text values carry a language tag, defaulting from the doc header
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                val_iri_match = URI_EXPLICIT_PAT.match(val)
                if val_iri_match:
                    val = expand_iri(val, rtbase)
                elif fullprop in interpretations:
                    val = interpretations[fullprop](val,
                                                    rid=rid,
                                                    fullprop=fullprop,
                                                    base=base,
                                                    model=model)
                if val is not None:
                    model.add(rid, fullprop, val, attrs)

            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
Example #12
0
    def run(self, littext, g):
        """
        Translate Onya Literate (Markdown syntax) into an Onya graph

        littext -- markdown source text
        g -- Onya graph to take the output relationship

        Returns: the document IRI (self.document_iri), or None if the source
        does not specify one

        Note: One of the nodes usually represents the graph itself. It should be the
        only node of type @graph (Onya graph). If there are multiple such nodes
        a warning will be issued.

        Each generated graph has a property (`@base`) with the overall base URI specified
        in the Markdown file. If there is no such specification this property is omitted

        >>> from onya.graph import graph
        >>> from onya.serial.literate import parse
        >>> g = graph()
        >>> parse(open('test/resource/poetry.md').read(), g)
        ...
        >>> len(g)
        3
        """
        # Reset per-run state so a parser instance can be reused
        self.base = self.schemabase = self.rtbase = \
            self.document_iri = self.default_lang = None
        self.new_nodes = set()

        # Render the Markdown to HTML5, then parse that into a tree
        # Alternately:
        # from xml.sax.saxutils import escape, unescape
        # h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
        # Note: even using safe_mode this should not be presumed safe from tainted input
        h = markdown.markdown(littext,
                              safe_mode='escape',
                              output_format='html5',
                              extensions=[self.comment_ext])

        # Wrap in a single root element so the fragment parses cleanly
        root = html5.parse('<html>' + h + '</html>')

        # Each h1/h2/h3 section describes one resource; the special section
        # named @docheader carries info used to interpret the rest
        docheader = next(
            select_value(select_name(descendants(root), 'h1'), '@docheader'),
            None)  # //h1[.="@docheader"]
        sections = filter(
            lambda x: x.xml_value != '@docheader',
            select_name_pattern(descendants(root), HEADER_PAT)
        )  # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

        if docheader is not None:
            self.handle_docheader(docheader)

        # Go through the resources expressed in remaining sections
        for sect in sections:
            # header in one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
            # 3rd & 4th forms have no ID given in file (type specified or not). One will be assigned
            # XXX Should we require a resource ID?
            matched = RESOURCE_PAT.match(sect.xml_value)
            if not matched:
                raise ValueError(
                    _('Syntax error in resource header: {0}'.format(
                        sect.xml_value)))
            rid = matched.group(1)
            rtype = matched.group(3)
            if rtype and self.schemabase:
                rtype = self.schemabase(rtype)

            # NOTE(review): rid can be None for anonymous-resource headers;
            # confirm self.base(None) is valid or that an ID should be minted here
            if self.base: rid = self.base(rid)

            # Resource type might be set by syntax config
            if not rtype:
                rtype = self.syntaxtypemap.get(sect.xml_name)

            # We have enough info to init the node this section represents
            new_node = node(rid, rtype)

            # NOTE(review): `schema` is not defined in this method — presumably a
            # module-level name; confirm it shouldn't be self.schemabase
            fields(sect, new_node, schema)

        return self.document_iri
Example #13
0
def test_make_pretty(doc, dep, ind, expected):
    '''Parse doc, prettify it in place, and check the serialized result.'''
    builder = tree.treebuilder()
    parsed = builder.parse(doc)
    make_pretty(parsed, dep, ind)
    encoded = parsed.xml_encode()
    assert encoded == expected
Example #14
0
'''
py.test test/uxml/test_treegc.py
'''

import sys
import gc

import pytest
from amara3.uxml import tree
from amara3.uxml.tree import node, text, element
from amara3.uxml.uxpath import context, parse as uxpathparse

#from amara3.util import coroutine

# Shared treebuilder; P parses a document string into a root element node
TB = tree.treebuilder()
P = TB.parse

N1 = P('<a>+1+<b i="1.1">+2+<x>1</x></b><c i="1.2"><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')

N10 = P('<a><b>1</b><b>2</b><b>3</b></a>')
N11 = P('<a><b>1</b><c>2</c><d>3</d></a>')
N12 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x></a>')
N13 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')
# (A second, identical N14 assignment was removed as redundant)
N14 = P('<a><b><x>1</x></b><b><x>2</x></b><b><x>3</x></b><b><x>4</x></b></a>')

# Sample variable bindings for XPath evaluation contexts
V1 = {'a': 1, 'b': 'x', 'a1': N1, 'a1.2': N10}

#uxpath, doc element, result sequence
#Nodes in result are represented as a tuple of node name & concatenation of text node children
Example #15
0
import itertools
from amara3.uxml.tree import treebuilder, element
from amara3.util import coroutine

# Demo document: nested elements with mixed text content and attributes
doc = '<a>1<aa a="1">2<aaa>3</aaa>4<aab>5</aab>6</aa>7<ab a="2">8</ab></a>'
tb = treebuilder()
root = tb.parse(doc)

def descendants(elem):
    '''Yield every node under elem (elements and text alike) in document order.'''
    # Explicit iterator stack instead of recursion; same preorder traversal.
    pending = [iter(elem.xml_children)]
    while pending:
        try:
            item = next(pending[-1])
        except StopIteration:
            pending.pop()
            continue
        yield item
        if isinstance(item, element):
            pending.append(iter(item.xml_children))

# Demonstrate the full document-order traversal of the demo document
print('descendants')

for e in descendants(root):
    print (e)

def select_elements(source):
    '''Filter source down to element nodes; an element argument is replaced by its children.'''
    items = source.xml_children if isinstance(source, element) else source
    return filter(lambda item: isinstance(item, element), items)


def select_name(source, name):
    '''Select the elements from source whose xml_name equals name.'''
    def matches(elem):
        return elem.xml_name == name
    return filter(matches, select_elements(source))


def select_name_pattern(source, pat):
    '''Select the elements from source whose xml_name matches the compiled regex pat.'''
    def matches(elem):
        return pat.match(elem.xml_name) is not None
    return filter(matches, select_elements(source))
Example #16
0
def test_make_pretty(doc, dep, ind, expected):
    '''Round-trip check: parse doc, apply make_pretty, compare re-serialization.'''
    built = tree.treebuilder().parse(doc)
    make_pretty(built, dep, ind)
    assert expected == built.xml_encode()