def qquery(xml_thing, xpath_thing, vars=None, funcs=None, force_root=True):
    '''
    Quick query. Convenience for using the MicroXPath engine.
    Give it some XML and an expression and it will yield the results. No fuss.

    xml_thing - bytes or string, or amara3.uxml.tree node
    xpath_thing - string or parsed XPath expression
    vars - optional mapping of variables, name to value
    funcs - optional mapping of functions, name to function object

    >>> from amara3.uxml.uxpath import qquery
    >>> results = qquery(b'<a>1<b>2</b>3</a>', 'a/text()')
    >>> next(results).xml_value
    '1'
    >>> next(results).xml_value
    '3'
    '''
    root = None
    if isinstance(xml_thing, nodetype):
        root = xml_thing
    elif isinstance(xml_thing, str):
        tb = tree.treebuilder()
        root = tb.parse(xml_thing)
    elif isinstance(xml_thing, bytes):
        tb = tree.treebuilder()
        #Force UTF-8
        root = tb.parse(xml_thing.decode('utf-8'))
    if not root:
        return
    if isinstance(xpath_thing, str):
        parsed_expr = parse(xpath_thing)
    else:
        #Per the docstring, assume it is already a parsed XPath expression
        parsed_expr = xpath_thing
    ctx = context(root, variables=vars, functions=funcs, force_root=force_root)
    result = parsed_expr.compute(ctx)
    yield from result
def qquery(xml_thing, xpath_thing, vars=None, funcs=None):
    '''
    Quick query. Convenience for using the MicroXPath engine.
    Give it some XML and an expression and it will yield the results. No fuss.

    xml_thing - bytes or string, or amara3.uxml.tree node
    xpath_thing - string or parsed XPath expression
    vars - optional mapping of variables, name to value
    funcs - optional mapping of functions, name to function object

    >>> from amara3.uxml.uxpath import qquery
    >>> results = qquery(b'<a>1<b>2</b>3</a>', 'a/text()')
    >>> next(results).xml_value
    '1'
    >>> next(results).xml_value
    '3'
    '''
    root = None
    if isinstance(xml_thing, nodetype):
        root = xml_thing
    elif isinstance(xml_thing, str):
        tb = tree.treebuilder()
        root = tb.parse(xml_thing)
    elif isinstance(xml_thing, bytes):
        tb = tree.treebuilder()
        #Force UTF-8
        root = tb.parse(xml_thing.decode('utf-8'))
    if not root:
        return
    if isinstance(xpath_thing, str):
        parsed_expr = parse(xpath_thing)
    else:
        #Per the docstring, assume it is already a parsed XPath expression
        parsed_expr = xpath_thing
    ctx = context(root, variables=vars, functions=funcs)
    result = parsed_expr.compute(ctx)
    yield from result
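# A small usage sketch for qquery beyond the embedded doctest: string input instead of
# bytes, iterating every result. Purely illustrative; the XML and expression are made up.
def _qquery_demo():
    for txt in qquery('<doc><item>a</item><item>b</item></doc>', 'doc/item/text()'):
        print(txt.xml_value)    #prints 'a' then 'b'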
def test_basic_mutate(doc):
    tb = tree.treebuilder()
    root = tb.parse(doc)
    new_elem_1 = element('dee', {'a': '1'})
    root.xml_append(new_elem_1)
    new_elem_2 = element('dum', {'a': '2'})
    root.xml_insert(new_elem_2, 0)
    #logging.debug(root.xml_children)
    assert root.xml_children[-1] == new_elem_1, (root.xml_children[-1], new_elem_1)
    assert root.xml_children[0] == new_elem_2, (root.xml_children[0], new_elem_2)
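# A minimal sketch (not part of the original test suite) of the same mutation API used
# directly: build a small tree by hand and reserialize it. Element names are arbitrary.
def _mutate_demo():
    tb = tree.treebuilder()
    root = tb.parse('<spam/>')
    root.xml_append(element('eggs', {'n': '1'}))
    root.xml_insert(element('toast', {}), 0)
    #Children are now [toast, eggs]; xml_encode reserializes the tree
    return root.xml_encode()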
def test_basic_nav(doc):
    tb = tree.treebuilder()
    root = tb.parse(doc)
    #No parent
    assert root.xml_parent is None
    #Root is an element
    assert isinstance(root, element)
    child_elems = [ch for ch in root.xml_children if isinstance(ch, element)]
    for elem in child_elems:
        assert elem.xml_parent is root
def main():
    tb = tree.treebuilder()
    root = tb.parse(MICRODOC)

    print('Processes\n')
    with concurrent.futures.ProcessPoolExecutor() as executor:
        #for markdown in executor.map(summarize, select_pattern(root, ('description',))):
        for markdown in executor.map(md_summary, select_name(root, 'description')):
            print(markdown)

    print()
    print('Threads\n')
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        #for markdown in executor.map(summarize, select_pattern(root, ('description',))):
        for markdown in executor.map(md_summary, select_name(root, 'description')):
            print(markdown)
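# main() above references MICRODOC and md_summary, which are defined elsewhere in the
# original module. Hypothetical stand-ins for both (names kept, bodies made up) so the
# snippet can be exercised; select_name is the helper defined later in this collection.
MICRODOC = '<catalog><description>First widget. Cheap.</description><description>Second widget. Sturdy.</description></catalog>'

def md_summary(elem):
    #Render one description element as a single Markdown bullet (placeholder logic only)
    return '* ' + elem.xml_value.strip()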
'''
py.test test/uxml/test_treegc.py
'''

import sys
import gc

import pytest

from amara3.uxml import tree
from amara3.uxml.tree import node, text, element
from amara3.uxml.uxpath import context, parse as uxpathparse
#from amara3.util import coroutine

TB = tree.treebuilder()
P = TB.parse

N1 = P('<a>+1+<b i="1.1">+2+<x>1</x></b><c i="1.2"><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')
N10 = P('<a><b>1</b><b>2</b><b>3</b></a>')
N11 = P('<a><b>1</b><c>2</c><d>3</d></a>')
N12 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x></a>')
N13 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')
N14 = P('<a><b><x>1</x></b><b><x>2</x></b><b><x>3</x></b><b><x>4</x></b></a>')

V1 = {'a': 1, 'b': 'x', 'a1': N1, 'a1.2': N10}
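# A minimal sketch (not one of the original fixtures) of how these parsed trees are
# queried with MicroXPath: compile an expression, evaluate it against N1 with V1 as the
# variable bindings, and materialize the node sequence.
_expr = uxpathparse('b/x')
_ctx = context(N1, variables=V1)
_result_nodes = list(_expr.compute(_ctx))
#Each item is a tree node whose xml_name / xml_value can be inspected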
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver import memory
    >>> from versa.reader.md import from_markdown
    >>> m = memory.connection()
    >>> from_markdown(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'): syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'): syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'): syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        #Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                #just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    #Prep ID generator, in case needed
    idg = idgen(None)

    #Parse the Markdown
    #Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    #Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md, safe_mode='escape', output_format='html5', extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = tb.parse(h)

    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    #Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(select_value(select_name(descendants(root), 'h1'), '@docheader'),
                     element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(lambda x: x.xml_value != '@docheader',
                      select_name_pattern(descendants(root), HEADER_PAT))
                      # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties

        Some properties have attributes, expressed in markdown as a nested list. If present these
        attributes are yielded as well, else None is yielded
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        sect_body_items = itertools.takewhile(lambda x: HEADER_PAT.match(x.xml_name) is None,
                                              select_elements(following_siblings(sect)))
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [ li for elem in select_name(sect_body_items, 'ul') for li in select_name(elem, 'li') ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(_('Syntax error in relationship expression: {0}'.format(pair)))
                #print matched.groups()
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = ''.join(itertools.takewhile(
                    lambda x: isinstance(x, text), li.xml_children
                ))
                #main = li.xml_select('string(ul/preceding-sibling::node())')
                prop, val, typeindic = parse_li(main)
                subfield_list = [ parse_li(sli.xml_value) for e in select_name(li, 'ul') for sli in select_name(e, 'li') ]
                subfield_list = [ (p, v, t) for (p, v, t) in subfield_list if p is not None ]
                #Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            #Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(li.xml_value)
                if prop: yield prop, val, typeindic, None

    iris = {}

    #Gather the document-level metadata from the @docheader section
    base = propbase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        #The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = propbase = rtbase = uri
                elif k == '@property':
                    propbase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        #The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            #Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, propbase))] = v
            setup_interpretations(interp)
        #Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        #If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, propbase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    #Default IRI prefixes if @iri/@base is set
    if not propbase: propbase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue #Not needed because excluded by ss
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(_('Syntax error in resource header: {0}'.format(sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype:
            rtype = I(iri.absolutize(rtype, base))
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid: rid = next(idg)

        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)

        #Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                if atype == RES_VAL:
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[aprop] = URI_ABBR_PAT.sub(uri + '\\2\\3', aval)
                    else:
                        attrs[aprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[aprop] = aval
                elif atype == UNKNOWN_VAL:
                    attrs[aprop] = aval
                    if aprop in interpretations:
                        aval = interpretations[aprop](aval, rid=rid, fullprop=aprop, base=base, model=model)
                        if aval is not None: attrs[aprop] = aval
                    else:
                        attrs[aprop] = aval

            propmatch = URI_ABBR_PAT.match(prop)
            if propmatch:
                uri = iris[propmatch.group(1)]
                fullprop = URI_ABBR_PAT.sub(uri + '\\2\\3', prop)
            else:
                fullprop = I(iri.absolutize(prop, propbase))

            if typeindic == RES_VAL:
                valmatch = URI_ABBR_PAT.match(val)
                if valmatch:
                    uri = iris[valmatch.group(1)]
                    val = URI_ABBR_PAT.sub(uri + '\\2\\3', val)
                else:
                    val = I(iri.absolutize(val, rtbase))
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                if fullprop in interpretations:
                    val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                    if val is not None: model.add(rid, fullprop, val)
                else:
                    model.add(rid, fullprop, val, attrs)

            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
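# A sketch of driving the parser programmatically, assuming only the API already shown in
# the doctest above (versa.driver.memory). The config keys are the ones this function
# reads; the type IRI values are illustrative placeholders.
def _parse_demo():
    from versa.driver import memory
    m = memory.connection()
    conf = {
        'autotype-h1': 'http://example.org/vocab/Collection',  #hypothetical type IRI
        'autotype-h2': 'http://example.org/vocab/Item',        #hypothetical type IRI
    }
    doc_iri = parse(open('test/resource/poetry.md').read(), m, config=conf)
    return doc_iri, m.size()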
def parse(md, model, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- markdown source text
    model -- Versa model to take the output relationship
    encoding -- character encoding (defaults to UTF-8)

    Returns: The overall base URI (`@base`) specified in the Markdown file, or None

    >>> from versa.driver.memory import newmodel
    >>> from versa.serial.literate import parse
    >>> m = newmodel()
    >>> parse(open('test/resource/poetry.md').read(), m)
    'http://uche.ogbuji.net/poems/'
    >>> m.size()
    40
    >>> next(m.match(None, 'http://uche.ogbuji.net/poems/updated', '2013-10-15'))
    (I(http://uche.ogbuji.net/poems/1), I(http://uche.ogbuji.net/poems/updated), '2013-10-15', {})
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'): syntaxtypemap['h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'): syntaxtypemap['h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'): syntaxtypemap['h3'] = config.get('autotype-h3')
    interp_stanza = config.get('interpretations', {})
    interpretations = {}

    def setup_interpretations(interp):
        #Map the interpretation IRIs to functions to do the data prep
        for prop, interp_key in interp.items():
            if interp_key.startswith('@'):
                interp_key = iri.absolutize(interp_key[1:], VERSA_BASEIRI)
            if interp_key in PREP_METHODS:
                interpretations[prop] = PREP_METHODS[interp_key]
            else:
                #just use the identity, i.e. no-op
                interpretations[prop] = lambda x, **kwargs: x

    setup_interpretations(interp_stanza)

    #Prep ID generator, in case needed
    idg = idgen(None)

    #Preprocess the Markdown to deal with IRI-valued property values
    def iri_ref_tool(m):
        body = m.group(1)
        lchar = '&lt;' if iri.matches_uri_ref_syntax(body) else '<'
        return lchar + m.group(1) + '>'

    md = IRIREF_CAND_PAT.sub(iri_ref_tool, md)

    #Parse the Markdown
    #Alternately:
    #from xml.sax.saxutils import escape, unescape
    #h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
    #Note: even using safe_mode this should not be presumed safe from tainted input
    #h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
    comments = mkdcomments.CommentsExtension()
    h = markdown.markdown(md, safe_mode='escape', output_format='html5', extensions=[comments])

    #doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
    tb = treebuilder()
    h = '<html>' + h + '</html>'
    root = html5.parse(h)
    #root = tb.parse(h)

    #Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
    first_h1 = next(select_name(descendants(root), 'h1'))
    #top_section_fields = itertools.takewhile(lambda x: x.xml_name != 'h1', select_name(following_siblings(first_h1), 'h2'))

    #Extract header elements. Notice I use an empty element with an empty parent as the default result
    docheader = next(select_value(select_name(descendants(root), 'h1'), '@docheader'),
                     element('empty', parent=root))  # //h1[.="@docheader"]
    sections = filter(lambda x: x.xml_value != '@docheader',
                      select_name_pattern(descendants(root), HEADER_PAT))
                      # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties
        This generator parses the list and yields the key value pairs representing the properties

        Some properties have attributes, expressed in markdown as a nested list. If present these
        attributes are yielded as well, else None is yielded
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        try:
            sect_body_items = itertools.takewhile(lambda x: HEADER_PAT.match(x.xml_name) is None,
                                                  select_elements(following_siblings(sect)))
        except StopIteration:
            return
        #results_until(sect.xml_select('following-sibling::*'), 'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select('following-sibling::ul') for li in ul.xml_select('./li') ]
        field_list = [ li for elem in select_name(sect_body_items, 'ul') for li in select_name(elem, 'li') ]

        def parse_li(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(_('Syntax error in relationship expression: {0}'.format(pair)))
                if matched.group(3): prop = matched.group(3).strip()
                if matched.group(4): prop = matched.group(4).strip()
                if matched.group(7):
                    val = matched.group(7).strip()
                    typeindic = RES_VAL
                elif matched.group(9):
                    val = matched.group(9).strip()
                    typeindic = TEXT_VAL
                elif matched.group(11):
                    val = matched.group(11).strip()
                    typeindic = TEXT_VAL
                elif matched.group(12):
                    val = matched.group(12).strip()
                    typeindic = UNKNOWN_VAL
                else:
                    val = ''
                    typeindic = UNKNOWN_VAL
                #prop, val = [ part.strip() for part in U(li.xml_select('string(.)')).split(':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val, typeindic
            return None, None, None

        def prep_li(li):
            '''
            Take care of Markdown parsing minutiae. Also, exclude child uls

            * a/href embedded in the li means it was specified as <link_text>.
              Restore the angle brackets as expected by the li parser
            * Similar for cases where e.g. prop: <abc> gets turned into prop: <abc></abc>
            '''
            prepped = ''
            for ch in itertools.takewhile(
                    lambda x: not (isinstance(x, element) and x.xml_name == 'ul'),
                    li.xml_children):
                if isinstance(ch, text):
                    prepped += ch
                elif isinstance(ch, element):
                    if ch.xml_name == 'a':
                        prepped += '<' + ch.xml_value + '>'
                    else:
                        prepped += '<' + ch.xml_name + '>'
            return prepped

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if list(select_name(li, 'ul')):
                #main = ''.join([ node.xml_value
                #        for node in itertools.takewhile(
                #            lambda x: x.xml_name != 'ul', select_elements(li)
                #            )
                #    ])
                main = prep_li(li)
                prop, val, typeindic = parse_li(main)
                subfield_list = [ parse_li(prep_li(sli)) for e in select_name(li, 'ul') for sli in select_name(e, 'li') ]
                subfield_list = [ (p, v, t) for (p, v, t) in subfield_list if p is not None ]
                #Support a special case for syntax such as in the @iri and @interpretations: stanza of @docheader
                if val is None: val = ''
                yield prop, val, typeindic, subfield_list
            #Just a regular, unadorned property
            else:
                prop, val, typeindic = parse_li(prep_li(li))
                if prop: yield prop, val, typeindic, None

    iris = {}

    #Gather the document-level metadata from the @docheader section
    base = schemabase = rtbase = document_iri = default_lang = None
    for prop, val, typeindic, subfield_list in fields(docheader):
        #The @iri section is where key IRI prefixes can be set
        if prop == '@iri':
            for (k, uri, typeindic) in subfield_list:
                if k == '@base':
                    base = schemabase = rtbase = uri
                #@property is legacy
                elif k == '@schema' or k == '@property':
                    schemabase = uri
                elif k == '@resource-type':
                    rtbase = uri
                else:
                    iris[k] = uri
        #The @interpretations section is where defaults can be set as to the primitive types of values from the Markdown, based on the relevant property/relationship
        elif prop == '@interpretations':
            #Iterate over items from the @docheader/@interpretations section to set up for further parsing
            interp = {}
            for k, v, x in subfield_list:
                interp[I(iri.absolutize(k, schemabase))] = v
            setup_interpretations(interp)
        #Setting an IRI for this very document being parsed
        elif prop == '@document':
            document_iri = val
        elif prop == '@language':
            default_lang = val
        #If we have a resource to which to attach them, just attach all other properties
        elif document_iri or base:
            rid = document_iri or base
            fullprop = I(iri.absolutize(prop, schemabase or base))
            if fullprop in interpretations:
                val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None: model.add(rid, fullprop, val)
            else:
                model.add(rid, fullprop, val)

    #Default IRI prefixes if @iri/@base is set
    if not schemabase: schemabase = base
    if not rtbase: rtbase = base
    if not document_iri: document_iri = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #if U(sect) == '@docheader': continue #Not needed because excluded by ss
        #The header can take one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
        #The 3rd form is for an anonymous resource with specified type and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(sect.xml_value)
        if not matched:
            raise ValueError(_('Syntax error in resource header: {0}'.format(sect.xml_value)))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rtype:
            rtype = I(iri.absolutize(rtype, schemabase))
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid: rid = next(idg)

        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_name)
        if rtype:
            model.add(rid, TYPE_REL, rtype)

        def expand_iri(iri_in, base):
            if iri_in.startswith('@'):
                return I(iri.absolutize(iri_in[1:], VERSA_BASEIRI))
            iri_match = URI_EXPLICIT_PAT.match(iri_in)
            if iri_match:
                return I(iri.absolutize(iri_match.group(1), base))
            iri_match = URI_ABBR_PAT.match(iri_in)
            if iri_match:
                uri = iris[iri_match.group(1)]
                fulliri = URI_ABBR_PAT.sub(uri + '\\2\\3', iri_in)
            else:
                fulliri = I(iri.absolutize(iri_in, base))
            return fulliri

        #Add the property
        for prop, val, typeindic, subfield_list in fields(sect):
            attrs = {}
            for (aprop, aval, atype) in subfield_list or ():
                fullaprop = expand_iri(aprop, schemabase)
                if atype == RES_VAL:
                    val = expand_iri(aval, rtbase)
                    valmatch = URI_ABBR_PAT.match(aval)
                    if valmatch:
                        uri = iris[valmatch.group(1)]
                        attrs[fullaprop] = URI_ABBR_PAT.sub(uri + '\\2\\3', aval)
                    else:
                        attrs[fullaprop] = I(iri.absolutize(aval, rtbase))
                elif atype == TEXT_VAL:
                    attrs[fullaprop] = aval
                elif atype == UNKNOWN_VAL:
                    val_iri_match = URI_EXPLICIT_PAT.match(aval)
                    if val_iri_match:
                        aval = expand_iri(aval, rtbase)
                    elif fullaprop in interpretations:
                        aval = interpretations[fullaprop](aval, rid=rid, fullprop=fullaprop, base=base, model=model)
                    if aval is not None:
                        attrs[fullaprop] = aval

            fullprop = expand_iri(prop, schemabase)
            if typeindic == RES_VAL:
                val = expand_iri(val, rtbase)
                model.add(rid, fullprop, val, attrs)
            elif typeindic == TEXT_VAL:
                if '@lang' not in attrs: attrs['@lang'] = default_lang
                model.add(rid, fullprop, val, attrs)
            elif typeindic == UNKNOWN_VAL:
                val_iri_match = URI_EXPLICIT_PAT.match(val)
                if val_iri_match:
                    val = expand_iri(val, rtbase)
                elif fullprop in interpretations:
                    val = interpretations[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=model)
                if val is not None:
                    model.add(rid, fullprop, val, attrs)

            #resinfo = AB_RESOURCE_PAT.match(val)
            #if resinfo:
            #    val = resinfo.group(1)
            #    valtype = resinfo.group(3)
            #    if not val: val = model.generate_resource()
            #    if valtype: attrs[TYPE_REL] = valtype

    return document_iri
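# Same kind of sketch for this newer serialization module, using only the newmodel API
# shown in its doctest. The autotype value is an illustrative placeholder, not a
# documented IRI.
def _literate_parse_demo():
    from versa.driver.memory import newmodel
    m = newmodel()
    conf = {'autotype-h1': 'http://example.org/vocab/Collection'}  #hypothetical type IRI
    doc_iri = parse(open('test/resource/poetry.md').read(), m, config=conf)
    return doc_iri, m.size()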
    def run(self, littext, g):
        """
        Translate Onya Literate (Markdown syntax) into an Onya graph

        littext -- markdown source text
        g -- Onya graph to take the output relationship

        Returns: Set of nodes defined in the source

        Note: One of the nodes usually represents the graph itself. It should be the only
        node of type @graph (Onya graph). If there are multiple such nodes a warning will
        be issued.

        Each generated graph has a property (`@base`) with the overall base URI specified
        in the Markdown file. If there is no such specification this property is omitted

        >>> from onya.graph import graph
        >>> from onya.serial.literate import parse
        >>> g = graph()
        >>> parse(open('test/resource/poetry.md').read(), g)
        ...
        >>> len(g)
        3
        """
        self.base = self.schemabase = self.rtbase = \
            self.document_iri = self.default_lang = None
        self.new_nodes = set()

        # Parse the Markdown
        # Alternately:
        # from xml.sax.saxutils import escape, unescape
        # h = markdown.markdown(escape(md.decode(encoding)), output_format='html5')
        # Note: even using safe_mode this should not be presumed safe from tainted input
        # h = markdown.markdown(md.decode(encoding), safe_mode='escape', output_format='html5')
        h = markdown.markdown(littext, safe_mode='escape', output_format='html5',
                              extensions=[self.comment_ext])

        # doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))
        tb = treebuilder()
        h = '<html>' + h + '</html>'
        root = html5.parse(h)

        # Each section contains one resource description, but the special one named @docheader contains info to help interpret the rest
        first_h1 = next(select_name(descendants(root), 'h1'))

        # Doc header element, if any
        docheader = next(select_value(select_name(descendants(root), 'h1'), '@docheader'),
                         None)  # //h1[.="@docheader"]
        sections = filter(lambda x: x.xml_value != '@docheader',
                          select_name_pattern(descendants(root), HEADER_PAT))
                          # //h1[not(.="@docheader")]|h2[not(.="@docheader")]|h3[not(.="@docheader")]

        print(docheader)
        if docheader is not None:
            self.handle_docheader(docheader)

        # Go through the resources expressed in remaining sections
        for sect in sections:
            # header in one of 4 forms: "ResourceID" "ResourceID [ResourceType]" "[ResourceType]" or "[]"
            # 3rd & 4th forms have no ID given in file (type specified or not). One will be assigned
            # XXX Should we require a resource ID?
            matched = RESOURCE_PAT.match(sect.xml_value)
            if not matched:
                raise ValueError(_('Syntax error in resource header: {0}'.format(sect.xml_value)))
            rid = matched.group(1)
            rtype = matched.group(3)
            if rtype and self.schemabase:
                rtype = self.schemabase(rtype)
            if self.base:
                rid = self.base(rid)

            # Resource type might be set by syntax config
            if not rtype:
                rtype = self.syntaxtypemap.get(sect.xml_name)

            # We have enough info to init the node this section represents
            new_node = node(rid, rtype)

            fields(sect, new_node, schema)

        return self.document_iri
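# Illustrative only: the four resource header forms described in the comments above, as
# they would appear in the literate Markdown source. 'spam' and 'Widget' are placeholder
# names, not part of the Onya vocabulary.
_HEADER_FORMS_EXAMPLE = '''
# spam
# spam [Widget]
# [Widget]
# []
'''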
def test_make_pretty(doc, dep, ind, expected):
    tb = tree.treebuilder()
    root = tb.parse(doc)
    make_pretty(root, dep, ind)
    assert root.xml_encode() == expected
'''
py.test test/uxml/test_treegc.py
'''

import sys
import gc

import pytest

from amara3.uxml import tree
from amara3.uxml.tree import node, text, element
from amara3.uxml.uxpath import context, parse as uxpathparse
#from amara3.util import coroutine

TB = tree.treebuilder()
P = TB.parse

N1 = P('<a>+1+<b i="1.1">+2+<x>1</x></b><c i="1.2"><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')
N10 = P('<a><b>1</b><b>2</b><b>3</b></a>')
N11 = P('<a><b>1</b><c>2</c><d>3</d></a>')
N12 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x></a>')
N13 = P('<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>')
N14 = P('<a><b><x>1</x></b><b><x>2</x></b><b><x>3</x></b><b><x>4</x></b></a>')

V1 = {'a': 1, 'b': 'x', 'a1': N1, 'a1.2': N10}

#uxpath, doc element, result sequence
#Nodes in result are represented as a tuple of node name & concatenation of text node children
import itertools

from amara3.uxml.tree import treebuilder, element
from amara3.util import coroutine

doc = '<a>1<aa a="1">2<aaa>3</aaa>4<aab>5</aab>6</aa>7<ab a="2">8</ab></a>'

tb = treebuilder()
root = tb.parse(doc)

def descendants(elem):
    for child in elem.xml_children:
        yield child
        if isinstance(child, element):
            yield from descendants(child)

print('descendants')
for e in descendants(root):
    print(e)

def select_elements(source):
    if isinstance(source, element):
        source = source.xml_children
    return filter(lambda x: isinstance(x, element), source)

def select_name(source, name):
    return filter(lambda x: x.xml_name == name, select_elements(source))

def select_name_pattern(source, pat):
    return filter(lambda x: pat.match(x.xml_name) is not None, select_elements(source))
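# A short usage sketch for the selectors above, run against the same parsed doc.
# The element name 'aa' and the regular expression are illustrative choices.
import re

print('elements named "aa"')
for e in select_name(root, 'aa'):
    print(e.xml_name, e.xml_value)

print('elements whose name starts with "aa"')
for e in select_name_pattern(descendants(root), re.compile('^aa')):
    print(e.xml_name)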