def anchorlink(self, on, name='', **kw):
    self._elem(u'link', on)
    if on:
        id = kw.get('id', None)
        if id:
            self._curr.xml_attributes[None, u'id'] = U(id)
        self._curr.xml_attributes[None, u'anchor'] = U(name)
    return ''
def image(self, src=None, **kw):
    e = tree.element(None, u'img')
    self._curr.xml_append(e)
    valid_attrs = ('src', 'width', 'height', 'alt', 'title')
    kw.update({'src': src})
    for key, value in kw.items():
        if key in valid_attrs:
            #Set the attribute on the newly created img element
            e.xml_attributes[None, U(key)] = U(value)
    return ''
def fields(sect):
    '''
    Each section represents a resource and contains a list with its properties.
    This generator parses the list and yields the key/value pairs representing the properties.
    Some properties have attributes, expressed in Markdown as a nested list. If present, these
    attributes are yielded as well; otherwise None is yielded.
    '''
    #import logging; logging.debug(repr(sect))
    #Pull all the list elements until the next header. This accommodates multiple lists in a section
    sect_body_items = results_until(
        sect.xml_select(u'following-sibling::*'),
        u'self::h1|self::h2|self::h3')
    #field_list = [ U(li) for ul in sect.xml_select(u'following-sibling::ul') for li in ul.xml_select(u'./li') ]
    field_list = [li for elem in sect_body_items for li in elem.xml_select(u'li')]

    def parse_pair(pair):
        '''
        Parse each list item into a property pair
        '''
        if pair.strip():
            matched = REL_PAT.match(pair)
            if not matched:
                raise ValueError(
                    _(u'Syntax error in relationship expression: {0}'.format(pair)))
            prop = matched.group(1).strip()
            val = matched.group(2).strip()
            #prop, val = [ part.strip() for part in U(li.xml_select(u'string(.)')).split(u':', 1) ]
            #import logging; logging.debug(repr((prop, val)))
            return prop, val
        return None, None

    #Go through each list item
    for li in field_list:
        #Is there a nested list, which expresses attributes on a property
        if li.xml_select(u'ul'):
            main = ''.join([U(node) for node in results_until(li.xml_select(u'node()'), u'self::ul')])
            #main = li.xml_select(u'string(ul/preceding-sibling::node())')
            prop, val = parse_pair(main)
            subfield_list = [sli for sli in li.xml_select(u'ul/li')]
            subfield_dict = dict([parse_pair(U(pair)) for pair in subfield_list])
            if None in subfield_dict:
                del subfield_dict[None]
            yield prop, val, subfield_dict
        #Just a regular, unadorned property
        else:
            prop, val = parse_pair(U(li))
            if prop:
                yield prop, val, None
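#Illustrative sketch (an assumption, not taken from the source): the generator above is
#meant to walk Markdown lists of roughly the shape shown below, where a nested list carries
#attributes for the property on its parent item. The exact "prop: value" syntax is governed
#by REL_PAT, which is defined elsewhere, so treat this shape as hypothetical.
#
#    * title: Uhuru
#    * link: http://example.org/uhuru
#        * rel: alternate
#
#Against such input, the expected yields would look something like:
EXPECTED_FIELDS_SKETCH = [
    (u'title', u'Uhuru', None),
    (u'link', u'http://example.org/uhuru', {u'rel': u'alternate'}),
]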
def castToTerm(node):
    if node.xml_local == 'bnode':
        return BNode(u'')
    elif node.xml_local == 'uri':
        return URIRef(U(node))
    elif node.xml_local == 'literal':
        if node.xml_select('string(@datatype)'):
            dT = URIRef(U(node.xml_select('string(@datatype)')))
            return Literal(U(node), datatype=dT)
        else:
            return Literal(U(node))
    else:
        raise NotImplementedError()
def factory(rest_uri, moin_link=None, opener=None):
    opener = opener or urllib2.build_opener()
    logger.debug('rest_uri: ' + rest_uri)
    req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
    resp = opener.open(req)
    doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #amara.xml_print(self.content_cache)
    metadata, first_id = metadata_dict(generate_metadata(doc))
    metadata = metadata[first_id]
    akara_type = U(metadata[u'ak-type'])
    logger.debug('Type: ' + akara_type)
    try:
        #Older Moin CMS resource types are implemented by registration to the global node.NODES
        cls = node.NODES[akara_type]
    except KeyError:
        #Newer Moin CMS resource types are implemented by discovery of a URL,
        #to which a POST request executes the desired action
        return node.ENDPOINTS and (rest_uri, akara_type, node.ENDPOINTS[akara_type], doc, metadata, original_wiki_base)
    else:
        instance = cls(rest_uri, moin_link, opener, cache=(doc, metadata, original_wiki_base))
        return instance
def heading(self, on, depth, id=None, **kw):
    #Remember the depth of the first heading, and adapt the current depth accordingly
    if not self._base_depth:
        self._base_depth = depth
    depth = max(depth + (2 - self._base_depth), 2)
    name = u's%i' % depth
    if on:
        found = None
        parent_depth = depth - 1
        while not found:
            found = self._curr.xml_select(u'ancestor-or-self::' + u's%i' % (parent_depth))
            parent_depth -= 1
            if found:
                break
        #print name, found
        self._curr = found[0]
        e = tree.element(None, name)
        id = U(id) if id else u''
        e.xml_attributes[None, u'title'] = id
        e.xml_attributes[None, u'id'] = id
        self._curr.xml_append(e)
        self._curr = e
        e = tree.element(None, u'title')
        self._curr.xml_append(e)
        self._curr = e
    else:
        parent = self._curr.xml_parent
        if self._curr.xml_local == u'title':
            parent.xml_remove(self._curr)
        self._curr = parent
    return ''
def pagelink(self, on, pagename='', page=None, **kw):
    FormatterBase.pagelink(self, on, pagename, page, **kw)
    if page is None:
        page = Page(self.request, pagename, formatter=self)
    link_text = page.link_to(self.request, on=on, **kw)
    self._curr.xml_append(tree.text(U(link_text)))
    return ''
def definition_list(self, list_path, contextnode=None, patterns=None):
    '''
    Helper to construct a dictionary from an indicated definition list on the page
    '''
    #FIXME: rethink this "caching" business
    #Use defaultdict instead, for performance
    patterns = patterns or {None: lambda x: U(x) if x else None}
    doc, metadata, original_wiki_base = self.cache
    contextnode = contextnode or doc.article
    top = contextnode.xml_select(list_path)
    if not top:
        return None
    #Go over the glossentries, and map from term to def, applying the matching
    #unit transform function from the patterns dict
    result = dict(
        (U(i.glossterm), patterns.get(U(i.glossterm), patterns[None])(i.glossdef))
        for i in top[0].glossentry)
    #logger.debug("definition_list: " + repr(result))
    return result
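#Hypothetical usage sketch (the `page` name and the XPath are assumptions, not from the
#source): given an instance of this node class with its DocBook cache populated, pull the
#first glosslist on the page into a plain dict, stripping whitespace from each definition.
def definition_list_example(page):
    patterns = {None: lambda x: U(x).strip() if x else None}
    return page.definition_list(u'glosslist[1]', patterns=patterns)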
def list_records(self, set="", resumption_token="", metadataPrefix=""):
    '''
    List records. Use either the resumption token or the set id.
    '''
    if resumption_token:
        params = {'verb': 'ListRecords', 'resumptionToken': resumption_token}
    else:
        params = {'verb': 'ListRecords', 'metadataPrefix': metadataPrefix, 'set': set}
    qstr = urllib.urlencode(params)
    url = self.root + '?' + qstr
    self.logger.debug('OAI request URL: {0}'.format(url))
    start_t = time.time()
    resp, content = self.h.request(url)
    retrieved_t = time.time()
    self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
    if metadataPrefix == "mods" or metadataPrefix == "marc":
        xml_content = XML_PARSE(content)
        records = []
        for record in xml_content["OAI-PMH"]["ListRecords"]["record"]:
            id = record["header"]["identifier"]
            if "null" not in id:
                records.append((id, record))
        if "resumptionToken" in xml_content["OAI-PMH"]["ListRecords"]:
            resumption_token = xml_content["OAI-PMH"]["ListRecords"]["resumptionToken"]
        else:
            resumption_token = ''
    else:
        doc = bindery.parse(url, model=LISTRECORDS_MODELS[metadataPrefix])
        records, first_id = metadata_dict(generate_metadata(doc), nesteddict=False)
        for id_, props in records:
            for k, v in props.iteritems():
                props[k] = [U(item) for item in v]
        if (doc.OAI_PMH.ListRecords is not None) and (doc.OAI_PMH.ListRecords.resumptionToken is not None):
            resumption_token = U(doc.OAI_PMH.ListRecords.resumptionToken)
        else:
            resumption_token = ''
    return {'records': records, 'resumption_token': resumption_token}
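#Hypothetical usage sketch (the `client` object and set id are assumptions, not from the
#source): follow resumption tokens until the repository stops returning one, accumulating
#every record in the set. The metadataPrefix is passed on each call so the same parsing
#branch is used throughout.
def harvest_all_example(client, set_id, prefix='oai_dc'):
    result = client.list_records(set=set_id, metadataPrefix=prefix)
    all_records = list(result['records'])
    while result['resumption_token']:
        result = client.list_records(resumption_token=result['resumption_token'],
                                     metadataPrefix=prefix)
        all_records.extend(result['records'])
    return all_records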
def code_area(self, on, code_id, code_type='code', show=0, start=-1, step=-1, msg=None):
    self._elem(u'codearea', on)
    if on:
        self._curr.xml_attributes[None, u'id'] = U(code_id)
    return ''
def parseResults(sparqlRT):
    from amara import bindery
    actualRT = []
    doc = bindery.parse(
        sparqlRT,
        prefixes={u'sparql': u'http://www.w3.org/2005/sparql-results#'})
    askAnswer = doc.xml_select('string(/sparql:sparql/sparql:boolean)')
    if askAnswer:
        askAnswer = U(askAnswer)
        actualRT = askAnswer == u'true'
    else:
        for result in doc.xml_select('/sparql:sparql/sparql:results/sparql:result'):
            currBind = {}
            for binding in result.binding:
                varVal = U(binding.name)
                var = Variable(varVal)
                term = castToTerm(binding.xml_select('*')[0])
                currBind[var] = term
            if currBind:
                actualRT.append(currBind)
    return actualRT
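#Minimal sketch (an assumption, not from the source) of feeding a canned SPARQL XML result
#document through parseResults. It only relies on Amara and rdflib being importable, plus
#the castToTerm helper defined above.
SPARQL_XML_SAMPLE = '''<?xml version="1.0"?>
<sparql xmlns="http://www.w3.org/2005/sparql-results#">
  <head><variable name="s"/></head>
  <results>
    <result>
      <binding name="s"><uri>http://example.org/thing</uri></binding>
    </result>
  </results>
</sparql>
'''

def parse_results_example():
    #Expected: one row binding Variable('s') to URIRef('http://example.org/thing')
    return parseResults(SPARQL_XML_SAMPLE)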
def process(resource, context):
    subj = interpret(resource.id)
    for rel in resource.xml_select('*'):
        if rel.xml_name == u'rel':
            #Rel id is in an attribute
            pass
        else:
            #Look up rel id from abbrs
            relid = abbrs[rel.xml_local]
        val = U(rel)
        attrs = {}
        if context:
            attrs[u'@context'] = context
        for ans, aname in rel.xml_attributes:
            aval = rel.xml_attributes[ans, aname]
            if aname == u'value':
                val = interpret(rel.value)
            else:
                attrs[abbrs.get(U(aname), U(aname))] = interpret(U(aval))
        print(subj, relid, val, attrs)
        #model.add(subj, relid, val, attrs)
    return
def normalize_generated_ids(meta_list):
    pat = re.compile(r'r(\d+)e')

    #Takes an ID such as 'r1234e0e4' and returns 'r*e0e4'.
    def normalize_id(id):
        m = pat.match(id)
        if m:
            id = 'r*e' + id[m.end():]
        return id

    for i, (s, p, o) in enumerate(meta_list):
        s = normalize_id(s)
        o = normalize_id(U(o))
        meta_list[i] = (s, p, o)
    return meta_list
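#Illustrative sketch (an assumption, not from the source): generated IDs of the
#'r<digits>e...' form collapse to 'r*e...', so metadata lists from different parse runs
#can be compared without caring about the run-specific numbers.
def normalize_example():
    sample = [(u'r1234e0e4', u'title', u'r1234e0e8')]
    #Expected result: [(u'r*e0e4', u'title', u'r*e0e8')]
    return normalize_generated_ids(sample)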
def get_record(self, id):
    params = {'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': id}
    qstr = urllib.urlencode(params)
    url = self.root + '?' + qstr
    self.logger.debug('OAI request URL: {0}'.format(url))
    start_t = time.time()
    resp, content = self.h.request(url)
    retrieved_t = time.time()
    self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
    doc = bindery.parse(url, model=OAI_GETRECORD_MODEL)
    record, rid = metadata_dict(generate_metadata(doc), nesteddict=False)
    for id_, props in (record if isinstance(record, list) else [record]):
        for k, v in props.iteritems():
            props[k] = [U(item) for item in v]
    return {'record': record}
def lang(self, on, lang_name):
    self._elem(u'div', on)
    if on:
        self._curr.xml_attributes[None, u'lang'] = U(lang_name)
    return ''
def endDocument(self):
    #Yuck! But Moin seems to insist on a Unicode object result (see MoinMoin.parser.text_moin_wiki.Parser.scan)
    #print "endDocument", repr(self._doc.xml_encode(encoding=config.charset).decode(config.charset))
    return U(self._doc.xml_encode(encoding=config.charset))
def startDocument(self, pagename):
    self._curr = tree.element(None, u's1')
    self._curr.xml_attributes[None, u'title'] = U(pagename)
    self._doc.xml_append(self._curr)
    return ''
def code_token(self, on, tok_type):
    self._elem(u'codetoken', on)
    if on:
        self._curr.xml_attributes[None, u'type'] = U(tok_type)
    return ''
def render(self):
    '''
    The typical approach is along the lines of "Style-free XSLT Style Sheets"
    * http://www.xml.com/pub/a/2000/07/26/xslt/xsltstyle.html
    * http://www.cocooncenter.org/articles/stylefree.html
    But using div/@id rather than custom elements
    '''
    doc, metadata, original_wiki_base = self.cache
    self.content = content_handlers(original_wiki_base)
    #metadata = doc.article.xml_model.generate_metadata(doc)
    #import pprint
    #pprint.pprint(resources)
    '''
    akara:type:: [[http://purl.org/xml3k/akara/xmlmodel/cms/folder|folder]]
    title:: A page
    template:: http://wiki.example.com/Site;attachment=foo.xslt ##Just XSLT for now. Plan to support other templating systems soon
    link:: [[http://example.org|]] rel=...
    meta:: dc:Creator value=Uche Ogbuji
    script:: `...` ##preferably they'd only use linked scripts: [[myscript...]]
    '''
    page_id = doc.article.xml_nodeid
    header = doc.article.glosslist[0]
    #node_type = first_item(header.xml_select(u'glossentry[glossterm = "akara:type"]/glossdef'))
    template = unicode(
        first_item(header.xml_select(u'glossentry[glossterm = "template"]/glossdef'))).strip()
    template = os.path.join(self.outputdir, template)
    title = first_item(header.xml_select(u'glossentry[glossterm = "title"]/glossdef'))
    #title = resources[articleid]['title']
    #sections = dict([ (unicode(s.title), s) for s in page.article.section ])
    #print sections
    # if unicode(g.glossterm) == u'page:header' ]
    #authors = [ a
    #    for a in page.article.section.glosslist.glossentry
    #    if unicode(a.glossterm) == u'entry:authors'
    #]
    #title = article.xml_select(u'section[@title = ]')
    #revdate = dateparse(unicode(page.article.articleinfo.revhistory.revision.date))
    #if revdate.tzinfo == None: revdate = revdate.replace(tzinfo=UTC)

    #Create output file
    print >> sys.stderr, 'Writing to ', self.output
    buf = StringIO()
    w = structwriter(indent=u"yes", stream=buf)
    w.feed(
        ROOT(
            E(
                (XHTML_NAMESPACE, u'html'),
                {(XML_NAMESPACE, u'xml:lang'): u'en'},
                E(
                    u'head',
                    E(u'title', title),
                    E(u'meta', {u'content': U(metadata[u'ak-updated']), u'name': u'updated'}),
                    #E(u'link', {u'href': unicode(uri), u'rel': u'alternate', u'title': u"Permalink"}),
                ),
                E(u'body', (self.content.dispatch(s) for s in doc.article.section)),
            ),
        ))
    with open(self.output, 'w') as output:
        #text = f.read().rstrip()
        #print buf.getvalue()
        transform(buf.getvalue(), template, output=output)
    return
import re
from itertools import groupby

import amara
from amara.lib import U
from amara.tree import element, text

SOURCE = '''<catalog>
  <book>
    <title>Spam for Supper</title>
    <authors>By A.X. Ham and Franco Bacon</authors>
  </book>
</catalog>'''

EXTRACT_AUTHORS_PAT = r'(\s*By\s*)|(\s*,\s*)|(\s*and\s*)'
EXTRACT_AUTHORS_PAT_GROUPS = 4

doc = amara.parse(SOURCE)
for author_node in doc.xml_select(u'/catalog/book/authors'):
    authors = re.split(EXTRACT_AUTHORS_PAT, U(author_node))
    for n in author_node.xml_children:
        author_node.xml_remove(n)
    #Collect the regex match into the regex-defined groups
    for i, subenum in groupby(enumerate(authors), lambda i: i[0] // EXTRACT_AUTHORS_PAT_GROUPS):
        matchgroup = [group for i, group in subenum]
        if matchgroup[0]:
            link = element(None, u'a')
            link.xml_attributes[None, u'href'] = 'http://example.org'
            link.xml_append(text(matchgroup[0]))
            author_node.xml_append(link)
        for match in matchgroup[1:]:
            if match:
                author_node.xml_append(text(match))

doc.xml_write()
print
def attachment_link(self, on, url=None, **kw):
    self._elem(u'attachment', on)
    if on:
        self._curr.xml_attributes[None, u'href'] = U(url)
    return ''
def anchordef(self, id):
    e = tree.element(None, u'anchor')
    self._curr.xml_append(e)
    #Set the id on the newly created anchor element
    e.xml_attributes[None, u'id'] = U(id)
    return ''
def receive_items():
    '''
    Receives each record and processes it by creating an item dict which is then forwarded to the sink
    '''
    ix = 1
    while True:
        rec = yield
        recid = u'_' + str(ix)

        leader = U(rec.xml_select(u'ma:leader', prefixes=PREFIXES))
        work_item = {
            u'id': u'work' + recid,
            u'label': recid,
            #u'label': u'{0}, {1}'.format(row['TPNAML'], row['TPNAMF']),
            u'type': u'WorkRecord',
        }
        print >> sys.stderr, 'Begin processing Work: ', work_item[u'id']

        #Instance starts with same as work, with leader added
        instance_item = {
            u'leader': leader,
        }
        instance_item.update(work_item)
        instance_item[u'id'] = u'instance' + recid
        instance_item[u'type'] = u'InstanceRecord'
        work_item[u'instance'] = u'instance' + recid

        for cf in rec.xml_select(u'ma:controlfield', prefixes=PREFIXES):
            key = u'cftag_' + U(cf.xml_select(u'@tag'))
            val = U(cf)
            if list(cf.xml_select(u'ma:subfield', prefixes=PREFIXES)):
                for sf in cf.xml_select(u'ma:subfield', prefixes=PREFIXES):
                    code = U(sf.xml_select(u'@code'))
                    sfval = U(sf)
                    #For now assume all leader fields are instance level
                    instance_item[key + code] = sfval
            else:
                #For now assume all leader fields are instance level
                instance_item[key] = val

        for df in rec.xml_select(u'ma:datafield', prefixes=PREFIXES):
            code = U(df.xml_select(u'@tag'))
            key = u'dftag_' + code
            val = U(df)
            if list(df.xml_select(u'ma:subfield', prefixes=PREFIXES)):
                subfields = dict(
                    ((U(sf.xml_select(u'@code')), U(sf))
                     for sf in df.xml_select(u'ma:subfield', prefixes=PREFIXES)))
                lookup = code
                #See if any of the field codes represents a reference to an object which can be materialized
                handled = False
                if code in MATERIALIZE:
                    (subst, extra_props) = MATERIALIZE[code]
                    props = {u'marccode': code}
                    props.update(extra_props)
                    #props.update(other_properties)
                    props.update(subfields)
                    #work_item[FIELD_RENAMINGS.get(code, code)] = subid
                    #subid = subobjs.add(props)
                    if ix < len(recs):
                        subid = subobjs.add(props)
                        objects_sink.write(",\n")
                    else:
                        subid = subobjs.add(props, last=True)
                    if code in INSTANCE_FIELDS:
                        instance_item.setdefault(subst, []).append(subid)
                    elif code in WORK_FIELDS:
                        work_item.setdefault(subst, []).append(subid)
                    handled = True

                if code in MATERIALIZE_VIA_ANNOTATION:
                    (subst, extra_object_props, extra_annotation_props) = MATERIALIZE_VIA_ANNOTATION[code]
                    object_props = {u'marccode': code}
                    object_props.update(extra_object_props)
                    #props.update(other_properties)
                    #Separate annotation subfields from object subfields
                    object_subfields = subfields.copy()
                    annotation_subfields = {}
                    for k, v in object_subfields.items():
                        if code + k in ANNOTATIONS_FIELDS:
                            annotation_subfields[k] = v
                            del object_subfields[k]
                    object_props.update(object_subfields)
                    #objectid = subobjs.add(object_props)
                    #if ix < len(recs):
                    #    objects_sink.write(",\n")
                    if ix < len(recs):
                        objectid = subobjs.add(object_props)
                        objects_sink.write(",\n")
                    else:
                        objectid = subobjs.add(object_props, last=True)

                    annid = u'annotation' + recid
                    annotation_item = {
                        u'id': annid,
                        u'label': recid,
                        subst: objectid,
                        u'type': u'Annotation',
                        u'on_work': work_item[u'id'],
                        u'on_instance': instance_item[u'id'],
                    }
                    annotation_item.update(extra_annotation_props)
                    annotation_item.update(annotation_subfields)
                    emitter(annotation_item, annotations_sink)
                    if ix < len(recs):
                        annotations_sink.write(",\n")
                    #annotations_sink.write(annotation_item)
                    print >> sys.stderr, 'Processing annotation: ', annotation_item[u'id'], "\n"
                    if code in INSTANCE_FIELDS:
                        instance_item.setdefault('annotation', []).append(annid)
                    elif code in WORK_FIELDS:
                        work_item.setdefault('annotation', []).append(annid)
                    #The actual subfields go to the annotations sink
                    #annotations_props = {u'annotates': instance_item[u'id']}
                    #annotations_props.update(props)
                    #subid = subobjs.add(annotations_props, annotations_sink)
                    #The reference is from the instance ID
                    #instance_item.setdefault(subst, []).append(subid)
                    handled = True
                    #work_item.setdefault(FIELD_RENAMINGS.get(code, code), []).append(subid)

                #See if any of the field+subfield codes represents a reference to an object which can be materialized
                if not handled:
                    for k, v in subfields.items():
                        lookup = code + k
                        if lookup in MATERIALIZE:
                            (subst, extra_props) = MATERIALIZE[lookup]
                            props = {u'marccode': code, k: v}
                            props.update(extra_props)
                            #print >> sys.stderr, lookup, k, props,
                            if ix < len(recs):
                                subid = subobjs.add(props)
                                objects_sink.write(",\n")
                            else:
                                subid = subobjs.add(props, last=True)
                            if lookup in INSTANCE_FIELDS or code in INSTANCE_FIELDS:
                                instance_item.setdefault(subst, []).append(subid)
                            elif lookup in WORK_FIELDS or code in WORK_FIELDS:
                                work_item.setdefault(subst, []).append(subid)
                            handled = True
                        else:
                            field_name = u'dftag_' + lookup
                            if lookup in FIELD_RENAMINGS:
                                field_name = FIELD_RENAMINGS[lookup]
                            #Handle the simple substitution of a label name for a MARC code
                            if lookup in INSTANCE_FIELDS or code in INSTANCE_FIELDS:
                                instance_item.setdefault(field_name, []).append(v)
                            elif lookup in WORK_FIELDS or code in WORK_FIELDS:
                                work_item.setdefault(field_name, []).append(v)
                            #print >> sys.stderr, lookup, key
            elif not handled:
                if code in INSTANCE_FIELDS:
                    instance_item[key] = val
                elif code in WORK_FIELDS:
                    work_item[key] = val
            else:
                if code in INSTANCE_FIELDS:
                    instance_item[key] = val
                elif code in WORK_FIELDS:
                    work_item[key] = val

        #link = work_item.get(u'cftag_008')

        #Handle ISBNs re: https://foundry.zepheira.com/issues/1976
        new_instances = []

        isbns = instance_item.get('isbn', [])

        def isbn_list(isbns):
            isbn_tags = {}
            for isbn in isbns:
                parts = isbn.split(None, 1)
                #Remove any cruft from ISBNs. Leave just the digits
                cleaned_isbn = NON_ISBN_CHARS.subn(u'', parts[0])[0]
                if len(parts) == 1:
                    #FIXME: More generally strip non-digit chars from ISBNs
                    isbn_tags[cleaned_isbn] = None
                else:
                    isbn_tags[cleaned_isbn] = parts[1]
            c14ned = canonicalize_isbns(isbn_tags.keys())
            for c14nisbn, variants in invert_dict(c14ned).items():
                #We'll use the heuristic that the longest ISBN number is the best
                variants.sort(key=len, reverse=True)  # sort by descending length
                yield variants[0], isbn_tags[variants[0]]
            return  # list(isbnset)

        base_instance_id = instance_item[u'id']
        instance_ids = []
        subscript = ord(u'a')
        for subix, (inum, itype) in enumerate(isbn_list(isbns)):
            #print >> sys.stderr, subix, inum, itype
            subitem = instance_item.copy()
            subitem[u'isbn'] = inum
            subitem[u'id'] = base_instance_id + (unichr(subscript + subix) if subix else u'')
            if itype:
                subitem[u'isbnType'] = itype
            instance_ids.append(subitem[u'id'])
            new_instances.append(subitem)
            isbnnu_url = ISBNNU_PAT.format(inum)
            subitem[u'isbnnu'] = isbnnu_url
            #U(doc.xml_select(u'/rss/channel/item/link'))
            subitem[u'openlibcover'] = OPENLIBRARY_COVER_PAT.format(inum)
            #time.sleep(2) #Be polite!

        #instance_item[u'isbn'] = isbns[0]
        if not new_instances:
            #Make sure it's created as an instance even if it has no ISBN
            new_instances.append(instance_item)
            instance_ids.append(base_instance_id)

        work_item[u'instance'] = instance_ids

        special_properties = {}
        for k, v in process_leader(leader):
            special_properties.setdefault(k, set()).add(v)
        for k, v in process_008(instance_item[u'cftag_008']):
            special_properties.setdefault(k, set()).add(v)
        #We get some repeated values out of leader & 008 processing, and we want to
        #remove dupes, so we work with sets then convert to lists
        for k, v in special_properties.items():
            special_properties[k] = list(v)
        instance_item.update(special_properties)

        #reduce lists of just one item
        for k, v in work_item.items():
            if type(v) is list and len(v) == 1:
                work_item[k] = v[0]

        #work_sink.write(work_item)
        emitter(work_item, work_sink)
        if ix < len(recs):
            work_sink.write(",\n")

        def send_instance(instance):
            print >> sys.stderr, 'Processing instance: ', instance[u'id']
            emitter(instance, instance_sink)

        i = 0
        for ninst in new_instances:
            i += 1
            send_instance(ninst)
            if i < len(new_instances):
                instance_sink.write(",\n")
        if ix < len(recs):
            instance_sink.write(",\n")

        print >> sys.stderr, 'Finished processing Work: ', work_item[u'id'], "\n"

        ix += 1
    return
def icon(self, type_):
    #Emit a self-contained element: open, set the attribute, then close
    self._elem(u'icon', 1)
    self._curr.xml_attributes[None, u'type'] = U(type_)
    self._elem(u'icon', 0)
    return ''
def smiley(self, text):
    self._curr.xml_append(tree.text(U(text)))
    return ''
def attachment_drawing(self, url, text, **kw):
    #Emit a self-contained element wrapping the drawing text
    self._elem(u'attachmentimage', 1)
    self._curr.xml_attributes[None, u'href'] = U(url)
    self._curr.xml_append(tree.text(U(text)))
    self._elem(u'attachmentimage', 0)
    return ''
def interwikilink(self, on, interwiki='', pagename='', **kw):
    self._elem(u'interwiki', on)
    if on:
        self._curr.xml_attributes[None, u'wiki'] = U(interwiki)
        self._curr.xml_attributes[None, u'pagename'] = U(pagename)
    return ''
def from_markdown(md, output, encoding='utf-8', config=None):
    """
    Translate the Versa Markdown syntax into Versa model relationships

    md -- Markdown source text
    output -- Versa model to take the output relationships
    encoding -- character encoding (defaults to UTF-8)

    No return value
    """
    #Set up configuration to interpret the conventions for the Markdown
    config = config or {}
    #This mapping takes syntactical elements such as the various header levels in Markdown
    #and associates a resource type with the specified resources
    syntaxtypemap = {}
    if config.get('autotype-h1'):
        syntaxtypemap[u'h1'] = config.get('autotype-h1')
    if config.get('autotype-h2'):
        syntaxtypemap[u'h2'] = config.get('autotype-h2')
    if config.get('autotype-h3'):
        syntaxtypemap[u'h3'] = config.get('autotype-h3')
    interp = config.get('interpretations', {})

    #Map the interpretation IRIs to functions to do the data prep
    for prop, interp_key in interp.iteritems():
        if interp_key in PREP_METHODS:
            interp[prop] = PREP_METHODS[interp_key]
        else:
            #Just use the identity, i.e. no-op
            interp[prop] = lambda x, **kwargs: x

    #Parse the Markdown
    h = markdown.markdown(md.decode(encoding))
    doc = html.markup_fragment(inputsource.text(h.encode('utf-8')))

    #Each section contains one resource description, but the special one named @docheader
    #contains info to help interpret the rest
    top_section_fields = results_until(
        doc.xml_select(u'//h1[1]/following-sibling::h2'), u'self::h1')
    docheader = doc.xml_select(u'//h1[.="@docheader"]')[0]
    sections = doc.xml_select(u'//h1|h2|h3[not(.="@docheader")]')

    def fields(sect):
        '''
        Each section represents a resource and contains a list with its properties.
        This generator parses the list and yields the key/value pairs representing the properties.
        Some properties have attributes, expressed in Markdown as a nested list. If present, these
        attributes are yielded as well; otherwise None is yielded.
        '''
        #import logging; logging.debug(repr(sect))
        #Pull all the list elements until the next header. This accommodates multiple lists in a section
        sect_body_items = results_until(
            sect.xml_select(u'following-sibling::*'),
            u'self::h1|self::h2|self::h3')
        #field_list = [ U(li) for ul in sect.xml_select(u'following-sibling::ul') for li in ul.xml_select(u'./li') ]
        field_list = [li for elem in sect_body_items for li in elem.xml_select(u'li')]

        def parse_pair(pair):
            '''
            Parse each list item into a property pair
            '''
            if pair.strip():
                matched = REL_PAT.match(pair)
                if not matched:
                    raise ValueError(
                        _(u'Syntax error in relationship expression: {0}'.format(pair)))
                prop = matched.group(1).strip()
                val = matched.group(2).strip()
                #prop, val = [ part.strip() for part in U(li.xml_select(u'string(.)')).split(u':', 1) ]
                #import logging; logging.debug(repr((prop, val)))
                return prop, val
            return None, None

        #Go through each list item
        for li in field_list:
            #Is there a nested list, which expresses attributes on a property
            if li.xml_select(u'ul'):
                main = ''.join([U(node) for node in results_until(li.xml_select(u'node()'), u'self::ul')])
                #main = li.xml_select(u'string(ul/preceding-sibling::node())')
                prop, val = parse_pair(main)
                subfield_list = [sli for sli in li.xml_select(u'ul/li')]
                subfield_dict = dict([parse_pair(U(pair)) for pair in subfield_list])
                if None in subfield_dict:
                    del subfield_dict[None]
                yield prop, val, subfield_dict
            #Just a regular, unadorned property
            else:
                prop, val = parse_pair(U(li))
                if prop:
                    yield prop, val, None

    #Gather the document-level metadata
    base = propbase = rbase = None
    for prop, val, subfield_dict in fields(docheader):
        if prop == '@base':
            base = val
        if prop == '@property-base':
            propbase = val
        if prop == '@resource-base':
            rbase = val
    if not propbase:
        propbase = base
    if not rbase:
        rbase = base

    #Go through the resources expressed in remaining sections
    for sect in sections:
        #The header can take one of 4 forms: "ResourceID", "ResourceID [ResourceType]",
        #"[ResourceType]" or "[]". The 3rd form is for an anonymous resource with specified type
        #and the 4th an anonymous resource with unspecified type
        matched = RESOURCE_PAT.match(U(sect))
        if not matched:
            raise ValueError(
                _(u'Syntax error in resource header: {0}'.format(U(sect))))
        rid = matched.group(1)
        rtype = matched.group(3)
        if rid:
            rid = I(iri.absolutize(rid, base))
        if not rid:
            rid = I(iri.absolutize(output.generate_resource(), base))
        if rtype:
            rtype = I(iri.absolutize(rtype, base))
        #Resource type might be set by syntax config
        if not rtype:
            rtype = syntaxtypemap.get(sect.xml_local)
        if rtype:
            output.add(rid, RDFTYPE, rtype)
        #Add the properties
        for prop, val, subfield_dict in fields(sect):
            attrs = subfield_dict or {}
            fullprop = I(iri.absolutize(prop, propbase))
            resinfo = AB_RESOURCE_PAT.match(val)
            if resinfo:
                val = resinfo.group(1)
                valtype = resinfo.group(3)
                if not val:
                    val = output.generate_resource()
                if valtype:
                    attrs[RDFTYPE] = valtype
            if fullprop in interp:
                val = interp[fullprop](val, rid=rid, fullprop=fullprop, base=base, model=output)
                if val is not None:
                    output.add(rid, fullprop, val)
            else:
                output.add(rid, fullprop, val, attrs)

    return base
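#Hypothetical usage sketch (every name below is an assumption, not the library's API):
#drive from_markdown with a tiny in-memory stand-in for a Versa model, implementing just
#the two methods the function calls, add() and generate_resource(). The sample Markdown
#shape follows the conventions described above but is itself an assumption about REL_PAT
#and RESOURCE_PAT.
class StubModel(object):
    def __init__(self):
        self.relationships = []
        self._counter = 0

    def add(self, origin, rel, target, attrs=None):
        self.relationships.append((origin, rel, target, attrs or {}))

    def generate_resource(self):
        self._counter += 1
        return u'__r{0}'.format(self._counter)

VERSA_MD_SAMPLE = '''# @docheader

* @base: http://example.org/

# spam [Book]

* title: Spam for Supper
'''

def from_markdown_example():
    model = StubModel()
    from_markdown(VERSA_MD_SAMPLE, model)
    #Each entry is (resource IRI, property IRI, value, attribute dict)
    return model.relationships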
def attachment_image(self, url, **kw):
    #Emit a self-contained element: open, set the attribute, then close
    self._elem(u'attachmentimage', 1)
    self._curr.xml_attributes[None, u'href'] = U(url)
    self._elem(u'attachmentimage', 0)
    return ''
def url(self, on, url='', css=None, **kw):
    self._elem(u'jump', on)
    self._curr.xml_attributes[None, u'url'] = U(url)
    if css:
        self._curr.xml_attributes[None, u'class'] = U(css)
    return ''