def stringForEntity(node):
    """Return the serialized children of `node` after re-parsing it.

    The node is serialized as UTF-8, wrapped in a temporary ``<norm>``
    element and parsed again.  When the module-level ``expand_entities``
    flag is true, the parser is asked to substitute entities.  If the
    internal DTD subset of the node's document can be serialized, it is
    prepended to the wrapper so that entity declarations are available
    while parsing.

    Returns the concatenated UTF-8 serializations of the children of the
    re-parsed wrapper node.
    """
    text = node.serialize('utf-8')
    wrapped = '<norm>%s</norm>' % text

    # Best effort: prepend the document DTD so entities can be resolved.
    # ``except Exception`` keeps the original fallback (no DTD) without
    # also swallowing KeyboardInterrupt / SystemExit as a bare except did.
    try:
        dtd = node.doc.intSubset()
        source = dtd.serialize('utf-8') + wrapped
        has_dtd = True
    except Exception:
        source = wrapped
        has_dtd = False

    ctxt = libxml2.createDocParserCtxt(source)
    if expand_entities:
        ctxt.replaceEntities(1)
    ctxt.parseDocument()
    tree = ctxt.doc()
    try:
        # With a DTD prepended, the first child of the document is skipped
        # and its next sibling is used as the wrapper node.
        if has_dtd:
            wrapper = tree.children.next
        else:
            wrapper = tree.children

        parts = []
        child = wrapper.children
        while child:
            parts.append(child.serialize('utf-8'))
            child = child.next
        return ''.join(parts)
    finally:
        # Documents created through the libxml2 bindings are not garbage
        # collected; everything needed has been copied into Python strings.
        tree.freeDoc()
def replaceNodeContentsWithText(node,text):
    """Replace all subnodes of `node` with `text` parsed as XML.

    If `node` has no children, its content is simply set to `text`.
    Otherwise `text` is wrapped in the tags produced by
    ``startTagForNode`` / ``endTagForNode``, prefixed with an XML
    declaration (and the internal DTD subset of the module-level ``doc``
    when available) and parsed.  On a parse failure an error message is
    written to stderr and `node` is left untouched.

    Returns None in every case.

    NOTE(review): another definition with the same name appears later in
    this source; if both live in the same module, the later one is the one
    in effect.
    """
    if not node.children:
        node.setContent(text)
        return

    starttag = startTagForNode(node)
    endtag = endTagForNode(node)

    # Add the document DTD so entities are resolved.
    tmp = '<?xml version="1.0" encoding="utf-8" ?>'
    try:
        dtd = doc.intSubset()
        tmp = tmp + dtd.serialize('utf-8')
    except libxml2.treeError:
        pass

    content = '<%s>%s</%s>' % (starttag, text, endtag)
    tmp = tmp + content.encode('utf-8')

    newnode = None
    try:
        ctxt = libxml2.createDocParserCtxt(tmp)
        ctxt.replaceEntities(0)
        ctxt.parseDocument()
        newnode = ctxt.doc()
    except Exception:
        # Reported just below through the ``not newnode`` check.
        pass

    if not newnode:
        print >> sys.stderr, """Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8'))
        return

    newelem = newnode.getRootElement()
    if not (newelem and newelem.children):
        # In practice, this happens with tags such as "<para> </para>"
        # (only whitespace in between): nothing to substitute.
        return

    # Detach every existing child of the node.
    stale = node.children
    while stale:
        following = stale.next
        stale.unlinkNode()
        stale = following

    # Swap the node for a copy of the freshly parsed element.  (The
    # original made an additional, unused copyNodeList() call here, which
    # only allocated a detached node list; it has been dropped.)
    following = node.next
    node.replaceNode(newelem.copyNodeList())
    # NOTE(review): this assigns the ``next`` attribute on the Python
    # wrapper object; whether that affects the underlying tree depends on
    # the libxml2 binding version -- confirm.
    node.next = following
def replaceNodeContentsWithText(node,text):
    """Replace all subnodes of `node` with `text` parsed as XML.

    If `node` has no children, its content is simply set to `text`.
    Otherwise `text` is wrapped in an element named after `node` (bare
    name, no attributes), parsed, and the children of the parsed root
    element are added to `node` after its existing children have been
    unlinked.  On a parse failure an error message is written to stderr
    and `node` is left untouched.

    Returns None in every case.

    NOTE(review): an earlier definition with the same name appears in this
    source; if both live in the same module, this later one is the one in
    effect.
    """
    if not node.children:
        node.setContent(text)
        return

    # The opening tag uses only the element name, not startTagForNode().
    starttag = node.name
    endtag = endTagForNode(node)
    markup = '<%s>%s</%s>' % (starttag, text, endtag)

    # Best effort: prepend the document DTD so entities are resolved.
    # FIXME (kept from the original): including the DTD when entities are
    # not being expanded caused a "Segmentation fault" in
    # libxml2.parseMemory(), hence the expand_entities condition.
    # ``except Exception`` keeps the no-DTD fallback without swallowing
    # KeyboardInterrupt / SystemExit as the bare except did.
    try:
        if expand_entities:
            markup = doc.intSubset().serialize('utf-8') + markup
    except Exception:
        pass

    try:
        ctxt = libxml2.createDocParserCtxt(markup.encode('utf-8'))
        ctxt.replaceEntities(0)
        ctxt.parseDocument()
        newnode = ctxt.doc()
    except Exception:
        print >> sys.stderr, """Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8'))
        return

    newelem = newnode.getRootElement()
    if not (newelem and newelem.children):
        # In practice, this happens with tags such as "<para> </para>"
        # (only whitespace in between): nothing to substitute.
        return

    # Detach every existing child, then attach the parsed children.
    stale = node.children
    while stale:
        following = stale.next
        stale.unlinkNode()
        stale = following
    node.addChildList(newelem.children)
def validate(self, xml, container, line_offset=0, char_offset=0):
    """Validate an XML fragment against the DTD from the plugin settings.

    The fragment is wrapped in a `container` element, preceded by a
    DOCTYPE declaration built from ``self.plugin.settings["dtd"]``,
    encoded as UTF-8 and run through a validating libxml2 parser.  The
    HTML parser is used when the DTD string contains " HTML ".

    The wrapped document and the given offsets are stored on the instance
    (``self.xml``, ``self.xml_line_offset``, ``self.xml_char_offset``) and
    the error bookkeeping (``self.xml_line_char_offsets``,
    ``self.xml_errors``) is reset before parsing; ``self.xml_error`` is
    installed as the parser's error handler.

    Returns a true value only if the parser reports the document as valid
    and ``self.xml_errors`` is still 0 after parsing.
    """
    dtd = self.plugin.settings["dtd"]
    template = u"<!DOCTYPE %s %s>\n<%s>\n%s\n</%s>"
    wrapped = template % (container, dtd, container, xml, container)
    xml = wrapped.encode("utf-8")
    self.plugin.debug("validating %r", xml)

    # Reset the per-run state kept on the instance.
    self.xml = xml
    self.xml_line_offset = line_offset
    self.xml_char_offset = char_offset
    self.xml_line_char_offsets = []
    self.xml_errors = 0

    self.parser = libxml2.createDocParserCtxt(xml)
    self.parser.lineNumbers(1)
    self.parser.validate(1)
    self.parser.setErrorHandler(self.xml_error, None)

    # Pick the parsing entry point based on the DTD string.
    if " HTML " in dtd:
        run_parser = self.parser.htmlParseDocument
    else:
        run_parser = self.parser.parseDocument
    run_parser()

    ok = self.parser.isValid() and self.xml_errors == 0
    # Drop the parser context once the result has been read.
    self.parser = None
    self.plugin.debug("validation result: %r", ok)
    return ok
def normalizeString(text, ignorewhitespace = 1):
    """Normalize a string to be used as key for gettext lookup.

    Removes all unnecessary whitespace.  `text` is wrapped in a temporary
    ``<norm>`` element (preceded by the internal DTD subset of the
    module-level ``doc`` when it can be obtained), parsed, passed through
    ``normalizeNode`` and serialized again.  Finally a single leading and
    a single trailing space are stripped from the result.

    Returns `text` unchanged when `ignorewhitespace` is false, or when the
    wrapped text cannot be parsed (an error message is then written to
    stderr).  Otherwise returns the normalized serialization.
    """
    if not ignorewhitespace:
        return text

    wrapped = '<norm>%s</norm>' % text

    # Best effort: prepend the document DTD so entities are resolved.
    # ``except Exception`` keeps the no-DTD fallback without swallowing
    # KeyboardInterrupt / SystemExit as the bare except did.
    try:
        wrapped = doc.intSubset().serialize('utf-8') + wrapped
    except Exception:
        pass

    try:
        ctxt = libxml2.createDocParserCtxt(wrapped)
        if expand_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        tree = ctxt.doc()
        newnode = tree.getRootElement()
    except Exception:
        print >> sys.stderr, """Error while normalizing string as XML:\n"%s"\n""" % (text)
        return text

    normalizeNode(newnode)

    # NOTE(review): the temporary document is not freed here; adding
    # tree.freeDoc() after serialization would require confirming that
    # normalizeNode() keeps no references to these nodes.
    parts = []
    child = newnode.children
    while child:
        parts.append(child.serialize('utf-8'))
        child = child.next
    result = ''.join(parts)

    result = re.sub('^ ','', result)
    result = re.sub(' $','', result)
    return result