def find_documents(self, session, cache=0):
    """Generate a wrapper document for every component found in the stream.

    Each object in ``self.sources`` is asked to ``process_record`` the
    record held in ``self.stream``.  Every item of every result group is
    serialized and wrapped in a ``c3component`` (or namespaced
    ``c3:component``) element, which is then turned into a
    ``StringDocument``.

    Three kinds of item are understood:

    * a list (SAX events) -- serialized through ``SaxRecord``; the text
      after the last space of the final event is used as the ``event``
      attribute,
    * a string -- escaped and wrapped in a ``<data>`` element,
    * an ``etree._Element`` -- serialized with ``etree.tostring`` and
      tagged with its XPath inside the owning tree.

    Because this is a generator, nothing (including the error for
    ``cache == 1``) happens until the result is iterated.

    :param session: passed through to ``process_record`` and ``get_xml``.
    :param cache: ``0`` yields each document; ``1`` is unsupported; any
        other value appends each document to ``self.documents`` instead
        of yielding it.
    :raises NotImplementedError: when ``cache == 1``.
    :raises ValueError: when an extracted item is of an unknown type.
    """
    # Should extract records by xpath or span and store as X/SGML
    if cache == 1:
        # Nothing to offset into
        raise NotImplementedError
    rec = self.stream
    # NOTE(review): the local-name part of this pattern has no '+' and the
    # character classes start at 1 rather than 0, so only prefixed tags
    # with a one-character local name match.  Left unchanged because it
    # decides which wrapper element is emitted -- confirm the intent.
    hasNsRe = re.compile('<([a-zA-Z1-9_-]+:[a-zA-Z1-9_-])[ >]')
    for src in self.sources:
        raw = src.process_record(session, rec)
        for xp in raw:
            for r in xp:
                if isinstance(r, list):
                    # SAX event list
                    tempRec = SaxRecord(r)
                    docstr = tempRec.get_xml(session)
                    hasNs = hasNsRe.search(docstr)
                    lastEvent = r[-1]
                    saxid = lastEvent[lastEvent.rfind(' ') + 1:]
                    if hasNs:
                        docstr = ('<c3:component xmlns:c3='
                                  '"http://www.cheshire3.org/schemas/component/" '
                                  'parent="%r" event="%s">%s</c3:component>'
                                  % (rec, saxid, docstr))
                    else:
                        docstr = ('<c3component parent="%r" event="%s">'
                                  '%s</c3component>'
                                  % (rec, saxid, docstr))
                elif isinstance(r, str):
                    # Plain text result
                    docstr = ('<c3component parent="%r"><data>%s</data>'
                              '</c3component>' % (rec, escape(r)))
                elif r.__class__ == etree._Element:
                    # Lxml Record.  Exact class match is kept on purpose so
                    # that subclasses of _Element are still rejected below.
                    docstr = etree.tostring(r)
                    tree = r.getroottree()
                    path = tree.getpath(r)
                    if r.nsmap:
                        namespaceList = []
                        for (pref, ns) in r.nsmap.items():
                            if pref is None:
                                # Default namespace: there is no prefix to
                                # bind, so declare it as plain xmlns.
                                namespaceList.append('xmlns="%s"' % ns)
                            else:
                                namespaceList.append('xmlns:%s="%s"'
                                                     % (pref, ns))
                        namespaces = " ".join(namespaceList)
                        # The closing tag must carry the c3: prefix to match
                        # the opening tag, otherwise the XML is malformed.
                        docstr = ('<c3:component xmlns:c3='
                                  '"http://www.cheshire3.org/schemas/component/" '
                                  '%s parent="%r" xpath="%s">%s</c3:component>'
                                  % (namespaces, rec, path, docstr))
                    else:
                        docstr = ('<c3component parent="%r" xpath="%s">'
                                  '%s</c3component>' % (rec, path, docstr))
                else:
                    raise ValueError("Unknown Record Type")
                doc = StringDocument(docstr)
                if cache == 0:
                    yield doc
                else:
                    self.documents.append(doc)
def find_documents(self, session, cache=0):
    """Generate a wrapper document for every component found in the stream.

    Each object in ``self.sources`` is asked to ``process_record`` the
    record held in ``self.stream``.  Every item of every result group is
    serialized and wrapped in a ``c3component`` (or namespaced
    ``c3:component``) element, which is then turned into a
    ``StringDocument``.

    Three kinds of item are understood:

    * a list (SAX events) -- serialized through ``SaxRecord``; the text
      after the last space of the final event is used as the ``event``
      attribute,
    * a string -- escaped and wrapped in a ``<data>`` element,
    * an ``etree._Element`` -- serialized with ``etree.tostring`` and
      tagged with its XPath inside the owning tree.

    Because this is a generator, nothing (including the error for
    ``cache == 1``) happens until the result is iterated.

    :param session: passed through to ``process_record`` and ``get_xml``.
    :param cache: ``0`` yields each document; ``1`` is unsupported; any
        other value appends each document to ``self.documents`` instead
        of yielding it.
    :raises NotImplementedError: when ``cache == 1``.
    :raises ValueError: when an extracted item is of an unknown type.
    """
    # Should extract records by xpath or span and store as X/SGML
    if cache == 1:
        # Nothing to offset into
        raise NotImplementedError
    rec = self.stream
    # NOTE(review): the local-name part of this pattern has no '+' and the
    # character classes start at 1 rather than 0, so only prefixed tags
    # with a one-character local name match.  Left unchanged because it
    # decides which wrapper element is emitted -- confirm the intent.
    hasNsRe = re.compile('<([a-zA-Z1-9_-]+:[a-zA-Z1-9_-])[ >]')
    for src in self.sources:
        raw = src.process_record(session, rec)
        for xp in raw:
            for r in xp:
                if isinstance(r, list):
                    # SAX event list
                    tempRec = SaxRecord(r)
                    docstr = tempRec.get_xml(session)
                    hasNs = hasNsRe.search(docstr)
                    lastEvent = r[-1]
                    saxid = lastEvent[lastEvent.rfind(' ') + 1:]
                    if hasNs:
                        docstr = ('<c3:component xmlns:c3='
                                  '"http://www.cheshire3.org/schemas/component/" '
                                  'parent="%r" event="%s">%s</c3:component>'
                                  % (rec, saxid, docstr))
                    else:
                        docstr = ('<c3component parent="%r" event="%s">'
                                  '%s</c3component>'
                                  % (rec, saxid, docstr))
                elif isinstance(r, str):
                    # Plain text result
                    docstr = ('<c3component parent="%r"><data>%s</data>'
                              '</c3component>' % (rec, escape(r)))
                elif r.__class__ == etree._Element:
                    # Lxml Record.  Exact class match is kept on purpose so
                    # that subclasses of _Element are still rejected below.
                    docstr = etree.tostring(r)
                    tree = r.getroottree()
                    path = tree.getpath(r)
                    if r.nsmap:
                        namespaceList = []
                        for (pref, ns) in r.nsmap.items():
                            if pref is None:
                                # Default namespace: there is no prefix to
                                # bind, so declare it as plain xmlns.
                                namespaceList.append('xmlns="%s"' % ns)
                            else:
                                namespaceList.append('xmlns:%s="%s"'
                                                     % (pref, ns))
                        namespaces = " ".join(namespaceList)
                        # The closing tag must carry the c3: prefix to match
                        # the opening tag, otherwise the XML is malformed.
                        docstr = ('<c3:component xmlns:c3='
                                  '"http://www.cheshire3.org/schemas/component/" '
                                  '%s parent="%r" xpath="%s">%s</c3:component>'
                                  % (namespaces, rec, path, docstr))
                    else:
                        docstr = ('<c3component parent="%r" xpath="%s">'
                                  '%s</c3component>' % (rec, path, docstr))
                else:
                    raise ValueError("Unknown Record Type")
                doc = StringDocument(docstr)
                if cache == 0:
                    yield doc
                else:
                    self.documents.append(doc)