コード例 #1
0
 def find_documents(self, session, cache=0):
     """Wrap each component selected from ``self.stream`` in a document.

     Every object in ``self.sources`` is asked to ``process_record`` the
     stream; each item in the nested result is serialised and wrapped in
     a ``c3component`` (or namespaced ``c3:component``) element, then
     turned into a ``StringDocument``.

     Three item types are handled:

     * ``list`` -- passed to ``SaxRecord`` and serialised with
       ``get_xml``; the text after the last space of the final list
       entry is written to the wrapper's ``event`` attribute.
     * ``str`` -- escaped and placed inside a ``<data>`` element.
     * ``etree._Element`` -- serialised with ``etree.tostring``; its
       path within its tree is written to the ``xpath`` attribute.

     With ``cache == 0`` documents are yielded one at a time; with any
     other value except 1 they are appended to ``self.documents``.

     Raises:
         NotImplementedError: if ``cache == 1``.
         ValueError: if an item is none of the supported types.
     """
     # Should extract records by xpath or span and store as X/SGML
     if cache == 1:
         # nothing to offset into
         raise NotImplementedError
     rec = self.stream
     # Start tag with a prefixed name, e.g. <dc:title> or <a:b2 x="y">.
     # The local part needs '+' and the digit range must include 0,
     # otherwise only one-character local names are ever detected.
     hasNsRe = re.compile(r'<([a-zA-Z0-9_.-]+:[a-zA-Z0-9_.-]+)[\s/>]')
     for src in self.sources:
         raw = src.process_record(session, rec)
         for xp in raw:
             for r in xp:
                 if isinstance(r, list):
                     tempRec = SaxRecord(r)
                     docstr = tempRec.get_xml(session)
                     hasNs = hasNsRe.search(docstr)
                     # Identifier is whatever follows the last space of
                     # the final entry in the list.
                     saxid = r[-1][r[-1].rfind(' ') + 1:]
                     if hasNs:
                         docstr = "<c3:component xmlns:c3=\"http://www.cheshire3.org/schemas/component/\" parent=\"%r\" event=\"%s\">%s</c3:component>" % (
                             rec, saxid, docstr)
                     else:
                         docstr = "<c3component parent=\"%r\" event=\"%s\">%s</c3component>" % (
                             rec, saxid, docstr)
                 elif isinstance(r, str):
                     docstr = "<c3component parent=\"%r\"><data>%s</data></c3component>" % (
                         rec, escape(r))
                 elif r.__class__ == etree._Element:
                     # Lxml Record
                     docstr = etree.tostring(r)
                     tree = r.getroottree()
                     path = tree.getpath(r)
                     if r.nsmap:
                         namespaceList = []
                         for (pref, ns) in r.nsmap.items():
                             if pref is None:
                                 # Default namespace has no prefix.
                                 namespaceList.append("xmlns=\"%s\"" % ns)
                             elif pref != "c3":
                                 # The wrapper declares xmlns:c3 itself;
                                 # repeating it would duplicate the
                                 # attribute and break well-formedness.
                                 namespaceList.append("xmlns:%s=\"%s\"" %
                                                      (pref, ns))
                         namespaces = " ".join(namespaceList)
                         # Closing tag must carry the same c3: prefix as
                         # the opening tag.
                         docstr = """<c3:component xmlns:c3="http://www.cheshire3.org/schemas/component/" %s parent="%r" xpath="%s">%s</c3:component>""" % (
                             namespaces, rec, path, docstr)
                     else:
                         docstr = """<c3component parent="%r" xpath="%s">%s</c3component>""" % (
                             rec, path, docstr)
                 else:
                     raise ValueError("Unknown Record Type")
                 doc = StringDocument(docstr)
                 if cache == 0:
                     yield doc
                 else:
                     self.documents.append(doc)
コード例 #2
0
 def find_documents(self, session, cache=0):
     """Wrap each component selected from ``self.stream`` in a document.

     Every object in ``self.sources`` is asked to ``process_record`` the
     stream; each item in the nested result is serialised and wrapped in
     a ``c3component`` (or namespaced ``c3:component``) element, then
     turned into a ``StringDocument``.

     Three item types are handled:

     * ``list`` -- passed to ``SaxRecord`` and serialised with
       ``get_xml``; the text after the last space of the final list
       entry is written to the wrapper's ``event`` attribute.
     * ``str`` -- escaped and placed inside a ``<data>`` element.
     * ``etree._Element`` -- serialised with ``etree.tostring``; its
       path within its tree is written to the ``xpath`` attribute.

     With ``cache == 0`` documents are yielded one at a time; with any
     other value except 1 they are appended to ``self.documents``.

     Raises:
         NotImplementedError: if ``cache == 1``.
         ValueError: if an item is none of the supported types.
     """
     # Should extract records by xpath or span and store as X/SGML
     if cache == 1:
         # nothing to offset into
         raise NotImplementedError
     rec = self.stream
     # Start tag with a prefixed name, e.g. <dc:title> or <a:b2 x="y">.
     # The local part needs '+' and the digit range must include 0,
     # otherwise only one-character local names are ever detected.
     hasNsRe = re.compile(r'<([a-zA-Z0-9_.-]+:[a-zA-Z0-9_.-]+)[\s/>]')
     for src in self.sources:
         raw = src.process_record(session, rec)
         for xp in raw:
             for r in xp:
                 if isinstance(r, list):
                     tempRec = SaxRecord(r)
                     docstr = tempRec.get_xml(session)
                     hasNs = hasNsRe.search(docstr)
                     # Identifier is whatever follows the last space of
                     # the final entry in the list.
                     saxid = r[-1][r[-1].rfind(' ')+1:]
                     if hasNs:
                         docstr = "<c3:component xmlns:c3=\"http://www.cheshire3.org/schemas/component/\" parent=\"%r\" event=\"%s\">%s</c3:component>" % (rec, saxid, docstr)
                     else:
                         docstr = "<c3component parent=\"%r\" event=\"%s\">%s</c3component>" % (rec, saxid, docstr)
                 elif isinstance(r, str):
                     docstr = "<c3component parent=\"%r\"><data>%s</data></c3component>" % (rec, escape(r))
                 elif r.__class__ == etree._Element:
                     # Lxml Record
                     docstr = etree.tostring(r)
                     tree = r.getroottree()
                     path = tree.getpath(r)
                     if r.nsmap:
                         namespaceList = []
                         for (pref, ns) in r.nsmap.items():
                             if pref is None:
                                 # Default namespace has no prefix.
                                 namespaceList.append("xmlns=\"%s\"" % ns)
                             elif pref != "c3":
                                 # The wrapper declares xmlns:c3 itself;
                                 # repeating it would duplicate the
                                 # attribute and break well-formedness.
                                 namespaceList.append("xmlns:%s=\"%s\"" % (pref, ns))
                         namespaces = " ".join(namespaceList)
                         # Closing tag must carry the same c3: prefix as
                         # the opening tag.
                         docstr = """<c3:component xmlns:c3="http://www.cheshire3.org/schemas/component/" %s parent="%r" xpath="%s">%s</c3:component>""" % (namespaces, rec, path, docstr)
                     else:
                         docstr = """<c3component parent="%r" xpath="%s">%s</c3component>""" % (rec, path, docstr)
                 else:
                     raise ValueError("Unknown Record Type")
                 doc = StringDocument(docstr)
                 if cache == 0:
                     yield doc
                 else:
                     self.documents.append(doc)