def process_document(self, session, doc):
    """Run a document's raw data through the SAX parser and build a SaxRecord.

    The raw data from doc.get_raw(session) is fed to self.parser via
    self.inputSource, with self.contentHandler collecting the events.

    On success, returns a SaxRecord built from the handler's currentText,
    the raw data and the handler's recordWordCount, with elementHash and
    byteCount (length of the raw data) set, after self._copyData(doc, rec)
    has been called. The handler is reinitialised before returning.

    On any parse failure the exception is re-raised. If self.keepError is
    set, the handler state is left as it is and self.errorPath is set to a
    '/'-joined path built from the handler's pathLines; otherwise the
    handler is reinitialised.
    """
    raw = doc.get_raw(session)
    self.inputSource.setByteStream(cStringIO.StringIO(raw))
    handler = self.contentHandler
    handler.reinit()

    try:
        self.parser.parse(self.inputSource)
    except:
        # Deliberately catches everything: the exception is always re-raised
        # once the handler state has been dealt with.
        if not self.keepError:
            handler.reinit()
            raise
        # Keep the handler state and work out the path to the failure point.
        # Each referenced event line yields one step: the text between
        # offset 2 and the character before the first '{' is used as the
        # element name, and the line number as its SAXID.
        events = ((handler.currentText[lineNo], lineNo)
                  for lineNo in handler.pathLines)
        self.errorPath = '/'.join(
            "%s[@SAXID='%s']" % (event[2:event.index('{') - 1], lineNo)
            for event, lineNo in events
        )
        raise

    record = SaxRecord(handler.currentText, raw,
                       wordCount=handler.recordWordCount)
    record.elementHash = handler.elementHash
    record.byteCount = len(raw)
    # NOTE(review): presumably carries document-level data over to the
    # record; _copyData is defined elsewhere.
    self._copyData(doc, record)
    handler.reinit()
    return record
def process_document(self, session, doc):
    """Parse the raw data of ``doc`` with SAX and return the resulting SaxRecord.

    The data returned by doc.get_raw(session) is wrapped in a byte stream,
    set on self.inputSource and parsed by self.parser, while
    self.contentHandler accumulates the SAX events.

    Returns a SaxRecord created from the handler's currentText, the raw
    data and the handler's recordWordCount. Its elementHash is taken from
    the handler and its byteCount is the length of the raw data;
    self._copyData(doc, rec) is called on it before the handler is
    reinitialised.

    Any exception raised while parsing is re-raised. Beforehand, either
    self.errorPath is recorded (when self.keepError is true, leaving the
    handler state untouched) or the handler is reinitialised.
    """
    source = doc.get_raw(session)
    self.inputSource.setByteStream(cStringIO.StringIO(source))
    saxHandler = self.contentHandler
    saxHandler.reinit()

    try:
        self.parser.parse(self.inputSource)
    except:
        # Catch-all on purpose; every path through this handler re-raises.
        if self.keepError:
            # Build one path step per entry in pathLines: the element name
            # is sliced from offset 2 up to the character before the first
            # '{' of the event line, and the line number becomes the SAXID.
            steps = []
            for saxId in saxHandler.pathLines:
                eventLine = saxHandler.currentText[saxId]
                nameEnd = eventLine.index('{') - 1
                steps.append("%s[@SAXID='%s']" % (eventLine[2:nameEnd], saxId))
            self.errorPath = '/'.join(steps)
        else:
            saxHandler.reinit()
        raise

    result = SaxRecord(
        saxHandler.currentText,
        source,
        wordCount=saxHandler.recordWordCount,
    )
    result.elementHash = saxHandler.elementHash
    result.byteCount = len(source)
    # NOTE(review): _copyData is defined elsewhere; presumably it transfers
    # document-level data onto the record.
    self._copyData(doc, result)
    saxHandler.reinit()
    return result
def process_document(self, session, doc):
    """Rebuild a SaxRecord from a document holding serialized SAX events.

    The raw data from doc.get_raw(session) is decoded as UTF-8 and split on
    the module-level ``nonTextToken`` into a list of event lines. If the
    last line starts with "9", it is removed from the list and everything
    after its first two characters is unpickled to give the record's
    elementHash; otherwise elementHash is an empty dict.

    session -- passed through to doc.get_raw()
    doc -- object whose get_raw(session) returns UTF-8 encoded data

    Returns a SaxRecord built from the event lines, with elementHash set.
    Raises UnicodeDecodeError if the raw data is not valid UTF-8.
    """
    data = doc.get_raw(session)
    data = unicode(data, 'utf-8')
    sax = data.split(nonTextToken)
    # Slice rather than index: the last segment is an empty string when the
    # data is empty or ends with the separator, and [0] on it would raise
    # IndexError. An empty segment simply means "no element hash line".
    if sax[-1][:1] == "9":
        line = sax.pop()
        # NOTE(review): pickle.loads can execute arbitrary code from a
        # crafted payload; this is only safe if documents come from a
        # trusted store -- confirm against callers.
        elemHash = pickle.loads(str(line[2:]))
    else:
        elemHash = {}
    rec = SaxRecord(sax)
    rec.elementHash = elemHash
    return rec
def process_document(self, session, doc):
    """Turn a document of serialized SAX events back into a SaxRecord.

    doc.get_raw(session) is decoded as UTF-8 and split on the module-level
    ``nonTextToken``, giving the list of event lines. When the final line
    begins with "9" it is popped off the list and the text following its
    first two characters is unpickled into the element hash; when it does
    not, the element hash defaults to an empty dict.

    session -- passed through to doc.get_raw()
    doc -- object whose get_raw(session) returns UTF-8 encoded data

    Returns a SaxRecord constructed from the event lines, with its
    elementHash attribute set.
    Raises UnicodeDecodeError if the raw data is not valid UTF-8.
    """
    data = doc.get_raw(session)
    data = unicode(data, 'utf-8')
    sax = data.split(nonTextToken)
    # Compare a one-character slice instead of indexing [0]: splitting
    # empty data, or data that ends with the separator, leaves an empty
    # final segment, and indexing it would raise IndexError.
    if sax[-1][:1] == "9":
        line = sax.pop()
        # NOTE(review): unpickling executes whatever the payload asks for;
        # only acceptable when the stored documents are trusted -- verify
        # where these documents originate.
        elemHash = pickle.loads(str(line[2:]))
    else:
        elemHash = {}
    rec = SaxRecord(sax)
    rec.elementHash = elemHash
    return rec