def tmx_import(self, file, REQUEST=None, RESPONSE=None):
    """ Imports a TMX level 1 file.

    We use the SAX parser. It has the benefit that it internally
    converts everything to python unicode strings.

    ``file`` is either the TMX document itself (a string) or an object
    with a ``read()`` method returning the document.  When ``REQUEST``
    is given, ``RESPONSE`` is redirected to
    ``manage_localPropertiesForm`` after the import.
    """
    # Scratch attribute that exists only for the duration of the parse.
    self._v_srclang = self._default_language
    try:
        # Create a parser
        parser = make_parser()
        chandler = HandleTMXParsing(self._tmx_tu, self._tmx_header)
        # Tell the parser to use our handler
        parser.setContentHandler(chandler)
        # Don't load the DTD from the Internet
        parser.setFeature(handler.feature_external_ges, 0)

        if isinstance(file, StringType):
            content = file
        else:
            content = file.read()

        inputsrc = InputSource()
        inputsrc.setByteStream(StringIO(content))
        parser.parse(inputsrc)
    finally:
        # Remove the scratch attribute even when parsing fails, so a
        # broken upload does not leave stale state on the object.
        if hasattr(self, '_v_srclang'):
            del self._v_srclang

    if REQUEST is not None:
        RESPONSE.redirect('manage_localPropertiesForm')
def parseFile(self, inputFile, stream=None):
    """Parse the XML document identified by ``inputFile``.

    ``inputFile`` is used as the system id of the input source.  When
    ``stream`` is omitted the file named by ``inputFile`` is opened in
    binary mode and closed again once ``parseSource`` has returned (or
    raised).  A stream supplied by the caller is left open; closing it
    remains the caller's responsibility.
    """
    source = InputSource(inputFile)
    opened_here = stream is None
    if opened_here:
        # ``open`` instead of the Python 2-only ``file`` builtin; binary
        # mode because the handle is handed over as a byte stream.
        stream = open(inputFile, 'rb')
    try:
        source.setByteStream(stream)
        self.parseSource(source)
    finally:
        if opened_here:
            stream.close()
def parse(self, file=None, string=None):
    """
    Run the SAX parser over a file-like object or over XML text.
    When both arguments are given, I{file} wins.  Nothing is returned
    when neither is given.
    @param file: A python I{file-like} object to parse.
    @type file: I{file-like} object.
    @param string: XML text to parse; unicode text is encoded as
        UTF-8 first.
    @type string: str
    """
    timer = metrics.Timer()
    timer.start()
    sax, handler = self.saxparser()
    if file is not None:
        sax.parse(file)
        timer.stop()
        metrics.log.debug('sax (%s) duration: %s', file, timer)
        return handler.nodes[0]
    if string is None:
        return None
    if isinstance(string, six.text_type):
        string = string.encode("utf-8")
    wrapped = InputSource(None)
    wrapped.setByteStream(BytesIO(string))
    sax.parse(wrapped)
    timer.stop()
    metrics.log.debug('%s\nsax duration: %s', string, timer)
    return handler.nodes[0]
def parse(self, file=None, string=None):
    """
    Run the SAX parser over a file-like object or over XML text.
    When both arguments are given, I{file} wins.  Nothing is returned
    when neither is given.
    @param file: A python I{file-like} object to parse.
    @type file: I{file-like} object.
    @param string: XML text to parse.
    @type string: str
    """
    timer = metrics.Timer()
    timer.start()
    sax, handler = self.saxparser()
    if file is not None:
        sax.parse(file)
        timer.stop()
        metrics.log.debug('sax (%s) duration: %s', file, timer)
        return handler.nodes[0]
    if string is None:
        return None
    # Prefer the utf8-encoded form; fall back to the raw value when
    # encoding it raises UnicodeDecodeError.
    try:
        payload = string.encode('utf8')
    except UnicodeDecodeError:
        payload = string
    wrapped = InputSource(None)
    wrapped.setByteStream(StringIO(payload))
    sax.parse(wrapped)
    timer.stop()
    metrics.log.debug('%s\nsax duration: %s', string, timer)
    return handler.nodes[0]
def parse(self, file=None, string=None):
    """
    Run the SAX parser over a file-like object or over XML text.
    When both arguments are given, I{file} wins.
    @param file: A python I{file-like} object to parse.
    @type file: I{file-like} object
    @param string: XML text to parse.
    @type string: str
    @return: The first node collected by the handler, or None when
        neither argument was supplied.
    @rtype: L{Document}
    """
    from_string = file is None
    if from_string and string is None:
        return None
    timer = metrics.Timer()
    timer.start()
    if from_string:
        source = InputSource(None)
        source.setByteStream(suds.BytesIO(string))
    else:
        source = file
    sax, handler = self.saxparser()
    sax.parse(source)
    timer.stop()
    if from_string:
        metrics.log.debug("%s\nsax duration: %s", string, timer)
    else:
        metrics.log.debug("sax (%s) duration: %s", file, timer)
    return handler.nodes[0]
def parse(self, file=None, string=None):
    """
    Run the SAX parser over a file-like object or over XML text.
    When both arguments are given, I{file} wins.
    @param file: A python I{file-like} object to parse.
    @type file: I{file-like} object
    @param string: XML text to parse.
    @type string: str
    @return: The first node collected by the handler, or None when
        neither argument was supplied.
    @rtype: L{Document}
    """
    from_string = file is None
    if from_string and string is None:
        return None
    timer = suds.metrics.Timer()
    timer.start()
    if from_string:
        source = InputSource(None)
        source.setByteStream(suds.BytesIO(string))
    else:
        source = file
    sax, handler = self.saxparser()
    sax.parse(source)
    timer.stop()
    if from_string:
        suds.metrics.log.debug("%s\nsax duration: %s", string, timer)
    else:
        suds.metrics.log.debug("sax (%s) duration: %s", file, timer)
    return handler.nodes[0]
def test_ignorable():
    """Parse ``doc3`` with an ``H`` content handler and return the
    handler's ``passed`` attribute."""
    parser = XMLValParserFactory.make_parser()
    source = InputSource("doc3.xml")
    source.setByteStream(StringIO(doc3))
    recorder = H()
    parser.setContentHandler(recorder)
    parser.parse(source)
    return recorder.passed
def resolveEntity(self, publicId, systemId):
    """Resolve an entity against the local directory ``self._path``.

    When ``systemId`` is non-empty and names an existing file below
    ``self._path``, an input source reading that file in binary mode is
    returned.  Otherwise the request is delegated to
    ``EntityResolver.resolveEntity``.
    """
    candidate = os.path.join(self._path, systemId) if systemId else None
    if candidate is not None and os.path.isfile(candidate):
        local = InputSource()
        local.setByteStream(open(candidate, "rb"))
        return local
    # Using default resolution
    return EntityResolver.resolveEntity(self, publicId, systemId)
def test_illformed():
    """Parse ``doc2`` and expect the parser to reject it.

    Returns 1 (after printing the error) when a ``SAXException`` is
    raised; returns None when the document parses without error.
    """
    p = XMLValParserFactory.make_parser()
    i = InputSource("doc2.xml")
    i.setByteStream(StringIO(doc2))
    try:
        p.parse(i)
    except SAXException as e:
        # Written so that Python 2 and Python 3 both accept it and both
        # emit the same "PASS: <error>" line.
        print("PASS: %s" % (e,))
        return 1
    return None
def parseXLIFFSTring(self, xml_string):
    """Parse an XLIFF document supplied as a string.

    Returns the ``XLIFFHandler`` that received the SAX events, or None
    when parsing raised an error.
    """
    chandler = XLIFFHandler()
    parser = make_parser()
    # Tell the parser to use our handler
    parser.setContentHandler(chandler)
    # Don't load the DTD from the Internet
    parser.setFeature(handler.feature_external_ges, 0)
    inpsrc = InputSource()
    inpsrc.setByteStream(StringIO(xml_string))
    try:
        parser.parse(inpsrc)
    except Exception:
        # Best effort: any parsing problem is reported as None.  Only
        # Exception is caught so that KeyboardInterrupt and SystemExit
        # still propagate.
        return None
    return chandler
def __init__(self, session, config, parent):
    """Set up the SAX parser, its error handler and content handler,
    then apply the 'namespaces', 'attrHash' and 'stripWhitespace'
    settings."""
    Parser.__init__(self, session, config, parent)
    self.parser = make_parser()
    self.errorHandler = ErrorHandler()
    self.parser.setErrorHandler(self.errorHandler)
    self.inputSource = SaxInput()
    ch = SaxContentHandler()
    self.contentHandler = ch
    self.parser.setContentHandler(ch)
    self.keepError = 1

    if self.get_setting(session, 'namespaces'):
        self.parser.setFeature('http://xml.org/sax/features/namespaces', 1)

    # 'attrHash' is a whitespace separated list of "a@b" pairs; each b
    # is collected in a list under key a.
    p = self.get_setting(session, 'attrHash')
    if p:
        for pair in p.split():
            (a, b) = pair.split("@")
            try:
                ch.hashAttributesNames[a].append(b)
            except KeyError:
                # First value seen for this key.  Only the missing-key
                # case is handled; other errors are no longer hidden.
                ch.hashAttributesNames[a] = [b]

    if self.get_setting(session, 'stripWhitespace'):
        ch.stripWS = 1
def __init__(self, parent, config):
    """Create the SAX parser and register this object as its content
    handler, alongside a fresh error handler and input source."""
    C3Object.__init__(self, parent, config)
    sax_reader = make_parser()
    self.parser = sax_reader
    self.inputSource = SaxInput()
    error_sink = ErrorHandler()
    self.errorHandler = error_sink
    sax_reader.setErrorHandler(error_sink)
    sax_reader.setContentHandler(self)
def parseContent(self, file):
    """Parse the content read from ``file`` with a ``GBoxHandler``.

    Returns the handler that received the SAX events, or the string
    'err' when reading or parsing raised an error.
    """
    try:
        # Create a parser
        parser = make_parser()
        chandler = GBoxHandler()
        # Tell the parser to use our handler
        parser.setContentHandler(chandler)
        # Don't load the DTD from the Internet
        parser.setFeature(handler.feature_external_ges, 0)
        inputsrc = InputSource()
        gbox_content = utils.utRead(file)
        inputsrc.setByteStream(StringIO(gbox_content))
        parser.parse(inputsrc)
    except Exception:
        # Best effort: failures are reported as 'err'.  Only Exception
        # is caught so that KeyboardInterrupt and SystemExit propagate.
        return 'err'
    return chandler
def tmx_import(self, howmuch, file, REQUEST=None, RESPONSE=None):
    """ Imports a TMX level 1 file.

    We use the SAX parser. It has the benefit that it internally
    converts everything to python unicode strings.

    When ``howmuch`` is 'clear' the message catalogue is emptied before
    the import.  ``file`` is either the TMX document itself (a string)
    or an object with a ``read()`` method.  When ``REQUEST`` is given a
    ``MessageDialog`` reporting the import counters is returned.
    """
    if howmuch == 'clear':
        # Clear the message catalogue prior to import
        self._messages = {}
        self._languages = ()

    # Scratch attributes that exist only for the duration of the parse.
    self._v_howmuch = howmuch
    self._v_srclang = self._default_language
    self._v_num_translations = 0
    self._v_num_notes = 0
    try:
        # Create a parser
        parser = make_parser()
        chandler = HandleTMXParsing(self._tmx_tu, self._tmx_header)
        # Tell the parser to use our handler
        parser.setContentHandler(chandler)
        # Don't load the DTD from the Internet
        parser.setFeature(handler.feature_external_ges, 0)

        if isinstance(file, StringType):
            content = file
        else:
            content = file.read()

        inputsrc = InputSource()
        inputsrc.setByteStream(StringIO(content))
        parser.parse(inputsrc)

        num_translations = self._v_num_translations
        num_notes = self._v_num_notes
    finally:
        # Drop the scratch attributes even when parsing fails, so a
        # broken upload does not leave stale state on the object.
        del self._v_srclang
        del self._v_howmuch
        del self._v_num_translations
        del self._v_num_notes

    if REQUEST is not None:
        return MessageDialog(
            title = _('Messages imported'),
            message = _('Imported %d messages and %d notes')
                      % (num_translations, num_notes),
            action = 'manage_messages')
def parseXLIFFFile(self, file):
    """Parse an XLIFF document.

    ``file`` is either the document itself (a string) or an object with
    a ``read()`` method.  Returns the ``XLIFFHandler`` that received
    the SAX events, or None when reading or parsing raised an error.
    """
    # Create a parser
    parser = make_parser()
    chandler = XLIFFHandler()
    # Tell the parser to use our handler
    parser.setContentHandler(chandler)
    # Don't load the DTD from the Internet
    parser.setFeature(handler.feature_external_ges, 0)
    inputsrc = InputSource()
    try:
        if isinstance(file, StringType):
            filecontent = file
        else:
            filecontent = file.read()
        inputsrc.setByteStream(StringIO(filecontent))
        parser.parse(inputsrc)
    except Exception:
        # Best effort: failures are reported as None.  Only Exception
        # is caught so that KeyboardInterrupt and SystemExit propagate.
        return None
    return chandler
def addSectionTags(content):
    """Run ``content`` through a SAX parser driven by ``docHandler``
    and return the handler's ``document`` encoded as UTF-8."""
    from cStringIO import StringIO

    source = InputSource()
    source.setByteStream(StringIO(content))

    # Create an XML parser
    sax_reader = make_parser()  #("xml.sax.drivers2.drv_xmlproc")
    doc_builder = docHandler()
    sax_reader.setContentHandler(doc_builder)
    resolver = EntityResolver()
    sax_reader.setEntityResolver(resolver)

    # Allow external entities
    sax_reader.setFeature(feature_external_ges, True)

    # Parse the input; the handler's methods will get called
    sax_reader.parse(source)
    return doc_builder.document.encode('UTF-8')
def parse(self, file=None, url=None, string=None):
    """
    Run the SAX parser over the first input that was supplied, in the
    order I{file}, I{url}, I{string}, and return the first node
    collected by the handler.  Returns None when no input is given.
    """
    timer = metrics.Timer()
    timer.start()
    sax, handler = self.saxparser()
    if file is not None:
        source = file
        template, subject = 'sax (%s) duration: %s', file
    elif url is not None:
        # The url is fetched through this object's transport.
        source = self.transport.open(Request(url))
        template, subject = 'sax (%s) duration: %s', url
    elif string is not None:
        source = InputSource(None)
        source.setByteStream(StringIO(string))
        template, subject = '%s\nsax duration: %s', string
    else:
        return None
    sax.parse(source)
    timer.stop()
    metrics.log.debug(template, subject, timer)
    return handler.nodes[0]
def parse(self, xml):
    """Wrap ``xml`` in an input source and pass it to
    ``self.parseSource``."""
    source = InputSource()
    buffered = StringIO(xml)
    source.setByteStream(buffered)
    self.parseSource(source)
class SaxParser(BaseParser):
    """ Default SAX based parser. Creates SaxRecord """

    _possibleSettings = {
        'namespaces': {'docs': "Enable namespace processing in SAX"},
        'stripWhitespace': {'docs': "Strip additional whitespace when processing."},
        'attrHash': {'docs': "Tag/Attribute combinations to include in hash."}
    }

    def __init__(self, session, config, parent):
        """Set up the SAX parser, its error handler and content
        handler, then apply the settings listed in _possibleSettings."""
        Parser.__init__(self, session, config, parent)
        self.parser = make_parser()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.inputSource = SaxInput()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        self.keepError = 1

        if self.get_setting(session, 'namespaces'):
            self.parser.setFeature('http://xml.org/sax/features/namespaces', 1)

        # 'attrHash' is a whitespace separated list of "a@b" pairs;
        # each b is collected in a list under key a.
        p = self.get_setting(session, 'attrHash')
        if p:
            for pair in p.split():
                (a, b) = pair.split("@")
                try:
                    ch.hashAttributesNames[a].append(b)
                except KeyError:
                    # First value seen for this key.  Only the
                    # missing-key case is handled; other errors are no
                    # longer hidden.
                    ch.hashAttributesNames[a] = [b]

        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` and return a SaxRecord.

        On a parse failure the exception is re-raised.  Before that,
        when ``keepError`` is set, ``self.errorPath`` is filled with the
        path of elements open at the point of failure; otherwise the
        content handler is reinitialised.
        """
        xml = doc.get_raw(session)
        self.inputSource.setByteStream(cStringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(self.inputSource)
        except:
            # Splat.  Reset self and reraise
            if self.keepError:
                # Work out path
                path = []
                for l in ch.pathLines:
                    line = ch.currentText[l]
                    # Element name: from offset 2 up to the character
                    # before the one preceding the first '{'.
                    elemName = line[2:line.index('{') - 1]
                    path.append("%s[@SAXID='%s']" % (elemName, l))
                self.errorPath = '/'.join(path)
            else:
                ch.reinit()
            raise
        rec = SaxRecord(ch.currentText, xml, wordCount=ch.recordWordCount)
        rec.elementHash = ch.elementHash
        rec.byteCount = len(xml)
        self._copyData(doc, rec)
        ch.reinit()
        return rec
class SaxParser(BaseParser):
    """ Default SAX based parser. Creates SaxRecord """

    _possibleSettings = {
        'namespaces': {
            'docs': "Enable namespace processing in SAX"
        },
        'stripWhitespace': {
            'docs': "Strip additional whitespace when processing."
        },
        'attrHash': {
            'docs': "Tag/Attribute combinations to include in hash."
        }
    }

    def __init__(self, session, config, parent):
        """Set up the SAX parser, its error handler and content
        handler, then apply the settings listed in _possibleSettings."""
        Parser.__init__(self, session, config, parent)
        self.parser = make_parser()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.inputSource = SaxInput()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        self.keepError = 1

        if self.get_setting(session, 'namespaces'):
            self.parser.setFeature('http://xml.org/sax/features/namespaces', 1)

        # 'attrHash' is a whitespace separated list of "a@b" pairs;
        # each b is collected in a list under key a.
        p = self.get_setting(session, 'attrHash')
        if p:
            for pair in p.split():
                (a, b) = pair.split("@")
                try:
                    ch.hashAttributesNames[a].append(b)
                except KeyError:
                    # First value seen for this key.  Only the
                    # missing-key case is handled; other errors are no
                    # longer hidden.
                    ch.hashAttributesNames[a] = [b]

        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` and return a SaxRecord.

        On a parse failure the exception is re-raised.  Before that,
        when ``keepError`` is set, ``self.errorPath`` is filled with the
        path of elements open at the point of failure; otherwise the
        content handler is reinitialised.
        """
        xml = doc.get_raw(session)
        self.inputSource.setByteStream(cStringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(self.inputSource)
        except:
            # Splat.  Reset self and reraise
            if self.keepError:
                # Work out path
                path = []
                for l in ch.pathLines:
                    line = ch.currentText[l]
                    # Element name: from offset 2 up to the character
                    # before the one preceding the first '{'.
                    elemName = line[2:line.index('{') - 1]
                    path.append("%s[@SAXID='%s']" % (elemName, l))
                self.errorPath = '/'.join(path)
            else:
                ch.reinit()
            raise
        rec = SaxRecord(ch.currentText, xml, wordCount=ch.recordWordCount)
        rec.elementHash = ch.elementHash
        rec.byteCount = len(xml)
        self._copyData(doc, rec)
        ch.reinit()
        return rec
class SaxParser(Parser, ContentHandler):
    """SAX parser that is its own content handler.

    SAX events are recorded as text lines in ``currentText`` ("#elem",
    "#end", "#elemNS", "#endNS", "#text" and a final "#hash" line), from
    which ``process_document`` builds a SaxRecord.
    """

    # Class-level defaults; process_document rebinds the per-document
    # ones on the instance before every parse.
    locked = 0
    currentText = []
    currentPath = []
    pathLines = []
    currentLine = -1
    recordSize = 0
    elementHash = {}

    def __init__(self, parent, config):
        C3Object.__init__(self, parent, config)
        self.parser = make_parser()
        self.inputSource = SaxInput()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.parser.setContentHandler(self)

    def _reset_state(self):
        """Clear everything collected from a previous parse."""
        self.currentText = []
        self.pathLines = []
        self.currentLine = -1
        self.elementHash = {}
        self.elementIndexes = []
        self.recordSize = 0

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` and return a SaxRecord.

        Raises ValueError when the parser is already in use.  A failed
        parse is retried once; if the retry fails too, its exception
        propagates.  The lock is released on every exit path.
        """
        if self.locked:
            # Shouldn't be reusing across threads anyway!
            # XXX: Can we instantiate a new self ??
            raise ValueError("SaxParser is already processing a document")
        self.locked = 1
        try:
            xml = doc.get_raw()
            self.inputSource.setByteStream(StringIO.StringIO(xml))
            self._reset_state()
            try:
                self.parser.parse(self.inputSource)
            except Exception:
                # Try again... sometimes odd things happen
                self._reset_state()
                self.inputSource.setByteStream(StringIO.StringIO(xml))
                self.parser.parse(self.inputSource)
            self.currentText.append("#hash " + repr(self.elementHash))
        finally:
            # Without this a document that fails twice would leave the
            # parser locked for good.
            self.locked = 0
        rec = SaxRecord(self.currentText, xml, recordSize=self.recordSize)
        return rec

    # We want to fwd elems to NS elem handlers with default NS

    def startElement(self, name, attrs):
        """Record an "#elem" line: name, attributes, the line number of
        the parent element (-1 for none) and the count of same-named
        siblings seen so far, this element included."""
        self.currentLine += 1
        attrHash = dict((k, attrs[k]) for k in attrs.keys())

        self.pathLines.append(self.currentLine)
        if len(self.pathLines) > 1:
            parent = self.pathLines[-2]
        else:
            parent = -1

        if self.currentLine == 0:
            npred = 1
            self.elementIndexes = [{name: npred}]
        else:
            siblings = self.elementIndexes[-1]
            npred = siblings.get(name, 0) + 1
            siblings[name] = npred
        # Fresh counter table for the children of this element.
        self.elementIndexes.append({})

        ptxt = "#elem %s %s %d %d" % (name, repr(attrHash), parent, npred)
        self.currentText.append(ptxt)

    def endElement(self, name):
        """Record an "#end" line, append the end line number to the
        matching start line and register the [start, end] pair in
        elementHash."""
        self.currentLine += 1
        start = self.pathLines.pop()
        self.currentText.append("#end %s %d" % (name, start))
        self.currentText[start] = "%s %d" % (self.currentText[start],
                                             self.currentLine)
        self.elementIndexes.pop()
        self.elementHash.setdefault(name, []).append([start, self.currentLine])

    def startElementNS(self, name, qname, attrs):
        """Record an "#elemNS" line with name, qname and attributes."""
        self.currentLine += 1
        attrHash = dict((k, attrs[k]) for k in attrs.keys())
        ptxt = "#elemNS %s %s %s" % (name, qname, repr(attrHash))
        self.currentText.append(ptxt)

    def endElementNS(self, name, qname):
        """Record an "#endNS" line."""
        self.currentLine += 1
        self.currentText.append("#endNS %s %s" % (name, qname))

    def characters(self, text, start=0, length=-1):
        """Record a "#text" line and add its word count to recordSize.
        Whitespace-only text is collapsed to a single space."""
        if text.isspace():
            text = " "
        self.currentLine += 1
        self.currentText.append("#text %s" % (text))
        self.recordSize += len(text.split())

    def processingInstruction(self, target, data):
        """Processing instructions are ignored."""
        pass

    def skippedEntity(self, name):
        """Skipped entities are ignored."""
        pass
def stream(str):
    """Return an input source whose byte stream reads from ``str``."""
    buffered = StringIO(str)
    source = InputSource()
    source.setByteStream(buffered)
    return source