def get_news(rss_feed):
    """Get a list of news items.

    Downloads the RSS document at ``rss_feed``, parses it with expat and
    builds one ``NewsItem`` per ``<item>`` element.  The text of each
    ``title``, ``description``, ``link`` and ``category`` child is stored on
    the item as an attribute of the same name.

    :param rss_feed: URL handed to ``urllib2.urlopen``.
    :returns: list of ``NewsItem`` objects in document order.
    :raises expat.ExpatError: if the document is not well-formed, including
        a document that ends prematurely.
    """

    class _CurrentData(object):
        """Class holding a set of current attributes."""
        item = None
        chunks = ()

    def _start_element_handler(name, attrs):
        """Handle XML start-elements."""
        if name == 'item':
            # Allocate a new item.
            current.item = NewsItem()
        # Text collected so far belongs to the enclosing element.
        current.chunks = []

    def _end_element_handler(name):
        """Handle XML end-elements."""
        if name == 'item':
            news_items.append(current.item)
            # Forget the finished item so that a later stray <title> or
            # <link> (e.g. inside <image>) cannot overwrite its fields.
            current.item = None
        elif name in ('title', 'description', 'link', 'category'):
            # Outside an <item> the parser has run into a non-news element.
            if current.item is not None:
                setattr(current.item, name, ''.join(current.chunks))
        current.chunks = []

    def _char_data_handler(data):
        """Handle XML element character data."""
        # expat may deliver the text of a single element in several pieces
        # (around entities, newlines, buffer boundaries), so accumulate.
        current.chunks.append(data)

    news_items = []
    current = _CurrentData()
    current.chunks = []

    parser = expat.ParserCreate()
    parser.buffer_text = True
    parser.StartElementHandler = _start_element_handler
    parser.EndElementHandler = _end_element_handler
    parser.CharacterDataHandler = _char_data_handler

    news_handle = urllib2.urlopen(rss_feed)
    try:
        xml_data = news_handle.read()
    finally:
        news_handle.close()

    # The whole document is passed at once, so this is the final chunk.
    parser.Parse(xml_data, True)
    return news_items
def test_utf8(self):
    # Parse the module-level sample document with byte-string output
    # (returns_unicode = 0) and compare every event recorded by the
    # Outputter against the expected log, entry by entry.
    expected_log = [
        'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'',
        "Comment: ' comment data '",
        "Notation declared: ('notation', None, 'notation.jpeg', None)",
        "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')",
        "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'}",
        "NS decl: 'myns' 'http://www.python.org/namespace'",
        "Start element: 'http://www.python.org/namespace!subelement' {}",
        "Character data: 'Contents of subelements'",
        "End element: 'http://www.python.org/namespace!subelement'",
        "End of NS decl: 'myns'",
        "Start element: 'sub2' {}",
        'Start of CDATA section',
        "Character data: 'contents of CDATA section'",
        'End of CDATA section',
        "End element: 'sub2'",
        "External entity ref: (None, 'entity.file', None)",
        "End element: 'root'",
    ]

    recorder = self.Outputter()
    parser = expat.ParserCreate(namespace_separator='!')
    for handler_name in self.handler_names:
        setattr(parser, handler_name, getattr(recorder, handler_name))
    parser.returns_unicode = 0
    parser.Parse(data, 1)

    # Verify output
    recorded = recorder.out
    for position, wanted in enumerate(expected_log):
        self.assertEqual(recorded[position], wanted)
def test_change_size_1(self): if sys.platform == 'cli': # https://github.com/IronLanguages/ironpython2/issues/513 xml1 = "<?xml version='1.0' encoding='iso8859-1'?><a><s>%s" % ( 'a' * 1024) else: xml1 = "<?xml version='1.0' encoding='iso8859'?><a><s>%s" % ('a' * 1024) xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025) parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_text = 1 parser.buffer_size = 1024 self.assertEqual(parser.buffer_size, 1024) self.n = 0 parser.Parse(xml1, 0) parser.buffer_size *= 2 self.assertEqual(parser.buffer_size, 2048) parser.Parse(xml2, 1) self.assertEqual(self.n, 2)
def parse(url_or_path, encoding=None): """ :param url_or_path: A file-like object, a filesystem path, a URL, or a string containing XML :rtype: :class:`XmlElement` """ handler = DrillHandler() parser = expat.ParserCreate(encoding) parser.buffer_text = 1 parser.StartElementHandler = handler.start_element parser.EndElementHandler = handler.end_element parser.CharacterDataHandler = handler.characters if isinstance(url_or_path, basestring): if '://' in url_or_path[:20]: #with contextlib.closing(url_lib.urlopen(url_or_path)) as f: f = contextlib.closing(url_lib.urlopen(url_or_path)) try: parser.ParseFile(f) except Exception, e: raise e finally:
def _unescape(s): if not isinstance(s, str): s = s.encode("utf-8") list = [] # create and initialize a parser object p = expat.ParserCreate("utf-8") p.buffer_text = True p.CharacterDataHandler = list.append # parse the data wrapped in a dummy element # (needed so the "document" is well-formed) p.Parse("<e>", 0) p.Parse(s, 0) p.Parse("</e>", 1) # join the extracted strings and return es = "" return es.join(list)
def test_ignore_use_foreign_dtd(self): """ If UseForeignDTD is passed True and a document with an external entity reference is parsed, ExternalEntityRefHandler is called with the public and system ids from the document. """ handler_call_args = [] def resolve_entity(context, base, system_id, public_id): handler_call_args.append((public_id, system_id)) return 1 parser = expat.ParserCreate() parser.UseForeignDTD(True) parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.ExternalEntityRefHandler = resolve_entity parser.Parse( b"<?xml version='1.0'?><!DOCTYPE foo PUBLIC 'bar' 'baz'><element/>" ) self.assertEqual(handler_call_args, [("bar", "baz")])
def test_exception(self):
    # Check that an exception raised by a start-element handler propagates
    # out of Parse() and that its traceback has the expected three frames.
    parser = expat.ParserCreate()
    # NOTE(review): self.StartElementHandler is defined outside this view;
    # the assertions below expect it to raise RuntimeError whose first
    # argument is the element name.
    parser.StartElementHandler = self.StartElementHandler
    try:
        parser.Parse(b"<a><b><c/></b></a>", 1)
        # Reaching this line means the handler's exception was lost.
        self.fail()
    except RuntimeError as e:
        self.assertEqual(e.args[0], 'a',
                         "Expected RuntimeError for element 'a', but" + \
                         " found %r" % e.args[0])
        # Check that the traceback contains the relevant line in pyexpat.c
        entries = traceback.extract_tb(e.__traceback__)
        # Expected frames: this test, the pyexpat.c frame, the handler.
        self.assertEqual(len(entries), 3)
        self.check_traceback_entry(entries[0],
                                   "test_pyexpat.py", "test_exception")
        self.check_traceback_entry(entries[1],
                                   "pyexpat.c", "StartElement")
        self.check_traceback_entry(entries[2],
                                   "test_pyexpat.py", "StartElementHandler")
        # Index 3 of an extracted entry is the source line text.
        self.assertIn('call_with_frame("StartElement"', entries[1][3])
def parse(self, xml_data, tree):
    """Add a hidden ``ID`` column to *tree* and parse *xml_data* into it.

    A fixed-width, invisible text column bound to model column 0 is
    appended to *tree*; the per-parse state is reset and *xml_data* is fed
    to an expat parser wired to this object's ``_psr_*`` callbacks.

    Returns the value of ``self.pos`` once parsing has finished.
    """
    id_renderer = gtk.CellRendererText()
    id_renderer.set_fixed_height_from_font(1)

    id_column = gtk.TreeViewColumn('ID', id_renderer, text=0)
    id_column.set_sizing(gtk.TREE_VIEW_COLUMN_FIXED)
    id_column.set_fixed_width(60)
    id_column.set_visible(False)
    tree.append_column(id_column)

    # Reset the state consulted by the parser callbacks.
    self.tree = tree
    self.pos = 1
    self.fields_order = []
    self.invisible_fields = []

    xml_parser = expat.ParserCreate()
    xml_parser.StartElementHandler = self._psr_start
    xml_parser.EndElementHandler = self._psr_end
    xml_parser.CharacterDataHandler = self._psr_char
    xml_parser.Parse(xml_data)

    return self.pos
def __init__(self, html=0, target=None, encoding=None): try: from xml.parsers import expat except ImportError: try: import pyexpat as expat except ImportError: raise ImportError( 'No module named expat; use SimpleXMLTreeBuilder instead') parser = expat.ParserCreate(encoding, '}') if target is None: target = TreeBuilder() self.parser = self._parser = parser self.target = self._target = target self._error = expat.error self._names = {} parser.DefaultHandlerExpand = self._default parser.StartElementHandler = self._start parser.EndElementHandler = self._end parser.CharacterDataHandler = self._data parser.CommentHandler = self._comment parser.ProcessingInstructionHandler = self._pi try: self._parser.buffer_text = 1 except AttributeError: pass try: self._parser.ordered_attributes = 1 self._parser.specified_attributes = 1 parser.StartElementHandler = self._start_list except AttributeError: pass self._doctype = None self.entity = {} try: self.version = 'Expat %d.%d.%d' % expat.version_info except AttributeError: pass
def test_unchanged_size(self): xml1 = ("<?xml version='1.0' encoding='iso8859'?><s>%s" % ('a' * 512)) xml2 = 'a'*512 + '</s>' parser = expat.ParserCreate() parser.CharacterDataHandler = self.counting_handler parser.buffer_size = 512 parser.buffer_text = 1 # Feed 512 bytes of character data: the handler should be called # once. self.n = 0 parser.Parse(xml1) self.assertEquals(self.n, 1) # Reassign to buffer_size, but assign the same size. parser.buffer_size = parser.buffer_size self.assertEquals(self.n, 1) # Try parsing rest of the document parser.Parse(xml2) self.assertEquals(self.n, 2)
def test_parse_only_xml_data(self): # http://python.org/sf/1296433 # xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * 1025) # this one doesn't crash #xml = "<?xml version='1.0'?><s>%s</s>" % ('a' * 10000) class SpecificException(Exception): pass def handler(text): raise SpecificException parser = expat.ParserCreate() parser.CharacterDataHandler = handler # https://github.com/IronLanguages/ironpython2/issues/464 if sys.platform == 'cli': self.assertRaises(Exception, parser.Parse, xml, True) else: self.assertRaises(Exception, parser.Parse, xml)
def parse_stream(self, stream):
    """Parse the XML read from *stream* and return ``self``.

    Handlers are installed, a fresh expat parser is created and *stream*
    is parsed with ``ParseFile``.  Whether or not parsing succeeds, the
    progress object is killed, the handlers are removed and the parser is
    detached and deleted; any parsing exception propagates to the caller.
    """
    self._setup_handlers()

    self._parser = expat.ParserCreate()
    self._parser.StartElementHandler = self._start_element_handler
    self._parser.EndElementHandler = self._end_element_handler

    self._stream = stream
    self._progress = util.ProgressFactory(
        "Loading XML...", self._stream_length, numsteps=32)

    try:
        self._parser.ParseFile(stream)
    finally:
        self._progress.kill()
        self._remove_handlers()
        # Clear the callbacks before dropping the parser itself.
        self._parser.StartElementHandler = None
        self._parser.EndElementHandler = None
        del self._parser

    return self
def __init__(self, encoding=None): # type: (Optional[Text]) -> None self._parser = expat.ParserCreate(encoding, "}") self._target = etree.TreeBuilder() # parser settings self._parser.buffer_text = True self._parser.ordered_attributes = True self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE) # parser callbacks self._parser.XmlDeclHandler = self._xml_decl # mypy generates a type error in py2 because it wants # StartElementHandler to take str, List[str]. But the code # seems to always pass in Text to this function self._parser.StartElementHandler = self._start self._parser.EndElementHandler = self._end self._parser.CharacterDataHandler = self._data self._parser.ExternalEntityRefHandler = self._external self._parser.SkippedEntityHandler = self._skipped # type: ignore # used for our horrible re-encoding hack self._fed_data = [] # type: Optional[List[bytes]] self._read_encoding = None # type: Optional[Text]
def __call__(self, fileobj, keywords, comment_tags, options): self.keywords = keywords self.comment_tags = comment_tags self.options = options self.messages = [] self.parser = expat.ParserCreate() self.parser.StartElementHandler = self.StartElementHandler self.parser.EndElementHandler = self.EndElementHandler self.domainstack = collections.deque() try: self.parser.ParseFile(fileobj) except expat.ExpatError as e: if getattr(fileobj, 'name', None): print >> sys.stderr, \ ('Aborting due to parse error in %s: %s' % (fileobj.name, e.message)) else: print >> sys.stderr, \ ('Aborting due to parse error: %s' % e.message) sys.exit(1) return self.messages
def MergeXML(self, sFile):
    """Parse the XML file *sFile* and return the regenerated XML.

    A new expat parser is stored on ``self.xml``, the merge state
    (``mode``, ``lattrs``, ``xattrs``) is reset, and the file is parsed
    with this object's ``start_element`` / ``end_element`` callbacks.

    :param sFile: path of the XML file to read.
    :returns: whatever ``self.__GenerateXML()`` returns.
    :raises IOError: if the file cannot be opened.
    :raises expat.ExpatError: if the file is not well-formed XML.
    """
    ## Open the file
    from xml.parsers import expat
    self.xml = expat.ParserCreate()
    # The file is only read, so open it read-only (``'r+'`` needlessly
    # required write permission) and in binary mode, which is what expat
    # expects from ``read()``.
    oFile = open(sFile, 'rb')
    try:
        ## Handlers and values
        self.mode = ''
        self.lattrs = []
        self.xattrs = {}
        self.xml.StartElementHandler = self.start_element
        self.xml.EndElementHandler = self.end_element
        ## Parse
        self.xml.ParseFile(oFile)
    finally:
        oFile.close()
    ## Save its values
    return self.__GenerateXML()
def __init__(self, source, parseErrorClass=ValueError): self.source = source self.parseErrorClass = parseErrorClass if hasattr(source, "name"): self.inputName = source.name elif hasattr(source, "getvalue"): self.inputName = repr(source.getvalue())[1:-1] else: self.inputName = repr(source)[:34] self.parser = expat.ParserCreate() self.parser.buffer_text = True self.lastLine, self.lastColumn = 1, 0 # We want ordered attributes for forcing attribute names to be # byte strings. self.parser.returns_unicode = True self.evBuf = collections.deque() self.parser.StartElementHandler = self._startElement self.parser.EndElementHandler = self._endElement self.parser.CharacterDataHandler = self._characters
def toDict(inputSource, expat=expat, **kwargs):
    """Parse *inputSource* and return the handler's ``tagItem``.

    :param inputSource: a file-like object (anything with ``read``) or the
        XML text itself.
    :param expat: module providing ``ParserCreate``; defaults to the
        module-level ``expat``.
    :param kwargs: forwarded to ``_XMLToDictHandler``.
    :returns: ``contentHandler.tagItem`` after parsing.
    """
    contentHandler = _XMLToDictHandler(**kwargs)
    xmlParser = expat.ParserCreate()
    # Not every parser implementation offers ordered attributes.
    try:
        xmlParser.ordered_attributes = True
    except AttributeError:
        pass
    xmlParser.StartElementHandler = contentHandler.startElement
    xmlParser.EndElementHandler = contentHandler.endElement
    xmlParser.CharacterDataHandler = contentHandler.characters
    try:
        xmlParser.ParseFile(inputSource)
    except TypeError as TE:
        print("T: " + str(TE))
        # pyexpat reports an object without ``read`` via TypeError, so the
        # "input is really a string" fallback has to run here as well;
        # before, such input was silently left unparsed.
        if not hasattr(inputSource, "read"):
            xmlParser.Parse(inputSource, True)
    except AttributeError as AE:
        print("A: " + str(AE))
        xmlParser.Parse(inputSource, True)
    return contentHandler.tagItem
def __init__(self, html=0, target=None, encoding=None): try: from xml.parsers import expat except ImportError: try: import pyexpat as expat except ImportError: raise ImportError( "No module named expat; use SimpleXMLTreeBuilder instead") parser = expat.ParserCreate(encoding, "}") if target is None: target = TreeBuilder() # underscored names are provided for compatibility only self.parser = self._parser = parser self.target = self._target = target self._error = expat.error self._names = {} # name memo cache # main callbacks parser.DefaultHandlerExpand = self._default if hasattr(target, 'start'): parser.StartElementHandler = self._start if hasattr(target, 'end'): parser.EndElementHandler = self._end if hasattr(target, 'data'): parser.CharacterDataHandler = target.data # miscellaneous callbacks if hasattr(target, 'comment'): parser.CommentHandler = target.comment if hasattr(target, 'pi'): parser.ProcessingInstructionHandler = target.pi # Configure pyexpat: buffering, new-style attribute handling. parser.buffer_text = 1 parser.ordered_attributes = 1 parser.specified_attributes = 1 self._doctype = None self.entity = {} try: self.version = "Expat %d.%d.%d" % expat.version_info except AttributeError: pass # unknown
def _RetrieverCallback(self, code, crc):
    '''
    Callback from the retriever.

    On a retriever error the user callback is invoked with that error
    code.  Otherwise the retriever is shut down and the downloaded file is
    parsed; the user callback then receives ``code`` on success or ``-1``
    if parsing failed.  The callback is invoked exactly once per call.

    @type code: integer
    @param code: Result code from the retriever.
    @type crc: string
    @param crc: CRC32 checksum of the retrieved file (not used here).
    '''
    # Any errors?
    if code != Retriever.Result_OK:
        # Failed.
        if self.Callback:
            self.Callback(code)
        return

    # Shut down the retriever.
    self.RetrieverInstance.Shutdown()

    # Try parsing the file.
    try:
        # Opened inside the outer try so that a missing or unreadable
        # file is reported like any other parse failure; the inner
        # try/finally only runs once the file really is open.
        fileobj = open(self.TempFile, "rb")
        try:
            parser = expat.ParserCreate()
            parser.StartElementHandler = self._StartElementHandler
            parser.EndElementHandler = self._EndElementHandler
            parser.ParseFile(fileobj)
        finally:
            fileobj.close()
    except Exception:
        # failed, call the callback with an error code
        msg = "Error while parsing the metaserver data:\n"
        msg += traceback.format_exc()
        mainlog.error(msg)
        if self.Callback:
            self.Callback(-1)
        # Do not fall through and additionally report success.
        return

    # Call the callback.
    if self.Callback:
        self.Callback(code)
def __init__(self, source, filename=None, encoding=None): """Initialize the parser for the given XML input. :param source: the XML text as a file-like object :param filename: the name of the file, if appropriate :param encoding: the encoding of the file; if not specified, the encoding is assumed to be ASCII, UTF-8, or UTF-16, or whatever the encoding specified in the XML declaration (if any) """ self.source = source self.filename = filename # Setup the Expat parser parser = expat.ParserCreate(encoding, '}') parser.buffer_text = True parser.returns_unicode = True parser.ordered_attributes = True parser.StartElementHandler = self._handle_start parser.EndElementHandler = self._handle_end parser.CharacterDataHandler = self._handle_data parser.StartDoctypeDeclHandler = self._handle_doctype parser.StartNamespaceDeclHandler = self._handle_start_ns parser.EndNamespaceDeclHandler = self._handle_end_ns parser.StartCdataSectionHandler = self._handle_start_cdata parser.EndCdataSectionHandler = self._handle_end_cdata parser.ProcessingInstructionHandler = self._handle_pi parser.XmlDeclHandler = self._handle_xml_decl parser.CommentHandler = self._handle_comment # Tell Expat that we'll handle non-XML entities ourselves # (in _handle_other) parser.DefaultHandler = self._handle_other parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.UseForeignDTD() parser.ExternalEntityRefHandler = self._build_foreign self.expat = parser self._queue = []
def _fast_iterparse(fd, buffersize=2**10):
    """Incrementally parse XML and yield one tuple per start/end tag.

    ``fd`` is either a callable taking a byte count or an object whose
    ``read`` method does; ``buffersize`` is the amount requested per call.

    Yielded tuples are ``(True, name, attrs, (line, column))`` for a start
    tag and ``(False, name, text, (line, column))`` for an end tag, where
    ``text`` is the stripped character data collected since the most
    recent start tag.
    """
    from xml.parsers import expat

    read = fd if six.callable(fd) else fd.read

    pending = []
    chunks = []

    def on_start(name, attr):
        pending.append((True, name, attr,
                        (parser.CurrentLineNumber,
                         parser.CurrentColumnNumber)))
        del chunks[:]

    def on_end(name):
        pending.append((False, name, ''.join(chunks).strip(),
                        (parser.CurrentLineNumber,
                         parser.CurrentColumnNumber)))

    parser = expat.ParserCreate()
    if six.PY2:
        parser.returns_unicode = True
    parser.specified_attributes = True
    parser.StartElementHandler = on_start
    parser.EndElementHandler = on_end
    parser.CharacterDataHandler = chunks.append
    feed = parser.Parse

    block = read(buffersize)
    while block:
        feed(block, False)
        for event in pending:
            yield event
        del pending[:]
        block = read(buffersize)

    # Signal end of input and hand out whatever that produced.
    feed('', True)
    for event in pending:
        yield event
def __init__(self, data):
    """Parse the XML contained in *data*.

    Everything before the first ``<`` and after the last ``>`` is cut off
    before parsing; the parser's return value is stored in
    ``self.status``.  On any failure the response is shown in a message
    box and the exception is re-raised.
    """
    self.lists = []
    self.nodeStack = [self.lists]

    # Parse XML
    xml_parser = expat.ParserCreate()
    xml_parser.StartElementHandler = self.startElement
    xml_parser.EndElementHandler = self.endElement
    xml_parser.CharacterDataHandler = self.characterData

    try:
        first = data.find('<')
        if first == -1:
            raise Exception(_('Invalid XML response: %s') % str(data))
        last = data.rfind('>')
        if last == -1:
            raise Exception(_('Invalid XML response: %s') % str(data))
        document = data[first:last + 1]
        self.status = xml_parser.Parse(document, 1)
    except:
        wx.MessageBox(Utils.html2txt(data), _('Error'), wx.ICON_ERROR)
        raise
def parseXmlDump(self, src_filename, dst_filename):
    """Runs the filter.

    The destination receives ``<pages>``, then whatever the element and
    character-data callbacks write while the source is parsed, then
    ``</pages>``.  The closing tag is written even if parsing fails, and
    both files are always closed.

    Args:
      src_filename: The filename of the source XML dump to be parsed.
      dst_filename: The filename of the destination XML dump to be written.

    Raises:
      IOError: if either file cannot be opened.
      expat.ExpatError: if the source is not well-formed XML.
    """
    parser = expat.ParserCreate()
    parser.StartElementHandler = self._startElement
    parser.EndElementHandler = self._endElement
    parser.CharacterDataHandler = self._charData

    # Binary mode: ParseFile needs read() to return bytes.
    self._src = open(src_filename, 'rb')
    try:
        self._dst = open(dst_filename, 'w')
        try:
            self._dst.write('<pages>\n')
            try:
                parser.ParseFile(self._src)
            finally:
                self._dst.write('</pages>')
        finally:
            self._dst.close()
    finally:
        # Reached even when the destination could not be opened, so the
        # source handle no longer leaks.
        self._src.close()
class NamespaceSeparatorTest(unittest.TestCase): def test_legal(self): # Tests that make sure we get errors when the namespace_separator value # is illegal, and that we don't for good values: expat.ParserCreate() expat.ParserCreate(namespace_separator=None) expat.ParserCreate(namespace_separator=' ') def test_illegal(self): try: expat.ParserCreate(namespace_separator=42) self.fail() except TypeError, e: self.assertEquals(str(e), 'ParserCreate() argument 2 must be string or None, not int') try: expat.ParserCreate(namespace_separator='too long') self.fail() except ValueError, e: self.assertEquals(str(e), 'namespace_separator must be at most one character, omitted, or None')
def __init__(self, html=0, target=None, encoding=None): try: from xml.parsers import expat except ImportError: try: import pyexpat as expat except ImportError: raise ImportError( "No module named expat; use SimpleXMLTreeBuilder instead" ) parser = expat.ParserCreate(encoding, "}") if target is None: target = OriginalTreeBuilder(ET._Element_Py) self.parser = self._parser = parser self.target = self._target = target self._error = expat.error self._names = {} parser.DefaultHandlerExpand = self._default if hasattr(target, 'start'): parser.StartElementHandler = self._start if hasattr(target, 'end'): parser.EndElementHandler = self._end if hasattr(target, 'data'): parser.CharacterDataHandler = target.data if hasattr(target, 'comment'): parser.CommentHandler = target.comment if hasattr(target, 'pi'): parser.ProcessingInstructionHandler = target.pi parser.buffer_text = 1 parser.ordered_attributes = 1 parser.specified_attributes = 1 self._doctype = None self.entity = {} try: self.version = "Expat %d.%d.%d" % expat.version_info except AttributeError: pass
def test_exception(self):
    # Check that an exception raised by a start-element handler propagates
    # out of Parse() and that its traceback has the expected three frames.
    parser = expat.ParserCreate()
    # NOTE(review): self.StartElementHandler is defined outside this view;
    # the assertions below expect it to raise RuntimeError whose first
    # argument is the element name.
    parser.StartElementHandler = self.StartElementHandler
    try:
        parser.Parse(b"<a><b><c/></b></a>", True)
        # Reaching this line means the handler's exception was lost.
        self.fail()
    except RuntimeError as e:
        self.assertEqual(e.args[0], 'a',
                         "Expected RuntimeError for element 'a', but" + \
                         " found %r" % e.args[0])
        # Check that the traceback contains the relevant line in pyexpat.c
        entries = traceback.extract_tb(e.__traceback__)
        # Expected frames: this test, the pyexpat.c frame, the handler.
        self.assertEqual(len(entries), 3)
        self.check_traceback_entry(entries[0],
                                   "test_pyexpat.py", "test_exception")
        self.check_traceback_entry(entries[1],
                                   "pyexpat.c", "StartElement")
        self.check_traceback_entry(entries[2],
                                   "test_pyexpat.py", "StartElementHandler")
        # The source line of the C frame is only checked when running from
        # a Python build and not on Windows/ARM, Emscripten or WASI --
        # presumably the only setups where pyexpat.c can be read back;
        # TODO confirm.
        if (sysconfig.is_python_build()
            and not (sys.platform == 'win32' and platform.machine() == 'ARM')
            and not is_emscripten
            and not is_wasi
        ):
            self.assertIn('call_with_frame("StartElement"', entries[1][3])
def __init__(self): # toBrowse can contain GET and POST resources self.to_browse = [] # browsed contains only GET resources self.browsed_links = [] # forms contains only POST resources self.browsed_forms = [] self.uploads = [] self.headers = {} self.root_url = "" self.tag = "" self.array = None self.method = "" self.path = "" self.encoding = "" self.referer = "" self.get_params = [] self.post_params = [] self.file_params = [] self._parser = expat.ParserCreate("UTF-8")
def __init__(self, result_path, nitrate=None): self.start_element_map = { 'testsuite': self.testsuite_start, 'testcase': self.testcase_start, } self.end_element_map = { 'testcase': self.testcase_end, 'error': self.error_end, 'skipped': self.skipped_end, 'failure': self.error_end, 'system-out': self.system_out_end } self.test = None self.text = None self.data_reset() self.nitrate = nitrate self.parser = expat.ParserCreate() self.parser.StartElementHandler = self.start self.parser.EndElementHandler = self.end self.parser.CharacterDataHandler = self.data with open(result_path) as result_file: logging.debug("Reading results file: %s", result_path) self.parser.ParseFile(result_file)
def createParser(self):
    """Create a parser wired to the current content handler.

    expat is used when it is available; otherwise the bundled
    ``SelfmadeXMLParser`` is used instead.

    :returns: a parser whose ``StartElementHandler``,
        ``EndElementHandler`` and ``CharacterDataHandler`` are bound to
        the content handler.
    :raises ValueError: if no content handler has been set.
    """
    content_handler = self.getContentHandler()
    if content_handler is None:
        # String exceptions (``raise 'text'``) are a TypeError on Python
        # 2.6 and later, so raise a real exception with the same message.
        raise ValueError('No content handler set.')

    try:
        import xml.parsers.expat as expat
        parser = expat.ParserCreate()
    except Exception:
        # Best effort: any failure to get an expat parser falls back to
        # the pure-Python one.  KeyboardInterrupt and SystemExit are no
        # longer swallowed.
        from xmlparser import SelfmadeXMLParser as Parser
        parser = Parser()

    parser.StartElementHandler = content_handler.startElementHandler
    parser.EndElementHandler = content_handler.endElementHandler
    parser.CharacterDataHandler = content_handler.charDataHandler
    return parser
def __init__(self, html=0, target=None, encoding=None): try: from xml.parsers import expat except ImportError: raise ImportError( "No module named expat; use SimpleXMLTreeBuilder instead") parser = expat.ParserCreate(encoding, "}") if target is None: target = TreeBuilder() # underscored names are provided for compatibility only self.parser = self._parser = parser self.target = self._target = target self._error = expat.error self._names = {} # name memo cache # callbacks parser.DefaultHandlerExpand = self._default parser.StartElementHandler = self._start parser.EndElementHandler = self._end parser.CharacterDataHandler = self._data # let expat do the buffering, if supported try: self._parser.buffer_text = 1 except AttributeError: pass # use new-style attribute handling, if supported try: self._parser.ordered_attributes = 1 self._parser.specified_attributes = 1 parser.StartElementHandler = self._start_list except AttributeError: pass self._doctype = None self.entity = {} try: self.version = "Expat %d.%d.%d" % expat.version_info except AttributeError: pass # unknown