def test_serializer_doctype(self):
    # A doctype passed as a serializer option is written out even though
    # the stream itself carries no events.
    expected = ('<!DOCTYPE html PUBLIC '
                '"-//W3C//DTD HTML 4.01//EN" '
                '"http://www.w3.org/TR/html4/strict.dtd">\n')
    rendered = Stream([]).render(XMLSerializer, doctype=DocType.HTML_STRICT)
    self.assertEqual(expected, rendered)
def test_doctype_in_stream_no_sysid(self):
    # DOCTYPE event with a public identifier but no system identifier.
    doctype = ('html', '-//W3C//DTD HTML 4.01//EN', None)
    events = [(Stream.DOCTYPE, doctype, (None, -1, -1))]
    rendered = Stream(events).render(XMLSerializer)
    self.assertEqual('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n',
                     rendered)
def test_doctype_in_stream(self):
    # DOCTYPE event carrying both the public and the system identifier.
    events = [(Stream.DOCTYPE, DocType.HTML_STRICT, (None, -1, -1))]
    expected = ('<!DOCTYPE html PUBLIC '
                '"-//W3C//DTD HTML 4.01//EN" '
                '"http://www.w3.org/TR/html4/strict.dtd">\n')
    rendered = Stream(events).render(XMLSerializer)
    self.assertEqual(expected, rendered)
def test_doctype_in_stream_no_pubid(self):
    # DOCTYPE event with only a system identifier: rendered as SYSTEM.
    doctype = ('html', None, 'http://www.w3.org/TR/html4/strict.dtd')
    events = [(Stream.DOCTYPE, doctype, (None, -1, -1))]
    expected = ('<!DOCTYPE html SYSTEM '
                '"http://www.w3.org/TR/html4/strict.dtd">\n')
    rendered = Stream(events).render(XMLSerializer)
    self.assertEqual(expected, rendered)
def generate(self, *args, **kwargs):
    """Run the template against context data and return the result stream.

    Keyword arguments become the context data visible to the template.

    At most one positional argument may be given.  When it is a `Context`
    instance it is used as-is and the keyword arguments are not consulted;
    when it is ``None`` a fresh context is built from the keyword
    arguments, exactly as if no positional argument had been passed.  The
    positional form exists for internal processing.

    :return: a markup event stream representing the result of applying
             the template to the context data
    """
    ctxt = None
    if args:
        assert len(args) == 1
        ctxt = args[0]
    if ctxt is None:
        ctxt = Context(**kwargs)
    elif args:
        assert isinstance(ctxt, Context)

    # Each filter wraps the stream produced by the previous one.
    events = self.stream
    for apply_filter in self.filters:
        events = apply_filter(iter(events), ctxt)
    return Stream(events)
def test_multiple_bound_namespaces(self):
    # Two sibling elements each bind the same prefix inside their own
    # START_NS/END_NS pair, so each one must redeclare ``xmlns:x``.
    stream = Stream([
        (Stream.START, (QName('div'), Attrs()), (None, -1, -1)),
        (Stream.TEXT, '\n ', (None, -1, -1)),
        (Stream.START_NS, ('x', 'http://example.org/'), (None, -1, -1)),
        (Stream.START, (QName('http://example.org/}p'), Attrs()),
         (None, -1, -1)),
        (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
        (Stream.END_NS, 'x', (None, -1, -1)),
        (Stream.TEXT, '\n ', (None, -1, -1)),
        (Stream.START_NS, ('x', 'http://example.org/'), (None, -1, -1)),
        (Stream.START, (QName('http://example.org/}p'), Attrs()),
         (None, -1, -1)),
        (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
        (Stream.END_NS, 'x', (None, -1, -1)),
        (Stream.TEXT, '\n ', (None, -1, -1)),
        (Stream.END, QName('div'), (None, -1, -1)),
    ])
    output = stream.render(XMLSerializer)
    # The expected markup uses explicit escapes so that its whitespace
    # mirrors the TEXT events above exactly.
    self.assertEqual('<div>\n'
                     ' <x:p xmlns:x="http://example.org/"/>\n'
                     ' <x:p xmlns:x="http://example.org/"/>\n'
                     ' </div>', output)
def test_nested_default_namespaces(self):
    # Children re-bind the default namespace their parent already
    # declared; the expected output carries ``xmlns`` on the outer
    # element only.
    stream = Stream([
        (Stream.START_NS, ('', 'http://example.org/'), (None, -1, -1)),
        (Stream.START, (QName('http://example.org/}div'), Attrs()),
         (None, -1, -1)),
        (Stream.TEXT, '\n ', (None, -1, -1)),
        (Stream.START_NS, ('', 'http://example.org/'), (None, -1, -1)),
        (Stream.START, (QName('http://example.org/}p'), Attrs()),
         (None, -1, -1)),
        (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
        (Stream.END_NS, '', (None, -1, -1)),
        (Stream.TEXT, '\n ', (None, -1, -1)),
        (Stream.START_NS, ('', 'http://example.org/'), (None, -1, -1)),
        (Stream.START, (QName('http://example.org/}p'), Attrs()),
         (None, -1, -1)),
        (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
        (Stream.END_NS, '', (None, -1, -1)),
        (Stream.TEXT, '\n ', (None, -1, -1)),
        (Stream.END, QName('http://example.org/}div'), (None, -1, -1)),
        (Stream.END_NS, '', (None, -1, -1))
    ])
    output = stream.render(XMLSerializer)
    # The expected markup uses explicit escapes so that its whitespace
    # mirrors the TEXT events above exactly.
    self.assertEqual('<div xmlns="http://example.org/">\n'
                     ' <p/>\n'
                     ' <p/>\n'
                     ' </div>', output)
def select(self, stream, namespaces=None, variables=None):
    """Return the part of `stream` that this path matches.

    The result is itself a stream; it is empty when nothing matches.

    >>> from genshi.input import XML
    >>> xml = XML('<root><elem><child>Text</child></elem></root>')

    >>> print Path('.//child').select(xml)
    <child>Text</child>

    >>> print Path('.//child/text()').select(xml)
    Text

    :param stream: the stream to select from
    :param namespaces: (optional) a mapping of namespace prefixes to URIs
    :param variables: (optional) a mapping of variable names to values
    :return: the substream matching the path, or an empty stream
    """
    if namespaces is None:
        namespaces = {}
    if variables is None:
        variables = {}
    events = iter(stream)

    def _matches():
        check = self.test()
        for event in events:
            outcome = check(event, namespaces, variables)
            if outcome is True:
                yield event
                if event[0] is START:
                    # A matched element is emitted whole: pull its
                    # descendants straight from the shared iterator until
                    # the matching END, while still feeding each one to
                    # the test function in update-only mode.
                    nesting = 1
                    while nesting > 0:
                        inner = events.next()
                        inner_kind = inner[0]
                        if inner_kind is START:
                            nesting += 1
                        elif inner_kind is END:
                            nesting -= 1
                        yield inner
                        check(inner, namespaces, variables, updateonly=True)
            elif outcome:
                # Any other truthy result is itself the event to emit.
                yield outcome

    return Stream(_matches())
def XML(text):
    """Parse XML source text into a markup stream.

    The events produced by `XMLParser` are collected into a list before
    being wrapped, so, unlike with `XMLParser` itself, the returned stream
    is reusable and can be iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print xml
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print xml.select('elem')
    <elem>Foo</elem><elem>Bar</elem>
    >>> print xml.select('elem/text()')
    FooBar

    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
def HTML(text, encoding='utf-8'):
    """Parse HTML source text into a markup stream.

    The events produced by `HTMLParser` are collected into a list before
    being wrapped, so, unlike with `HTMLParser` itself, the returned stream
    is reusable and can be iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>')
    >>> print html
    <body><h1>Foo</h1></body>
    >>> print html.select('h1')
    <h1>Foo</h1>
    >>> print html.select('h1/text()')
    Foo

    :param text: the HTML source
    :param encoding: the encoding handed on to `HTMLParser`
    :return: the parsed event stream
    :raises ParseError: if the HTML text is not well-formed, and error
                        recovery fails
    """
    parser = HTMLParser(StringIO(text), encoding=encoding)
    events = list(parser)
    return Stream(events)
class XMLParser(object): """Generator-based XML parser based on roughly equivalent code in Kid/ElementTree. The parsing is initiated by iterating over the parser object: >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) >>> for kind, data, pos in parser: ... print kind, data START (QName(u'root'), Attrs([(QName(u'id'), u'2')])) START (QName(u'child'), Attrs()) TEXT Foo END child END root """ _entitydefs = [ '<!ENTITY %s "&#%d;">' % (name, value) for name, value in htmlentitydefs.name2codepoint.items() ] _external_dtd = '\n'.join(_entitydefs) def __init__(self, source, filename=None, encoding=None): """Initialize the parser for the given XML input. :param source: the XML text as a file-like object :param filename: the name of the file, if appropriate :param encoding: the encoding of the file; if not specified, the encoding is assumed to be ASCII, UTF-8, or UTF-16, or whatever the encoding specified in the XML declaration (if any) """ self.source = source self.filename = filename # Setup the Expat parser parser = expat.ParserCreate(encoding, '}') parser.buffer_text = True parser.returns_unicode = True parser.ordered_attributes = True parser.StartElementHandler = self._handle_start parser.EndElementHandler = self._handle_end parser.CharacterDataHandler = self._handle_data parser.StartDoctypeDeclHandler = self._handle_doctype parser.StartNamespaceDeclHandler = self._handle_start_ns parser.EndNamespaceDeclHandler = self._handle_end_ns parser.StartCdataSectionHandler = self._handle_start_cdata parser.EndCdataSectionHandler = self._handle_end_cdata parser.ProcessingInstructionHandler = self._handle_pi parser.XmlDeclHandler = self._handle_xml_decl parser.CommentHandler = self._handle_comment # Tell Expat that we'll handle non-XML entities ourselves # (in _handle_other) parser.DefaultHandler = self._handle_other parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.UseForeignDTD() parser.ExternalEntityRefHandler = 
self._build_foreign # Location reporting is only support in Python >= 2.4 if not hasattr(parser, 'CurrentLineNumber'): self._getpos = self._getpos_unknown self.expat = parser self._queue = [] def parse(self): """Generator that parses the XML source, yielding markup events. :return: a markup event stream :raises ParseError: if the XML text is not well formed """ def _generate(): try: bufsize = 4 * 1024 # 4K done = False while 1: while not done and len(self._queue) == 0: data = self.source.read(bufsize) if data == '': # end of data if hasattr(self, 'expat'): self.expat.Parse('', True) del self.expat # get rid of circular references done = True else: if isinstance(data, unicode): data = data.encode('utf-8') self.expat.Parse(data, False) for event in self._queue: yield event self._queue = [] if done: break except expat.ExpatError, e: msg = str(e) raise ParseError(msg, self.filename, e.lineno, e.offset) return Stream(_generate()).filter(_coalesce)
class HTMLParser(html.HTMLParser, object): """Parser for HTML input based on the Python `HTMLParser` module. This class provides the same interface for generating stream events as `XMLParser`, and attempts to automatically balance tags. The parsing is initiated by iterating over the parser object: >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) >>> for kind, data, pos in parser: ... print kind, data START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')])) START (QName(u'li'), Attrs()) TEXT Foo END li END ul """ _EMPTY_ELEMS = frozenset([ 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param' ]) def __init__(self, source, filename=None, encoding='utf-8'): """Initialize the parser for the given HTML input. :param source: the HTML text as a file-like object :param filename: the name of the file, if known :param filename: encoding of the file; ignored if the input is unicode """ html.HTMLParser.__init__(self) self.source = source self.filename = filename self.encoding = encoding self._queue = [] self._open_tags = [] def parse(self): """Generator that parses the HTML source, yielding markup events. :return: a markup event stream :raises ParseError: if the HTML text is not well formed """ def _generate(): try: bufsize = 4 * 1024 # 4K done = False while 1: while not done and len(self._queue) == 0: data = self.source.read(bufsize) if data == '': # end of data self.close() done = True else: self.feed(data) for kind, data, pos in self._queue: yield kind, data, pos self._queue = [] if done: open_tags = self._open_tags open_tags.reverse() for tag in open_tags: yield END, QName(tag), pos break except html.HTMLParseError, e: msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) raise ParseError(msg, self.filename, e.lineno, e.offset) return Stream(_generate()).filter(_coalesce)
def select(path):
    """Apply `path` to the enclosing scope's content, passing along the
    enclosing namespaces and context."""
    content_stream = Stream(content)
    return content_stream.select(path, namespaces, ctxt)
def test_processing_instruction(self):
    # A PI event is written as <?target data?>.
    events = [(Stream.PI, ('python', 'x = 2'), (None, -1, -1))]
    rendered = Stream(events).render(XMLSerializer)
    self.assertEqual('<?python x = 2?>', rendered)
def test_comment(self):
    # A COMMENT event is wrapped in <!-- and -->.
    events = [(Stream.COMMENT, 'foo bar', (None, -1, -1))]
    rendered = Stream(events).render(XMLSerializer)
    self.assertEqual('<!--foo bar-->', rendered)
def test_doctype_in_stream_no_pubid_or_sysid(self):
    # DOCTYPE event with neither identifier: only the name is rendered.
    doctype = ('html', None, None)
    events = [(Stream.DOCTYPE, doctype, (None, -1, -1))]
    rendered = Stream(events).render(XMLSerializer)
    self.assertEqual('<!DOCTYPE html>\n', rendered)