Exemple #1
0
 def test_serializer_doctype(self):
     # Rendering an empty stream with an explicit ``doctype`` argument is
     # expected to produce nothing but the HTML 4.01 strict declaration.
     expected = ('<!DOCTYPE html PUBLIC '
                 '"-//W3C//DTD HTML 4.01//EN" '
                 '"http://www.w3.org/TR/html4/strict.dtd">\n')
     empty = Stream([])
     rendered = empty.render(XMLSerializer, doctype=DocType.HTML_STRICT)
     self.assertEqual(expected, rendered)
Exemple #2
0
 def test_doctype_in_stream_no_sysid(self):
     # A DOCTYPE event with a public ID but ``None`` as system ID should be
     # written with the PUBLIC identifier only.
     doctype_event = (
         Stream.DOCTYPE,
         ('html', '-//W3C//DTD HTML 4.01//EN', None),
         (None, -1, -1),
     )
     rendered = Stream([doctype_event]).render(XMLSerializer)
     self.assertEqual('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n',
                      rendered)
Exemple #3
0
 def test_doctype_in_stream(self):
     # A DOCTYPE event embedded in the stream itself should be serialized
     # with both its public and system identifiers.
     expected = ('<!DOCTYPE html PUBLIC '
                 '"-//W3C//DTD HTML 4.01//EN" '
                 '"http://www.w3.org/TR/html4/strict.dtd">\n')
     doctype_event = (Stream.DOCTYPE, DocType.HTML_STRICT, (None, -1, -1))
     rendered = Stream([doctype_event]).render(XMLSerializer)
     self.assertEqual(expected, rendered)
Exemple #4
0
 def test_doctype_in_stream_no_pubid(self):
     # A DOCTYPE event with ``None`` as public ID but a system ID should be
     # written using the SYSTEM keyword.
     expected = ('<!DOCTYPE html SYSTEM '
                 '"http://www.w3.org/TR/html4/strict.dtd">\n')
     doctype_event = (
         Stream.DOCTYPE,
         ('html', None, 'http://www.w3.org/TR/html4/strict.dtd'),
         (None, -1, -1),
     )
     rendered = Stream([doctype_event]).render(XMLSerializer)
     self.assertEqual(expected, rendered)
Exemple #5
0
    def generate(self, *args, **kwargs):
        """Run the template against context data and return the result.

        Keyword arguments become the template's context data.

        At most one positional argument may be given. It must be either a
        `Context` instance, in which case the keyword arguments are not
        used, or ``None``, in which case a fresh `Context` is built from the
        keyword arguments just as if no positional argument had been passed.
        The positional form is used for internal processing.

        :return: a markup event stream representing the result of applying
                 the template to the context data.
        """
        ctxt = None
        if args:
            assert len(args) == 1
            ctxt = args[0]

        if ctxt is None:
            # No usable context was handed in: build one from the keywords.
            ctxt = Context(**kwargs)
        else:
            assert isinstance(ctxt, Context)

        # Chain the filters: each one wraps an iterator over the output of
        # the previous one and receives the shared context.
        events = self.stream
        for apply_filter in self.filters:
            events = apply_filter(iter(events), ctxt)
        return Stream(events)
Exemple #6
0
 def test_multiple_bound_namespaces(self):
     # Two sibling elements each bind the prefix ``x`` to the same namespace
     # in their own START_NS/END_NS pair; each is expected to carry its own
     # ``xmlns:x`` declaration in the output.
     stream = Stream([
         (Stream.START, (QName('div'), Attrs()), (None, -1, -1)),
         (Stream.TEXT, '\n          ', (None, -1, -1)),
         (Stream.START_NS, ('x', 'http://example.org/'), (None, -1, -1)),
         (Stream.START, (QName('http://example.org/}p'), Attrs()), (None, -1, -1)),
         (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
         (Stream.END_NS, 'x', (None, -1, -1)),
         (Stream.TEXT, '\n          ', (None, -1, -1)),
         (Stream.START_NS, ('x', 'http://example.org/'), (None, -1, -1)),
         (Stream.START, (QName('http://example.org/}p'), Attrs()), (None, -1, -1)),
         (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
         (Stream.END_NS, 'x', (None, -1, -1)),
         (Stream.TEXT, '\n        ', (None, -1, -1)),
         (Stream.END, QName('div'), (None, -1, -1)),
     ])
     output = stream.render(XMLSerializer)
     # The expected text is written with explicit escapes so that its
     # whitespace always mirrors the TEXT events above (10 and 8 spaces),
     # no matter how this method itself is indented in the file.
     self.assertEqual(
         '<div>\n'
         '          <x:p xmlns:x="http://example.org/"/>\n'
         '          <x:p xmlns:x="http://example.org/"/>\n'
         '        </div>',
         output)
Exemple #7
0
 def test_nested_default_namespaces(self):
     # The default namespace is bound on the outer element and re-bound to
     # the same URI around each child; only the outermost element is
     # expected to carry the ``xmlns`` declaration in the output.
     stream = Stream([
         (Stream.START_NS, ('', 'http://example.org/'), (None, -1, -1)),
         (Stream.START, (QName('http://example.org/}div'), Attrs()), (None, -1, -1)),
         (Stream.TEXT, '\n          ', (None, -1, -1)),
         (Stream.START_NS, ('', 'http://example.org/'), (None, -1, -1)),
         (Stream.START, (QName('http://example.org/}p'), Attrs()), (None, -1, -1)),
         (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
         (Stream.END_NS, '', (None, -1, -1)),
         (Stream.TEXT, '\n          ', (None, -1, -1)),
         (Stream.START_NS, ('', 'http://example.org/'), (None, -1, -1)),
         (Stream.START, (QName('http://example.org/}p'), Attrs()), (None, -1, -1)),
         (Stream.END, QName('http://example.org/}p'), (None, -1, -1)),
         (Stream.END_NS, '', (None, -1, -1)),
         (Stream.TEXT, '\n        ', (None, -1, -1)),
         (Stream.END, QName('http://example.org/}div'), (None, -1, -1)),
         (Stream.END_NS, '', (None, -1, -1))
     ])
     output = stream.render(XMLSerializer)
     # The expected text is written with explicit escapes so that its
     # whitespace always mirrors the TEXT events above (10 and 8 spaces),
     # no matter how this method itself is indented in the file.
     self.assertEqual(
         '<div xmlns="http://example.org/">\n'
         '          <p/>\n'
         '          <p/>\n'
         '        </div>',
         output)
Exemple #8
0
    def select(self, stream, namespaces=None, variables=None):
        """Return the part of `stream` that is matched by this path.

        When nothing matches, the returned stream is empty.

        >>> from genshi.input import XML
        >>> xml = XML('<root><elem><child>Text</child></elem></root>')
        
        >>> print Path('.//child').select(xml)
        <child>Text</child>
        
        >>> print Path('.//child/text()').select(xml)
        Text
        
        :param stream: the stream to select from
        :param namespaces: (optional) a mapping of namespace prefixes to URIs
        :param variables: (optional) a mapping of variable names to values
        :return: the substream matching the path, or an empty stream
        """
        if namespaces is None:
            namespaces = {}
        if variables is None:
            variables = {}
        # A single shared iterator: the inner loop below pulls from it too,
        # so events consumed there are not seen again by the outer loop.
        events = iter(stream)

        def _matches():
            check = self.test()
            for event in events:
                outcome = check(event, namespaces, variables)
                if outcome is True:
                    yield event
                    if event[0] is not START:
                        continue
                    # The matched event opens an element: pass through every
                    # event up to and including the END that balances it.
                    depth = 1
                    while depth > 0:
                        inner = events.next()
                        kind = inner[0]
                        if kind is START:
                            depth += 1
                        elif kind is END:
                            depth -= 1
                        yield inner
                        # Still feed the event to the test function, in
                        # ``updateonly`` mode; its return value is unused.
                        check(inner, namespaces, variables, updateonly=True)
                elif outcome:
                    # Any other truthy result is yielded in place of the
                    # event itself.
                    yield outcome

        return Stream(_matches())
Exemple #9
0
def XML(text):
    """Parse an XML string into a markup stream.

    The events produced by `XMLParser` are collected into a list before
    being wrapped, so unlike the parser itself the returned stream can be
    iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print xml
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print xml.select('elem')
    <elem>Foo</elem><elem>Bar</elem>
    >>> print xml.select('elem/text()')
    FooBar
    
    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    source = StringIO(text)
    # Materialize all events up front; this is what makes the stream reusable.
    events = list(XMLParser(source))
    return Stream(events)
Exemple #10
0
def HTML(text, encoding='utf-8'):
    """Parse an HTML string into a markup stream.

    The events produced by `HTMLParser` are collected into a list before
    being wrapped, so unlike the parser itself the returned stream can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>')
    >>> print html
    <body><h1>Foo</h1></body>
    >>> print html.select('h1')
    <h1>Foo</h1>
    >>> print html.select('h1/text()')
    Foo
    
    :param text: the HTML source
    :param encoding: handed on to `HTMLParser` as its ``encoding`` argument
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    source = StringIO(text)
    # Materialize all events up front; this is what makes the stream reusable.
    events = list(HTMLParser(source, encoding=encoding))
    return Stream(events)
Exemple #11
0
class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.
    
    The parsing is initiated by iterating over the parser object:
    
    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (QName(u'root'), Attrs([(QName(u'id'), u'2')]))
    START (QName(u'child'), Attrs())
    TEXT Foo
    END child
    END root
    """

    _entitydefs = [
        '<!ENTITY %s "&#%d;">' % (name, value)
        for name, value in htmlentitydefs.name2codepoint.items()
    ]
    _external_dtd = '\n'.join(_entitydefs)

    def __init__(self, source, filename=None, encoding=None):
        """Initialize the parser for the given XML input.
        
        :param source: the XML text as a file-like object
        :param filename: the name of the file, if appropriate
        :param encoding: the encoding of the file; if not specified, the
                         encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                         whatever the encoding specified in the XML declaration
                         (if any)
        """
        self.source = source
        self.filename = filename

        # Setup the Expat parser
        parser = expat.ParserCreate(encoding, '}')
        parser.buffer_text = True
        parser.returns_unicode = True
        parser.ordered_attributes = True

        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.StartCdataSectionHandler = self._handle_start_cdata
        parser.EndCdataSectionHandler = self._handle_end_cdata
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.XmlDeclHandler = self._handle_xml_decl
        parser.CommentHandler = self._handle_comment

        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        parser.DefaultHandler = self._handle_other
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.UseForeignDTD()
        parser.ExternalEntityRefHandler = self._build_foreign

        # Location reporting is only support in Python >= 2.4
        if not hasattr(parser, 'CurrentLineNumber'):
            self._getpos = self._getpos_unknown

        self.expat = parser
        self._queue = []

    def parse(self):
        """Generator that parses the XML source, yielding markup events.
        
        :return: a markup event stream
        :raises ParseError: if the XML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while 1:
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if data == '':  # end of data
                            if hasattr(self, 'expat'):
                                self.expat.Parse('', True)
                                del self.expat  # get rid of circular references
                            done = True
                        else:
                            if isinstance(data, unicode):
                                data = data.encode('utf-8')
                            self.expat.Parse(data, False)
                    for event in self._queue:
                        yield event
                    self._queue = []
                    if done:
                        break
            except expat.ExpatError, e:
                msg = str(e)
                raise ParseError(msg, self.filename, e.lineno, e.offset)

        return Stream(_generate()).filter(_coalesce)
Exemple #12
0
class HTMLParser(html.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.
    
    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.
    
    The parsing is initiated by iterating over the parser object:
    
    >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')]))
    START (QName(u'li'), Attrs())
    TEXT Foo
    END li
    END ul
    """

    _EMPTY_ELEMS = frozenset([
        'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input',
        'isindex', 'link', 'meta', 'param'
    ])

    def __init__(self, source, filename=None, encoding='utf-8'):
        """Initialize the parser for the given HTML input.
        
        :param source: the HTML text as a file-like object
        :param filename: the name of the file, if known
        :param filename: encoding of the file; ignored if the input is unicode
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self.encoding = encoding
        self._queue = []
        self._open_tags = []

    def parse(self):
        """Generator that parses the HTML source, yielding markup events.
        
        :return: a markup event stream
        :raises ParseError: if the HTML text is not well formed
        """
        def _generate():
            try:
                bufsize = 4 * 1024  # 4K
                done = False
                while 1:
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if data == '':  # end of data
                            self.close()
                            done = True
                        else:
                            self.feed(data)
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if done:
                        open_tags = self._open_tags
                        open_tags.reverse()
                        for tag in open_tags:
                            yield END, QName(tag), pos
                        break
            except html.HTMLParseError, e:
                msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
                raise ParseError(msg, self.filename, e.lineno, e.offset)

        return Stream(_generate()).filter(_coalesce)
Exemple #13
0
 def select(path):
     # Wrap ``content`` in a Stream and run the path query against it,
     # forwarding ``namespaces`` and ``ctxt`` as the remaining positional
     # arguments; all three names come from the surrounding scope.
     wrapped = Stream(content)
     return wrapped.select(path, namespaces, ctxt)
Exemple #14
0
 def test_processing_instruction(self):
     # A PI event should be written as ``<?target data?>``.
     pi_event = (Stream.PI, ('python', 'x = 2'), (None, -1, -1))
     rendered = Stream([pi_event]).render(XMLSerializer)
     self.assertEqual('<?python x = 2?>', rendered)
Exemple #15
0
 def test_comment(self):
     # A COMMENT event should be wrapped in ``<!--`` and ``-->``.
     comment_event = (Stream.COMMENT, 'foo bar', (None, -1, -1))
     rendered = Stream([comment_event]).render(XMLSerializer)
     self.assertEqual('<!--foo bar-->', rendered)
Exemple #16
0
 def test_doctype_in_stream_no_pubid_or_sysid(self):
     # A DOCTYPE event with neither public nor system ID should be written
     # as a bare ``<!DOCTYPE name>`` declaration.
     doctype_event = (Stream.DOCTYPE, ('html', None, None), (None, -1, -1))
     rendered = Stream([doctype_event]).render(XMLSerializer)
     self.assertEqual('<!DOCTYPE html>\n', rendered)