Beispiel #1
0
    def _create_node(self, tag_name, parent_node, content='', **kwargs):
        """Create an element, fill it with parsed markup and attach it.

        A ``tag_name`` element is created on ``self.doc``, every keyword
        argument is set on it as an attribute, and the element is appended
        to ``parent_node``.

        ``content`` is the markup placed inside the element. Passing
        ``False`` leaves the element empty; any other falsy value is
        replaced by a single space before parsing.

        Returns the newly created element.
        """
        element = self.doc.createElement(tag_name)

        for attr_name in kwargs:
            attr_value = kwargs[attr_name]
            # Byte strings are decoded as UTF-8 before being set.
            if isinstance(attr_value, str):
                attr_value = attr_value.decode('utf8')
            element.setAttribute(attr_name, attr_value)

        if content is not False:
            markup = content or ' '

            # The cleaning helpers are fed a UTF-8 byte string.
            if isinstance(markup, unicode):
                markup = markup.encode('utf8')

            parsed = TableGenerator._clean_and_parse_html(
                html2xmlentities(markup))
            wrapper = parsed.getElementsByTagName('data')[0]

            # Iterate over a snapshot: minidom's appendChild detaches a node
            # from its previous parent, which shrinks wrapper.childNodes.
            for child in tuple(wrapper.childNodes):
                element.appendChild(child)

        parent_node.appendChild(element)

        return element
Beispiel #2
0
    def _create_node(self, tag_name, parent_node, content='', **kwargs):
        """Build a ``tag_name`` element and append it to ``parent_node``.

        Keyword arguments become attributes of the element. Unless
        ``content`` is ``False``, it is cleaned, parsed, and the resulting
        nodes are moved into the element; an empty ``content`` is treated
        as a single space.

        Returns the element that was created.
        """
        new_node = self.doc.createElement(tag_name)

        for name, raw in kwargs.items():
            # Decode byte strings as UTF-8; other values are passed as-is.
            decoded = raw.decode('utf8') if isinstance(raw, str) else raw
            new_node.setAttribute(name, decoded)

        wants_content = content is not False
        if wants_content:
            if not content:
                content = ' '

            source = content
            if isinstance(source, unicode):
                source = source.encode('utf8')
            source = html2xmlentities(source)

            data_nodes = TableGenerator._clean_and_parse_html(
                source).getElementsByTagName('data')
            # Copy the child list first, because appending a child to
            # new_node removes it from the parsed <data> element.
            children = list(data_nodes[0].childNodes)
            for child in children:
                new_node.appendChild(child)

        parent_node.appendChild(new_node)

        return new_node
Beispiel #3
0
    def parse(self):
        """Parse the HTML from ``self.get_html()`` and process the DOM.

        The markup is cleaned with BeautifulSoup (read as UTF-8), its HTML
        entities are converted with ``html2xmlentities``, and the result is
        parsed with minidom. The document is stored on ``self.dom`` before
        ``self.parse_dom()`` is called.
        """
        # Clean up the markup with BeautifulSoup.
        soup = BeautifulSoup(self.get_html(), fromEncoding='utf-8')

        # minidom cannot handle HTML entities, only XML entities.
        xml_source = html2xmlentities(str(soup))

        self.dom = minidom.parseString(xml_source)
        self.parse_dom()
Beispiel #4
0
    def _clean_and_parse_html(html):
        """Clean up an HTML fragment and parse it into a minidom document.

        ``html`` is decoded as UTF-8, passed through ``encode_htmlentities``,
        re-encoded, tidied with BeautifulSoup and converted with
        ``html2xmlentities``. The result is parsed inside a ``<data>`` root
        element.

        Returns the parsed document, or a ``<data>FEHLER</data>``
        placeholder document when parsing raises ``UnicodeEncodeError``.
        """
        decoded = html.decode('utf-8')
        markup = encode_htmlentities(decoded).encode('utf-8')
        markup = html2xmlentities(str(BeautifulSoup(markup)))

        # NOTE(review): only UnicodeEncodeError is handled here; any other
        # parser error propagates to the caller.
        try:
            return minidom.parseString('<data>%s</data>' % markup)
        except UnicodeEncodeError:
            return minidom.parseString('<data>FEHLER</data>')
Beispiel #5
0
    def __call__(self):
        """Parse the HTML from ``self.get_html()`` into a minidom document.

        The markup is wrapped in a ``<dummy>`` root element and its HTML
        entities are converted with ``html2xmlentities`` before parsing.
        If the parser rejects it with ``ExpatError``, the markup is cleaned
        with BeautifulSoup and parsed a second time; an error from that
        second attempt is not caught.
        """
        html = self.get_html()

        # minidom hates htmlentities, but loves xmlentities -.-
        html = '<dummy>%s</dummy>' % html
        html = html2xmlentities(html)

        # parse DOM
        try:
            dom = minidom.parseString(html)
        except ExpatError:
            # The exception object is not needed, so it is not bound. This
            # form is valid on both Python 2 and Python 3.
            # cleanup html with BeautifulSoup
            html = str(BeautifulSoup(html))
            dom = minidom.parseString(html)
Beispiel #6
0
    def _clean_and_parse_html(html):
        """Tidy an HTML fragment and parse it into a minidom document.

        The fragment goes through ``cleanup_standalone_html_tags``,
        ``encode_htmlentities`` (decoded from and re-encoded to UTF-8),
        BeautifulSoup and ``html2xmlentities``. The numeric entities
        ``&#60;`` and ``&#62;`` are then turned back into literal ``<`` and
        ``>`` and the result is parsed inside a ``<data>`` root element.

        Returns the parsed document, or a ``<data>FEHLER</data>``
        placeholder document when parsing raises ``UnicodeEncodeError`` or
        ``ExpatError``.
        """
        markup = cleanup_standalone_html_tags(html)
        markup = encode_htmlentities(markup.decode('utf-8')).encode('utf-8')
        markup = html2xmlentities(str(BeautifulSoup(markup)))

        # Restore the angle brackets that were turned into entities.
        markup = markup.replace('&#60;', '<').replace('&#62;', '>')

        try:
            return minidom.parseString('<data>%s</data>' % markup)
        except (UnicodeEncodeError, ExpatError):
            return minidom.parseString('<data>FEHLER</data>')
Beispiel #7
0
    def _clean_and_parse_html(html):
        """Clean up an HTML fragment and parse it into a minidom document.

        Processing order: ``cleanup_standalone_html_tags``,
        ``encode_htmlentities`` on the UTF-8 decoded text (re-encoded to
        UTF-8 afterwards), BeautifulSoup, ``html2xmlentities``. The numeric
        entities ``&#60;``, ``&#62;`` and ``&#34;`` are then replaced by
        the literal characters ``<``, ``>`` and ``"`` before the markup is
        parsed inside a ``<data>`` root element.

        Returns the parsed document, or a ``<data>FEHLER</data>``
        placeholder document when parsing raises ``UnicodeEncodeError`` or
        ``ExpatError``.
        """
        cleaned = cleanup_standalone_html_tags(html)
        as_text = cleaned.decode('utf-8')
        cleaned = encode_htmlentities(as_text).encode('utf-8')
        cleaned = str(BeautifulSoup(cleaned))
        cleaned = html2xmlentities(cleaned)

        # Entities that are turned back into literal characters, in order.
        restore = (
            ('&#60;', '<'),
            ('&#62;', '>'),
            ('&#34;', '"'),
        )
        for entity, char in restore:
            cleaned = cleaned.replace(entity, char)

        try:
            result = minidom.parseString('<data>%s</data>' % cleaned)
        except (UnicodeEncodeError, ExpatError):
            result = minidom.parseString('<data>FEHLER</data>')
        return result
    def __call__(self):
        """Convert the HTML from ``self.get_html()`` to LaTeX.

        The markup is cleaned with BeautifulSoup, wrapped in a ``<dummy>``
        root element, converted with ``html2xmlentities`` and parsed with
        minidom. Each direct child of the root is then converted:

        * elements whose lower-cased tag name is a key of
          ``self.listing_tag_mapping`` go through
          ``self.convert_listing_environment``, whose result extends the
          output lines;
        * every other node is serialised with ``toxml()`` and passed to
          ``self.converter.convert``.

        The output lines are joined with newlines and handed to
        ``self.replace_and_lock``.
        """
        # cleanup html with BeautifulSoup
        html = str(BeautifulSoup(self.get_html()))

        # minidom hates htmlentities, but loves xmlentities -.-
        html = html2xmlentities('<dummy>%s</dummy>' % html)

        # parse DOM
        dom = minidom.parseString(html)
        root = dom.getElementsByTagName('dummy')[0]

        latex = []
        for node in root.childNodes:
            # Test membership on the mapping itself; going through .keys()
            # builds a throwaway list on every iteration under Python 2.
            is_listing = (
                node.nodeType == minidom.Node.ELEMENT_NODE
                and node.tagName.lower() in self.listing_tag_mapping)

            if is_listing:
                latex.extend(self.convert_listing_environment(node))
            else:
                latex.append(self.converter.convert(node.toxml()))

        # The trailing empty entry makes non-empty output end with a newline.
        latex.append('')
        self.replace_and_lock('\n'.join(latex))
 def test_html2xmlentities(self):
     """Check ``utils.html2xmlentities`` on a known and an unknown entity.

     ``&amp;`` must become the numeric entity ``&#38;``, while ``&foo;``
     must be returned unchanged.
     """
     cases = (
         ("m&amp;m", "m&#38;m"),
         ("a&foo;b", "a&foo;b"),
     )
     for source, expected in cases:
         self.assertEqual(utils.html2xmlentities(source), expected)