Exemple #1
0
    def _clean_and_parse_html(html):
        """ Normalise the given html and return it as a minidom document

        ``html`` is decoded as UTF-8, run through ``encode_htmlentities``,
        re-encoded as UTF-8, round-tripped through BeautifulSoup and passed
        to ``html2xmlentities``. The result is wrapped in a ``<data>`` root
        element and parsed with ``minidom.parseString``.

        If parsing raises ``UnicodeEncodeError``, a placeholder document
        containing only the text ``FEHLER`` is returned instead. Any other
        exception propagates to the caller.
        """
        entity_encoded = encode_htmlentities(html.decode('utf-8'))
        # NOTE(review): BeautifulSoup presumably repairs broken markup here
        # before the strict XML parse - confirm against the version in use.
        soup_markup = str(BeautifulSoup(entity_encoded.encode('utf-8')))
        markup = html2xmlentities(soup_markup)

        try:
            return minidom.parseString('<data>%s</data>' % markup)
        except UnicodeEncodeError:
            # "FEHLER" is German for "error"; it marks unparseable content.
            return minidom.parseString('<data>FEHLER</data>')
Exemple #2
0
    def _clean_and_parse_html(html):
        """ Normalise the given html and return it as a minidom document

        ``html`` is passed through ``cleanup_standalone_html_tags``, decoded
        as UTF-8, run through ``encode_htmlentities``, re-encoded as UTF-8,
        round-tripped through BeautifulSoup and handed to
        ``html2xmlentities``. The numeric references ``&#60;`` and ``&#62;``
        are then turned back into literal angle brackets before the markup
        is wrapped in a ``<data>`` root element and parsed with
        ``minidom.parseString``.

        If parsing raises ``UnicodeEncodeError`` or ``ExpatError``, a
        placeholder document containing only the text ``FEHLER`` is
        returned instead.
        """
        tidied = cleanup_standalone_html_tags(html)
        entity_encoded = encode_htmlentities(tidied.decode('utf-8'))
        soup_markup = str(BeautifulSoup(entity_encoded.encode('utf-8')))
        markup = html2xmlentities(soup_markup)

        # Restore literal angle brackets; order matches the original chain.
        for reference, literal in (('&#60;', '<'), ('&#62;', '>')):
            markup = markup.replace(reference, literal)

        try:
            return minidom.parseString('<data>%s</data>' % markup)
        except (UnicodeEncodeError, ExpatError):
            # "FEHLER" is German for "error"; it marks unparseable content.
            return minidom.parseString('<data>FEHLER</data>')
Exemple #3
0
    def _clean_and_parse_html(html):
        """ Normalise the given html and return it as a minidom document

        ``html`` is passed through ``cleanup_standalone_html_tags``, decoded
        as UTF-8, run through ``encode_htmlentities``, re-encoded as UTF-8,
        round-tripped through BeautifulSoup and handed to
        ``html2xmlentities``. The numeric references ``&#60;``, ``&#62;``
        and ``&#34;`` are then turned back into their literal characters
        before the markup is wrapped in a ``<data>`` root element and parsed
        with ``minidom.parseString``.

        If parsing raises ``UnicodeEncodeError`` or ``ExpatError``, a
        placeholder document containing only the text ``FEHLER`` is
        returned instead.
        """
        tidied = cleanup_standalone_html_tags(html)
        entity_encoded = encode_htmlentities(tidied.decode('utf-8'))
        soup_markup = str(BeautifulSoup(entity_encoded.encode('utf-8')))
        markup = html2xmlentities(soup_markup)

        # Restore literal characters; order matches the original chain.
        restorations = (
            ('&#60;', '<'),
            ('&#62;', '>'),
            ('&#34;', '"'),
        )
        for reference, literal in restorations:
            markup = markup.replace(reference, literal)

        try:
            return minidom.parseString('<data>%s</data>' % markup)
        except (UnicodeEncodeError, ExpatError):
            # "FEHLER" is German for "error"; it marks unparseable content.
            return minidom.parseString('<data>FEHLER</data>')
 def convert_plain(self, text, **kwargs):
     """Convert plain text by entity-encoding it and delegating to ``convert``.

     ``text`` is passed through ``encode_htmlentities`` and the result is
     handed to ``self.convert`` together with any keyword arguments; the
     return value of ``self.convert`` is returned unchanged.
     """
     return self.convert(encode_htmlentities(text), **kwargs)
 def test_encode_htmlentities(self):
     # Each pair is (input, expected output of utils.encode_htmlentities).
     cases = (
         # quotes and angle brackets become named entities
         ('"X>Y"', "&quot;X&gt;Y&quot;"),
         # a bare ampersand is escaped
         ("m&m", "m&amp;m"),
         # the ampersand of an unknown entity-like sequence is escaped too
         ("a&foo;b", "a&amp;foo;b"),
         # unicode input containing an umlaut
         (u"uml\xe4ut", u"uml&auml;ut"),
         # the same umlaut given as UTF-8 encoded bytes
         ("uml\xc3\xa4ut", "uml&auml;ut"),
     )
     for raw, expected in cases:
         self.assertEqual(utils.encode_htmlentities(raw), expected)