def _clean_and_parse_html(html):
    """Clean up the given HTML and parse it into a DOM document.

    The input is decoded as UTF-8, passed through ``encode_htmlentities``,
    re-encoded as UTF-8, serialised through ``BeautifulSoup`` and finally
    passed through ``html2xmlentities``. The result is wrapped in a
    ``<data>`` root element and handed to ``minidom.parseString``.

    :param html: markup to clean; it must support ``.decode('utf-8')``.
    :returns: the document returned by ``minidom.parseString``. If parsing
        raises ``UnicodeEncodeError`` or ``ExpatError``, a placeholder
        document built from ``<data>FEHLER</data>`` is returned instead.
    """
    # Imported locally so the extra exception type does not rely on a
    # module-level import being present.
    from xml.parsers.expat import ExpatError

    html = encode_htmlentities(html.decode('utf-8')).encode('utf-8')
    html = str(BeautifulSoup(html))
    html = html2xmlentities(html)

    try:
        # The string formatting stays inside the ``try`` so that encoding
        # problems raised while building the payload are handled as well.
        return minidom.parseString('<data>%s</data>' % html)
    except (UnicodeEncodeError, ExpatError):
        # Markup that is still not well-formed XML makes the parser raise
        # ExpatError; fall back to the placeholder instead of propagating.
        return minidom.parseString('<data>FEHLER</data>')
def _clean_and_parse_html(html):
    """Clean up the given HTML and parse it into a DOM document.

    Processing order: ``cleanup_standalone_html_tags``, UTF-8 decode,
    ``encode_htmlentities``, UTF-8 encode, ``BeautifulSoup`` serialisation,
    ``html2xmlentities``, then the literal replacements below. The result is
    wrapped in a ``<data>`` root element and parsed with
    ``minidom.parseString``.

    :param html: markup to clean.
    :returns: the parsed document, or a placeholder document built from
        ``<data>FEHLER</data>`` when parsing raises ``UnicodeEncodeError``
        or ``ExpatError``.
    """
    markup = cleanup_standalone_html_tags(html)
    markup = markup.decode('utf-8')
    markup = encode_htmlentities(markup).encode('utf-8')
    markup = html2xmlentities(str(BeautifulSoup(markup)))

    # NOTE(review): as written, each pair replaces a character with itself,
    # so these replacements leave the string unchanged. Presumably the
    # entity spellings were lost at some point -- confirm against the
    # original source.
    for old, new in (('<', '<'), ('>', '>')):
        markup = markup.replace(old, new)

    try:
        return minidom.parseString('<data>%s</data>' % markup)
    except (UnicodeEncodeError, ExpatError):
        return minidom.parseString('<data>FEHLER</data>')
def _clean_and_parse_html(html):
    """Clean up the given HTML and parse it into a DOM document.

    Processing order: ``cleanup_standalone_html_tags``, UTF-8 decode,
    ``encode_htmlentities``, UTF-8 encode, ``BeautifulSoup`` serialisation,
    ``html2xmlentities``, then the literal replacements below. The result is
    wrapped in a ``<data>`` root element and parsed with
    ``minidom.parseString``.

    :param html: markup to clean.
    :returns: the parsed document, or a placeholder document built from
        ``<data>FEHLER</data>`` when parsing raises ``UnicodeEncodeError``
        or ``ExpatError``.
    """
    markup = cleanup_standalone_html_tags(html)
    markup = markup.decode('utf-8')
    markup = encode_htmlentities(markup).encode('utf-8')
    markup = html2xmlentities(str(BeautifulSoup(markup)))

    # NOTE(review): as written, each pair replaces a character with itself,
    # so these replacements leave the string unchanged. Presumably the
    # entity spellings were lost at some point -- confirm against the
    # original source.
    for old, new in (('<', '<'), ('>', '>'), ('"', '"')):
        markup = markup.replace(old, new)

    try:
        return minidom.parseString('<data>%s</data>' % markup)
    except (UnicodeEncodeError, ExpatError):
        return minidom.parseString('<data>FEHLER</data>')
def convert_plain(self, text, **kwargs):
    """Convert plain text by entity-encoding it and delegating to ``convert``.

    :param text: plain text; it is passed through ``encode_htmlentities``
        before conversion.
    :param kwargs: forwarded unchanged to ``self.convert``.
    :returns: whatever ``self.convert`` returns for the encoded text.
    """
    return self.convert(encode_htmlentities(text), **kwargs)
def test_encode_htmlentities(self):
    # Exercises utils.encode_htmlentities with five inputs: quotes and '>',
    # a bare ampersand, an ampersand that already starts an entity-like
    # sequence ("&foo;"), a unicode string containing U+00E4, and the
    # UTF-8 byte-string form of the same character.
    #
    # NOTE(review): the expected literal in the first assertion is not valid
    # Python as written, and the remaining expected values are identical to
    # (or the raw-character form of) their inputs. Presumably the
    # entity-encoded expectations were decoded at some point -- restore them
    # from the original source before relying on this test.
    self.assertEqual(utils.encode_htmlentities('"X>Y"'), ""X>Y"")
    self.assertEqual(utils.encode_htmlentities("m&m"), "m&m")
    self.assertEqual(utils.encode_htmlentities("a&foo;b"), "a&foo;b")
    self.assertEqual(utils.encode_htmlentities(u"uml\xe4ut"), u"umläut")
    self.assertEqual(utils.encode_htmlentities("uml\xc3\xa4ut"), "umläut")