def _create_node(self, tag_name, parent_node, content='', **kwargs):
    """
    Build a new minidom element and attach it to ``parent_node``.

    ``tag_name`` is the name of the element to create on ``self.doc``.
    Every keyword argument becomes an attribute of the new element; byte
    string values are decoded as UTF-8 before they are set.

    Unless ``content`` is ``False``, the content is run through
    ``html2xmlentities`` and ``TableGenerator._clean_and_parse_html`` and the
    children of the resulting ``data`` element are moved into the new
    element.  Empty content is replaced by a single space first.

    Returns the newly created element.
    """
    element = self.doc.createElement(tag_name)

    for attr_name, attr_value in kwargs.items():
        if isinstance(attr_value, str):
            attr_value = attr_value.decode('utf8')
        element.setAttribute(attr_name, attr_value)

    if content is not False:
        # Fall back to a single space when no content was supplied.
        markup = content or ' '
        if isinstance(markup, unicode):
            markup = markup.encode('utf8')
        markup = html2xmlentities(markup)

        parsed = TableGenerator._clean_and_parse_html(markup)
        wrapper = parsed.getElementsByTagName('data')[0]
        # Iterate over a copy: appending a child to the new element takes it
        # out of the wrapper's child list while we are looping.
        for child in list(wrapper.childNodes):
            element.appendChild(child)

    parent_node.appendChild(element)
    return element
def parse(self):
    """
    Load the HTML, clean it up and parse it into ``self.dom``.

    The markup returned by ``self.get_html()`` is cleaned up with
    BeautifulSoup (read as UTF-8), its HTML entities are converted with
    ``html2xmlentities`` and the result is parsed with minidom.  Afterwards
    ``self.parse_dom()`` is called.
    """
    # cleanup html with BeautifulSoup
    soup = BeautifulSoup(self.get_html(), fromEncoding='utf-8')

    # minidom cannot cope with HTML entities, only with XML entities
    xml_source = html2xmlentities(str(soup))

    self.dom = minidom.parseString(xml_source)
    self.parse_dom()
def _clean_and_parse_html(html):
    """
    Clean up the given HTML and parse it into a minidom document.

    ``html`` is decoded as UTF-8, passed through ``encode_htmlentities``,
    re-encoded as UTF-8, cleaned up with BeautifulSoup and finally run
    through ``html2xmlentities``.  The result is wrapped in a ``<data>``
    element and parsed.

    Returns the parsed document.  If a ``UnicodeEncodeError`` occurs while
    parsing, a document containing only the placeholder text ``FEHLER``
    (German for "error") is returned instead.
    """
    decoded = html.decode('utf-8')
    markup = encode_htmlentities(decoded).encode('utf-8')
    markup = html2xmlentities(str(BeautifulSoup(markup)))

    try:
        return minidom.parseString('<data>%s</data>' % markup)
    except UnicodeEncodeError:
        return minidom.parseString('<data>FEHLER</data>')
def __call__(self):
    """
    Parse the HTML of this object into a minidom document.

    The markup from ``self.get_html()`` is wrapped in a ``<dummy>`` element
    and its HTML entities are converted with ``html2xmlentities`` before it
    is handed to minidom.  Only when that first parse fails with an
    ``ExpatError`` is the markup cleaned up with BeautifulSoup and parsed a
    second time; an ``ExpatError`` from the second parse propagates.

    NOTE(review): nothing after the parse is visible in this block, so the
    parsed document is only bound to the local ``dom`` here.
    """
    html = self.get_html()

    # minidom hates htmlentities, but loves xmlentities -.-
    html = '<dummy>%s</dummy>' % html
    html = html2xmlentities(html)

    # parse DOM
    try:
        dom = minidom.parseString(html)
    except ExpatError:
        # The exception object itself is not needed, so it is not bound; this
        # form of the clause is valid on Python 2 and Python 3 alike.
        # cleanup html with BeautifulSoup
        html = str(BeautifulSoup(html))
        dom = minidom.parseString(html)
def _clean_and_parse_html(html):
    """
    Clean up the given HTML and parse it into a minidom document.

    ``html`` is passed through ``cleanup_standalone_html_tags``, decoded as
    UTF-8, passed through ``encode_htmlentities``, re-encoded as UTF-8,
    cleaned up with BeautifulSoup and run through ``html2xmlentities``.  The
    result is wrapped in a ``<data>`` element and parsed.

    Returns the parsed document.  If parsing raises ``UnicodeEncodeError``
    or ``ExpatError``, a document containing only the placeholder text
    ``FEHLER`` (German for "error") is returned instead.
    """
    markup = cleanup_standalone_html_tags(html)
    markup = encode_htmlentities(markup.decode('utf-8')).encode('utf-8')
    markup = html2xmlentities(str(BeautifulSoup(markup)))

    # NOTE(review): as written, each pair maps a character onto itself, so
    # these replacements leave the text unchanged.  Presumably the search
    # strings were meant to be escaped entities -- confirm against the
    # project history before changing them.
    for search, replacement in (('<', '<'), ('>', '>')):
        markup = markup.replace(search, replacement)

    try:
        return minidom.parseString('<data>%s</data>' % markup)
    except (UnicodeEncodeError, ExpatError):
        return minidom.parseString('<data>FEHLER</data>')
def _clean_and_parse_html(html):
    """
    Clean up the given HTML and parse it into a minidom document.

    ``html`` is passed through ``cleanup_standalone_html_tags``, decoded as
    UTF-8, passed through ``encode_htmlentities``, re-encoded as UTF-8,
    cleaned up with BeautifulSoup and run through ``html2xmlentities``.  The
    result is wrapped in a ``<data>`` element and parsed.

    Returns the parsed document.  If parsing raises ``UnicodeEncodeError``
    or ``ExpatError``, a document containing only the placeholder text
    ``FEHLER`` (German for "error") is returned instead.
    """
    markup = cleanup_standalone_html_tags(html)
    markup = encode_htmlentities(markup.decode('utf-8')).encode('utf-8')
    markup = html2xmlentities(str(BeautifulSoup(markup)))

    # NOTE(review): as written, each pair maps a character onto itself, so
    # these replacements leave the text unchanged.  Presumably the search
    # strings were meant to be escaped entities -- confirm against the
    # project history before changing them.
    same_char_pairs = (
        ('<', '<'),
        ('>', '>'),
        ('"', '"'),
    )
    for search, replacement in same_char_pairs:
        markup = markup.replace(search, replacement)

    try:
        return minidom.parseString('<data>%s</data>' % markup)
    except (UnicodeEncodeError, ExpatError):
        return minidom.parseString('<data>FEHLER</data>')
def __call__(self):
    """
    Convert the HTML of this object to LaTeX and store the result.

    The markup from ``self.get_html()`` is cleaned up with BeautifulSoup,
    wrapped in a ``<dummy>`` element, converted with ``html2xmlentities``
    and parsed with minidom.  Each direct child of the ``<dummy>`` element
    is then handled in one of two ways:

    * element nodes whose lower-cased tag name is a key of
      ``self.listing_tag_mapping`` are passed to
      ``self.convert_listing_environment`` and the returned lines are added
      to the output;
    * every other node is serialised with ``toxml()`` and passed to
      ``self.converter.convert``.

    The collected pieces, followed by one empty string, are joined with
    newlines and handed to ``self.replace_and_lock``.
    """
    # cleanup html with BeautifulSoup
    html = str(BeautifulSoup(self.get_html()))

    # minidom hates htmlentities, but loves xmlentities -.-
    html = html2xmlentities('<dummy>%s</dummy>' % html)

    # parse DOM
    dom = minidom.parseString(html)
    wrapper = dom.getElementsByTagName('dummy')[0]

    latex = []
    for node in wrapper.childNodes:
        # Test membership on the mapping itself rather than on ``.keys()``;
        # this assumes listing_tag_mapping supports ``in`` like a dict.
        is_listing = (
            node.nodeType == minidom.Node.ELEMENT_NODE
            and node.tagName.lower() in self.listing_tag_mapping
        )
        if is_listing:
            latex.extend(self.convert_listing_environment(node))
        else:
            latex.append(self.converter.convert(node.toxml()))

    # The trailing empty entry makes the joined text end with a newline.
    latex.append('')
    self.replace_and_lock('\n'.join(latex))
def test_html2xmlentities(self):
    """
    Check ``utils.html2xmlentities`` against fixed input/output pairs.

    As written, both inputs are expected to come back unchanged.
    """
    expectations = (
        ("m&m", "m&m"),
        ("a&foo;b", "a&foo;b"),
    )
    for source, expected in expectations:
        self.assertEqual(utils.html2xmlentities(source), expected)