Esempio n. 1
0
    def _decode_htmlescapes(self, s):
        """Collapse double-escaped ampersands in *s* and re-serialize it via bs4.

        Args:
            s: Markup text whose ``&amp;`` sequences should be reduced to ``&``
                before parsing.

        Returns:
            ``str()`` of the ``BeautifulSoup`` document built from the cleaned
            text with the ``html.parser`` backend.
        """
        # Function-scope import: bs4 is only needed when this method runs.
        from bs4 import BeautifulSoup

        # Per the original author's note, the sm2004 source also escapes the
        # "&" of already-escaped sequences, so strip that extra layer first.
        # The pattern has to be "&amp;": substituting "&" with "&" is a no-op.
        s = re.sub("&amp;", "&", s)

        # Escaping of bare "<" / ">" before parsing is intentionally disabled.
        return str(BeautifulSoup(s, "html.parser"))
Esempio n. 2
0
    def _decode_htmlescapes(self, s):
        """Undo double-escaped ampersands in *s*, then round-trip it through bs4.

        Every ``&amp;`` in *s* is replaced by ``&``; the result is parsed with
        BeautifulSoup's ``html.parser`` backend and the parsed document is
        returned as ``str``.
        """
        from bs4 import BeautifulSoup

        # Original author's note: the sm2004 source also escapes the "&" inside
        # escape sequences, hence this extra un-escaping pass.
        without_double_amp = re.sub('&amp;', '&', s)

        # Bare "<" / ">" are not escaped before parsing (that step is disabled).
        document = BeautifulSoup(without_double_amp, "html.parser")
        return str(document)
Esempio n. 3
0
    def _decode_htmlescapes(self, s):
        """Normalise the escapes in *s* and re-serialize it through bs4.

        ``&amp;`` is first collapsed to ``&`` and every literal ``<`` / ``>``
        is escaped; the result is parsed with the ``html.parser`` backend.

        Args:
            s: Text containing HTML escape sequences.

        Returns:
            The parsed document serialized as a Unicode string.
        """
        # BeautifulStoneSoup is deprecated in bs4; use BeautifulSoup with an
        # explicit parser instead.
        from bs4 import BeautifulSoup

        # Original author's note: the sm2004 source also escapes the "&" inside
        # escape sequences, so strip that extra layer first.
        s = re.sub(u'&amp;', u'&', s)
        # Bare "<" or ">" were fine for minidom but confuse BeautifulSoup, so
        # escape them before parsing.
        s = re.sub(u'>', u'&gt;', s)
        s = re.sub(u'<', u'&lt;', s)

        # bs4 does not honour convertEntities and has no HTML_ENTITIES
        # constant: entities are always converted to Unicode characters while
        # parsing. decode() returns the text form without relying on the
        # Python 2-only unicode() builtin.
        return BeautifulSoup(s, "html.parser").decode()