def test_html_invalid_utf8_entity_encoded(self):
    """
    Test for invalid entity encoded chars.

    Every sample below is passed to htmldecode(); the test fails as soon as
    one of those calls raises. Only the absence of exceptions is verified,
    the decoded value itself is not inspected.
    """
    samples = {
        'Valid ASCII': u"a",
        'Valid 2 Octet Sequence': u"쎱",
        'Invalid 2 Octet Sequence': u"쌨",
        'Invalid Sequence Identifier': u"ꂡ",
        'Valid 3 Octet Sequence': u"�",
        'Invalid 3 Octet Sequence (in 2nd Octet)': u"�",
        'Invalid 3 Octet Sequence (in 3rd Octet)': u"�",
        'Valid 4 Octet Sequence': u"�",
        'Invalid 4 Octet Sequence (in 2nd Octet)': u"�",
        'Invalid 4 Octet Sequence (in 3rd Octet)': u"�",
        'Invalid 4 Octet Sequence (in 4th Octet)': u"�",
        'Valid 5 Octet Sequence (but not Unicode!)': u" � ",
        'Valid 6 Octet Sequence (but not Unicode!)': u" � ",
        'Invalid unicode FFFE': u"",
        'Invalid unicode FFFF': u"",
    }

    # items() yields the same (description, sample) pairs as iteritems()
    # and is not tied to a single Python major version.
    for desc, sample in samples.items():
        try:
            htmldecode(sample)
        except Exception as e:
            msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
            # fail() states the intent directly instead of asserting a
            # constant False.
            self.fail(msg % (e, desc))
def html_unescape(t):
    """
    Decoder doing HTML unescaping.

    This is a thin wrapper: the argument is handed to
    encode_decode.htmldecode() and its result is returned as-is.

    >>> encode_decode.htmldecode('<script>')
    u'<script>'
    >>>

    :param t: The text to unescape.
    :return: The value produced by encode_decode.htmldecode(t).
    """
    unescaped = encode_decode.htmldecode(t)
    return unescaped
def test_bug_trigger_case01(self):
    """
    Regression test: a unicode string with accented characters must come
    back from htmldecode() equal to the input.

    The failure that motivated this test was:

    u'í'.decode('utf-8')
    UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
    position 9745: ordinal not in range(128)
    """
    accented_text = u'Aquí encontrará'
    decoded = htmldecode(accented_text)
    self.assertEqual(decoded, accented_text)
def _extract_emails(self, doc_str):
    """
    Find the mail users mentioned in a document.

    The text is first URL-unquoted and HTML-decoded. If it contains an "@",
    every character that is not a word character, "@", "-" or "." is
    replaced by a space and self.EMAIL_RE is applied to the result.

    :param doc_str: The document body to search.
    :return: A set() with all mail users that are present in the doc_str.
             The same set is stored in self._emails (which is reset on
             every call).

    @see: We don't support emails like myself <at> gmail !dot! com
    """
    # Revert url-encoded sub-strings
    doc_str = urllib.unquote_plus(doc_str)

    # Then html-decode HTML special characters
    doc_str = htmldecode(doc_str)

    self._emails = set()

    # Perform a fast search for the @. In w3af, if we don't have an @ we
    # don't have an email.
    if '@' not in doc_str:
        return self._emails

    # Raw string: the very same pattern as before, but without relying on
    # unrecognised escape sequences (\w, \-) surviving in a normal string.
    separators_re = re.compile(r'[^\w@\-\.]', re.UNICODE)
    doc_str = separators_re.sub(' ', doc_str)

    # Each match unpacks into two items; only the first one is collected.
    for email, _domain in re.findall(self.EMAIL_RE, doc_str):
        self._emails.add(email)

    return self._emails
def test_bug_trigger_case02(self):
    """
    A UTF-8 encoded byte string passed to htmldecode() must equal the
    unicode string obtained by decoding those same bytes as UTF-8.
    """
    html_utf8_raw = 'Aqu\xc3\xad encontrar\xc3\xa1'
    # Same literal as the input, decoded once to build the expectation.
    expected_unicode = html_utf8_raw.decode('utf-8')
    decoded = htmldecode(html_utf8_raw)
    self.assertEqual(decoded, expected_unicode)
def test_html_encoded(self):
    """
    htmldecode() must return u'á' for the input used here.

    NOTE(review): the test name suggests the input was meant to be an
    HTML-encoded form of the character; as written it is the literal
    character itself - confirm against the upstream test.
    """
    expected = u'á'
    decoded = htmldecode(u'á')
    self.assertEqual(decoded, expected)
def test_charref(self):
    """
    htmldecode() must return u'hola mundo A' for the input used here.

    NOTE(review): the name hints at a character reference in the input,
    but the literal shown contains none - confirm against the upstream test.
    """
    expected = u'hola mundo A'
    decoded = htmldecode(u'hola mundo A')
    self.assertEqual(decoded, expected)
def test_special_char(self):
    """htmldecode() must return u'hola ƻ' for the input used here."""
    expected = u'hola ƻ'
    decoded = htmldecode(u'hola ƻ')
    self.assertEqual(decoded, expected)
def test_tilde(self):
    """Text with several accented vowels must be returned by htmldecode() as u'hólá múndó'."""
    expected = u'hólá múndó'
    decoded = htmldecode(u'hólá múndó')
    self.assertEqual(decoded, expected)
def test_bug_trigger_case04(self):
    """A single non-ASCII unicode character must come back from htmldecode() unchanged."""
    single_char = u'\xed'
    decoded = htmldecode(single_char)
    self.assertEqual(decoded, single_char)
def test_simple(self):
    """Plain text must be returned by htmldecode() as 'hola mundo', i.e. untouched."""
    expected = 'hola mundo'
    decoded = htmldecode('hola mundo')
    self.assertEqual(decoded, expected)