Python htmldecode Examples, w3af.core.data.parsers.encode_decode.htmldecode Python Examples

Example #1

0

Show file

    def test_html_invalid_utf8_entity_encoded(self):
        """Check that htmldecode() does not raise on unusual hex char references.

        Each sample is only passed through htmldecode(); the decoded value is
        not inspected. The test fails as soon as one sample makes the call
        raise, and the failure message names the offending sample.
        """
        samples = {
            'Valid ASCII': u"a",
            'Valid 2 Octet Sequence': u"&#xc3b1",
            'Invalid 2 Octet Sequence': u"&#xc328",
            'Invalid Sequence Identifier': u"&#xa0a1",
            'Valid 3 Octet Sequence': u"&#xe282a1",
            'Invalid 3 Octet Sequence (in 2nd Octet)': u"&#xe228a1",
            'Invalid 3 Octet Sequence (in 3rd Octet)': u"&#xe28228",
            'Valid 4 Octet Sequence': u"&#xf0908cbc",
            'Invalid 4 Octet Sequence (in 2nd Octet)': u"&#xf0288cbc",
            'Invalid 4 Octet Sequence (in 3rd Octet)': u"&#xf09028bc",
            'Invalid 4 Octet Sequence (in 4th Octet)': u"&#xf0288c28",
            'Valid 5 Octet Sequence (but not Unicode!)': u" &#xf8a1a1a1a1 ",
            'Valid 6 Octet Sequence (but not Unicode!)': u" &#xfca1a1a1a1a1 ",
            'Invalid unicode FFFE': u"&#xFFFE",
            'Invalid unicode FFFF': u"&#xFFFF",
        }

        # items() yields the same pairs as iteritems() and is not tied to
        # Python 2 only.
        for desc, sample in samples.items():
            try:
                htmldecode(sample)
            except Exception as e:
                # The broad catch is deliberate: any exception type counts as
                # a failure of this test.
                msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
                # fail() reports just the message instead of the misleading
                # "False is not true" prefix added by assertTrue(False, ...).
                self.fail(msg % (e, desc))

Example #2

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

    def test_html_invalid_utf8_entity_encoded(self):
        """Check that htmldecode() does not raise on unusual hex char references.

        Each sample is only passed through htmldecode(); the decoded value is
        not inspected. The test fails as soon as one sample makes the call
        raise, and the failure message names the offending sample.
        """
        samples = {
            'Valid ASCII': u"a",
            'Valid 2 Octet Sequence': u"&#xc3b1",
            'Invalid 2 Octet Sequence': u"&#xc328",
            'Invalid Sequence Identifier': u"&#xa0a1",
            'Valid 3 Octet Sequence': u"&#xe282a1",
            'Invalid 3 Octet Sequence (in 2nd Octet)': u"&#xe228a1",
            'Invalid 3 Octet Sequence (in 3rd Octet)': u"&#xe28228",
            'Valid 4 Octet Sequence': u"&#xf0908cbc",
            'Invalid 4 Octet Sequence (in 2nd Octet)': u"&#xf0288cbc",
            'Invalid 4 Octet Sequence (in 3rd Octet)': u"&#xf09028bc",
            'Invalid 4 Octet Sequence (in 4th Octet)': u"&#xf0288c28",
            'Valid 5 Octet Sequence (but not Unicode!)': u" &#xf8a1a1a1a1 ",
            'Valid 6 Octet Sequence (but not Unicode!)': u" &#xfca1a1a1a1a1 ",
            'Invalid unicode FFFE': u"&#xFFFE",
            'Invalid unicode FFFF': u"&#xFFFF",
        }

        # items() yields the same pairs as iteritems() and is not tied to
        # Python 2 only.
        for desc, sample in samples.items():
            try:
                htmldecode(sample)
            except Exception as e:
                # The broad catch is deliberate: any exception type counts as
                # a failure of this test.
                msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
                # fail() reports just the message instead of the misleading
                # "False is not true" prefix added by assertTrue(False, ...).
                self.fail(msg % (e, desc))

Example #3

0

Show file

File: encdec.py Project: vasubesimple/w3af

def html_unescape(t):
    """Undo the HTML escaping of ``t``.

    Thin wrapper: the work is delegated to ``encode_decode.htmldecode`` and
    its result is returned as-is.

    >>> encode_decode.htmldecode('&lt;script&gt;')
    u'<script>'
    >>>
    """
    unescaped = encode_decode.htmldecode(t)
    return unescaped

Example #4

0

Show file

File: encdec.py Project: cathartic/w3af

def html_unescape(t):
    """Undo the HTML escaping of ``t``.

    Thin wrapper: the work is delegated to ``encode_decode.htmldecode`` and
    its result is returned as-is.

    >>> encode_decode.htmldecode('&lt;script&gt;')
    u'<script>'
    >>>
    """
    unescaped = encode_decode.htmldecode(t)
    return unescaped

Example #5

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_bug_trigger_case01(self):
     """
     Regression test for the error quoted below:

         u'í'.decode('utf-8')

         UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
                             position 9745: ordinal not in range(128)

     A unicode string holding accented characters and no entities has to
     come back from htmldecode() unchanged.
     """
     text = u'Aquí encontrará'
     decoded = htmldecode(text)
     self.assertEqual(decoded, text)

Example #6

0

Show file

 def test_bug_trigger_case01(self):
     """
     Regression test for the error quoted below:

         u'í'.decode('utf-8')

         UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
                             position 9745: ordinal not in range(128)

     A unicode string holding accented characters and no entities has to
     come back from htmldecode() unchanged.
     """
     text = u'Aquí encontrará'
     decoded = htmldecode(text)
     self.assertEqual(decoded, text)

Example #7

0

Show file

File: baseparser.py Project: intfrr/Tortazo

    def _extract_emails(self, doc_str):
        """
        Collect the e-mails found in doc_str into self._emails and return them.

        The input is URL-unquoted first and HTML-decoded second, and only then
        searched with self.EMAIL_RE. Any previous content of self._emails is
        discarded on every call.

        :param doc_str: The document text to search.
        :return: A set() with all mail users that are present in the doc_str.
        @see: We don't support emails like myself <at> gmail !dot! com
        """
        # Revert url-encoded sub-strings
        doc_str = urllib.unquote_plus(doc_str)

        # Then html-decode HTML special characters
        doc_str = htmldecode(doc_str)

        self._emails = set()

        # Perform a fast search for the @. In w3af, if we don't have an @ we
        # don't have an email.
        if '@' in doc_str:
            # Raw string: "\w" and "\-" are regex escapes, not string escapes.
            # Every char that is not a word char, "@", "-" or "." is blanked
            # out before EMAIL_RE is applied.
            separators_re = re.compile(r'[^\w@\-\.]', re.UNICODE)
            doc_str = separators_re.sub(' ', doc_str)

            # Each match is unpacked as a two-item tuple; only the first item
            # is kept.
            self._emails.update(
                email for email, _domain in re.findall(self.EMAIL_RE, doc_str)
            )

        return self._emails

Example #8

0

Show file

File: baseparser.py Project: Adastra-thw/Tortazo

    def _extract_emails(self, doc_str):
        """
        Collect the e-mails found in doc_str into self._emails and return them.

        The input is URL-unquoted first and HTML-decoded second, and only then
        searched with self.EMAIL_RE. Any previous content of self._emails is
        discarded on every call.

        :param doc_str: The document text to search.
        :return: A set() with all mail users that are present in the doc_str.
        @see: We don't support emails like myself <at> gmail !dot! com
        """
        # Revert url-encoded sub-strings
        doc_str = urllib.unquote_plus(doc_str)

        # Then html-decode HTML special characters
        doc_str = htmldecode(doc_str)

        self._emails = set()

        # Perform a fast search for the @. In w3af, if we don't have an @ we
        # don't have an email.
        if '@' in doc_str:
            # Raw string: "\w" and "\-" are regex escapes, not string escapes.
            # Every char that is not a word char, "@", "-" or "." is blanked
            # out before EMAIL_RE is applied.
            separators_re = re.compile(r'[^\w@\-\.]', re.UNICODE)
            doc_str = separators_re.sub(' ', doc_str)

            # Each match is unpacked as a two-item tuple; only the first item
            # is kept.
            self._emails.update(
                email for email, _domain in re.findall(self.EMAIL_RE, doc_str)
            )

        return self._emails

Example #9

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_bug_trigger_case02(self):
     # A UTF-8 encoded byte string goes in; the expected result is the same
     # bytes decoded as UTF-8.
     raw_bytes = 'Aqu\xc3\xad encontrar\xc3\xa1'
     expected = 'Aqu\xc3\xad encontrar\xc3\xa1'.decode('utf-8')
     decoded = htmldecode(raw_bytes)
     self.assertEqual(decoded, expected)

Example #10

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_html_encoded(self):
     # The named entity must be turned into the literal accented character.
     decoded = htmldecode(u'&aacute;')
     self.assertEqual(decoded, u'á')

Example #11

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_charref(self):
     # Hexadecimal character reference, written without a trailing ";".
     encoded = u'hola mundo &#x41'
     expected = u'hola mundo A'
     self.assertEqual(htmldecode(encoded), expected)

Example #12

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_special_char(self):
     # Decimal character reference with a leading zero and no trailing ";".
     encoded = u'hola &#0443'
     expected = u'hola ƻ'
     self.assertEqual(htmldecode(encoded), expected)

Example #13

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_tilde(self):
     # Accented characters that are not entity-encoded stay untouched.
     decoded = htmldecode(u'hólá múndó')
     self.assertEqual(decoded, u'hólá múndó')

Example #14

0

Show file

 def test_html_encoded(self):
     # The named entity must be turned into the literal accented character.
     decoded = htmldecode(u'&aacute;')
     self.assertEqual(decoded, u'á')

Example #15

0

Show file

 def test_special_char(self):
     # Decimal character reference with a leading zero and no trailing ";".
     encoded = u'hola &#0443'
     expected = u'hola ƻ'
     self.assertEqual(htmldecode(encoded), expected)

Example #16

0

Show file

 def test_bug_trigger_case04(self):
     # A lone non-ASCII unicode character has to be returned unchanged.
     char = u'\xed'
     decoded = htmldecode(char)
     self.assertEqual(decoded, char)

Example #17

0

Show file

 def test_bug_trigger_case02(self):
     # A UTF-8 encoded byte string goes in; the expected result is the same
     # bytes decoded as UTF-8.
     raw_bytes = 'Aqu\xc3\xad encontrar\xc3\xa1'
     expected = 'Aqu\xc3\xad encontrar\xc3\xa1'.decode('utf-8')
     decoded = htmldecode(raw_bytes)
     self.assertEqual(decoded, expected)

Example #18

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_bug_trigger_case04(self):
     # A lone non-ASCII unicode character has to be returned unchanged.
     char = u'\xed'
     decoded = htmldecode(char)
     self.assertEqual(decoded, char)

Example #19

0

Show file

 def test_tilde(self):
     # Accented characters that are not entity-encoded stay untouched.
     decoded = htmldecode(u'hólá múndó')
     self.assertEqual(decoded, u'hólá múndó')

Example #20

0

Show file

 def test_charref(self):
     # Hexadecimal character reference, written without a trailing ";".
     encoded = u'hola mundo &#x41'
     expected = u'hola mundo A'
     self.assertEqual(htmldecode(encoded), expected)

Example #21

0

Show file

File: test_encode_decode.py Project: 3rdDegree/w3af

 def test_simple(self):
     # Plain text without entities is expected back as-is.
     plain = 'hola mundo'
     decoded = htmldecode(plain)
     self.assertEqual(decoded, 'hola mundo')

Example #22

0

Show file

 def test_simple(self):
     # Plain text without entities is expected back as-is.
     plain = 'hola mundo'
     decoded = htmldecode(plain)
     self.assertEqual(decoded, 'hola mundo')