Example #1
0
    def test_html_invalid_utf8_entity_encoded(self):
        """
        htmldecode() must not raise for any of the entity-encoded samples.

        Every sample (apart from the plain ASCII one) is a hexadecimal
        character reference; the dictionary key describes what the hex digits
        are meant to represent. Only the absence of an exception is checked,
        the decoded value itself is never asserted.
        """
        samples = {
            'Valid ASCII': u"a",
            'Valid 2 Octet Sequence': u"&#xc3b1",
            'Invalid 2 Octet Sequence': u"&#xc328",
            'Invalid Sequence Identifier': u"&#xa0a1",
            'Valid 3 Octet Sequence': u"&#xe282a1",
            'Invalid 3 Octet Sequence (in 2nd Octet)': u"&#xe228a1",
            'Invalid 3 Octet Sequence (in 3rd Octet)': u"&#xe28228",
            'Valid 4 Octet Sequence': u"&#xf0908cbc",
            'Invalid 4 Octet Sequence (in 2nd Octet)': u"&#xf0288cbc",
            'Invalid 4 Octet Sequence (in 3rd Octet)': u"&#xf09028bc",
            'Invalid 4 Octet Sequence (in 4th Octet)': u"&#xf0288c28",
            'Valid 5 Octet Sequence (but not Unicode!)': u" &#xf8a1a1a1a1 ",
            'Valid 6 Octet Sequence (but not Unicode!)': u" &#xfca1a1a1a1a1 ",
            'Invalid unicode FFFE': u"&#xFFFE",
            'Invalid unicode FFFF': u"&#xFFFF",
        }

        # items() yields the same pairs as iteritems() and, unlike it, is also
        # available on Python 3 dictionaries.
        for desc, sample in samples.items():
            try:
                htmldecode(sample)
            except Exception as e:
                # Any exception type is a failure here, hence the broad catch.
                msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
                # fail() states the intent directly instead of asserting that
                # False is true.
                self.fail(msg % (e, desc))
Example #2
0
    def test_html_invalid_utf8_entity_encoded(self):
        """
        htmldecode() must not raise for any of the entity-encoded samples.

        Every sample (apart from the plain ASCII one) is a hexadecimal
        character reference; the dictionary key describes what the hex digits
        are meant to represent. Only the absence of an exception is checked,
        the decoded value itself is never asserted.
        """
        samples = {
            'Valid ASCII': u"a",
            'Valid 2 Octet Sequence': u"&#xc3b1",
            'Invalid 2 Octet Sequence': u"&#xc328",
            'Invalid Sequence Identifier': u"&#xa0a1",
            'Valid 3 Octet Sequence': u"&#xe282a1",
            'Invalid 3 Octet Sequence (in 2nd Octet)': u"&#xe228a1",
            'Invalid 3 Octet Sequence (in 3rd Octet)': u"&#xe28228",
            'Valid 4 Octet Sequence': u"&#xf0908cbc",
            'Invalid 4 Octet Sequence (in 2nd Octet)': u"&#xf0288cbc",
            'Invalid 4 Octet Sequence (in 3rd Octet)': u"&#xf09028bc",
            'Invalid 4 Octet Sequence (in 4th Octet)': u"&#xf0288c28",
            'Valid 5 Octet Sequence (but not Unicode!)': u" &#xf8a1a1a1a1 ",
            'Valid 6 Octet Sequence (but not Unicode!)': u" &#xfca1a1a1a1a1 ",
            'Invalid unicode FFFE': u"&#xFFFE",
            'Invalid unicode FFFF': u"&#xFFFF",
        }

        # items() yields the same pairs as iteritems() and, unlike it, is also
        # available on Python 3 dictionaries.
        for desc, sample in samples.items():
            try:
                htmldecode(sample)
            except Exception as e:
                # Any exception type is a failure here, hence the broad catch.
                msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
                # fail() states the intent directly instead of asserting that
                # False is true.
                self.fail(msg % (e, desc))
Example #3
0
def html_unescape(t):
    """Decoder that undoes HTML escaping.

    Thin wrapper: the input is handed to encode_decode.htmldecode() and its
    result is returned untouched.

    :param t: The text to unescape.
    :return: The value produced by encode_decode.htmldecode() for t.

    >>> encode_decode.htmldecode('&lt;script&gt;')
    u'<script>'
    >>>
    """
    unescaped = encode_decode.htmldecode(t)
    return unescaped
Example #4
0
def html_unescape(t):
    """Decoder that undoes HTML escaping.

    Thin wrapper: the input is handed to encode_decode.htmldecode() and its
    result is returned untouched.

    :param t: The text to unescape.
    :return: The value produced by encode_decode.htmldecode() for t.

    >>> encode_decode.htmldecode('&lt;script&gt;')
    u'<script>'
    >>>
    """
    unescaped = encode_decode.htmldecode(t)
    return unescaped
Example #5
0
 def test_bug_trigger_case01(self):
     """
     Unicode text with accented characters must come back from htmldecode()
     unchanged.

     Error this case was written for:

     u'í'.decode('utf-8')

     UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
                         position 9745: ordinal not in range(128)
     """
     expected = u'Aquí encontrará'
     decoded = htmldecode(expected)
     self.assertEqual(decoded, expected)
Example #6
0
 def test_bug_trigger_case01(self):
     """
     Unicode text with accented characters must come back from htmldecode()
     unchanged.

     Error this case was written for:

     u'í'.decode('utf-8')

     UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
                         position 9745: ordinal not in range(128)
     """
     expected = u'Aquí encontrará'
     decoded = htmldecode(expected)
     self.assertEqual(decoded, expected)
Example #7
0
    def _extract_emails(self, doc_str):
        """
        Collect the e-mail matches found in doc_str.

        The text is URL-decoded and then HTML-decoded before searching, and
        every character that is not a word character, "@", "-" or "." is
        replaced by a space so that self.EMAIL_RE runs on clean tokens.

        :param doc_str: The document text to search.
        :return: A set() with all mail users that are present in the doc_str
                 (the first group of each self.EMAIL_RE match). The same set
                 is stored in self._emails, replacing its previous value.
        @see: We don't support emails like myself <at> gmail !dot! com
        """
        # Revert url-encoded sub-strings
        doc_str = urllib.unquote_plus(doc_str)

        # Then html-decode HTML special characters
        doc_str = htmldecode(doc_str)

        self._emails = set()

        # Perform a fast search for the @. In w3af, if we don't have an @ we
        # don't have an email.
        if '@' in doc_str:
            # Raw string: the pattern is the same as before but no longer
            # depends on Python leaving unknown escapes (\w, \-) untouched.
            separators = re.compile(r'[^\w@\-\.]', re.UNICODE)
            doc_str = separators.sub(' ', doc_str)

            # EMAIL_RE matches are unpacked as (email, domain) pairs; only the
            # first element is kept.
            matches = re.findall(self.EMAIL_RE, doc_str)
            self._emails.update(email for email, _domain in matches)

        return self._emails
Example #8
0
    def _extract_emails(self, doc_str):
        """
        Collect the e-mail matches found in doc_str.

        The text is URL-decoded and then HTML-decoded before searching, and
        every character that is not a word character, "@", "-" or "." is
        replaced by a space so that self.EMAIL_RE runs on clean tokens.

        :param doc_str: The document text to search.
        :return: A set() with all mail users that are present in the doc_str
                 (the first group of each self.EMAIL_RE match). The same set
                 is stored in self._emails, replacing its previous value.
        @see: We don't support emails like myself <at> gmail !dot! com
        """
        # Revert url-encoded sub-strings
        doc_str = urllib.unquote_plus(doc_str)

        # Then html-decode HTML special characters
        doc_str = htmldecode(doc_str)

        self._emails = set()

        # Perform a fast search for the @. In w3af, if we don't have an @ we
        # don't have an email.
        if '@' in doc_str:
            # Raw string: the pattern is the same as before but no longer
            # depends on Python leaving unknown escapes (\w, \-) untouched.
            separators = re.compile(r'[^\w@\-\.]', re.UNICODE)
            doc_str = separators.sub(' ', doc_str)

            # EMAIL_RE matches are unpacked as (email, domain) pairs; only the
            # first element is kept.
            matches = re.findall(self.EMAIL_RE, doc_str)
            self._emails.update(email for email, _domain in matches)

        return self._emails
Example #9
0
 def test_bug_trigger_case02(self):
     """htmldecode() of a raw UTF-8 string equals that string decoded as UTF-8."""
     utf8_raw = 'Aqu\xc3\xad encontrar\xc3\xa1'
     expected = utf8_raw.decode('utf-8')
     decoded = htmldecode(utf8_raw)
     self.assertEqual(decoded, expected)
Example #10
0
 def test_html_encoded(self):
     """The named entity &aacute; is decoded to the character it names."""
     decoded = htmldecode(u'&aacute;')
     self.assertEqual(decoded, u'á')
Example #11
0
 def test_charref(self):
     """A hexadecimal character reference (written without ';') is decoded."""
     decoded = htmldecode(u'hola mundo &#x41')
     self.assertEqual(decoded, u'hola mundo A')
Example #12
0
 def test_special_char(self):
     """A decimal character reference with a leading zero and no ';' is decoded."""
     decoded = htmldecode(u'hola &#0443')
     self.assertEqual(decoded, u'hola ƻ')
Example #13
0
 def test_tilde(self):
     """Unicode text with accented letters and no entities is left as is."""
     text = u'hólá múndó'
     self.assertEqual(htmldecode(text), text)
Example #14
0
 def test_html_encoded(self):
     """The named entity &aacute; is decoded to the character it names."""
     decoded = htmldecode(u'&aacute;')
     self.assertEqual(decoded, u'á')
Example #15
0
 def test_special_char(self):
     """A decimal character reference with a leading zero and no ';' is decoded."""
     decoded = htmldecode(u'hola &#0443')
     self.assertEqual(decoded, u'hola ƻ')
Example #16
0
 def test_bug_trigger_case04(self):
     """A single non-ASCII unicode character (U+00ED) is returned unchanged."""
     self.assertEqual(htmldecode(u'\xed'), u'\xed')
Example #17
0
 def test_bug_trigger_case02(self):
     """htmldecode() of a raw UTF-8 string equals that string decoded as UTF-8."""
     utf8_raw = 'Aqu\xc3\xad encontrar\xc3\xa1'
     expected = utf8_raw.decode('utf-8')
     decoded = htmldecode(utf8_raw)
     self.assertEqual(decoded, expected)
Example #18
0
 def test_bug_trigger_case04(self):
     """A single non-ASCII unicode character (U+00ED) is returned unchanged."""
     self.assertEqual(htmldecode(u'\xed'), u'\xed')
Example #19
0
 def test_tilde(self):
     """Unicode text with accented letters and no entities is left as is."""
     text = u'hólá múndó'
     self.assertEqual(htmldecode(text), text)
Example #20
0
 def test_charref(self):
     """A hexadecimal character reference (written without ';') is decoded."""
     decoded = htmldecode(u'hola mundo &#x41')
     self.assertEqual(decoded, u'hola mundo A')
Example #21
0
 def test_simple(self):
     """Plain ASCII text without entities is returned unchanged."""
     plain = 'hola mundo'
     self.assertEqual(htmldecode(plain), plain)
Example #22
0
 def test_simple(self):
     """Plain ASCII text without entities is returned unchanged."""
     plain = 'hola mundo'
     self.assertEqual(htmldecode(plain), plain)