Beispiel #1
0
    def test_html_invalid_utf8_entity_encoded(self):
        """htmldecode() must not raise on unusual numeric character references.

        Each sample is a hexadecimal character reference, labelled after the
        kind of octet sequence its digits spell out (or after the Unicode
        non-character it names). The decoded value is not inspected: the
        test only requires that every call returns without raising.
        """
        samples = {
            "Valid ASCII": u"a",
            "Valid 2 Octet Sequence": u"&#xc3b1",
            "Invalid 2 Octet Sequence": u"&#xc328",
            "Invalid Sequence Identifier": u"&#xa0a1",
            "Valid 3 Octet Sequence": u"&#xe282a1",
            "Invalid 3 Octet Sequence (in 2nd Octet)": u"&#xe228a1",
            "Invalid 3 Octet Sequence (in 3rd Octet)": u"&#xe28228",
            "Valid 4 Octet Sequence": u"&#xf0908cbc",
            "Invalid 4 Octet Sequence (in 2nd Octet)": u"&#xf0288cbc",
            "Invalid 4 Octet Sequence (in 3rd Octet)": u"&#xf09028bc",
            "Invalid 4 Octet Sequence (in 4th Octet)": u"&#xf0288c28",
            "Valid 5 Octet Sequence (but not Unicode!)": u" &#xf8a1a1a1a1 ",
            "Valid 6 Octet Sequence (but not Unicode!)": u" &#xfca1a1a1a1a1 ",
            "Invalid unicode FFFE": u"&#xFFFE",
            "Invalid unicode FFFF": u"&#xFFFF",
        }

        # items() exists on every Python version, iteritems() only on
        # Python 2; for a dict this small the extra list is irrelevant.
        for desc, sample in samples.items():
            try:
                htmldecode(sample)
            except Exception as e:
                msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
                # fail() reports the message as-is, without the misleading
                # "False is not true" prefix that assertTrue(False) adds.
                self.fail(msg % (e, desc))
Beispiel #2
0
    def test_html_invalid_utf8_entity_encoded(self):
        '''htmldecode() must survive valid and invalid entity encoded chars.

        The samples are hexadecimal character references named after the
        octet sequence their digits spell out, plus two Unicode
        non-characters. Only the absence of an exception is checked; the
        decoded result itself is ignored.
        '''
        samples = {
            'Valid ASCII': u"a",
            'Valid 2 Octet Sequence': u"&#xc3b1",
            'Invalid 2 Octet Sequence': u"&#xc328",
            'Invalid Sequence Identifier': u"&#xa0a1",
            'Valid 3 Octet Sequence': u"&#xe282a1",
            'Invalid 3 Octet Sequence (in 2nd Octet)': u"&#xe228a1",
            'Invalid 3 Octet Sequence (in 3rd Octet)': u"&#xe28228",
            'Valid 4 Octet Sequence': u"&#xf0908cbc",
            'Invalid 4 Octet Sequence (in 2nd Octet)': u"&#xf0288cbc",
            'Invalid 4 Octet Sequence (in 3rd Octet)': u"&#xf09028bc",
            'Invalid 4 Octet Sequence (in 4th Octet)': u"&#xf0288c28",
            'Valid 5 Octet Sequence (but not Unicode!)': u" &#xf8a1a1a1a1 ",
            'Valid 6 Octet Sequence (but not Unicode!)': u" &#xfca1a1a1a1a1 ",
            'Invalid unicode FFFE': u"&#xFFFE",
            'Invalid unicode FFFF': u"&#xFFFF",
        }

        # dict.items() works on Python 2 and 3 alike, unlike iteritems().
        for desc, sample in samples.items():
            try:
                htmldecode(sample)
            except Exception as e:
                msg = 'Exception "%s" was raised when trying to htmldecode() a "%s".'
                # Explicit failure instead of assertTrue(False, ...), so the
                # report contains just this message.
                self.fail(msg % (e, desc))
Beispiel #3
0
def html_unescape(t):
    '''Decoder that reverts HTML escaping.

    Thin wrapper that hands `t` to encode_decode.htmldecode() and returns
    whatever that call returns.

    >>> encode_decode.htmldecode('&lt;script&gt;')
    u'<script>'
    '''
    unescaped = encode_decode.htmldecode(t)
    return unescaped
Beispiel #4
0
def html_unescape(t):
    '''Decoder that reverts HTML escaping.

    The work is delegated to encode_decode.htmldecode(); its result is
    returned unchanged.

    >>> encode_decode.htmldecode('&lt;script&gt;')
    '<script>'
    '''
    decoded = encode_decode.htmldecode(t)
    return decoded
Beispiel #5
0
    def findEmails(self, documentString):
        '''
        Extract the e-mail addresses found in documentString.

        The text is URL-decoded and HTML-decoded first; afterwards every
        character that cannot be part of an address is replaced by a space
        and the remaining text is matched against the e-mail regex.
        Addresses not yet present in self._emails are appended to it.

        @param documentString: The text to search for e-mail addresses.
        @return: A list with all mail users that are present in the
                 documentString (self._emails, which also keeps addresses
                 stored by earlier calls on the same instance).

        Init,
        >>> from core.data.url.httpResponse import httpResponse as httpResponse
        >>> u = url_object('http://www.w3af.com/')
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a = abstractParser(response)

        First test, no emails.
        >>> a.findEmails( '' )
        []

        >>> a = abstractParser(response)
        >>> a.findEmails( ' abc@w3af.com ' )
        ['abc@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( '<a href="mailto:abc@w3af.com">test</a>' )
        ['abc@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( '<a href="mailto:abc@w3af.com">abc@w3af.com</a>' )
        ['abc@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( '<a href="mailto:abc@w3af.com">abc_def@w3af.com</a>' )
        ['abc@w3af.com', 'abc_def@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( 'header abc@w3af-scanner.com footer' )
        ['abc@w3af-scanner.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( 'header abc4def@w3af.com footer' )
        ['abc4def@w3af.com']
        '''
        # First, we decode all chars. Some sites encode the @, others encode
        # the whole address or add %20 padding, so better be safe.
        documentString = urllib.unquote_plus(documentString)

        # Now we decode the HTML special characters...
        documentString = htmldecode(documentString)

        # Fast path: in w3af, if we don't have an @ we don't have an email.
        # We don't support mails like myself <at> gmail !dot! com
        if '@' not in documentString:
            return self._emails

        # Raw strings keep the regex escapes away from Python's own
        # string-escape processing; the pattern values are unchanged.
        documentString = re.sub(r'[^\w@\-\.]', ' ', documentString)

        # NOTE: emailRegex is also used in pks search engine.
        # Now we have a clean documentString; and we can match the addresses!
        emailRegex = r'([A-Z0-9\._%-]{1,45}@([A-Z0-9\.-]{1,45}\.){1,10}[A-Z]{2,4})'
        # findall() yields (address, last domain label) pairs because the
        # pattern has two groups; only the full address is of interest.
        for email, _ in re.findall(emailRegex, documentString, re.IGNORECASE):
            if email not in self._emails:
                self._emails.append(email)

        return self._emails
Beispiel #6
0
 def test_bug_trigger_case01(self):
     r'''
     Regression test: unicode text holding non-ASCII letters must come back
     from htmldecode() unchanged.

     The bug was triggered by code equivalent to:
         u'í'.decode('utf-8')

     which raised:
         UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
                             position 9745: ordinal not in range(128)
     '''
     expected = u'Aquí encontrará'
     decoded = htmldecode(expected)
     self.assertEqual(decoded, expected)
Beispiel #7
0
 def test_bug_trigger_case01(self):
     r"""
     Regression test: htmldecode() has to return unicode input with
     accented letters exactly as it received it.

     The failure this guards against came from code equivalent to:
         u'í'.decode('utf-8')

     and surfaced as:
         UnicodeEncodeError: 'ascii' codec can't encode character u'\xed' in
                             position 9745: ordinal not in range(128)
     """
     text = u"Aquí encontrará"
     result = htmldecode(text)
     self.assertEqual(result, text)
Beispiel #8
0
    def _findEmails(self, doc_str):
        '''
        Extract the e-mail addresses found in doc_str.

        doc_str is URL-decoded and HTML-decoded, characters that cannot be
        part of an address are replaced by spaces, and the result is matched
        against self.EMAIL_RE. Addresses not yet present in self._emails are
        appended to it.

        @param doc_str: The text to search for e-mail addresses.
        @return: A list with all mail users that are present in the doc_str
                 (self._emails, which also keeps addresses stored by earlier
                 calls on the same instance).

        Init,
        >>> from core.data.url.httpResponse import httpResponse as httpResponse
        >>> u = url_object('http://www.w3af.com/')
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a = BaseParser(response)

        First test, no emails.
        >>> a._findEmails( '' )
        []

        >>> a = BaseParser(response)
        >>> a._findEmails(u' abc@w3af.com ')
        [u'abc@w3af.com']

        >>> a = BaseParser(response)
        >>> a._findEmails(u'<a href="mailto:abc@w3af.com">test</a>')
        [u'abc@w3af.com']

        >>> a = BaseParser(response)
        >>> a._findEmails(u'<a href="mailto:abc@w3af.com">abc@w3af.com</a>')
        [u'abc@w3af.com']

        >>> a = BaseParser(response)
        >>> a._findEmails(u'<a href="mailto:abc@w3af.com">abc_def@w3af.com</a>')
        [u'abc@w3af.com', u'abc_def@w3af.com']

        >>> a = BaseParser(response)
        >>> a._findEmails(u'header abc@w3af-scanner.com footer')
        [u'abc@w3af-scanner.com']

        >>> a = BaseParser(response)
        >>> a._findEmails(u'header abc4def@w3af.com footer')
        [u'abc4def@w3af.com']
        '''
        # Revert url-encoded sub-strings
        doc_str = urllib.unquote_plus(doc_str)

        # Then html-decode HTML special characters
        doc_str = htmldecode(doc_str)

        # Fast path: in w3af, if we don't have an @ we don't have an email.
        # We don't support mails like myself <at> gmail !dot! com
        if '@' not in doc_str:
            return self._emails

        # Raw string: the regex escapes reach the re module untouched by
        # Python's string-escape processing (same pattern value as before).
        not_email_chars = re.compile(r'[^\w@\-\.]', re.UNICODE)
        doc_str = not_email_chars.sub(' ', doc_str)

        # Each findall() item is unpacked into two values, so EMAIL_RE is
        # expected to define two groups; only the first one is stored.
        for email, _ in re.findall(self.EMAIL_RE, doc_str):
            if email not in self._emails:
                self._emails.append(email)

        return self._emails
Beispiel #9
0
    def _extract_emails(self, doc_str):
        '''
        Extract the e-mail addresses found in doc_str.

        doc_str is URL-decoded and HTML-decoded, characters that cannot be
        part of an address are replaced by spaces, and the result is matched
        against self.EMAIL_RE. self._emails is replaced by a fresh set on
        every call, so results of earlier calls are discarded.

        :param doc_str: The text to search for e-mail addresses.
        :return: A set() with all mail users that are present in the doc_str.
        @see: We don't support emails like myself <at> gmail !dot! com
        '''
        # Revert url-encoded sub-strings
        doc_str = urllib.unquote_plus(doc_str)

        # Then html-decode HTML special characters
        doc_str = htmldecode(doc_str)

        self._emails = set()

        # Perform a fast search for the @. In w3af, if we don't have an @ we
        # don't have an email.
        if '@' in doc_str:
            # Raw string: the regex escapes reach the re module untouched by
            # Python's string-escape processing (same pattern value as before).
            not_email_chars = re.compile(r'[^\w@\-\.]', re.UNICODE)
            doc_str = not_email_chars.sub(' ', doc_str)

            # Each findall() item is unpacked into two values, so EMAIL_RE is
            # expected to define two groups; only the first one is stored.
            for email, _ in re.findall(self.EMAIL_RE, doc_str):
                self._emails.add(email)

        return self._emails
 def findEmails(self, documentString):
     '''
     Extract the e-mail addresses found in documentString.

     @param documentString: The text to search; it is URL-decoded and
                            HTML-decoded before matching.
     @return: A list with all mail users that are present in the
              documentString (self._emails; addresses seen for the first
              time are appended to it).
     '''
     # First, we decode all chars. Some sites encode the @, others encode
     # the whole address or add %20 padding, so better be safe.
     documentString = urllib.unquote_plus(documentString)

     # Now we decode the html special characters...
     documentString = htmldecode(documentString)

     # Fast path: in w3af, if we don't have an @ we don't have an email.
     # We don't support mails like myself <at> gmail !dot! com
     if '@' not in documentString:
         return self._emails

     # Blank out everything that cannot be part of an address. The hyphen
     # has to survive this step: emailRegex accepts it in both the user and
     # the domain part, and turning it into a space would truncate
     # addresses such as user@my-site.com.
     documentString = re.sub(r'[^\w@\-\.]', ' ', documentString)

     # NOTE: emailRegex is also used in pks search engine.
     # Now we have a clean documentString; and we can match the addresses!
     emailRegex = r'([A-Z0-9\._%-]{1,45}@([A-Z0-9\.-]{1,45}\.){1,10}[A-Z]{2,4})'
     # findall() yields (address, last domain label) pairs because the
     # pattern has two groups; only the full address is of interest.
     for email, _ in re.findall(emailRegex, documentString, re.IGNORECASE):
         if email not in self._emails:
             self._emails.append(email)

     return self._emails
Beispiel #11
0
 def test_bug_trigger_case02(self):
     '''
     A UTF-8 encoded byte string given to htmldecode() must compare equal
     to the unicode object obtained by decoding those bytes as UTF-8.
     '''
     raw_utf8 = 'Aqu\xc3\xad encontrar\xc3\xa1'
     expected = 'Aqu\xc3\xad encontrar\xc3\xa1'.decode('utf-8')
     decoded = htmldecode(raw_utf8)
     self.assertEqual(decoded, expected)
Beispiel #12
0
 def test_html_encoded(self):
     '''A named HTML entity is replaced by the character it names.'''
     decoded = htmldecode(u'&aacute;')
     self.assertEqual(decoded, u'á')
Beispiel #13
0
 def test_charref(self):
     '''A hexadecimal character reference at the end of the text is decoded.'''
     encoded = u'hola mundo &#x41'
     expected = u'hola mundo A'
     self.assertEqual(htmldecode(encoded), expected)
Beispiel #14
0
 def test_special_char(self):
     '''A zero-padded decimal character reference is decoded to its char.'''
     encoded = u'hola &#0443'
     expected = u'hola ƻ'
     self.assertEqual(htmldecode(encoded), expected)
Beispiel #15
0
 def test_simple(self):
     """Text without entities or references is returned unchanged."""
     decoded = htmldecode("hola mundo")
     self.assertEqual(decoded, "hola mundo")
Beispiel #16
0
 def test_special_char(self):
     """A zero-padded decimal character reference is turned into its char."""
     decoded = htmldecode(u"hola &#0443")
     self.assertEqual(decoded, u"hola ƻ")
Beispiel #17
0
 def test_charref(self):
     """A trailing hexadecimal character reference is turned into its char."""
     decoded = htmldecode(u"hola mundo &#x41")
     self.assertEqual(decoded, u"hola mundo A")
Beispiel #18
0
 def test_bug_trigger_case04(self):
     """A lone non-ASCII unicode character passes through unchanged."""
     char = u"\xed"
     decoded = htmldecode(char)
     self.assertEqual(decoded, char)
Beispiel #19
0
 def test_bug_trigger_case02(self):
     """
     htmldecode() of a UTF-8 encoded byte string must compare equal to the
     unicode object obtained by decoding those bytes as UTF-8.
     """
     utf8_bytes = "Aqu\xc3\xad encontrar\xc3\xa1"
     as_unicode = "Aqu\xc3\xad encontrar\xc3\xa1".decode("utf-8")
     result = htmldecode(utf8_bytes)
     self.assertEqual(result, as_unicode)
Beispiel #20
0
 def test_bug_trigger_case04(self):
     '''A single non-ASCII unicode character is returned unchanged.'''
     expected = u'\xed'
     result = htmldecode(expected)
     self.assertEqual(result, expected)
Beispiel #21
0
 def test_tilde(self):
     """Literal accented characters are left untouched by htmldecode()."""
     decoded = htmldecode(u"hólá múndó")
     self.assertEqual(decoded, u"hólá múndó")
Beispiel #22
0
 def test_simple(self):
     '''Plain text with nothing to decode comes back unchanged.'''
     plain = 'hola mundo'
     expected = 'hola mundo'
     self.assertEqual(htmldecode(plain), expected)
Beispiel #23
0
 def test_tilde(self):
     '''Accented characters written literally survive htmldecode() as-is.'''
     accented = u'hólá múndó'
     expected = u'hólá múndó'
     self.assertEqual(htmldecode(accented), expected)
Beispiel #24
0
 def test_html_encoded(self):
     """A named HTML entity is decoded to the character it names."""
     encoded = u"&aacute;"
     expected = u"á"
     self.assertEqual(htmldecode(encoded), expected)