def test_parse_response_with_no_charset_in_header(self):
    '''
    When the response headers carry no charset at all, the body must be
    decoded using the module default charset and the default
    error-handling scheme.
    '''
    for plain_body, real_charset in TEST_RESPONSES.values():
        encoded_body = plain_body.encode(real_charset)

        # Content-Type without any "charset=..." attribute
        response = self.create_resp({'Content-Type':'text/xml'},
                                    encoded_body)

        expected = smart_unicode(encoded_body, DEFAULT_CHARSET,
                                 ESCAPED_CHAR, on_error_guess=False)
        self.assertEquals(expected, response.body)
def test_parse_response_with_wrong_charset(self):
    '''
    A wrong or non-existent charset was set in the Content-Type header;
    the response must fall back to decoding with the default charset
    and the default error-handling scheme.

    BUGFIX: the original test picked one of the two header charsets
    with random.choice(), so each run exercised only one of them and
    results were non-deterministic. Both values are now tested on
    every run.
    '''
    for header_charset in ('XXX', 'utf-8'):
        for body, charset in TEST_RESPONSES.values():
            html = body.encode(charset)
            headers = {'Content-Type': 'text/xml; charset=%s' %
                       header_charset}
            resp = self.create_resp(headers, html)
            self.assertEquals(
                smart_unicode(html, DEFAULT_CHARSET, ESCAPED_CHAR,
                              on_error_guess=False),
                resp.body
            )
def createFuzzableRequests(resp, request=None, add_self=True):
    '''
    Generates the fuzzable requests based on an http response instance.

    Sources mined from the response, in order:
      1) the request itself (when add_self is True),
      2) the Location/URI header of a 30x redirect,
      3) the forms found by the document parser, or - when no forms
         exist - the remote methods of a WSDL body.

    @parameter resp: An HTTPResponse instance.
    @parameter request: The HTTP request that generated the resp
    @parameter add_self: If I should add the current HTTP request
                         (@parameter request) to the result on not.
    @return: A list of fuzzable requests.
    '''
    is_redirect = lambda resp: 300 <= resp.getCode() < 400

    res = []

    # Headers for all fuzzable requests created here:
    # And add the fuzzable headers to the dict
    headers = dict((h, '') for h in cf.cf.getData('fuzzableHeaders'))

    # The request variants get the original request's headers on top of
    # the fuzzable ones (request may be None, hence the and/or guard).
    req_headers = dict(headers)
    req_headers.update(request and request.getHeaders() or {})

    # Get the cookie! (helper defined elsewhere in this module;
    # presumably built from the response's Set-Cookie headers)
    cookieObj = _create_cookie(resp)

    # Create the fuzzable request that represents the request object
    # passed as parameter
    if add_self:
        qsr = HTTPQSRequest(
            resp.getURI(),
            headers=req_headers,
            cookie=cookieObj
        )
        res.append(qsr)

    # If response was a 30X (i.e. a redirect) then include the
    # corresponding fuzzable request.
    if is_redirect(resp):
        redir_headers = resp.getLowerCaseHeaders()
        # Some servers use the non-standard 'URI' header instead of
        # 'Location'
        location = redir_headers.get('location') or \
            redir_headers.get('uri', '')
        if location:
            location = smart_unicode(location, encoding=resp.charset)
            try:
                absolute_location = resp.getURL().urlJoin(location)
            except ValueError:
                # Broken redirect target: log it and carry on without
                # adding a request for it
                msg = 'The application sent a 30x redirect "Location:" that'
                msg += ' w3af failed to correctly parse as an URL, the header'
                msg += ' value was: "%s"'
                om.out.debug( msg % location )
            else:
                qsr = HTTPQSRequest(
                    absolute_location,
                    headers=req_headers,
                    cookie=cookieObj
                )
                res.append(qsr)

    # Try to find forms in the document
    try:
        dp = dpCache.dpc.getDocumentParserFor(resp)
    except w3afException:
        # Failed to find a suitable parser for the document
        form_list = []
    else:
        form_list = dp.getForms()

    if not form_list:
        # Check if its a wsdl file
        wsdlp = wsdlParser.wsdlParser()
        try:
            wsdlp.setWsdl(resp.getBody())
        except w3afException:
            # Not a WSDL document either; nothing more to extract
            pass
        else:
            # One post-data request per remote method exposed by the
            # web service
            for rem_meth in wsdlp.getMethods():
                wspdr = wsPostDataRequest(
                    rem_meth.getLocation(),
                    rem_meth.getAction(),
                    rem_meth.getParameters(),
                    rem_meth.getNamespace(),
                    rem_meth.getMethodName(),
                    headers
                )
                res.append(wspdr)
    else:
        # Create one httpPostDataRequest for each form variant
        mode = cf.cf.getData('fuzzFormComboValues')
        for form in form_list:
            for variant in form.getVariants(mode):
                if form.getMethod().upper() == 'POST':
                    r = httpPostDataRequest(
                        variant.getAction(),
                        variant.getMethod(),
                        headers,
                        cookieObj,
                        variant,
                        form.getFileVariables()
                    )
                else:
                    # The default is a GET request
                    r = HTTPQSRequest(
                        variant.getAction(),
                        headers=headers,
                        cookie=cookieObj
                    )
                r.setDc(variant)
                res.append(r)

    return res
def _charset_handling(self):
    '''
    Decode the body based on the header (or metadata) encoding.

    The implemented algorithm follows the encoding detection logic
    used by FF:

        1) First try to find a charset using the following search criteria:
            a) Look in the 'content-type' HTTP header. Example:
                content-type: text/html; charset=iso-8859-1
            b) Look in the 'meta' HTML header. Example:
                <meta .* content="text/html; charset=utf-8" />
            c) Determine the charset using the chardet module (TODO)
            d) Use the DEFAULT_CHARSET

        2) Try to decode the body using the found charset. If it fails,
           then force it to use the DEFAULT_CHARSET

    Finally return the unicode (decoded) body and the used charset.

    Note: If the body is already a unicode string return it as it is.

    @return: A (unicode body, charset) tuple.
    '''
    lowerCaseHeaders = self.getLowerCaseHeaders()
    charset = self._charset
    rawbody = self._raw_body

    # Only try to decode <str> strings. Skip <unicode> strings: they
    # were decoded earlier and MUST carry their charset with them.
    if isinstance(rawbody, unicode):
        _body = rawbody
        assert charset is not None, ("httpResponse objects containing "
                                     "unicode body must have an associated "
                                     "charset")
    elif 'content-type' not in lowerCaseHeaders:
        om.out.debug("hmmm... wtf?! The remote web server failed to "
                     "send the 'content-type' header.")
        _body = rawbody
        charset = DEFAULT_CHARSET
    elif not self.is_text_or_html():
        # Not text, save as it is.
        _body = rawbody
        charset = charset or DEFAULT_CHARSET
    else:
        # Figure out charset to work with
        if not charset:
            # 1a) Start with the headers
            charset_mo = re.search(r'charset=\s*?([\w-]+)',
                                   lowerCaseHeaders['content-type'],
                                   re.I)
            if charset_mo:
                # Seems like the response's headers contain a charset
                charset = charset_mo.group(1).lower().strip()
            else:
                # 1b) Continue with the body's meta tag
                charset_mo = re.search(
                    r'<meta.*?content=".*?charset=\s*?([\w-]+)".*?>',
                    rawbody, re.IGNORECASE)
                if charset_mo:
                    charset = charset_mo.group(1).lower().strip()
                else:
                    # 1d) Give up and use the default
                    charset = DEFAULT_CHARSET

        # Now that we have the charset, we use it! The return value of
        # the decode function is a unicode string.
        try:
            _body = smart_unicode(rawbody, charset,
                                  errors=ESCAPED_CHAR,
                                  on_error_guess=False)
        except LookupError:
            # Warn about a buggy charset.
            # BUGFIX: the old message interpolated self._charset as the
            # "default", but the code actually falls back to
            # DEFAULT_CHARSET (self._charset may even be None here).
            msg = ('Charset LookupError: unknown charset: %s; '
                   'ignored and set to default: %s'
                   % (charset, DEFAULT_CHARSET))
            om.out.debug(msg)
            # Forcing it to use the default
            charset = DEFAULT_CHARSET
            _body = smart_unicode(rawbody, charset,
                                  errors=ESCAPED_CHAR,
                                  on_error_guess=False)

    return _body, charset