def get_desc(self, with_id=False):
    """
    Render this finding's description.

    When no TEMPLATE is defined by the subclass, delegate to the first
    info object; otherwise render TEMPLATE with jinja2 using data from
    the whole information set.

    :param with_id: Forwarded to first_info.get_desc() when no template
    :return: The rendered description as a string
    """
    if self.TEMPLATE is None:
        return self.first_info.get_desc(with_id=with_id)

    # We render the template using the information set data
    context = {'urls': [smart_unicode(u) for u in self.get_urls()],
               'uris': [smart_unicode(u) for u in self.get_uris()],
               'severity': self.get_severity(),
               'name': self.get_name(),
               'id': self.get_id(),
               'method': smart_unicode(self.get_method()),
               'plugin': self.get_plugin_name()}
    # first_info keys override / extend the generic context
    context.update(self.first_info.items())

    template_str = textwrap.dedent(self.TEMPLATE)
    template = self.JINJA2_ENV.from_string(template_str)

    try:
        rendered_desc = template.render(context)
    except UnicodeDecodeError:
        # Log the template and context that triggered the error, then
        # re-raise so the caller sees the original failure
        context_pp = pprint.pformat(context, indent=4)
        msg = ('UnicodeDecodeError found while rendering:\n\n%s\n\n'
               'Using the following context:\n\n%r\n\n')
        om.out.debug(msg % (smart_str(template_str), smart_str(context_pp)))
        raise

    return rendered_desc
def get_desc(self, with_id=False):
    """
    Render this finding's description.

    When no TEMPLATE is defined by the subclass, delegate to the first
    info object; otherwise render TEMPLATE with jinja2 using data from
    the whole information set.

    :param with_id: Forwarded to first_info.get_desc() when no template
    :return: The rendered description as a string
    """
    if self.TEMPLATE is None:
        return self.first_info.get_desc(with_id=with_id)

    # We render the template using the information set data
    context = {
        'urls': [smart_unicode(u) for u in self.get_urls()],
        'uris': [smart_unicode(u) for u in self.get_uris()],
        'severity': self.get_severity(),
        'name': self.get_name(),
        'id': self.get_id(),
        'method': smart_unicode(self.get_method()),
        'plugin': self.get_plugin_name()
    }
    # first_info keys override / extend the generic context
    context.update(self.first_info.items())

    template_str = textwrap.dedent(self.TEMPLATE)
    template = self.JINJA2_ENV.from_string(template_str)

    try:
        rendered_desc = template.render(context)
    except UnicodeDecodeError:
        # Log the template and context that triggered the error, then
        # re-raise so the caller sees the original failure
        context_pp = pprint.pformat(context, indent=4)
        msg = ('UnicodeDecodeError found while rendering:\n\n%s\n\n'
               'Using the following context:\n\n%r\n\n')
        om.out.debug(msg % (smart_str(template_str), smart_str(context_pp)))
        raise

    return rendered_desc
def __setitem__(self, k, v):
    """
    Store a header, normalizing both the name and the value to unicode.

    :param k: The header name; must be a string
    :param v: The header value; a string or a DataToken (whose inner
              value is decoded in place)
    :raise ValueError: When the name or the value is not a string
    """
    if isinstance(k, basestring):
        k = smart_unicode(k, encoding=self.encoding)
    else:
        raise ValueError('Header name must be a string.')

    if isinstance(v, basestring):
        v = smart_unicode(v, encoding=self.encoding)
    elif isinstance(v, DataToken):
        # Decode the token's value in place so the stored token always
        # carries unicode
        encoded_str = smart_unicode(v.get_value(), encoding=self.encoding)
        v.set_value(encoded_str)
    else:
        raise ValueError('Header value must be a string.')

    super(Headers, self).__setitem__(k, v)
def headers_url_generator(resp, fuzzable_req):
    """
    Yields tuples containing:
        * Newly found URL
        * The FuzzableRequest instance passed as parameter
        * The HTTPResponse generated by the FuzzableRequest
        * Boolean indicating if we trust this reference or not

    The newly found URLs are extracted from the http response headers such
    as "Location".

    :param resp: HTTP response object
    :param fuzzable_req: The HTTP request that generated the response
    """
    resp_headers = resp.get_headers()

    # URL_HEADERS maps a parser callable to the header names it handles
    for parser, header_names in URL_HEADERS.iteritems():
        for header_name in header_names:
            header_value, _ = resp_headers.iget(header_name, None)
            if header_value is not None:
                # Decode using the response charset before parsing
                header_value = smart_unicode(header_value,
                                             encoding=resp.charset)

                for ref in parser(resp, header_name, header_value):
                    # False: header-derived references are not trusted
                    yield ref, fuzzable_req, resp, False
def end(self):
    """
    This method is called when the scan has finished, we perform these main
    tasks:
        * Get the target URLs
        * Get the enabled plugins
        * Get the vulnerabilities and infos from the KB
        * Get the debug data
        * Send all the data to jinja2 for rendering the template
    """
    target_urls = [t.url_string for t in cf.cf.get('targets')]
    target_domain = cf.cf.get('target_domains')[0]
    enabled_plugins = self._enabled_plugins
    findings = kb.kb.get_all_findings()
    debug_log = ((t, l, smart_unicode(m)) for (t, l, m)
                 in self._additional_info)
    known_urls = kb.kb.get_all_known_urls()

    context = {
        'target_urls': target_urls,
        'target_domain': target_domain,
        'enabled_plugins': enabled_plugins,
        'findings': findings,
        'debug_log': debug_log,
        'known_urls': known_urls
    }

    # The file was verified to exist when setting the plugin configuration.
    #
    # Use context managers (instead of bare file() calls) so both handles
    # are closed even if rendering raises; the original code leaked them.
    with open(os.path.expanduser(self._template), 'r') as template_fh:
        with open(os.path.expanduser(self._output_file_name),
                  'w') as output_fh:
            self._render_html_file(template_fh, context, output_fh)
def test_parse_response_with_no_charset_in_header(self):
    # No charset was specified, use the default as well as the default
    # error handling scheme
    for body, charset in TEST_RESPONSES.values():
        html = body.encode(charset)
        resp = self.create_resp(Headers([("Content-Type", "text/xml")]),
                                html)
        # The parsed body must equal a default-charset decode of the bytes
        self.assertEquals(smart_unicode(html,
                                        DEFAULT_CHARSET,
                                        ESCAPED_CHAR,
                                        on_error_guess=False),
                          resp.body)
def _clean_filenames(self, filenames):
    """
    Filter some characters from filenames.

    Strips a leading '/', then a leading './', then a trailing '/' from
    each name and collects the results in a set.

    :return: A clear list of filenames.
    """
    def _strip(name):
        # Order matters: '/', then './', then trailing '/'
        if name.startswith('/'):
            name = name[1:]
        if name.startswith('./'):
            name = name[2:]
        if name.endswith('/'):
            name = name[:-1]
        return name

    # Sometimes we get random bytes from the .git/index because of
    # git versions we don't fully support, so we ignore any encoding
    # errors
    return set(_strip(smart_unicode(f, errors='ignore'))
               for f in filenames)
def end(self):
    """
    This method is called when the scan has finished, we perform these main
    tasks:
        * Get the target URLs
        * Get the enabled plugins
        * Get the vulnerabilities and infos from the KB
        * Get the debug data
        * Send all the data to jinja2 for rendering the template
    """
    target_urls = [t.url_string for t in cf.cf.get('targets')]
    target_domain = cf.cf.get('target_domains')[0]
    enabled_plugins = self._enabled_plugins
    findings = kb.kb.get_all_findings()
    debug_log = ((t, l, smart_unicode(m)) for (t, l, m)
                 in self._additional_info)
    known_urls = kb.kb.get_all_known_urls()

    context = {'target_urls': target_urls,
               'target_domain': target_domain,
               'enabled_plugins': enabled_plugins,
               'findings': findings,
               'debug_log': debug_log,
               'known_urls': known_urls}

    # The file was verified to exist when setting the plugin configuration.
    #
    # Use context managers (instead of bare file() calls) so both handles
    # are closed even if rendering raises; the original code leaked them.
    with open(os.path.expanduser(self._template), 'r') as template_fh:
        with open(os.path.expanduser(self._output_file_name),
                  'w') as output_fh:
            self._render_html_file(template_fh, context, output_fh)
def _headers_url_generator(self, resp, fuzzable_req): """ Yields tuples containing: * Newly found URL * The FuzzableRequest instance passed as parameter * The HTTPResponse generated by the FuzzableRequest * Boolean indicating if we trust this reference or not The newly found URLs are extracted from the http response headers such as "Location". :param resp: HTTP response object :param fuzzable_req: The HTTP request that generated the response """ # If response was a 30X (i.e. a redirect) then include the # corresponding fuzzable request. resp_headers = resp.get_headers() for url_header_name in URL_HEADERS: url_header_value, _ = resp_headers.iget(url_header_name, '') if url_header_value: url = smart_unicode(url_header_value, encoding=resp.charset) try: ref = resp.get_url().url_join(url) except ValueError: msg = 'The application sent a "%s" redirect that w3af' \ ' failed to correctly parse as an URL, the header' \ ' value was: "%s"' om.out.debug(msg % (url_header_name, url)) else: yield ref, fuzzable_req, resp, False
def clean_values(self, init_val):
    """
    Decode wire-level (key, value) pairs into unicode.

    Already-parsed containers (DataContainer / dict) pass through
    untouched.
    """
    if isinstance(init_val, (DataContainer, dict)):
        return init_val

    # I can do this key, value thing because the headers do NOT
    # have multiple header values like query strings and post-data
    return [(smart_unicode(key),
             smart_unicode(value) if isinstance(value, basestring)
             else value)
            for key, value in init_val]
def comment(self, elem):
    # Collect the text of an HTML comment node found while parsing,
    # skipping comments used to hide script code
    if self._inside_script:
        # This handles the case where we have:
        # <script><!-- code(); --></script>
        return

    if elem.text is not None:
        self._comments_in_doc.append(smart_unicode(elem.text))
def test_invalid_utf8(self):
    # '\xf3' is not a valid UTF-8 byte sequence: plain unicode() must
    # raise, while smart_unicode() recovers and returns u'\xf3'
    invalid_utf8 = '\xf3'
    token = DataToken(self.NAME, invalid_utf8, self.PATH)

    self.assertRaises(UnicodeDecodeError, unicode, token)

    encoded_token = smart_unicode(token)
    self.assertEqual(encoded_token, u'\xf3')
def clean_values(self, init_val):
    """
    Decode wire-level (key, value) pairs into unicode.

    Already-parsed containers (NonRepeatKeyValueContainer / dict) pass
    through untouched.
    """
    if isinstance(init_val, (NonRepeatKeyValueContainer, dict)):
        return init_val

    decoded_pairs = []

    # I can do this (key, value) thing because the headers do NOT
    # have multiple header values like query strings and post-data
    for raw_key, raw_value in init_val:
        if isinstance(raw_value, basestring):
            raw_value = smart_unicode(raw_value)

        decoded_pairs.append((smart_unicode(raw_key), raw_value))

    return decoded_pairs
def test_invalid_utf8(self):
    # "\xf3" is not a valid UTF-8 byte sequence: plain unicode() must
    # raise, while smart_unicode() recovers and returns u"\xf3"
    invalid_utf8 = "\xf3"
    token = DataToken(self.NAME, invalid_utf8, self.PATH)

    self.assertRaises(UnicodeDecodeError, unicode, token)

    encoded_token = smart_unicode(token)
    self.assertEqual(encoded_token, u"\xf3")
def get_clean_body(mutant, response):
    """
    @see: Very similar to fingerprint_404.py get_clean_body() bug not quite
          the same maybe in the future I can merge both?

    Definition of clean in this method:
        - input:
            - response.get_url() == http://host.tld/aaaaaaa/?id=1 OR 23=23
            - response.get_body() == '...<x>1 OR 23=23</x>...'

        - output:
            - self._clean_body(response) == '...<x></x>...'

    All injected values are removed encoded and 'as is'.

    :param mutant: The mutant where I can get the value from.
    :param response: The HTTPResponse object to clean
    :return: A string that represents the 'cleaned' response body.
    """
    if not response.is_text_or_html():
        return response.body

    body = response.body
    mod_value_1 = mutant.get_token_value()

    # Since the body is already in unicode, when we call body.replace() all
    # arguments are converted to unicode by python. If there are special
    # chars in the mod_value then we end up with an UnicodeDecodeError, so
    # I convert it myself with some error handling
    #
    # https://github.com/andresriancho/w3af/issues/8953
    mod_value_1 = smart_unicode(mod_value_1, errors=PERCENT_ENCODE)

    # unquote, just in case the plugin did an extra encoding of some type.
    # what we want to do here is get the original version of the string
    mod_value_2 = urllib.unquote_plus(mod_value_1)

    payloads_to_replace = set()
    payloads_to_replace.add(mod_value_1)
    payloads_to_replace.add(mod_value_2)

    # Expand each payload into its many encoded forms so that the encoded
    # echoes in the response are removed as well
    encoded_payloads = set()

    for payload in payloads_to_replace:
        for encoded_payload in apply_multi_escape_table(payload,
                                                        EXTENDED_TABLE):
            encoded_payloads.add(encoded_payload)

    # uniq sorted by longest len
    encoded_payloads = list(encoded_payloads)
    encoded_payloads.sort(lambda x, y: cmp(len(y), len(x)))

    empty = u''
    replace = unicode.replace

    for to_replace in encoded_payloads:
        body = replace(body, to_replace, empty)

    return body
def test_parse_response_with_wrong_charset(self):
    # A wrong or non-existant charset was set; try to decode the response
    # using the default charset and handling scheme
    from random import choice

    for body, charset in TEST_RESPONSES.values():
        html = body.encode(charset)
        # Randomly exercise both an invalid ("XXX") and a valid charset
        headers = Headers([("Content-Type",
                            "text/xml; charset=%s" % choice(("XXX",
                                                             "utf-8")))])
        resp = self.create_resp(headers, html)
        self.assertEquals(smart_unicode(html,
                                        DEFAULT_CHARSET,
                                        ESCAPED_CHAR,
                                        on_error_guess=False),
                          resp.body)
def get_path_qs(self):
    """
    :return: Returns the path for the url containing the QS
    """
    res = self.path

    if self.params != u'':
        # Path parameters (";key=value") come right after the path
        res += u';' + self.params

    if self.has_query_string():
        res += u'?' + smart_unicode(self.querystring)

    return res
def test_parse_response_with_no_charset_in_header(self):
    # No charset was specified, use the default as well as the default
    # error handling scheme
    for body, charset in TEST_RESPONSES.values():
        html = body.encode(charset)
        resp = self.create_resp(Headers([('Content-Type', 'text/xml')]),
                                html)
        # The parsed body must equal a default-charset decode of the bytes
        self.assertEquals(
            smart_unicode(html,
                          DEFAULT_CHARSET,
                          ESCAPED_CHAR,
                          on_error_guess=False),
            resp.body)
def __setattr__(self, key, value):
    """
    Overriding in order to translate every value to an unicode object

    :param key: The attribute name to set
    :param value: The value (string, unicode or anything else)
    :return: None
    """
    if isinstance(value, basestring):
        value = smart_unicode(value)

    # NOTE: attributes are stored as mapping items, not in __dict__
    self[key] = value
def _to_str_with_separators(self, key_val_sep, pair_sep):
    """
    Serialize this data container: each key/value pair is joined with
    key_val_sep, and the pairs are joined with pair_sep.

    :return: The resulting unicode string.
    """
    pairs = [u"%s%s%s" % (key, key_val_sep,
                          smart_unicode(val, encoding=UTF8))
             for key, val in self.items()]

    return pair_sep.join(pairs)
def _to_str_with_separators(self, key_val_sep, pair_sep):
    """
    Serialize this data container: each key/value pair is joined with
    key_val_sep, and the pairs are joined with pair_sep.

    :return: The resulting unicode string.
    """
    return pair_sep.join(
        u'%s%s%s' % (key, key_val_sep, smart_unicode(val, encoding=UTF8))
        for key, val in self.items())
def response_dump(_id):
    """
    :param _id: The ID to query in the database
    :return: The response as unicode, or None when the ID is not found
    """
    _history = HistoryItem()

    try:
        details = _history.read(_id)
    except DBException:
        # The response is not (or no longer) stored in the DB
        return None

    return smart_unicode(details.response.dump().strip())
def test_from_dict_encodings(self):
    # Serializing to a dict (msgpack round-trip) and loading it back must
    # preserve the decoded body regardless of the original charset
    for body, charset in TEST_RESPONSES.values():
        html = body.encode(charset)
        resp = self.create_resp(Headers([("Content-Type", "text/xml")]),
                                html)

        msg = msgpack.dumps(resp.to_dict())
        loaded_dict = msgpack.loads(msg)

        loaded_resp = HTTPResponse.from_dict(loaded_dict)

        self.assertEquals(
            smart_unicode(html,
                          DEFAULT_CHARSET,
                          ESCAPED_CHAR,
                          on_error_guess=False),
            loaded_resp.body
        )
def test_parse_response_with_wrong_charset(self):
    # A wrong or non-existant charset was set; try to decode the response
    # using the default charset and handling scheme
    for body, charset in TEST_RESPONSES.values():
        html = body.encode(charset)
        # Randomly exercise both an invalid ('XXX') and a valid charset
        headers = Headers([('Content-Type',
                            'text/xml; charset=%s' % choice(('XXX',
                                                             'utf-8')))])
        resp = self.create_resp(headers, html)
        self.assertEquals(
            smart_unicode(html,
                          DEFAULT_CHARSET,
                          ESCAPED_CHAR,
                          on_error_guess=False),
            resp.body
        )
def _to_str_with_separators(self, key_val_sep, pair_sep, errors='strict'):
    """
    Serialize this data container: each (key, value) pair is joined with
    key_val_sep — one entry per value in a key's value list — and the
    resulting pairs are joined with pair_sep.

    :param errors: Unicode error handling scheme passed to smart_unicode
    :return: The resulting unicode string.
    """
    return pair_sep.join(
        u'%s%s%s' % (key,
                     key_val_sep,
                     smart_unicode(value, encoding=UTF8, errors=errors))
        for key, value_list in self.items()
        for value in value_list)
def test_dump_case03(self):
    # Exercise dump() with a header value containing all 256 byte values
    header_value = ''.join(chr(i) for i in xrange(256))

    expected = u'\r\n'.join([u'GET http://w3af.com/a/b/c.php HTTP/1.1',
                             u'Hola: %s' % smart_unicode(header_value),
                             u'',
                             u'a=b'])

    headers = Headers([(u'Hola', header_value)])
    post_data = KeyValueContainer(init_val=[('a', ['b'])])
    fr = FuzzableRequest(self.url, method='GET', post_data=post_data,
                         headers=headers)

    self.assertEqual(fr.dump(), expected)
def test_dump_case03(self):
    # Exercise dump() with a header value containing all 256 byte values
    header_value = ''.join(chr(i) for i in xrange(256))

    expected = u'\r\n'.join([u'GET http://w3af.com/a/b/c.php HTTP/1.1',
                             u'Hola: %s' % smart_unicode(header_value),
                             u'',
                             u''])

    headers = Headers([(u'Hola', header_value)])

    #TODO: Note that I'm passing a dc to the FuzzableRequest and it's not
    # appearing in the dump. It might be a bug...
    fr = FuzzableRequest(self.url, method='GET', dc={u'a': ['b']},
                         headers=headers)

    self.assertEqual(fr.dump(), expected)
def url_string(self):
    """
    :return: A <unicode> representation of the URL
    """
    data = (self.scheme, self.netloc, self.path, self.params,
            self.querystring, self.fragment)
    data = [smart_unicode(s) for s in data]

    calc = urlparse.urlunparse(data)

    # ensuring this is actually unicode
    if not isinstance(calc, unicode):
        calc = unicode(calc, self.encoding, 'replace')

    return calc
def test_from_dict_encodings(self):
    # Serializing to a dict (msgpack round-trip) and loading it back must
    # preserve the decoded body regardless of the original charset
    for body, charset in TEST_RESPONSES.values():
        html = body.encode(charset)
        resp = self.create_resp(Headers([('Content-Type', 'text/xml')]),
                                html)

        msg = msgpack.dumps(resp.to_dict())
        loaded_dict = msgpack.loads(msg)

        loaded_resp = HTTPResponse.from_dict(loaded_dict)

        self.assertEquals(
            smart_unicode(html,
                          DEFAULT_CHARSET,
                          ESCAPED_CHAR,
                          on_error_guess=False),
            loaded_resp.body)
def test_dump_case03(self):
    # Exercise dump() with a header value containing all 256 byte values
    header_value = ''.join(chr(i) for i in xrange(256))

    expected = u'\r\n'.join([
        u'GET http://w3af.com/a/b/c.php HTTP/1.1',
        u'Hola: %s' % smart_unicode(header_value),
        u'',
        u''
    ])

    headers = Headers([(u'Hola', header_value)])

    #TODO: Note that I'm passing a dc to the FuzzableRequest and it's not
    # appearing in the dump. It might be a bug...
    fr = FuzzableRequest(self.url, method='GET', dc={u'a': ['b']},
                         headers=headers)

    self.assertEqual(fr.dump(), expected)
def jinja2_attr_value_escape_filter(value):
    """
    This method is used to escape attribute values: <tag attribute="value">

    The objective is to escape all the special characters which can not be
    printed in that context.

    We also implement something very specific for special characters. We're
    replacing the XML invalid characters with:

        <character code="%04x"/>

    The parser should handle that and replace these tags with the real char
    (if it can be handled by the reader).

    Something to note is that when escaping special characters we print the
    HTML-encoded (< replaced by &lt; and so on) version of the `character`
    tag. We do that because it is invalid to print < inside the attribute
    value.

    :param value: The value to escape
    :return: The escaped string
    """
    if not isinstance(value, basestring):
        return value

    # Fix some encoding errors which are triggered when the value is not an
    # unicode string
    value = smart_unicode(value)

    # Characters in the IGNORE set are passed through untouched; everything
    # else is looked up in the escape table, defaulting to itself
    escaped_chars = [char if char in ATTR_VALUE_ESCAPES_IGNORE
                     else ATTR_VALUE_ESCAPES.get(char, char)
                     for char in value]

    return jinja2.Markup(u''.join(escaped_chars))
def get_clean_body(mutant, response):
    """
    @see: Very similar to fingerprint_404.py get_clean_body() bug not quite
          the same maybe in the future I can merge both?

    Definition of clean in this method:
        - input:
            - response.get_url() == http://host.tld/aaaaaaa/?id=1 OR 23=23
            - response.get_body() == '...<x>1 OR 23=23</x>...'

        - output:
            - self._clean_body( response ) == '...<x></x>...'

    All injected values are removed encoded and "as is".

    :param mutant: The mutant where I can get the value from.
    :param response: The HTTPResponse object to clean
    :return: A string that represents the "cleaned" response body.
    """
    body = response.body

    if response.is_text_or_html():
        mod_value = mutant.get_token_value()

        # Since the body is already in unicode, when we call body.replace() all
        # arguments are converted to unicode by python. If there are special
        # chars in the mod_value then we end up with an UnicodeDecodeError, so
        # I convert it myself with some error handling
        #
        # https://github.com/andresriancho/w3af/issues/8953
        mod_value = smart_unicode(mod_value, errors=PERCENT_ENCODE)

        empty = u''
        unquoted = urllib.unquote_plus(mod_value)
        cgi_escape = cgi.escape

        # Remove the payload "as is", URL-decoded, and HTML-escaped forms
        body = body.replace(mod_value, empty)
        body = body.replace(unquoted, empty)
        body = body.replace(cgi_escape(mod_value), empty)
        body = body.replace(cgi_escape(unquoted), empty)

    return body
def jinja2_text_value_escape_filter(value):
    """
    This method is used to escape text values: <tag>text</tag>

    The objective is to escape all the special characters which can not be
    printed in that context, and the special characters which might be in
    the input and we want to escape to avoid "xml injection".

    We also implement something very specific for special characters. We're
    replacing the XML invalid characters with:

        <character code="%04x"/>

    The parser should handle that and replace these tags with the real char
    (if it can be handled by the reader).

    :param value: The value to escape
    :return: The escaped string
    """
    if not isinstance(value, basestring):
        return value

    # Fix some encoding errors which are triggered when the value is not an
    # unicode string
    value = smart_unicode(value)

    # Characters in the IGNORE set are passed through untouched; everything
    # else is looked up in the escape table, defaulting to itself
    result = u''.join(char if char in TEXT_VALUE_ESCAPES_IGNORE
                      else TEXT_VALUE_ESCAPES.get(char, char)
                      for char in value)

    return jinja2.Markup(result)
def jinja2_attr_value_escape_filter(value):
    """
    Escape a value so it can be printed inside an XML/HTML attribute.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, basestring):
        return value

    # Fix some encoding errors which are triggered when the value is not an
    # unicode string
    value = smart_unicode(value)

    parts = []

    for char in value:
        if char in ATTR_VALUE_ESCAPES_IGNORE:
            # Whitelisted characters are emitted as-is
            parts.append(char)
        else:
            # Escape table lookup, falling back to the character itself
            parts.append(ATTR_VALUE_ESCAPES.get(char, char))

    return jinja2.Markup(u''.join(parts))
def __init__(self, code, read, headers, geturl, original_url,
             msg='OK', _id=None, time=DEFAULT_WAIT_TIME, alias=None,
             charset=None):
    """
    :param code: HTTP code
    :param read: HTTP body text; typically a string
    :param headers: HTTP headers, typically a dict or a httplib.HTTPMessage
    :param geturl: URL object instance
    :param original_url: URL object instance
    :param msg: HTTP message
    :param _id: Optional response identifier
    :param time: The time between the request and the response
    :param alias: Alias for the response, this contains a hash that helps
                  the backend sqlite find http_responses faster by
                  indexing by this attr.
    :param charset: Response's encoding; obligatory when `read` is unicode
    """
    if not isinstance(geturl, URL):
        msg = 'Invalid type %s for HTTPResponse ctor param geturl.'
        raise TypeError(msg % type(geturl))

    if not isinstance(original_url, URL):
        msg = 'Invalid type %s for HTTPResponse ctor param original_url.'
        raise TypeError(msg % type(original_url))

    if not isinstance(headers, Headers):
        msg = 'Invalid type %s for HTTPResponse ctor param headers.'
        raise TypeError(msg % type(headers))

    if not isinstance(read, basestring):
        raise TypeError('Invalid type %s for HTTPResponse ctor param read.'
                        % type(read))

    self._charset = charset
    self._headers = None
    self._body = None
    self._raw_body = read
    self._content_type = None
    self._dom = None

    # A unique id identifier for the response
    self.id = _id

    # From cache defaults to False
    self._from_cache = False

    # Set the info
    self._info = headers

    # Set code
    self.set_code(code)

    # Set the URL variables
    # The URL that we really GET'ed
    self._realurl = original_url.uri2url()
    self._uri = original_url

    # The URL where we were redirected to (equal to original_url
    # when no redirect)
    self._redirected_url = geturl
    self._redirected_uri = geturl.uri2url()

    # Set the rest
    self._msg = smart_unicode(msg)
    self._time = time
    self._alias = alias
    self._doc_type = None

    # Internal lock
    self._body_lock = threading.RLock()
def set_protocol(self, protocol):
    """
    Set the protocol (scheme) for the url, e.g. "http" or "https".

    NOTE(review): the original docstring said "Returns the domain name",
    which was a copy-paste error from another accessor.
    """
    self._scheme = smart_unicode(protocol)
def set_path(self, path):
    # Store the path as unicode; an empty path defaults to the root u'/'
    self._path = smart_unicode(path) or u'/'
def _get_clean_body_impl(response, strings_to_replace_list,
                         multi_encode=True):
    """
    This is a low level function which allows me to use all the improvements
    I did in the helpers.get_clean_body() in fingerprint_404.get_clean_body().

    Both helpers.get_clean_body() and fingerprint_404.get_clean_body() receive
    different parameters, do some preparation work, and then call this
    function to really do the replacements.

    :param response: HTTP response object
    :param strings_to_replace_list: A list of strings to replace. These can be
                                    byte strings or unicode, we'll handle both
                                    internally.
    :param multi_encode: Apply the multiple encodings before replacing,
                         setting this to True with many strings to replace in
                         the list will consume considerable CPU time.
    :return: The body as a unicode with all strings to replace removed.
    """
    body = response.body

    unicodes_to_replace_set = set()

    for str_to_repl in strings_to_replace_list:
        # Since the body is already in unicode, when we call body.replace() all
        # arguments are converted to unicode by python. If there are special
        # chars in the mod_value then we end up with an UnicodeDecodeError, so
        # I convert it myself with some error handling
        #
        # https://github.com/andresriancho/w3af/issues/8953
        unicode_to_repl = smart_unicode(str_to_repl, errors=PERCENT_ENCODE)

        # unquote, just in case the plugin did an extra encoding of some type.
        # what we want to do here is get the original version of the string
        unicode_to_repl_unquoted = urllib.unquote_plus(unicode_to_repl)

        unicodes_to_replace_set.add(unicode_to_repl)
        unicodes_to_replace_set.add(unicode_to_repl_unquoted)

    # Now we apply multiple encodings to find in different responses
    encoded_payloads = set()

    if multi_encode:
        # Populate the set with multiple versions of the same set
        for unicode_to_repl in unicodes_to_replace_set:
            for encoded_to_repl in apply_multi_escape_table(
                    unicode_to_repl, EXTENDED_TABLE):
                encoded_payloads.add(encoded_to_repl)
    else:
        # Just leave the the two we have
        encoded_payloads = unicodes_to_replace_set

    # uniq sorted by longest len
    encoded_payloads = list(encoded_payloads)
    encoded_payloads.sort(lambda x, y: cmp(len(y), len(x)))

    empty = u''
    replace = unicode.replace

    for to_replace in encoded_payloads:
        body = replace(body, to_replace, empty)

    return body
def get_clean_body_impl(response, strings_to_replace_list, multi_encode=True,
                        max_escape_count=None):
    """
    This is a low level function which allows me to use all the improvements
    I did in the helpers.get_clean_body() in fingerprint_404.get_clean_body().

    Both helpers.get_clean_body() and fingerprint_404.get_clean_body() receive
    different parameters, do some preparation work, and then call this
    function to really do the replacements.

    :param response: HTTP response object
    :param strings_to_replace_list: A list of strings to replace. These can be
                                    byte strings or unicode, we'll handle both
                                    internally.
    :param multi_encode: Apply the multiple encodings before replacing,
                         setting this to True with many strings to replace in
                         the list will consume considerable CPU time.
    :param max_escape_count: The max number of escapes to try to replace,
                             note that the default here is 500, which is a
                             little bit more than the max number of escapes
                             generated in the worse case I could imagine at
                             test_apply_multi_escape_table_count which
                             generated ~350.

                             The goal is to make sure that everything is
                             generated but at the same time control any edge
                             cases which I might have missed.
    :return: The body as a unicode with all strings to replace removed.
    """
    body = response.body
    # Pre-computed lowercase copy used by remove_using_lower_case() below
    body_lower = body.lower()
    body_len = len(body)

    unicodes_to_replace_set = set()

    for str_to_repl in strings_to_replace_list:
        # Since the body is already in unicode, when we call body.replace() all
        # arguments are converted to unicode by python. If there are special
        # chars in the mod_value then we end up with an UnicodeDecodeError, so
        # I convert it myself with some error handling
        #
        # https://github.com/andresriancho/w3af/issues/8953
        unicode_to_repl = smart_unicode(str_to_repl, errors=PERCENT_ENCODE)

        # unquote, just in case the plugin did an extra encoding of some type.
        # what we want to do here is get the original version of the string
        unicode_to_repl_unquoted = urllib.unquote_plus(unicode_to_repl)

        unicodes_to_replace_set.add(unicode_to_repl)
        unicodes_to_replace_set.add(unicode_to_repl_unquoted)

    # Now we apply multiple encodings to find in different responses
    encoded_payloads = set()

    if multi_encode:
        # Populate the set with multiple versions of the same set
        for unicode_to_repl in unicodes_to_replace_set:
            # If the unicode_to_repl (in its original version, without applying
            # the multi escape table) is larger than the response body; and
            # taking into account that `apply_multi_escape_table` will always
            # return a string which is equal or larger than the original; we
            # reduce the CPU-usage of this function by preventing the generation
            # of strings which will NEVER be replaced in:
            #
            #   body = replace(body, to_replace, empty)
            #
            # Because to_replace will be larger than body: ergo, it will never
            # be there.
            if len(unicode_to_repl) > body_len:
                continue

            # Note that we also do something similar with the max_len=body_len
            # parameter we send to apply_multi_escape_table
            for encoded_to_repl in apply_multi_escape_table(
                    unicode_to_repl,
                    max_len=body_len,
                    max_count=max_escape_count):
                encoded_payloads.add(encoded_to_repl)
    else:
        # Just leave the the two we have
        encoded_payloads = unicodes_to_replace_set

    # uniq sorted by longest len
    encoded_payloads = list(encoded_payloads)
    encoded_payloads.sort(lambda x, y: cmp(len(y), len(x)))

    for to_replace in encoded_payloads:
        body, body_lower = remove_using_lower_case(body,
                                                   body_lower,
                                                   to_replace)

    return body
def set_net_location(self, netloc):
    # Store the network location (host[:port]) as unicode
    self._netloc = smart_unicode(netloc)
def _charset_handling(self):
    """
    Decode the body based on the header (or metadata) encoding.
    The implemented algorithm follows the encoding detection logic
    used by FF:

        1) First try to find a charset using the following search criteria:
            a) Look in the CONTENT_TYPE HTTP header. Example:
                content-type: text/html; charset=iso-8859-1
            b) Look in the 'meta' HTML header. Example:
                <meta .* content="text/html; charset=utf-8" />
            c) Determine the charset using the chardet module (TODO)
            d) Use the DEFAULT_CHARSET

        2) Try to decode the body using the found charset. If it fails,
           then force it to use the DEFAULT_CHARSET

    Finally return the unicode (decoded) body and the used charset.

    Note: If the body is already a unicode string return it as it is.
    """
    charset = self._charset
    raw_body = self._raw_body
    headers = self.get_headers()
    content_type, _ = headers.iget(CONTENT_TYPE, None)

    # Only try to decode <str> strings. Skip <unicode> strings
    if type(raw_body) is unicode:
        _body = raw_body
        assert charset is not None, ("HTTPResponse objects containing "
                                     "unicode body must have an associated "
                                     "charset")
    elif content_type is None:
        # No content type: don't decode, just flag the server's omission
        _body = raw_body
        charset = DEFAULT_CHARSET

        if _body:
            msg = ('The remote web server failed to send the CONTENT_TYPE'
                   ' header in HTTP response with id %s')
            om.out.debug(msg % self.id)
    elif not self.is_text_or_html():
        # Not text, save as it is.
        _body = raw_body
        charset = charset or DEFAULT_CHARSET
    else:
        # Figure out charset to work with
        if not charset:
            charset = self.guess_charset(raw_body, headers)

        # Now that we have the charset, we use it!
        # The return value of the decode function is a unicode string.
        try:
            _body = smart_unicode(raw_body,
                                  charset,
                                  errors=ESCAPED_CHAR,
                                  on_error_guess=False)
        except LookupError:
            # Warn about a buggy charset
            msg = ('Charset LookupError: unknown charset: %s; '
                   'ignored and set to default: %s' %
                   (charset, DEFAULT_CHARSET))
            om.out.debug(msg)

            # Forcing it to use the default
            charset = DEFAULT_CHARSET
            _body = smart_unicode(raw_body,
                                  charset,
                                  errors=ESCAPED_CHAR,
                                  on_error_guess=False)

    return _body, charset
def set_param(self, param_string):
    """
    :param param_string: The param to set (e.g. "foo=aaa").
    :return: Returns the url containing param.
    """
    self._params = smart_unicode(param_string)
def _charset_handling(self):
    """
    Decode the body based on the header (or metadata) encoding.
    The implemented algorithm follows the encoding detection logic
    used by FF:

        1) First try to find a charset using the following search criteria:
            a) Look in the CONTENT_TYPE HTTP header. Example:
                content-type: text/html; charset=iso-8859-1
            b) Look in the 'meta' HTML header. Example:
                <meta .* content="text/html; charset=utf-8" />
            c) Determine the charset using the chardet module (TODO)
            d) Use the DEFAULT_CHARSET

        2) Try to decode the body using the found charset. If it fails,
           then force it to use the DEFAULT_CHARSET

    Finally return the unicode (decoded) body and the used charset.

    Note: If the body is already a unicode string return it as it is.
    """
    headers = self.get_headers()
    content_type, _ = headers.iget(CONTENT_TYPE, None)
    charset = self._charset
    rawbody = self._raw_body

    # Only try to decode <str> strings. Skip <unicode> strings
    if type(rawbody) is unicode:
        _body = rawbody
        assert charset is not None, ("HTTPResponse objects containing "
                                     "unicode body must have an associated "
                                     "charset")
    elif content_type is None:
        # No content type: don't decode, just flag the server's omission
        _body = rawbody
        charset = DEFAULT_CHARSET

        if len(_body):
            msg = ('The remote web server failed to send the CONTENT_TYPE'
                   ' header in HTTP response with id %s')
            om.out.debug(msg % self.id)
    elif not self.is_text_or_html():
        # Not text, save as it is.
        _body = rawbody
        charset = charset or DEFAULT_CHARSET
    else:
        # Figure out charset to work with
        if not charset:
            charset = self.guess_charset(rawbody, headers)

        # Now that we have the charset, we use it!
        # The return value of the decode function is a unicode string.
        try:
            _body = smart_unicode(rawbody,
                                  charset,
                                  errors=ESCAPED_CHAR,
                                  on_error_guess=False)
        except LookupError:
            # Warn about a buggy charset
            msg = ('Charset LookupError: unknown charset: %s; '
                   'ignored and set to default: %s' %
                   (charset, DEFAULT_CHARSET))
            om.out.debug(msg)

            # Forcing it to use the default
            charset = DEFAULT_CHARSET
            _body = smart_unicode(rawbody,
                                  charset,
                                  errors=ESCAPED_CHAR,
                                  on_error_guess=False)

    return _body, charset
def __contains__(self, s):
    """
    :return: True if "s" in url_string
    """
    # Normalize to unicode first so byte strings compare correctly
    s = smart_unicode(s)
    return s in self.url_string
def __init__(self, code, read, headers, geturl, original_url, msg='OK',
             _id=None, time=DEFAULT_WAIT_TIME, alias=None, charset=None,
             binary_response=False, set_body=False, debugging_id=None):
    """
    :param code: HTTP code
    :param read: HTTP body text; typically a string
    :param headers: HTTP headers, typically a dict or a httplib.HTTPMessage
    :param geturl: URL object instance
    :param original_url: URL object instance
    :param msg: HTTP message
    :param _id: Optional response identifier
    :param time: The time between the request and the response
    :param alias: Alias for the response, this contains a hash that helps
                  the backend sqlite find http_responses faster by
                  indexing by this attr.
    :param charset: Response's encoding; obligatory when `read` is unicode
    """
    if not isinstance(geturl, URL):
        msg = 'Invalid type %s for HTTPResponse ctor param geturl.'
        raise TypeError(msg % type(geturl))

    if not isinstance(original_url, URL):
        msg = 'Invalid type %s for HTTPResponse ctor param original_url.'
        raise TypeError(msg % type(original_url))

    if not isinstance(headers, Headers):
        msg = 'Invalid type %s for HTTPResponse ctor param headers.'
        raise TypeError(msg % type(headers))

    if not isinstance(read, basestring):
        raise TypeError(
            'Invalid type %s for HTTPResponse ctor param read.' % type(read))

    self._charset = charset
    self._headers = None

    if set_body and isinstance(read, unicode):
        # We use this case for deserialization via from_dict()
        #
        # The goal is to prevent the body to be analyzed for charset data
        # once again, since it was already done during to_dict() in the
        # get_body() call.
        self._body = self._raw_body = read
    else:
        self._body = None
        self._raw_body = read

    self._binary_response = binary_response
    self._content_type = None
    self._dom = None

    # A unique id identifier for the response
    self.id = _id

    # From cache defaults to False
    self._from_cache = False

    # Set the info
    self._info = headers

    # Set code
    self._code = None
    self.set_code(code)

    # Set the URL variables
    # The URL that we really GET'ed
    self._realurl = original_url.uri2url()
    self._uri = original_url

    # The URL where we were redirected to (equal to original_url
    # when no redirect)
    self._redirected_url = geturl.uri2url()
    self._redirected_uri = geturl

    # Set the rest
    self._msg = smart_unicode(msg)
    self._time = time
    self._alias = alias
    self._doc_type = None
    self._debugging_id = debugging_id

    # Internal lock
    self._body_lock = threading.RLock()
def set_fragment(self, fragment):
    # Store the fragment (the part after '#') as unicode
    self._fragment = smart_unicode(fragment)