class ResponseCacheKeyCache(object):
    #
    # The memory impact of having a large number of items in this cache is
    # really low, both the keys and the values are short strings (the result of
    # quick_hash)
    #
    MAX_SIZE = 2000

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.MAX_SIZE)

    def get_response_cache_key(self,
                               http_response,
                               clean_response=None,
                               headers=None):
        # When the clean response is available, use that body to calculate the
        # cache key. It has been cleaned (removed request paths and QS parameters)
        # so it has a higher chance of being equal to other responses / being
        # already in the cache
        if clean_response is not None:
            body = clean_response.body
        else:
            body = http_response.body

        cache_key = '%s%s' % (smart_str_ignore(body), headers)
        cache_key = quick_hash(cache_key)

        result = self._cache.get(cache_key, None)
        if result is not None:
            return result

        result = get_response_cache_key(http_response,
                                        clean_response=clean_response,
                                        headers=headers)
        self._cache[cache_key] = result

        return result

    def clear_cache(self):
        self._cache.clear()
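
# The classes in this module rely on SynchronizedLRUDict as a small,
# thread-safe, least-recently-used mapping. The class below is NOT the w3af
# implementation; it is a minimal sketch, assuming only the get / __setitem__ /
# clear subset used above, of how such a structure can be built from
# collections.OrderedDict plus a lock.
import threading
from collections import OrderedDict


class MinimalSynchronizedLRUDict(object):
    def __init__(self, capacity):
        self._capacity = capacity
        self._lock = threading.RLock()
        self._data = OrderedDict()

    def get(self, key, default=None):
        with self._lock:
            if key not in self._data:
                return default

            # Mark the key as recently used by moving it to the end
            value = self._data.pop(key)
            self._data[key] = value
            return value

    def __setitem__(self, key, value):
        with self._lock:
            if key in self._data:
                self._data.pop(key)
            elif len(self._data) >= self._capacity:
                # Evict the least recently used item (the oldest insertion)
                self._data.popitem(last=False)

            self._data[key] = value

    def clear(self):
        with self._lock:
            self._data.clear()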
class ParserCache(CacheStats): """ This class is a document parser cache. :author: Andres Riancho ([email protected]) """ CACHE_SIZE = 10 MAX_CACHEABLE_BODY_LEN = 1024 * 1024 DEBUG = core_profiling_is_enabled() def __init__(self): super(ParserCache, self).__init__() self._cache = SynchronizedLRUDict(self.CACHE_SIZE) self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10) self._parser_finished_events = {} self._parser_blacklist = DiskSet() def clear(self): """ Clear all the internal variables :return: None """ om.out.debug('Called clear() on ParserCache') # Stop any workers mp_doc_parser.stop_workers() # Make sure the parsers clear all resources for parser in self._cache.itervalues(): if hasattr(parser, 'clear'): parser.clear() # We don't need the parsers anymore self._cache.clear() self._can_parse_cache.clear() def should_cache(self, http_response): """ Defines if this http_response parser should be cached or not :param http_response: The http response instance :return: True if we should cache the parser for this response """ return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN def can_parse(self, http_response): """ Check if we can parse an HTTP response :param http_response: The HTTP response to verify :return: True if we can parse this HTTP response """ cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None) if cached_can_parse is not None: return cached_can_parse # # We need to verify if we can parse this HTTP response # try: can_parse = DocumentParser.can_parse(http_response) except: # We catch all the exceptions here and just return False because # the real parsing procedure will (most likely) fail to parse # this response too. can_parse = False self._can_parse_cache[can_parse] = can_parse return can_parse def add_to_blacklist(self, hash_string): """ Add a hash_string representing an HTTP response to the blacklist, indicating that we won't try to parse this response never again. :return: None """ self._parser_blacklist.add(hash_string) def get_document_parser_for(self, http_response, cache=True): """ Get a document parser for http_response using the cache if possible :param http_response: The http response instance :param cache: True if the document parser should be saved to the cache :return: An instance of DocumentParser """ # # Before doing anything too complex like caching, sending the HTTP # response to a different process for parsing, checking events, etc. # check if we can parse this HTTP response. # # This is a performance improvement that works *only if* the # DocumentParser.can_parse call is *fast*, which means that the # `can_parse` implementations of each parser needs to be fast # # It doesn't matter if we say "yes" here and then parsing exceptions # appear later, that should be a 1 / 10000 calls and we would still # be gaining a lot of performance # if not self.can_parse(http_response): msg = 'There is no parser for "%s".' raise BaseFrameworkException(msg % http_response.get_url()) hash_string = get_response_unique_id(http_response) if hash_string in self._parser_blacklist: msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.' raise BaseFrameworkException(msg % http_response.get_url()) # # We know that we can parse this document, lets work! 
# parser_finished = self._parser_finished_events.get(hash_string, None) if parser_finished is not None: # There is one subprocess already processing this http response # body, the best thing to do here is to make this thread wait # until that process has finished wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT) if not wait_result: # Act just like when there is no parser msg = 'There is no parser for "%s". Waited more than %s sec.' args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT) raise BaseFrameworkException(msg % args) # metric increase self.inc_query_count() parser = self._cache.get(hash_string, None) if parser is not None: self._handle_cache_hit(hash_string) return parser else: # Not in cache, have to work. self._handle_cache_miss(hash_string) # Create a new instance of DocumentParser, add it to the cache event = threading.Event() self._parser_finished_events[hash_string] = event try: parser = mp_doc_parser.get_document_parser_for(http_response) except TimeoutError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles trying # to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser msg = 'Reached timeout parsing "%s".' % http_response.get_url() raise BaseFrameworkException(msg) except MemoryError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles or # memory trying to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url() raise BaseFrameworkException(msg) except ScanMustStopException, e: msg = 'The document parser is in an invalid state! %s' raise ScanMustStopException(msg % e) except:
class GetAverageRTTForMutant(object): def __init__(self, url_opener): self._url_opener = url_opener # Cache to measure RTT self._rtt_mutant_cache = SynchronizedLRUDict(capacity=128) self._rtt_mutant_lock = threading.RLock() self._specific_rtt_mutant_locks = dict() def _get_cache_key(self, mutant): # # Get the cache key for this mutant # method = mutant.get_method() uri = mutant.get_uri() data = mutant.get_data() headers = mutant.get_all_headers() cache_key_parts = [method, uri, data, headers] cache_key_str = ''.join([smart_str_ignore(i) for i in cache_key_parts]) m = hashlib.md5() m.update(cache_key_str) return m.hexdigest() def get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None): """ Get the average time for the HTTP request represented as a mutant. This method caches responses. The cache entries are valid for 5 seconds, after that period of time the entry is removed from the cache, the average RTT is re-calculated and stored again. :param mutant: The mutant to send and measure RTT from :param count: Number of checks to perform :param debugging_id: Unique ID used for logging :return: A float representing the seconds it took to get the response """ assert count >= 3, 'Count must be greater or equal than 3.' cache_key = self._get_cache_key(mutant) # # Only perform one of these checks at the time, this is useful to prevent # different threads which need the same result from duplicating efforts # specific_rtt_mutant_lock = self._get_specific_rtt_mutant_lock(cache_key) with specific_rtt_mutant_lock: cached_value = self._rtt_mutant_cache.get(cache_key, default=None) if cached_value is not None: timestamp, value = cached_value if time.time() - timestamp <= 5: # # The cache entry is still valid, return the cached value # msg = 'Returning cached average RTT of %.2f seconds for mutant %s' om.out.debug(msg % (value, cache_key)) return value # # Need to send the HTTP requests and do the average # rtts = self._get_rtts(mutant, count, debugging_id) if self._has_outliers(rtts): # # The measurement has outliers, we can't continue! 
If we do
                # continue the average_rtt will be completely invalid and
                # potentially yield false positives
                #
                self._remove_cache_key_from_mutant_locks(cache_key)

                rtts_str = ', '.join(str(i) for i in rtts)
                msg = 'Found outliers while sampling average RTT: %s' % rtts_str
                raise OutlierException(msg)

            average_rtt = float(sum(rtts)) / len(rtts)
            self._rtt_mutant_cache[cache_key] = (time.time(), average_rtt)

            self._remove_cache_key_from_mutant_locks(cache_key)

            msg = 'Returning fresh average RTT of %.2f seconds for mutant %s'
            om.out.debug(msg % (average_rtt, cache_key))

            return average_rtt

    def _remove_cache_key_from_mutant_locks(self, cache_key):
        with self._rtt_mutant_lock:
            if cache_key in self._specific_rtt_mutant_locks:
                self._specific_rtt_mutant_locks.pop(cache_key)

    def _get_rtts(self, mutant, count=3, debugging_id=None):
        """
        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A list with the time (in seconds) it took to get each response
        """
        rtts = []

        for _ in xrange(count):
            resp = self._url_opener.send_mutant(mutant,
                                                cache=False,
                                                grep=False,
                                                debugging_id=debugging_id)
            rtt = resp.get_wait_time()
            rtts.append(rtt)

        return rtts

    def _has_outliers(self, rtts):
        """
        When we measure the RTT for a specific endpoint + parameter set we
        might get a big variation in the result, for example the RTTs might
        be:

            [0.2, 0.25, 1.8]

        Where 1.8 is an outlier that will break the detection of time-based
        SQL injection, OS commanding, etc. since the average for that RTT set
        is very influenced by the outlier.

        :param rtts: The list of RTT obtained by _get_rtts
        :return: True if the list of rtts has one or more outliers.
        """
        # Outlier analysis is currently disabled: always report that the
        # sample is clean. The code below is kept as a reference and is
        # intentionally unreachable.
        return False

        outlier_analysis = outliers_modified_z_score(rtts)
        return None in outlier_analysis

    def _get_specific_rtt_mutant_lock(self, cache_key):
        with self._rtt_mutant_lock:
            specific_rtt_mutant_lock = self._specific_rtt_mutant_locks.get(cache_key)

            if specific_rtt_mutant_lock is not None:
                return specific_rtt_mutant_lock

            specific_rtt_mutant_lock = threading.RLock()
            self._specific_rtt_mutant_locks[cache_key] = specific_rtt_mutant_lock

            return specific_rtt_mutant_lock
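
# _has_outliers() above references outliers_modified_z_score(). The helper
# below is only a sketch of that idea, not the w3af implementation: it computes
# the modified z-score (based on the median absolute deviation) for each sample
# and replaces values above the conventional 3.5 threshold with None, so that
# the caller can use `None in result` as the "has outliers" check.
def sketch_outliers_modified_z_score(values, threshold=3.5):
    values = [float(v) for v in values]

    def median(numbers):
        ordered = sorted(numbers)
        mid = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[mid]
        return (ordered[mid - 1] + ordered[mid]) / 2.0

    center = median(values)
    abs_deviations = [abs(v - center) for v in values]
    mad = median(abs_deviations)

    result = []
    for value, deviation in zip(values, abs_deviations):
        if mad == 0:
            # All samples are identical: nothing can be an outlier
            result.append(value)
            continue

        modified_z = 0.6745 * deviation / mad
        result.append(None if modified_z > threshold else value)

    return result


# Example: sketch_outliers_modified_z_score([0.2, 0.25, 1.8]) flags 1.8 and
# returns [0.2, 0.25, None], so `None in result` is True for that sample.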
class ParserCache(CacheStats): """ This class is a document parser cache. :author: Andres Riancho ([email protected]) """ CACHE_SIZE = 10 MAX_CACHEABLE_BODY_LEN = 1024 * 1024 DEBUG = core_profiling_is_enabled() def __init__(self): super(ParserCache, self).__init__() self._cache = SynchronizedLRUDict(self.CACHE_SIZE) self._parser_finished_events = {} def clear(self): """ Clear all the internal variables :return: None """ # Stop any workers mp_doc_parser.stop_workers() # Make sure the parsers clear all resources for parser in self._cache.itervalues(): parser.clear() # We don't need the parsers anymore self._cache.clear() def should_cache(self, http_response): """ Defines if this http_response parser should be cached or not :param http_response: The http response instance :return: True if we should cache the parser for this response """ return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN def get_document_parser_for(self, http_response, cache=True): """ Get a document parser for http_response using the cache if required :param http_response: The http response instance :return: An instance of DocumentParser """ hash_string = get_request_unique_id(http_response) parser_finished = self._parser_finished_events.get(hash_string, None) if parser_finished is not None: # There is one subprocess already processing this http response # body, the best thing to do here is to make this thread wait # until that process has finished try: parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT) except: # Act just like when there is no parser msg = 'There is no parser for "%s". Waited more than %s sec.' args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT) raise BaseFrameworkException(msg % args) # metric increase self.inc_query_count() parser = self._cache.get(hash_string, None) if parser is not None: self._handle_cache_hit(hash_string) return parser else: # Not in cache, have to work. self._handle_cache_miss(hash_string) # Create a new instance of DocumentParser, add it to the cache event = threading.Event() self._parser_finished_events[hash_string] = event try: parser = mp_doc_parser.get_document_parser_for(http_response) except: # Act just like when there is no parser msg = 'There is no parser for "%s".' % http_response.get_url() raise BaseFrameworkException(msg) else: save_to_cache = self.should_cache(http_response) and cache if save_to_cache: self._cache[hash_string] = parser else: self._handle_no_cache(hash_string) finally: event.set() self._parser_finished_events.pop(hash_string, None) return parser
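
# get_document_parser_for() above coordinates threads with a per-key
# threading.Event so that only one thread runs the expensive parsing while the
# others wait and then read the result from the cache. The class below isolates
# that pattern with hypothetical names (ComputeOncePerKey, compute_once); it is
# an illustration of the technique, not w3af code, and like the caches above it
# tolerates the benign race where two threads occasionally compute the same key.
import threading


class ComputeOncePerKey(object):
    def __init__(self, timeout=10):
        self._timeout = timeout
        self._cache = {}
        self._events = {}

    def compute_once(self, key, expensive_compute):
        if key in self._cache:
            return self._cache[key]

        event = self._events.get(key)
        if event is not None:
            # Another thread is already computing this key: wait for it and
            # then try the cache again
            event.wait(timeout=self._timeout)
            if key in self._cache:
                return self._cache[key]

        # Nobody is computing the key (or the other thread timed out / failed),
        # so compute it here and let other threads wait on the event
        event = threading.Event()
        self._events[key] = event

        try:
            value = expensive_compute(key)
            self._cache[key] = value
        finally:
            event.set()
            self._events.pop(key, None)

        return value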
class grep(BaseConsumer): """ Consumer thread that takes requests and responses from the queue and analyzes them using the user-enabled grep plugins. """ LOG_QUEUE_SIZES_EVERY = 25 REPORT_GREP_STATS_EVERY = 25 EXCLUDE_HEADERS_FOR_HASH = tuple([ 'date', 'expires', 'last-modified', 'etag', 'x-request-id', 'x-content-duration', 'x-execution-time', 'x-requestid', 'content-length', 'cf-ray', 'set-cookie' ]) def __init__(self, grep_plugins, w3af_core): """ :param grep_plugins: Instances of grep plugins in a list :param w3af_core: The w3af core that we'll use for status reporting """ # max_in_queue_size, is the number of items that will be stored in-memory # in the consumer queue # # Any items exceeding max_in_queue_size will be stored on-disk, which # is slow but will prevent any high memory usage imposed by this part # of the framework max_in_queue_size = 25 # thread_pool_size defines how many threads we'll use to run grep plugins thread_pool_size = 10 # max_pool_queued_tasks defines how many tasks we'll keep in memory waiting # for a worker from the pool to be available max_pool_queued_tasks = thread_pool_size * 3 super(grep, self).__init__(grep_plugins, w3af_core, create_pool=True, max_pool_queued_tasks=max_pool_queued_tasks, thread_pool_size=thread_pool_size, thread_name=self.get_name(), max_in_queue_size=max_in_queue_size) self._already_analyzed_body = ScalableBloomFilter() self._already_analyzed_url = ScalableBloomFilter() self._target_domains = None self._log_queue_sizes_calls = 0 self._consumer_plugin_dict = dict( (plugin.get_name(), plugin) for plugin in self._consumer_plugins) self._first_plugin_name = self._consumer_plugin_dict.keys()[0] self._request_response_lru = SynchronizedLRUDict(thread_pool_size * 3) self._request_response_processes = dict() self._response_cache_key_cache = ResponseCacheKeyCache() self._should_grep_stats = { 'accept': 0, 'reject-seen-body': 0, 'reject-seen-url': 0, 'reject-out-of-scope': 0, } def get_name(self): return 'Grep' def _teardown(self): """ Handle POISON_PILL """ msg = 'Starting Grep consumer _teardown() with %s plugins' om.out.debug(msg % len(self._consumer_plugins)) for plugin in self._consumer_plugins: om.out.debug('Calling %s.end()' % plugin.get_name()) start_time = time.time() try: plugin.end() except Exception as exception: msg = 'An exception was found while running %s.end(): "%s"' args = (plugin.get_name(), exception) om.out.debug(msg % args) status = FakeStatus(self._w3af_core) status.set_current_fuzzable_request('grep', 'n/a') status.set_running_plugin('grep', plugin.get_name(), log=True) exec_info = sys.exc_info() enabled_plugins = 'n/a' self._w3af_core.exception_handler.handle( status, exception, exec_info, enabled_plugins) continue spent_time = time.time() - start_time msg = 'Spent %.2f seconds running %s.end()' args = (spent_time, plugin.get_name()) om.out.debug(msg % args) self._consumer_plugins = dict() self._consumer_plugin_dict = dict() self._response_cache_key_cache.clear_cache() om.out.debug('Finished Grep consumer _teardown()') def _get_request_response_from_id_impl(self, http_response_id): """ Just reads the request and response from the files. No threading, events, caching, etc. 
:param http_response_id: The HTTP response ID :return: An HTTP request and response tuple """ history = HistoryItem() request, response = history.load_from_file(http_response_id) # Create a fuzzable request based on the urllib2 request object headers_inst = Headers(request.header_items()) request = FuzzableRequest.from_parts(request.url_object, request.get_method(), request.get_data() or '', headers_inst) return request, response def _get_request_response_from_id(self, http_response_id): """ This is a rather complex method that reads the HTTP request and response from disk and makes sure that: * Requests and responses are cached in a LRU to prevent reading the same data from disk twice in a short period of time * Thread events are used to prevent two threads from starting to read the same HTTP response ID at the same time, which would waste CPU cycles and disk IO. :param http_response_id: The HTTP response ID :return: A request / response tuple """ # # First check if the request and response was already deserialized # by another thread and stored in the LRU # request_response = self._request_response_lru.get( http_response_id, None) if request_response is not None: request, response = request_response return request, response # # Another thread might have started with the deserialization, check # and wait for that thread to finish # event = self._request_response_processes.get(http_response_id, None) if event is not None: # Wait for the other thread to finish reading the request and # response from disk. Timeout after 20 seconds as a safety measure wait_result = event.wait(timeout=20) if not wait_result: om.out.error('There was a timeout waiting for the' ' deserialization of HTTP request and response' ' with id %s' % http_response_id) return None, None # Read the data from the LRU. There is a 99,9999% chance it is there # since the other thread saved it before setting the event request_response = self._request_response_lru.get( http_response_id, None) if request_response is not None: request, response = request_response return request, response # There is a 0,0001% chance we get here when the items in the LRU # are removed right after being added, if this happens we just # continue with the algorithm and read the request / response # from the files # # There are no threads deserializing this HTTP response id, start # the process and create an event for others to know they need to # wait # event = threading.Event() self._request_response_processes[http_response_id] = event try: request, response = self._get_request_response_from_id_impl( http_response_id) self._request_response_lru[http_response_id] = (request, response) finally: event.set() self._request_response_processes.pop(http_response_id, None) return request, response def _consume(self, http_response_id): """ Handle a request/response that needs to be analyzed :param http_response_id: The HTTP response ID :return: None """ self._run_all_plugins(http_response_id) def _log_queue_sizes(self): """ The grep consumer will loop really fast through all tasks, if the queue sizes are written on every loop, we'll end up with a log file full of those lines (with ~10 lines per second with almost the same information). Call the parent's _log_queue_sizes once every 25 calls to this method. :return: None """ self._log_queue_sizes_calls += 1 if (self._log_queue_sizes_calls % self.LOG_QUEUE_SIZES_EVERY) != 0: return return super(grep, self)._log_queue_sizes() def _run_all_plugins(self, http_response_id): """ Run one plugin against a request/response. 
:param http_response_id: HTTP response ID :return: None, results are saved to KB """ for plugin_name in self._consumer_plugin_dict: # Note that if we don't limit the input queue size for the thread # pool we might end up with a lot of queued calls here! The calls # contain an HTTP response body, so they really use a lot of # memory! # # This is controlled by max_pool_queued_tasks args = (plugin_name, http_response_id) self._threadpool.apply_async(self._run_one_plugin, args) def _get_plugin_from_name(self, plugin_name): plugin = self._consumer_plugin_dict.get(plugin_name, None) if plugin is None: msg = ('Internal error in grep consumer: plugin with name %s' ' does not exist in dict.') args = (plugin_name, ) om.out.error(msg % args) return plugin def _run_one_plugin(self, plugin_name, http_response_id): """ :param plugin_name: Grep plugin name to run :param http_response_id: HTTP response ID :return: None """ plugin = self._get_plugin_from_name(plugin_name) if plugin is None: return request, response = self._get_request_response_from_id( http_response_id) if request is None: return self._run_observers(plugin_name, request, response) took_line = TookLine(self._w3af_core, plugin_name, 'grep', debugging_id=None, method_params={'uri': request.get_uri()}) try: plugin.grep_wrapper(request, response) except Exception, e: self.handle_exception('grep', plugin_name, request, e) else:
class ParserCache(CacheStats): """ This class is a document parser cache. :author: Andres Riancho ([email protected]) """ CACHE_SIZE = 10 MAX_CACHEABLE_BODY_LEN = 1024 * 1024 DEBUG = core_profiling_is_enabled() def __init__(self): super(ParserCache, self).__init__() self._cache = SynchronizedLRUDict(self.CACHE_SIZE) self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10) self._parser_finished_events = {} self._parser_blacklist = DiskSet() def clear(self): """ Clear all the internal variables :return: None """ # Stop any workers mp_doc_parser.stop_workers() # Make sure the parsers clear all resources for parser in self._cache.itervalues(): if hasattr(parser, 'clear'): parser.clear() # We don't need the parsers anymore self._cache.clear() self._can_parse_cache.clear() def should_cache(self, http_response): """ Defines if this http_response parser should be cached or not :param http_response: The http response instance :return: True if we should cache the parser for this response """ return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN def can_parse(self, http_response): """ Check if we can parse an HTTP response :param http_response: The HTTP response to verify :return: True if we can parse this HTTP response """ cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None) if cached_can_parse is not None: return cached_can_parse # # We need to verify if we can parse this HTTP response # try: can_parse = DocumentParser.can_parse(http_response) except: # We catch all the exceptions here and just return False because # the real parsing procedure will (most likely) fail to parse # this response too. can_parse = False self._can_parse_cache[can_parse] = can_parse return can_parse def add_to_blacklist(self, hash_string): """ Add a hash_string representing an HTTP response to the blacklist, indicating that we won't try to parse this response never again. :return: None """ self._parser_blacklist.add(hash_string) def get_document_parser_for(self, http_response, cache=True): """ Get a document parser for http_response using the cache if possible :param http_response: The http response instance :param cache: True if the document parser should be saved to the cache :return: An instance of DocumentParser """ # # Before doing anything too complex like caching, sending the HTTP # response to a different process for parsing, checking events, etc. # check if we can parse this HTTP response. # # This is a performance improvement that works *only if* the # DocumentParser.can_parse call is *fast*, which means that the # `can_parse` implementations of each parser needs to be fast # # It doesn't matter if we say "yes" here and then parsing exceptions # appear later, that should be a 1 / 10000 calls and we would still # be gaining a lot of performance # if not self.can_parse(http_response): msg = 'There is no parser for "%s".' raise BaseFrameworkException(msg % http_response.get_url()) hash_string = get_response_unique_id(http_response) if hash_string in self._parser_blacklist: msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.' raise BaseFrameworkException(msg % http_response.get_url()) # # We know that we can parse this document, lets work! 
# parser_finished = self._parser_finished_events.get(hash_string, None) if parser_finished is not None: # There is one subprocess already processing this http response # body, the best thing to do here is to make this thread wait # until that process has finished try: parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT) except: # Act just like when there is no parser msg = 'There is no parser for "%s". Waited more than %s sec.' args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT) raise BaseFrameworkException(msg % args) # metric increase self.inc_query_count() parser = self._cache.get(hash_string, None) if parser is not None: self._handle_cache_hit(hash_string) return parser else: # Not in cache, have to work. self._handle_cache_miss(hash_string) # Create a new instance of DocumentParser, add it to the cache event = threading.Event() self._parser_finished_events[hash_string] = event try: parser = mp_doc_parser.get_document_parser_for(http_response) except TimeoutError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles trying # to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser msg = 'Reached timeout parsing "%s".' % http_response.get_url() raise BaseFrameworkException(msg) except MemoryError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles or # memory trying to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url() raise BaseFrameworkException(msg) except ScanMustStopException, e: msg = 'The document parser is in an invalid state! %s' raise ScanMustStopException(msg % e) except:
class GetAverageRTTForMutant(object): def __init__(self, url_opener): self._url_opener = url_opener # Cache to measure RTT self._rtt_mutant_cache = SynchronizedLRUDict(capacity=128) self._rtt_mutant_lock = threading.RLock() self._specific_rtt_mutant_locks = dict() def _get_cache_key(self, mutant): # # Get the cache key for this mutant # method = mutant.get_method() uri = mutant.get_uri() data = mutant.get_data() headers = mutant.get_all_headers() cache_key_parts = [method, uri, data, headers] cache_key_str = ''.join([smart_str_ignore(i) for i in cache_key_parts]) m = hashlib.md5() m.update(cache_key_str) return m.hexdigest() def get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None): """ Get the average time for the HTTP request represented as a mutant. This method caches responses. The cache entries are valid for 5 seconds, after that period of time the entry is removed from the cache, the average RTT is re-calculated and stored again. :param mutant: The mutant to send and measure RTT from :param count: Number of checks to perform :param debugging_id: Unique ID used for logging :return: A float representing the seconds it took to get the response """ assert count >= 3, 'Count must be greater or equal than 3.' cache_key = self._get_cache_key(mutant) # # Only perform one of these checks at the time, this is useful to prevent # different threads which need the same result from duplicating efforts # specific_rtt_mutant_lock = self._get_specific_rtt_mutant_lock( cache_key) with specific_rtt_mutant_lock: cached_value = self._rtt_mutant_cache.get(cache_key, default=None) if cached_value is not None: timestamp, value = cached_value if time.time() - timestamp <= 5: # # The cache entry is still valid, return the cached value # msg = 'Returning cached average RTT of %.2f seconds for mutant %s' om.out.debug(msg % (value, cache_key)) return value # # Need to send the HTTP requests and do the average # rtts = self._get_rtts(mutant, count, debugging_id) if self._has_outliers(rtts): # # The measurement has outliers, we can't continue! 
If we do
                # continue the average_rtt will be completely invalid and
                # potentially yield false positives
                #
                self._remove_cache_key_from_mutant_locks(cache_key)

                rtts_str = ', '.join(str(i) for i in rtts)
                msg = 'Found outliers while sampling average RTT: %s' % rtts_str
                raise OutlierException(msg)

            average_rtt = float(sum(rtts)) / len(rtts)
            self._rtt_mutant_cache[cache_key] = (time.time(), average_rtt)

            self._remove_cache_key_from_mutant_locks(cache_key)

            msg = 'Returning fresh average RTT of %.2f seconds for mutant %s'
            om.out.debug(msg % (average_rtt, cache_key))

            return average_rtt

    def _remove_cache_key_from_mutant_locks(self, cache_key):
        with self._rtt_mutant_lock:
            if cache_key in self._specific_rtt_mutant_locks:
                self._specific_rtt_mutant_locks.pop(cache_key)

    def _get_rtts(self, mutant, count=3, debugging_id=None):
        """
        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A list with the time (in seconds) it took to get each response
        """
        rtts = []

        for _ in xrange(count):
            resp = self._url_opener.send_mutant(mutant,
                                                cache=False,
                                                grep=False,
                                                debugging_id=debugging_id)
            rtt = resp.get_wait_time()
            rtts.append(rtt)

        return rtts

    def _has_outliers(self, rtts):
        """
        When we measure the RTT for a specific endpoint + parameter set we
        might get a big variation in the result, for example the RTTs might
        be:

            [0.2, 0.25, 1.8]

        Where 1.8 is an outlier that will break the detection of time-based
        SQL injection, OS commanding, etc. since the average for that RTT set
        is very influenced by the outlier.

        :param rtts: The list of RTT obtained by _get_rtts
        :return: True if the list of rtts has one or more outliers.
        """
        # Outlier analysis is currently disabled: always report that the
        # sample is clean. The code below is kept as a reference and is
        # intentionally unreachable.
        return False

        outlier_analysis = outliers_modified_z_score(rtts)
        return None in outlier_analysis

    def _get_specific_rtt_mutant_lock(self, cache_key):
        with self._rtt_mutant_lock:
            specific_rtt_mutant_lock = self._specific_rtt_mutant_locks.get(cache_key)

            if specific_rtt_mutant_lock is not None:
                return specific_rtt_mutant_lock

            specific_rtt_mutant_lock = threading.RLock()
            self._specific_rtt_mutant_locks[cache_key] = specific_rtt_mutant_lock

            return specific_rtt_mutant_lock
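
# _get_cache_key() above builds a stable, fixed-length cache key by
# concatenating the byte representation of the mutant parts and hashing them
# with MD5. The helper below shows the same idea with plain placeholder values;
# build_mutant_cache_key and its parameters are illustrative names, not a real
# mutant object or w3af API.
import hashlib


def build_mutant_cache_key(method, uri, data, headers):
    cache_key_str = ''.join(str(part) for part in [method, uri, data, headers])
    return hashlib.md5(cache_key_str.encode('utf-8')).hexdigest()


# Example:
#     build_mutant_cache_key('GET', 'http://target.example/?id=1', '', 'Accept: */*')
#     -> a short hex string usable as a dictionary / LRU key for that request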
class GetAverageRTTForMutant(object): TIMEOUT = 120 def __init__(self, url_opener): self._url_opener = url_opener # Cache to measure RTT self._rtt_mutant_cache = SynchronizedLRUDict(capacity=128) self._rtt_processing_events = dict() def _get_cache_key(self, mutant): # # Get the cache key for this mutant # method = mutant.get_method() uri = mutant.get_uri() data = mutant.get_data() headers = mutant.get_all_headers() cache_key_parts = [method, uri, data, headers] cache_key_str = ''.join([smart_str_ignore(i) for i in cache_key_parts]) m = hashlib.md5() m.update(cache_key_str) return m.hexdigest() def get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None): """ Get the average time for the HTTP request represented as a mutant. This method caches responses. The cache entries are valid for 5 seconds, after that period of time the entry is removed from the cache, the average RTT is re-calculated and stored again. :param mutant: The mutant to send and measure RTT from :param count: Number of checks to perform :param debugging_id: Unique ID used for logging :return: A float representing the seconds it took to get the response """ assert count >= 3, 'Count must be greater or equal than 3.' # # First we try to get the data from the cache # cache_key = self._get_cache_key(mutant) cached_rtt = self._get_cached_rtt(cache_key, debugging_id=debugging_id) if cached_rtt is not None: return cached_rtt # # Only perform one of these checks at the time, this is useful to prevent # different threads which need the same result from duplicating efforts # rtt_processing_event = self._rtt_processing_events.get(cache_key, None) if rtt_processing_event is not None: # There is another thread sending HTTP requests to get the average RTT # we need to wait for that thread to finish wait_result = rtt_processing_event.wait(timeout=self.TIMEOUT) if not wait_result: # The TIMEOUT has been reached, the thread that was trying to get # the RTT for us found a serious issue, is dead-locked, etc. # # We're going to have to try to get the RTT ourselves by sending # the HTTP requests. Just `pass` here and get to the code below # that sends the HTTP requests msg = ('get_average_rtt_for_mutant() timed out waiting for' ' results from another thread. Will send HTTP requests' ' and collect the data from the network (did:%s)') args = (debugging_id, ) om.out.debug(msg % args) else: # The event was set! The other thread finished and we can read # the result from the cache. # # Just in case the other thread had issues getting the RTTs, we # need to check if the cache actually has the data, and if the # data is valid # # No need to check the timestamp because we know it will be # valid, it has been just set by the other thread cached_rtt = self._get_cached_rtt(cache_key, debugging_id=debugging_id) if cached_rtt is not None: return cached_rtt msg = ( 'get_average_rtt_for_mutant() found no cache entry after' ' the other thread finished. 
Will send HTTP requests'
                       ' and collect the data from the network (did:%s)')
                args = (debugging_id,)
                om.out.debug(msg % args)

        #
        # There is no other thread getting data for `cache_key`, we'll have to
        # extract the information by sending the HTTP requests
        #
        event = threading.Event()
        self._rtt_processing_events[cache_key] = event

        try:
            average_rtt = self._get_average_rtt_for_mutant(mutant,
                                                           count=count,
                                                           debugging_id=debugging_id)
            self._rtt_mutant_cache[cache_key] = (time.time(), average_rtt)
        finally:
            event.set()
            self._rtt_processing_events.pop(cache_key, None)

        msg = 'Returning fresh average RTT of %.2f seconds for mutant %s (did:%s)'
        args = (average_rtt, cache_key, debugging_id)
        om.out.debug(msg % args)

        return average_rtt

    def _get_cached_rtt(self, cache_key, debugging_id):
        cached_value = self._rtt_mutant_cache.get(cache_key, default=None)

        if cached_value is None:
            return None

        timestamp, value = cached_value
        if time.time() - timestamp > 5:
            return None

        # The cache entry is still valid, return the cached value
        msg = 'Returning cached average RTT of %.2f seconds for mutant %s (did:%s)'
        args = (value, cache_key, debugging_id)
        om.out.debug(msg % args)

        return value

    def _get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None):
        #
        # Need to send the HTTP requests and do the average
        #
        rtts = self._get_all_rtts(mutant, count, debugging_id)

        if self._has_outliers(rtts):
            #
            # The measurement has outliers, we can't continue! If we do
            # continue the average_rtt will be completely invalid and
            # potentially yield false positives
            #
            rtts_str = ', '.join(str(i) for i in rtts)
            msg = 'Found outliers while sampling average RTT: %s' % rtts_str
            raise OutlierException(msg)

        average_rtt = float(sum(rtts)) / len(rtts)
        return average_rtt

    def _get_all_rtts(self, mutant, count=3, debugging_id=None):
        """
        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A list with the time (in seconds) it took to get each response
        """
        rtts = []

        for _ in xrange(count):
            resp = self._url_opener.send_mutant(mutant,
                                                cache=False,
                                                grep=False,
                                                debugging_id=debugging_id)
            rtt = resp.get_wait_time()
            rtts.append(rtt)

        return rtts

    def _has_outliers(self, rtts):
        """
        When we measure the RTT for a specific endpoint + parameter set we
        might get a big variation in the result, for example the RTTs might
        be:

            [0.2, 0.25, 1.8]

        Where 1.8 is an outlier that will break the detection of time-based
        SQL injection, OS commanding, etc. since the average for that RTT set
        is very influenced by the outlier.

        :param rtts: The list of RTT obtained by _get_all_rtts
        :return: True if the list of rtts has one or more outliers.
        """
        #
        # TODO: perform outlier analysis
        #
        # https://github.com/andresriancho/w3af/commit/9494b49acab10833f629fae58dcc104b37f9720f
        #
        return False
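
# The RTT cache above stores (timestamp, value) tuples and treats entries older
# than 5 seconds as expired. The class below shows that time-to-live check in
# isolation; TTLCache and MAX_AGE_SECONDS are illustrative names, not w3af code.
import time


class TTLCache(object):
    MAX_AGE_SECONDS = 5

    def __init__(self):
        self._data = {}

    def put(self, key, value):
        self._data[key] = (time.time(), value)

    def get_if_fresh(self, key):
        cached = self._data.get(key)
        if cached is None:
            return None

        timestamp, value = cached
        if time.time() - timestamp > self.MAX_AGE_SECONDS:
            # Entry expired: the caller should recompute and call put() again
            return None

        return value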
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response ever again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser need to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should happen in roughly 1 / 10000 calls and we
        # would still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, let's work!
# parser_finished = self._parser_finished_events.get(hash_string, None) if parser_finished is not None: # There is one subprocess already processing this http response # body, the best thing to do here is to make this thread wait # until that process has finished wait_result = parser_finished.wait( timeout=mp_doc_parser.PARSER_TIMEOUT) if not wait_result: # Act just like when there is no parser msg = 'There is no parser for "%s". Waited more than %s sec.' args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT) raise BaseFrameworkException(msg % args) # metric increase self.inc_query_count() parser = self._cache.get(hash_string, None) if parser is not None: self._handle_cache_hit(hash_string) return parser else: # Not in cache, have to work. self._handle_cache_miss(hash_string) # Create a new instance of DocumentParser, add it to the cache event = threading.Event() self._parser_finished_events[hash_string] = event try: parser = mp_doc_parser.get_document_parser_for(http_response) except TimeoutError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles trying # to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser msg = 'Reached timeout parsing "%s".' % http_response.get_url() raise BaseFrameworkException(msg) except MemoryError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles or # memory trying to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url( ) raise BaseFrameworkException(msg) except ScanMustStopException as e: msg = 'The document parser is in an invalid state! %s' raise ScanMustStopException(msg % e) except: # Act just like when there is no parser msg = 'There is no parser for "%s".' % http_response.get_url() raise BaseFrameworkException(msg) else: save_to_cache = self.should_cache(http_response) and cache if save_to_cache: self._cache[hash_string] = parser else: self._handle_no_cache(hash_string) finally: event.set() self._parser_finished_events.pop(hash_string, None) return parser def _log_return_empty(self, http_response, detail): msg = 'Returning empty list in get_tags_by_filter("%s"). ' msg += detail om.out.debug(msg % http_response.get_uri()) def get_tags_by_filter(self, http_response, tags, yield_text=False, cache=True): """ Get specific tags from http_response using the cache if possible :param http_response: The http response instance :param tags: List of tags to get, or None if all tags should be returned :param yield_text: Include the tag text (<a>text</a>) :param cache: True if the document parser should be saved to the cache :return: An instance of DocumentParser """ # # This is a performance hack that should reduce the time consumed by # this method without impacting its results. Note that in HTML this is # valid: # # <script # # And this is invalid: # # < script # # We use that in order to speed-up this function # if tags is not None: body_lower = http_response.get_body().lower() for tag in tags: lt_tag = '<%s' % tag if lt_tag in body_lower: break else: # No tag was found in the HTML return [] # # Before doing anything too complex like caching, sending the HTTP # response to a different process for parsing, checking events, etc. # check if we can parse this HTTP response. 
# # This is a performance improvement that works *only if* the # DocumentParser.can_parse call is *fast*, which means that the # `can_parse` implementations of each parser needs to be fast # # It doesn't matter if we say "yes" here and then parsing exceptions # appear later, that should be a 1 / 10000 calls and we would still # be gaining a lot of performance # if not self.can_parse(http_response): self._log_return_empty(http_response, 'No parser available') return [] args = '%r%r' % (tags, yield_text) hash_string = get_body_unique_id(http_response, prepend=args) if hash_string in self._parser_blacklist: self._log_return_empty(http_response, 'HTTP response is blacklisted') return [] # # We know that we can parse this document, lets work! # parser_finished = self._parser_finished_events.get(hash_string, None) if parser_finished is not None: # There is one subprocess already processing this http response # body, the best thing to do here is to make this thread wait # until that process has finished wait_result = parser_finished.wait( timeout=mp_doc_parser.PARSER_TIMEOUT) if not wait_result: # Act just like when there is no parser self._log_return_empty(http_response, 'Timeout waiting for response') return [] # metric increase self.inc_query_count() parser = self._cache.get(hash_string, None) if parser is not None: self._handle_cache_hit(hash_string) return parser else: # Not in cache, have to work. self._handle_cache_miss(hash_string) # Create a new instance of DocumentParser, add it to the cache event = threading.Event() self._parser_finished_events[hash_string] = event try: tags = mp_doc_parser.get_tags_by_filter(http_response, tags, yield_text=yield_text) except TimeoutError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles trying # to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser self._log_return_empty( http_response, 'Timeout waiting for get_tags_by_filter()') return [] except MemoryError: # We failed to get a parser for this HTTP response, we better # ban this HTTP response so we don't waste more CPU cycles or # memory trying to parse it over and over. self.add_to_blacklist(hash_string) # Act just like when there is no parser self._log_return_empty(http_response, 'Reached memory usage limit') return [] except ScanMustStopException as e: msg = 'The document parser is in an invalid state! %s' raise ScanMustStopException(msg % e) except Exception as e: # Act just like when there is no parser msg = 'Unhandled exception running get_tags_by_filter("%s"): %s' args = (http_response.get_url(), e) raise BaseFrameworkException(msg % args) else: if cache: self._cache[hash_string] = tags else: self._handle_no_cache(hash_string) finally: event.set() self._parser_finished_events.pop(hash_string, None) return tags
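
# get_tags_by_filter() above avoids expensive parsing when none of the
# requested tags can possibly appear in the body, exploiting the fact that
# "<script" must appear literally for a <script> tag to exist, while "< script"
# is not valid HTML. A minimal standalone version of that pre-filter, assuming
# a plain string body instead of an http_response object:
def might_contain_tags(body, tags):
    if tags is None:
        # No tag filter: we cannot rule anything out, a full parse is required
        return True

    body_lower = body.lower()

    for tag in tags:
        lt_tag = '<%s' % tag
        if lt_tag in body_lower:
            return True

    return False


# Example: only run the (expensive) parser when the quick check passes
#     if might_contain_tags(http_response.get_body(), ['a', 'script']):
#         ...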
class ParserCache(CacheStats): """ This class is a document parser cache. :author: Andres Riancho ([email protected]) """ CACHE_SIZE = 10 MAX_CACHEABLE_BODY_LEN = 1024 * 1024 DEBUG = core_profiling_is_enabled() def __init__(self): super(ParserCache, self).__init__() self._cache = SynchronizedLRUDict(self.CACHE_SIZE) self._parser_finished_events = {} def clear(self): """ Clear all the internal variables :return: None """ # Stop any workers mp_doc_parser.stop_workers() # Make sure the parsers clear all resources for parser in self._cache.itervalues(): parser.clear() # We don't need the parsers anymore self._cache.clear() def should_cache(self, http_response): """ Defines if this http_response parser should be cached or not :param http_response: The http response instance :return: True if we should cache the parser for this response """ return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN def parser_warpper(func): @functools.wraps(func) def inner(self, *args, **kwargs): if not hasattr(self, 'disk_cache'): self.disk_cache = {'key_set': set(), 'disk_cache': DiskDict('rsp_parser')} return func(self, *args, **kwargs) return inner @parser_warpper def get_document_parser_for(self, http_response, cache=True): """ Get a document parser for http_response using the cache if required :param http_response: The http response instance :return: An instance of DocumentParser """ if http_response.is_image(): # Act just like when there is no parser msg = 'There is no parser for image("%s")' % (http_response.get_url()) raise BaseFrameworkException(msg) hash_string = get_request_unique_id(http_response) parser_finished = self._parser_finished_events.get(hash_string, None) if parser_finished is not None: # There is one subprocess already processing this http response # body, the best thing to do here is to make this thread wait # until that process has finished try: parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT) except: # Act just like when there is no parser msg = 'There is no parser for "%s". Waited more than %s sec.' args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT) raise BaseFrameworkException(msg % args) # metric increase self.inc_query_count() parser = self._cache.get(hash_string, None) if parser: self._handle_cache_hit(hash_string) # om.out.debug('[parser cache][memory] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string)) else: # om.out.debug('[parser cache][memory] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string)) # Create a new instance of DocumentParser, add it to the cache event = threading.Event() self._parser_finished_events[hash_string] = event # Not in cache, have to work. 
self._handle_cache_miss(hash_string) try: if hash_string in self.disk_cache['key_set']: parser = self.disk_cache['disk_cache'][hash_string] # om.out.debug('[parser cache][disk] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string)) else: # om.out.debug('[parser cache][disk] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string)) try: parser = mp_doc_parser.get_document_parser_for(http_response) except Exception as e: # Act just like when there is no parser msg = 'There is no parser for "%s".e=%s' % (http_response.get_url(), e) raise BaseFrameworkException(msg) else: self.disk_cache['disk_cache'][hash_string] = parser self.disk_cache['key_set'].add(hash_string) save_to_cache = self.should_cache(http_response) and cache if save_to_cache: self._cache[hash_string] = parser else: self._handle_no_cache(hash_string) finally: self._parser_finished_events.pop(hash_string, None) event.set() return parser
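
# The variant above layers the in-memory LRU in front of an on-disk DiskDict so
# that parsers evicted from memory can still be recovered without re-parsing
# the response. The class below sketches that two-tier lookup with generic
# names (TwoTierCache, memory_cache, disk_store); it is an illustration of the
# design, not the w3af implementation.
class TwoTierCache(object):
    def __init__(self, memory_cache, disk_store):
        # memory_cache: small, fast, lossy (e.g. an LRU dict)
        # disk_store: large, slower, persistent (e.g. a DiskDict)
        self._memory = memory_cache
        self._disk = disk_store

    def get_or_compute(self, key, compute):
        value = self._memory.get(key)
        if value is not None:
            return value

        if key in self._disk:
            value = self._disk[key]
        else:
            # Not cached anywhere: compute once and persist to the slow tier
            value = compute()
            self._disk[key] = value

        # Always refresh the fast tier so the next lookup is cheap
        self._memory[key] = value
        return value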
class BasicKnowledgeBase(object): """ This is a base class from which all implementations of KnowledgeBase will inherit. It has the basic utility methods that will be used. :author: Andres Riancho ([email protected]) """ UPDATE = 'update' APPEND = 'append' ADD_URL = 'add_url' def __init__(self): self._kb_lock = threading.RLock() self.FILTERS = {'URL': self.filter_url, 'VAR': self.filter_var} self._reached_max_info_instances_cache = SynchronizedLRUDict(512) def append_uniq(self, location_a, location_b, info_inst, filter_by='VAR'): """ Append to a location in the KB if and only if there it no other vulnerability in the same location for the same URL and parameter. Does this in a thread-safe manner. :param location_a: The A location where to store data :param location_b: The B location where to store data :param info_inst: An Info instance (or subclasses like Vuln and InfoSet) :param filter_by: One of 'VAR' of 'URL'. Only append to the kb in (location_a, location_b) if there is NO OTHER info in that location with the same: - 'VAR': URL,Variable,DataContainer.keys() - 'URL': URL :return: True if the vuln was added. False if there was already a vulnerability in the KB location with the same URL and parameter. """ if not isinstance(info_inst, Info): raise ValueError( 'append_uniq requires an info object as parameter.') filter_function = self.FILTERS.get(filter_by, None) if filter_function is None: raise ValueError( 'append_uniq only knows about URL or VAR filters.') with self._kb_lock: if filter_function(location_a, location_b, info_inst): self.append(location_a, location_b, info_inst) return True return False def filter_url(self, location_a, location_b, info_inst): """ :return: True if there is no other info in (location_a, location_b) with the same URL as the info_inst. """ for saved_vuln in self.get_iter(location_a, location_b): if saved_vuln.get_url() == info_inst.get_url(): return False return True def filter_var(self, location_a, location_b, info_inst): """ :return: True if there is no other info in (location_a, location_b) with the same URL, variable as the info_inst. Before I checked the data container parameter names the problem with that approach was that in some rare cases the scanner reported vulnerabilities in: http://target.com/?id={here}&tracking1=23 http://target.com/?id={here}&tracking1=23&tracking2=42 Where tracking1 and tracking2 were parameters added for tracking the user navigation through the site. Then I realized that this is the same vulnerability since the same piece of code is the one generating them. Thus, no need to report them twice. """ for saved_vuln in self.get_iter(location_a, location_b): if saved_vuln.get_token_name() != info_inst.get_token_name(): continue if saved_vuln.get_url() != info_inst.get_url(): continue msg = ('[filter_var] Preventing "%s" from being written to the' ' KB because "%s" has the same token (%s) and URL (%s).') args = (info_inst.get_desc(), saved_vuln.get_desc(), info_inst.get_token_name(), info_inst.get_url()) om.out.debug(msg % args) return False return True def _has_reached_max_info_instances(self, location_a, location_b, info_inst, group_klass): """ Checks if the tuple containing - location_a, - location_b, - info.get(self.ITAG) Is in the max info instances reached cache. 
Works together with _record_reached_max_info_instances() :param location_a: The "a" address :param location_b: The "b" address :param info_inst: The Info instance we want to store :param group_klass: If required, will be used to create a new InfoSet :return: True if the data is in the cache """ key = self._get_max_info_instances_key(location_a, location_b, info_inst, group_klass) return self._reached_max_info_instances_cache.get(key) def _get_max_info_instances_key(self, location_a, location_b, info_inst, group_klass): return (location_a, location_b, repr(info_inst.get(group_klass.ITAG))) def _record_reached_max_info_instances(self, location_a, location_b, info_inst, group_klass): """ Stores the tuple containing - location_a, - location_b, - info.get(self.ITAG) To the max info instances reached cache. Works together with _has_reached_max_info_instances() :param location_a: The "a" address :param location_b: The "b" address :param info_inst: The Info instance we want to store :param group_klass: If required, will be used to create a new InfoSet :return: None """ key = self._get_max_info_instances_key(location_a, location_b, info_inst, group_klass) self._reached_max_info_instances_cache[key] = True def append_uniq_group(self, location_a, location_b, info_inst, group_klass=InfoSet): """ This function will append a Info instance to an existing InfoSet which is stored in (location_a, location_b) and matches the filter_func. If filter_func doesn't match any existing InfoSet instances, then a new one is created using `group_klass` and `info_inst` is appended to it. :see: https://github.com/andresriancho/w3af/issues/3955 :param location_a: The "a" address :param location_b: The "b" address :param info_inst: The Info instance we want to store :param group_klass: If required, will be used to create a new InfoSet :return: (The updated/created InfoSet, as stored in the kb, True if a new InfoSet was created) """ if not isinstance(info_inst, Info): raise TypeError('append_uniq_group requires an Info instance' ' as parameter.') if not issubclass(group_klass, InfoSet): raise TypeError('append_uniq_group requires an InfoSet subclass' ' as parameter.') location_a = self._get_real_name(location_a) with self._kb_lock: # This performs a quick check against a LRU cache to prevent # queries to the DB if self._has_reached_max_info_instances(location_a, location_b, info_inst, group_klass): return info_inst, False for info_set in self.get_iter(location_a, location_b): if not isinstance(info_set, InfoSet): continue if info_set.match(info_inst): # InfoSet will only store a MAX_INFO_INSTANCES inside, after # that any calls to add() will not modify InfoSet.infos if info_set.has_reached_max_info_instances(): # Record that this location and infoset have reached the max # instances. 
This works together with _has_reached_max_info_instances() # to reduce SQLite queries self._record_reached_max_info_instances( location_a, location_b, info_inst, group_klass) # The info set instance was not modified, so we just return return info_set, False # Since MAX_INFO_INSTANCES has not been reached, we need to # copy the info set, add the info instance, and update the DB old_info_set = copy.deepcopy(info_set) # Add the new information to the InfoSet instance, if we reach # this point, and because we checked against has_reached_max_info_instances, # we are sure that `added` will be True and the info instance # will be added to the InfoSet added = info_set.add(info_inst) # Only change the ID of the InfoSet instance if a new Info # has been added if added: info_set.generate_new_id() # Save to the DB self.update(old_info_set, info_set) return info_set, False else: # No pre-existing InfoSet instance matched, let's create one # for the info_inst info_set = group_klass([info_inst]) self.append(location_a, location_b, info_set) return info_set, True def get_all_vulns(self): """ :return: A list of all info instances with severity in (LOW, MEDIUM, HIGH) """ raise NotImplementedError def get_all_infos(self): """ :return: A list of all info instances with severity eq INFORMATION """ raise NotImplementedError def get_all_entries_of_class_iter(self, klass, exclude_ids=()): """ :yield: All objects where class in klass that are saved in the kb. :param exclude_ids: The vulnerability IDs to exclude from the result """ raise NotImplementedError def get_all_findings(self, exclude_ids=()): """ :return: A list of all findings, including Info, Vuln and InfoSet. :param exclude_ids: The vulnerability IDs to exclude from the result """ return self.get_all_entries_of_class((Info, InfoSet, Vuln), exclude_ids=exclude_ids) def get_all_findings_iter(self, exclude_ids=()): """ An iterated version of get_all_findings. All new code should use get_all_findings_iter instead of get_all_findings(). :yield: All findings stored in the KB. :param exclude_ids: The vulnerability IDs to exclude from the result """ klass = (Info, InfoSet, Vuln) for finding in self.get_all_entries_of_class_iter(klass, exclude_ids): yield finding def get_all_uniq_ids_iter(self): """ :yield: All uniq IDs from the KB """ raise NotImplementedError def get_all_shells(self, w3af_core=None): """ :param w3af_core: The w3af_core used in the current scan @see: Shell.__reduce__ to understand why we need the w3af_core :return: A list of all vulns reported by all plugins. """ all_shells = [] for shell in self.get_all_entries_of_class(Shell): if w3af_core is not None: shell.set_url_opener(w3af_core.uri_opener) shell.set_worker_pool(w3af_core.worker_pool) all_shells.append(shell) return all_shells def _get_real_name(self, data): """ Some operations allow location_a to be both a plugin instance or a string. Those operations will call this method to translate the plugin instance into a string. """ if isinstance(data, basestring): return data else: return data.get_name() def append(self, location_a, location_b, value): """ This method appends the location_b value to a dict. """ raise NotImplementedError def get(self, plugin_name, location_b, check_types=True): """ :param plugin_name: The plugin that saved the data to the kb.info Typically the name of the plugin, but could also be the plugin instance. :param location_b: The name of the variables under which the vuln objects were saved. 
Typically the same name of the plugin, or something like "vulns", "errors", etc. In most cases this is NOT None. When set to None, a dict with all the vuln objects found by the plugin_name is returned. :return: Returns the data that was saved by another plugin. """ raise NotImplementedError def get_iter(self, plugin_name, location_b, check_types=True): """ Same as get() but yields items one by one instead of returning a list with all the items. """ raise NotImplementedError def get_all_entries_of_class(self, klass, exclude_ids=()): """ :return: A list of all objects of class == klass that are saved in the kb. :param exclude_ids: The vulnerability IDs to exclude from the result """ raise NotImplementedError def update(self, old_vuln, update_vuln): """ :return: The updated vulnerability/info instance stored in the kb. """ raise NotImplementedError def clear(self, location_a, location_b): """ Clear any values stored in (location_a, location_b) """ raise NotImplementedError def raw_write(self, location_a, location_b, value): """ This method saves the value to (location_a,location_b) """ raise NotImplementedError def raw_read(self, location_a, location_b): """ This method reads the value from (location_a,location_b) """ raise NotImplementedError def dump(self): raise NotImplementedError def cleanup(self): """ Cleanup all internal data. """ raise NotImplementedError
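
# append_uniq() above only stores a finding when no existing finding in the
# same KB location shares the chosen identity: the URL (filter_by='URL') or the
# URL plus the vulnerable parameter name (filter_by='VAR'). The function below
# is a simplified standalone sketch of that dedup, where findings are plain
# dicts with 'url' and 'token' keys (an illustrative structure, not the real
# Info class).
def append_uniq_simplified(kb_location, new_finding, filter_by='VAR'):
    for existing in kb_location:
        if existing['url'] != new_finding['url']:
            continue

        if filter_by == 'URL':
            # Same URL is enough to consider it a duplicate
            return False

        if existing['token'] == new_finding['token']:
            # Same URL and same vulnerable parameter: duplicate
            return False

    kb_location.append(new_finding)
    return True


# Example:
#     findings = []
#     append_uniq_simplified(findings, {'url': '/login', 'token': 'user'})  # True
#     append_uniq_simplified(findings, {'url': '/login', 'token': 'user'})  # False
#     append_uniq_simplified(findings, {'url': '/login', 'token': 'pass'})  # True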