Example #1
class ResponseCacheKeyCache(object):
    #
    # The memory impact of having a large number of items in this cache is
    # really low: both the keys and the values are short strings (the result
    # of quick_hash)
    #
    MAX_SIZE = 2000

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.MAX_SIZE)

    def get_response_cache_key(self,
                               http_response,
                               clean_response=None,
                               headers=None):

        # When the clean response is available, use that body to calculate the
        # cache key. It has been cleaned (removed request paths and QS parameters)
        # so it has a higher chance of being equal to other responses / being
        # already in the cache
        if clean_response is not None:
            body = clean_response.body
        else:
            body = http_response.body

        cache_key = '%s%s' % (smart_str_ignore(body), headers)
        cache_key = quick_hash(cache_key)

        result = self._cache.get(cache_key, None)

        if result is not None:
            return result

        result = get_response_cache_key(http_response,
                                        clean_response=clean_response,
                                        headers=headers)

        self._cache[cache_key] = result
        return result

    def clear_cache(self):
        self._cache.clear()
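
The class above memoizes an expensive cache-key computation behind a thread-safe LRU keyed by a quick hash of the body and headers. A minimal standalone sketch of the same pattern, using plain-Python stand-ins for SynchronizedLRUDict, quick_hash and get_response_cache_key (all names below are illustrative, not w3af's API):

import hashlib
import threading
from collections import OrderedDict


class TinySynchronizedLRU(object):
    """Small thread-safe LRU mapping, standing in for SynchronizedLRUDict."""

    def __init__(self, max_size):
        self._max_size = max_size
        self._data = OrderedDict()
        self._lock = threading.RLock()

    def get(self, key, default=None):
        with self._lock:
            if key not in self._data:
                return default
            value = self._data.pop(key)
            self._data[key] = value             # mark as most-recently-used
            return value

    def __setitem__(self, key, value):
        with self._lock:
            self._data.pop(key, None)
            if len(self._data) >= self._max_size:
                self._data.popitem(last=False)  # evict least-recently-used
            self._data[key] = value


def quick_hash_sketch(text):
    # Stand-in for quick_hash(): any short, stable digest will do
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def expensive_cache_key(body, headers):
    # Placeholder for the real get_response_cache_key() computation
    return quick_hash_sketch('%s|%s' % (body, headers))


_memo = TinySynchronizedLRU(max_size=2000)


def get_response_cache_key_cached(body, headers):
    lookup_key = quick_hash_sketch('%s%s' % (body, headers))

    result = _memo.get(lookup_key)
    if result is not None:
        return result

    result = expensive_cache_key(body, headers)
    _memo[lookup_key] = result
    return result
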
Example #2
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response ever again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser
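
The TimeoutError and MemoryError branches above implement a "blacklist after failure" policy: once a response defeats the parser, its hash goes into a persistent set and future attempts fail fast. A standalone sketch of that policy, with a plain set standing in for DiskSet and an illustrative parse_slowly()/ParseTimeout pair replacing mp_doc_parser:

import hashlib


class ParseTimeout(Exception):
    pass


_blacklist = set()          # stand-in for the DiskSet blacklist


def parse_slowly(body):
    # Illustrative stand-in for mp_doc_parser.get_document_parser_for()
    if 'huge-binary-blob' in body:
        raise ParseTimeout('parser timed out')
    return body.split()


def parse_with_blacklist(body):
    hash_string = hashlib.md5(body.encode('utf-8')).hexdigest()

    if hash_string in _blacklist:
        # We already failed on this body once, do not waste CPU again
        raise ParseTimeout('blacklisted in a previous attempt')

    try:
        return parse_slowly(body)
    except ParseTimeout:
        _blacklist.add(hash_string)
        raise
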
Example #3
class GetAverageRTTForMutant(object):
    def __init__(self, url_opener):
        self._url_opener = url_opener

        # Cache to measure RTT
        self._rtt_mutant_cache = SynchronizedLRUDict(capacity=128)
        self._rtt_mutant_lock = threading.RLock()
        self._specific_rtt_mutant_locks = dict()

    def _get_cache_key(self, mutant):
        #
        # Get the cache key for this mutant
        #
        method = mutant.get_method()
        uri = mutant.get_uri()
        data = mutant.get_data()
        headers = mutant.get_all_headers()

        cache_key_parts = [method, uri, data, headers]
        cache_key_str = ''.join([smart_str_ignore(i) for i in cache_key_parts])

        m = hashlib.md5()
        m.update(cache_key_str)
        return m.hexdigest()

    def get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None):
        """
        Get the average time for the HTTP request represented as a mutant.

        This method caches the computed average RTT. Each cache entry is valid
        for 5 seconds; after that period the entry expires and the average RTT
        is re-calculated and stored again.

        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A float representing the seconds it took to get the response
        """
        assert count >= 3, 'Count must be greater than or equal to 3.'

        cache_key = self._get_cache_key(mutant)

        #
        # Only perform one of these checks at a time; this prevents different
        # threads which need the same result from duplicating efforts
        #
        specific_rtt_mutant_lock = self._get_specific_rtt_mutant_lock(cache_key)

        with specific_rtt_mutant_lock:
            cached_value = self._rtt_mutant_cache.get(cache_key, default=None)

            if cached_value is not None:
                timestamp, value = cached_value
                if time.time() - timestamp <= 5:
                    #
                    # The cache entry is still valid, return the cached value
                    #
                    msg = 'Returning cached average RTT of %.2f seconds for mutant %s'
                    om.out.debug(msg % (value, cache_key))
                    return value

            #
            # Need to send the HTTP requests and do the average
            #
            rtts = self._get_rtts(mutant, count, debugging_id)

            if self._has_outliers(rtts):
                #
                # The measurement has outliers, we can't continue! If we do
                # continue the average_rtt will be completely invalid and
                # potentially yield false positives
                #
                self._remove_cache_key_from_mutant_locks(cache_key)
                rtts_str = ', '.join(str(i) for i in rtts)
                msg = 'Found outliers while sampling average RTT: %s' % rtts_str
                raise OutlierException(msg)

            average_rtt = float(sum(rtts)) / len(rtts)
            self._rtt_mutant_cache[cache_key] = (time.time(), average_rtt)

        self._remove_cache_key_from_mutant_locks(cache_key)

        msg = 'Returning fresh average RTT of %.2f seconds for mutant %s'
        om.out.debug(msg % (average_rtt, cache_key))

        return average_rtt

    def _remove_cache_key_from_mutant_locks(self, cache_key):
        with self._rtt_mutant_lock:
            if cache_key in self._specific_rtt_mutant_locks:
                self._specific_rtt_mutant_locks.pop(cache_key)

    def _get_rtts(self, mutant, count=3, debugging_id=None):
        """
        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A list of floats, one per check, representing the seconds it
                 took to get each response
        """
        rtts = []

        for _ in xrange(count):
            resp = self._url_opener.send_mutant(mutant,
                                                cache=False,
                                                grep=False,
                                                debugging_id=debugging_id)
            rtt = resp.get_wait_time()
            rtts.append(rtt)

        return rtts

    def _has_outliers(self, rtts):
        """
        When we measure the RTT for a specific endpoint + parameter set we
        might get a big variation in the result, for example the RTTs might
        be:

            [0.2, 0.25, 1.8]

        Where 1.8 is an outlier that will break the detection of time-based
        SQL injection, OS commanding, etc. since the average for that RTT set
        is heavily influenced by the outlier.

        :param rtts: The list of RTTs obtained by _get_rtts
        :return: True if the list of rtts has one or more outliers.
        """
        #
        # TODO: perform outlier analysis, for example with:
        #
        #   outlier_analysis = outliers_modified_z_score(rtts)
        #   return None in outlier_analysis
        #
        # The check is disabled for now, so no RTT set is flagged
        #
        return False

    def _get_specific_rtt_mutant_lock(self, cache_key):
        with self._rtt_mutant_lock:
            specific_rtt_mutant_lock = self._specific_rtt_mutant_locks.get(cache_key)

            if specific_rtt_mutant_lock is not None:
                return specific_rtt_mutant_lock

            specific_rtt_mutant_lock = threading.RLock()
            self._specific_rtt_mutant_locks[cache_key] = specific_rtt_mutant_lock
            return specific_rtt_mutant_lock
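
The disabled check above refers to outliers_modified_z_score(). A standalone sketch of that style of test (the Iglewicz-Hoaglin modified z-score with the usual 3.5 threshold); this illustrates the technique, it is not w3af's helper:

def modified_z_scores(values, threshold=3.5):
    """
    score_i = 0.6745 * (x_i - median) / MAD, flagging |score_i| > threshold.
    Returns a list of booleans, True where the value looks like an outlier.
    """
    if not values:
        return []

    sorted_values = sorted(values)
    mid = len(sorted_values) // 2
    if len(sorted_values) % 2:
        median = sorted_values[mid]
    else:
        median = (sorted_values[mid - 1] + sorted_values[mid]) / 2.0

    abs_dev = sorted(abs(x - median) for x in values)
    if len(abs_dev) % 2:
        mad = abs_dev[mid]
    else:
        mad = (abs_dev[mid - 1] + abs_dev[mid]) / 2.0

    if mad == 0:
        # All values are (almost) equal: nothing can be flagged as an outlier
        return [False for _ in values]

    return [abs(0.6745 * (x - median) / mad) > threshold for x in values]


def has_outliers(rtts):
    return any(modified_z_scores(rtts))


# The docstring example: 1.8 is flagged, 0.2 and 0.25 are not
print(has_outliers([0.2, 0.25, 1.8]))   # True
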
Example #4
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser
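
Every ParserCache version above uses the same "one worker, many waiters" trick: the first thread to see a hash registers a threading.Event, does the expensive work and sets the event; other threads wait on it and then read the cached result. A condensed standalone sketch of that pattern (names are illustrative):

import threading

_results = {}                 # finished work, keyed by task id
_in_progress = {}             # task id -> threading.Event
_registry_lock = threading.Lock()


def compute(task_id):
    # Illustrative placeholder for the expensive parsing work
    return 'parsed-%s' % task_id


def get_or_compute(task_id, timeout=10):
    # Fast path: somebody already finished this task
    if task_id in _results:
        return _results[task_id]

    with _registry_lock:
        event = _in_progress.get(task_id)
        if event is None:
            # We are the first thread: register an event so others can wait
            event = threading.Event()
            _in_progress[task_id] = event
            owner = True
        else:
            owner = False

    if not owner:
        # Another thread is already computing: wait for it, then read its result
        if not event.wait(timeout=timeout):
            raise RuntimeError('Timed out waiting for %s' % task_id)
        if task_id not in _results:
            raise RuntimeError('The other thread failed to compute %s' % task_id)
        return _results[task_id]

    try:
        result = compute(task_id)
        _results[task_id] = result
        return result
    finally:
        event.set()
        with _registry_lock:
            _in_progress.pop(task_id, None)
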
Example #5
class grep(BaseConsumer):
    """
    Consumer thread that takes requests and responses from the queue and
    analyzes them using the user-enabled grep plugins.
    """

    LOG_QUEUE_SIZES_EVERY = 25
    REPORT_GREP_STATS_EVERY = 25

    EXCLUDE_HEADERS_FOR_HASH = tuple([
        'date', 'expires', 'last-modified', 'etag', 'x-request-id',
        'x-content-duration', 'x-execution-time', 'x-requestid',
        'content-length', 'cf-ray', 'set-cookie'
    ])

    def __init__(self, grep_plugins, w3af_core):
        """
        :param grep_plugins: Instances of grep plugins in a list
        :param w3af_core: The w3af core that we'll use for status reporting
        """
        # max_in_queue_size, is the number of items that will be stored in-memory
        # in the consumer queue
        #
        # Any items exceeding max_in_queue_size will be stored on-disk, which
        # is slow but will prevent any high memory usage imposed by this part
        # of the framework
        max_in_queue_size = 25

        # thread_pool_size defines how many threads we'll use to run grep plugins
        thread_pool_size = 10

        # max_pool_queued_tasks defines how many tasks we'll keep in memory waiting
        # for a worker from the pool to be available
        max_pool_queued_tasks = thread_pool_size * 3

        super(grep, self).__init__(grep_plugins,
                                   w3af_core,
                                   create_pool=True,
                                   max_pool_queued_tasks=max_pool_queued_tasks,
                                   thread_pool_size=thread_pool_size,
                                   thread_name=self.get_name(),
                                   max_in_queue_size=max_in_queue_size)

        self._already_analyzed_body = ScalableBloomFilter()
        self._already_analyzed_url = ScalableBloomFilter()

        self._target_domains = None
        self._log_queue_sizes_calls = 0

        self._consumer_plugin_dict = dict(
            (plugin.get_name(), plugin) for plugin in self._consumer_plugins)
        self._first_plugin_name = self._consumer_plugin_dict.keys()[0]

        self._request_response_lru = SynchronizedLRUDict(thread_pool_size * 3)
        self._request_response_processes = dict()
        self._response_cache_key_cache = ResponseCacheKeyCache()

        self._should_grep_stats = {
            'accept': 0,
            'reject-seen-body': 0,
            'reject-seen-url': 0,
            'reject-out-of-scope': 0,
        }

    def get_name(self):
        return 'Grep'

    def _teardown(self):
        """
        Handle POISON_PILL
        """
        msg = 'Starting Grep consumer _teardown() with %s plugins'
        om.out.debug(msg % len(self._consumer_plugins))

        for plugin in self._consumer_plugins:
            om.out.debug('Calling %s.end()' % plugin.get_name())
            start_time = time.time()

            try:
                plugin.end()

            except Exception as exception:

                msg = 'An exception was found while running %s.end(): "%s"'
                args = (plugin.get_name(), exception)
                om.out.debug(msg % args)

                status = FakeStatus(self._w3af_core)
                status.set_current_fuzzable_request('grep', 'n/a')
                status.set_running_plugin('grep', plugin.get_name(), log=True)

                exec_info = sys.exc_info()
                enabled_plugins = 'n/a'
                self._w3af_core.exception_handler.handle(
                    status, exception, exec_info, enabled_plugins)
                continue

            spent_time = time.time() - start_time
            msg = 'Spent %.2f seconds running %s.end()'
            args = (spent_time, plugin.get_name())
            om.out.debug(msg % args)

        self._consumer_plugins = dict()
        self._consumer_plugin_dict = dict()
        self._response_cache_key_cache.clear_cache()

        om.out.debug('Finished Grep consumer _teardown()')

    def _get_request_response_from_id_impl(self, http_response_id):
        """
        Just reads the request and response from the files. No threading,
        events, caching, etc.

        :param http_response_id: The HTTP response ID
        :return: An HTTP request and response tuple
        """
        history = HistoryItem()
        request, response = history.load_from_file(http_response_id)

        # Create a fuzzable request based on the urllib2 request object
        headers_inst = Headers(request.header_items())
        request = FuzzableRequest.from_parts(request.url_object,
                                             request.get_method(),
                                             request.get_data() or '',
                                             headers_inst)

        return request, response

    def _get_request_response_from_id(self, http_response_id):
        """
        This is a rather complex method that reads the HTTP request and response
        from disk and makes sure that:

            * Requests and responses are cached in a LRU to prevent reading
              the same data from disk twice in a short period of time

            * Thread events are used to prevent two threads from starting
              to read the same HTTP response ID at the same time, which
              would waste CPU cycles and disk IO.

        :param http_response_id: The HTTP response ID
        :return: A request / response tuple
        """
        #
        # First check if the request and response was already deserialized
        # by another thread and stored in the LRU
        #
        request_response = self._request_response_lru.get(
            http_response_id, None)
        if request_response is not None:
            request, response = request_response
            return request, response

        #
        # Another thread might have started with the deserialization, check
        # and wait for that thread to finish
        #
        event = self._request_response_processes.get(http_response_id, None)
        if event is not None:
            # Wait for the other thread to finish reading the request and
            # response from disk. Timeout after 20 seconds as a safety measure
            wait_result = event.wait(timeout=20)
            if not wait_result:
                om.out.error('There was a timeout waiting for the'
                             ' deserialization of HTTP request and response'
                             ' with id %s' % http_response_id)
                return None, None

            # Read the data from the LRU. There is a 99.9999% chance it is there
            # since the other thread saved it before setting the event
            request_response = self._request_response_lru.get(
                http_response_id, None)
            if request_response is not None:
                request, response = request_response
                return request, response

            # There is a 0.0001% chance we get here when the items in the LRU
            # are removed right after being added. If this happens we just
            # continue with the algorithm and read the request / response
            # from the files

        #
        # There are no threads deserializing this HTTP response id, start
        # the process and create an event for others to know they need to
        # wait
        #
        event = threading.Event()
        self._request_response_processes[http_response_id] = event

        try:
            request, response = self._get_request_response_from_id_impl(
                http_response_id)
            self._request_response_lru[http_response_id] = (request, response)
        finally:
            event.set()
            self._request_response_processes.pop(http_response_id, None)

        return request, response

    def _consume(self, http_response_id):
        """
        Handle a request/response that needs to be analyzed
        :param http_response_id: The HTTP response ID
        :return: None
        """
        self._run_all_plugins(http_response_id)

    def _log_queue_sizes(self):
        """
        The grep consumer loops through tasks really fast; if the queue sizes
        were written on every loop we would end up with a log file full of
        those lines (~10 per second, almost all with the same information).

        Call the parent's _log_queue_sizes once every 25 calls to this method.

        :return: None
        """
        self._log_queue_sizes_calls += 1

        if (self._log_queue_sizes_calls % self.LOG_QUEUE_SIZES_EVERY) != 0:
            return

        return super(grep, self)._log_queue_sizes()

    def _run_all_plugins(self, http_response_id):
        """
        Run all enabled grep plugins against a request/response.

        :param http_response_id: HTTP response ID
        :return: None, results are saved to KB
        """
        for plugin_name in self._consumer_plugin_dict:
            # Note that if we don't limit the input queue size for the thread
            # pool we might end up with a lot of queued calls here! The calls
            # contain an HTTP response body, so they really use a lot of
            # memory!
            #
            # This is controlled by max_pool_queued_tasks
            args = (plugin_name, http_response_id)
            self._threadpool.apply_async(self._run_one_plugin, args)

    def _get_plugin_from_name(self, plugin_name):
        plugin = self._consumer_plugin_dict.get(plugin_name, None)

        if plugin is None:
            msg = ('Internal error in grep consumer: plugin with name %s'
                   ' does not exist in dict.')
            args = (plugin_name, )
            om.out.error(msg % args)

        return plugin

    def _run_one_plugin(self, plugin_name, http_response_id):
        """
        :param plugin_name: Grep plugin name to run
        :param http_response_id: HTTP response ID
        :return: None
        """
        plugin = self._get_plugin_from_name(plugin_name)
        if plugin is None:
            return

        request, response = self._get_request_response_from_id(
            http_response_id)
        if request is None:
            return

        self._run_observers(plugin_name, request, response)

        took_line = TookLine(self._w3af_core,
                             plugin_name,
                             'grep',
                             debugging_id=None,
                             method_params={'uri': request.get_uri()})

        try:
            plugin.grep_wrapper(request, response)
        except Exception as e:
            self.handle_exception('grep', plugin_name, request, e)
        else:
            took_line.send()
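
The grep consumer keeps two bloom filters and a stats dict, but the snippet is cut before the method that uses them. A plausible filter built on those fields could look like the sketch below; plain sets stand in for ScalableBloomFilter, and should_grep_sketch is an illustrative name, not the real method:

_seen_bodies = set()
_seen_urls = set()
_target_domains = {'example.com'}
_stats = {
    'accept': 0,
    'reject-seen-body': 0,
    'reject-seen-url': 0,
    'reject-out-of-scope': 0,
}


def should_grep_sketch(domain, url, body_hash):
    # Reject anything outside the scan scope
    if domain not in _target_domains:
        _stats['reject-out-of-scope'] += 1
        return False

    # Reject URLs and bodies that were already analyzed
    if url in _seen_urls:
        _stats['reject-seen-url'] += 1
        return False

    if body_hash in _seen_bodies:
        _stats['reject-seen-body'] += 1
        return False

    _seen_urls.add(url)
    _seen_bodies.add(body_hash)
    _stats['accept'] += 1
    return True
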
Example #6
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response ever again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser
Example #7
class GetAverageRTTForMutant(object):
    def __init__(self, url_opener):
        self._url_opener = url_opener

        # Cache to measure RTT
        self._rtt_mutant_cache = SynchronizedLRUDict(capacity=128)
        self._rtt_mutant_lock = threading.RLock()
        self._specific_rtt_mutant_locks = dict()

    def _get_cache_key(self, mutant):
        #
        # Get the cache key for this mutant
        #
        method = mutant.get_method()
        uri = mutant.get_uri()
        data = mutant.get_data()
        headers = mutant.get_all_headers()

        cache_key_parts = [method, uri, data, headers]
        cache_key_str = ''.join([smart_str_ignore(i) for i in cache_key_parts])

        m = hashlib.md5()
        m.update(cache_key_str)
        return m.hexdigest()

    def get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None):
        """
        Get the average time for the HTTP request represented as a mutant.

        This method caches the computed average RTT. Each cache entry is valid
        for 5 seconds; after that period the entry expires and the average RTT
        is re-calculated and stored again.

        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A float representing the seconds it took to get the response
        """
        assert count >= 3, 'Count must be greater than or equal to 3.'

        cache_key = self._get_cache_key(mutant)

        #
        # Only perform one of these checks at a time; this prevents different
        # threads which need the same result from duplicating efforts
        #
        specific_rtt_mutant_lock = self._get_specific_rtt_mutant_lock(
            cache_key)

        with specific_rtt_mutant_lock:
            cached_value = self._rtt_mutant_cache.get(cache_key, default=None)

            if cached_value is not None:
                timestamp, value = cached_value
                if time.time() - timestamp <= 5:
                    #
                    # The cache entry is still valid, return the cached value
                    #
                    msg = 'Returning cached average RTT of %.2f seconds for mutant %s'
                    om.out.debug(msg % (value, cache_key))
                    return value

            #
            # Need to send the HTTP requests and do the average
            #
            rtts = self._get_rtts(mutant, count, debugging_id)

            if self._has_outliers(rtts):
                #
                # The measurement has outliers, we can't continue! If we do
                # continue the average_rtt will be completely invalid and
                # potentially yield false positives
                #
                self._remove_cache_key_from_mutant_locks(cache_key)
                rtts_str = ', '.join(str(i) for i in rtts)
                msg = 'Found outliers while sampling average RTT: %s' % rtts_str
                raise OutlierException(msg)

            average_rtt = float(sum(rtts)) / len(rtts)
            self._rtt_mutant_cache[cache_key] = (time.time(), average_rtt)

        self._remove_cache_key_from_mutant_locks(cache_key)

        msg = 'Returning fresh average RTT of %.2f seconds for mutant %s'
        om.out.debug(msg % (average_rtt, cache_key))

        return average_rtt

    def _remove_cache_key_from_mutant_locks(self, cache_key):
        with self._rtt_mutant_lock:
            if cache_key in self._specific_rtt_mutant_locks:
                self._specific_rtt_mutant_locks.pop(cache_key)

    def _get_rtts(self, mutant, count=3, debugging_id=None):
        """
        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A list of floats, one per check, representing the seconds it
                 took to get each response
        """
        rtts = []

        for _ in xrange(count):
            resp = self._url_opener.send_mutant(mutant,
                                                cache=False,
                                                grep=False,
                                                debugging_id=debugging_id)
            rtt = resp.get_wait_time()
            rtts.append(rtt)

        return rtts

    def _has_outliers(self, rtts):
        """
        When we measure the RTT for a specific endpoint + parameter set we
        might get a big variation in the result, for example the RTTs might
        be:

            [0.2, 0.25, 1.8]

        Where 1.8 is an outlier that will break the detection of time-based
        SQL injection, OS commanding, etc. since the average for that RTT set
        is heavily influenced by the outlier.

        :param rtts: The list of RTTs obtained by _get_rtts
        :return: True if the list of rtts has one or more outliers.
        """
        #
        # TODO: perform outlier analysis, for example with:
        #
        #   outlier_analysis = outliers_modified_z_score(rtts)
        #   return None in outlier_analysis
        #
        # The check is disabled for now, so no RTT set is flagged
        #
        return False

    def _get_specific_rtt_mutant_lock(self, cache_key):
        with self._rtt_mutant_lock:
            specific_rtt_mutant_lock = self._specific_rtt_mutant_locks.get(
                cache_key)

            if specific_rtt_mutant_lock is not None:
                return specific_rtt_mutant_lock

            specific_rtt_mutant_lock = threading.RLock()
            self._specific_rtt_mutant_locks[
                cache_key] = specific_rtt_mutant_lock
            return specific_rtt_mutant_lock
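
The two GetAverageRTTForMutant versions above serialize work per cache key with a lazily-created RLock registry: a global lock protects only the creation of the per-key lock, and the per-key lock serializes the expensive measurement for that key alone. A condensed standalone sketch of that pattern (names are illustrative):

import threading

_registry_lock = threading.RLock()
_per_key_locks = {}


def get_lock_for_key(key):
    # Same double-checked pattern as _get_specific_rtt_mutant_lock() above
    with _registry_lock:
        lock = _per_key_locks.get(key)
        if lock is None:
            lock = threading.RLock()
            _per_key_locks[key] = lock
        return lock


def release_key(key):
    # Mirror of _remove_cache_key_from_mutant_locks(): drop the per-key lock
    # once the result has been cached so the registry does not grow forever
    with _registry_lock:
        _per_key_locks.pop(key, None)
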
Example #8
class GetAverageRTTForMutant(object):

    TIMEOUT = 120

    def __init__(self, url_opener):
        self._url_opener = url_opener

        # Cache to measure RTT
        self._rtt_mutant_cache = SynchronizedLRUDict(capacity=128)
        self._rtt_processing_events = dict()

    def _get_cache_key(self, mutant):
        #
        # Get the cache key for this mutant
        #
        method = mutant.get_method()
        uri = mutant.get_uri()
        data = mutant.get_data()
        headers = mutant.get_all_headers()

        cache_key_parts = [method, uri, data, headers]
        cache_key_str = ''.join([smart_str_ignore(i) for i in cache_key_parts])

        m = hashlib.md5()
        m.update(cache_key_str)
        return m.hexdigest()

    def get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None):
        """
        Get the average time for the HTTP request represented as a mutant.

        This method caches the computed average RTT. Each cache entry is valid
        for 5 seconds; after that period the entry expires and the average RTT
        is re-calculated and stored again.

        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A float representing the seconds it took to get the response
        """
        assert count >= 3, 'Count must be greater than or equal to 3.'

        #
        # First we try to get the data from the cache
        #
        cache_key = self._get_cache_key(mutant)
        cached_rtt = self._get_cached_rtt(cache_key, debugging_id=debugging_id)

        if cached_rtt is not None:
            return cached_rtt

        #
        # Only perform one of these checks at a time; this prevents different
        # threads which need the same result from duplicating efforts
        #
        rtt_processing_event = self._rtt_processing_events.get(cache_key, None)

        if rtt_processing_event is not None:
            # There is another thread sending HTTP requests to get the average RTT
            # we need to wait for that thread to finish
            wait_result = rtt_processing_event.wait(timeout=self.TIMEOUT)

            if not wait_result:
                # The TIMEOUT has been reached, the thread that was trying to get
                # the RTT for us found a serious issue, is dead-locked, etc.
                #
                # We're going to have to try to get the RTT ourselves by sending
                # the HTTP requests. Just `pass` here and get to the code below
                # that sends the HTTP requests
                msg = ('get_average_rtt_for_mutant() timed out waiting for'
                       ' results from another thread. Will send HTTP requests'
                       ' and collect the data from the network (did:%s)')
                args = (debugging_id, )
                om.out.debug(msg % args)
            else:
                # The event was set! The other thread finished and we can read
                # the result from the cache.
                #
                # Just in case the other thread had issues getting the RTTs, we
                # need to check if the cache actually has the data, and if the
                # data is valid
                #
                # No need to check the timestamp because we know it will be
                # valid, it has been just set by the other thread
                cached_rtt = self._get_cached_rtt(cache_key,
                                                  debugging_id=debugging_id)

                if cached_rtt is not None:
                    return cached_rtt

                msg = (
                    'get_average_rtt_for_mutant() found no cache entry after'
                    ' the other thread finished. Will send HTTP requests'
                    ' and collect the data from the network (did:%s)')
                args = (debugging_id, )
                om.out.debug(msg % args)

        #
        # There is no other thread getting data for `cache_key`, we'll have to
        # extract the information by sending the HTTP requests
        #
        event = threading.Event()
        self._rtt_processing_events[cache_key] = event

        try:
            average_rtt = self._get_average_rtt_for_mutant(
                mutant, count=count, debugging_id=debugging_id)
            self._rtt_mutant_cache[cache_key] = (time.time(), average_rtt)
        finally:
            event.set()
            self._rtt_processing_events.pop(cache_key, None)

        msg = 'Returning fresh average RTT of %.2f seconds for mutant %s (did:%s)'
        args = (average_rtt, cache_key, debugging_id)
        om.out.debug(msg % args)

        return average_rtt

    def _get_cached_rtt(self, cache_key, debugging_id):
        cached_value = self._rtt_mutant_cache.get(cache_key, default=None)

        if cached_value is None:
            return None

        timestamp, value = cached_value
        if time.time() - timestamp > 5:
            return None

        # The cache entry is still valid, return the cached value
        msg = 'Returning cached average RTT of %.2f seconds for mutant %s (did:%s)'
        args = (value, cache_key, debugging_id)
        om.out.debug(msg % args)
        return value

    def _get_average_rtt_for_mutant(self, mutant, count=3, debugging_id=None):
        #
        # Need to send the HTTP requests and do the average
        #
        rtts = self._get_all_rtts(mutant, count, debugging_id)

        if self._has_outliers(rtts):
            #
            # The measurement has outliers, we can't continue! If we do
            # continue the average_rtt will be completely invalid and
            # potentially yield false positives
            #
            rtts_str = ', '.join(str(i) for i in rtts)
            msg = 'Found outliers while sampling average RTT: %s' % rtts_str
            raise OutlierException(msg)

        average_rtt = float(sum(rtts)) / len(rtts)
        return average_rtt

    def _get_all_rtts(self, mutant, count=3, debugging_id=None):
        """
        :param mutant: The mutant to send and measure RTT from
        :param count: Number of checks to perform
        :param debugging_id: Unique ID used for logging
        :return: A list of floats, one per check, representing the seconds it
                 took to get each response
        """
        rtts = []

        for _ in xrange(count):
            resp = self._url_opener.send_mutant(mutant,
                                                cache=False,
                                                grep=False,
                                                debugging_id=debugging_id)
            rtt = resp.get_wait_time()
            rtts.append(rtt)

        return rtts

    def _has_outliers(self, rtts):
        """
        When we measure the RTT for a specific endpoint + parameter set we
        might get a big variation in the result, for example the RTTs might
        be:

            [0.2, 0.25, 1.8]

        Where 1.8 is an outlier that will break the detection of time-based
        SQL injection, OS commanding, etc. since the average for that RTT set
        is heavily influenced by the outlier.

        :param rtts: The list of RTTs obtained by _get_all_rtts
        :return: True if the list of rtts has one or more outliers.
        """
        #
        # TODO: perform outlier analysis
        #
        # https://github.com/andresriancho/w3af/commit/9494b49acab10833f629fae58dcc104b37f9720f
        #
        return False
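
This version stores (timestamp, value) tuples and treats entries older than five seconds as missing, so stale RTT averages are silently re-measured. A condensed standalone sketch of that freshness rule (names are illustrative):

import time

RTT_TTL_SECONDS = 5

_rtt_cache = {}     # cache_key -> (timestamp, value)


def store(cache_key, value):
    _rtt_cache[cache_key] = (time.time(), value)


def get_if_fresh(cache_key):
    # Same rule as _get_cached_rtt() above: expired entries look like misses
    cached = _rtt_cache.get(cache_key)
    if cached is None:
        return None

    timestamp, value = cached
    if time.time() - timestamp > RTT_TTL_SECONDS:
        return None

    return value
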
Example #9
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(),
                                                     default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response ever again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, lets work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
                timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser

    def _log_return_empty(self, http_response, detail):
        msg = 'Returning empty list in get_tags_by_filter("%s"). '
        msg += detail
        om.out.debug(msg % http_response.get_uri())

    def get_tags_by_filter(self,
                           http_response,
                           tags,
                           yield_text=False,
                           cache=True):
        """
        Get specific tags from http_response using the cache if possible

        :param http_response: The http response instance
        :param tags: List of tags to get, or None if all tags should be returned
        :param yield_text: Include the tag text (<a>text</a>)
        :param cache: True if the document parser should be saved to the cache
        :return: A list with the matching tags, optionally including the tag text
        """
        #
        # This is a performance hack that should reduce the time consumed by
        # this method without impacting its results. Note that in HTML this is
        # valid:
        #
        #   <script
        #
        # And this is invalid:
        #
        #   < script
        #
        # We use that in order to speed-up this function
        #
        if tags is not None:
            body_lower = http_response.get_body().lower()

            for tag in tags:
                lt_tag = '<%s' % tag
                if lt_tag in body_lower:
                    break
            else:
                # No tag was found in the HTML
                return []

        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            self._log_return_empty(http_response, 'No parser available')
            return []

        args = '%r%r' % (tags, yield_text)
        hash_string = get_body_unique_id(http_response, prepend=args)

        if hash_string in self._parser_blacklist:
            self._log_return_empty(http_response,
                                   'HTTP response is blacklisted')
            return []

        #
        # We know that we can parse this document, let's work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
                timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Timeout waiting for response')
                return []

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Extract the tags in the parser subprocess and, if allowed, save
            # the result to the cache. Register an event so other threads can
            # wait for this extraction to finish
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                tags = mp_doc_parser.get_tags_by_filter(http_response,
                                                        tags,
                                                        yield_text=yield_text)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(
                    http_response, 'Timeout waiting for get_tags_by_filter()')
                return []
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Reached memory usage limit')
                return []
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except Exception as e:
                # Act just like when there is no parser
                msg = 'Unhandled exception running get_tags_by_filter("%s"): %s'
                args = (http_response.get_url(), e)
                raise BaseFrameworkException(msg % args)
            else:
                if cache:
                    self._cache[hash_string] = tags
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return tags
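A minimal usage sketch for get_tags_by_filter(), assuming the surrounding class is instantiated as ParserCache() like in the other examples. The import paths and the HTTPResponse constructor arguments follow the w3af code base but are assumptions added here, not part of the example above:

# Sketch only: shows how a plugin could ask the cache for <a> and <form> tags
from w3af.core.data.url.HTTPResponse import HTTPResponse
from w3af.core.data.dc.headers import Headers
from w3af.core.data.parsers.doc.url import URL

parser_cache = ParserCache()

url = URL('http://example.com/')
headers = Headers([('content-type', 'text/html')])
body = '<html><a href="/next">next</a><form action="/login"></form></html>'
response = HTTPResponse(200, body, headers, url, url, _id=1)

# Returns an empty list when the response can not be parsed or parsing
# times out, so callers do not need their own error handling
for tag in parser_cache.get_tags_by_filter(response, ('a', 'form'), yield_text=True):
    print tag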
Ejemplo n.º 10
0
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def parser_wrapper(func):
        @functools.wraps(func)
        def inner(self, *args, **kwargs):
            # Lazily create the disk cache the first time any decorated
            # method is called
            if not hasattr(self, 'disk_cache'):
                self.disk_cache = {'key_set': set(),
                                   'disk_cache': DiskDict('rsp_parser')}
            return func(self, *args, **kwargs)
        return inner

    @parser_wrapper
    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        if http_response.is_image():
            # Act just like when there is no parser
            msg = 'There is no parser for image "%s"' % http_response.get_url()
            raise BaseFrameworkException(msg)

        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            # Event.wait() does not raise on timeout, it returns False, so we
            # check the return value instead of catching an exception
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser:
            self._handle_cache_hit(hash_string)
            # om.out.debug('[parser cache][memory] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
        else:
            # om.out.debug('[parser cache][memory] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event
            
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            try:
                if hash_string in self.disk_cache['key_set']:
                    parser = self.disk_cache['disk_cache'][hash_string]
                    # om.out.debug('[parser cache][disk] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                else:
                    # om.out.debug('[parser cache][disk] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                    try:
                        parser = mp_doc_parser.get_document_parser_for(http_response)
                    except Exception as e:
                        # Act just like when there is no parser
                        msg = 'There is no parser for "%s". Exception: "%s"' % (http_response.get_url(), e)
                        raise BaseFrameworkException(msg)
                    else:
                        self.disk_cache['disk_cache'][hash_string] = parser
                        self.disk_cache['key_set'].add(hash_string)

                        save_to_cache = self.should_cache(http_response) and cache
                        if save_to_cache:
                            self._cache[hash_string] = parser
                        else:
                            self._handle_no_cache(hash_string)
            finally:
                self._parser_finished_events.pop(hash_string, None)
                event.set()

        return parser
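The parser_wrapper decorator above lazily attaches a disk-backed cache to the instance the first time a decorated method runs. Below is a minimal, self-contained sketch of that pattern using a plain dict instead of w3af's DiskDict; the class and attribute names are illustrative, not part of w3af:

import functools


def lazy_cache(func):
    """Create self._disk_cache on first use, then call the wrapped method."""
    @functools.wraps(func)
    def inner(self, *args, **kwargs):
        if not hasattr(self, '_disk_cache'):
            # A plain dict stands in for DiskDict('rsp_parser') in this sketch
            self._disk_cache = {'key_set': set(), 'store': {}}
        return func(self, *args, **kwargs)
    return inner


class ExampleCache(object):
    @lazy_cache
    def get_or_compute(self, key, compute):
        if key in self._disk_cache['key_set']:
            return self._disk_cache['store'][key]

        value = compute()
        self._disk_cache['store'][key] = value
        self._disk_cache['key_set'].add(key)
        return value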
Ejemplo n.º 11
0
class BasicKnowledgeBase(object):
    """
    This is a base class from which all implementations of KnowledgeBase will
    inherit. It has the basic utility methods that will be used.

    :author: Andres Riancho ([email protected])
    """
    UPDATE = 'update'
    APPEND = 'append'
    ADD_URL = 'add_url'

    def __init__(self):
        self._kb_lock = threading.RLock()

        self.FILTERS = {'URL': self.filter_url, 'VAR': self.filter_var}

        self._reached_max_info_instances_cache = SynchronizedLRUDict(512)

    def append_uniq(self, location_a, location_b, info_inst, filter_by='VAR'):
        """
        Append to a location in the KB if and only if there is no other
        vulnerability in the same location for the same URL and parameter.

        Does this in a thread-safe manner.

        :param location_a: The A location where to store data

        :param location_b: The B location where to store data

        :param info_inst: An Info instance (or subclasses like Vuln and InfoSet)

        :param filter_by: One of 'VAR' or 'URL'. Only append to the kb in
                          (location_a, location_b) if there is NO OTHER info
                          in that location with the same:
                              - 'VAR': URL,Variable,DataContainer.keys()
                              - 'URL': URL

        :return: True if the vuln was added. False if there was already a
                 vulnerability in the KB location with the same URL and
                 parameter.
        """
        if not isinstance(info_inst, Info):
            raise ValueError(
                'append_uniq requires an info object as parameter.')

        filter_function = self.FILTERS.get(filter_by, None)

        if filter_function is None:
            raise ValueError(
                'append_uniq only knows about URL or VAR filters.')

        with self._kb_lock:

            if filter_function(location_a, location_b, info_inst):
                self.append(location_a, location_b, info_inst)
                return True

            return False

    def filter_url(self, location_a, location_b, info_inst):
        """
        :return: True if there is no other info in (location_a, location_b)
                 with the same URL as the info_inst.
        """
        for saved_vuln in self.get_iter(location_a, location_b):
            if saved_vuln.get_url() == info_inst.get_url():
                return False

        return True

    def filter_var(self, location_a, location_b, info_inst):
        """
        :return: True if there is no other info in (location_a, location_b)
                 with the same URL and variable as the info_inst.

                 This check used to compare the data container parameter
                 names as well. The problem with that approach was that in
                 some rare cases the scanner reported vulnerabilities in:

                    http://target.com/?id={here}&tracking1=23
                    http://target.com/?id={here}&tracking1=23&tracking2=42

                 Where tracking1 and tracking2 were parameters added only to
                 track the user navigation through the site.

                 Those are the same vulnerability, since the same piece of
                 code generates both. Thus, there is no need to report them
                 twice.
        """
        for saved_vuln in self.get_iter(location_a, location_b):

            if saved_vuln.get_token_name() != info_inst.get_token_name():
                continue

            if saved_vuln.get_url() != info_inst.get_url():
                continue

            msg = ('[filter_var] Preventing "%s" from being written to the'
                   ' KB because "%s" has the same token (%s) and URL (%s).')
            args = (info_inst.get_desc(), saved_vuln.get_desc(),
                    info_inst.get_token_name(), info_inst.get_url())
            om.out.debug(msg % args)

            return False

        return True

    def _has_reached_max_info_instances(self, location_a, location_b,
                                        info_inst, group_klass):
        """
        Checks if the tuple containing
            - location_a,
            - location_b,
            - info_inst.get(group_klass.ITAG)

        Is in the max info instances reached cache.

        Works together with _record_reached_max_info_instances()

        :param location_a: The "a" address
        :param location_b: The "b" address
        :param info_inst: The Info instance we want to store
        :param group_klass: If required, will be used to create a new InfoSet
        :return: True if the data is in the cache
        """
        key = self._get_max_info_instances_key(location_a, location_b,
                                               info_inst, group_klass)
        return self._reached_max_info_instances_cache.get(key)

    def _get_max_info_instances_key(self, location_a, location_b, info_inst,
                                    group_klass):
        return (location_a, location_b, repr(info_inst.get(group_klass.ITAG)))

    def _record_reached_max_info_instances(self, location_a, location_b,
                                           info_inst, group_klass):
        """
        Stores the tuple containing
            - location_a,
            - location_b,
            - info_inst.get(group_klass.ITAG)

        To the max info instances reached cache.

        Works together with _has_reached_max_info_instances()

        :param location_a: The "a" address
        :param location_b: The "b" address
        :param info_inst: The Info instance we want to store
        :param group_klass: If required, will be used to create a new InfoSet
        :return: None
        """
        key = self._get_max_info_instances_key(location_a, location_b,
                                               info_inst, group_klass)
        self._reached_max_info_instances_cache[key] = True

    def append_uniq_group(self,
                          location_a,
                          location_b,
                          info_inst,
                          group_klass=InfoSet):
        """
        This method appends an Info instance to an existing InfoSet which is
        stored in (location_a, location_b) and matches the info_inst.

        If no existing InfoSet matches, a new one is created using
        `group_klass` and `info_inst` is appended to it.

        :see: https://github.com/andresriancho/w3af/issues/3955

        :param location_a: The "a" address
        :param location_b: The "b" address
        :param info_inst: The Info instance we want to store
        :param group_klass: If required, will be used to create a new InfoSet
        :return: (The updated/created InfoSet, as stored in the kb,
                  True if a new InfoSet was created)
        """
        if not isinstance(info_inst, Info):
            raise TypeError('append_uniq_group requires an Info instance'
                            ' as parameter.')

        if not issubclass(group_klass, InfoSet):
            raise TypeError('append_uniq_group requires an InfoSet subclass'
                            ' as parameter.')

        location_a = self._get_real_name(location_a)

        with self._kb_lock:

            # This performs a quick check against a LRU cache to prevent
            # queries to the DB
            if self._has_reached_max_info_instances(location_a, location_b,
                                                    info_inst, group_klass):
                return info_inst, False

            for info_set in self.get_iter(location_a, location_b):
                if not isinstance(info_set, InfoSet):
                    continue

                if info_set.match(info_inst):
                    # InfoSet will only store up to MAX_INFO_INSTANCES Info
                    # instances; after that, any calls to add() will not
                    # modify InfoSet.infos
                    if info_set.has_reached_max_info_instances():

                        # Record that this location and infoset have reached the max
                        # instances. This works together with _has_reached_max_info_instances()
                        # to reduce SQLite queries
                        self._record_reached_max_info_instances(
                            location_a, location_b, info_inst, group_klass)

                        # The info set instance was not modified, so we just return
                        return info_set, False

                    # Since MAX_INFO_INSTANCES has not been reached, we need to
                    # copy the info set, add the info instance, and update the DB
                    old_info_set = copy.deepcopy(info_set)

                    # Add the new information to the InfoSet instance, if we reach
                    # this point, and because we checked against has_reached_max_info_instances,
                    # we are sure that `added` will be True and the info instance
                    # will be added to the InfoSet
                    added = info_set.add(info_inst)

                    # Only change the ID of the InfoSet instance if a new Info
                    # has been added
                    if added:
                        info_set.generate_new_id()

                    # Save to the DB
                    self.update(old_info_set, info_set)

                    return info_set, False
            else:
                # No pre-existing InfoSet instance matched, let's create one
                # for the info_inst
                info_set = group_klass([info_inst])
                self.append(location_a, location_b, info_set)
                return info_set, True

    def get_all_vulns(self):
        """
        :return: A list of all info instances with severity in (LOW, MEDIUM,
                 HIGH)
        """
        raise NotImplementedError

    def get_all_infos(self):
        """
        :return: A list of all info instances with severity eq INFORMATION
        """
        raise NotImplementedError

    def get_all_entries_of_class_iter(self, klass, exclude_ids=()):
        """
        :yield: All objects saved in the kb whose class is in klass.
        :param exclude_ids: The vulnerability IDs to exclude from the result
        """
        raise NotImplementedError

    def get_all_findings(self, exclude_ids=()):
        """
        :return: A list of all findings, including Info, Vuln and InfoSet.
        :param exclude_ids: The vulnerability IDs to exclude from the result
        """
        return self.get_all_entries_of_class((Info, InfoSet, Vuln),
                                             exclude_ids=exclude_ids)

    def get_all_findings_iter(self, exclude_ids=()):
        """
        An iterated version of get_all_findings. All new code should use
        get_all_findings_iter instead of get_all_findings().

        :yield: All findings stored in the KB.
        :param exclude_ids: The vulnerability IDs to exclude from the result
        """
        klass = (Info, InfoSet, Vuln)

        for finding in self.get_all_entries_of_class_iter(klass, exclude_ids):
            yield finding

    def get_all_uniq_ids_iter(self):
        """
        :yield: All uniq IDs from the KB
        """
        raise NotImplementedError

    def get_all_shells(self, w3af_core=None):
        """
        :param w3af_core: The w3af_core used in the current scan
        :see: Shell.__reduce__ to understand why we need the w3af_core
        :return: A list of all shells reported by all plugins.
        """
        all_shells = []

        for shell in self.get_all_entries_of_class(Shell):
            if w3af_core is not None:
                shell.set_url_opener(w3af_core.uri_opener)
                shell.set_worker_pool(w3af_core.worker_pool)

            all_shells.append(shell)

        return all_shells

    def _get_real_name(self, data):
        """
        Some operations allow location_a to be either a plugin instance or a string.
        
        Those operations will call this method to translate the plugin instance
        into a string.
        """
        if isinstance(data, basestring):
            return data
        else:
            return data.get_name()

    def append(self, location_a, location_b, value):
        """
        This method appends the location_b value to a dict.
        """
        raise NotImplementedError

    def get(self, plugin_name, location_b, check_types=True):
        """
        :param plugin_name: The plugin that saved the data to the kb.
                            Typically the name of the plugin, but could also
                            be the plugin instance.

        :param location_b: The name of the variable under which the vuln
                           objects were saved. Typically the same name as the
                           plugin, or something like "vulns", "errors", etc.
                           In most cases this is NOT None. When set to None,
                           a dict with all the vuln objects found by
                           plugin_name is returned.

        :return: Returns the data that was saved by another plugin.
        """
        raise NotImplementedError

    def get_iter(self, plugin_name, location_b, check_types=True):
        """
        Same as get() but yields items one by one instead of returning
        a list with all the items.
        """
        raise NotImplementedError

    def get_all_entries_of_class(self, klass, exclude_ids=()):
        """
        :return: A list of all objects of class == klass that are saved in the
                 kb.
        :param exclude_ids: The vulnerability IDs to exclude from the result
        """
        raise NotImplementedError

    def update(self, old_vuln, update_vuln):
        """
        :return: The updated vulnerability/info instance stored in the kb.
        """
        raise NotImplementedError

    def clear(self, location_a, location_b):
        """
        Clear any values stored in (location_a, location_b)
        """
        raise NotImplementedError

    def raw_write(self, location_a, location_b, value):
        """
        This method saves the value to (location_a,location_b)
        """
        raise NotImplementedError

    def raw_read(self, location_a, location_b):
        """
        This method reads the value from (location_a,location_b)
        """
        raise NotImplementedError

    def dump(self):
        raise NotImplementedError

    def cleanup(self):
        """
        Cleanup all internal data.
        """
        raise NotImplementedError
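BasicKnowledgeBase leaves storage to its subclasses: append_uniq() and filter_var() only rely on append() storing values and get_iter() yielding them back. The toy subclass below backs those two methods with an in-memory dict so the de-duplication logic above can run; it is a sketch for illustration, not the real w3af DBKnowledgeBase:

class InMemoryKnowledgeBase(BasicKnowledgeBase):
    """Illustrative only: enough storage for append_uniq() / filter_var()."""

    def __init__(self):
        super(InMemoryKnowledgeBase, self).__init__()
        self._data = {}

    def append(self, location_a, location_b, value):
        location_a = self._get_real_name(location_a)
        self._data.setdefault((location_a, location_b), []).append(value)

    def get_iter(self, plugin_name, location_b, check_types=True):
        for value in self._data.get((plugin_name, location_b), []):
            yield value

    def get(self, plugin_name, location_b, check_types=True):
        return list(self.get_iter(plugin_name, location_b))


# Usage (vuln construction omitted): a second call with a vuln that has the
# same URL and token name returns False and stores nothing.
# kb = InMemoryKnowledgeBase()
# kb.append_uniq('sqli', 'sqli', vuln, filter_by='VAR')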