Example #1
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Decides whether the parser for this http_response should be cached

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(), default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we will never try to parse this response again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementation of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later; that should happen in roughly 1 of every 10,000 calls,
        # and we would still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, let's get to work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body; the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser
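
A minimal usage sketch of the cache above (hypothetical driver code: `http_response` stands for an existing w3af HTTPResponse instance; everything else comes from the imports the class already relies on):

dpc = ParserCache()

try:
    parser = dpc.get_document_parser_for(http_response)
except BaseFrameworkException:
    # Unparseable, blacklisted, or timed out: skip this response
    pass
else:
    # Repeated calls with an identical response body are now served from
    # the in-memory LRU cache instead of a worker process
    print(parser)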
Example #2
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._processes = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # pylint: disable=E1101
                # Keep track of which pid is processing which http response
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None

        if self._processes is not None:
            self._processes.clear()
            self._processes = None

    def _kill_parser_process(self, hash_string, http_response):
        """
        Kill the process that's handling the parsing of http_response which
        can be identified by hash_string

        :param hash_string: The hash for the http_response
        :param http_response: The HTTP response which is being parsed
        :return: None
        """
        # We got here because of a timeout, so make sure that the pid we are
        # about to kill is still the one processing our "buggy" input
        pid = self._processes.pop(hash_string, None)
        if pid is not None:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError as ose:
                msg = ('An error occurred while killing the parser'
                       ' process: "%s"')
                om.out.debug(msg % ose)

        msg = ('[timeout] The parser took more than %s seconds to complete'
               ' parsing of "%s", killed it!')

        if self.PROFILING_ENABLED:
            msg += (' You are running a profiling session which requires more'
                    ' CPU and resources to run; the'
                    ' MultiProcessingDocumentParser failed to parse the HTML'
                    ' document. Increase the PARSER_TIMEOUT and try'
                    ' again.\n\n'
                    'This issue invalidates the profiling session!\n\n'
                    'See issue #9713 for more information'
                    ' https://github.com/andresriancho/w3af/issues/9713')

        log_function = om.out.error if self.PROFILING_ENABLED else om.out.debug
        log_function(msg % (self.PARSER_TIMEOUT, http_response.get_url()))
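
The pid bookkeeping above is easier to see in isolation. Below is a self-contained, POSIX-only sketch of the same pattern: each worker registers its own pid in a manager dict keyed by the input it is processing, so the parent can SIGTERM exactly the process stuck on a given input. All names here (`work`, `shared_pids`) are illustrative, not part of w3af.

import os
import signal
import time
import multiprocessing


def work(shared_pids, key):
    # Register our pid before starting the risky work
    shared_pids[key] = os.getpid()
    time.sleep(60)  # stand-in for a parse that hangs


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    shared_pids = manager.dict()

    proc = multiprocessing.Process(target=work, args=(shared_pids, 'resp-1'))
    proc.start()

    time.sleep(1)  # pretend the parser timeout expired

    # Pop first so two racing killers can't SIGTERM the same pid twice
    pid = shared_pids.pop('resp-1', None)
    if pid is not None:
        os.kill(pid, signal.SIGTERM)
    proc.join()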
Example #3
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling()
                         or user_wants_pytracemalloc()
                         or user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,
                                                   self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser, http_response, self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args, ),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            parser_output = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)

        # We still need to perform some error handling here...
        if isinstance(parser_output, Error):
            if isinstance(parser_output.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            parser_output.reraise()

        # Success!
        return parser_output
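
The class above leans on two pebble features that are easy to demo in isolation: `schedule()` takes a per-task timeout after which the worker is killed and `future.result()` raises TimeoutError, and the pool initializer can cap worker memory with resource.setrlimit so a runaway parser gets a MemoryError instead of triggering the OOM killer. This is a minimal POSIX-only sketch; the limit value and function names are illustrative, and the exact rlimit w3af's init_worker applies may differ.

import resource
import time

from pebble import ProcessPool
from concurrent.futures import TimeoutError

MEMORY_LIMIT = 256 * 1024 * 1024  # 256 MiB, illustrative


def init_worker_sketch(memory_limit):
    # Cap this worker's address space; allocations beyond the limit raise
    # MemoryError inside the worker instead of invoking the OOM killer
    resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))


def slow_task():
    time.sleep(60)  # stand-in for a parser stuck in an infinite loop


if __name__ == '__main__':
    pool = ProcessPool(max_workers=1,
                       initializer=init_worker_sketch,
                       initargs=(MEMORY_LIMIT,))

    future = pool.schedule(slow_task, timeout=2)
    try:
        future.result()
    except TimeoutError:
        print('worker exceeded the 2 second timeout and was killed')

    pool.stop()
    pool.join()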
Example #4
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Decides whether the parser for this http_response should be cached

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body; the best thing to do here is to make this thread wait
            # until that process has finished.
            #
            # Event.wait() returns False on timeout instead of raising, so
            # the result must be checked explicitly
            if not parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT):
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser
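
The `_parser_finished_events` dance above deduplicates concurrent parses of the same body: the first thread does the work while later threads block on a threading.Event. Stripped of the cache details it looks like the sketch below (names are illustrative; like the original, this is best-effort and a narrow race can still parse the same key twice).

import threading

_results = {}
_in_progress = {}
_lock = threading.Lock()


def get_parsed(key, parse_func, timeout=10):
    event = _in_progress.get(key)
    if event is not None:
        # Another thread is already parsing this key: wait for its result
        if not event.wait(timeout=timeout):
            raise RuntimeError('timed out waiting for the other parser')
        if key in _results:
            return _results[key]

    with _lock:
        if key in _results:
            return _results[key]
        # We do the work; publish an Event so later callers wait on us
        _in_progress[key] = event = threading.Event()

    try:
        result = parse_func(key)
        _results[key] = result
        return result
    finally:
        # Wake waiters even if parse_func raised, just like the class above
        event.set()
        _in_progress.pop(key, None)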
Example #5
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    # in seconds
    PARSER_TIMEOUT = 10
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    def __init__(self):
        self._pool = None
        self._processes = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # pylint: disable=E1101
                # Keep track of which pid is processing which http response
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None

            self._processes.clear()
            self._processes = None

    def _kill_parser_process(self, hash_string, http_response):
        """
        Kill the process that's handling the parsing of http_response which
        can be identified by hash_string

        :param hash_string: The hash for the http_response
        :param http_response: The HTTP response which is being parsed
        :return: None
        """
        # We got here because of a timeout, so make sure that the pid we are
        # about to kill is still the one processing our "buggy" input
        pid = self._processes.pop(hash_string, None)
        if pid is not None:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError as ose:
                msg = ('An error occurred while killing the parser'
                       ' process: "%s"')
                om.out.debug(msg % ose)

        msg = ('[timeout] The parser took more than %s seconds to complete'
               ' parsing of "%s", killed it!')

        if user_wants_memory_profiling() or user_wants_pytracemalloc():
            msg += (' Keep in mind that you\'re profiling memory usage and'
                    ' there is a known bug where memory profilers break the'
                    ' parser cache. See issue #9713 for more information'
                    ' https://github.com/andresriancho/w3af/issues/9713')

        om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url()))
Example #6
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Decides whether the parser for this http_response should be cached

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(),
                                                     default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we will never try to parse this response again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementation of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later; that should happen in roughly 1 of every 10,000 calls,
        # and we would still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, let's get to work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body; the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
                timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = ('Reached memory usage limit parsing "%s".'
                       % http_response.get_url())
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return parser

    def _log_return_empty(self, http_response, detail):
        msg = 'Returning empty list in get_tags_by_filter("%s"). '
        msg += detail
        om.out.debug(msg % http_response.get_uri())

    def get_tags_by_filter(self,
                           http_response,
                           tags,
                           yield_text=False,
                           cache=True):
        """
        Get specific tags from http_response using the cache if possible

        :param http_response: The http response instance
        :param tags: List of tags to get, or None if all tags should be returned
        :param yield_text: Include the tag text (<a>text</a>)
        :param cache: True if the document parser should be saved to the cache
        :return: A list of Tag instances matching the filter
        """
        #
        # This is a performance hack that should reduce the time consumed by
        # this method without impacting its results. Note that in HTML this is
        # valid:
        #
        #   <script
        #
        # And this is invalid:
        #
        #   < script
        #
        # We use that in order to speed-up this function
        #
        if tags is not None:
            body_lower = http_response.get_body().lower()

            for tag in tags:
                lt_tag = '<%s' % tag
                if lt_tag in body_lower:
                    break
            else:
                # No tag was found in the HTML
                return []

        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementation of each parser needs to be fast
        #
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later; that should happen in roughly 1 of every 10,000 calls,
        # and we would still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            self._log_return_empty(http_response, 'No parser available')
            return []

        args = '%r%r' % (tags, yield_text)
        hash_string = get_body_unique_id(http_response, prepend=args)

        if hash_string in self._parser_blacklist:
            self._log_return_empty(http_response,
                                   'HTTP response is blacklisted')
            return []

        #
        # We know that we can parse this document, let's get to work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body; the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
                timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Timeout waiting for response')
                return []

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                tags = mp_doc_parser.get_tags_by_filter(http_response,
                                                        tags,
                                                        yield_text=yield_text)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(
                    http_response, 'Timeout waiting for get_tags_by_filter()')
                return []
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Reached memory usage limit')
                return []
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except Exception as e:
                # Act just like when there is no parser
                msg = 'Unhandled exception running get_tags_by_filter("%s"): %s'
                args = (http_response.get_url(), e)
                raise BaseFrameworkException(msg % args)
            else:
                if cache:
                    self._cache[hash_string] = tags
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

            return tags
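
The substring pre-check at the top of get_tags_by_filter() is worth isolating: HTML requires `<script` with no space, and `< script` is not a valid tag opening, so a very cheap substring scan can prove a tag is absent without ever invoking the parser. A minimal standalone version:

def might_contain_tags(body, tags):
    # False means "definitely absent"; True only means "worth parsing"
    body_lower = body.lower()
    return any('<%s' % tag in body_lower for tag in tags)


assert might_contain_tags('<SCRIPT src="a.js"></SCRIPT>', ['script'])
assert not might_contain_tags('mentions < script but has no tag', ['script'])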
Example #7
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()
        
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Decides whether the parser for this http_response should be cached

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def parser_wrapper(func):
        @functools.wraps(func)
        def inner(self, *args, **kwargs):
            if not hasattr(self, 'disk_cache'):
                self.disk_cache = {'key_set': set(), 'disk_cache': DiskDict('rsp_parser')}
            return func(self, *args, **kwargs)
        return inner

    @parser_wrapper
    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        if http_response.is_image():
            # Act just like when there is no parser
            msg = 'There is no parser for image("%s")' % (http_response.get_url())
            raise BaseFrameworkException(msg)

        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body; the best thing to do here is to make this thread wait
            # until that process has finished.
            #
            # Event.wait() returns False on timeout instead of raising, so
            # the result must be checked explicitly
            if not parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT):
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser:
            self._handle_cache_hit(hash_string)
            # om.out.debug('[parser cache][memory] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
        else:
            # om.out.debug('[parser cache][memory] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event
            
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            try:
                if hash_string in self.disk_cache['key_set']:
                    parser = self.disk_cache['disk_cache'][hash_string]
                    # om.out.debug('[parser cache][disk] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                else:
                    # om.out.debug('[parser cache][disk] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                    try:
                        parser = mp_doc_parser.get_document_parser_for(http_response)
                    except Exception as e:
                        # Act just like when there is no parser
                        msg = ('There is no parser for "%s". Exception: "%s"'
                               % (http_response.get_url(), e))
                        raise BaseFrameworkException(msg)
                    else:
                        self.disk_cache['disk_cache'][hash_string] = parser
                        self.disk_cache['key_set'].add(hash_string)

                        save_to_cache = self.should_cache(http_response) and cache
                        if save_to_cache:
                            self._cache[hash_string] = parser
                        else:
                            self._handle_no_cache(hash_string)
            finally:
                self._parser_finished_events.pop(hash_string, None)
                event.set()

        return parser
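
The `parser_wrapper` decorator above lazily initializes the disk-backed cache on first use instead of in __init__, so instances created (or unpickled) before the attribute existed still work. The same pattern, isolated, with a plain dict standing in for DiskDict to keep the sketch self-contained:

import functools


def lazy_disk_cache(func):
    @functools.wraps(func)
    def inner(self, *args, **kwargs):
        # Create the cache the first time any decorated method runs
        if not hasattr(self, 'disk_cache'):
            self.disk_cache = {'key_set': set(), 'disk_cache': {}}
        return func(self, *args, **kwargs)
    return inner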
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling()
                         or user_wants_pytracemalloc()
                         or user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser, http_response, self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter.
        Parsing and all the lxml work is done in another process, and the
        Tag instances are sent to the main process (the one calling this
        method) through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place;
              when we hit the timeout we just return an empty list. This is
              not the best thing to do, but it makes the plugin code easier
              to write (plugins would ignore this anyway)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter, http_response, tags,
                      yield_text, self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
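
A minimal usage sketch of this variant (hypothetical driver code: `http_response` stands for an existing w3af HTTPResponse instance):

mp_doc_parser = MultiProcessingDocumentParser()

# get_tags_by_filter() degrades to an empty list on timeouts and parser
# errors, so the caller can iterate without any exception handling
for tag in mp_doc_parser.get_tags_by_filter(http_response,
                                            ('a', 'base'),
                                            yield_text=True):
    print(tag)

mp_doc_parser.stop_workers()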