class ResponseCacheKeyCache(object):
    #
    # The memory impact of having a large number of items in this cache is
    # really low: both the keys and the values are short strings (the result
    # of quick_hash)
    #
    MAX_SIZE = 2000

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.MAX_SIZE)

    def get_response_cache_key(self, http_response,
                               clean_response=None,
                               headers=None):
        # When the clean response is available, use that body to calculate the
        # cache key. It has been cleaned (request paths and query string
        # parameters were removed), so it has a higher chance of being equal
        # to other responses / of already being in the cache
        if clean_response is not None:
            body = clean_response.body
        else:
            body = http_response.body

        cache_key = '%s%s' % (smart_str_ignore(body), headers)
        cache_key = quick_hash(cache_key)

        result = self._cache.get(cache_key, None)
        if result is not None:
            return result

        result = get_response_cache_key(http_response,
                                        clean_response=clean_response,
                                        headers=headers)
        self._cache[cache_key] = result
        return result

    def clear_cache(self):
        self._cache.clear()
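
# The class above memoizes an expensive key derivation behind a small LRU:
# large bodies are hashed down to short strings, and the derived value is
# looked up by that short hash. The following is a minimal, self-contained
# sketch of the same pattern (stdlib only: hashlib.md5 stands in for
# quick_hash, and OrderedDict plus a lock stand in for SynchronizedLRUDict):

import hashlib
import threading
from collections import OrderedDict


class LRUMemo(object):
    """
    Bounded, thread-safe memo table that evicts least-recently-used entries.
    """

    def __init__(self, max_size, compute):
        self._max_size = max_size
        self._compute = compute         # the expensive function to memoize
        self._lock = threading.RLock()
        self._items = OrderedDict()

    def get(self, raw_key):
        # Reduce the (potentially large) raw key to a short string
        cache_key = hashlib.md5(raw_key).hexdigest()

        with self._lock:
            if cache_key in self._items:
                # Re-insert to mark the entry as most recently used
                self._items[cache_key] = self._items.pop(cache_key)
                return self._items[cache_key]

        result = self._compute(raw_key)

        with self._lock:
            self._items[cache_key] = result
            if len(self._items) > self._max_size:
                # Drop the least recently used entry
                self._items.popitem(last=False)

        return result


# Usage sketch (expensive_cache_key is hypothetical):
#   memo = LRUMemo(2000, expensive_cache_key)
#   memo.get(b'<html>...</html>')   # computed once, then served from the LRU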
class ParserCache(object):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    LRU_LENGTH = 40
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    PARSER_TIMEOUT = 60  # in seconds
    DEBUG = False
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.LRU_LENGTH)
        self._pool = None
        self._processes = None
        self._parser_finished_events = {}
        self._start_lock = threading.RLock()

        # These are here for debugging:
        self._archive = set()
        self._from_LRU = 0.0
        self._calculated_more_than_once = 0.0
        self._total = 0.0

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # Keep track of which pid is processing which http response
                # pylint: disable=E1101
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=25,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None
            self._processes = None

        # We don't need this data anymore
        self._cache.clear()

        if self.DEBUG:
            re_calc_rate = (self._calculated_more_than_once / self._total)
            print('parser_cache LRU rate: %s' % (self._from_LRU / self._total))
            print('parser_cache re-calculation rate: %s' % re_calc_rate)
            print('parser_cache size: %s' % self.LRU_LENGTH)

    def get_cache_key(self, http_response):
        """
        Before this I used md5, but I realized that it was unnecessary. I
        experimented a little bit with Python's hash functions and the
        built-in hash was the fastest. At first I thought that the built-in
        hash wasn't good enough, as it could create collisions... but...
        given that the LRU has only 40 positions, the real probability of a
        collision is very low.

        :return: The key to be used in the cache for storing this http_response
        """
        # @see: test_bug_13_Dec_2012 to understand why we concat the uri to
        # the body before hashing
        uri_str = http_response.get_uri().url_string.encode('utf-8')

        body_str = http_response.body
        if isinstance(body_str, unicode):
            body_str = body_str.encode('utf-8', 'replace')

        _to_hash = body_str + uri_str

        # Added adler32 after finding some hash() collisions in builds
        hash_string = str(hash(_to_hash))
        hash_string += str(zlib.adler32(_to_hash))
        return hash_string

    def should_cache(self, http_response):
        """
        Defines if the parser for this http_response should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def _test_parse_http_response(self, http_response, *args):
        """
        Left here for testing!
        """
        return DocumentParser(http_response)

    def _parse_http_response_in_worker(self, http_response, hash_string):
        """
        This parses the http_response in a pool worker.

        This has two features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :return: The DocumentParser instance
        """
        event = multiprocessing.Event()
        self._parser_finished_events[hash_string] = event

        # Start the worker processes if needed
        self.start_workers()

        apply_args = (ProcessDocumentParser,
                      http_response,
                      self._processes,
                      hash_string)

        # Push the task to the workers
        result = self._pool.apply_async(apply_with_return_error, (apply_args,))

        try:
            parser_output = result.get(timeout=self.PARSER_TIMEOUT)
        except multiprocessing.TimeoutError:
            # We hit the timeout; if a worker pid is still registered as
            # processing our "buggy" input, kill it
            pid = self._processes.pop(hash_string, None)

            if pid is not None:
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError as ose:
                    msg = ('An error occurred while killing the parser'
                           ' process: "%s"')
                    om.out.debug(msg % ose)

            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url()))

            # Act just like when there is no parser
            msg = 'There is no parser for "%s".' % http_response.get_url()
            raise BaseFrameworkException(msg)
        else:
            # NOTE: the original snippet is truncated at this point. Assumed
            # continuation, based on the method's docstring: on success,
            # return whatever the worker produced
            return parser_output
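
# _parse_http_response_in_worker() above combines apply_async() with
# result.get(timeout=...) so a hung parser can be killed instead of blocking
# the scan. A minimal, self-contained sketch of that timeout-and-kill pattern
# (slow_parse and parse_with_timeout are illustrative names, not part of the
# class above; the worker registers its own pid in a shared dict so the
# parent knows exactly which process to SIGTERM):

import os
import signal
import time
import multiprocessing


def slow_parse(shared_pids, task_id, seconds):
    # Worker side: register our pid so the parent can kill us if needed
    shared_pids[task_id] = os.getpid()
    time.sleep(seconds)    # simulate a parser that takes too long
    return 'parsed'


def parse_with_timeout(pool, shared_pids, task_id, seconds, timeout):
    result = pool.apply_async(slow_parse, (shared_pids, task_id, seconds))
    try:
        return result.get(timeout=timeout)
    except multiprocessing.TimeoutError:
        # Kill only the worker that is stuck on our input
        pid = shared_pids.pop(task_id, None)
        if pid is not None:
            os.kill(pid, signal.SIGTERM)
        return None


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    shared_pids = manager.dict()
    pool = multiprocessing.Pool(2)

    print(parse_with_timeout(pool, shared_pids, 'fast', 0, 5))    # 'parsed'
    print(parse_with_timeout(pool, shared_pids, 'slow', 60, 1))   # None
    pool.terminate()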
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Defines if the parser for this http_response should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished. Note that Event.wait() returns
            # False on timeout instead of raising, so we check its result
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

        return parser
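
# get_document_parser_for() above is a "single flight" cache: the first
# thread to miss parses the response, while concurrent callers asking for the
# same hash block on a threading.Event instead of repeating the work. A
# minimal, self-contained sketch of that coordination (SingleFlightCache is
# an illustrative name; compute stands in for the real parsing call):

import threading


class SingleFlightCache(object):

    def __init__(self, compute):
        self._compute = compute
        self._cache = {}
        self._events = {}
        self._lock = threading.Lock()

    def get(self, key, timeout=10):
        with self._lock:
            if key in self._cache:
                return self._cache[key]

            event = self._events.get(key)
            if event is None:
                # First caller for this key: we do the work ourselves
                event = threading.Event()
                self._events[key] = event
                is_leader = True
            else:
                is_leader = False

        if not is_leader:
            # Another thread is computing this key; wait for it. Note that
            # Event.wait() returns False on timeout, it does not raise
            if not event.wait(timeout=timeout):
                raise RuntimeError('Timed out waiting for %r' % (key,))

            with self._lock:
                if key not in self._cache:
                    # The leader failed to compute a value
                    raise RuntimeError('Computing %r failed' % (key,))
                return self._cache[key]

        try:
            value = self._compute(key)
            with self._lock:
                self._cache[key] = value
            return value
        finally:
            # Wake the waiters and drop the in-flight marker
            event.set()
            with self._lock:
                self._events.pop(key, None)


# Usage sketch: concurrent cache.get('body') calls run compute() only once;
# the other threads block on the event and then read the cached value.
#
#   cache = SingleFlightCache(lambda key: key.upper())
#   cache.get('body')    # -> 'BODY'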
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables

        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if the parser for this http_response should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(),
                                                     default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        # Store the result under the response id, which is the same key the
        # lookup above uses
        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't ever try to parse this response again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.,
        # check if we can parse this HTTP response at all.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementation of each parser needs to be fast.
        #
        # It doesn't matter if we say "yes" here and parsing exceptions appear
        # later; that should happen in roughly 1 / 10000 calls and we would
        # still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, let's work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished. Note that Event.wait() returns
            # False on timeout instead of raising, so we check its result
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, so we ban
                # the response and don't waste more CPU cycles trying to parse
                # it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, so we ban
                # the response and don't waste more CPU cycles or memory trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

        return parser
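
# The TimeoutError and MemoryError handlers above permanently blacklist a
# response hash so known-bad inputs are never parsed twice. A minimal sketch
# of that "ban on failure" pattern (BanOnFailure is an illustrative name, and
# a plain set stands in for the disk-backed DiskSet):

class BanOnFailure(object):

    def __init__(self, work):
        self._work = work
        self._blacklist = set()

    def run(self, key, payload):
        if key in self._blacklist:
            # Fail fast instead of burning CPU on an input that already failed
            raise LookupError('"%s" failed in the past, not trying again' % key)

        try:
            return self._work(payload)
        except (MemoryError, RuntimeError):
            # Remember the failure so every future call fails fast
            self._blacklist.add(key)
            raise


# Usage sketch (parse_body is hypothetical):
#   banner = BanOnFailure(parse_body)
#   banner.run('hash-1', body)    # raises, 'hash-1' gets blacklisted
#   banner.run('hash-1', body)    # now fails fast with LookupError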
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        """
        Clear all the internal variables

        :return: None
        """
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()
        self._can_parse_cache.clear()

    def should_cache(self, http_response):
        """
        Defines if the parser for this http_response should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        cached_can_parse = self._can_parse_cache.get(http_response.get_id(),
                                                     default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        # Store the result under the response id, which is the same key the
        # lookup above uses
        self._can_parse_cache[http_response.get_id()] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't ever try to parse this response again.

        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        """
        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.,
        # check if we can parse this HTTP response at all.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementation of each parser needs to be fast.
        #
        # It doesn't matter if we say "yes" here and parsing exceptions appear
        # later; that should happen in roughly 1 / 10000 calls and we would
        # still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        #
        # We know that we can parse this document, let's work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, so we ban
                # the response and don't waste more CPU cycles trying to parse
                # it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, so we ban
                # the response and don't waste more CPU cycles or memory trying
                # to parse it over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

        return parser

    def _log_return_empty(self, http_response, detail):
        msg = 'Returning empty list in get_tags_by_filter("%s"). '
        msg += detail
        om.out.debug(msg % http_response.get_uri())

    def get_tags_by_filter(self, http_response, tags, yield_text=False, cache=True):
        """
        Get specific tags from http_response using the cache if possible

        :param http_response: The http response instance
        :param tags: List of tags to get, or None if all tags should be returned
        :param yield_text: Include the tag text (<a>text</a>)
        :param cache: True if the filtered tags should be saved to the cache
        :return: The list of matching tags (not a DocumentParser instance)
        """
        #
        # This is a performance hack that should reduce the time consumed by
        # this method without impacting its results. Note that in HTML this
        # is valid:
        #
        #   <script
        #
        # And this is invalid:
        #
        #   < script
        #
        # We use that in order to speed-up this function
        #
        if tags is not None:
            body_lower = http_response.get_body().lower()

            for tag in tags:
                lt_tag = '<%s' % tag
                if lt_tag in body_lower:
                    break
            else:
                # No tag was found in the HTML
                return []

        #
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.,
        # check if we can parse this HTTP response at all.
        #
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementation of each parser needs to be fast.
        #
        # It doesn't matter if we say "yes" here and parsing exceptions appear
        # later; that should happen in roughly 1 / 10000 calls and we would
        # still be gaining a lot of performance
        #
        if not self.can_parse(http_response):
            self._log_return_empty(http_response, 'No parser available')
            return []

        args = '%r%r' % (tags, yield_text)
        hash_string = get_body_unique_id(http_response, prepend=args)

        if hash_string in self._parser_blacklist:
            self._log_return_empty(http_response, 'HTTP response is blacklisted')
            return []

        #
        # We know that we can parse this document, let's work!
        #
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                self._log_return_empty(http_response, 'Timeout waiting for response')
                return []

        # metric increase
        self.inc_query_count()

        # Note: for this method the cached value is the tag list, not a parser
        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._handle_cache_hit(hash_string)
            return parser
        else:
            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            try:
                tags = mp_doc_parser.get_tags_by_filter(http_response,
                                                        tags,
                                                        yield_text=yield_text)
            except TimeoutError:
                # We failed to parse this HTTP response, so we ban it and
                # don't waste more CPU cycles trying to parse it over and
                # over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(http_response,
                                       'Timeout waiting for get_tags_by_filter()')
                return []
            except MemoryError:
                # We failed to parse this HTTP response, so we ban it and
                # don't waste more CPU cycles or memory trying to parse it
                # over and over.
                self.add_to_blacklist(hash_string)

                # Act just like when there is no parser
                self._log_return_empty(http_response, 'Reached memory usage limit')
                return []
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except Exception as e:
                # Act just like when there is no parser
                msg = 'Unhandled exception running get_tags_by_filter("%s"): %s'
                args = (http_response.get_url(), e)
                raise BaseFrameworkException(msg % args)
            else:
                if cache:
                    self._cache[hash_string] = tags
                else:
                    self._handle_no_cache(hash_string)
            finally:
                event.set()
                self._parser_finished_events.pop(hash_string, None)

        return tags
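
# get_tags_by_filter() above relies on a cheap necessary condition: valid
# HTML tags start with "<tag" and never "< tag", so a substring scan over the
# lowercased body can prove that a tag is absent without parsing anything.
# A minimal sketch of that pre-filter (might_contain_tags is a hypothetical
# helper, not part of the class above); false positives such as '<abbr'
# matching '<a' are fine, they simply fall through to the real parser:

def might_contain_tags(body, tags):
    """
    :return: True if any tag in `tags` might appear in `body`
    """
    body_lower = body.lower()

    for tag in tags:
        if ('<%s' % tag) in body_lower:
            return True

    return False


# Usage sketch:
#   might_contain_tags('<HTML><A href="/">x</A></HTML>', ['a'])   # True
#   might_contain_tags('<p>no anchors here</p>', ['a', 'img'])    # False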
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._parser_finished_events = {}

    def clear(self):
        """
        Clear all the internal variables
        :return: None
        """
        # Stop any workers
        mp_doc_parser.stop_workers()

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def should_cache(self, http_response):
        """
        Defines if the parser for this http_response should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def parser_wrapper(func):
        # Lazily create the disk-backed cache the first time any decorated
        # method runs
        @functools.wraps(func)
        def inner(self, *args, **kwargs):
            if not hasattr(self, 'disk_cache'):
                self.disk_cache = {'key_set': set(),
                                   'disk_cache': DiskDict('rsp_parser')}
            return func(self, *args, **kwargs)
        return inner

    @parser_wrapper
    def get_document_parser_for(self, http_response, cache=True):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        if http_response.is_image():
            # Act just like when there is no parser
            msg = 'There is no parser for image "%s"' % http_response.get_url()
            raise BaseFrameworkException(msg)

        hash_string = get_request_unique_id(http_response)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished. Note that Event.wait() returns
            # False on timeout instead of raising, so we check its result
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        self.inc_query_count()

        parser = self._cache.get(hash_string, None)
        if parser:
            self._handle_cache_hit(hash_string)
            # om.out.debug('[parser cache][memory] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
        else:
            # om.out.debug('[parser cache][memory] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

            # Not in cache, have to work.
            self._handle_cache_miss(hash_string)

            try:
                if hash_string in self.disk_cache['key_set']:
                    parser = self.disk_cache['disk_cache'][hash_string]
                    # om.out.debug('[parser cache][disk] Hit for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                else:
                    # om.out.debug('[parser cache][disk] Miss for %s(%s)' % (http_response.get_uri().url_string, hash_string))
                    try:
                        parser = mp_doc_parser.get_document_parser_for(http_response)
                    except Exception as e:
                        # Act just like when there is no parser
                        msg = 'There is no parser for "%s". Exception: %s'
                        raise BaseFrameworkException(msg % (http_response.get_url(), e))
                    else:
                        self.disk_cache['disk_cache'][hash_string] = parser
                        self.disk_cache['key_set'].add(hash_string)

                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                else:
                    self._handle_no_cache(hash_string)
            finally:
                self._parser_finished_events.pop(hash_string, None)
                event.set()

        return parser
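
# The parser_wrapper revision above layers a small in-memory LRU on top of a
# disk-backed store: memory serves the hot entries, while disk survives LRU
# eviction so a parser is never computed twice. A minimal, self-contained
# sketch of that two-tier lookup (TwoTierCache is an illustrative name; a
# plain dict stands in for DiskDict, and compute stands in for
# mp_doc_parser.get_document_parser_for):

from collections import OrderedDict


class TwoTierCache(object):

    def __init__(self, compute, max_memory_items=10):
        self._compute = compute
        self._max_memory_items = max_memory_items
        self._memory = OrderedDict()    # first tier: small, fast, evicts
        self._disk = {}                 # second tier: stand-in for DiskDict('rsp_parser')

    def get(self, key):
        # First tier: memory
        if key in self._memory:
            # Re-insert to mark the entry as most recently used
            self._memory[key] = self._memory.pop(key)
            return self._memory[key]

        # Second tier: disk, slower but unaffected by LRU eviction
        if key in self._disk:
            value = self._disk[key]
        else:
            value = self._compute(key)
            self._disk[key] = value

        # Promote to memory, evicting the least recently used entry if needed
        self._memory[key] = value
        if len(self._memory) > self._max_memory_items:
            self._memory.popitem(last=False)

        return value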