class ParserCache(object): """ This class is a document parser cache. :author: Andres Riancho ([email protected]) """ LRU_LENGTH = 40 MAX_CACHEABLE_BODY_LEN = 1024 * 1024 PARSER_TIMEOUT = 60 # in seconds DEBUG = False MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1 def __init__(self): self._cache = SynchronizedLRUDict(self.LRU_LENGTH) self._pool = None self._processes = None self._parser_finished_events = {} self._start_lock = threading.RLock() # These are here for debugging: self._archive = set() self._from_LRU = 0.0 self._calculated_more_than_once = 0.0 self._total = 0.0 def start_workers(self): """ Start the pool and workers :return: The pool instance """ with self._start_lock: if self._pool is None: # Keep track of which pid is processing which http response # pylint: disable=E1101 self._processes = manager.dict() # pylint: enable=E1101 # The pool log_queue = om.manager.get_in_queue() self._pool = ProcessPool(self.MAX_WORKERS, maxtasksperchild=25, initializer=init_worker, initargs=(log_queue,)) return self._pool def stop_workers(self): """ Stop the pool workers :return: None """ if self._pool is not None: self._pool.terminate() self._pool = None self._processes = None # We don't need this data anymore self._cache.clear() if self.DEBUG: re_calc_rate = (self._calculated_more_than_once / self._total) print('parser_cache LRU rate: %s' % (self._from_LRU / self._total)) print('parser_cache re-calculation rate: %s' % re_calc_rate) print('parser_cache size: %s' % self.LRU_LENGTH) def get_cache_key(self, http_response): """ Before I used md5, but I realized that it was unnecessary. I experimented a little bit with python's hash functions and the builtin hash was the fastest. At first I thought that the built-in hash wasn't good enough, as it could create collisions... but... given that the LRU has only 40 positions, the real probability of a collision is too low. :return: The key to be used in the cache for storing this http_response """ # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the # body before hashing uri_str = http_response.get_uri().url_string.encode('utf-8') body_str = http_response.body if isinstance(body_str, unicode): body_str = body_str.encode('utf-8', 'replace') _to_hash = body_str + uri_str # Added adler32 after finding some hash() collisions in builds hash_string = str(hash(_to_hash)) hash_string += str(zlib.adler32(_to_hash)) return hash_string def should_cache(self, http_response): """ Defines if this http_response parser should be cached or not :param http_response: The http response instance :return: True if we should cache the parser for this response """ return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN def _test_parse_http_response(self, http_response, *args): """ Left here for testing! """ return DocumentParser(http_response) def _parse_http_response_in_worker(self, http_response, hash_string): """ This parses the http_response in a pool worker. This has two features: * We can kill the worker if the parser is taking too long * We can have different parsers :return: The DocumentParser instance """ event = multiprocessing.Event() self._parser_finished_events[hash_string] = event # Start the worker processes if needed self.start_workers() apply_args = (ProcessDocumentParser, http_response, self._processes, hash_string) # Push the task to the workers result = self._pool.apply_async(apply_with_return_error, (apply_args,)) try: parser_output = result.get(timeout=self.PARSER_TIMEOUT) except multiprocessing.TimeoutError: # Near the timeout error, so we make sure that the pid is still # running our "buggy" input pid = self._processes.pop(hash_string, None) if pid is not None: try: os.kill(pid, signal.SIGTERM) except OSError, ose: msg = 'An error occurred while killing the parser' \ ' process: "%s"' om.out.debug(msg % ose) msg = '[timeout] The parser took more than %s seconds'\ ' to complete parsing of "%s", killed it!' om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url())) # Act just like when there is no parser msg = 'There is no parser for "%s".' % http_response.get_url() raise BaseFrameworkException(msg) else:
class ParserCache(object): """ This class is a document parser cache. :author: Andres Riancho ([email protected]) """ LRU_LENGTH = 40 MAX_CACHEABLE_BODY_LEN = 1024 * 1024 PARSER_TIMEOUT = 60 # in seconds DEBUG = False MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1 def __init__(self): self._cache = SynchronizedLRUDict(self.LRU_LENGTH) self._pool = None self._processes = None self._parser_finished_events = {} self._start_lock = threading.RLock() # These are here for debugging: self._archive = set() self._from_LRU = 0.0 self._calculated_more_than_once = 0.0 self._total = 0.0 def start_workers(self): """ Start the pool and workers :return: The pool instance """ with self._start_lock: if self._pool is None: # Keep track of which pid is processing which http response # pylint: disable=E1101 self._processes = manager.dict() # pylint: enable=E1101 # The pool log_queue = om.manager.get_in_queue() self._pool = ProcessPool(self.MAX_WORKERS, maxtasksperchild=25, initializer=init_worker, initargs=(log_queue, )) return self._pool def stop_workers(self): """ Stop the pool workers :return: None """ if self._pool is not None: self._pool.terminate() self._pool = None self._processes = None # We don't need this data anymore self._cache.clear() if self.DEBUG: re_calc_rate = (self._calculated_more_than_once / self._total) print('parser_cache LRU rate: %s' % (self._from_LRU / self._total)) print('parser_cache re-calculation rate: %s' % re_calc_rate) print('parser_cache size: %s' % self.LRU_LENGTH) def get_cache_key(self, http_response): """ Before I used md5, but I realized that it was unnecessary. I experimented a little bit with python's hash functions and the builtin hash was the fastest. At first I thought that the built-in hash wasn't good enough, as it could create collisions... but... given that the LRU has only 40 positions, the real probability of a collision is too low. :return: The key to be used in the cache for storing this http_response """ # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the # body before hashing uri_str = http_response.get_uri().url_string.encode('utf-8') body_str = http_response.body if isinstance(body_str, unicode): body_str = body_str.encode('utf-8', 'replace') _to_hash = body_str + uri_str # Added adler32 after finding some hash() collisions in builds hash_string = str(hash(_to_hash)) hash_string += str(zlib.adler32(_to_hash)) return hash_string def should_cache(self, http_response): """ Defines if this http_response parser should be cached or not :param http_response: The http response instance :return: True if we should cache the parser for this response """ return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN def _test_parse_http_response(self, http_response, *args): """ Left here for testing! """ return DocumentParser(http_response) def _parse_http_response_in_worker(self, http_response, hash_string): """ This parses the http_response in a pool worker. This has two features: * We can kill the worker if the parser is taking too long * We can have different parsers :return: The DocumentParser instance """ event = multiprocessing.Event() self._parser_finished_events[hash_string] = event # Start the worker processes if needed self.start_workers() apply_args = (ProcessDocumentParser, http_response, self._processes, hash_string) # Push the task to the workers result = self._pool.apply_async(apply_with_return_error, (apply_args, )) try: parser_output = result.get(timeout=self.PARSER_TIMEOUT) except multiprocessing.TimeoutError: # Near the timeout error, so we make sure that the pid is still # running our "buggy" input pid = self._processes.pop(hash_string, None) if pid is not None: try: os.kill(pid, signal.SIGTERM) except OSError, ose: msg = 'An error occurred while killing the parser' \ ' process: "%s"' om.out.debug(msg % ose) msg = '[timeout] The parser took more than %s seconds'\ ' to complete parsing of "%s", killed it!' om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url())) # Act just like when there is no parser msg = 'There is no parser for "%s".' % http_response.get_url() raise BaseFrameworkException(msg) else: