class OrderedCachedQueue(Queue.Queue, QueueSpeedMeasurement):
    """
    This queue implements all the features explained in CachedQueue (see
    cached_queue.py) plus it will order the items in the queue as they are
    inserted.

    The queue is ordered by a unique identifier that is returned by the
    object being added. If the object is None, then it is added to the
    end of the queue.

    The goal of this ordered queue is to impose an order in which URLs and
    forms identified by the w3af framework are processed by the plugins.
    Since plugins are run in threads, the order in which new URLs are added
    to the queue is "completely random" and depends on HTTP response times,
    CPU-load, memory swapping, etc.
    """

    # 'f' * 32 sorts after any real MD5 hex digest in alphanumeric order,
    # so items mapped to this hash always end up at the tail of the queue
    LAST_MD5_HASH = 'f' * 32

    def __init__(self, maxsize=0, name='Unknown'):
        """
        :param maxsize: The max number of items to keep in memory, the rest
                        are written to the DiskDict
        :param name: Queue name, used for logging and as DiskDict table
                     prefix
        """
        self.name = name
        self.max_in_memory = maxsize
        self.processed_tasks = 0

        QueueSpeedMeasurement.__init__(self)

        # These four attributes are initialized in _init(), which is called
        # by Queue.Queue.__init__() below
        self.queue_order = None
        self.hash_to_uuid = None
        self.memory = None
        self.disk = None

        # We want to send zero to the maxsize of the Queue implementation
        # here because we can write an infinite number of items. But keep
        # in mind that we don't really use the queue storage in any way
        Queue.Queue.__init__(self, maxsize=0)

    def get_name(self):
        """:return: The queue name as set in __init__"""
        return self.name

    def get_processed_tasks(self):
        """:return: The number of items read via get() so far"""
        return self.processed_tasks

    def next_item_saved_to_memory(self):
        """:return: True if the next put() will store the item in memory"""
        return len(self.memory) < self.max_in_memory

    def _init(self, maxsize):
        """
        Initialize the dicts and pointer

        :param maxsize: The max size for the queue (unused, the in-memory
                        limit is self.max_in_memory)
        """
        self.queue_order = list()
        self.hash_to_uuid = dict()
        self.memory = dict()
        self.disk = DiskDict(table_prefix='%sCachedQueue' % self.name)

    def _qsize(self, _len=len):
        # Total queue size is memory plus disk items
        return _len(self.memory) + _len(self.disk)

    def _get_class_name(self, obj):
        """:return: The class name of obj, used only for debug logging"""
        try:
            return obj.__class__.__name__
        except Exception:
            # A bare `except:` was used here before; narrowed to Exception
            # so KeyboardInterrupt / SystemExit are not swallowed
            return type(obj)

    def _get_hash(self, item):
        """
        :return: The hash used to sort `item` in queue_order. None and the
                 poison pill sort last.
        """
        if item is None or item == POISON_PILL:
            # Return ffff...ffff which is the latest (in alphanumeric order)
            # hash that exists in MD5. This forces the None item to be
            # placed at the end of the queue.
            #
            # Warning! If FuzzableRequest.get_hash() ever changes its
            # implementation this will stop working as expected!
            return self.LAST_MD5_HASH

        return item.get_hash()

    def _put(self, item):
        """
        Put a new item in the queue
        """
        #
        # This is very useful information for finding bottlenecks in the
        # framework / strategy
        #
        if len(self.memory) == self.max_in_memory:
            #
            # If you see many messages like this in the scan log, then you
            # might want to experiment with a larger maxsize for this queue
            #
            msg = ('OrderedCachedQueue.put() will write a %r item to the %s'
                   ' DiskDict. This uses more CPU and disk IO than storing'
                   ' in memory but will avoid high memory usage issues. The'
                   ' current %s DiskDict size is %s.')
            args = (self._get_class_name(item),
                    self.get_name(),
                    self.get_name(),
                    len(self.disk))
            om.out.debug(msg % args)

        #
        # Get the item hash to store it in the queue order list, and insert
        # it using bisect.insort() that will keep the order at a low cost
        #
        item_hash = self._get_hash(item)
        bisect.insort(self.queue_order, item_hash)

        #
        # Keep an in-memory dict that allows us to find the fuzzable requests
        # in the other dictionaries
        #
        unique_id = str(uuid.uuid4())

        unique_id_list = self.hash_to_uuid.setdefault(item_hash, [])
        bisect.insort(unique_id_list, unique_id)

        #
        # And now we just save the item to memory (if there is space) or
        # disk (if it doesn't fit on memory)
        #
        if len(self.memory) < self.max_in_memory:
            self.memory[unique_id] = item
        else:
            self.disk[unique_id] = item

        self._item_added_to_queue()

    def _get(self):
        """
        Get an item from the queue
        """
        # queue_order is kept sorted by _put(), so index 0 is the smallest
        # hash, i.e. the next item in order
        item_hash = self.queue_order.pop(0)
        unique_id_list = self.hash_to_uuid.pop(item_hash)
        unique_id = unique_id_list.pop(0)

        if unique_id_list:
            #
            # There are still items in this unique_id_list, this is most likely
            # because two items with the same hash were added to the queue, and
            # only one of those has been read.
            #
            # Need to add the other item(s) to the list again
            #
            bisect.insort(self.queue_order, item_hash)
            self.hash_to_uuid[item_hash] = unique_id_list

        try:
            item = self.memory.pop(unique_id)
        except KeyError:
            item = self.disk.pop(unique_id)

        if len(self.disk):
            #
            # If you see many messages like this in the scan log, then you
            # might want to experiment with a larger maxsize for this queue
            #
            msg = ('OrderedCachedQueue.get() from %s DiskDict was used to'
                   ' read an item from disk. The current %s DiskDict'
                   ' size is %s.')
            args = (self.get_name(), self.get_name(), len(self.disk))
            om.out.debug(msg % args)

        self._item_left_queue()
        self.processed_tasks += 1
        return item

    def join(self):
        """
        Blocks until all items in the Queue have been read and processed.

        The count of unfinished tasks goes up whenever an item is added to
        the queue. The count goes down whenever a consumer thread calls
        task_done() to indicate the item was retrieved and all work on it
        is complete.

        When the count of unfinished tasks drops to zero, join() unblocks.
        """
        msg = 'Called join on %s with %s unfinished tasks'
        args = (self.name, self.unfinished_tasks)
        om.out.debug(msg % args)

        self.all_tasks_done.acquire()
        try:
            while self.unfinished_tasks:
                result = self.all_tasks_done.wait(timeout=5)

                # On Python 2 Condition.wait() always returns None, so this
                # logs progress every ~5 seconds while tasks remain
                if result is None:
                    msg = 'Still have %s unfinished tasks in %s join()'
                    args = (self.unfinished_tasks, self.name)
                    om.out.debug(msg % args)
        finally:
            self.all_tasks_done.release()
class CachedQueue(Queue.Queue, QueueSpeedMeasurement):
    """
    The framework uses the producer / consumer design pattern extensively.
    In order to avoid high memory usage in the queues connecting the
    different parts of the framework we defined a max size.

    When a queue max size is reached, one or more threads will block. This
    line is printed during a real scan:

        Thread blocked 5.76617312431 seconds waiting for Queue.put() to
        have space in the Grep queue. The queue's maxsize is 20.

    In the case of the Grep consumer / producer the problem with a block
    is increased by the fact that HTTP responses won't reach other parts
    of the framework until the queue has space.

    Increasing the queue size would increase memory usage.

    Using an on-disk queue would increase CPU (serialization) and disk IO.

    The CacheQueue is a mix of in-memory and on-disk queue. The first N
    items are stored in memory, when more items are put() we just write
    them to disk.

    The CacheQueue object implements these methods from
    QueueSpeedMeasurement:
        * get_input_rpm
        * get_output_rpm

    Which allows users to understand how fast a queue is moving.
    """

    def __init__(self, maxsize=0, name='Unknown'):
        """
        :param maxsize: The max number of items to keep in memory, the rest
                        are written to the DiskDict
        :param name: Queue name, used for logging and as DiskDict table
                     prefix
        """
        self.name = name
        self.max_in_memory = maxsize

        QueueSpeedMeasurement.__init__(self)

        # These are initialized in _init(), which is called by
        # Queue.Queue.__init__() below. Declared here for clarity and for
        # consistency with OrderedCachedQueue.__init__
        self.memory = None
        self.disk = None
        self.get_pointer = 0
        self.put_pointer = 0

        # We want to send zero to the maxsize of the Queue implementation
        # here because we can write an infinite number of items
        Queue.Queue.__init__(self, maxsize=0)

    def get_name(self):
        """:return: The queue name as set in __init__"""
        return self.name

    def next_item_saved_to_memory(self):
        """:return: True if the next put() will store the item in memory"""
        return len(self.memory) < self.max_in_memory

    def _init(self, maxsize):
        """
        Initialize the dicts and pointer

        :param maxsize: The max size for the queue (unused, the in-memory
                        limit is self.max_in_memory)
        """
        self.memory = dict()
        self.disk = DiskDict(table_prefix='%sCachedQueue' % self.name)
        self.get_pointer = 0
        self.put_pointer = 0

    def _qsize(self, _len=len):
        # Renamed the default-arg from `len` to `_len` to avoid shadowing
        # the builtin, and for consistency with OrderedCachedQueue._qsize
        return _len(self.memory) + _len(self.disk)

    def _get_class_name(self, obj):
        """:return: The class name of obj, used only for debug logging"""
        try:
            return obj.__class__.__name__
        except Exception:
            # A bare `except:` was used here before; narrowed to Exception
            # so KeyboardInterrupt / SystemExit are not swallowed
            return type(obj)

    def _put(self, item):
        """
        Put a new item in the queue
        """
        #
        # This is very useful information for finding bottlenecks in the
        # framework / strategy
        #
        if len(self.memory) == self.max_in_memory:
            #
            # If you see many messages like this in the scan log, then you
            # might want to experiment with a larger maxsize for this queue
            #
            msg = ('CachedQueue.put() will write a %r item to the %s DiskDict.'
                   ' This uses more CPU and disk IO than storing in memory'
                   ' but will avoid high memory usage issues. The current'
                   ' %s DiskDict size is %s.')
            args = (self._get_class_name(item),
                    self.get_name(),
                    self.get_name(),
                    len(self.disk))
            om.out.debug(msg % args)

        #
        # And now we just save the item to memory (if there is space) or
        # disk (if it doesn't fit on memory)
        #
        if len(self.memory) < self.max_in_memory:
            self.memory[self.put_pointer] = item
        else:
            self.disk[self.put_pointer] = item

        self.put_pointer += 1
        self._item_added_to_queue()

    def _get(self):
        """
        Get an item from the queue
        """
        # Items are stored (in both dicts) under a monotonically increasing
        # pointer, so popping get_pointer preserves FIFO order
        try:
            item = self.memory.pop(self.get_pointer)
        except KeyError:
            item = self.disk.pop(self.get_pointer)

        if len(self.disk):
            #
            # If you see many messages like this in the scan log, then you
            # might want to experiment with a larger maxsize for this queue
            #
            msg = ('CachedQueue.get() from %s DiskDict was used to read an'
                   ' item from disk. The current %s DiskDict size is %s.')
            args = (self.get_name(), self.get_name(), len(self.disk))
            om.out.debug(msg % args)

        self._item_left_queue()
        self.get_pointer += 1
        return item
class CachedDiskDict(object):
    """
    This data structure keeps the `max_in_memory` most frequently accessed
    keys in memory and stores the rest on disk.

    It is ideal for situations where a DiskDict is frequently accessed,
    fast read / writes are required, and items can take considerable
    amounts of memory.
    """

    # Sentinel used by get() to detect "no default supplied". The previous
    # magic value (-456) was compared with `is`, which is undefined for
    # ints outside CPython's small-int cache and also collided with a
    # legitimate -456 default
    _MISSING = object()

    def __init__(self, max_in_memory=50, table_prefix=None):
        """
        :param max_in_memory: The max number of items to keep in memory
        :param table_prefix: Optional prefix for the DiskDict table name
        """
        assert max_in_memory > 0, 'In-memory items must be > 0'

        table_prefix = self._get_table_prefix(table_prefix)

        self._max_in_memory = max_in_memory
        self._disk_dict = DiskDict(table_prefix=table_prefix)
        self._in_memory = dict()
        self._access_count = Counter()

    def cleanup(self):
        # Remove the on-disk storage
        self._disk_dict.cleanup()

    def _get_table_prefix(self, table_prefix):
        """
        :return: A table prefix made unique with a random suffix, so that
                 multiple CachedDiskDict instances never clash
        """
        if table_prefix is None:
            table_prefix = 'cached_disk_dict_%s' % rand_alpha(16)
        else:
            args = (table_prefix, rand_alpha(16))
            table_prefix = 'cached_disk_dict_%s_%s' % args

        return table_prefix

    def get(self, key, default=_MISSING):
        """
        :param key: The key to read
        :param default: Value to return when the key is missing; when no
                        default is supplied a KeyError is raised instead
        """
        try:
            return self[key]
        except KeyError:
            if default is not self._MISSING:
                return default

            # Re-raise the original KeyError (which carries the key name)
            # instead of masking it with an empty KeyError()
            raise

    def __getitem__(self, key):
        try:
            value = self._in_memory[key]
        except KeyError:
            # This will raise KeyError if the key is not found, and that is
            # OK because we don't need to increase the access count when
            # the key doesn't exist
            value = self._disk_dict[key]

        self._increase_access_count(key)
        return value

    def _get_keys_for_memory(self):
        """
        :return: The keys that should be kept in memory. For example, if
                 `max_in_memory` is set to 2 and:
                    _in_memory: {1: None, 2: None}
                    _access_count: {1: 10, 2: 20, 3: 5}
                    _disk_dict: {3: None}

                 Then the method will return [1, 2].
        """
        return [k for k, v in self._access_count.most_common(self._max_in_memory)]

    def _increase_access_count(self, key):
        self._access_count.update([key])

        # Compute the target key set once and reuse it for both moves
        keys_for_memory = self._get_keys_for_memory()

        self._move_key_to_disk_if_needed(keys_for_memory)
        self._move_key_to_memory_if_needed(key, keys_for_memory)

    def _move_key_to_disk_if_needed(self, keys_for_memory):
        """
        Analyzes the current access counts and checks if any of the keys in
        memory should be moved to disk. At most one key is moved per call.

        :param keys_for_memory: The keys that should be in memory
        :return: The name of the key that was moved to disk, or None if
                 all the keys are still in memory.
        """
        # Iterate over a copy: _in_memory is mutated inside the loop
        for key in list(self._in_memory):

            if key in keys_for_memory:
                continue

            try:
                value = self._in_memory.pop(key)
            except KeyError:
                # Another thread removed the key first
                return
            else:
                self._disk_dict[key] = value
                return key

    def _move_key_to_memory_if_needed(self, key, keys_for_memory):
        """
        Analyzes the current access counts and checks if the last accessed
        key should be moved from disk to memory.

        :param key: The key that was last accessed
        :param keys_for_memory: The keys that should be in memory
        :return: The name of the key that was moved to memory, or None if
                 all the keys are still on disk.
        """
        # The key is already in memory, nothing to do here
        if key in self._in_memory:
            return

        # The key must not be in memory, nothing to do here
        if key not in keys_for_memory:
            return

        try:
            value = self._disk_dict.pop(key)
        except KeyError:
            # The key exists nowhere, or another thread removed it
            return
        else:
            self._in_memory[key] = value
            return key

    def __setitem__(self, key, value):
        if key in self._in_memory:
            # Update in place; writing to disk here would leave a stale
            # in-memory copy shadowing the new value
            self._in_memory[key] = value

        elif len(self._in_memory) < self._max_in_memory:
            self._in_memory[key] = value

        else:
            self._disk_dict[key] = value

        self._increase_access_count(key)

    def __delitem__(self, key):
        try:
            del self._in_memory[key]
        except KeyError:
            # This will raise KeyError if the key is not found, and that is
            # OK because we don't need to increase the access count when
            # the key doesn't exist
            del self._disk_dict[key]

        try:
            del self._access_count[key]
        except KeyError:
            # Another thread removed this key
            pass

    def __contains__(self, key):
        if key in self._in_memory:
            self._increase_access_count(key)
            return True

        if key in self._disk_dict:
            self._increase_access_count(key)
            return True

        return False

    def __iter__(self):
        """
        Decided not to increase the access count when iterating through the
        items. In most cases the iteration will be performed on all items,
        thus increasing the access count +1 for each, which will leave all
        access counts +1, forcing no movements between memory and disk.
        """
        for key in self._in_memory:
            yield key

        for key in self._disk_dict:
            yield key

    def iteritems(self):
        # Python 2 style iteration, yields memory items first then disk
        for key, value in self._in_memory.iteritems():
            yield key, value

        for key, value in self._disk_dict.iteritems():
            yield key, value
class OrderedCachedQueue(Queue.Queue, QueueSpeedMeasurement):
    """
    This queue implements all the features explained in CachedQueue (see
    cached_queue.py) plus it will order the items in the queue as they are
    inserted.

    The queue is ordered by a unique identifier that is returned by the
    object being added. If the object is None, then it is added to the
    end of the queue.

    The goal of this ordered queue is to impose an order in which URLs and
    forms identified by the w3af framework are processed by the plugins.
    Since plugins are run in threads, the order in which new URLs are added
    to the queue is "completely random" and depends on HTTP response times,
    CPU-load, memory swapping, etc.

    NOTE(review): this class duplicates the earlier OrderedCachedQueue
    definition in this file but omits join(); looks like a copy/merge
    artifact — confirm which definition should survive.
    """

    # 'f' * 32 sorts after any real MD5 hex digest in alphanumeric order,
    # so items mapped to this hash end up at the tail of the queue
    LAST_MD5_HASH = 'f' * 32

    def __init__(self, maxsize=0, name='Unknown'):
        # maxsize is the in-memory item limit; items beyond it go to disk
        self.name = name
        self.max_in_memory = maxsize
        self.processed_tasks = 0

        QueueSpeedMeasurement.__init__(self)

        # These four attributes are initialized in _init(), which is called
        # by Queue.Queue.__init__() below
        self.queue_order = None
        self.hash_to_uuid = None
        self.memory = None
        self.disk = None

        # We want to send zero to the maxsize of the Queue implementation
        # here because we can write an infinite number of items. But keep
        # in mind that we don't really use the queue storage in any way
        Queue.Queue.__init__(self, maxsize=0)

    def get_name(self):
        """:return: The queue name as set in __init__"""
        return self.name

    def get_processed_tasks(self):
        """:return: The number of items read via get() so far"""
        return self.processed_tasks

    def next_item_saved_to_memory(self):
        """:return: True if the next put() will store the item in memory"""
        return len(self.memory) < self.max_in_memory

    def _init(self, maxsize):
        """
        Initialize the dicts and pointer

        :param maxsize: The max size for the queue
        """
        self.queue_order = list()
        self.hash_to_uuid = dict()
        self.memory = dict()
        self.disk = DiskDict(table_prefix='%sCachedQueue' % self.name)

    def _qsize(self, _len=len):
        # Total queue size is memory plus disk items
        return _len(self.memory) + _len(self.disk)

    def _get_class_name(self, obj):
        # Used only for debug logging; falls back to type() for objects
        # without a usable __class__
        try:
            return obj.__class__.__name__
        except:
            return type(obj)

    def _get_hash(self, item):
        # Hash used to sort `item` in queue_order; None and the poison
        # pill sort last
        if item is None or item == POISON_PILL:
            # Return ffff...ffff which is the latest (in alphanumeric order)
            # hash that exists in MD5. This forces the None item to be
            # placed at the end of the queue.
            #
            # Warning! If FuzzableRequest.get_hash() ever changes its
            # implementation this will stop working as expected!
            return self.LAST_MD5_HASH

        return item.get_hash()

    def _put(self, item):
        """
        Put a new item in the queue
        """
        #
        # This is very useful information for finding bottlenecks in the
        # framework / strategy
        #
        if len(self.memory) == self.max_in_memory:
            #
            # If you see many messages like this in the scan log, then you
            # might want to experiment with a larger maxsize for this queue
            #
            msg = ('OrderedCachedQueue.put() will write a %r item to the %s'
                   ' DiskDict. This uses more CPU and disk IO than storing'
                   ' in memory but will avoid high memory usage issues. The'
                   ' current %s DiskDict size is %s.')
            args = (self._get_class_name(item),
                    self.get_name(),
                    self.get_name(),
                    len(self.disk))
            om.out.debug(msg % args)

        #
        # Get the item hash to store it in the queue order list, and insert
        # it using bisect.insort() that will keep the order at a low cost
        #
        item_hash = self._get_hash(item)
        bisect.insort(self.queue_order, item_hash)

        #
        # Keep an in-memory dict that allows us to find the fuzzable requests
        # in the other dictionaries
        #
        unique_id = str(uuid.uuid4())

        unique_id_list = self.hash_to_uuid.setdefault(item_hash, [])
        bisect.insort(unique_id_list, unique_id)

        #
        # And now we just save the item to memory (if there is space) or
        # disk (if it doesn't fit on memory)
        #
        if len(self.memory) < self.max_in_memory:
            self.memory[unique_id] = item
        else:
            self.disk[unique_id] = item

        self._item_added_to_queue()

    def _get(self):
        """
        Get an item from the queue
        """
        # queue_order is kept sorted by _put(), so index 0 is the smallest
        # hash, i.e. the next item in order
        item_hash = self.queue_order.pop(0)
        unique_id_list = self.hash_to_uuid.pop(item_hash)
        unique_id = unique_id_list.pop(0)

        if unique_id_list:
            #
            # There are still items in this unique_id_list, this is most likely
            # because two items with the same hash were added to the queue, and
            # only one of those has been read.
            #
            # Need to add the other item(s) to the list again
            #
            bisect.insort(self.queue_order, item_hash)
            self.hash_to_uuid[item_hash] = unique_id_list

        try:
            item = self.memory.pop(unique_id)
        except KeyError:
            item = self.disk.pop(unique_id)

        if len(self.disk):
            #
            # If you see many messages like this in the scan log, then you
            # might want to experiment with a larger maxsize for this queue
            #
            msg = ('OrderedCachedQueue.get() from %s DiskDict was used to'
                   ' read an item from disk. The current %s DiskDict'
                   ' size is %s.')
            args = (self.get_name(), self.get_name(), len(self.disk))
            om.out.debug(msg % args)

        self._item_left_queue()
        self.processed_tasks += 1
        return item
class CachedDiskDict(object):
    """
    This data structure keeps the `max_in_memory` most frequently accessed
    keys in memory and stores the rest on disk.

    It is ideal for situations where a DiskDict is frequently accessed,
    fast read / writes are required, and items can take considerable
    amounts of memory.

    NOTE(review): this is an older variant of the Counter-based
    CachedDiskDict defined earlier in this file — confirm which definition
    should survive.
    """

    # Sentinel used by get() to detect "no default supplied". The previous
    # magic value (-456) was compared with `is`, which is undefined for
    # ints outside CPython's small-int cache and also collided with a
    # legitimate -456 default
    _MISSING = object()

    def __init__(self, max_in_memory=50, table_prefix=None):
        """
        :param max_in_memory: The max number of items to keep in memory
        :param table_prefix: Optional prefix for the DiskDict table name
        """
        assert max_in_memory > 0, 'In-memory items must be > 0'

        table_prefix = self._get_table_prefix(table_prefix)

        self._max_in_memory = max_in_memory
        self._disk_dict = DiskDict(table_prefix=table_prefix)
        self._in_memory = dict()
        self._access_count = dict()

    def cleanup(self):
        # Remove the on-disk storage
        self._disk_dict.cleanup()

    def _get_table_prefix(self, table_prefix):
        """
        :return: A table prefix made unique with a random suffix, so that
                 multiple CachedDiskDict instances never clash
        """
        if table_prefix is None:
            table_prefix = 'cached_disk_dict_%s' % rand_alpha(16)
        else:
            args = (table_prefix, rand_alpha(16))
            table_prefix = 'cached_disk_dict_%s_%s' % args

        return table_prefix

    def get(self, key, default=_MISSING):
        """
        :param key: The key to read
        :param default: Value to return when the key is missing; when no
                        default is supplied a KeyError is raised instead
        """
        try:
            return self[key]
        except KeyError:
            if default is not self._MISSING:
                return default

            # Re-raise the original KeyError (which carries the key name)
            # instead of masking it with an empty KeyError()
            raise

    def __getitem__(self, key):
        try:
            value = self._in_memory[key]
        except KeyError:
            # This will raise KeyError if the key is not found, and that is
            # OK because we don't need to increase the access count when
            # the key doesn't exist
            value = self._disk_dict[key]

        self._increase_access_count(key)
        return value

    def _get_keys_for_memory(self):
        """
        :return: Generate the names of the keys that should be kept in
                 memory. For example, if `max_in_memory` is set to 2 and:
                    _in_memory: {1: None, 2: None}
                    _access_count: {1: 10, 2: 20, 3: 5}
                    _disk_dict: {3: None}

                 Then the method will generate [1, 2].
        """
        # Sort (key, count) pairs by access count, most accessed first.
        # This replaces the Python 2-only list.sort(cmp_function) / xrange
        # form with an equivalent, portable one
        by_count = sorted(self._access_count.items(),
                          key=lambda kv: kv[1],
                          reverse=True)

        for key, _ in by_count[:self._max_in_memory]:
            yield key

    def _belongs_in_memory(self, key):
        """
        :param key: A key
        :return: True if the key should be stored in memory
        """
        return key in self._get_keys_for_memory()

    def _increase_access_count(self, key):
        access_count = self._access_count.get(key, 0)
        access_count += 1
        self._access_count[key] = access_count

        self._move_key_to_disk_if_needed(key)
        self._move_key_to_memory_if_needed(key)

    def _move_key_to_disk_if_needed(self, key):
        """
        Analyzes the current access counts and checks if any of the keys in
        memory should be moved to disk. At most one key is moved per call.

        :param key: The key that was last accessed (kept for signature
                    compatibility; the decision is based on the whole
                    access count table)
        :return: The name of the key that was moved to disk, or None if
                 all the keys are still in memory.
        """
        # Compute the target set once instead of re-sorting the access
        # counts for every key in memory (was quadratic per access)
        keys_for_memory = set(self._get_keys_for_memory())

        # Iterate over a copy because the dict is mutated inside the loop.
        # The loop variable no longer shadows the `key` parameter
        for mem_key in list(self._in_memory):

            if mem_key in keys_for_memory:
                continue

            try:
                value = self._in_memory[mem_key]
            except KeyError:
                # Another thread removed this key
                return None
            else:
                self._disk_dict[mem_key] = value
                self._in_memory.pop(mem_key, None)
                return mem_key

    def _move_key_to_memory_if_needed(self, key):
        """
        Analyzes the current access counts and checks if the last accessed
        key should be moved from disk to memory.

        :param key: The key that was last accessed
        :return: The name of the key that was moved to memory, or None if
                 all the keys are still on disk.
        """
        key_belongs_in_memory = self._belongs_in_memory(key)

        if not key_belongs_in_memory:
            return None

        try:
            value = self._disk_dict[key]
        except KeyError:
            # The key is not on disk (most likely it already lives in
            # memory), nothing to move
            return None
        else:
            self._in_memory[key] = value
            self._disk_dict.pop(key, None)
            return key

    def __setitem__(self, key, value):
        if key in self._in_memory:
            # Update in place. The previous implementation skipped this
            # check: with a full memory dict it wrote the new value to
            # disk, leaving a stale in-memory copy shadowing it
            self._in_memory[key] = value

        elif len(self._in_memory) < self._max_in_memory:
            self._in_memory[key] = value

        else:
            self._disk_dict[key] = value

        self._increase_access_count(key)

    def __delitem__(self, key):
        try:
            del self._in_memory[key]
        except KeyError:
            # This will raise KeyError if the key is not found, and that is
            # OK because we don't need to increase the access count when
            # the key doesn't exist
            del self._disk_dict[key]

        try:
            del self._access_count[key]
        except KeyError:
            # Another thread removed this key
            pass

    def __contains__(self, key):
        if key in self._in_memory:
            self._increase_access_count(key)
            return True

        if key in self._disk_dict:
            self._increase_access_count(key)
            return True

        return False

    def __iter__(self):
        """
        Decided not to increase the access count when iterating through the
        items. In most cases the iteration will be performed on all items,
        thus increasing the access count +1 for each, which will leave all
        access counts +1, forcing no movements between memory and disk.
        """
        for key in self._in_memory:
            yield key

        for key in self._disk_dict:
            yield key