import itertools

from six import iteritems
from six.moves.queue import PriorityQueue


def create_huffman_tree(word_counts):
    """Make a Huffman tree from a dictionary containing word counts.

    This method creates a binary Huffman tree that is required for
    :class:`BinaryHierarchicalSoftmax`.
    For example, ``{0: 8, 1: 5, 2: 6, 3: 4}`` is converted to
    ``((3, 1), (2, 0))``.

    Args:
        word_counts (``dict`` of ``int`` key and ``int`` or ``float`` values):
            Dictionary representing counts of words.

    Returns:
        Binary Huffman tree with tuples and keys of ``word_counts``.

    """
    if len(word_counts) == 0:
        raise ValueError('Empty vocabulary')

    q = PriorityQueue()
    # A unique, monotonically increasing tie-breaker keeps entries with equal
    # counts from falling through to comparing a word against a tuple, which
    # raises TypeError on Python 3.
    tie = itertools.count()
    for w, c in iteritems(word_counts):
        q.put((c, next(tie), w))

    while q.qsize() >= 2:
        (count1, _, word1) = q.get()
        (count2, _, word2) = q.get()
        count = count1 + count2
        tree = (word1, word2)
        q.put((count, next(tie), tree))

    return q.get()[2]
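# A quick, self-contained check of the docstring example above (the counts
# contain no ties, so the resulting tree is deterministic):
if __name__ == '__main__':
    tree = create_huffman_tree({0: 8, 1: 5, 2: 6, 3: 4})
    print(tree)  # ((3, 1), (2, 0))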
from queue import PriorityQueue  # Queue on Python 2
from threading import Thread

# `_WorkItem` is a module-level helper (a compatible sketch appears after the
# PushThread variants below); `cursession` and `push` are legacy bokeh
# plotting helpers imported by the surrounding module.


class PushThread(Thread):
    # Define priority constants
    PUSH = 1
    PUT = 2

    def __init__(self):
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                cursession().store_objects(obj)
            elif priority == PushThread.PUSH:
                push()
            # delete queued objects when training has finished
            if obj == "after_training":
                with self.queue.mutex:
                    del self.queue.queue[:]
                break
            self.queue.task_done()
class PushThread(Thread):
    PUSH, PUT = range(2)

    def __init__(self, session, document):
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
            # delete queued objects when training has finished
            if obj == 'after_training':
                with self.queue.mutex:
                    del self.queue.queue[:]
                break
            self.queue.task_done()
class PushThread(Thread):
    PUSH, PUT = range(2)

    def __init__(self, session, document):
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        self.setDaemon(True)

    def put(self, obj, priority):
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        while True:
            # Does a priority queue even make sense here, instead of a simple
            # FIFO? This is a single-producer, single-consumer scenario.
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
            # delete queued objects when training has finished
            if obj == 'after_training':
                with self.queue.mutex:
                    del self.queue.queue[:]
                break
            self.queue.task_done()
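# The PushThread variants above unpack `priority, obj = self.queue.get()`, so
# the module-level _WorkItem they rely on (not shown) must both sort by its
# priority inside PriorityQueue and unpack like a 2-tuple. A minimal sketch of
# one compatible shape -- an assumption, not the original implementation:
import functools


@functools.total_ordering
class _WorkItem(object):
    def __init__(self, priority, obj):
        self.priority = priority
        self.obj = obj

    # Compare on priority only, so the queued objects themselves never need
    # to be orderable.
    def __eq__(self, other):
        return self.priority == other.priority

    def __lt__(self, other):
        return self.priority < other.priority

    # Allows `priority, obj = queue.get()`.
    def __iter__(self):
        return iter((self.priority, self.obj))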
from concurrent.futures import ThreadPoolExecutor
from queue import PriorityQueue  # Queue on Python 2


class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A wrapper class to maintain a pool of connections alongside the thread
    pool. We start by creating a priority queue of connections, and each job
    submitted takes one of those connections (initialising if necessary) and
    passes it as the first arg to the executed function. At the end of
    execution that connection is returned to the queue.

    By using a PriorityQueue we avoid creating more connections than
    required. We will only create as many connections as are required
    concurrently.
    """
    def __init__(self, create_connection, max_workers):
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        def conn_fn():
            priority = None
            conn = None
            try:
                # If we get a connection we must put it back later
                (priority, conn) = self._connections.get()
                if conn is None:
                    conn = self._create_connection()
                conn_args = (conn,) + args
                return fn(*conn_args, **kwargs)
            finally:
                if priority is not None:
                    self._connections.put((priority, conn))
        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
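# Hedged usage sketch for ConnectionThreadPoolExecutor. `make_conn` is a
# hypothetical zero-argument factory standing in for a real connection
# constructor; each submitted callable receives the pooled connection as its
# first argument.
def make_conn():
    return object()  # stand-in for a real connection object


pool = ConnectionThreadPoolExecutor(make_conn, max_workers=4)
future = pool.submit(lambda conn, x: x * 2, 21)
print(future.result())  # 42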
def _get_backfill_events(self, txn, room_id, event_list, limit):
    logger.debug(
        "_get_backfill_events: %s, %s, %s",
        room_id, repr(event_list), limit
    )

    event_results = set()

    # We want to make sure that we do a breadth-first, "depth" ordered
    # search.
    query = (
        "SELECT depth, prev_event_id FROM event_edges"
        " INNER JOIN events"
        " ON prev_event_id = events.event_id"
        " WHERE event_edges.event_id = ?"
        " AND event_edges.is_state = ?"
        " LIMIT ?"
    )

    queue = PriorityQueue()

    for event_id in event_list:
        depth = self._simple_select_one_onecol_txn(
            txn,
            table="events",
            keyvalues={
                "event_id": event_id,
                "room_id": room_id,
            },
            retcol="depth",
            allow_none=True,
        )

        if depth:
            queue.put((-depth, event_id))

    while not queue.empty() and len(event_results) < limit:
        try:
            _, event_id = queue.get_nowait()
        except Empty:
            break

        if event_id in event_results:
            continue

        event_results.add(event_id)

        txn.execute(
            query,
            (event_id, False, limit - len(event_results))
        )

        for row in txn:
            if row[1] not in event_results:
                queue.put((-row[0], row[1]))

    return event_results
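# PriorityQueue is a min-heap, which is why the depths above are negated:
# popping the smallest -depth yields the *deepest* event first. A minimal
# standalone demonstration with hypothetical event IDs:
from queue import PriorityQueue

q = PriorityQueue()
for depth, event_id in [(3, '$a'), (7, '$b'), (5, '$c')]:
    q.put((-depth, event_id))
print(q.get())  # (-7, '$b') -- the deepest event comes out first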
import re

from queue import PriorityQueue  # Queue on Python 2

# LocationReplace is defined elsewhere in this module.


class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """

    HTML_TAG_REGEX = re.compile(r'<[^>]*?>')

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        LocationReplace().location_replace(xml_node, original, replacement,
                                           locations)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is
        HTML aware; it effectively looks at all of the text in between HTML
        tags """
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index:match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(self.text[match.start():match.end()])  # tag
            index = match.end()
        text_chunks.append(self.text[index:])  # trailing text
        self.text = "".join(text_chunks)

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """
        locations.sort()
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, locations)

    def apply_layers(self, original_text):
        self.text = original_text
        while not self.queue.empty():
            priority, layer_element = self.queue.get()
            original, replacement, locations = layer_element
            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)
        return self.text
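# Minimal demonstration of the ordering enqueue() sets up: PriorityQueue is a
# min-heap, so storing -len(original) makes the longest originals dequeue
# first, which is how longer matches get applied before shorter ones.
applier = LayersApplier()
applier.enqueue(('abcdef', 'x', []))
applier.enqueue(('ab', 'y', []))
print(applier.queue.get())  # (-6, ('abcdef', 'x', [])) -- longest first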
def _create_files_list(self):
    priorityQueue = PriorityQueue()
    for txt_file in self._txt_files:
        wav_file = os.path.splitext(txt_file)[0] + ".wav"
        wav_file_size = os.path.getsize(wav_file)
        priorityQueue.put((wav_file_size, (txt_file, wav_file)))

    files_list = []
    while not priorityQueue.empty():
        priority, (txt_file, wav_file) = priorityQueue.get()
        files_list.append((txt_file, wav_file))

    return files_list
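# A hedged, standalone illustration (hypothetical file sizes) of the ordering
# the method above produces: the smallest wav file is popped first, so the
# resulting list is sorted by audio size, ascending.
from queue import PriorityQueue

q = PriorityQueue()
for size, pair in [(300, ('a.txt', 'a.wav')),
                   (100, ('b.txt', 'b.wav')),
                   (200, ('c.txt', 'c.wav'))]:
    q.put((size, pair))
while not q.empty():
    print(q.get())  # (100, ...), then (200, ...), then (300, ...)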
class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A wrapper class to maintain a pool of connections alongside the thread
    pool. We start by creating a priority queue of connections, and each job
    submitted takes one of those connections (initialising if necessary) and
    passes it as the first arg to the executed function. At the end of
    execution that connection is returned to the queue.

    By using a PriorityQueue we avoid creating more connections than
    required. We will only create as many connections as are required
    concurrently.
    """
    def __init__(self, create_connection, max_workers):
        """
        Initializes a new ThreadPoolExecutor instance.

        :param create_connection: callable to use to create new connections
        :param max_workers: the maximum number of threads that can be used
        """
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for p in range(0, max_workers):
            self._connections.put((p, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        """
        Schedules the callable, `fn`, to be executed

        :param fn: the callable to be invoked
        :param args: the positional arguments for the callable
        :param kwargs: the keyword arguments for the callable
        :returns: a Future object representing the execution of the callable
        """
        def conn_fn():
            priority = None
            conn = None
            try:
                # If we get a connection we must put it back later
                (priority, conn) = self._connections.get()
                if conn is None:
                    conn = self._create_connection()
                conn_args = (conn,) + args
                return fn(*conn_args, **kwargs)
            finally:
                if priority is not None:
                    self._connections.put((priority, conn))
        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
class AsyncDataLoader(object):
    """The AsyncDataLoader is a wrapper for asynchronously loading multiple
    batches of data. It keeps a buffer of batches, so when the model asks for
    a new batch, it is already in memory. After sending the batch to the
    model, it is removed from the buffer, and a new batch can be loaded.

    The buffer is filled using a separate thread. Each batch can then be
    loaded using multiple processes or multiple threads. This async batch
    loader is designed for heavy-IO or heavy-CPU batch generation.

    .. warning:: When using the multiprocessing batch loader, watch the RAM
        usage and avoid a high number of processes. Multiprocessing can
        easily lead to memory overflow.

    - Should I use the Async Data Loader?

      Cogitare's DataHolder already provides data loading through multiple
      threads or multiple processes, so you should use this only if the time
      to generate a whole batch is expensive.

    - Should I use threads or processes?

      Threads are recommended: they are lightweight and fast. Multiprocessing
      will usually lead to worse performance and memory usage, due to the
      communication pipe between processes and the extra copying of memory.
      However, it can be useful for CPU-expensive operations, because it does
      not suffer from the GIL. Threads, on the other hand, are lightweight
      and usually fast, but can suffer from the GIL; for tasks with heavy IO,
      they are a good choice.

    Args:
        data (DataSet, AbsDataHolder, SequentialDataSet,
            SequentialAbsDataHolder): data holder, or dataset instance.
        buffer_size (int): size of the batch buffer. The async data loader
            will keep around ``buffer_size`` batches in memory.
        mode (str): should be ``threaded`` or ``multiprocessing``, indicating
            how to fetch batches.
        workers (int): the number of threads/processes used to load the
            batches. If None, will use the number of cores in the CPU.
        on_batch_loaded (callable): if provided, this function will be called
            when a new batch is loaded. It must receive one argument, the
            batch data, and return the batch after applying some operation on
            the data. This can be used to apply pre-processing functions on a
            batch of data (such as image filtering or moving the data to
            another device).

    Example::

        >>> mnist = fetch_mldata('MNIST original')
        >>> mnist.data = mnist.data / 255

        >>> data = DataSet([mnist.data, mnist.target.astype(int)],
        ...                batch_size=64)
        >>> data_train, data_validation = data.split(0.8)

        >>> # wraps the data_train dataset with the async loader.
        >>> data_train = AsyncDataLoader(data_train)

        >>> model.learn(data_train, optimizer)
    """

    def __init__(self, data, buffer_size=8, mode='threaded', workers=None,
                 on_batch_loaded=None):
        valid = ('threaded', 'multiprocessing')
        utils.assert_raise(mode in valid, ValueError,
                           'mode must be one of: ' + ', '.join(valid))
        utils.assert_raise(buffer_size >= 2, ValueError,
                           'buffer_size must be greater or equal to 2')

        if mode == 'threaded':
            self._executor = C.ThreadPoolExecutor(workers)
        else:
            self._executor = C.ProcessPoolExecutor(workers)

        if on_batch_loaded is None:
            on_batch_loaded = _identity

        self._queue = PriorityQueue(buffer_size)
        self._data = data
        self._thread = None
        self._on_batch_loaded = on_batch_loaded
        self._cache_buffer = []
        self._caching = False

    def __repr__(self):
        return repr(self._data)

    def _start(self):
        if self._thread is None:
            self._thread = Thread(target=self._produce)
            self._thread.daemon = True
            self._thread.start()

    def cache(self):
        """Start to load batches into the buffer, and wait until the buffer
        is full.

        This can be used before starting the model training to cache the
        samples and speed up the model execution.

        Example::

            >>> dh = CallableHolder(s.__next__, mode='sequential',
            ...                     total_samples=20000000, single=True)
            >>> dh = AsyncDataLoader(dh, buffer_size=64000, mode='threaded',
            ...                      workers=1)
            >>> print('caching ...')
            >>> dh.cache()
            >>> print('done')
        """
        self._caching = True
        self._cache_buffer = []
        self._start()
        while not self._queue.full():
            time.sleep(0.1)
        while not all(f.done() for f in self._cache_buffer):
            time.sleep(0.1)
        self._caching = False

    def _produce(self):
        idx = 0
        while True:
            future = self._executor.submit(_fetch, self._on_batch_loaded,
                                           self._data)
            self._queue.put((idx, future))
            idx += 1
            if self._caching:
                self._cache_buffer.append(future)

    def __iter__(self):
        return self

    def __next__(self):
        self._start()
        return self._queue.get()[1].result()

    next = __next__

    def __len__(self):
        return len(self._data)
import sys
import threading

import six
from six.moves import xrange
from six.moves.queue import Full, PriorityQueue

# The Executor base class (providing the Future attribute used below) comes
# from the surrounding module.


class ThreadedExecutor(Executor):
    """\
    This executor provides a method of executing callables in a threaded
    worker pool. The number of outstanding requests can be limited by the
    ``maxsize`` parameter, which has the same behavior as the parameter of
    the same name for the ``PriorityQueue`` constructor.

    All threads are daemon threads and will remain alive until the main
    thread exits. Any items remaining in the queue at this point may not be
    executed!
    """

    def __init__(self, worker_count=1, maxsize=0):
        self.__worker_count = worker_count
        self.__workers = set([])
        self.__started = False
        self.__queue = PriorityQueue(maxsize)
        self.__lock = threading.Lock()

    def __worker(self):
        queue = self.__queue
        while True:
            priority, (function, future) = queue.get(True)
            if not future.set_running_or_notify_cancel():
                continue
            try:
                result = function()
            except Exception as e:
                if six.PY3:
                    future.set_exception(e)
                else:
                    future.set_exception_info(*sys.exc_info()[1:])
            else:
                future.set_result(result)
            queue.task_done()

    def start(self):
        with self.__lock:
            if self.__started:
                return

            for i in xrange(self.__worker_count):
                t = threading.Thread(target=self.__worker)
                t.daemon = True
                t.start()
                self.__workers.add(t)

            self.__started = True

    def submit(self, callable, priority=0, block=True, timeout=None):
        """\
        Enqueue a task to be executed, returning a ``TimedFuture``.

        Tasks can be prioritized by providing a value for the ``priority``
        argument, which follows the same specification as the standard
        library ``Queue.PriorityQueue`` (lowest valued entries are retrieved
        first.)

        If the worker pool has not already been started, calling this method
        will cause all of the worker threads to start running.
        """
        if not self.__started:
            self.start()

        future = self.Future()
        task = (priority, (callable, future))
        try:
            self.__queue.put(task, block=block, timeout=timeout)
        except Full as error:
            if future.set_running_or_notify_cancel():
                future.set_exception(error)
        return future
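# Hedged usage sketch for ThreadedExecutor. It assumes the Executor base
# class provides a zero-argument Future constructor (the TimedFuture the
# docstring mentions); lower priority values are dequeued first.
executor = ThreadedExecutor(worker_count=2)
low = executor.submit(lambda: 'low priority', priority=10)
high = executor.submit(lambda: 'high priority', priority=0)
print(high.result(), '/', low.result())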
import logging
import threading

from queue import PriorityQueue  # Queue on Python 2

# WorkerThread and WorkItem are defined elsewhere in this module.


class ThreadPool(object):

    def __init__(self, thread_manager, thread_count=10):
        """Initialization method

        :param thread_manager: the thread manager to use
        :param thread_count: the number of workers to instantiate
        """
        self.logger = logging.getLogger(
            'storj.downstream_farmer.utils.ThreadPool')
        self.tasks = PriorityQueue()
        self.thread_manager = thread_manager
        self.workers = list()
        self.workers_lock = threading.Lock()
        self.max_thread_count = 50
        self.load_minimum = 0.01
        self.load_maximum = 0.5
        # managed monitor thread
        self.monitor_thread = self.thread_manager.create_thread(
            name='MonitorThread',
            target=self._monitor)
        for i in range(0, thread_count):
            self._add_thread()

    def thread_count(self):
        with self.workers_lock:
            return len(self.workers)

    def _add_thread(self):
        # unmanaged worker threads
        if (len(self.workers) < self.max_thread_count):
            self.logger.debug('{0} : adding worker'.format(
                threading.current_thread()))
            worker = WorkerThread(self)
            with self.workers_lock:
                self.workers.append(worker)
            return worker
        else:
            return None

    def _remove_thread(self):
        with self.workers_lock:
            if (len(self.workers) > 1):
                self.logger.debug('{0} : removing worker'.format(
                    threading.current_thread()))
                # make sure to retain one worker
                thread = self.workers.pop()
                thread.stop()

    def calculate_loading(self):
        total_time = 0
        work_time = 0
        with self.workers_lock:
            for w in self.workers:
                total_time += w.load_tracker.total_time()
                work_time += w.load_tracker.work_time()
        if (total_time > 0):
            load = float(work_time) / float(total_time)
        else:
            load = 0
        return load

    def max_load(self):
        max = 0
        with self.workers_lock:
            for w in self.workers:
                load = w.load_tracker.load()
                if (load > max):
                    max = load
        return max

    def check_loading(self):
        self.monitor_thread.wake()

    def _monitor(self):
        """This runs until the thread manager wakes it up during shutdown,
        at which time it will wait for any unfinished work in the queue, and
        then finish, allowing the program to exit
        """
        # wait until shutdown is called
        while (self.thread_manager.running):
            # periodically check the load to see if we should add or remove
            # a worker thread
            load = self.calculate_loading()
            if (load > self.load_maximum):
                worker = self._add_thread()
                if (worker is not None):
                    worker.start()
            elif (load < self.load_minimum):
                self._remove_thread()
            self.thread_manager.sleep(10)
        # wait for any existing work to finish
        self.logger.debug('MonitorThread waiting for tasks to finish')
        self.tasks.join()
        self.logger.debug('MonitorThread finishing')
        # now, managed thread can exit so program can close cleanly

    def put_work(self, target, args=[], kwargs={}, priority=50):
        """Puts work in the work queue.

        :param target: the callable to run
        :param args: positional arguments for the callable
        :param kwargs: keyword arguments for the callable
        :param priority: priority of the work item (lower values are
            retrieved first)
        """
        self.tasks.put(WorkItem(target, args, kwargs, priority))

    def start(self):
        """Starts the thread pool and all its workers and the monitor
        thread
        """
        with self.workers_lock:
            for worker in self.workers:
                worker.start()
        self.monitor_thread.start()
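# Hedged usage sketch for ThreadPool. It assumes an existing `thread_manager`
# object exposing create_thread/running/sleep/wake, plus the module's
# WorkItem, which must order by its priority field inside the PriorityQueue.
pool = ThreadPool(thread_manager, thread_count=4)
pool.start()
pool.put_work(print, args=['hello from the pool'], priority=10)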
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """

    HTML_TAG_REGEX = re.compile(r"<[^>]*?>")

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        LocationReplace().location_replace(xml_node, original, replacement,
                                           locations)

    def unescape_text(self):
        """ Because of the way we do replace_all(), we need to unescape
        HTML entities. """
        self.text = HTMLParser().unescape(self.text)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is
        HTML aware; it effectively looks at all of the text in between HTML
        tags """
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index:match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(self.text[match.start():match.end()])  # tag
            index = match.end()
        text_chunks.append(self.text[index:])  # trailing text
        self.text = "".join(text_chunks)
        self.unescape_text()

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """
        locations.sort()
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, locations)
        self.unescape_text()

    def apply_layers(self, original_text):
        self.text = original_text
        while not self.queue.empty():
            priority, layer_element = self.queue.get()
            original, replacement, locations = layer_element
            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)
        return self.text