def create_huffman_tree(word_counts):
    """Make a huffman tree from a dictionary containing word counts.

    This method creates a binary huffman tree, that is required for
    :class:`BinaryHierarchicalSoftmax`.
    For example, ``{0: 8, 1: 5, 2: 6, 3: 4}`` is converted to
    ``((3, 1), (2, 0))``.

    Entries with equal counts are ordered by the position of their (first)
    word in ``word_counts``, so the result is deterministic for a given
    iteration order of the dictionary.

    Args:
        word_counts (``dict`` of ``int`` key and ``int`` or ``float`` values.):
            Dictionary representing counts of words.

    Returns:
        Binary huffman tree with tuples and keys of ``word_counts``. When the
        dictionary holds a single word, that word itself is returned.

    Raises:
        ValueError: If ``word_counts`` is empty.

    """
    if not word_counts:
        raise ValueError('Empty vocabulary')

    q = PriorityQueue()
    # Each entry carries a unique id between the count and the payload.
    # Without it, two entries with the same count would be ordered by
    # comparing their payloads, and comparing a word with a subtree (a tuple)
    # raises TypeError on Python 3.
    for uid, (w, c) in enumerate(word_counts.items()):
        q.put((c, uid, w))

    while q.qsize() >= 2:
        (count1, id1, word1) = q.get()
        (count2, id2, word2) = q.get()
        count = count1 + count2
        tree = (word1, word2)
        # Both ids have just left the queue, so the smaller one is still
        # unique among the remaining entries.
        q.put((count, min(id1, id2), tree))

    return q.get()[2]
class PushThread(Thread):
    """Daemon thread that forwards queued work items to a session.

    Items are taken from a priority queue and must unpack into
    ``(priority, obj)``:

    * ``PUT``  -> ``session.store_objects(obj)``
    * ``PUSH`` -> ``session.store_document(document)``

    An item whose ``obj`` equals ``'after_training'`` discards everything that
    is still queued and ends the thread.
    """

    PUSH, PUT = range(2)

    def __init__(self, session, document):
        """Remember ``session`` and ``document`` and create the work queue."""
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        # ``setDaemon`` is deprecated; the attribute has the same effect.
        self.daemon = True

    def put(self, obj, priority):
        """Queue ``obj`` for processing with the given ``priority``."""
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        """Process queued items until the ``'after_training'`` item arrives."""
        while True:
            # does it even make sense to have a priority que?
            # (instead of a simple FIFO, I mean we have a single-producer single-consumer
            # scenario)
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
            # delete queued objects when training has finished
            if obj == 'after_training':
                with self.queue.mutex:
                    del self.queue.queue[:]
                    # The discarded items (and this one) will never get a
                    # task_done() call, so settle the accounting here;
                    # otherwise queue.join() would block forever.
                    self.queue.unfinished_tasks = 0
                    self.queue.all_tasks_done.notify_all()
                break
            self.queue.task_done()
class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    Thread pool that pairs every submitted job with a pooled connection.

    The pool is a priority queue holding one numbered slot per worker. A job
    borrows the lowest-numbered slot, creates the slot's connection on first
    use, receives that connection as its first positional argument and gives
    the slot back when it finishes. Because low slots are always preferred,
    connections are only created for as many slots as are ever in use at the
    same time.
    """

    def __init__(self, create_connection, max_workers):
        self._create_connection = create_connection
        self._connections = PriorityQueue()
        # One empty slot per worker; the connection is created lazily.
        for slot in range(max_workers):
            self._connections.put((slot, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        def conn_fn():
            # Once a slot has been taken it must always be handed back.
            slot, conn = self._connections.get()
            try:
                if conn is None:
                    conn = self._create_connection()
                return fn(conn, *args, **kwargs)
            finally:
                self._connections.put((slot, conn))

        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
class PushThread(Thread):
    """Worker thread that pushes queued items to ``session`` in the background.

    ``run`` repeatedly takes a ``(priority, obj)`` item from the priority
    queue. ``PUT`` items are passed to ``session.store_objects``; ``PUSH``
    items trigger ``session.store_document(document)``. The thread stops, and
    drops whatever is still queued, once it sees ``obj == 'after_training'``.
    """

    PUSH, PUT = range(2)

    def __init__(self, session, document):
        """Store the collaborators and set up an empty work queue."""
        self.session = session
        self.document = document
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        # Attribute form of the deprecated ``setDaemon(True)``.
        self.daemon = True

    def put(self, obj, priority):
        """Add ``obj`` to the work queue under ``priority``."""
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        """Consume the queue until the ``'after_training'`` marker is seen."""
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                self.session.store_objects(obj)
            elif priority == PushThread.PUSH:
                self.session.store_document(self.document)
            # delete queued objects when training has finished
            if obj == 'after_training':
                with self.queue.mutex:
                    del self.queue.queue[:]
                    # Nothing will call task_done() for the dropped items or
                    # for this marker, so clear the counter and release any
                    # thread waiting in queue.join().
                    self.queue.unfinished_tasks = 0
                    self.queue.all_tasks_done.notify_all()
                break
            self.queue.task_done()
class PushThread(Thread):
    """Daemon thread that processes queued push/put requests.

    Each queued item must unpack into ``(priority, obj)``. ``PUT`` items are
    handed to ``cursession().store_objects``; ``PUSH`` items call ``push()``.
    An item whose ``obj`` equals ``"after_training"`` empties the queue and
    terminates the thread.
    """

    # Define priority constants
    PUSH = 1
    PUT = 2

    def __init__(self):
        """Create the work queue and mark the thread as a daemon."""
        super(PushThread, self).__init__()
        self.queue = PriorityQueue()
        # ``setDaemon`` is deprecated in favour of the attribute.
        self.daemon = True

    def put(self, obj, priority):
        """Queue ``obj`` with the given ``priority``."""
        self.queue.put(_WorkItem(priority, obj))

    def run(self):
        """Handle queued items until ``"after_training"`` is received."""
        while True:
            priority, obj = self.queue.get()
            if priority == PushThread.PUT:
                cursession().store_objects(obj)
            elif priority == PushThread.PUSH:
                push()
            # delete queued objects when training has finished
            if obj == "after_training":
                with self.queue.mutex:
                    del self.queue.queue[:]
                    # The removed items and this marker never reach
                    # task_done(); reset the counter so queue.join() callers
                    # are released instead of waiting forever.
                    self.queue.unfinished_tasks = 0
                    self.queue.all_tasks_done.notify_all()
                break
            self.queue.task_done()
class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    A thread pool that also manages a pool of connections.

    A priority queue is filled with one placeholder per worker. Each job takes
    the first placeholder from the queue, turns it into a real connection if
    that has not happened yet, and is called with the connection as its first
    argument. The entry goes back into the queue when the job ends. Since the
    queue always yields its lowest entry, no more connections are created than
    are needed at the same time.
    """

    def __init__(self, create_connection, max_workers):
        self._connections = PriorityQueue()
        self._create_connection = create_connection
        for index in range(max_workers):
            self._connections.put((index, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        return super(ConnectionThreadPoolExecutor, self).submit(
            self._run_with_connection, fn, args, kwargs)

    def _run_with_connection(self, fn, args, kwargs):
        """Call ``fn`` with a borrowed connection prepended to ``args``."""
        priority = None
        conn = None
        try:
            # A connection that was taken must be put back afterwards.
            priority, conn = self._connections.get()
            if conn is None:
                conn = self._create_connection()
            return fn(conn, *args, **kwargs)
        finally:
            if priority is not None:
                self._connections.put((priority, conn))
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """

    HTML_TAG_REGEX = re.compile(r'<[^>]*?>')

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        """ Enqueue every layer element of elements_list. """
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        """ Queue an (original, replacement, locations) triple. Elements with
        a longer original are applied first. """
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        # The queue yields the smallest entry first, hence the negation.
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        """ Delegate to LocationReplace.location_replace for xml_node. """
        LocationReplace().location_replace(xml_node, original, replacement,
                                           locations)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is HTML
        aware; tags themselves are left untouched and only the text outside
        of them (before, between and after tags) is changed. """
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index:match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(match.group(0))  # tag
            index = match.end()
        # The text after the last tag (or the whole text when there is no
        # tag at all) needs the replacement as well.
        text_chunks.append(self.text[index:].replace(original, replacement))
        self.text = "".join(text_chunks)

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """
        # Sort a copy so the caller's list is not reordered as a side effect.
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, sorted(locations))

    def apply_layers(self, original_text):
        """ Apply every queued element to original_text and return the
        result. Elements without locations are replaced everywhere. """
        self.text = original_text

        while not self.queue.empty():
            _, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text
def _create_files_list(self): priorityQueue = PriorityQueue() for txt_file in self._txt_files: wav_file = os.path.splitext(txt_file)[0] + ".wav" wav_file_size = os.path.getsize(wav_file) priorityQueue.put((wav_file_size, (txt_file, wav_file))) files_list = [] while not priorityQueue.empty(): priority, (txt_file, wav_file) = priorityQueue.get() files_list.append((txt_file, wav_file)) return files_list
class ConnectionThreadPoolExecutor(ThreadPoolExecutor):
    """
    Thread pool executor that keeps a set of connections next to its threads.

    Connections live in a priority queue that starts out with one empty entry
    per worker. A submitted job picks an entry from the queue, fills it with a
    new connection if it is still empty, and runs with that connection as its
    first argument; afterwards the entry is queued again. The priority queue
    makes jobs prefer the lowest entries, so connections are only created up
    to the number that is actually needed concurrently.
    """

    def __init__(self, create_connection, max_workers):
        """
        Set up the connection queue and the underlying thread pool.

        :param create_connection: callable that returns a new connection
        :param max_workers: the maximum number of threads, and therefore of
                            connections, in use at once
        """
        self._create_connection = create_connection
        self._connections = PriorityQueue()
        for number in range(max_workers):
            self._connections.put((number, None))
        super(ConnectionThreadPoolExecutor, self).__init__(max_workers)

    def submit(self, fn, *args, **kwargs):
        """
        Schedule ``fn`` to run with a pooled connection as first argument.

        :param fn: the callable to invoke as ``fn(conn, *args, **kwargs)``
        :param args: further positional arguments for the callable
        :param kwargs: keyword arguments for the callable
        :returns: a Future object representing the execution of the callable
        """
        def conn_fn():
            number = connection = None
            try:
                # From here on the entry has to be returned to the queue.
                number, connection = self._connections.get()
                if connection is None:
                    connection = self._create_connection()
                return fn(connection, *args, **kwargs)
            finally:
                if number is not None:
                    self._connections.put((number, connection))

        return super(ConnectionThreadPoolExecutor, self).submit(conn_fn)
class AsyncDataLoader(object):
    """The AsyncDataLoader is a wrapper to asynchronously load multiple batches of data.

    It keeps a buffer of batches, so when the model asks for a new batch, it's already in memory.
    After sending the batch to the model, it is removed from the buffer, and a new batch can be loaded.

    The buffer is filled using a separate thread. Then, each batch can be loaded using multiple
    processes, or multiple threads.

    This async batch loader is designed for heavy IO or heavy CPU batch generation.

    .. warning:: When using the multiprocessing batch loader, watch the ram usage, and avoid a high
        number of processes. Multiprocessing can easily lead to memory overflow.

    - Should I use the Async Data Loader ?

      If you check the Cogitare's DataHolder, it already provides the execution of the data loading
      through multiple threads or multiple processes. So you should use this if the time to generate
      a whole batch is expensive.

    - Should I use threads or processes ?

      It's recommended to use threads, they are lightweight and fast.

      Multiple processing usually will lead to a worse performance and memory usage, due to the
      communication pipe between processes and due to the extension sharing of the memory. However,
      it can be useful for CPU expensive operations, because will not suffer from GIL.

      Threads, in the other way, are lightweight and usually fast, but can suffer from GIL. For tasks
      with heavy IO, it is a good choice.

    Args:
        data (DataSet, AbsDataHolder, SequentialDataSet, SequentialAbsDataHolder): data holder, or
            dataset instance.
        buffer_size (int): size of the batch buffer. The async data loader will keep around
            ``buffer_size`` batches in memory.
        mode (str): should be ``threaded`` or ``multiprocessing``, indicating how to fetch batches.
        workers (int): the number of threads/processes used to load the batches. If None, will use
            the number of cores in the CPU.
        on_batch_loaded (callable): if provided, this function will be called when a new batch is
            loaded. It must receive one argument, the batch data. And return the batch after applying
            some operation on the data. This can be used to apply pre-processing functions on a batch
            of data (such as image filtering or moving the data to a GPU).

    Example::

        >>> mnist = fetch_mldata('MNIST original')
        >>> mnist.data = mnist.data / 255
        >>> data = DataSet([mnist.data, mnist.target.astype(int)], batch_size=64)
        >>> data_train, data_validation = data.split(0.8)

        >>> # wraps the data_train dataset with the async loader.
        >>> data_train = AsyncDataLoader(data_train)

        >>> model.learn(data_train, optimizer)
    """

    def __init__(self, data, buffer_size=8, mode='threaded', workers=None, on_batch_loaded=None):
        valid = ('threaded', 'multiprocessing', )
        utils.assert_raise(mode in valid, ValueError, 'mode must be one of: ' + ', '.join(valid))
        utils.assert_raise(buffer_size >= 2, ValueError, 'buffer_size must be greater or equal to 2')

        if mode == 'threaded':
            self._executor = C.ThreadPoolExecutor(workers)
        else:
            self._executor = C.ProcessPoolExecutor(workers)

        if on_batch_loaded is None:
            on_batch_loaded = _identity

        # Bounded queue of (sequence number, future); the sequence number
        # keeps batches in submission order.
        self._queue = PriorityQueue(buffer_size)
        self._data = data
        self._thread = None
        self._on_batch_loaded = on_batch_loaded
        # Futures submitted while cache() is waiting; empty otherwise.
        self._cache_buffer = []
        self._caching = False

    def __repr__(self):
        return repr(self._data)

    def _start(self):
        """Start the producer thread on first use; later calls do nothing."""
        if self._thread is None:
            self._thread = Thread(target=self._produce)
            self._thread.daemon = True
            self._thread.start()

    def cache(self):
        """Start to load batches to buffer, and wait the buffer be full.

        This can be used before start the model training to cache the samples and speed up the
        model execution.

        Example::

            >>> dh = CallableHolder(s.__next__, mode='sequential', total_samples=20000000, single=True)
            >>> dh = AsyncDataLoader(dh, buffer_size=64000, mode='threaded', workers=1)
            >>> print('caching ...')
            >>> dh.cache()
            >>> print('done')
        """
        self._cache_buffer = []
        self._caching = True
        self._start()

        # Wait until the producer has filled the queue ...
        while not self._queue.full():
            time.sleep(0.1)

        # ... and until every batch submitted meanwhile has been loaded.
        while not all(f.done() for f in self._cache_buffer):
            time.sleep(0.1)

        self._caching = False
        # Drop the references: the futures are only needed while waiting, and
        # keeping them would hold every cached batch in memory even after it
        # has been consumed.
        self._cache_buffer = []

    def _produce(self):
        """Producer loop: submit batch loads and queue the resulting futures."""
        idx = 0
        while True:
            future = self._executor.submit(_fetch, self._on_batch_loaded, self._data)

            # Register the future before the put, which blocks while the
            # queue is full. Otherwise cache() could see a full queue before
            # the newest future has been recorded and return too early.
            if self._caching:
                self._cache_buffer.append(future)

            self._queue.put((idx, future))
            idx += 1

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next batch, blocking until it has been loaded."""
        self._start()
        return self._queue.get()[1].result()

    next = __next__

    def __len__(self):
        return len(self._data)
class LayersApplier(object):
    """ Most layers replace content. We try to do this intelligently here,
    so that layers don't step over each other. """

    HTML_TAG_REGEX = re.compile(r"<[^>]*?>")

    def __init__(self):
        self.queue = PriorityQueue()
        self.text = None

    def enqueue_from_list(self, elements_list):
        """ Enqueue every layer element of elements_list. """
        for le in elements_list:
            self.enqueue(le)

    def enqueue(self, layer_element):
        """ Queue an (original, replacement, locations) triple. Elements with
        a longer original are applied first. """
        original, replacement, locations = layer_element
        priority = len(original)
        item = (original, replacement, locations)
        # The queue yields the smallest entry first, hence the negation.
        self.queue.put((-priority, item))

    def location_replace(self, xml_node, original, replacement, locations):
        """ Delegate to LocationReplace.location_replace for xml_node. """
        LocationReplace().location_replace(xml_node, original, replacement, locations)

    def unescape_text(self):
        """ Because of the way we do replace_all(), we need to unescape HTML
        entities. """
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
        # its replacement. Fall back to the old method where the html module
        # does not provide it.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape
        self.text = unescape(self.text)

    def replace_all(self, original, replacement):
        """ Replace all occurrences of original with replacement. This is HTML
        aware; tags themselves are left untouched and only the text outside
        of them (before, between and after tags) is changed. """
        text_chunks = []
        index = 0
        for match in self.HTML_TAG_REGEX.finditer(self.text):
            text = self.text[index : match.start()]
            text_chunks.append(text.replace(original, replacement))
            text_chunks.append(match.group(0))  # tag
            index = match.end()
        # The text after the last tag (or the whole text when there is no
        # tag at all) needs the replacement as well.
        text_chunks.append(self.text[index:].replace(original, replacement))
        self.text = "".join(text_chunks)
        self.unescape_text()

    def replace_at(self, original, replacement, locations):
        """ Replace the occurrences of original at all the locations with
        replacement. """
        # Sort a copy so the caller's list is not reordered as a side effect.
        self.text = LocationReplace().location_replace_text(
            self.text, original, replacement, sorted(locations)
        )
        self.unescape_text()

    def apply_layers(self, original_text):
        """ Apply every queued element to original_text and return the
        result. Elements without locations are replaced everywhere. """
        self.text = original_text

        while not self.queue.empty():
            _, layer_element = self.queue.get()
            original, replacement, locations = layer_element

            if not locations:
                self.replace_all(original, replacement)
            else:
                self.replace_at(original, replacement, locations)

        return self.text