Example #1
def run_example():
    # Make an IterableQueue instance
    iq = IterableQueue()

    # Start a bunch of producers, give each one a producer endpoint
    producers = []
    for producer_id in range(NUM_PRODUCERS):
        queue = iq.get_producer()
        p = Process(target=producer_func, args=(queue, producer_id))
        p.start()
        producers.append(p)

    # And start a bunch of consumers
    consumers = []
    for consumer_id in range(NUM_CONSUMERS):

        # Give each consumer a "consumer-queue"
        consumer_endpoint = iq.get_consumer()
        p = Process(target=consumer_func, args=(consumer_endpoint, consumer_id))
        p.start()
        consumers.append(p)

    # Lastly, *this is important*, close the IterableQueue.
    iq.close()    # This indicates no new producer endpoints will be made

    # Wait for workers to finish
    for p in producers + consumers:
        p.join()
def extract_all_features_from_archive(archive_path, log=None):

    start = time.time()

    # First, make an iterable queue.  Extract all the corenlp files from the
    # archive and load them onto it
    fnames_q = IterableQueue()
    fnames_producer = fnames_q.get_producer()
    archive = tarfile.open(archive_path)

    # Extract each member, putting its path and contents on the queue
    try:
        for member in archive:

            # Extract the contents of the corenlp files, putting the text
            # for each file directly onto the queue
            if member.name.endswith('xml'):
                fnames_producer.put(
                    (member.name, archive.extractfile(member).read()))

    # If we encounter corruption in the archive, log or print a warning
    # and proceed with the processing of what was extracted so far.
    except IOError as e:
        message = '%s\tlast file was: %s' % (str(e), member.name)
        if log:
            log.write(message)
        else:
            print(message)
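run_example() at the top of this example hands each endpoint to producer_func and consumer_func, neither of which is shown on this page. A minimal sketch of what they might look like, assuming a NUM_ITEMS constant alongside NUM_PRODUCERS and NUM_CONSUMERS:

def producer_func(queue, producer_id):
    # Put a few items on the shared queue, then close this producer
    # endpoint so consumers know this producer is finished.
    for i in range(NUM_ITEMS):
        queue.put((producer_id, i))
    queue.close()


def consumer_func(queue, consumer_id):
    # Consumer endpoints are iterable; iteration stops once every
    # producer endpoint has been closed and the queue is drained.
    for item in queue:
        print('consumer %d got %r' % (consumer_id, item))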
Example #5
def run_example():
    # Make an IterableQueue instance
    iq = IterableQueue()

    # Start a bunch of producers, give each one a producer endpoint
    producers = []
    for producer_id in range(NUM_PRODUCERS):
        queue = iq.get_producer()
        p = Process(target=producer_func, args=(queue, producer_id))
        p.start()
        producers.append(p)

    # And start a bunch of consumers
    consumers = []
    for consumer_id in range(NUM_CONSUMERS):

        # Give each consumer a "consumer-queue"
        consumer_endpoint = iq.get_consumer()
        p = Process(target=consumer_func,
                    args=(consumer_endpoint, consumer_id))
        p.start()
        consumers.append(p)

    # Lastly, *this is important*, close the IterableQueue.
    iq.close()  # This indicates no new producer endpoints will be made

    # Wait for workers to finish
    for p in producers + consumers:
        p.join()
def extract_all_features(articles_dir, limit=None):

    start = time.time()

    # First, make an iterable queue and load all the article fnames onto it
    fnames_q = IterableQueue()
    fnames_producer = fnames_q.get_producer()
    for fname in get_fnames(articles_dir)[:limit]:
        fnames_producer.put(fname)
    fnames_producer.close()

    # Make a queue to hold feature stats (results), and a consumer to
    # receive completed feature stats objects from workers
    features_q = IterableQueue()
    features_consumer = features_q.get_consumer()

    # Create workers that consume filenames and produce feature counts.
    for p in range(NUM_ARTICLE_LOADING_PROCESSES):
        fnames_consumer = fnames_q.get_consumer()
        features_producer = features_q.get_producer()
        process = Process(target=extract_features_from_articles,
                          args=(fnames_consumer, features_producer, False))
        process.start()

    # Close the iterable queues
    fnames_q.close()
    features_q.close()

    # Accumulate the results.  This blocks until workers are finished
    feature_accumulator = make_feature_accumulator()

    for accumulator in features_consumer:
        feature_accumulator.merge(accumulator)

    elapsed = time.time() - start
    print('elapsed', elapsed)

    return feature_accumulator
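The worker target extract_features_from_articles is not shown above. A minimal sketch of its queue handling; extract_features_from_article is a hypothetical per-article helper, and the name of the third flag is chosen only for illustration:

def extract_features_from_articles(fnames_consumer, features_producer, use_content):
    # Each worker builds its own accumulator over the filenames it receives.
    accumulator = make_feature_accumulator()
    for fname in fnames_consumer:
        # extract_features_from_article stands in for the project's actual
        # per-article feature extraction.
        accumulator.merge(extract_features_from_article(fname, use_content))

    # Hand the finished accumulator back to the parent, then close the
    # producer endpoint so features_consumer can finish iterating.
    features_producer.put(accumulator)
    features_producer.close()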
Example #8
    def generate_dataset_parallel(self, save_dir=None):
        '''
        Parallel version of generate_dataset_serial.  Each worker is 
        responsible for saving its own part of the dataset to disk, called 
        a macrobatch.  The files are saved at
        'save_dir/examples/<batch-num>.npz'.
        '''
        # This cannot be called before calling prepare(), unless a prepared
        # UnigramDictionary was passed to the DatasetReader's constructor

        if not self.is_prepared():
            raise DataSetReaderIllegalStateException(
                "DatasetReader: generate_examples() cannot be called "
                "before prepare() is called unless a prepared "
                "UnigramDictionary has was passed into the "
                "DatasetReader's constructor.")

        # We save dataset in the "examples" subdir of the model_dir
        if save_dir is not None:
            examples_dir = os.path.join(save_dir, 'examples')
            # We are willing to create both the save_dir, and the
            # 'examples' subdir, but not their parents
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            if not os.path.exists(examples_dir):
                os.mkdir(examples_dir)
        else:
            examples_dir = None

        file_queue = IterableQueue()
        macrobatch_queue = IterableQueue(self.max_queue_size)

        # Put all the filenames on a producer queue
        file_producer = file_queue.get_producer()
        for filename in self.generate_filenames():
            file_producer.put(filename)
        file_producer.close()

        # Start a bunch of worker processes
        for process_num in range(self.num_processes):
            # Hop to a new location in the random-number-generator's state
            # chain
            reseed()
            # Start child process that generates a portion of the dataset
            args = (file_queue.get_consumer(), macrobatch_queue.get_producer())
            Process(target=self.generate_dataset_worker, args=args).start()

        # This will receive the macrobatches from all workers
        macrobatch_consumer = macrobatch_queue.get_consumer()

        # Close the iterable queues
        file_queue.close()
        macrobatch_queue.close()

        for signal_macrobatch, noise_macrobatch in macrobatch_consumer:

            if self.verbose:
                print('receiving macrobatch from child process')

            yield signal_macrobatch, noise_macrobatch

        # Explicitly close up macrobatch_consumer, which hopefully fixes the EOFError
        macrobatch_consumer.close()
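The worker target generate_dataset_worker is not shown. A minimal sketch of the endpoint handling it needs, assuming a hypothetical generate_macrobatches_from_file method that yields (signal, noise) pairs:

    def generate_dataset_worker(self, file_consumer, macrobatch_producer):
        # Read filenames until every file producer has closed, emit the
        # resulting macrobatch pairs, then close this producer endpoint so
        # the parent's macrobatch_consumer can terminate.
        for filename in file_consumer:
            for signal, noise in self.generate_macrobatches_from_file(filename):
                macrobatch_producer.put((signal, noise))
        macrobatch_producer.close()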
Example #9
	def generate_dataset_parallel(self, save_dir=None):
		'''
		Parallel version of generate_dataset_serial.  Each worker is 
		responsible for saving its own part of the dataset to disk, called 
		a macrobatch.  The files are saved at
		'save_dir/examples/<batch-num>.npz'.
		'''
		# This cannot be called before calling prepare(), unless a prepared
		# UnigramDictionary was passed to the DatasetReader's constructor

		if not self.prepared:
			raise DataSetReaderIllegalStateException(
				"DatasetReader: generate_examples() cannot be called "
				"before prepare() is called unless a prepared "
				"UnigramDictionary has was passed into the DatasetReader's "
				"constructor."
			)

		# We save dataset in the "examples" subdir of the model_dir
		if save_dir is not None:
			examples_dir = os.path.join(save_dir, 'examples')
			# We are willing to create both the save_dir, and the
			# 'examples' subdir, but not their parents
			if not os.path.exists(save_dir):
				os.mkdir(save_dir)
			if not os.path.exists(examples_dir):
				os.mkdir(examples_dir)
		else:
			examples_dir = None

		file_queue = IterableQueue()
		macrobatch_queue = IterableQueue(self.max_queue_size)

		# Put all the filenames on a producer queue
		file_producer = file_queue.get_producer()
		for filename in self.generate_filenames():
			file_producer.put(filename)
		file_producer.close()

		# Start a bunch of worker processes
		for process_num in range(self.num_processes):
			# Hop to a new location in the random-number-generator's state 
			# chain
			reseed()
			# Start child process that generates a portion of the dataset
			args = (
				file_queue.get_consumer(),
				macrobatch_queue.get_producer()
			)
			Process(target=self.generate_dataset_worker, args=args).start()

		# This will receive the macrobatches from all workers
		macrobatch_consumer = macrobatch_queue.get_consumer()

		# Close the iterable queues
		file_queue.close()
		macrobatch_queue.close()

		# Retrieve the macrobatches from the workers, write them to file
		signal_macrobatches = []
		noise_macrobatches = []
		macrobatch_num = -1
		for signal_macrobatch, noise_macrobatch in macrobatch_consumer:

			if self.verbose:
				print('receiving macrobatch from child process')

			#macrobatch_num += 1
			#if examples_dir is not None:
			#	save_path = os.path.join(
			#		examples_dir, '%d.npz' % macrobatch_num
			#	)
			#	np.savez(
			#		save_path,
			#		signal_examples=signal_macrobatch,
			#		noise_examples=noise_macrobatch
			#	)

			yield signal_macrobatch, noise_macrobatch
Example #10
class BatchQueue:
    """
  Wraps an iterator with a parallel asynchronous mechanism for queuing multiple batches at the same time.
  Implemented as a pool of reusable processes over two iterable threadsafe queues, avoiding process creation
  and setup (e.g. load fasttext) overhead.

  The producer process takes a batch from the base iterator and puts it on `todoq` (producer).
  A worker process takes a batch off `todoq` (consumer).
  The worker process processes the batch and places the result on `doneq` (producer).
  The main process takes a processed batch off `doneq` (consumer).

  To cleanly end iteration prematurely, call close() on the BatchQueue object.
  A usage sketch follows the class definition.
  """
    @staticmethod
    def _worker_loop(todoq, doneq, wid, proc_class, *args, **kwargs):
        device_id = 0
        world_size = 1
        _set_cuda(True, device_id, world_size)

        # setup
        processor = proc_class(*args, **kwargs)  # CustomBatchProcessor

        for raw_batch in todoq:  # raw_batch
            processed_batch = processor.process_batch(raw_batch)  # 2
            doneq.put(pickle.dumps(processed_batch))

        doneq.close()

    def __init__(
            self,
            base_iterator,
            n_batches,
            batch_processor_cls,  # mldc.data.data_handler.CustomBatchProcessor
            enqueue_fn=None,
            n_workers=3,
            qcap=5,
            *args,
            **kwargs):
        LOG.info("BatchQueue: n_workers=%d max queue size=%d", n_workers, qcap)
        self._base_iterator = base_iterator  # mldc.data.data_handler.DataIterator
        self._n_batches = n_batches  # 2
        self._todoq = IterableQueue(qcap)
        self._doneq = IterableQueue(qcap)
        self._enqueue_fn = enqueue_fn  # None
        self._workers = []
        self._end_sigq = Queue()

        # use threading here to avoid pickling, particularly since this process is fairly lightweight.
        self._producer = Thread(target=self.enq_examples_for_workers,
                                args=(
                                    self._todoq.get_producer(),
                                    self._end_sigq,
                                ))  # <Thread(Thread-20, initial)>

        for wid in range(1, n_workers + 1):  # n_workers = 1
            worker_todo = self._todoq.get_consumer()  # iterable_queue.iq.ConsumerQueue
            worker_done = self._doneq.get_producer()  # iterable_queue.iq.ProducerQueue
            w = mp.Process(target=BatchQueue._worker_loop,
                           args=(worker_todo, worker_done, wid,
                                 batch_processor_cls, *args),
                           kwargs=kwargs)
            w.start()
            self._workers.append(w)

        self._main_done = self._doneq.get_consumer()
        self._producer.start()

        self._todoq.close()
        self._doneq.close()

    def enq_examples_for_workers(self, todo_queue, end_queue):
        print('enq_examples_for_workers first')
        # type(batch): MetaBatch; len(batch) == 1; len(batch[0]) == 2 (support, target);
        # each element of batch[0] is a Batch with keys 'seq_word_feat', 'orig_text',
        # 'neg_orig_text', 'dlg_len', 'dlg_id', 'domain_id', 'task_id',
        # 'neg_seq_word_feat', 'index', 'out_tokens'.
        for bid, batch in enumerate(self._base_iterator):
            if self._enqueue_fn:
                batch = self._enqueue_fn(batch)  # 2 Batch (support, target)
            while True:
                try:
                    todo_queue.put(batch, block=True, timeout=1)
                    break
                except Full:
                    # try again, but before that check whether stop was requested
                    time.sleep(0)  # yield control to other threads for now
                    pass
                finally:
                    # stop putting stuff in the queue if end signaled
                    if not end_queue.empty():
                        todo_queue.close()
                        return
        todo_queue.close()

    def close(self):
        """ Note: must be called explicitly since putting this in `__del__` doesn't work."""
        # stop generating data
        self._end_sigq.put("stop")

        # Drain the queue
        for _ in self._main_done:
            pass

        # note this cannot be done before draining
        self._producer.join()

    def __iter__(self):
        for item in self._main_done:
            temp = pickle.loads(item)
            yield temp

    def __len__(self):
        return self._n_batches
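A minimal usage sketch of the class above; base_iter, n_batches and MyBatchProcessor stand in for whatever iterator, batch count and batch-processor class the surrounding code provides:

bq = BatchQueue(base_iter, n_batches, MyBatchProcessor, n_workers=2, qcap=5)
for processed_batch in bq:
    # Each item is the unpickled result of a worker's process_batch() call.
    handle(processed_batch)  # placeholder for downstream use
bq.close()  # must be called explicitly; see close() above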
Example #11
	def get_async_batch_iterator(self):
		'''
		Builds an asynchronous minibatching pipeline, which reads all
		dataset files, parses them, generates training examples, and
		packages those training examples into minibatches.  Finally,
		it yields an iterable of minibatches, taking the form of an
		IterableQueue.ConsumerQueue.

		(no Inputs)

		OUTPUTS
		* [iterable (IterableQueue.ConsumerQueue)]: Iterable of
			minibatches.
		'''

		# TODO: currently the only randomness in minibatching comes from
		# the signal context and noise contexts that are drawn for a
		# given entity query tuple.  But the entity query tuples are read
		# deterministically in order through the corpus.  Ideally, examples
		# should be totally shuffled.


		file_queue = IterableQueue()
		example_queue = IterableQueue()
		minibatch_queue = IterableQueue()

		# Fill the file queue
		file_producer = file_queue.get_producer()
		for filename in self.generate_filenames():
			file_producer.put(filename)
		file_producer.close()

		# Make processes that process the files and put examples onto
		# the example queue
		for i in range(self.num_example_generators):

			# These calls to np.random are a hack to ensure that each
			# child example-generating process gets different randomness
			#reseed()
			Process(target=self.generate_examples_async, args=(
				file_queue.get_consumer(),
				example_queue.get_producer()
			)).start()

		# Make a process that batches the examples and puts minibatches onto
		# the minibatch queue
		Process(target=self.generate_minibatches_async, args=(
			example_queue.get_consumer(),
			minibatch_queue.get_producer()
		)).start()

		# Before closing the queues, make a consumer that will be used for
		# yielding minibatches to the external call for iteration.
		self.minibatch_consumer = minibatch_queue.get_consumer()

		# Close all queues
		file_queue.close()
		example_queue.close()
		minibatch_queue.close()

		# Return the minibatch_consumer as the iterator
		return self.minibatch_consumer
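The first pipeline stage, generate_examples_async, does not appear on this page. A minimal sketch of the queue discipline it needs, assuming a hypothetical generate_examples_from_file method:

	def generate_examples_async(self, file_consumer, example_producer):
		# Consume filenames until all file producers are closed, push the
		# examples parsed from each file, then close this producer endpoint
		# so downstream consumers can finish.
		for filename in file_consumer:
			for example in self.generate_examples_from_file(filename):
				example_producer.put(example)
		example_producer.close()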
Example #12
	def __iter__(self):
		'''
		Builds an asynchronous minibatching pipeline, which reads all
		dataset files, parses them, generates training examples, and 
		packages those training examples into minibatches.  Finally, 
		it yields an iterable of minibatches, taking the form of an 
		IterableQueue.ConsumerQueue.

		(no Inputs)

		OUTPUTS
		* [iterable (IterableQueue.ConsumerQueue)]: Iterable of 
			minibatches.
		'''

		# TODO: currently the only randomness in minibatching comes from
		# the signal context and noise contexts that are drawn for a 
		# given entity query tuple.  But the entity query tuples are read
		# deterministically in order through the corpus.  Ideally, examples
		# should be totally shuffled.

		file_queue = IterableQueue()
		example_queue = IterableQueue()
		minibatch_queue = IterableQueue()

		# Fill the file queue
		file_producer = file_queue.get_producer()
		for filename in self.generate_filenames():
			file_producer.put(filename)
		file_producer.close()

		# Make processes that process the files and put examples onto
		# the example queue
		for i in range(self.num_example_generators):
			Process(target=self.generate_examples_async, args=(
				file_queue.get_consumer(),
				example_queue.get_producer()
			)).start()
			
		# Make a process that batches the examples and puts minibatches onto
		# the minibatch queue
		Process(target=self.generate_minibatches_async, args=(
			example_queue.get_consumer(),
			minibatch_queue.get_producer()
		)).start()

		# Before closing the queues, make a consumer that will be used for 
		# yielding minibatches to the external call for iteration.
		self.minibatch_consumer = minibatch_queue.get_consumer()

		# Close all queues
		file_queue.close()
		example_queue.close()
		minibatch_queue.close()

		# This is necessary because accessing randomness in the child 
		# processes doesn't advance the random state here in the parent
		# process, which would mean that the exact same minibatch sequence
		# would be generated on subsequent calls to `__iter__()`, which
		# is not desired.  The simplest solution is to advance the 
		# random state by sampling randomness once.
		np.random.uniform()

		# Return the minibatch_consumer as the iterator
		return self.minibatch_consumer
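The batching stage, generate_minibatches_async, is likewise not shown. A minimal sketch, assuming a batch_size attribute and that a plain list of examples is an acceptable minibatch:

	def generate_minibatches_async(self, example_consumer, minibatch_producer):
		# Group incoming examples into fixed-size minibatches, flush any
		# partial batch at the end, then close the producer endpoint.
		minibatch = []
		for example in example_consumer:
			minibatch.append(example)
			if len(minibatch) == self.batch_size:
				minibatch_producer.put(minibatch)
				minibatch = []
		if minibatch:
			minibatch_producer.put(minibatch)
		minibatch_producer.close()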
Example #13
def fit(dictionary=None,
        files=[],
        dirs=[],
        match='',
        skip='$.^',
        batch_size=1000,
        num_topics=DEFAULT_NUM_TOPICS,
        time_range=None,
        alpha=None,
        beta=0.1,
        num_procs=NUM_PROCS,
        read=None,
        num_docs=None,
        min_frequency=5,
        num_epochs=100):

    # If we don't have the number of documents or a dictionary, then
    # run over the full dataset once to accumulate that information.
    if dictionary is None or num_docs is None or time_range is None:
        dictionary, num_docs, found_time_range = (get_corpus_stats(
            files=files,
            dirs=dirs,
            match=match,
            skip=skip,
            batch_size=batch_size,
            num_procs=num_procs,
            read=read,
            stopwords=STOPWORDS,
            min_frequency=min_frequency))

    if time_range is None:
        time_range = found_time_range

    if alpha is None:
        alpha = 1.

    total_docs = sum(num_docs)
    proc_doc_indices = [sum(num_docs[:i]) for i in range(len(num_docs) + 1)]

    m = np.ones((total_docs, num_topics))
    n = np.ones((len(dictionary), num_topics))

    psi = np.ones((num_topics, 2))

    #TODO: move worker creation outside of the epoch -- keep same worker pool
    # between epochs.  Workers can receive updates about m and n etc. over the
    # queue.
    for epoch in range(num_epochs):

        # Show progress
        print(float(epoch) / num_epochs * 100)

        # Pre-calculate the denominator in the sum of the probability dist
        n_denom = (n + beta).sum(axis=0) - 1
        B = np.array([beta_func(*psi_vals) for psi_vals in psi])
        denom = n_denom * B

        # The workers should calculate probabilities and then sample, producing
        # updates to m and n.
        updates_queue = IterableQueue()
        ctx = mp.get_context("spawn")
        for proc_num in range(num_procs):

            # Advance the randomness so children don't all get same seed
            np.random.random()

            doc_iterator = DocumentIterator(
                read=read,
                files=files,
                dirs=dirs,
                match=match,
                skip=skip,
                batch_size=batch_size,
                fold='%s/%s' % (proc_num, num_procs),
            )
            m_slice = m[proc_doc_indices[proc_num]:proc_doc_indices[proc_num + 1]]

            p = ctx.Process(target=worker,
                            args=(proc_num, doc_iterator, dictionary,
                                  num_topics, time_range, alpha, beta, psi,
                                  n, m_slice, denom,
                                  updates_queue.get_producer()))
            p.start()

        updates_consumer = updates_queue.get_consumer()
        updates_queue.close()

        # Update m, n, and psi
        n = np.zeros((len(dictionary), num_topics))
        m = np.zeros((total_docs, num_topics))
        psi_updates = [[] for i in range(num_topics)]
        for proc_num, m_update, n_update, psi_update in updates_consumer:
            n += n_update
            start_idx = proc_doc_indices[proc_num]
            stop_idx = proc_doc_indices[proc_num + 1]
            m[start_idx:stop_idx] = m_update
            for i in range(num_topics):
                psi_updates[i].extend(psi_update[i])

        # Update psi
        for i in range(num_topics):
            psi[i] = fit_psi(psi_updates[i])

    return m, n, psi, dictionary
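fit() expects each worker to send back a (proc_num, m_update, n_update, psi_update) tuple on the producer endpoint it receives and then close it. A minimal skeleton of that contract; the actual sampling pass over doc_iterator is deliberately omitted:

import numpy as np

def worker(proc_num, doc_iterator, dictionary, num_topics, time_range,
           alpha, beta, psi, n, m_slice, denom, updates_producer):
    # Shapes chosen to match how fit() merges the updates.
    m_update = np.zeros_like(m_slice)
    n_update = np.zeros_like(n)
    psi_update = [[] for _ in range(num_topics)]

    # ... the sampling pass that fills m_update, n_update and psi_update
    # from doc_iterator goes here ...

    updates_producer.put((proc_num, m_update, n_update, psi_update))
    updates_producer.close()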
Example #14
def get_corpus_stats(files=[],
                     dirs=[],
                     match='',
                     skip='$.^',
                     batch_size=1000,
                     num_procs=NUM_PROCS,
                     read=None,
                     stopwords=STOPWORDS,
                     min_frequency=5):
    """
    Build a dictionary by running through the dataset fully.
    Prune it back according to min_frequency.  Ignore the given stopwords.
    This dictionary facilitates the conversion between tokens and integers.
    """

    # Start many workers.  Each will make a dictionary over a subset of the
    # documents.  They return their dictionaries over a queue.
    worker_dictionary_queue = IterableQueue()
    worker_num_docs_queue = IterableQueue()
    worker_time_range_queue = IterableQueue()
    ctx = mp.get_context('spawn')
    for proc_num in range(num_procs):
        doc_iterator = DocumentIterator(
            read=read,
            files=files,
            dirs=dirs,
            match=match,
            skip=skip,
            batch_size=batch_size,
            fold='%s/%s' % (proc_num, num_procs),
        )
        args = (
            proc_num,
            doc_iterator,
            worker_dictionary_queue.get_producer(),
            worker_num_docs_queue.get_producer(),
            worker_time_range_queue.get_producer(),
            stopwords,
        )
        p = ctx.Process(target=dictionary_worker, args=args)
        p.start()

    # Collect the workers' dictionaries into one.
    worker_dictionary_consumer = worker_dictionary_queue.get_consumer()
    worker_dictionary_queue.close()
    dictionary = UnigramDictionary()
    for worker_dictionary in worker_dictionary_consumer:
        dictionary.add_dictionary(worker_dictionary)

    # Prune rare words from the dictionary.
    dictionary.prune(min_frequency)

    # Get the number of documents for each process
    worker_num_docs_consumer = worker_num_docs_queue.get_consumer()
    worker_num_docs_queue.close()
    num_docs = [count for proc_num, count in sorted(worker_num_docs_consumer)]

    # Get time range for all documents
    worker_time_range_consumer = worker_time_range_queue.get_consumer()
    worker_time_range_queue.close()

    minimum_t = 999999999999
    maximum_t = 0
    for min_time, max_time in worker_time_range_consumer:
        minimum_t = min(min_time, minimum_t)
        maximum_t = max(max_time, maximum_t)

    # Buffer by 1% on both sides to ensure a nonzero chance for each document
    time_difference = maximum_t - minimum_t
    wiggle_room = time_difference / 100
    time_range = (minimum_t - wiggle_room, maximum_t + wiggle_room)

    # Return the completed, pruned dictionary, and time range.
    return dictionary, num_docs, time_range
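get_corpus_stats expects dictionary_worker to put exactly one item on each of its three producer endpoints and then close them. A minimal sketch of that contract; doc_tokens and doc_time are hypothetical accessors, and the add() call assumes UnigramDictionary can count single tokens (only add_dictionary() and prune() appear above):

def dictionary_worker(proc_num, doc_iterator, dictionary_producer,
                      num_docs_producer, time_range_producer, stopwords):
    dictionary = UnigramDictionary()
    num_docs = 0
    min_time, max_time = float('inf'), 0
    for doc in doc_iterator:
        num_docs += 1
        for token in doc_tokens(doc):            # hypothetical accessor
            if token not in stopwords:
                dictionary.add(token)            # assumed single-token API
        min_time = min(min_time, doc_time(doc))  # hypothetical accessor
        max_time = max(max_time, doc_time(doc))

    # One item per queue, then close so the parent's consumers can finish.
    dictionary_producer.put(dictionary)
    num_docs_producer.put((proc_num, num_docs))
    time_range_producer.put((min_time, max_time))
    dictionary_producer.close()
    num_docs_producer.close()
    time_range_producer.close()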