def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
                  raw_word_count, total_words, trained_word_count, elapsed):
    if total_examples:
        # examples-based progress %
        logger.info(
            "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
            cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
            utils.qsize(job_queue), utils.qsize(progress_queue)
        )
    else:
        # words-based progress %
        logger.info(
            "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
            cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
            utils.qsize(job_queue), utils.qsize(progress_queue)
        )
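# A standalone sketch (not part of the model class) of the two progress figures
# _log_progress chooses between; the helper name and sample counts below are
# hypothetical and exist only for illustration.
def progress_percent(example_count, total_examples, raw_word_count, total_words):
    """Mirror _log_progress: prefer examples-based %, else fall back to raw-word %."""
    if total_examples:
        return 100.0 * example_count / total_examples
    return 100.0 * raw_word_count / total_words

# e.g. progress_percent(1500, 10000, 0, None)     -> 15.0  (examples mode)
#      progress_percent(0, None, 250000, 1000000) -> 25.0  (words mode)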
def train(self, train_pairs, total_ptrees=None, ptree_count=0,
          total_examples=None, queue_factor=2, report_delay=1.0):
    """
    An example is a pair of a complete AST tree and an utterance.
    A ptree is a partial tree.
    """
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension not loaded; training will be slow.")
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1 + NEG_SAMPLING_VOCAB_SIZE_THRESHOLD)
        self.neg_labels[0] = 1.

    logger.info("training model with %i workers", self.workers)

    if not self.vocab_i or not self.vocab_k or not self.vocab_l or not self.vocab_r:
        raise RuntimeError("you must first build vocabulary before training the model")
    if not hasattr(self, 'syn0i') or not hasattr(self, 'syn0k') or not hasattr(self, 'syn0l'):
        raise RuntimeError("you must first finalize vocabulary before training the model")

    if total_ptrees is None and total_examples is None:
        if self.corpus_count:
            total_examples = self.corpus_count
            logger.info(
                "expecting %i train pairs, matching count from corpus used for vocabulary survey",
                total_examples)
        else:
            raise ValueError(
                "you must provide either total_ptrees or total_examples, "
                "to enable alpha and progress calculations")

    job_tally = 0

    if self.iter > 1:
        train_pairs = gsutils.RepeatCorpusNTimes(train_pairs, self.iter)
        total_ptrees = total_ptrees and total_ptrees * self.iter
        total_examples = total_examples and total_examples * self.iter

    def worker_loop():
        """Train the model, lifting lists of train_pairs from the job_queue."""
        # per-thread private work memory - useless in numpy implementation
        work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
        neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
        jobs_processed = 0
        while True:
            job = job_queue.get()
            if job is None:
                progress_queue.put(None)
                break  # no more jobs => quit this worker
            train_pairs, alpha = job
            tally, raw_tally = self._do_train_job(train_pairs, alpha, (work, neu1))
            progress_queue.put((len(train_pairs), tally, raw_tally))  # report back progress
            jobs_processed += 1
        logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
        """Fill jobs queue using the input `train_pairs` iterator."""
        job_batch, batch_size = [], 0
        pushed_ptrees, pushed_examples = 0, 0
        next_alpha = self.alpha
        job_no = 0

        for train_pair in train_pairs:
            train_pair_length = self._raw_ptree_count([train_pair])

            # can we fit this train_pair into the existing job batch?
            if batch_size + train_pair_length <= self.batch_ptrees:
                # yes => add it to the current job
                job_batch.append(train_pair)
                batch_size += train_pair_length
            else:
                # no => submit the existing job
                logger.debug(
                    "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                    job_no, batch_size, len(job_batch), next_alpha)
                job_no += 1
                job_queue.put((job_batch, next_alpha))

                # update the learning rate for the next job
                if self.min_alpha < next_alpha:
                    if total_examples:
                        # examples-based decay
                        pushed_examples += len(job_batch)
                        progress = 1.0 * pushed_examples / total_examples
                    else:
                        # ptrees-based decay
                        pushed_ptrees += self._raw_ptree_count(job_batch)
                        progress = 1.0 * pushed_ptrees / total_ptrees
                    next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
                    next_alpha = max(self.min_alpha, next_alpha)

                # add the train_pair that didn't fit as the first item of a new job
                job_batch, batch_size = [train_pair], train_pair_length

        # add the last job too (may be significantly smaller than batch_ptrees)
        if job_batch:
            logger.debug(
                "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                job_no, batch_size, len(job_batch), next_alpha)
            job_no += 1
            job_queue.put((job_batch, next_alpha))

        if job_no == 0 and self.train_count == 0:
            logger.warning(
                "train() called with an empty iterator (if not intended, "
                "be sure to provide a corpus that offers restartable "
                "iteration = an iterable).")

        # give the workers heads up that they can finish -- no more work!
        for _ in xrange(self.workers):
            job_queue.put(None)
        logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't
    # simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    example_count, trained_ptree_count, raw_ptree_count = 0, 0, ptree_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
        report = progress_queue.get()  # blocks if workers too slow
        if report is None:  # a thread reporting that it finished
            unfinished_worker_count -= 1
            logger.info(
                "worker thread finished; awaiting finish of %i more threads",
                unfinished_worker_count)
            continue
        examples, trained_ptrees, raw_ptrees = report
        job_tally += 1

        # update progress stats
        example_count += examples
        trained_ptree_count += trained_ptrees  # only ptrees in vocab & sampled
        raw_ptree_count += raw_ptrees

        # log progress once every report_delay seconds
        elapsed = default_timer() - start
        if elapsed >= next_report:
            if total_examples:
                # examples-based progress %
                logger.info(
                    "PROGRESS: at %.2f%% examples, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                    100.0 * example_count / total_examples, trained_ptree_count / elapsed,
                    gsutils.qsize(job_queue), gsutils.qsize(progress_queue))
            else:
                # ptrees-based progress %
                logger.info(
                    "PROGRESS: at %.2f%% ptrees, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                    100.0 * raw_ptree_count / total_ptrees, trained_ptree_count / elapsed,
                    gsutils.qsize(job_queue), gsutils.qsize(progress_queue))
            next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
        "training on %i raw ptrees (%i effective ptrees) took %.1fs, %.0f effective ptrees/s",
        raw_ptree_count, trained_ptree_count, elapsed, trained_ptree_count / elapsed)

    if job_tally < 10 * self.workers:
        logger.warn(
            "under 10 jobs per worker: consider setting a smaller `batch_ptrees' "
            "for smoother alpha decay")

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
        logger.warn(
            "supplied example count (%i) did not equal expected count (%i)",
            example_count, total_examples)
    if total_ptrees and total_ptrees != raw_ptree_count:
        logger.warn(
            "supplied raw ptree count (%i) did not equal expected count (%i)",
            raw_ptree_count, total_ptrees)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    return trained_ptree_count
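# A minimal standalone sketch of the linear learning-rate decay applied by
# job_producer() above; the helper name is illustrative, but the formula is the
# one used in the code (decay driven by examples or ptrees consumed so far).
def next_learning_rate(alpha, min_alpha, pushed, total):
    """Interpolate linearly from alpha down to min_alpha as the corpus is consumed."""
    progress = 1.0 * pushed / total
    return max(min_alpha, alpha - (alpha - min_alpha) * progress)

# e.g. with alpha=0.025, min_alpha=0.0001, halfway through the corpus:
# next_learning_rate(0.025, 0.0001, 5000, 10000) -> 0.01255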
def train(self, sentences, total_words=None, word_count=0,
          total_examples=None, queue_factor=2, report_delay=1.0):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only
    generator stream). For Word2Vec, each sentence must be a list of unicode strings.
    (Subclasses may accept other examples.)

    To support linear learning-rate decay from (initial) alpha to min_alpha, either
    total_examples (count of sentences) or total_words (count of raw words in sentences)
    should be provided, unless the sentences are the same as those that were used to
    initially build the vocabulary.
    """
    logger.info("Starting training.")

    self.neg_labels = []
    if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.

    if FAST_VERSION < 0:
        import warnings
        warnings.warn(
            "C extension not loaded for Word2Vec, training will be slow. "
            "Install a C compiler and reinstall gensim for fast training.")
        self.neg_labels = []
        if self.negative > 0:
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1)
            self.neg_labels[0] = 1.

    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using sg=%s hs=%s sample=%s negative=%s window=%s",
        self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs,
        self.sample, self.negative, self.window)

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")
    if not hasattr(self, "syn0"):
        raise RuntimeError("you must first finalize vocabulary before training the model")

    if total_words is None and total_examples is None:
        if self.corpus_count:
            total_examples = self.corpus_count
            logger.info(
                "expecting %i sentences, matching count from corpus used for vocabulary survey",
                total_examples)
        else:
            raise ValueError(
                "you must provide either total_words or total_examples, "
                "to enable alpha and progress calculations")

    job_tally = 0

    if self.iter > 1:
        sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
        total_words = total_words and total_words * self.iter
        total_examples = total_examples and total_examples * self.iter

    def worker_loop():
        """Train the model, lifting lists of sentences from the job_queue."""
        work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        jobs_processed = 0
        while True:
            job = job_queue.get()
            if job is None:
                progress_queue.put(None)
                break  # no more jobs => quit this worker
            sentences, pairwise, alpha = job
            tally, raw_tally = self._do_train_job(sentences, pairwise, alpha, (work, neu1))
            progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
            jobs_processed += 1
        logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
        """Fill jobs queue using the input `sentences` iterator."""
        job_batch, batch_size = [], 0
        pushed_words, pushed_examples = 0, 0
        next_alpha = self.alpha
        if next_alpha > self.min_alpha_yet_reached:
            logger.warn("Effective 'alpha' higher than previous training cycles")
        self.min_alpha_yet_reached = next_alpha
        job_no = 0

        for sent_idx, sentence in enumerate(sentences):
            sentence_length = self._raw_word_count([sentence])

            # can we fit this sentence into the existing job batch?
            if batch_size + sentence_length <= self.batch_words:
                # yes => add it to the current job
                job_batch.append(sentence)
                batch_size += sentence_length
            else:
                # no => submit the existing job
                pair_idx = list(numpy.random.choice(
                    range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
                pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
                logger.debug(
                    "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
                    job_no, batch_size, len(job_batch), len(pairwise_samples), next_alpha)
                job_no += 1
                job_queue.put((job_batch, pairwise_samples, next_alpha))

                # update the learning rate for the next job
                if self.min_alpha < next_alpha:
                    if total_examples:
                        # examples-based decay
                        pushed_examples += len(job_batch)
                        progress = 1.0 * pushed_examples / total_examples
                    else:
                        # words-based decay
                        pushed_words += self._raw_word_count(job_batch)
                        progress = 1.0 * pushed_words / total_words
                    next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
                    next_alpha = max(self.min_alpha, next_alpha)

                # add the sentence that didn't fit as the first item of a new job
                job_batch, batch_size = [sentence], sentence_length

        # add the last job too (may be significantly smaller than batch_words)
        if job_batch:
            logger.debug(
                "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
                job_no, batch_size, len(job_batch), len(self.pairwise_constraints), next_alpha)
            job_no += 1
            job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

        if job_no == 0 and self.train_count == 0:
            logger.warning(
                "train() called with an empty iterator (if not intended, "
                "be sure to provide a corpus that offers restartable "
                "iteration = an iterable).")

        # give the workers heads up that they can finish -- no more work!
        for _ in xrange(self.workers):
            job_queue.put(None)
        logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't
    # simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    example_count, trained_word_count, raw_word_count = 0, 0, word_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
        report = progress_queue.get()  # blocks if workers too slow
        if report is None:  # a thread reporting that it finished
            unfinished_worker_count -= 1
            logger.info(
                "worker thread finished; awaiting finish of %i more threads",
                unfinished_worker_count)
            continue
        examples, trained_words, raw_words = report
        job_tally += 1

        # update progress stats
        example_count += examples
        trained_word_count += trained_words  # only words in vocab & sampled
        raw_word_count += raw_words

        # log progress once every report_delay seconds
        elapsed = default_timer() - start
        if elapsed >= next_report:
            if total_examples:
                # examples-based progress %
                logger.info(
                    "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
                    100.0 * example_count / total_examples, trained_word_count / elapsed,
                    utils.qsize(job_queue), utils.qsize(progress_queue))
            else:
                # words-based progress %
                logger.info(
                    "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
                    100.0 * raw_word_count / total_words, trained_word_count / elapsed,
                    utils.qsize(job_queue), utils.qsize(progress_queue))
            next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
        "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
        raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed)

    if job_tally < 10 * self.workers:
        logger.warn(
            "under 10 jobs per worker: consider setting a smaller `batch_words' "
            "for smoother alpha decay")

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
        logger.warn(
            "supplied example count (%i) did not equal expected count (%i)",
            example_count, total_examples)
    if total_words and total_words != raw_word_count:
        logger.warn(
            "supplied raw word count (%i) did not equal expected count (%i)",
            raw_word_count, total_words)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    self.clear_sims()
    return trained_word_count
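# A self-contained sketch of the per-job constraint sampling performed in
# job_producer() above; the toy constraint list and batch size are made up for
# illustration, but the sampling expression mirrors the one in the code
# (roughly one constraint per five batched words, drawn with replacement).
import numpy

pairwise_constraints = [("good", "great"), ("bad", "awful"), ("fast", "quick")]
batch_size = 20  # words queued in the current job
pair_idx = list(numpy.random.choice(
    range(len(pairwise_constraints)), int(batch_size * 0.2)))
pairwise_samples = [pairwise_constraints[x] for x in pair_idx]
# pairwise_samples now holds 4 (possibly repeated) constraint pairs for this job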