def log_accuracy(section):
    correct, incorrect = section['correct'], section['incorrect']
    if correct + incorrect > 0:
        logger.info("%s: %.1f%% (%i/%i)" %
                    (section['section'], 100.0 * correct / (correct + incorrect),
                     correct, correct + incorrect))
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
    logger.info("resetting layer weights")
    random.seed(self.seed)
    self.syn0 = empty((len(self.vocab), self.layer1_size), dtype=REAL)
    # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
    for i in range(len(self.vocab)):
        self.syn0[i] = (random.rand(self.layer1_size) - 0.5) / self.layer1_size
    self.syn1 = zeros((len(self.vocab), self.logistic_regression_size), dtype=REAL)
    self.syn0norm = None
def extend_vocab(self, sentences, oov_word=False, report_frequency=10000):
    """
    Extend the vocabulary from a sequence of sentences (can be a once-only
    generator stream). Each sentence must be a list of utf8 strings.
    """
    logger.info("collecting all words and their counts")
    prev_sentence_no = -1
    sentence_no, vocab = -1, {}
    total_words = 0
    assign_to_vocab = vocab.__setitem__  # slight performance gain
    # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
    get_from_vocab = vocab.__getitem__
    for sentence_no, sentence in enumerate(sentences):
        if prev_sentence_no == sentence_no:
            break
        if sentence_no % report_frequency == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(vocab)))
        for word in sentence:
            if word in vocab:
                get_from_vocab(word).count += 1
            else:
                assign_to_vocab(word, Vocab(count=1))
        total_words += len(sentence)
        prev_sentence_no = sentence_no
    logger.info("collected %i word types from a corpus of %i words and %i sentences" %
                (len(vocab), total_words, sentence_no + 1))

    # assign a unique index to each new word
    append = self.index2word.append
    assign_to_vocab = self.vocab.__setitem__
    for word, v in vocab.items():
        if word not in self.vocab:
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                append(word)
                assign_to_vocab(word, v)
        else:
            self.vocab[word].count += v.count

    # add the special out-of-vocabulary word **UNKNOWN**:
    if oov_word:
        self.add_oov_word(count=len(vocab) - len(self.vocab))

    logger.info("total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))

    # add info about each word's Huffman encoding
    self.create_binary_tree()
    self.extend_weights()
def word2vector(X_train):
    """Train word vectors."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    wv_model = Word2Vec(X_train, size=wv_size, window=6, sg=1, min_count=5,
                        workers=multiprocessing.cpu_count(), iter=10)
    gensim_dict = Dictionary()  # build a dictionary over the trained vocabulary
    gensim_dict.doc2bow(wv_model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word index, numbered from 1
    w2vec = {word: wv_model[word] for word in w2indx.keys()}  # each word's vector
    return w2indx, w2vec
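# A minimal usage sketch for `word2vector`, assuming `wv_size` is a module-level
# constant and `X_train` is a list of tokenized sentences; `load_tokenized_corpus`
# and `embedding_matrix` below are hypothetical names used only for illustration.
#
#     X_train = load_tokenized_corpus()            # e.g. [["good", "movie"], ...]
#     w2indx, w2vec = word2vector(X_train)
#     embedding_matrix = numpy.zeros((len(w2indx) + 1, wv_size))  # row 0 left for padding
#     for word, idx in w2indx.items():
#         embedding_matrix[idx] = w2vec[word]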
def build_paragraph_vocab(self, sentences):
    paragraph_vocab = {}
    for sentence_no, sentence in enumerate(sentences):
        sentence = " ".join(sentence)
        if sentence not in paragraph_vocab:
            paragraph_vocab[sentence] = Paragraph()
    logger.info("collected %i sentence types from a corpus of %i sentences." %
                (len(paragraph_vocab), sentence_no + 1))

    # assign a unique index to each sentence
    self.paragraph_vocab, self.index2paragraph = {}, []
    append = self.index2paragraph.append
    assign_to_vocab = self.paragraph_vocab.__setitem__
    for sentence, v in iteritems(paragraph_vocab):
        v.index = len(self.paragraph_vocab)
        assign_to_vocab(sentence, v)
        append(sentence)
def init_sims(self, replace=False):
    """
    Precompute L2-normalized vectors.

    If `replace` is set, forget the original vectors and only keep the normalized
    ones = saves lots of memory!

    Note that you **cannot continue training** after doing a replace. The model
    becomes effectively read-only = you can call `most_similar`, `similarity`
    etc., but not `train`.
    """
    super().init_sims(replace=replace)
    if getattr(self, 'synparagraphnorm', None) is None or replace:
        logger.info("precomputing L2-norms of paragraph weight vectors")
        if replace:
            for i in range(self.synparagraph.shape[0]):
                self.synparagraph[i, :] /= sqrt((self.synparagraph[i, :] ** 2).sum(-1))
            self.synparagraphnorm = self.synparagraph
        else:
            self.synparagraphnorm = (self.synparagraph /
                                     sqrt((self.synparagraph ** 2).sum(-1))[..., newaxis]).astype(REAL)
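# A hedged sketch of how `init_sims(replace=True)` might be used once training is
# finished: the normalized vectors overwrite the raw ones to save memory, after
# which the model supports similarity queries but no further `train` calls.
# `model` is an assumed instance of this class.
#
#     model.init_sims(replace=True)
#     print(model.most_similar(positive=["good"], topn=5))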
def train(self, sentences, total_words=None, word_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a
    once-only generator stream). Each sentence must be a list of utf8 strings.
    """
    # oov_replacement = self.vocab.get(UnknownWord, None)
    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.values())
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = 0
            for sentence in job:
                job_words += self.training_function(self, sentence, alpha, work)

            with lock:
                # here we can store the scores for later plotting and viewing...
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                                 (100.0 * word_count[0] / total_words, alpha,
                                  word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in range(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
    no_oov = ([self.get_underlying_word_object(word) for word in sentence] for sentence in sentences)
    for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
        # logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())

    for _ in range(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!
    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))
    return word_count[0]
def build_vocab(self, sentences, oov_word=False, report_frequency=10000):
    """
    Build vocabulary from a sequence of sentences (can be a once-only generator
    stream). Each sentence must be a list of utf8 strings.
    """
    print("build vocab")
    path = (re.sub("/", "_", sentences.fname) + ("(mc=%d)" % (self.min_count)) + ".vocab") if hasattr(sentences, "fname") else None

    if path is not None and file_exists(path):
        logger.info("loading from saved vocab list at \"%s\"" % (path))
        file = gzip.open(path, 'r')
        saved_vocab = pickle.load(file)
        file.close()
        self.index2word = saved_vocab["index2word"]
        self.vocab = saved_vocab["vocab"]
        if oov_word:
            self.add_oov_word(count=10000)

        self.create_binary_tree()
        self.reset_weights()
    else:
        logger.info("collecting all words and their counts")
        prev_sentence_no = -1
        sentence_no, vocab = -1, {}
        total_words = 0
        assign_to_vocab = vocab.__setitem__  # slight performance gain
        # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
        get_from_vocab = vocab.__getitem__
        for sentence_no, sentence in enumerate(sentences):
            if prev_sentence_no == sentence_no:
                break
            if sentence_no % report_frequency == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            for word in sentence:
                if word in vocab:
                    get_from_vocab(word).count += 1
                else:
                    assign_to_vocab(word, Vocab(count=1))
            total_words += len(sentence)
            prev_sentence_no = sentence_no
        logger.info("collected %i word types from a corpus of %i words and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []
        append = self.index2word.append
        assign_to_vocab = self.vocab.__setitem__
        for word, v in vocab.items():
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                append(word)
                assign_to_vocab(word, v)

        # add the special out-of-vocabulary word **UNKNOWN**:
        if oov_word:
            self.add_oov_word(count=len(vocab) - len(self.vocab))

        logger.info("total %i word types after removing those with count<%s" %
                    (len(self.vocab), self.min_count))

        # add info about each word's Huffman encoding
        self.create_binary_tree()
        self.reset_weights()

        if path is not None:
            logger.info("saving vocab list in \"%s\"" % (path))
            with gzip.open(path, 'wb') as file:
                pickle.dump({"vocab": self.vocab, "index2word": self.index2word}, file, 1)
def accuracy(self, questions, restrict_vocab=30000):
    """
    Compute accuracy of the model (with **capitalizations**). `questions` is a filename
    where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to log and returned as a list) for each section
    separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose frequency is
    not in the top-N most frequent words (default top 30,000).

    This method corresponds to the `compute-accuracy` script of the original C word2vec.
    """
    ok_vocab = dict(sorted(self.vocab.items(), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in ok_vocab.values())

    def log_accuracy(section):
        correct, incorrect = section['correct'], section['incorrect']
        if correct + incorrect > 0:
            logger.info("%s: %.1f%% (%i/%i)" %
                        (section['section'], 100.0 * correct / (correct + incorrect),
                         correct, correct + incorrect))

    sections, section = [], None
    for line_no, line in enumerate(open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                a, b, c, expected = line.split()  # TODO assumes vocabulary preprocessing uses lowercase, too...
            except ValueError:
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                continue

            ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = self.index2word[index]
                    if predicted != expected and predicted != expected.lower():
                        logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                    break
            section['correct' if predicted == expected else 'incorrect'] += 1

    if section:
        # store the last section, too
        sections.append(section)
        log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum(s['correct'] for s in sections),
        'incorrect': sum(s['incorrect'] for s in sections),
    }
    log_accuracy(total)
    sections.append(total)
    return sections
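# A minimal sketch of running the analogy evaluation, assuming a trained `model`
# and a local copy of the Google analogy file (the path below is illustrative).
#
#     sections = model.accuracy("questions-words.txt", restrict_vocab=30000)
#     total = sections[-1]  # the aggregate 'total' summary is appended last
#     print("overall: %i correct, %i incorrect" % (total['correct'], total['incorrect']))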
def train(self, sentences, total_words=None, word_count=0, total_examples=None, queue_factor=2, report_delay=1.0):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

    To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
    (count of sentences) or total_words (count of raw words in sentences) should be provided, unless
    the sentences are the same as those that were used to initially build the vocabulary.
    """
    logger.info("Starting training.")
    self.neg_labels = []
    if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.

    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                      "Install a C compiler and reinstall gensim for fast training.")

    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using sg=%s hs=%s sample=%s negative=%s window=%s",
        self.workers, len(self.vocab), self.layer1_size, self.sg,
        self.hs, self.sample, self.negative, self.window)

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")
    if not hasattr(self, "syn0"):
        raise RuntimeError("you must first finalize vocabulary before training the model")

    if total_words is None and total_examples is None:
        if self.corpus_count:
            total_examples = self.corpus_count
            logger.info("expecting %i sentences, matching count from corpus used for vocabulary survey", total_examples)
        else:
            raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations")

    job_tally = 0

    if self.iter > 1:
        sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
        total_words = total_words and total_words * self.iter
        total_examples = total_examples and total_examples * self.iter

    def worker_loop():
        """Train the model, lifting lists of sentences from the job_queue."""
        work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        jobs_processed = 0
        while True:
            job = job_queue.get()
            if job is None:
                progress_queue.put(None)
                break  # no more jobs => quit this worker
            sentences, pairwise, alpha = job
            tally, raw_tally = self._do_train_job(sentences, pairwise, alpha, (work, neu1))
            progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
            jobs_processed += 1
        logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
        """Fill jobs queue using the input `sentences` iterator."""
        job_batch, batch_size = [], 0
        pushed_words, pushed_examples = 0, 0
        next_alpha = self.alpha
        if next_alpha > self.min_alpha_yet_reached:
            logger.warn("Effective 'alpha' higher than previous training cycles")
        self.min_alpha_yet_reached = next_alpha
        job_no = 0

        for sent_idx, sentence in enumerate(sentences):
            sentence_length = self._raw_word_count([sentence])

            # can we fit this sentence into the existing job batch?
            if batch_size + sentence_length <= self.batch_words:
                # yes => add it to the current job
                job_batch.append(sentence)
                batch_size += sentence_length
            else:
                # no => submit the existing job
                pair_idx = list(numpy.random.choice(range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
                pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
                logger.debug(
                    "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
                    job_no, batch_size, len(job_batch), len(pairwise_samples), next_alpha)
                job_no += 1
                job_queue.put((job_batch, pairwise_samples, next_alpha))

                # update the learning rate for the next job
                if self.min_alpha < next_alpha:
                    if total_examples:
                        # examples-based decay
                        pushed_examples += len(job_batch)
                        progress = 1.0 * pushed_examples / total_examples
                    else:
                        # words-based decay
                        pushed_words += self._raw_word_count(job_batch)
                        progress = 1.0 * pushed_words / total_words
                    next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
                    next_alpha = max(self.min_alpha, next_alpha)

                # add the sentence that didn't fit as the first item of a new job
                job_batch, batch_size = [sentence], sentence_length

        # add the last job too (may be significantly smaller than batch_words)
        if job_batch:
            logger.debug(
                "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
                job_no, batch_size, len(job_batch), len(self.pairwise_constraints), next_alpha)
            job_no += 1
            job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

        if job_no == 0 and self.train_count == 0:
            logger.warning(
                "train() called with an empty iterator (if not intended, "
                "be sure to provide a corpus that offers restartable iteration = an iterable).")

        # give the workers heads up that they can finish -- no more work!
        for _ in xrange(self.workers):
            job_queue.put(None)
        logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    example_count, trained_word_count, raw_word_count = 0, 0, word_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
        report = progress_queue.get()  # blocks if workers too slow
        if report is None:  # a thread reporting that it finished
            unfinished_worker_count -= 1
            logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
            continue
        examples, trained_words, raw_words = report
        job_tally += 1

        # update progress stats
        example_count += examples
        trained_word_count += trained_words  # only words in vocab & sampled
        raw_word_count += raw_words

        # log progress once every report_delay seconds
        elapsed = default_timer() - start
        if elapsed >= next_report:
            if total_examples:
                # examples-based progress %
                logger.info(
                    "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
                    100.0 * example_count / total_examples, trained_word_count / elapsed,
                    utils.qsize(job_queue), utils.qsize(progress_queue))
            else:
                # words-based progress %
                logger.info(
                    "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
                    100.0 * raw_word_count / total_words, trained_word_count / elapsed,
                    utils.qsize(job_queue), utils.qsize(progress_queue))
            next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
        "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
        raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed)

    if job_tally < 10 * self.workers:
        logger.warn("under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay")

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
        logger.warn("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
    if total_words and total_words != raw_word_count:
        logger.warn("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    self.clear_sims()
    return trained_word_count
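# A hedged sketch of driving this `train` variant, assuming `model` already has a
# finalized vocabulary and `pairwise_constraints` populated; `corpus` and
# `n_sentences` are placeholders for the caller's data.
#
#     trained_words = model.train(corpus, total_examples=n_sentences)
#     logger.info("effective words trained: %i", trained_words)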
def build_vocab(self, sentences, oov_word=False, report_frequency=10000):
    """
    Build vocabulary from a sequence of sentences (can be a once-only generator
    stream). Each sentence must be a list of utf8 strings.
    """
    path = (re.sub("/", "_", sentences.fname) + ".vocab") if hasattr(sentences, "fname") else None

    if path is not None and file_exists(path):
        logger.info("loading from saved vocab list at \"%s\"" % (path))
        file = gzip.open(path, 'r')
        saved_vocab = pickle.load(file)
        file.close()
        self.index2word = saved_vocab["index2word"]
        self.vocab = saved_vocab["vocab"]
        if oov_word:
            self.add_oov_word(count=100000)
        # add the special padding word here:
        if PaddingWord not in self.vocab:
            v = self.add_word_to_vocab(PaddingWord, count=1000000)
            self.padding_word = v
        else:
            self.padding_word = self.vocab[PaddingWord]

        self.create_binary_tree()
        self.build_paragraph_vocab(sentences)
        self.reset_weights()
    else:
        logger.info("collecting all words and their counts")
        prev_sentence_no = -1
        sentence_no, vocab = -1, {}
        total_words = 0
        assign_to_vocab = vocab.__setitem__  # slight performance gain
        # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
        get_from_vocab = vocab.__getitem__
        for sentence_no, sentence in enumerate(sentences):
            if prev_sentence_no == sentence_no:
                break
            if sentence_no % report_frequency == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            for word in sentence:
                if word in vocab:
                    get_from_vocab(word).count += 1
                else:
                    assign_to_vocab(word, Vocab(count=1))
            total_words += len(sentence)
            prev_sentence_no = sentence_no
        logger.info("collected %i word types from a corpus of %i words and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []
        append = self.index2word.append
        assign_to_vocab = self.vocab.__setitem__
        for word, v in iteritems(vocab):
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                append(word)
                assign_to_vocab(word, v)

        # add the special out-of-vocabulary word **UNKNOWN**:
        if oov_word:
            self.add_oov_word(count=len(vocab) - len(self.vocab))
        # add the special padding word here:
        if PaddingWord not in self.vocab:
            v = self.add_word_to_vocab(PaddingWord, count=1000000)
            self.padding_word = v
        else:
            self.padding_word = self.vocab[PaddingWord]

        logger.info("total %i word types after removing those with count<%s" %
                    (len(self.vocab), self.min_count))

        # add info about each word's Huffman encoding
        self.create_binary_tree()
        self.build_paragraph_vocab(sentences)
        self.reset_weights()

        if path is not None:
            logger.info("saving vocab list in \"%s\"" % (path))
            with gzip.open(path, 'wb') as file:
                pickle.dump({"vocab": self.vocab, "index2word": self.index2word}, file, 1)
def train(self, sentences, total_words=None, word_count=0, paragraphs_only=False, vocab=None, paragraphs=None):
    """
    Update the model's neural weights from a sequence of sentences (can be a
    once-only generator stream). Each sentence must be a list of utf8 strings.
    """
    if paragraphs is None:
        paragraphs = self.synparagraph
    if vocab is None:
        vocab = self.paragraph_vocab
    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab))
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)
    total_error = [0.0]

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
        error = zeros(1, dtype=REAL)
        if self.concatenate:
            # word work here is for each individual word, so it has length logistic regression - para size
            word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
        else:
            # here word work is aggregated:
            word_work = zeros(self.layer1_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha,
                                               paragraph_work, word_work, neu1, error, len(job))
            with lock:
                # here we can store the scores for later plotting and viewing...
                word_count[0] += job_words
                elapsed = time.time() - start
                total_error[0] += error[0]
                if elapsed >= next_report[0]:
                    logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                                 (100.0 * word_count[0] / total_words, alpha,
                                  word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects, and each paragraph to its Paragraph (Vocab) object:
    no_oov = (self.create_job(sentence, vocab) for sentence in sentences)
    for job_no, job in enumerate(utils.grouper(no_oov, self.batchsize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())

    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!
    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i sentences took %.1fs, %.0f sentences/s, error %.6f" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0, total_error[0]))
    return (word_count[0], total_error[0])