def __init__(self, sentences=None, oov_word=False, size=100, alpha=0.035,
             window=5, min_count=5, seed=1, workers=1, min_alpha=0.0001,
             sg=1, training_function=train_sentence_sg_original, decay=True,
             vocab_report_frequency=10000):
    """
    Construct a word2vec language model from a corpus `sentences`.
    Optionally specify whether an out-of-vocabulary word should be trained.

    Inputs (optional keyword arguments)
    -----------------------------------

    oov_word          bool     : train an unknown-word vector?
    size              int      : embedding dimensions.
    alpha             float    : learning rate.
    decay             bool     : anneal the learning rate over time?
    min_alpha         float    : if the learning rate is annealed, the minimum rate to use.
    sg                int      : use the skip-gram method (1) or average the context (0)?
    training_function function : training function taking as parameters:
                                 Word2VecExtended, list<int>, float, np.array (stores gradient).
    window            int      : size of the context used to train embeddings.
    min_count         int      : minimum word occurrence count to include in the model vocabulary.
    seed              int      : random seed used to initialize the weights.
    sentences         object   : corpus to train on (see `LineCorpus`, `BrownCorpus`,
                                 `BrownCorpusSimple`).
    """
    self.vocab = {}       # mapping from a word (string) to a Vocab object
    self.index2word = []  # map from a word's matrix index (int) to the word (string)
    self.sg = int(sg)
    self.layer1_size = int(size)
    self.logistic_regression_size = self.layer1_size
    if size % 4 != 0:
        logger.warning("consider setting layer size to a multiple of 4 for greater performance")
    self.alpha = float(alpha)
    self.window = int(window)
    self.weight_decay = decay
    self.seed = seed
    self.hs = True
    self.negative = False
    self.training_function = training_function
    self.min_count = min_count
    self.workers = workers
    self.min_alpha = min_alpha

    if sentences is not None:
        self.build_vocab(sentences, oov_word=oov_word, report_frequency=vocab_report_frequency)
        self.train(sentences)  # maybe ?
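# Usage sketch (hypothetical corpus path; assumes the enclosing class is
# `Word2VecExtended`, as referenced in the docstring above, and that trained
# vectors live in `syn0` indexed by `Vocab.index`, following gensim's word2vec):
#
#     corpus = LineCorpus("data/corpus.txt")
#     model = Word2VecExtended(corpus, size=128, window=5, min_count=5, oov_word=True)
#     vector = model.syn0[model.vocab["example"].index]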
import logging
import numpy

from six.moves import xrange
from timeit import default_timer
from random import shuffle

try:
    from queue import Queue, Empty
except ImportError:
    from Queue import Queue, Empty

logger = logging.getLogger(__name__)

try:
    from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
    from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
    from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH
    logger.debug("Fast version of {0} is being used".format(__name__))
except ImportError:
    # failed... fall back to plain numpy (20-80x slower training than the above)
    logger.warning("Slow version of {0} is being used".format(__name__))
    FAST_VERSION = -1
    MAX_WORDS_IN_BATCH = 10000


def train_batch_sg_constraints(model, constraints, alpha, work=None):
    """Train on pairwise constraints: for each (word, word2) pair, the
    representation of `word2` is used to predict `word`, pulling the two
    embeddings together. Returns the number of constraints trained on."""
    result = 0
    for constraint in constraints:
        word = model.vocab[constraint[0]]
        word2 = model.vocab[constraint[1]]
        # the representation of word2.index is used to predict model.index2word[word.index]
        train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
        result += 1
    return result
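# Usage sketch (hypothetical word pairs; assumes `model` is an already-built
# model whose vocabulary contains both words of every pair, and that
# `train_sg_pair` is defined elsewhere in this module, as in gensim's word2vec):
#
#     constraints = [("king", "queen"), ("man", "woman")]
#     trained = train_batch_sg_constraints(model, constraints, alpha=0.025)
#     assert trained == len(constraints)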
def job_producer():
    """Fill the jobs queue using the input `sentences` iterator."""
    job_batch, batch_size = [], 0
    pushed_words, pushed_examples = 0, 0
    next_alpha = self.alpha
    if next_alpha > self.min_alpha_yet_reached:
        logger.warning("Effective 'alpha' higher than previous training cycles")
    self.min_alpha_yet_reached = next_alpha
    job_no = 0

    for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
            # yes => add it to the current job
            job_batch.append(sentence)
            batch_size += sentence_length
        else:
            # no => submit the existing job, together with a random sample of
            # pairwise constraints sized at 20% of the batch word count
            pair_idx = numpy.random.choice(
                len(self.pairwise_constraints), int(batch_size * 0.2))
            pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
            logger.debug(
                "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
                job_no, batch_size, len(job_batch), len(pairwise_samples), next_alpha)
            job_no += 1
            job_queue.put((job_batch, pairwise_samples, next_alpha))

            # update the learning rate for the next job
            if self.min_alpha < next_alpha:
                if total_examples:
                    # examples-based decay
                    pushed_examples += len(job_batch)
                    progress = 1.0 * pushed_examples / total_examples
                else:
                    # words-based decay
                    pushed_words += self._raw_word_count(job_batch)
                    progress = 1.0 * pushed_words / total_words
                next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
                next_alpha = max(self.min_alpha, next_alpha)

            # add the sentence that didn't fit as the first item of a new job
            job_batch, batch_size = [sentence], sentence_length

    # add the last job too (may be significantly smaller than batch_words)
    if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints), next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

    if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

    # give the workers a heads-up that they can finish -- no more work!
    for _ in xrange(self.workers):
        job_queue.put(None)
    logger.debug("job loop exiting, total %i jobs", job_no)
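# The learning-rate schedule above is a plain linear anneal from `alpha` down
# to `min_alpha` as training progresses. A standalone sketch of the same rule
# (the function name here is illustrative, not part of this module):
#
#     def annealed_alpha(alpha, min_alpha, progress):
#         """progress in [0, 1]: fraction of total words/examples pushed so far."""
#         return max(min_alpha, alpha - (alpha - min_alpha) * progress)
#
#     annealed_alpha(0.035, 0.0001, 0.5)  # -> 0.01755, halfway through training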
def __init__(self, vocabulary=None, random_window=False, scale_updates=False,
             self_predict=0, batchsize=100, symmetric_window=True, oov_word=True,
             min_count=5, paragraph_size=400, concatenate=True, sentences=None,
             size=400, alpha=0.035, window=5, seed=1, workers=1, min_alpha=0.0001,
             decay=True, vocab_report_frequency=10000):
    """
    PVDM model for training and learning context paragraphs for sentiment and
    topic analysis, or information retrieval. This method uses hierarchical
    softmax, word2vec, and word windows to obtain an unsupervised model of
    paragraphs and their context [1].

    [1] Quoc Le and Tomas Mikolov, "Distributed Representations of Sentences
        and Documents," ICML 2014.

    TODO:
        - add synparagraph for updating paragraph positions
        - store paragraph size
        - build record of paragraph index (without code)
        - update training function accordingly.
    """
    if batchsize > MAX_BATCHSIZE:
        raise ValueError("Maximum batch size is %d." % (MAX_BATCHSIZE,))
    self.batchsize = int(batchsize) if batchsize > 0 else 1
    self.symmetric_window = symmetric_window
    self.scale_updates = scale_updates
    self.vocab = {}            # mapping from a word (string) to a Vocab object
    self.paragraph_vocab = {}
    self.index2word = []       # map from a word's matrix index (int) to the word (string)
    self.index2paragraph = []  # map from a paragraph's matrix index (int) to the paragraph (string)
    self.layer1_size = int(size)
    self.paragraph_size = int(paragraph_size)
    self.concatenate = concatenate
    self.random_window = random_window
    if size % 4 != 0:
        logger.warning("consider setting layer size to a multiple of 4 for greater performance")
    self.alpha = float(alpha)
    self.window = int(window)
    self.weight_decay = decay
    self.seed = seed
    self.hs = True
    self.negative = False
    self.self_predict = self_predict
    self.min_count = min_count
    self.workers = workers
    self.min_alpha = min_alpha

    if self.concatenate:
        # the logistic regression layer for hierarchical softmax deals first
        # with the paragraph dimensions, then with the concatenation of the
        # window words (2 * window words for a symmetric window, window otherwise):
        if self.symmetric_window:
            self.logistic_regression_size = self.paragraph_size + self.window * 2 * self.layer1_size
        else:
            self.logistic_regression_size = self.paragraph_size + self.window * self.layer1_size
    else:
        # the logistic regression layer for hierarchical softmax deals first
        # with the paragraph dimensions, then with the average of the
        # 2 * window words:
        self.logistic_regression_size = self.layer1_size + self.paragraph_size

    if self_predict > 0:
        self.training_function = (train_sentence_batch_pvdm_self_predict
                                  if self_predict == 1
                                  else train_sentence_batch_pvdm_skipgram)
        self.logistic_regression_size = self.layer1_size
        self.true_paragraph_size = self.paragraph_size
        self.paragraph_size = 0
    else:
        self.training_function = train_sentence_batch_pvdm

    if sentences is not None:
        self.build_vocab(sentences, oov_word=oov_word, report_frequency=vocab_report_frequency)
        self.train(sentences)  # maybe ?
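# Worked example of the hierarchical-softmax input width computed above
# (values are illustrative, using paragraph_size=400, size=400, window=5):
#
#     concatenate=True,  symmetric_window=True   -> 400 + 5 * 2 * 400 = 4400
#     concatenate=True,  symmetric_window=False  -> 400 + 5 * 400     = 2400
#     concatenate=False  (averaged window words) -> 400 + 400         = 800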