def train_batch_sg(model, sentences, alpha, work=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code

            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
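
# --- Illustrative sketch (not part of gensim): how the `sample_int` test above
# downsamples frequent words. Each vocab entry carries a keep-probability
# precomputed into 32-bit integer range, so `sample_int > rand() * 2**32`
# retains a word with probability sample_int / 2**32. The MockVocab class and
# the probabilities below are made up for illustration.
import numpy as np

class MockVocab:
    def __init__(self, keep_prob):
        self.sample_int = int(round(keep_prob * 2**32))  # keep probability, scaled into uint32 range

rng = np.random.RandomState(42)
mock_vocab = {"the": MockVocab(0.05), "aardvark": MockVocab(1.0)}
sentence = ["the", "aardvark", "the", "the", "unknown"]
kept = [w for w in sentence if w in mock_vocab and mock_vocab[w].sample_int > rng.rand() * 2**32]
print(kept)  # the frequent "the" is mostly dropped; the rare "aardvark" always survives
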
from numpy import zeros


def train_sentence_dbow(model, sentence, lbls, alpha, work=None, train_words=True, train_lbls=True):
    """
    Update distributed bag of words model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from doc2vec_inner instead.
    """
    neg_labels = []
    if model.negative:
        # precompute negative labels
        neg_labels = zeros(model.negative + 1)
        neg_labels[0] = 1.0

    for label in lbls:
        if label is None:
            continue  # OOV label in the input => skip
        for word in sentence:
            if word is None:
                continue  # OOV word in the input sentence => skip
            train_sg_pair(model, word, label, alpha, neg_labels, train_words, train_lbls)

    return len([word for word in sentence if word is not None])
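
# --- Standalone sketch (values assumed): the `neg_labels` array precomputed
# above is the fixed target vector for negative sampling. With `negative`
# noise words per true target, each update scores 1 positive plus `negative`
# sampled words, whose desired outputs never change, so the array can be
# built once per call.
from numpy import zeros

negative = 5  # hypothetical hyperparameter value, for illustration only
neg_labels = zeros(negative + 1)
neg_labels[0] = 1.0
print(neg_labels)  # [1. 0. 0. 0. 0. 0.] -- the true target first, noise words after
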
def train_sentence_sg(model, sentence, alpha, work=None):
    """
    Update skip-gram model by training on a single sentence.

    The sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from word2vec_inner instead.
    """
    word_vocabs = [
        model.vocab[w] for w in sentence
        if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32
    ]
    # for pos, word in enumerate(word_vocabs):
    # unlike the stock per-position loop (commented out above), this variant
    # trains only the first surviving word as the centre word
    if len(word_vocabs) > 0:
        pos = 0
        word = word_vocabs[0]
        reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code

        # now go over all words from the (reduced) window, predicting each one in turn
        start = max(0, pos - model.window + reduced_window)
        for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
            # don't train on the `word` itself
            if pos2 != pos:
                word2vec.train_sg_pair(model, model.index2word[word.index], word2.index, alpha)

    return len(word_vocabs)
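
# --- Standalone sketch of the dynamic window used above: `reduced_window`
# (the `b` of the original word2vec C code) uniformly shrinks the effective
# context, so words nearer the centre are sampled as context more often.
# The tokens below are made up for illustration.
import numpy as np

tokens = ["a", "b", "c", "d", "e", "f", "g"]
window, pos = 3, 3  # centre word "d" with a maximum window of 3
rng = np.random.RandomState(0)
reduced_window = rng.randint(window)  # 0 .. window-1
start = max(0, pos - window + reduced_window)
context = [t for p, t in enumerate(tokens[start:pos + window + 1 - reduced_window], start) if p != pos]
print(reduced_window, context)
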
def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, train_words=False,
                        learn_doctags=True, learn_words=True, learn_hidden=True,
                        word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed bag of words model ("PV-DBOW") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes into
    the doctag_vectors array.

    If `train_words` is True, simultaneously train word-to-word (not just doc-to-word)
    examples, exactly as per Word2Vec skip-gram training. (Without this option, word
    vectors are neither consulted nor updated during DBOW doc vector training.)

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from doc2vec_inner instead.
    """
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    if train_words and learn_words:
        train_batch_sg(model, [doc_words], alpha, work)

    for doctag_index in doctag_indexes:
        for word in doc_words:
            train_sg_pair(
                model, word, doctag_index, alpha, learn_vectors=learn_doctags,
                learn_hidden=learn_hidden, context_vectors=doctag_vectors,
                context_locks=doctag_locks
            )

    return len(doc_words)
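
# --- Plain-numpy sketch (not gensim internals) of the lock-factor mechanism
# behind `doctag_locks` above: each vector's gradient update is scaled by a
# per-vector multiplier, so a lock of 0.0 freezes that doctag vector while 1.0
# trains it normally. The vectors and gradient below are toy values.
import numpy as np

doctag_vectors = np.ones((2, 4), dtype=np.float32)
doctag_locks = np.array([1.0, 0.0], dtype=np.float32)  # second doctag frozen
grad = np.full(4, 0.1, dtype=np.float32)               # pretend gradient step
for i in range(len(doctag_vectors)):
    doctag_vectors[i] += grad * doctag_locks[i]
print(doctag_vectors)  # row 0 moved by 0.1, row 1 unchanged
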
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Update skip-gram model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code

            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
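
# --- Hedged sketch of the subword machinery consulted via `ngrams_word`
# above: fastText represents a word by its own index plus the indices of its
# character n-grams, extracted from the word wrapped in the boundary markers
# "<" and ">". This helper is a standalone re-implementation for illustration,
# not gensim's actual function.
def char_ngrams(word, min_n=3, max_n=6):
    wrapped = "<" + word + ">"
    return [wrapped[i:i + n]
            for n in range(min_n, max_n + 1)
            for i in range(len(wrapped) - n + 1)]

print(char_ngrams("where", 3, 4))  # ['<wh', 'whe', 'her', 'ere', 're>', '<whe', 'wher', 'here', 'ere>']
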
def train_batch_sg_constraints(model, constraints, alpha, work=None):
    """Train on additional pairwise constraints: each constraint is a pair of
    vocabulary words, trained as a single skip-gram example to pull their
    representations together."""
    result = 0
    for constraint in constraints:
        word = model.vocab[constraint[0]]
        word2 = model.vocab[constraint[1]]
        # the representation of word2.index is used to predict model.index2word[word.index]
        train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
        result += 1
    return result
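
# --- Usage sketch (hypothetical data): each constraint is a pair of words
# that must both be in the model's vocab; the second word's representation is
# trained to predict the first. `model` would be a Word2Vec instance.
constraints = [("happy", "joyful"), ("car", "automobile")]
# train_batch_sg_constraints(model, constraints, alpha=0.025)
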
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Update skip-gram model by training on a sequence of sentences.

    Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

    Notes
    -----
    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from :mod:`gensim.models.fasttext_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of list of str
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`, optional
        UNUSED.
    neu1 : :class:`numpy.ndarray`, optional
        UNUSED.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code

            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = (word.index,)
            subwords_indices += model.wv.buckets_word[word.index]

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
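
# --- Standalone sketch of the bucket trick behind `buckets_word` above: newer
# gensim versions drop the per-string n-gram dict (`ngrams_word`/`ngrams` in
# the earlier variant) and instead hash every n-gram into one of `bucket`
# slots with the FNV-1a variant used by the original fastText C++ code. This
# re-implementation is for illustration only.
def ft_hash(ngram, bucket=2_000_000):
    h = 2166136261
    for byte in ngram.encode("utf-8"):
        h = ((h ^ byte) * 16777619) & 0xFFFFFFFF  # emulate uint32 overflow
    return h % bucket

print(ft_hash("<wh"), ft_hash("whe"))  # two n-grams mapped into the bucket space
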
from numpy import sum as np_sum


def train_batch(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
    """Update CBOW model by training on a sequence of sentences.

    Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler,
    Gensim will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.word2vec.Word2Vec`
        The Word2Vec model instance to train.
    sentences : iterable of list of str
        The corpus used to train the model.
    alpha : float
        The learning rate.
    work : object, optional
        Unused.
    neu1 : object, optional
        Unused.
    compute_loss : bool, optional
        Whether or not the training loss should be computed in this batch.

    Returns
    -------
    int
        Number of words in the vocabulary actually used for training (that already
        existed in the vocabulary and were not discarded by downsampling).

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        if not word_vocabs:
            continue  # nothing survived downsampling; skip this sentence

        # only the first surviving word is used as the centre word here
        word = word_vocabs[0]
        start = 1
        window_pos = enumerate(word_vocabs[start:], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if word2 is not None]

        l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)

        train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
        for word2idx in word2_indices:
            # additionally train skip-gram pairs in both directions between the
            # centre word and each context word
            train_sg_pair(model, model.wv.index2word[word.index], word2idx, alpha, compute_loss=compute_loss)
            train_sg_pair(model, model.wv.index2word[word2idx], word.index, alpha, compute_loss=compute_loss)

        result += len(word_vocabs)
    return result
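
# --- Toy sketch of the `l1` projection computed above: CBOW sums the context
# word vectors and, when `cbow_mean` is set, averages them into a single input
# vector. The 4x3 matrix below stands in for `model.wv.syn0`.
import numpy as np
from numpy import sum as np_sum

syn0 = np.arange(12, dtype=np.float32).reshape(4, 3)  # 4 toy word vectors of size 3
word2_indices = [0, 2, 3]                             # hypothetical context word indices
cbow_mean = 1                                         # assumed model setting
l1 = np_sum(syn0[word2_indices], axis=0)              # 1 x vector_size
if word2_indices and cbow_mean:
    l1 /= len(word2_indices)
print(l1)  # mean of rows 0, 2 and 3
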