# numpy helpers and the shared CBOW training step these pure-Python code paths
# rely on (pre-4.0 gensim layout):
from numpy import sum as np_sum, add as np_add, concatenate, repeat as np_repeat

from gensim.models.word2vec import train_cbow_pair


def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field. See `train_document_dm_concat()` for the DM
    model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
        l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
        count = len(word2_indexes) + len(doctag_indexes)
        if model.cbow_mean and count > 1:
            l1 /= count
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if not model.cbow_mean and count > 1:
            neu1e /= count
        if learn_doctags:
            for i in doctag_indexes:
                doctag_vectors[i] += neu1e * doctag_locks[i]
        if learn_words:
            for i in word2_indexes:
                word_vectors[i] += neu1e * word_locks[i]

    return len(word_vocabs)
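# The window handling above is easy to misread: `reduced_window` shrinks the
# effective window symmetrically, so each center word trains against between 1
# and `window` neighbors per side, chosen at random. Below is a minimal,
# self-contained sketch of that context selection and of the sum-vs-mean
# projection controlled by `cbow_mean`; the names `effective_context` and
# `toy_vectors` are illustrative only, not gensim API.
#
# import numpy as np
#
# def effective_context(doc_len, pos, window, rng):
#     """Context positions train_document_dm would use for center `pos`."""
#     reduced_window = rng.randint(window)  # `b` in the original doc2vec code
#     start = max(0, pos - window + reduced_window)
#     stop = pos + window + 1 - reduced_window
#     return [p for p in range(start, min(stop, doc_len)) if p != pos]
#
# rng = np.random.RandomState(1)
# toy_vectors = rng.rand(10, 4)            # 10 "words", vector_size=4
# doctag_vec = rng.rand(1, 4)              # a single doctag vector
# ctx = effective_context(doc_len=10, pos=5, window=3, rng=rng)
# l1 = toy_vectors[ctx].sum(axis=0) + doctag_vec.sum(axis=0)
# count = len(ctx) + 1                     # context words + one doctag
# l1_mean = l1 / count                     # what `cbow_mean=1` feeds forward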
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """Update CBOW model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result
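# In the fastText CBOW step above, each context word contributes both its own
# vocab vector (`syn0_vocab`) and the vectors of its character n-grams
# (`syn0_ngrams`); `l1` sums (or averages) all of them. The helper below is a
# hedged sketch of the standard fastText n-gram scheme that populates
# `model.wv.ngrams_word` -- the word wrapped in '<'/'>' markers, then all
# substrings of length min_n..max_n -- not the actual gensim helper, whose
# exact boundary handling has varied across versions.
#
# def char_ngrams(word, min_n=3, max_n=6):
#     """All character n-grams of '<word>', fastText-style."""
#     extended = '<%s>' % word
#     ngrams = []
#     for n in range(min_n, min(max_n, len(extended)) + 1):
#         for i in range(len(extended) - n + 1):
#             ngrams.append(extended[i:i + n])
#     return ngrams
#
# # e.g. char_ngrams('night', 3, 4) -> ['<ni', 'nig', 'igh', 'ght', 'ht>',
# #                                     '<nig', 'nigh', 'ight', 'ght>']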
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                             learn_doctags=True, learn_words=True, learn_hidden=True,
                             word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document, using a
    concatenation of the context window word vectors (rather than a sum or average).

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
                   model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
    doctag_len = len(doctag_indexes)
    if doctag_len != model.dm_tag_count:
        return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

    null_word = model.wv.vocab['\0']
    pre_pad_count = model.window
    post_pad_count = model.window
    padded_document_indexes = (
        (pre_pad_count * [null_word.index])  # pre-padding
        + [word.index for word in word_vocabs if word is not None]  # elide out-of-vocabulary words
        + (post_pad_count * [null_word.index])  # post-padding
    )

    for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
        word_context_indexes = (
            padded_document_indexes[(pos - pre_pad_count): pos]  # preceding words
            + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
        )
        predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
        # numpy advanced-indexing copies; concatenate, flatten to 1d
        l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
        neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                learn_hidden=learn_hidden, learn_vectors=False)

        # filter by locks and shape for addition to source vectors
        e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
        neu1e_r = (neu1e.reshape(-1, model.vector_size)
                   * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

        if learn_doctags:
            np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
        if learn_words:
            np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

    return len(padded_document_indexes) - pre_pad_count - post_pad_count
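# In the concat variant above, the layer-1 input is a fixed-length
# concatenation: `dm_tag_count` doctag vectors plus 2 * `window` word vectors,
# flattened to (dm_tag_count + 2 * window) * vector_size floats -- which is why
# documents are null-padded on both sides. A minimal numpy sketch of that
# assembly and of the reshape that fans the error back out (toy sizes, lock
# factors omitted for brevity; not gensim API):
#
# import numpy as np
#
# window, vector_size, dm_tag_count = 2, 4, 1
# doctag_vectors = np.random.rand(3, vector_size)
# word_vectors = np.random.rand(10, vector_size)
#
# doctag_indexes = [0]
# word_context_indexes = [4, 5, 7, 8]    # `window` words each side of pos 6
#
# l1 = np.concatenate((doctag_vectors[doctag_indexes],
#                      word_vectors[word_context_indexes])).ravel()
# assert l1.shape == ((dm_tag_count + 2 * window) * vector_size,)
#
# # a stand-in backpropagated error of the same flattened shape...
# neu1e = np.random.rand(l1.shape[0])
# # ...reshaped so row 0 updates the doctag, rows 1..4 the context words
# neu1e_r = neu1e.reshape(-1, vector_size)
# np.add.at(doctag_vectors, doctag_indexes, neu1e_r[:dm_tag_count])
# np.add.at(word_vectors, word_context_indexes, neu1e_r[dm_tag_count:])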