def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True): """ Update distributed memory model by training on a single sentence. The sentence is a list of Vocab objects (or None, where the corresponding word is not in the vocabulary. Called internally from `Doc2Vec.train()`. This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version from doc2vec_inner instead. """ lbl_indices = [lbl.index for lbl in lbls if lbl is not None] if(len(lbl_indices) <= model.K):return 0 docIndxPos = int(model.index2word[lbl_indices[0]][5:]) topKTopics = argsort(model.w_ld[docIndxPos])[::-1][:4] selected_lbl_indices = [lbl_indices[0]]; for i in range(2): selected_lbl_indices.append(lbl_indices[topKTopics[i]+1]) lbl_sum = np_sum(model.syn0[lbl_indices[0]], axis=0) ## lbl_len = len(lbl_indices) lbl_len = 1 neg_labels = [] if model.negative: # precompute negative labels neg_labels = zeros(model.negative + 1) neg_labels[0] = 1. for pos, word in enumerate(sentence): if word is None: continue # OOV word in the input sentence => skip reduced_window = random.randint(model.window) # `b` in the original doc2vec code start = max(0, pos - model.window + reduced_window) window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start) word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum # 1 x layer1_size if word2_indices and model.cbow_mean: l1 /= (len(word2_indices) + lbl_len) neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words) if train_lbls: model.syn0[selected_lbl_indices[0]] += neu1e model.syn0[selected_lbl_indices[1:]] += (neu1e/model.noOfLabels) word2_indices.append(word.index) a_1 = np_sum(model.syn0[word2_indices], axis=0)/len(word2_indices) docIndxNeg = selectNegativeDocs(docIndxPos) myTrain(model, docIndxPos, docIndxNeg, a_1) return len([word for word in sentence if word is not None])
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): """ Update distributed memory model ("PV-DM") by training on a single document. Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This method implements the DM model with a projection (input) layer that is either the sum or mean of the context vectors, depending on the model's `dm_mean` configuration field. See `train_document_dm_concat()` for the DM model with a concatenated input layer. The document is provided as `doc_words`, a list of word tokens which are looked up in the model's vocab dictionary, and `doctag_indexes`, which provide indexes into the doctag_vectors array. Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to prevent learning-updates to those respective model weights, as if using the (partially-)frozen model to infer other compatible vectors. This is the non-optimized, Python version. If you have a C compiler, gensim will use the optimized version from doc2vec_inner instead. """ if word_vectors is None: word_vectors = model.syn0 if word_locks is None: word_locks = model.syn0_lockf if doctag_vectors is None: doctag_vectors = model.docvecs.doctag_syn0 if doctag_locks is None: doctag_locks = model.docvecs.doctag_syn0_lockf word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code start = max(0, pos - model.window + reduced_window) window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos] l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0) count = len(word2_indexes) + len(doctag_indexes) if model.cbow_mean and count > 1 : l1 /= count neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha, learn_vectors=False, learn_hidden=learn_hidden) if not model.cbow_mean and count > 1: neu1e /= count if learn_doctags: for i in doctag_indexes: doctag_vectors[i] += neu1e * doctag_locks[i] if learn_words: for i in word2_indexes: word_vectors[i] += neu1e * word_locks[i] return len(word_vocabs)
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): """ Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context window word vectors (rather than a sum or average). Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. The document is provided as `doc_words`, a list of word tokens which are looked up in the model's vocab dictionary, and `doctag_indexes`, which provide indexes into the doctag_vectors array. Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to prevent learning-updates to those respective model weights, as if using the (partially-)frozen model to infer other compatible vectors. This is the non-optimized, Python version. If you have a C compiler, gensim will use the optimized version from doc2vec_inner instead. """ if word_vectors is None: word_vectors = model.syn0 if word_locks is None: word_locks = model.syn0_lockf if doctag_vectors is None: doctag_vectors = model.docvecs.doctag_syn0 if doctag_locks is None: doctag_locks = model.docvecs.doctag_syn0_lockf word_vocabs = [ model.vocab[w] for w in doc_words if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32 ] doctag_len = len(doctag_indexes) if doctag_len != model.dm_tag_count: return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?) null_word = model.vocab['\0'] pre_pad_count = model.window post_pad_count = model.window padded_document_indexes = ( (pre_pad_count * [null_word.index]) # pre-padding + [word.index for word in word_vocabs if word is not None ] # elide out-of-Vocabulary words + (post_pad_count * [null_word.index]) # post-padding ) for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count): word_context_indexes = ( padded_document_indexes[(pos - pre_pad_count):pos] # preceding words + padded_document_indexes[ (pos + 1):(pos + 1 + post_pad_count)] # following words ) word_context_len = len(word_context_indexes) predict_word = model.vocab[model.index2word[ padded_document_indexes[pos]]] # numpy advanced-indexing copies; concatenate, flatten to 1d l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel() neu1e = train_cbow_pair(model, predict_word, None, l1, alpha, learn_hidden=learn_hidden, learn_vectors=False) # filter by locks and shape for addition to source vectors e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes])) neu1e_r = (neu1e.reshape(-1, model.vector_size) * np_repeat( e_locks, model.vector_size).reshape(-1, model.vector_size)) if learn_doctags: np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len]) if learn_words: np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:]) return len(padded_document_indexes) - pre_pad_count - post_pad_count
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): """ Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context window word vectors (rather than a sum or average). Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. The document is provided as `doc_words`, a list of word tokens which are looked up in the model's vocab dictionary, and `doctag_indexes`, which provide indexes into the doctag_vectors array. Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to prevent learning-updates to those respective model weights, as if using the (partially-)frozen model to infer other compatible vectors. This is the non-optimized, Python version. If you have a C compiler, gensim will use the optimized version from doc2vec_inner instead. """ if word_vectors is None: word_vectors = model.syn0 if word_locks is None: word_locks = model.syn0_lockf if doctag_vectors is None: doctag_vectors = model.docvecs.doctag_syn0 if doctag_locks is None: doctag_locks = model.docvecs.doctag_syn0_lockf word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32] doctag_len = len(doctag_indexes) if doctag_len != model.dm_tag_count: return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?) null_word = model.vocab['\0'] pre_pad_count = model.window post_pad_count = model.window padded_document_indexes = ( (pre_pad_count * [null_word.index]) # pre-padding + [word.index for word in word_vocabs if word is not None] # elide out-of-Vocabulary words + (post_pad_count * [null_word.index]) # post-padding ) for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count): word_context_indexes = ( padded_document_indexes[(pos - pre_pad_count): pos] # preceding words + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words ) word_context_len = len(word_context_indexes) predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]] # numpy advanced-indexing copies; concatenate, flatten to 1d l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel() neu1e = train_cbow_pair(model, predict_word, None, l1, alpha, learn_hidden=learn_hidden, learn_vectors=False) # filter by locks and shape for addition to source vectors e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes])) neu1e_r = (neu1e.reshape(-1, model.vector_size) * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size)) if learn_doctags: np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len]) if learn_words: np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:]) return len(padded_document_indexes) - pre_pad_count - post_pad_count