Code example #1
def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
    """
    Update distributed memory model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    
    lbl_indices = [lbl.index for lbl in lbls if lbl is not None]

    # require more than model.K label entries (the document label plus topic labels)
    if len(lbl_indices) <= model.K:
        return 0

    # recover the document index from its label name, assuming a fixed
    # 5-character prefix (e.g. "SENT_"), then rank that document's topics
    docIndxPos = int(model.index2word[lbl_indices[0]][5:])
    topKTopics = argsort(model.w_ld[docIndxPos])[::-1][:4]

    # train the document's own label plus its two strongest topic labels
    selected_lbl_indices = [lbl_indices[0]]
    for i in range(2):
        selected_lbl_indices.append(lbl_indices[topKTopics[i] + 1])

    
    # upstream gensim summed all label vectors here (lbl_len = len(lbl_indices));
    # this variant projects only the document's own label vector, so the count is 1
    lbl_sum = np_sum(model.syn0[lbl_indices[0]], axis=0)
    lbl_len = 1
    neg_labels = []
    if model.negative:
        # precompute negative labels
        neg_labels = zeros(model.negative + 1)
        neg_labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        
        l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + lbl_len)
        neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels,
                                train_words, train_words)
        if train_lbls:
            # the document label takes the full gradient; the selected topic
            # labels receive it scaled down by the model's total label count
            model.syn0[selected_lbl_indices[0]] += neu1e
            model.syn0[selected_lbl_indices[1:]] += neu1e / model.noOfLabels
            
        # mean of the context words plus the current word, apparently used as a
        # document representation for an extra training step against a negative document
        word2_indices.append(word.index)
        a_1 = np_sum(model.syn0[word2_indices], axis=0) / len(word2_indices)

        # selectNegativeDocs and myTrain are project-specific helpers defined elsewhere
        docIndxNeg = selectNegativeDocs(docIndxPos)
        myTrain(model, docIndxPos, docIndxNeg, a_1)

    return len([word for word in sentence if word is not None])
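
Example #1 grafts a topic-label step onto gensim's PV-DM loop; helpers such as `model.w_ld`, `model.K`, `selectNegativeDocs`, and `myTrain` are project-specific and defined elsewhere in that repository. The top-K topic selection relies on a standard numpy idiom; here is a minimal, self-contained sketch of just that idiom (the weights and `k` are invented for illustration):

import numpy as np

# hypothetical per-document topic weights, i.e. one row of a matrix like model.w_ld
topic_weights = np.array([0.05, 0.40, 0.10, 0.30, 0.15])

k = 4
# argsort sorts ascending; [::-1] flips to descending; [:k] keeps the k best
top_k = np.argsort(topic_weights)[::-1][:k]
print(top_k)  # [1 3 4 2] -- indexes of the k largest weights, strongest first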
Code example #2
File: doc2vec.py  Project: psr6275/semi-Doc2vec
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field.  See `train_document_dm_concat()` for the DM
    model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                   model.vocab[w].sample_int > model.random.rand() * 2**32]

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
        l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
        count = len(word2_indexes) + len(doctag_indexes)
        if model.cbow_mean and count > 1:
            l1 /= count
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if not model.cbow_mean and count > 1:
            neu1e /= count
        if learn_doctags:
            for i in doctag_indexes:
                doctag_vectors[i] += neu1e * doctag_locks[i]
        if learn_words:
            for i in word2_indexes:
                word_vectors[i] += neu1e * word_locks[i]

    return len(word_vocabs)
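
The per-position `reduced_window` draw is what makes gensim's window a maximum: each target word sees a window shrunk by a random amount, which on average weights nearer words more heavily. A minimal sketch of the same slicing, using plain strings in place of Vocab objects and the stdlib RNG in place of model.random:

import random

tokens = ["the", "quick", "brown", "fox", "jumps", "over", "the", "dog"]
window = 3
pos = 4  # predict "jumps"

reduced_window = random.randrange(window)  # like model.random.randint(model.window)
start = max(0, pos - window + reduced_window)
context = [(p, w) for p, w in enumerate(tokens[start:pos + window + 1 - reduced_window], start)
           if p != pos]
print(context)  # neighbors inside the shrunken window, target excluded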
Code example #3
File: doc2vec.py  Project: bailingwjw/OpinionMining
    def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                          learn_doctags=True, learn_words=True, learn_hidden=True,
                          word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed memory model ("PV-DM") by training on a single document.

        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
        method implements the DM model with a projection (input) layer that is
        either the sum or mean of the context vectors, depending on the model's
        `dm_mean` configuration field.  See `train_document_dm_concat()` for the DM
        model with a concatenated input layer.

        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.

        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        if word_vectors is None:
            word_vectors = model.syn0
        if word_locks is None:
            word_locks = model.syn0_lockf
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]

        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
            l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
            count = len(word2_indexes) + len(doctag_indexes)
            if model.cbow_mean and count > 1:
                l1 /= count
            neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                    learn_vectors=False, learn_hidden=learn_hidden)
            if not model.cbow_mean and count > 1:
                neu1e /= count
            if learn_doctags:
                for i in doctag_indexes:
                    doctag_vectors[i] += neu1e * doctag_locks[i]
            if learn_words:
                for i in word2_indexes:
                    word_vectors[i] += neu1e * word_locks[i]

        return len(word_vocabs)
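
Examples #2 and #3 are the same upstream gensim function; both build `word_vocabs` with the same frequent-word downsampling test. Each vocabulary entry's `sample_int` stores its retention probability scaled by 2**32, so comparing it against a uniform draw times 2**32 keeps the word with exactly that probability. A small sketch of the test in isolation, with an invented `sample_int`:

import random

sample_int = int(0.25 * 2**32)  # hypothetical: keep this word ~25% of the time

trials = 100000
kept = sum(sample_int > random.random() * 2**32 for _ in range(trials))
print(kept / trials)  # ~0.25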
Code example #4
File: doc2vec.py  Project: yangtaoxf/OpinionMining
    def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                                 learn_doctags=True, learn_words=True, learn_hidden=True,
                                 word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed memory model ("PV-DM") by training on a single document, using a
        concatenation of the context window word vectors (rather than a sum or average).

        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.

        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        if word_vectors is None:
            word_vectors = model.syn0
        if word_locks is None:
            word_locks = model.syn0_lockf
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        doctag_len = len(doctag_indexes)
        if doctag_len != model.dm_tag_count:
            return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

        null_word = model.vocab['\0']
        pre_pad_count = model.window
        post_pad_count = model.window
        padded_document_indexes = (
            (pre_pad_count * [null_word.index])  # pre-padding
            + [word.index for word in word_vocabs if word is not None]  # elide out-of-Vocabulary words
            + (post_pad_count * [null_word.index])  # post-padding
        )

        for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
            word_context_indexes = (
                padded_document_indexes[(pos - pre_pad_count):pos]  # preceding words
                + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
            )
            word_context_len = len(word_context_indexes)
            predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
            # numpy advanced-indexing copies; concatenate, flatten to 1d
            l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
            neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                    learn_hidden=learn_hidden, learn_vectors=False)

            # filter by locks and shape for addition to source vectors
            e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
            neu1e_r = (neu1e.reshape(-1, model.vector_size)
                       * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

            if learn_doctags:
                np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
            if learn_words:
                np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

        return len(padded_document_indexes) - pre_pad_count - post_pad_count
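
The concatenated-input model needs a fixed-size projection layer, so the document is padded on both sides with the null word and every position contributes exactly `2 * window` context indexes. A toy sketch of that indexing (the word indexes and null index are invented):

window = 2
null_index = 0        # stand-in for model.vocab['\0'].index
doc = [7, 3, 9]       # hypothetical in-vocabulary word indexes

padded = window * [null_index] + doc + window * [null_index]
pos = window + 1      # the middle word, index 3
context = padded[pos - window:pos] + padded[pos + 1:pos + 1 + window]
print(context)        # [0, 7, 9, 0] -- always exactly 2*window entries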
Code example #5
File: doc2vec.py  Project: bailingwjw/OpinionMining
    def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                                 learn_doctags=True, learn_words=True, learn_hidden=True,
                                 word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed memory model ("PV-DM") by training on a single document, using a
        concatenation of the context window word vectors (rather than a sum or average).

        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.

        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        if word_vectors is None:
            word_vectors = model.syn0
        if word_locks is None:
            word_locks = model.syn0_lockf
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        doctag_len = len(doctag_indexes)
        if doctag_len != model.dm_tag_count:
            return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

        null_word = model.vocab['\0']
        pre_pad_count = model.window
        post_pad_count = model.window
        padded_document_indexes = (
            (pre_pad_count * [null_word.index])  # pre-padding
            + [word.index for word in word_vocabs if word is not None]  # elide out-of-Vocabulary words
            + (post_pad_count * [null_word.index])  # post-padding
        )

        for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
            word_context_indexes = (
                padded_document_indexes[(pos - pre_pad_count): pos]  # preceding words
                + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
            )
            word_context_len = len(word_context_indexes)
            predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
            # numpy advanced-indexing copies; concatenate, flatten to 1d
            l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
            neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                    learn_hidden=learn_hidden, learn_vectors=False)

            # filter by locks and shape for addition to source vectors
            e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
            neu1e_r = (neu1e.reshape(-1, model.vector_size)
                       * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

            if learn_doctags:
                np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
            if learn_words:
                np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

        return len(padded_document_indexes) - pre_pad_count - post_pad_count
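
Both concat variants apply the gradient with `np_add.at` (numpy's `np.add.at`) instead of fancy-indexed `+=`. The distinction matters when an index repeats, e.g. the same word appearing twice in one context window: `+=` applies the update once per unique index, while `np.add.at` accumulates every occurrence. A minimal demonstration:

import numpy as np

indexes = [1, 1, 2]            # index 1 repeats
updates = np.ones((3, 2))

vectors = np.zeros((3, 2))
vectors[indexes] += updates    # buffered: row 1 receives only one update
print(vectors[1])              # [1. 1.]

vectors = np.zeros((3, 2))
np.add.at(vectors, indexes, updates)  # unbuffered: both updates accumulate
print(vectors[1])              # [2. 2.]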