Example #1
def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    """
    word_vocabs = [
        model.vocab[w] for w in sentence if w in model.vocab
        and model.vocab[w].sample_int > model.random.rand() * 2**32
    ]
    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(
            model.window)  # `b` in the original word2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(
            word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indices = [
            word2.index for pos2, word2 in window_pos
            if (word2 is not None and pos2 != pos)
        ]
        l1 = word2vec.np_sum(model.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        word2vec.train_cbow_pair(model, word, word2_indices, l1, alpha)

    return len(word_vocabs)
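The `sample_int > model.random.rand() * 2**32` filter above implements downsampling of frequent words: `sample_int` stores the word's keep-probability scaled to the 32-bit integer range, so comparing it against a uniform random draw keeps the word with exactly that probability. A minimal standalone sketch of the same check (the 0.8 keep-probability is an arbitrary illustrative value, not taken from any model):

import numpy as np

rng = np.random.RandomState(1)
keep_probability = 0.8                      # hypothetical value for a frequent word
sample_int = int(keep_probability * 2**32)  # how the vocab entry stores it

draws = rng.rand(100000) * 2**32            # uniform draws on [0, 2**32)
kept = np.sum(sample_int > draws)
print(kept / 100000.0)                      # ~0.8: the word survives about 80% of the time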
Example #2
def train_batch_labeled_cbow(model,
                             sentences,
                             alpha,
                             work=None,
                             neu1=None):
    result = 0
    for sentence in sentences:
        document, target = sentence
        word_vocabs = [
            model.wv.vocab[w] for w in document if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        target_vocabs = [
            model.lvocab[t] for t in target if t in model.lvocab
        ]
        for target in target_vocabs:
            word2_indices = [w.index for w in word_vocabs]
            l1 = np_sum(model.wv.syn0[word2_indices],
                        axis=0)  # 1 x vector_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            if model.softmax:
                train_cbow_pair_softmax(model, target, word2_indices, l1,
                                        alpha)
            else:
                train_cbow_pair(model, target, word2_indices, l1, alpha)
        result += len(word_vocabs)
    return result
Example #3
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)  # train on the sliding window for target word
        result += len(word_vocabs)
    return result
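The `model.wv.ngrams_word[...]` lookups above map each context word to its character n-grams, which is how FastText shares statistical strength across morphologically related words. A minimal sketch of that extraction (min_n/max_n mirror FastText's usual defaults of 3 and 6; the helper name is invented for illustration):

def char_ngrams(word, min_n=3, max_n=6):
    # FastText-style character n-grams: wrap the word in boundary markers,
    # then take every substring of length min_n..max_n.
    extended = '<' + word + '>'
    ngrams = []
    for n in range(min_n, max_n + 1):
        for i in range(len(extended) - n + 1):
            ngrams.append(extended[i:i + n])
    return ngrams

print(char_ngrams("where", 3, 4))
# ['<wh', 'whe', 'her', 'ere', 're>', '<whe', 'wher', 'here', 'ere>']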
Example #4
    def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
        """Update CBOW model by training on a sequence of sentences.

        Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

        Notes
        -----
        This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version
        from :mod:`gensim.models.fasttext_inner` instead.

        Parameters
        ----------
        model : :class:`~gensim.models.fasttext.FastText`
            Model instance.
        sentences : iterable of list of str
            Iterable of the sentences.
        alpha : float
            Learning rate.
        work : :class:`numpy.ndarray`, optional
            UNUSED.
        neu1 : :class:`numpy.ndarray`, optional
            UNUSED.
        Returns
        -------
        int
            Effective number of words trained.

        """
        result = 0
        for sentence in sentences:
            word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                           model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)
                start = max(0, pos - model.window + reduced_window)
                window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
                word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

                vocab_subwords_indices = []
                ngrams_subwords_indices = []

                for index in word2_indices:
                    vocab_subwords_indices += [index]
                    ngrams_subwords_indices.extend(model.wv.buckets_word[index])

                l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
                l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

                l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
                subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
                if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                    l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

                # train on the sliding window for target word
                train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
            result += len(word_vocabs)
        return result
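`buckets_word[index]` above holds, for each vocabulary word, the precomputed rows of `syn0_ngrams` for that word's character n-grams. FastText obtains those rows by hashing each n-gram into a fixed number of buckets; the sketch below uses a plain 32-bit FNV-1a hash for illustration and is not necessarily the exact hash gensim/fastText use:

def fnv1a_hash(text):
    # Standard 32-bit FNV-1a over the UTF-8 bytes of the n-gram.
    h = 2166136261
    for byte in text.encode('utf-8'):
        h = (h ^ byte) * 16777619 & 0xffffffff
    return h

def ngram_buckets(ngrams, num_buckets=2000000):
    # Map every n-gram to a row index in the (num_buckets x vector_size)
    # n-gram matrix; hash collisions are tolerated by design.
    return [fnv1a_hash(ng) % num_buckets for ng in ngrams]

print(ngram_buckets(['<wh', 'whe', 'her'], num_buckets=10))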
Example #5
    def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
        """
        Update distributed memory model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
        lbl_sum = np_sum(model.syn0[lbl_indices], axis=0)
        lbl_len = len(lbl_indices)
        neg_labels = []
        if model.negative:
            # precompute negative labels
            neg_labels = zeros(model.negative + 1)
            neg_labels[0] = 1.

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            reduced_window = random.randint(model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
            l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum  # 1 x layer1_size
            if word2_indices and model.cbow_mean:
                l1 /= (len(word2_indices) + lbl_len)
            neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words)
            if train_lbls:
                model.syn0[lbl_indices] += neu1e

        return len([word for word in sentence if word is not None])
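The `reduced_window` draw above implements word2vec's window shrinking: for each target position the effective window is `window - b`, where `b` is uniform in `[0, window)`, so nearer words are sampled as context more often on average. A small standalone sketch of the same slice arithmetic on a toy token list (plain strings stand in for Vocab objects):

import numpy as np

tokens = ["the", "quick", "brown", "fox", "jumps"]
window = 2
rng = np.random.RandomState(42)

for pos, word in enumerate(tokens):
    reduced_window = rng.randint(window)        # `b` in the original code
    start = max(0, pos - window + reduced_window)
    context = [w for pos2, w in enumerate(tokens[start:pos + window + 1 - reduced_window], start)
               if pos2 != pos]
    print(word, "->", context)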
Example #6
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(
                word_vocabs[start:(pos + model.window + 1 - reduced_window)],
                start)
            word2_indices = [
                word2.index for pos2, word2 in window_pos
                if (word2 is not None and pos2 != pos)
            ]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[
                    model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices],
                              axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices],
                               axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices
                                ] + [ngrams_subwords_indices]
            if (subwords_indices[0]
                    or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            train_cbow_pair(
                model, word, subwords_indices, l1, alpha,
                is_ft=True)  # train on the sliding window for target word
        result += len(word_vocabs)
    return result
Example #7
    def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                          learn_doctags=True, learn_words=True, learn_hidden=True,
                          word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed memory model ("PV-DM") by training on a single document.

        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
        method implements the DM model with a projection (input) layer that is
        either the sum or mean of the context vectors, depending on the model's
        `dm_mean` configuration field.  See `train_dm_concat()` for the DM model
        with a concatenated input layer.

        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.

        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        if word_vectors is None:
            word_vectors = model.syn0
        if word_locks is None:
            word_locks = model.syn0_lockf
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        doctag_sum = np_sum(doctag_vectors[doctag_indexes], axis=0)
        doctag_len = len(doctag_indexes)

        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indexes = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
            l1 = np_sum(word_vectors[word2_indexes], axis=0) + doctag_sum  # 1 x layer1_size
            if word2_indexes and model.cbow_mean:
                l1 /= (len(word2_indexes) + doctag_len)
            neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                    learn_vectors=False, learn_hidden=learn_hidden)
            if word2_indexes and not model.cbow_mean:
                neu1e /= (len(word2_indexes) + doctag_len)
            if learn_doctags:
                doctag_vectors[doctag_indexes] += neu1e * \
                    np_repeat(doctag_locks[doctag_indexes], model.vector_size).reshape(-1, model.vector_size)
            if learn_words:
                word_vectors[word2_indexes] += neu1e * \
                    np_repeat(word_locks[word2_indexes], model.vector_size).reshape(-1, model.vector_size)

        return len(word_vocabs)
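The `np_repeat(...).reshape(-1, vector_size)` expression above expands the per-vector lock factors (0.0 frozen, 1.0 trainable) to the shape of the update, so frozen doctag or word rows receive no gradient. A tiny numpy demonstration of that broadcasting trick with made-up sizes:

import numpy as np

vector_size = 4
neu1e = np.ones(vector_size)                  # pretend back-propagated error
locks = np.array([1.0, 0.0, 1.0])             # second vector is frozen
rows = np.arange(3)                           # indexes of the vectors to update

vectors = np.zeros((3, vector_size))
vectors[rows] += neu1e * np.repeat(locks, vector_size).reshape(-1, vector_size)
print(vectors)   # row 1 stays all zeros, rows 0 and 2 receive the update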
Example #8
    def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                          learn_doctags=True, learn_words=True, learn_hidden=True,
                          word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed memory model ("PV-DM") by training on a single document.

        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
        method implements the DM model with a projection (input) layer that is
        either the sum or mean of the context vectors, depending on the model's
        `dm_mean` configuration field.  See `train_document_dm_concat()` for the DM
        model with a concatenated input layer.

        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.

        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        if word_vectors is None:
            word_vectors = model.syn0
        if word_locks is None:
            word_locks = model.syn0_lockf
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]

        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
            l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
            count = len(word2_indexes) + len(doctag_indexes)
            if model.cbow_mean and count > 1:
                l1 /= count
            neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                    learn_vectors=False, learn_hidden=learn_hidden)
            if not model.cbow_mean and count > 1:
                neu1e /= count
            if learn_doctags:
                for i in doctag_indexes:
                    doctag_vectors[i] += neu1e * doctag_locks[i]
            if learn_words:
                for i in word2_indexes:
                    word_vectors[i] += neu1e * word_locks[i]

        return len(word_vocabs)
Example #9
def train_batch_labeled_cbow(model, tagged_docs, alpha, work=None, neu1=None):
    result = 0
    for tagged_doc in tagged_docs:
        document, target = tagged_doc
        word_vocabs = [model.wv.vocab[w] for w in document if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
        target_vocabs = [model.lvocab[t] for t in target if t in model.lvocab]
        for target in target_vocabs:
            word2_indices = [w.index for w in word_vocabs]
            l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            if model.softmax:
                train_cbow_pair_softmax(model, target, word2_indices, l1, alpha)
            else:
                train_cbow_pair(model, target, word2_indices, l1, alpha)
        result += len(word_vocabs)
    return result
Example #10
    def train_sentence_dm(model,
                          sentence,
                          lbls,
                          alpha,
                          work=None,
                          neu1=None,
                          train_words=True,
                          train_lbls=True):
        """
        Update distributed memory model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
        lbl_sum = np_sum(model.syn0[lbl_indices], axis=0)
        lbl_len = len(lbl_indices)
        neg_labels = []
        if model.negative:
            # precompute negative labels
            neg_labels = zeros(model.negative + 1)
            neg_labels[0] = 1.

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            reduced_window = random.randint(
                model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(
                sentence[start:pos + model.window + 1 - reduced_window], start)
            word2_indices = [
                word2.index for pos2, word2 in window_pos
                if (word2 is not None and pos2 != pos)
            ]
            l1 = np_sum(model.syn0[word2_indices],
                        axis=0) + lbl_sum  # 1 x layer1_size
            if word2_indices and model.cbow_mean:
                l1 /= (len(word2_indices) + lbl_len)
            neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha,
                                    neg_labels, train_words, train_words)
            if train_lbls:
                model.syn0[lbl_indices] += neu1e

        return len([word for word in sentence if word is not None])
Example #11
    def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                                 learn_doctags=True, learn_words=True, learn_hidden=True,
                                 word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
        """
        Update distributed memory model ("PV-DM") by training on a single document, using a
        concatenation of the context window word vectors (rather than a sum or average).

        Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

        The document is provided as `doc_words`, a list of word tokens which are looked up
        in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
        into the doctag_vectors array.

        Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
        prevent learning-updates to those respective model weights, as if using the
        (partially-)frozen model to infer other compatible vectors.

        This is the non-optimized, Python version. If you have a C compiler, gensim
        will use the optimized version from doc2vec_inner instead.

        """
        if word_vectors is None:
            word_vectors = model.syn0
        if word_locks is None:
            word_locks = model.syn0_lockf
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        doctag_len = len(doctag_indexes)
        if doctag_len != model.dm_tag_count:
            return 0  # skip doc without expected number of doctag(s) (TODO: warn/pad?)

        null_word = model.vocab['\0']
        pre_pad_count = model.window
        post_pad_count = model.window
        padded_document_indexes = (
            (pre_pad_count * [null_word.index])  # pre-padding
            + [word.index for word in word_vocabs if word is not None]  # elide out-of-Vocabulary words
            + (post_pad_count * [null_word.index])  # post-padding
        )

        for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
            word_context_indexes = (
                padded_document_indexes[(pos - pre_pad_count): pos]  # preceding words
                + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
            )
            word_context_len = len(word_context_indexes)
            predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
            # numpy advanced-indexing copies; concatenate, flatten to 1d
            l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
            neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
                                    learn_hidden=learn_hidden, learn_vectors=False)

            # filter by locks and shape for addition to source vectors
            e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
            neu1e_r = (neu1e.reshape(-1, model.vector_size)
                       * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size))

            if learn_doctags:
                np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
            if learn_words:
                np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])

        return len(padded_document_indexes) - pre_pad_count - post_pad_count
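In the concatenated DM variant above, the input layer `l1` is not a single averaged vector: it is the doctag vector(s) followed by the `2 * window` padded context word vectors, flattened into one long row of length `(dm_tag_count + 2 * window) * vector_size`. A shape-only sketch with arbitrary small dimensions:

import numpy as np

vector_size, window, dm_tag_count = 5, 3, 1
doctag_vectors = np.random.rand(dm_tag_count, vector_size)
context_vectors = np.random.rand(2 * window, vector_size)   # pre- and post-window words

l1 = np.concatenate((doctag_vectors, context_vectors)).ravel()
print(l1.shape)   # (35,) == (dm_tag_count + 2 * window) * vector_size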
Example #12
def train_batch(model,
                sentences,
                alpha,
                work=None,
                neu1=None,
                compute_loss=False):
    """Update CBOW model by training on a sequence of sentences.

    Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler, Gensim
    will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.word2vec.Word2Vec`
        The Word2Vec model instance to train.
    sentences : iterable of list of str
        The corpus used to train the model.
    alpha : float
        The learning rate.
    work : object, optional
        Unused.
    neu1 : object, optional
        Unused.
    compute_loss : bool, optional
        Whether or not the training loss should be computed in this batch.

    Returns
    -------
    int
        Number of words in the vocabulary actually used for training (words that already existed in the vocabulary
        and were not discarded by downsampling of frequent words).

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        if not word_vocabs:
            continue  # nothing left after vocab filtering/downsampling => skip sentence
        word = word_vocabs[0]
        start = 1
        window_pos = enumerate(word_vocabs[start:], start)
        word2_indices = [
            word2.index for pos2, word2 in window_pos if (word2 is not None)
        ]
        l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model,
                        word,
                        word2_indices,
                        l1,
                        alpha,
                        compute_loss=compute_loss)
        for word2idx in word2_indices:
            train_sg_pair(model,
                          model.wv.index2word[word.index],
                          word2idx,
                          alpha,
                          compute_loss=compute_loss)
            train_sg_pair(model,
                          model.wv.index2word[word2idx],
                          word.index,
                          alpha,
                          compute_loss=compute_loss)

        result += len(word_vocabs)
    return result
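Most of the examples above delegate the actual gradient step to `train_cbow_pair`, which is not shown here. As a rough, self-contained sketch of what that step does in the negative-sampling case (plain numpy arrays stand in for the model's `syn0`/`syn1neg` matrices, and the noise-word indices are passed in rather than drawn from a cumulative frequency table):

import numpy as np

def cbow_pair_step(syn0, syn1neg, context_indices, target_index, noise_indices,
                   alpha, cbow_mean=True):
    # Forward pass: project the context words onto one hidden vector.
    l1 = syn0[context_indices].sum(axis=0)
    if cbow_mean and len(context_indices):
        l1 /= len(context_indices)

    # One positive target plus k sampled noise words.
    word_indices = [target_index] + list(noise_indices)
    labels = np.zeros(len(word_indices))
    labels[0] = 1.0

    l2 = syn1neg[word_indices]                   # (k+1) x vector_size output vectors
    scores = 1.0 / (1.0 + np.exp(-l2.dot(l1)))   # sigmoid of the dot products
    g = (labels - scores) * alpha                # gradient scaled by the learning rate

    syn1neg[word_indices] += np.outer(g, l1)     # update hidden->output weights
    neu1e = g.dot(l2)                            # error to propagate back to the inputs
    syn0[context_indices] += neu1e               # update the context word vectors
    return neu1e

rng = np.random.RandomState(0)
syn0 = rng.rand(10, 4) * 0.1
syn1neg = np.zeros((10, 4))
cbow_pair_step(syn0, syn1neg, context_indices=[1, 2, 3], target_index=4,
               noise_indices=[7, 8], alpha=0.025)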