Example #1
0
    def train_document_dbow(
        model,
        doc_words,
        doctag_indexes,
        alpha,
        work=None,
        train_words=False,
        learn_doctags=True,
        learn_words=True,
        learn_hidden=True,
        word_vectors=None,
        word_locks=None,
        doctag_vectors=None,
        doctag_locks=None,
    ):
        """
        Train a distributed bag-of-words model ("PV-DBOW") on one document.

        Invoked internally by `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

        `doc_words` is the document as a list of word tokens (resolved through the
        model's vocabulary), while `doctag_indexes` locates the document's tag rows
        inside the doctag_vectors array.

        With `train_words` set, ordinary word-to-word skip-gram examples are trained
        alongside the doc-to-word ones, exactly as Word2Vec would; otherwise word
        vectors are left untouched and unread during DBOW doc-vector training.

        Setting any of `learn_doctags', `learn_words`, or `learn_hidden` to False
        freezes the corresponding weights, which lets a partially-frozen model be
        used to infer new compatible vectors.

        Pure-Python fallback: when cython is available, gensim substitutes the
        optimized doc2vec_inner implementation.

        """
        # Default to the model's own doctag arrays when the caller supplies none.
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        # Optionally interleave plain skip-gram word training.
        if train_words and learn_words:
            train_sentence_sg(model, doc_words, alpha, work)

        # Each doctag acts as a skip-gram "context" predicting every word
        # of the document.
        for tag_idx in doctag_indexes:
            for token in doc_words:
                train_sg_pair(
                    model, token, tag_idx, alpha,
                    learn_vectors=learn_doctags,
                    learn_hidden=learn_hidden,
                    context_vectors=doctag_vectors,
                    context_locks=doctag_locks,
                )

        return len(doc_words)
Example #2
0
    def train_document_dbow(model,
                            doc_words,
                            doctag_indexes,
                            alpha,
                            work=None,
                            train_words=False,
                            learn_doctags=True,
                            learn_words=True,
                            learn_hidden=True,
                            word_vectors=None,
                            word_locks=None,
                            doctag_vectors=None,
                            doctag_locks=None):
        """
        Update a PV-DBOW ("distributed bag of words") model from a single document.

        Used internally by `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

        Parameters of note: `doc_words` is the document's token list (looked up in
        the model vocabulary); `doctag_indexes` gives the rows of the doctag_vectors
        array belonging to this document's tags.

        When `train_words` is True, word-to-word skip-gram pairs are trained as
        well, just like Word2Vec; without it, word vectors are neither read nor
        written during DBOW training.

        `learn_doctags`, `learn_words`, and `learn_hidden` may each be set False to
        freeze the respective weights — useful for inferring vectors against a
        (partially) frozen model.

        This is the plain-Python path; with cython installed, gensim uses the
        faster doc2vec_inner version instead.

        """
        # Resolve missing doctag arrays from the model itself.
        if doctag_vectors is None:
            doctag_vectors = model.docvecs.doctag_syn0
        if doctag_locks is None:
            doctag_locks = model.docvecs.doctag_syn0_lockf

        # Simultaneous word-vector training, if both flags permit it.
        if train_words and learn_words:
            train_sentence_sg(model, doc_words, alpha, work)

        # Train one (doctag -> word) skip-gram pair for every combination of
        # document tag and document word.
        for tag in doctag_indexes:
            for word in doc_words:
                train_sg_pair(model,
                              word,
                              tag,
                              alpha,
                              learn_vectors=learn_doctags,
                              learn_hidden=learn_hidden,
                              context_vectors=doctag_vectors,
                              context_locks=doctag_locks)

        return len(doc_words)