def train_document_dbow(model,
                        doc_words,
                        doctag_indexes,
                        alpha,
                        work=None,
                        train_words=False,
                        learn_doctags=True,
                        learn_words=True,
                        learn_hidden=True,
                        word_vectors=None,
                        word_locks=None,
                        doctag_vectors=None,
                        doctag_locks=None):
    """
    Update distributed bag of words model ("PV-DBOW") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    If `train_words` is True, simultaneously train word-to-word (not just doc-to-word)
    examples, exactly as per Word2Vec skip-gram training. (Without this option,
    word vectors are neither consulted nor updated during DBOW doc vector training.)

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, pure-Python version. If you have Cython installed, gensim
    will use the optimized version from `doc2vec_inner` instead.

    """
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

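    # Optionally update the word vectors too, via ordinary Word2Vec skip-gram
    # training over this document, before the doc-vector updates below.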
    if train_words and learn_words:
        train_batch_sg(model, [doc_words], alpha, work)
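    # Core PV-DBOW step: train every (doctag, word) pair through the skip-gram
    # pair machinery, with the doctag vector standing in for the context word,
    # so each doc vector learns to predict all the words in its document.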
    for doctag_index in doctag_indexes:
        for word in doc_words:
            train_sg_pair(model,
                          word,
                          doctag_index,
                          alpha,
                          learn_vectors=learn_doctags,
                          learn_hidden=learn_hidden,
                          context_vectors=doctag_vectors,
                          context_locks=doctag_locks)

    return len(doc_words)
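A minimal, hypothetical sketch of exercising this function directly on a toy
corpus. It assumes an older gensim release (the era in which
`docvecs.doctag_syn0` existed, hence the `size`/`iter` constructor arguments);
the corpus and all parameter values here are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tiny illustrative corpus; real training data would be far larger.
corpus = [
    TaggedDocument(words=["the", "cat", "sat", "on", "the", "mat"], tags=[0]),
    TaggedDocument(words=["the", "dog", "ate", "my", "homework"], tags=[1]),
]

# dm=0 selects PV-DBOW; min_count=1 keeps every toy word in the vocab.
model = Doc2Vec(corpus, dm=0, size=20, min_count=1, iter=5)

# One extra manual PV-DBOW pass over document 0 at a fixed learning rate.
train_document_dbow(model, corpus[0].words, [0], alpha=0.025)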