Example #1
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Update skip-gram model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
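        # keep only in-vocabulary tokens that survive frequency downsampling
        # (sample_int is the word's keep-probability pre-scaled to the 2**32 integer range)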
        word_vocabs = [
            model.wv.vocab[w] for w in sentence if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
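
For orientation, here is a minimal, hypothetical sketch of driving this pure-Python path by hand. It assumes an older gensim release (roughly the 3.0–3.3 line) in which `FastText` still accepts the `size` keyword and `model.wv` still exposes the `vocab`, `index2word`, `ngrams_word`, and `ngrams` attributes read above; later releases renamed or removed these, and `FastText.train()` normally dispatches to the Cython `fasttext_inner` routine anyway.

from gensim.models import FastText

# Tiny toy corpus: each sentence is already a list of string tokens.
sentences = [
    ["human", "interface", "computer"],
    ["graph", "minors", "survey", "graph"],
]

# Building the model also builds the vocab and char n-gram tables that
# train_batch_sg reads from model.wv.
model = FastText(sentences, size=10, window=2, min_count=1, sg=1)

# One extra hand-rolled pass over the batch with a fixed learning rate.
effective_words = train_batch_sg(model, sentences, alpha=0.025)
print(effective_words)
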
Example #2
def train_document_dbow(model,
                        doc_words,
                        doctag_indexes,
                        alpha,
                        work=None,
                        train_words=False,
                        learn_doctags=True,
                        learn_words=True,
                        learn_hidden=True,
                        word_vectors=None,
                        word_locks=None,
                        doctag_vectors=None,
                        doctag_locks=None):
    """
    Update distributed bag of words model ("PV-DBOW") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    If `train_words` is True, simultaneously train word-to-word (not just doc-to-word)
    examples, exactly as per Word2Vec skip-gram training. (Without this option,
    word vectors are neither consulted nor updated during DBOW doc vector training.)

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    if train_words and learn_words:
        train_batch_sg(model, [doc_words], alpha, work)
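    # PV-DBOW proper: train each doctag vector to predict every word in the document,
    # i.e. a skip-gram pair with the doctag index standing in for the input word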
    for doctag_index in doctag_indexes:
        for word in doc_words:
            train_sg_pair(model,
                          word,
                          doctag_index,
                          alpha,
                          learn_vectors=learn_doctags,
                          learn_hidden=learn_hidden,
                          context_vectors=doctag_vectors,
                          context_locks=doctag_locks)

    return len(doc_words)
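
And a similarly hedged sketch for the PV-DBOW routine, again assuming a pre-4.0 gensim in which `model.docvecs.doctag_syn0` / `doctag_syn0_lockf` still exist and plain integer tags can serve directly as `doctag_indexes`; in normal use `Doc2Vec.train()` and `infer_vector()` call the optimized `doc2vec_inner` version instead.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [
    TaggedDocument(words=["human", "interface", "computer"], tags=[0]),
    TaggedDocument(words=["graph", "minors", "survey"], tags=[1]),
]

# dm=0 selects the PV-DBOW architecture that train_document_dbow implements.
model = Doc2Vec(documents, size=10, window=2, min_count=1, dm=0)

# Hand-train the first document's tag vector against its words for one pass.
doc = documents[0]
processed = train_document_dbow(model, doc.words, doctag_indexes=doc.tags, alpha=0.025)
print(processed)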