def train_batch_dm_xy_generator(model, docs):
    for doc in docs:
        indexed_doctags = model.docvecs.indexed_doctags(doc.tags)
        doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags

        word_vocabs = [
            model.wv.vocab[w] for w in doc.words if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(
                model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(
                word_vocabs[start:(pos + model.window + 1 - reduced_window)],
                start)
            word2_indexes = [
                word2.index for pos2, word2 in window_pos if pos2 != pos
            ]

            xy_gen = train_cbow_pair(model, word, word2_indexes)
            x2 = doctag_indexes
            for xy in xy_gen:
                if xy is not None:
                    yield [xy[0], x2, xy[1], xy[2]]
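
# A minimal consumption sketch, not part of the original API: the
# generators in this file yield variable-length Python lists, and a
# downstream trainer typically groups them into fixed-size batches.
# `batch_xy` is a hypothetical helper name introduced for illustration.
def batch_xy(xy_generator, batch_size=128):
    """Group yielded samples into lists of at most batch_size items."""
    batch = []
    for sample in xy_generator:
        batch.append(sample)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # flush the final partial batch
        yield batch
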
def train_document_dm_concat_xy_generator(model, docs):
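    """Yield DM-concat doc2vec training samples over a fixed-size window:
    the document is padded with the null word so every position has
    exactly model.window context words on each side."""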
    for doc in docs:
        indexed_doctags = model.docvecs.indexed_doctags(doc.tags)
        doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags

        word_vocabs = [
            model.wv.vocab[w] for w in doc.words if w in model.wv.vocab
            and model.wv.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        null_word = model.wv.vocab['\0']
        pre_pad_count = model.window
        post_pad_count = model.window
        padded_document_indexes = (
            (pre_pad_count * [null_word.index])  # pre-padding
            + [word.index for word in word_vocabs if word is not None]  # elide out-of-vocabulary words
            + (post_pad_count * [null_word.index])  # post-padding
        )

        for pos in range(pre_pad_count,
                         len(padded_document_indexes) - post_pad_count):
            word_context_indexes = (
                padded_document_indexes[(pos - pre_pad_count):pos]  # preceding words
                + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)]  # following words
            )
            predict_word = model.wv.vocab[
                model.wv.index2word[padded_document_indexes[pos]]]
            xy_gen = train_cbow_pair(model, predict_word, word_context_indexes)
            x2 = doctag_indexes
            for xy in xy_gen:
                if xy is not None:
                    yield [xy[0], x2, xy[1], xy[2]]
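
# Self-contained illustration of the null-word padding above, using
# hypothetical toy indexes and no gensim dependency: with window=2 and
# null index 0, every position gets exactly 2 * window context indexes.
def concat_context_indexes(doc_indexes, window, null_index=0):
    padded = window * [null_index] + list(doc_indexes) + window * [null_index]
    for pos in range(window, len(padded) - window):
        yield padded[pos], padded[pos - window:pos] + padded[pos + 1:pos + 1 + window]

# list(concat_context_indexes([5, 6, 7], 2))
# -> [(5, [0, 0, 6, 7]), (6, [0, 5, 7, 0]), (7, [5, 6, 0, 0])]
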
def train_batch_score_cbow_xy_generator(model, scored_word_sentences):
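    """Yield score-supervised CBOW samples: each item produced by
    train_cbow_pair is extended with the score attached to the
    predicted word, giving [xy[0], xy[1], xy[2], [score]]."""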
    for scored_word_sentence in scored_word_sentences:
        scored_word_vocabs = [
            [model.vocab[w], s] for [w, s] in scored_word_sentence
            if w in model.vocab
            and model.vocab[w].sample_int > model.random.rand() * 2**32
        ]
        for pos, scored_word in enumerate(scored_word_vocabs):
            reduced_window = model.random.randint(
                model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(
                scored_word_vocabs[start:(pos + model.window + 1 -
                                          reduced_window)], start)
            word2_indices = [
                scored_word2[0].index for pos2, scored_word2 in window_pos
                if (scored_word2 is not None and scored_word2[0] is not None
                    and pos2 != pos)
            ]
            xy_gen = train_cbow_pair(model, scored_word[0], word2_indices,
                                     None, None)
            for xy in xy_gen:
                if xy is not None:
                    yield [xy[0], xy[1], xy[2], [scored_word[1]]]
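
# Toy sketch of the reduced-window sampling used above, assuming
# model.random behaves like a numpy RandomState (as it does in gensim):
# each position uses an effective window shrunk by b on both sides,
# with b drawn uniformly from [0, window).
import numpy as np

def reduced_window_contexts(indexes, window, rng=None):
    rng = rng or np.random.RandomState(1)
    for pos in range(len(indexes)):
        b = rng.randint(window)  # `b` in the original word2vec C code
        start = max(0, pos - window + b)
        yield [idx for pos2, idx in
               enumerate(indexes[start:pos + window + 1 - b], start)
               if pos2 != pos]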