def concat_embeddings():
    # Concatenate two previously saved document-embedding sets, normalize the
    # result and save it under a combined name.
    emb1_name = 'dbow_bert_ver2_500_200'
    emb2_name = 'bert_doc_embeddings_100'
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)
    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
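# A minimal sketch of the utils helpers assumed above (hypothetical names with
# a _sketch suffix, not the project's actual implementation): concatenation
# joins the embedding matrices column-wise and normalization L2-normalizes
# each document row so cosine similarity reduces to a dot product.
import numpy as np

def concat_embeddings_sketch(embedding_list):
    # Column-wise concatenation: each row remains one document vector.
    return np.concatenate(
        [np.asarray(e, dtype=np.float32) for e in embedding_list], axis=1)

def normalize_embeddings_sketch(embeddings):
    emb = np.asarray(embeddings, dtype=np.float32)
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    return emb / np.maximum(norms, 1e-12)  # guard against zero-norm rows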
def train(gpu_no, show_loss, train_data, label_data, window_size, word_embedding_size, doc_embedding_size,
          batch_size, negative_sample_size, is_concat, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('window_size', window_size)
    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading preprocessed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_word_embeddings = utils.load('bert_word_embeddings_100')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    if is_concat:
        final_embedding_size = word_embedding_size * window_size + doc_embedding_size
    else:
        final_embedding_size = doc_embedding_size

    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        # Rows 0 and 1 stay trainable (assumed to be special tokens such as
        # padding/unknown); the remaining rows are fixed BERT word embeddings.
        special_word_embeddings = tf.Variable(tf.random_uniform([2, word_embedding_size], -1.0, 1.0))
        word_embeddings = tf.concat([special_word_embeddings, tf.constant(bert_word_embeddings[2:])], axis=0)

        # word_embeddings = tf.constant(bert_word_embeddings)
        doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, final_embedding_size],
                                                  stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])  # plus 1 for doc index
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Look up the embeddings for the window words and the document, then either
    # concatenate them (is_concat) or average them into a single vector.
    if is_concat:
        word_embed = [tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element]) for element in range(window_size)]
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)
        final_embed = tf.concat([*word_embed, tf.squeeze(doc_embed, axis=1)], 1)
    else:
        word_embed = tf.zeros([batch_size, word_embedding_size])
        for element in range(window_size):
            word_embed += tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element])
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices), axis=1)
        final_embed = (word_embed + doc_embed) / (window_size + 1)

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                                             negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset_imdb.generate_batch_data(train_data, label_data,
                                                                            batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        # Norm
        doc_embeddings = utils.normalize_embeddings(doc_embeddings)
        utils.save_doc_embeddings(doc_embeddings, proj_name, is_concat=is_concat, window_size=window_size,
                                  batch_size=batch_size, negative_size=negative_sample_size)
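# A hedged sketch of the batch generator assumed above (the real
# dataset_imdb.generate_batch_data may differ): train_data is assumed to hold
# rows of [window word ids..., doc id] and label_data the target word id per
# row. Indices wrap around so the final batch still has exactly batch_size
# rows, matching the fixed-size tf.slice in the graph.
def generate_batch_data_sketch(train_data, label_data, batch_size, generation):
    n = len(train_data)
    idx = [(generation * batch_size + i) % n for i in range(batch_size)]
    batch_train = np.asarray([train_data[i] for i in idx])  # [batch, window_size + 1]
    batch_label = np.asarray([label_data[i] for i in idx]).reshape(-1, 1)  # [batch, 1]
    return batch_train, batch_label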
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading preprocessed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_title_embeddings = utils.load('bert_title_embeddings')
    bert_detail_sentence_embeddings = utils.load(
        'bert_detail_sentence_embeddings')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size

    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        detail_sentence_embeddings = tf.constant(
            bert_detail_sentence_embeddings)
        title_embeddings = tf.constant(bert_title_embeddings)
        doc_embeddings = tf.Variable(
            tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        title_weights = tf.Variable(tf.random_uniform([docs_size, 1], 0.0,
                                                      1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 2])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Look up the document, detail-sentence and title embeddings; the title
    # embedding is scaled by a learned per-document weight and the three
    # vectors are averaged.

    doc_indices = tf.slice(x_inputs, [0, 0], [batch_size, 1])
    sentence_indices = tf.slice(x_inputs, [0, 1], [batch_size, 1])

    doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices),
                           axis=1)
    sentence_embed = tf.squeeze(tf.nn.embedding_lookup(
        detail_sentence_embeddings, sentence_indices),
                                axis=1)
    title_embed = tf.squeeze(tf.nn.embedding_lookup(title_embeddings,
                                                    doc_indices),
                             axis=1)

    title_weight = tf.squeeze(tf.nn.embedding_lookup(title_weights,
                                                     doc_indices),
                              axis=1)
    title_embed_weighted = tf.math.multiply(title_embed, title_weight)

    final_embed = (title_embed_weighted + sentence_embed + doc_embed) / 3

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        title_weights = sess.run(title_weights)
        title_embeddings_weighted = np.multiply(bert_title_embeddings,
                                                title_weights)
        detail_emb = utils.mean_embeddings(
            [title_embeddings_weighted, doc_embeddings])
        detail_emb_norm = utils.normalize_embeddings(detail_emb)
        utils.save_doc_embeddings(detail_emb_norm,
                                  'memory_dbow_detail',
                                  batch_size=batch_size,
                                  negative_size=negative_sample_size)

        emb = utils.concat_embeddings([bert_title_embeddings, detail_emb_norm])
        utils.save_doc_embeddings(emb,
                                  'memory_dbow',
                                  batch_size=batch_size,
                                  negative_size=negative_sample_size)
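# A minimal sketch of the mean helper assumed above (hypothetical, not the
# project's utils.mean_embeddings): it averages the given matrices element-wise.
def mean_embeddings_sketch(embedding_list):
    return np.mean([np.asarray(e, dtype=np.float32) for e in embedding_list],
                   axis=0)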
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size, epochs_step_1,
          epochs_step_2):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs_step_1:', epochs_step_1)
    print('epochs_step_2:', epochs_step_2)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading preprocessed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    # bert_title_embeddings = utils.load('bert_title_embeddings')
    # bert_detail_embeddings = utils.load('bert_detail_embeddings_100')
    bert_embeddings = utils.load_doc_embeddings('bert_doc_embeddings')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size

    print('docs_size:', docs_size)
    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        # doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        # doc_embeddings = tf.Variable(bert_detail_embeddings)
        doc_embeddings = tf.Variable(bert_embeddings)

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 1])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the embedding
    final_embed = tf.nn.embedding_lookup(doc_embeddings, x_inputs[:, 0])

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss, var_list=[nce_weights, nce_biases])
    optimizer_2 = tf.train.GradientDescentOptimizer(learning_rate=0.005)
    train_step_2 = optimizer_2.minimize(loss, var_list=[doc_embeddings])

    # Add variable initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs_step_1):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        for epoch in range(epochs_step_2):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step_2, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        # Norm
        doc_embeddings = utils.normalize_embeddings(doc_embeddings)
        utils.save_doc_embeddings(doc_embeddings,
                                  proj_name,
                                  batch_size=batch_size,
                                  negative_size=negative_sample_size)
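# A hedged sketch of the save/load helpers used throughout (hypothetical names
# and storage path, not the project's utils implementation): embeddings are
# assumed to be stored as .npy files, with any keyword arguments folded into
# the file name.
def save_doc_embeddings_sketch(embeddings, name, **params):
    suffix = ''.join('_{}_{}'.format(k, v) for k, v in sorted(params.items()))
    np.save('doc_embeddings/' + name + suffix + '.npy', np.asarray(embeddings))

def load_doc_embeddings_sketch(name):
    return np.load('doc_embeddings/' + name + '.npy')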
def train(doc_embedding_size, negative_sample_size, epochs):

    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)

    GoogleJobSkillDocument = namedtuple('GoogleJobSkillDocument', 'words tags')

    for i in range(corpus_size):
        words = all_docs[i].title_words
        tags = [i]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    for i in range(corpus_size):
        words = all_docs[i].detail_words
        tags = [i + corpus_size]
        alldocs.append(GoogleJobSkillDocument(words, tags))

    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    model = Doc2Vec(dm=0,
                    vector_size=doc_embedding_size,
                    negative=negative_sample_size,
                    hs=0,
                    min_count=2,
                    sample=0,
                    epochs=epochs,
                    workers=cores)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    title_emb, detail_emb = utils.split_embeddings(model.docvecs, 2)
    doc_emb = utils.concat_embeddings([title_emb, detail_emb])

    title_emb = utils.normalize_embeddings(title_emb)
    detail_emb = utils.normalize_embeddings(detail_emb)
    doc_emb = utils.normalize_embeddings(doc_emb)

    utils.save_doc_embeddings(title_emb,
                              'gensim_dbow_title',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(detail_emb,
                              'gensim_dbow_detail',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(doc_emb,
                              'gensim_dbow',
                              negative_size=negative_sample_size)
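# A hedged sketch of the split helper assumed above (hypothetical, not the real
# utils.split_embeddings): title documents were tagged 0..corpus_size-1 and
# detail documents corpus_size..2*corpus_size-1, so splitting the doctag
# vectors into two equal halves recovers the title and detail embeddings.
def split_embeddings_sketch(docvecs, parts):
    vectors = np.asarray([docvecs[i] for i in range(len(docvecs))])
    size = len(vectors) // parts
    return [vectors[p * size:(p + 1) * size] for p in range(parts)]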
def train(doc_embedding_size, window_size, negative_sample_size, is_concat,
          epochs):

    print('window_size:', window_size)
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)

    ImdbDocument = namedtuple('ImdbDocument', 'words tags')

    for i in range(corpus_size):
        words = all_docs[i].words
        tags = [i]
        alldocs.append(ImdbDocument(words, tags))

    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    # dm_concat toggles between concatenating and averaging the context vectors.
    model = Doc2Vec(dm=1,
                    vector_size=doc_embedding_size,
                    negative=negative_sample_size,
                    window=window_size,
                    hs=0,
                    min_count=2,
                    sample=0,
                    epochs=epochs,
                    workers=cores,
                    alpha=0.05,
                    dm_concat=1 if is_concat else 0)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    emb = []
    for i in range(corpus_size):
        emb.append(model.docvecs[i])

    emb = utils.normalize_embeddings(emb)

    utils.save_doc_embeddings(emb,
                              'gensim_dm',
                              is_concat=is_concat,
                              window_size=window_size,
                              negative_size=negative_sample_size)

    # Sample words
    sample_words = ['engineer']
    for word in sample_words:
        similars = model.wv.most_similar(word, topn=10)
        print(similars)
def concat_embeddings(emb1_name, emb2_name):
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)
    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
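# Example usage (illustrative only; the exact saved names may carry suffixes
# added by utils.save_doc_embeddings from its keyword arguments):
# concat_embeddings('gensim_dbow_title', 'gensim_dbow_detail')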