Example #1
def build_model(params, with_dis):
    """
    Build all components of the model.
    """
    # source embeddings
    src_dico, _src_emb = load_embeddings(params, source=True)
    params.src_dico = src_dico
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)

    # target embeddings
    if params.tgt_lang:
        tgt_dico, _tgt_emb = load_embeddings(params, source=False)
        params.tgt_dico = tgt_dico
        tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True)
        tgt_emb.weight.data.copy_(_tgt_emb)
    else:
        tgt_emb = None

    # mapping
    mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
    if getattr(params, 'map_id_init', True):
        mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))

    # normalize embeddings
    params.src_mean = normalize_embeddings(src_emb.weight.data,
                                           params.normalize_embeddings)
    if params.tgt_lang:
        params.tgt_mean = normalize_embeddings(tgt_emb.weight.data,
                                               params.normalize_embeddings)

    return src_emb, tgt_emb, mapping
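A minimal, self-contained sketch (PyTorch only, with a hypothetical emb_dim) of the map_id_init behaviour above: with the weight copied from the identity matrix, the mapping leaves source vectors unchanged until training updates it.

import torch
import torch.nn as nn

emb_dim = 300  # hypothetical; the snippet reads this from params.emb_dim
mapping = nn.Linear(emb_dim, emb_dim, bias=False)
mapping.weight.data.copy_(torch.diag(torch.ones(emb_dim)))  # identity init

x = torch.randn(5, emb_dim)
assert torch.allclose(mapping(x), x)  # the mapping is a no-op before training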
def create_word_emb(bert_service_ip=None, reduced_size=100, normalize=False):
    # Load dict
    word_dictionary = utils.load('word_dictionary')

    # Show length info
    words = show_words_length_info(word_dictionary)

    # Bert
    print('Bert converting...')
    if bert_service_ip is None:
        bc = BertClient()
    else:
        bc = BertClient(ip=bert_service_ip)
    vecs = bc.encode(words)
    print('vecs type:', type(vecs))
    print('vecs shape:', vecs.shape)

    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    # Save
    utils.save(vecs, 'bert_word_embeddings')

    # PCA
    vecs = utils.reduce_dim(vecs, reduced_size)

    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    utils.save(vecs, 'bert_word_embeddings_' + str(reduced_size))
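The utils module used throughout these examples is not shown. A plausible, minimal stand-in for utils.normalize_embeddings (an assumption, not the project's actual code) would L2-normalize each row so that cosine similarity reduces to a plain dot product:

import numpy as np

def normalize_embeddings(vecs, eps=1e-12):
    """Hypothetical helper: scale each row of `vecs` to unit L2 norm."""
    vecs = np.asarray(vecs, dtype=np.float32)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    return vecs / np.maximum(norms, eps)

v = normalize_embeddings([[3.0, 4.0], [0.0, 2.0]])
print(np.linalg.norm(v, axis=1))  # -> [1. 1.]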
def create_detail_emb(bert_service_ip=None, reduced_size=100, normalize=False):
    # Load docs
    all_docs = utils.load('all_docs')
    print('docs num:', len(all_docs))

    # Show length info
    details = show_details_length_info(all_docs)

    # Bert
    print('Bert converting...')
    if bert_service_ip is None:
        bc = BertClient()
    else:
        bc = BertClient(ip=bert_service_ip)

    print('Converting detail')
    vecs = bc.encode(details)
    print('detail_vecs shape:', vecs.shape)

    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    utils.save(vecs, 'bert_detail_embeddings')

    # PCA
    vecs = utils.reduce_dim(vecs, reduced_size)

    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    utils.save(vecs, 'bert_detail_embeddings_' + str(reduced_size))
Example #4
    def export(self):
        """
        Export embeddings.
        """
        params = self.params

        # load all embeddings
        params.src_dico, src_emb = load_embeddings(params,
                                                   source=True,
                                                   full_vocab=True)
        params.tgt_dico, tgt_emb = load_embeddings(params,
                                                   source=False,
                                                   full_vocab=True)

        # apply same normalization as during training
        normalize_embeddings(src_emb,
                             params.normalize_embeddings,
                             mean=params.src_mean)
        normalize_embeddings(tgt_emb,
                             params.normalize_embeddings,
                             mean=params.tgt_mean)

        # map source embeddings to the target space
        bs = 4096
        for i, k in enumerate(range(0, len(src_emb), bs)):
            x = Variable(src_emb[k:k + bs], volatile=True)
            src_emb[k:k + bs] = self.mapping(x).data

        # write embeddings to the disk
        export_embeddings(src_emb, tgt_emb, params)
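The batched mapping above uses the pre-0.4 PyTorch idiom Variable(..., volatile=True). A minimal sketch of the same loop in current PyTorch, using torch.no_grad() and stand-in tensors (the real mapping and embeddings come from the trainer):

import torch
import torch.nn as nn

emb_dim, n_words, bs = 300, 10000, 4096
mapping = nn.Linear(emb_dim, emb_dim, bias=False)  # stand-in for the trained mapping
src_emb = torch.randn(n_words, emb_dim)            # stand-in for the full source embeddings

with torch.no_grad():
    for k in range(0, len(src_emb), bs):
        src_emb[k:k + bs] = mapping(src_emb[k:k + bs])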
Example #5
def load_embeddings(embeddings_path, vocabulary_path=None,
                    generate=True, load_extra_from=None,
                    normalize=True):
    """
    Load and return an embedding model in either text format or
    numpy binary format. The text format is used if vocabulary_path
    is None (because the vocabulary is in the same file as the
    embeddings).

    :param embeddings_path: path to embeddings file
    :param vocabulary_path: path to text file with vocabulary,
        if needed
    :param generate: whether to generate random embeddings for
        unknown, padding and null
    :param load_extra_from: path to directory with embeddings
        file with vectors for unknown, padding and null
    :param normalize: whether to normalize embeddings
    :return: a tuple (defaultdict, array)
    """
    assert not (generate and load_extra_from), \
        'Either load or generate extra vectors'

    logging.debug('Loading embeddings')
    if vocabulary_path is None:
        wordlist, embeddings = load_text_embeddings(embeddings_path)
    else:
        wordlist, embeddings = load_binary_embeddings(embeddings_path,
                                                      vocabulary_path)

    if generate or load_extra_from:
        mapping = zip(wordlist, range(3, len(wordlist) + 3))

        # always map OOV words to 0
        wd = defaultdict(int, mapping)
        wd[utils.UNKNOWN] = 0
        wd[utils.PADDING] = 1
        wd[utils.GO] = 2

        if generate:
            vector_size = embeddings.shape[1]
            extra = [_generate_random_vector(vector_size),
                     _generate_random_vector(vector_size),
                     _generate_random_vector(vector_size)]

        else:
            path = os.path.join(load_extra_from, 'extra-embeddings.npy')
            extra = np.load(path)

        embeddings = np.append(extra, embeddings, 0)

    else:
        mapping = zip(wordlist, range(0, len(wordlist)))
        wd = defaultdict(int, mapping)

    logging.debug('Embeddings have shape {}'.format(embeddings.shape))
    if normalize:
        embeddings = utils.normalize_embeddings(embeddings)

    return wd, embeddings
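A self-contained illustration of the index scheme built above: known words start at index 3 (after the three extra vectors) and any out-of-vocabulary word falls back to 0 via defaultdict(int). The literal special tokens below are placeholders for utils.UNKNOWN, utils.PADDING and utils.GO.

from collections import defaultdict

wordlist = ['the', 'cat', 'sat']
wd = defaultdict(int, zip(wordlist, range(3, len(wordlist) + 3)))
wd['<unk>'] = 0  # utils.UNKNOWN
wd['<pad>'] = 1  # utils.PADDING
wd['<go>'] = 2   # utils.GO

print(wd['cat'])       # 4
print(wd['aardvark'])  # 0 -> mapped to the unknown vector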
def concat_embeddings():
    emb1_name = 'dbow_bert_ver2_500_200'
    emb2_name = 'bert_doc_embeddings_100'
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)
    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
def create_emb(seq_list, save_file_name, bert_service_ip=None, reduced_size=100, normalize=False):
    # Bert
    print('Bert converting...')
    if bert_service_ip is None:
        bc = BertClient()
    else:
        bc = BertClient(ip=bert_service_ip)
    vecs = bc.encode(seq_list)
    print('vecs type:', type(vecs))
    print('vecs shape:', vecs.shape)

    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    # Save
    utils.save(vecs, save_file_name)

    # PCA
    vecs = utils.reduce_dim(vecs, reduced_size)

    if normalize:
        vecs = utils.normalize_embeddings(vecs)

    utils.save(vecs, save_file_name + '_' + str(reduced_size))
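utils.reduce_dim is not shown either; a plausible stand-in (an assumption) using scikit-learn PCA, matching how these snippets shrink 768-dimensional BERT vectors to reduced_size:

import numpy as np
from sklearn.decomposition import PCA

def reduce_dim(vecs, reduced_size):
    """Hypothetical helper: project rows onto the top `reduced_size` principal components."""
    return PCA(n_components=reduced_size).fit_transform(np.asarray(vecs))

vecs = np.random.rand(1000, 768).astype(np.float32)  # fake BERT vectors
print(reduce_dim(vecs, 100).shape)  # (1000, 100)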
Example #8
def main():
    if len(sys.argv) > 1:
        emb_path = sys.argv[1]
        if not os.path.exists(emb_path):
            print('Error: embeddings file not found')
            return
    else:
        print('Error: specify the path to the embeddings file')
        return
    embeddings, words2ids = read_embeddings(emb_path)
    embeddings = normalize_embeddings(embeddings)
    print('SIMILARITY test:')
    human_vs_cos_sim_correlation('datasets/tt_similarity.csv', embeddings,
                                 words2ids)
    print('RELATEDNESS test:')
    human_vs_cos_sim_correlation('datasets/tt_relatedness.csv', embeddings,
                                 words2ids)
    print('ANALOGIES test:')
    top_k = 10
    answer_analogy_questions('datasets/tt_analogies.txt', embeddings,
                             words2ids, top_k)
def train(gpu_no, show_loss, train_data, label_data, window_size, word_embedding_size, doc_embedding_size,
          batch_size, negative_sample_size, is_concat, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('window_size', window_size)
    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading preprocessed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_word_embeddings = utils.load('bert_word_embeddings_100')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    if is_concat:
        final_embedding_size = word_embedding_size * window_size + doc_embedding_size
    else:
        final_embedding_size = doc_embedding_size

    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        special_word_embeddings = tf.Variable(tf.random_uniform([2, word_embedding_size], -1.0, 1.0))
        word_embeddings = tf.concat([special_word_embeddings, tf.constant(bert_word_embeddings[2:])], axis=0)

        # word_embeddings = tf.constant(bert_word_embeddings)
        doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, final_embedding_size],
                                                  stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])  # plus 1 for doc index
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the word embedding
    # Add together element embeddings in window:
    # Concat all embeddings
    if is_concat:
        word_embed = [tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element]) for element in range(window_size)]
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)
        final_embed = tf.concat([*word_embed, tf.squeeze(doc_embed, axis=1)], 1)
    else:
        word_embed = tf.zeros([batch_size, word_embedding_size])
        for element in range(window_size):
            word_embed += tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element])
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices), axis=1)
        final_embed = (word_embed + doc_embed) / (window_size + 1)

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                                             negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset_imdb.generate_batch_data(train_data, label_data,
                                                                            batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        # Norm
        doc_embeddings = utils.normalize_embeddings(doc_embeddings)
        utils.save_doc_embeddings(doc_embeddings, proj_name, is_concat=is_concat, window_size=window_size,
                                  batch_size=batch_size, negative_size=negative_sample_size)
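A tiny NumPy sketch (toy sizes, not the project's code) of the two input-combination modes above: with is_concat the window words are concatenated with the document vector, otherwise everything is averaged into a single doc-sized vector, which requires word_embedding_size == doc_embedding_size.

import numpy as np

window_size, word_dim, doc_dim = 3, 4, 4
words = [np.random.rand(word_dim) for _ in range(window_size)]
doc = np.random.rand(doc_dim)

concat_embed = np.concatenate(words + [doc])         # word_dim * window_size + doc_dim
mean_embed = (sum(words) + doc) / (window_size + 1)  # doc_dim

print(concat_embed.shape, mean_embed.shape)  # (16,) (4,)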
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading preprocessed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_title_embeddings = utils.load('bert_title_embeddings')
    bert_detail_sentence_embeddings = utils.load(
        'bert_detail_sentence_embeddings')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size

    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        detail_sentence_embeddings = tf.constant(
            bert_detail_sentence_embeddings)
        title_embeddings = tf.constant(bert_title_embeddings)
        doc_embeddings = tf.Variable(
            tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        title_weights = tf.Variable(tf.random_uniform([docs_size, 1], 0.0,
                                                      1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 2])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the word embedding
    # Add together element embeddings in window:
    # Concat all embeddings

    doc_indices = tf.slice(x_inputs, [0, 0], [batch_size, 1])
    sentence_indices = tf.slice(x_inputs, [0, 1], [batch_size, 1])

    doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices),
                           axis=1)
    sentence_embed = tf.squeeze(tf.nn.embedding_lookup(
        detail_sentence_embeddings, sentence_indices),
                                axis=1)
    title_embed = tf.squeeze(tf.nn.embedding_lookup(title_embeddings,
                                                    doc_indices),
                             axis=1)

    title_weight = tf.squeeze(tf.nn.embedding_lookup(title_weights,
                                                     doc_indices),
                              axis=1)
    title_embed_weighted = tf.math.multiply(title_embed, title_weight)

    final_embed = (title_embed_weighted + sentence_embed + doc_embed) / 3

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        title_weights = sess.run(title_weights)
        title_embeddings_weighted = np.multiply(bert_title_embeddings,
                                                title_weights)
        detail_emb = utils.mean_embeddings(
            [title_embeddings_weighted, doc_embeddings])
        detail_emb_norm = utils.normalize_embeddings(detail_emb)
        utils.save_doc_embeddings(detail_emb_norm,
                                  'memory_dbow_detail',
                                  batch_size=batch_size,
                                  negative_size=negative_sample_size)

        emb = utils.concat_embeddings([bert_title_embeddings, detail_emb_norm])
        utils.save_doc_embeddings(emb,
                                  'memory_dbow',
                                  batch_size=batch_size,
                                  negative_size=negative_sample_size)
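A small NumPy sketch (toy shapes only) of the combination computed above: each document's BERT title vector is scaled by a learned per-document weight, then averaged with its detail-sentence vector and its trainable doc vector.

import numpy as np

n_docs, doc_dim = 3, 4
title = np.random.rand(n_docs, doc_dim)
sentence = np.random.rand(n_docs, doc_dim)
doc = np.random.rand(n_docs, doc_dim)
title_weight = np.random.rand(n_docs, 1)  # one learned scalar per document

final = (title * title_weight + sentence + doc) / 3
print(final.shape)  # (3, 4)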
    parser.add_argument('load', help='Directory with saved model files')
    parser.add_argument('embeddings', help='Text or numpy file with word embeddings')
    parser.add_argument('--vocab', help='Vocabulary file (only needed if numpy'
                                        'embedding file is given)')
    args = parser.parse_args()

    utils.config_logger(verbose=False)
    logger = utils.get_logger()

    logger.info('Reading model')
    sess = tf.InteractiveSession()
    model = multimlp.MultiFeedForward.load(args.load, sess)
    word_dict, embeddings = readdata.load_embeddings(args.embeddings, args.vocab,
                                                     generate=False,
                                                     load_extra_from=args.load)
    embeddings = utils.normalize_embeddings(embeddings)
    model.initialize_embeddings(sess, embeddings)
    number_to_label = {v: k for (k, v) in utils.label_map.items()}

    while True:
        sent1 = raw_input('Type sentence 1: ')
        sent2 = raw_input('Type sentence 2: ')
        tokens1 = utils.tokenize(sent1)
        tokens2 = utils.tokenize(sent2)
        vector1 = convert_tokens(tokens1, word_dict, model.max_time_steps1)
        vector2 = convert_tokens(tokens2, word_dict, model.max_time_steps2,
                                 prepend=word_dict[utils.GO])

        feeds = {model.sentence1: vector1,
                 model.sentence2: vector2,
                 model.sentence1_size: [len(tokens1)],
Example #12
def build_model(params, with_dis):
    """
    Build all components of the model.
    """

    src_dico, src_adj, src_features = load_src_data(params.src_file,
                                                    params.src_nns, params)
    params.src_dico = src_dico
    src_emb = nn.Embedding(len(src_features), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(src_features)

    # target embeddings
    if params.tgt_lang:
        tgt_dico, tgt_adj, tgt_features = load_tgt_data(
            params.tgt_file, params.tgt_nns, params)
        params.tgt_dico = tgt_dico
        tgt_emb = nn.Embedding(len(tgt_features), params.emb_dim, sparse=True)
        tgt_emb.weight.data.copy_(tgt_features)
    else:
        tgt_emb = None

    # mapping
    if params.sparse:
        src_mapping = SpGAT(nfeat=params.emb_dim,
                            nhid=params.emb_dim,
                            nclass=params.enc_dim,
                            dropout=params.dropout,
                            nheads=params.nb_heads,
                            alpha=params.alpha)
        tgt_mapping = SpGAT(nfeat=params.emb_dim,
                            nhid=params.emb_dim,
                            nclass=params.enc_dim,
                            dropout=params.dropout,
                            nheads=params.nb_heads,
                            alpha=params.alpha)
    else:
        src_mapping = GAT(nfeat=params.emb_dim,
                          nhid=params.emb_dim,
                          nclass=params.enc_dim,
                          dropout=params.dropout,
                          nheads=params.nb_heads,
                          alpha=params.alpha)
        tgt_mapping = GAT(nfeat=params.emb_dim,
                          nhid=params.emb_dim,
                          nclass=params.enc_dim,
                          dropout=params.dropout,
                          nheads=params.nb_heads,
                          alpha=params.alpha)

    src_decoder = Decoder(params)
    tgt_decoder = Decoder(params)
    # discriminator
    discriminator = Discriminator(params) if with_dis else None

    # cuda
    if params.cuda:
        src_emb.cuda()
        src_adj.cuda()
        src_mapping.cuda()
        src_decoder.cuda()
        if params.tgt_lang:
            tgt_emb.cuda()
            tgt_adj.cuda()
            tgt_mapping.cuda()
            tgt_decoder.cuda()
        if with_dis:
            discriminator.cuda()

    # normalize embeddings
    params.src_mean = normalize_embeddings(src_emb.weight.data,
                                           params.normalize_embeddings)
    if params.tgt_lang:
        params.tgt_mean = normalize_embeddings(tgt_emb.weight.data,
                                               params.normalize_embeddings)

    return src_emb, tgt_emb, src_adj, tgt_adj, src_mapping, tgt_mapping, src_decoder, tgt_decoder, discriminator
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size, epochs_step_1,
          epochs_step_2):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs_step_1:', epochs_step_1)
    print('epochs_step_2:', epochs_step_2)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading preprocessed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    # bert_title_embeddings = utils.load('bert_title_embeddings')
    # bert_detail_embeddings = utils.load('bert_detail_embeddings_100')
    bert_embeddings = utils.load_doc_embeddings('bert_doc_embeddings')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size

    print('docs_size:', docs_size)
    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        # doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        # doc_embeddings = tf.Variable(bert_detail_embeddings)
        doc_embeddings = tf.Variable(bert_embeddings)

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 1])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the embedding
    final_embed = tf.nn.embedding_lookup(doc_embeddings, x_inputs[:, 0])

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss, var_list=[nce_weights, nce_biases])
    optimizer_2 = tf.train.GradientDescentOptimizer(learning_rate=0.005)
    train_step_2 = optimizer_2.minimize(loss, var_list=[doc_embeddings])

    # Add variable initializer.
    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs_step_1):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        for epoch in range(epochs_step_2):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step_2, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

        # Norm
        doc_embeddings = utils.normalize_embeddings(doc_embeddings)
        utils.save_doc_embeddings(doc_embeddings,
                                  proj_name,
                                  batch_size=batch_size,
                                  negative_size=negative_sample_size)
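A minimal TF 1.x-style sketch (illustrative names and shapes, not the project's code) of the two-stage optimization used above: var_list restricts each optimizer to a subset of variables, so stage 1 fits only the NCE softmax parameters while the BERT-initialized doc embeddings stay fixed, and stage 2 then fine-tunes only the doc embeddings.

import tensorflow as tf

w = tf.Variable(tf.zeros([10, 4]))            # softmax weights (stand-in)
e = tf.Variable(tf.random_uniform([100, 4]))  # doc embeddings (stand-in)
logits = tf.matmul(tf.nn.embedding_lookup(e, [0, 1]), w, transpose_b=True)
loss = tf.reduce_mean(tf.square(logits))

step_1 = tf.train.AdamOptimizer().minimize(loss, var_list=[w])                   # updates w only
step_2 = tf.train.GradientDescentOptimizer(0.005).minimize(loss, var_list=[e])   # updates e only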
def train(doc_embedding_size, negative_sample_size, epochs):

    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)

    GoogleJobSkillDocument = namedtuple('GoogleJobSkillDocument', 'words tags')

    for i in range(corpus_size):
        words = all_docs[i].title_words
        tags = [i]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    for i in range(corpus_size):
        words = all_docs[i].detail_words
        tags = [i + corpus_size]
        alldocs.append(GoogleJobSkillDocument(words, tags))

    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    model = Doc2Vec(dm=0,
                    vector_size=doc_embedding_size,
                    negative=negative_sample_size,
                    hs=0,
                    min_count=2,
                    sample=0,
                    epochs=epochs,
                    workers=cores)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    title_emb, detail_emb = utils.split_embeddings(model.docvecs, 2)
    doc_emb = utils.concat_embeddings([title_emb, detail_emb])

    title_emb = utils.normalize_embeddings(title_emb)
    detail_emb = utils.normalize_embeddings(detail_emb)
    doc_emb = utils.normalize_embeddings(doc_emb)

    utils.save_doc_embeddings(title_emb,
                              'gensim_dbow_title',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(detail_emb,
                              'gensim_dbow_detail',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(doc_emb,
                              'gensim_dbow',
                              negative_size=negative_sample_size)
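A self-contained mini-example of the same gensim DBOW configuration (dm=0) on toy data, followed by inferring a vector for an unseen document; the documents and sizes here are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(words=['data', 'scientist', 'python'], tags=[0]),
        TaggedDocument(words=['software', 'engineer', 'java'], tags=[1])]

model = Doc2Vec(dm=0, vector_size=8, negative=2, min_count=1, epochs=5)
model.build_vocab(docs)
model.train(docs, total_examples=len(docs), epochs=model.epochs)

print(model.infer_vector(['python', 'engineer']).shape)  # (8,)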
def train(doc_embedding_size, window_size, negative_sample_size, is_concat,
          epochs):

    print('window_size:', window_size)
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)

    ImdbDocument = namedtuple('ImdbDocument', 'words tags')

    for i in range(corpus_size):
        words = all_docs[i].words
        tags = [i]
        alldocs.append(ImdbDocument(words, tags))

    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    if is_concat:
        model = Doc2Vec(dm=1,
                        vector_size=100,
                        negative=negative_sample_size,
                        window=window_size,
                        hs=0,
                        min_count=2,
                        sample=0,
                        epochs=epochs,
                        workers=cores,
                        alpha=0.05,
                        dm_concat=1)
    else:
        model = Doc2Vec(dm=1,
                        vector_size=100,
                        negative=negative_sample_size,
                        window=window_size,
                        hs=0,
                        min_count=2,
                        sample=0,
                        epochs=epochs,
                        workers=cores,
                        alpha=0.05)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    emb = []
    for i in range(corpus_size):
        emb.append(model.docvecs[i])

    emb = utils.normalize_embeddings(emb)

    utils.save_doc_embeddings(emb,
                              'gensim_dm',
                              is_concat=is_concat,
                              window_size=window_size,
                              negative_size=negative_sample_size)

    # Sample words
    sample_words = ['engineer']
    for word in sample_words:
        similars = model.wv.most_similar(word, topn=10)
        print(similars)
def concat_embeddings(emb1_name, emb2_name):
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)
    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
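utils.concat_embeddings and utils.mean_embeddings are also not shown; plausible NumPy stand-ins (assumptions) consistent with how they are used throughout these examples:

import numpy as np

def concat_embeddings(emb_list):
    """Hypothetical helper: column-wise concatenation, one longer vector per document."""
    return np.concatenate([np.asarray(e) for e in emb_list], axis=1)

def mean_embeddings(emb_list):
    """Hypothetical helper: element-wise average of same-shaped embedding matrices."""
    return np.mean([np.asarray(e) for e in emb_list], axis=0)

a = np.random.rand(5, 100)
b = np.random.rand(5, 100)
print(concat_embeddings([a, b]).shape)  # (5, 200)
print(mean_embeddings([a, b]).shape)    # (5, 100)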