def concat_embeddings():
    emb1_name = 'dbow_bert_ver2_500_200'
    emb2_name = 'bert_doc_embeddings_100'

    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)

    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)

    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
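# The utils helpers used above are not shown in this file. As a point of
# reference, here is a minimal sketch of what they are assumed to do
# (hypothetical names prefixed with an underscore, not the project's actual
# implementation): concatenation joins two embedding matrices along the
# feature axis, and normalization rescales each document vector to unit L2 norm.
import numpy as np

def _concat_embeddings_sketch(embedding_list):
    # [n_docs, d1], [n_docs, d2] -> [n_docs, d1 + d2]
    return np.concatenate(embedding_list, axis=1)

def _normalize_embeddings_sketch(embeddings):
    # Scale every row (document vector) to unit L2 norm.
    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / np.maximum(norms, 1e-12)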
def train(gpu_no, show_loss, train_data, label_data, window_size,
          word_embedding_size, doc_embedding_size, batch_size,
          negative_sample_size, is_concat, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('window_size', window_size)
    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('is_concat', is_concat)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading pre processed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_word_embeddings = utils.load('bert_word_embeddings_100')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    if is_concat:
        final_embedding_size = word_embedding_size * window_size + doc_embedding_size
    else:
        final_embedding_size = doc_embedding_size

    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        special_word_embeddings = tf.Variable(
            tf.random_uniform([2, word_embedding_size], -1.0, 1.0))
        word_embeddings = tf.concat(
            [special_word_embeddings, tf.constant(bert_word_embeddings[2:])], axis=0)
        # word_embeddings = tf.constant(bert_word_embeddings)
        doc_embeddings = tf.Variable(
            tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])  # plus 1 for doc index
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the word embeddings
    if is_concat:
        # Concat all embeddings
        word_embed = [tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element])
                      for element in range(window_size)]
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)
        final_embed = tf.concat([*word_embed, tf.squeeze(doc_embed, axis=1)], 1)
    else:
        # Add together element embeddings in window:
        word_embed = tf.zeros([batch_size, word_embedding_size])
        for element in range(window_size):
            word_embed += tf.nn.embedding_lookup(word_embeddings, x_inputs[:, element])
        doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
        doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices), axis=1)
        final_embed = (word_embed + doc_embed) / (window_size + 1)

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset_imdb.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

    # Norm
    doc_embeddings = utils.normalize_embeddings(doc_embeddings)
    utils.save_doc_embeddings(doc_embeddings, proj_name, is_concat=is_concat,
                              window_size=window_size, batch_size=batch_size,
                              negative_size=negative_sample_size)
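# A hypothetical call to the trainer above; the hyperparameter values are
# illustrative only, not the project's settings. train_data is assumed to be
# an int array of shape [n_examples, window_size + 1] whose last column holds
# the document index, and label_data the target word ids with shape
# [n_examples, 1], both produced by the dataset_imdb preprocessing step.
train(gpu_no=0,
      show_loss=True,
      train_data=train_data,
      label_data=label_data,
      window_size=3,
      word_embedding_size=100,   # assumed to match bert_word_embeddings_100
      doc_embedding_size=100,
      batch_size=500,
      negative_sample_size=64,
      is_concat=True,
      epochs=20)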
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size, epochs):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs:', epochs)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading pre processed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    bert_title_embeddings = utils.load('bert_title_embeddings')
    bert_detail_sentence_embeddings = utils.load('bert_detail_sentence_embeddings')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size

    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        detail_sentence_embeddings = tf.constant(bert_detail_sentence_embeddings)
        title_embeddings = tf.constant(bert_title_embeddings)
        doc_embeddings = tf.Variable(
            tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        title_weights = tf.Variable(tf.random_uniform([docs_size, 1], 0.0, 1.0))

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 2])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the doc, sentence and title embeddings and average them
    doc_indices = tf.slice(x_inputs, [0, 0], [batch_size, 1])
    sentence_indices = tf.slice(x_inputs, [0, 1], [batch_size, 1])
    doc_embed = tf.squeeze(tf.nn.embedding_lookup(doc_embeddings, doc_indices), axis=1)
    sentence_embed = tf.squeeze(
        tf.nn.embedding_lookup(detail_sentence_embeddings, sentence_indices), axis=1)
    title_embed = tf.squeeze(tf.nn.embedding_lookup(title_embeddings, doc_indices), axis=1)
    title_weight = tf.squeeze(tf.nn.embedding_lookup(title_weights, doc_indices), axis=1)
    title_embed_weighted = tf.math.multiply(title_embed, title_weight)
    final_embed = (title_embed_weighted + sentence_embed + doc_embed) / 3

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)
        for epoch in range(epochs):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)
        title_weights = sess.run(title_weights)

    title_embeddings_weighted = np.multiply(bert_title_embeddings, title_weights)
    detail_emb = utils.mean_embeddings([title_embeddings_weighted, doc_embeddings])
    detail_emb_norm = utils.normalize_embeddings(detail_emb)
    utils.save_doc_embeddings(detail_emb_norm, 'memory_dbow_detail',
                              batch_size=batch_size,
                              negative_size=negative_sample_size)

    emb = utils.concat_embeddings([bert_title_embeddings, detail_emb_norm])
    utils.save_doc_embeddings(emb, 'memory_dbow',
                              batch_size=batch_size,
                              negative_size=negative_sample_size)
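# utils.mean_embeddings is not shown here; a minimal hypothetical sketch of its
# assumed behaviour: the element-wise mean of equally shaped embedding matrices,
# which is how the weighted title vectors and the trained doc vectors are
# combined above.
import numpy as np

def _mean_embeddings_sketch(embedding_list):
    # k matrices of shape [n_docs, d] -> one [n_docs, d] matrix
    return np.mean(np.stack(embedding_list, axis=0), axis=0)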
def train(gpu_no, show_loss, train_data, label_data, word_embedding_size,
          doc_embedding_size, batch_size, negative_sample_size,
          epochs_step_1, epochs_step_2):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_no)

    print('word_embedding_size', word_embedding_size)
    print('doc_embedding_size', doc_embedding_size)
    print('batch_size', batch_size)
    print('negative_sample_size', negative_sample_size)
    print('epochs_step_1:', epochs_step_1)
    print('epochs_step_2:', epochs_step_2)

    # Init
    ops.reset_default_graph()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Load
    print('Loading pre processed data')
    all_docs = utils.load('all_docs')
    word_dictionary = utils.load('word_dictionary')
    # bert_title_embeddings = utils.load('bert_title_embeddings')
    # bert_detail_embeddings = utils.load('bert_detail_embeddings_100')
    bert_embeddings = utils.load_doc_embeddings('bert_doc_embeddings')

    docs_size = len(all_docs)
    vocabulary_size = len(word_dictionary)
    train_set_size = len(train_data)
    final_embedding_size = doc_embedding_size

    print('docs_size:', docs_size)
    print('vocabulary_size:', vocabulary_size)
    print('final_embedding_size:', final_embedding_size)
    print('train_set_size:', train_set_size)

    print('Creating model')

    # Define Embeddings:
    with tf.name_scope('embeddings'):
        # doc_embeddings = tf.Variable(tf.random_uniform([docs_size, doc_embedding_size], -1.0, 1.0))
        # doc_embeddings = tf.Variable(bert_detail_embeddings)
        doc_embeddings = tf.Variable(bert_embeddings)

    # NCE loss parameters
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, final_embedding_size],
                            stddev=1.0 / np.sqrt(final_embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Create data/target placeholders
    x_inputs = tf.placeholder(tf.int32, shape=[None, 1])
    y_target = tf.placeholder(tf.int32, shape=[None, 1])

    # Lookup the embedding
    final_embed = tf.nn.embedding_lookup(doc_embeddings, x_inputs[:, 0])

    # Get loss from prediction
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed,
                           negative_sample_size, vocabulary_size))

    # Create optimizer
    optimizer = tf.train.AdamOptimizer()
    train_step = optimizer.minimize(loss, var_list=[nce_weights, nce_biases])
    optimizer_2 = tf.train.GradientDescentOptimizer(learning_rate=0.005)
    train_step_2 = optimizer_2.minimize(loss, var_list=[doc_embeddings])

    # Add variable initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        print('Starting training')
        generations = math.ceil(train_set_size / batch_size)

        # Step 1: train only the NCE weights/biases
        for epoch in range(epochs_step_1):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        # Step 2: fine-tune only the document embeddings
        for epoch in range(epochs_step_2):
            for generation in range(generations):
                # Generate training data
                batch_train, batch_label = dataset.generate_batch_data(
                    train_data, label_data, batch_size, generation)

                # Run the train step
                feed_dict = {x_inputs: batch_train, y_target: batch_label}
                sess.run(train_step_2, feed_dict=feed_dict)

                # Print the loss
                if show_loss and (generation + 1) == generations:
                    loss_val = sess.run(loss, feed_dict=feed_dict)
                    print('Loss at epoch {} : {}'.format(epoch, loss_val))

        print('Saving model')
        doc_embeddings = sess.run(doc_embeddings)

    # Norm
    doc_embeddings = utils.normalize_embeddings(doc_embeddings)
    utils.save_doc_embeddings(doc_embeddings, proj_name,
                              batch_size=batch_size,
                              negative_size=negative_sample_size)
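# A hypothetical call to the two-step trainer above (values illustrative).
# Step 1 fits only the NCE weights/biases against the frozen BERT-initialised
# document vectors; step 2 then fine-tunes the document vectors themselves
# with a small SGD learning rate. doc_embedding_size is assumed to match the
# width of the loaded bert_doc_embeddings, since doc_embeddings is initialised
# from them directly.
train(gpu_no=0,
      show_loss=True,
      train_data=train_data,       # [n_examples, 1] document indices
      label_data=label_data,       # [n_examples, 1] target word ids
      word_embedding_size=100,
      doc_embedding_size=768,      # e.g. 768 if the BERT vectors are 768-d
      batch_size=500,
      negative_sample_size=64,
      epochs_step_1=10,
      epochs_step_2=10)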
def train(doc_embedding_size, negative_sample_size, epochs):
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)
    GoogleJobSkillDocument = namedtuple('GoogleJobSkillDocument', 'words tags')
    for i in range(corpus_size):
        words = all_docs[i].title_words
        tags = [i]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    for i in range(corpus_size):
        words = all_docs[i].detail_words
        tags = [i + corpus_size]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    model = Doc2Vec(dm=0, vector_size=doc_embedding_size, negative=negative_sample_size,
                    hs=0, min_count=2, sample=0, epochs=epochs, workers=cores)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    title_emb, detail_emb = utils.split_embeddings(model.docvecs, 2)
    doc_emb = utils.concat_embeddings([title_emb, detail_emb])

    title_emb = utils.normalize_embeddings(title_emb)
    detail_emb = utils.normalize_embeddings(detail_emb)
    doc_emb = utils.normalize_embeddings(doc_emb)

    utils.save_doc_embeddings(title_emb, 'gensim_dbow_title',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(detail_emb, 'gensim_dbow_detail',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(doc_emb, 'gensim_dbow',
                              negative_size=negative_sample_size)
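# utils.split_embeddings is not shown here. Given that titles are tagged
# 0 .. corpus_size-1 and details corpus_size .. 2*corpus_size-1 above, it is
# assumed to cut the trained docvecs into that many equal blocks in tag order;
# a minimal hypothetical sketch:
import numpy as np

def _split_embeddings_sketch(docvecs, parts):
    # Collect vectors in tag order, then split into `parts` equal blocks.
    matrix = np.array([docvecs[i] for i in range(len(docvecs))])
    return np.split(matrix, parts, axis=0)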
def train(doc_embedding_size, window_size, negative_sample_size, is_concat, epochs):
    print('window_size:', window_size)
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('is_concat:', is_concat)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)
    ImdbDocument = namedtuple('ImdbDocument', 'words tags')
    for i in range(corpus_size):
        words = all_docs[i].words
        tags = [i]
        alldocs.append(ImdbDocument(words, tags))
    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    if is_concat:
        model = Doc2Vec(dm=1, vector_size=doc_embedding_size, negative=negative_sample_size,
                        window=window_size, hs=0, min_count=2, sample=0,
                        epochs=epochs, workers=cores, alpha=0.05, dm_concat=1)
    else:
        model = Doc2Vec(dm=1, vector_size=doc_embedding_size, negative=negative_sample_size,
                        window=window_size, hs=0, min_count=2, sample=0,
                        epochs=epochs, workers=cores, alpha=0.05)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    emb = []
    for i in range(corpus_size):
        emb.append(model.docvecs[i])
    emb = utils.normalize_embeddings(emb)
    utils.save_doc_embeddings(emb, 'gensim_dm', is_concat=is_concat,
                              window_size=window_size,
                              negative_size=negative_sample_size)

    # Sample words
    sample_words = ['engineer']
    for word in sample_words:
        similars = model.wv.most_similar(word, topn=10)
        print(similars)
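# A hypothetical call to the gensim PV-DM trainer above (values illustrative,
# not the project's settings). With is_concat=True, dm_concat=1 concatenates
# the window word vectors with the document vector instead of averaging them,
# which trains a much larger projection layer and is noticeably slower.
train(doc_embedding_size=100,
      window_size=5,
      negative_sample_size=5,
      is_concat=False,
      epochs=20)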
def concat_embeddings(emb1_name, emb2_name):
    emb1 = utils.load_doc_embeddings(emb1_name)
    emb2 = utils.load_doc_embeddings(emb2_name)

    emb = utils.concat_embeddings([emb1, emb2])
    emb = utils.normalize_embeddings(emb)

    utils.save_doc_embeddings(emb, emb1_name + ' + ' + emb2_name)
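# A hypothetical use of the helper above. The two names must match embeddings
# previously written by utils.save_doc_embeddings; save_doc_embeddings is
# assumed to encode extra hyperparameters (batch size, negative size, ...) into
# the stored name, so the exact strings depend on how the files were saved.
concat_embeddings('gensim_dbow_title', 'gensim_dbow_detail')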