from os import curdir, makedirs
from os.path import abspath, join
from time import time

import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors

# Project-local helpers (load_binarize_data, read_flags, input_pipeline,
# input_pipeline_test, train_step, train_step_verbose, plot_distances,
# find_threshold, Siamese, DoubleSiamese) are assumed to be importable from
# elsewhere in this repository.


def load_embeddings(tf_path, flags_path):
    # Load the vocabulary
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    vocab_dictionary = vocab_processor.vocabulary_._mapping
    sorted_vocab = sorted(vocab_dictionary.items(), key=lambda x: x[1])
    flags = read_flags(flags_path)

    # Load the pre-trained word2vec vectors (note: hardcoded local path)
    w2v_path = "/home/mgimenez/Dev/resources/w2v/GoogleNews-vectors-negative300.bin"
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    # Initialize the embedding matrix at random, then overwrite the rows of
    # the words that have a pre-trained vector
    init_embedding = np.random.uniform(
        -1.0, 1.0, (len(vocab_processor.vocabulary_), flags.embedding_dim))
    for word, word_idx in sorted_vocab:
        if word in w2v:
            init_embedding[word_idx] = w2v[word]
    return init_embedding
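# Example usage (a minimal sketch, not part of the original pipeline): build
# the initial embedding matrix and hand it to the trainer defined below.
# `tf_path` and `flags_path` are assumed to point at this project's
# preprocessing output and pickled FLAGS, respectively.
#
#   flags = read_flags(flags_path)
#   init_embeddings = load_embeddings(tf_path, flags_path)
#   model_dir = train_siamese_fromtf(tf_path, flags, num_epochs=10,
#                                    init_embeddings=init_embeddings)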
def train_double_siamese(tf_path, flags, num_epochs, out_dir=None,
                         init_embeddings=None):
    # TODO init_embeddings is accepted but not yet applied to the double model
    tf.logging.set_verbosity(tf.logging.INFO)

    # Create the directory where the training will be saved
    if not out_dir:
        timestamp = str(int(time()))
        out_dir = abspath(join(curdir, "models", timestamp))
        makedirs(out_dir, exist_ok=True)

    # Load the records
    train_sim_path = join(tf_path, 'train_sim.tfrecords')
    train_dis_path = join(tf_path, 'train_dis.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    n_labels = 1

    with tf.Graph().as_default():
        # Get similar sentences batch
        slabel_batch, s1_batch, s2_batch = input_pipeline(
            filepath=train_sim_path,
            batch_size=flags.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=num_epochs)
        # Get non-similar sentences batch
        dlabel_batch, d1_batch, d2_batch = input_pipeline(
            filepath=train_dis_path,
            batch_size=flags.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=num_epochs)

        double_siam = DoubleSiamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=flags.embedding_dim,
            filter_sizes=list(map(int, flags.filter_sizes.split(","))),
            num_filters=flags.num_filters,
            margin=flags.margin)

        global_step = tf.Variable(0, trainable=False)
        train_op = tf.train.MomentumOptimizer(0.01, 0.5, use_nesterov=True)
        train_op = train_op.minimize(double_siam.loss, global_step=global_step)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()
        session_conf = tf.ConfigProto(
            allow_soft_placement=flags.allow_soft_placement,
            log_device_placement=flags.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default() as sess:
            sess.run(init_op)
            sess.run(init_again)

            # Show which variables are going to be trained
            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k, "- Shape: ", v.shape)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            try:
                while not coord.should_stop():
                    # Feed a batch of similar pairs to one branch and a batch
                    # of dissimilar pairs to the other
                    slabels, s1, s2 = sess.run(
                        [slabel_batch, s1_batch, s2_batch])
                    dlabels, d1, d2 = sess.run(
                        [dlabel_batch, d1_batch, d2_batch])
                    current_step = tf.train.global_step(sess, global_step)
                    _, loss = sess.run(
                        [train_op, double_siam.loss],
                        feed_dict={
                            double_siam.sim_branch.left_input: s1,
                            double_siam.sim_branch.right_input: s2,
                            double_siam.sim_branch.labels: slabels,
                            double_siam.disim_branch.left_input: d1,
                            double_siam.disim_branch.right_input: d2,
                            double_siam.disim_branch.labels: dlabels,
                            double_siam.sim_branch.is_training: True,
                            double_siam.disim_branch.is_training: True
                        })
            except tf.errors.OutOfRangeError:
                print("Done training!")
            finally:
                coord.request_stop()
                coord.join(threads)

            # Save the model and the parameters used to train it
            # (out_dir is guaranteed to exist at this point)
            with open(join(out_dir, 'parameters.txt'), 'w') as param_file:
                param_file.write("Default parameters: \n")
                for attr, value in sorted(flags.__flags.items()):
                    param_file.write(" - {}={}\n".format(attr.upper(), value))
            save_path = saver.save(sess, join(out_dir, "model.ckpt"))
            print("Model saved in file: {}".format(save_path))
    return out_dir
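# Example usage (a sketch; assumes `flags` was loaded with read_flags and the
# train_sim/train_dis tfrecords in `tf_path` were produced by this project's
# preprocessing step):
#
#   model_dir = train_double_siamese(tf_path, flags, num_epochs=5)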
def test_double(tf_path, model_path, flags_path):
    # Import the binarized parameters
    test_tfrecords = join(tf_path, 'test.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)
    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    # TEST THE SYSTEM
    with tf.Graph().as_default():
        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=test_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)
        double_siam = DoubleSiamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            margin=FLAGS.margin)
        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the test data
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0
            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])
                    # TEST CLASSIFICATION: the same pair is fed to both
                    # branches; the distance and accuracy are read from the
                    # similar branch
                    loss, distance, accuracy = sess.run(
                        [double_siam.loss,
                         double_siam.sim_branch.distance,
                         double_siam.sim_branch.accuracy],
                        feed_dict={
                            double_siam.sim_branch.left_input: test_1,
                            double_siam.sim_branch.right_input: test_2,
                            double_siam.sim_branch.labels: test_label,
                            double_siam.sim_branch.is_training: False,
                            double_siam.disim_branch.left_input: test_1,
                            double_siam.disim_branch.right_input: test_2,
                            double_siam.disim_branch.labels: test_label,
                            double_siam.disim_branch.is_training: False
                        })
                    with open(join(model_path, 'test.log'), 'a') as log_file:
                        log_str = "(#{0: <5} - {1}) - Loss: {2:.4f} - (d: {3:.4f})\n"
                        log_file.write(log_str.format(
                            test_sample, test_label[0][0], loss, distance[0]))
                    with open(join(model_path, 'distances.log'), 'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(log_str.format(
                            distance[0], test_label[0][0]))
                    test_sample += 1
                    if accuracy == 1:
                        hits += 1
            except tf.errors.OutOfRangeError:
                print("Done testing!")
            finally:
                coord.request_stop()
                coord.join(threads)
            sess.close()

    with open(join(model_path, 'results.txt'), 'w') as results_file:
        results_file.write("Accuracy: {} ({}/{})".format(
            hits / test_sample, hits, test_sample))
    print("Results saved in: {}".format(join(model_path, 'results.txt')))
    plot_distances(model_path)
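# Example usage (a sketch): evaluate a double-siamese model trained with
# train_double_siamese above. This writes test.log, distances.log and
# results.txt under `model_dir` and plots the distance distribution.
#
#   test_double(tf_path, model_dir, flags_path)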
def train_siamese_fromtf(tf_path, flags, num_epochs, out_dir=None,
                         one_hot=False, verbose=False, init_embeddings=None):
    """Train a Siamese NN using tfrecords as input."""
    tf.logging.set_verbosity(tf.logging.INFO)

    # Create the directory where the training will be saved
    if not out_dir:
        timestamp = str(int(time()))
        out_dir = abspath(join(curdir, "models", timestamp))
        makedirs(out_dir, exist_ok=True)

    # Load the records
    train_path = join(tf_path, 'train.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    # Read the configuration flags
    # TODO Remove this from the siamese class
    n_labels = 2 if one_hot else 1

    with tf.Graph().as_default():
        label_batch, sentences_1_batch, sentences_2_batch = input_pipeline(
            filepath=train_path,
            batch_size=flags.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=num_epochs)
        siamese = Siamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=flags.embedding_dim,
            filter_sizes=list(map(int, flags.filter_sizes.split(","))),
            num_filters=flags.num_filters,
            margin=flags.margin)

        # Nesterov momentum with an exponentially decaying learning rate
        # (earlier experiments, kept for reference):
        # train_op = tf.train.GradientDescentOptimizer(
        #     learning_rate=learning_rate).minimize(siamese.loss)
        # optimizer = tf.train.AdamOptimizer(0.2)
        # grads_and_vars = optimizer.compute_gradients(siamese.loss)
        # train_op = optimizer.apply_gradients(grads_and_vars,
        #                                      global_step=global_step)
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.01
        learning_rate = tf.train.exponential_decay(
            starter_learning_rate, global_step, 1000000, 0.95, staircase=False)
        train_op = tf.train.MomentumOptimizer(
            learning_rate, 0.5, use_nesterov=True)
        train_op = train_op.minimize(siamese.loss, global_step=global_step)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()
        session_conf = tf.ConfigProto(
            allow_soft_placement=flags.allow_soft_placement,
            log_device_placement=flags.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default() as sess:
            if verbose:
                # Summaries for TensorBoard
                tf.summary.histogram('embedding', siamese.W_embedding)
                tf.summary.histogram('tensor_left', siamese.left_siamese)
                tf.summary.histogram('tensor_right', siamese.right_siamese)
                tf.summary.histogram('distance', siamese.distance)
                tf.summary.scalar('loss', siamese.loss)
                tf.summary.scalar('distance', siamese.distance[0])
                tf.summary.scalar('attraction', siamese.attr[0][0])
                tf.summary.scalar('repulsion', siamese.rep[0][0])
                summary_op = tf.summary.merge_all()
                summary_writer = tf.summary.FileWriter('./train', sess.graph)

            sess.run(init_op)
            sess.run(init_again)

            # Show which variables are going to be trained
            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k, "- Shape: ", v.shape)

            # Load the pre-trained embeddings, if any
            if init_embeddings is not None:
                sess.run(siamese.W_embedding.assign(init_embeddings))

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            try:
                while not coord.should_stop():
                    labels, s1, s2 = sess.run(
                        [label_batch, sentences_1_batch, sentences_2_batch])
                    current_step = tf.train.global_step(sess, global_step)
                    if verbose:
                        train_step_verbose(sess, train_op, summary_op,
                                           summary_writer, siamese,
                                           s1, s2, labels, current_step)
                    else:
                        train_step(sess, train_op, siamese, s1, s2, labels,
                                   out_dir, current_step)
            except tf.errors.OutOfRangeError:
                print("Done training!")
            finally:
                coord.request_stop()
                coord.join(threads)

            # Save the model and the parameters used to train it
            # (out_dir is guaranteed to exist at this point)
            with open(join(out_dir, 'parameters.txt'), 'w') as param_file:
                param_file.write("Default parameters: \n")
                for attr, value in sorted(flags.__flags.items()):
                    param_file.write(" - {}={}\n".format(attr.upper(), value))
            save_path = saver.save(sess, join(out_dir, "model.ckpt"))
            print("Model saved in file: {}".format(save_path))
    return out_dir
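# Example usage (a sketch): train the single-branch siamese model with
# verbose TensorBoard summaries written to ./train.
#
#   model_dir = train_siamese_fromtf(tf_path, flags, num_epochs=10,
#                                    verbose=True)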
def dev_step(tf_path, model_path, flags_path, current_step):
    # Import the binarized parameters
    dev_tfrecords = join(tf_path, 'dev.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)
    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    distances_filename = join(
        model_path, 'dev_' + str(current_step) + '_distances.log')
    log_filename = join(model_path, 'dev_' + str(current_step) + '.log')

    # EVALUATE THE SYSTEM ON THE DEV SET
    with tf.Graph().as_default():
        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=dev_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)
        siamese = Siamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            margin=FLAGS.margin)
        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the dev data
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0
            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])
                    # The accuracy fetch was missing in the original, so the
                    # reported accuracy was always 0; `siamese.accuracy` is
                    # assumed to exist, as it does for the double model
                    loss, attraction, repulsion, dis, accuracy = sess.run(
                        [siamese.loss, siamese.attr, siamese.rep,
                         siamese.distance, siamese.accuracy],
                        feed_dict={
                            siamese.left_input: test_1,
                            siamese.right_input: test_2,
                            siamese.labels: test_label,
                            siamese.is_training: False
                        })
                    with open(log_filename, 'a') as log_file:
                        log_str = "(#{0: <5} - {5}) - Loss: {1:.4f} - " \
                                  "(a: {2:.3f} - r: {3:.3f} - " \
                                  "d: {4:.4f})\n"
                        log_file.write(log_str.format(
                            test_sample, loss, attraction[0][0],
                            repulsion[0][0], dis[0], test_label[0][0]))
                    with open(distances_filename, 'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(log_str.format(dis[0], test_label[0][0]))
                    test_sample += 1
                    if accuracy == 1:
                        hits += 1
            except tf.errors.OutOfRangeError:
                print("Done evaluating!")
            finally:
                coord.request_stop()
                coord.join(threads)
            sess.close()

    with open(join(model_path, 'dev.txt'), 'a') as results_file:
        results_file.write("Accuracy: {} ({}/{})\n".format(
            hits / test_sample, hits, test_sample))
    print("Results saved in: {}".format(join(model_path, 'dev.txt')))
    find_threshold(model_path, distances_filename)
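# Example usage (a sketch): evaluate a checkpoint on the dev set;
# `current_step` only tags the output files (dev_<step>.log,
# dev_<step>_distances.log) so successive evaluations do not overwrite
# each other.
#
#   dev_step(tf_path, model_dir, flags_path, current_step=10000)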