def quoraTF_default(flags_path, tf_path, out_dir=None, init_embeddings=None):
    """Train the default Siamese model in chunks of `evaluate_epochs` epochs, then test it."""
    flags = read_flags(flags_path)
    num_epochs = flags.num_epochs
    evaluate_epochs = flags.evaluate_epochs
    for i in range(0, num_epochs, evaluate_epochs):
        # Train n epochs and then evaluate the system
        if not out_dir:
            out_dir = train_siamese_fromtf(tf_path, flags, evaluate_epochs,
                                           init_embeddings=init_embeddings)
        else:
            train_siamese_fromtf(tf_path, flags, evaluate_epochs, out_dir,
                                 init_embeddings=init_embeddings)
        # dev_step(tf_path, out_dir, flags_path, i)
        print(' -----------------> ', out_dir)

    copyfile(flags_path, join(out_dir, 'flags.config'))
    print(' -----------------> ', out_dir)
    test_model(tf_path, out_dir, flags_path)
def quoraTF_double(flags_path, tf_path, out_dir=None):
    flags = read_flags(flags_path)
    num_epochs = flags.num_epochs
    evaluate_epochs = flags.evaluate_epochs
    for i in range(0, num_epochs, evaluate_epochs):
        # Train n epochs and then evaluate the system
        if not out_dir:
            out_dir = train_double_siamese(tf_path, flags, evaluate_epochs)
        else:
            train_double_siamese(tf_path, flags, evaluate_epochs, out_dir)
        # dev_step(tf_path, out_dir, flags_path, i)

    copyfile(flags_path, join(out_dir, 'flags.config'))
    test_double(tf_path, out_dir, flags_path)
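# Both drivers above train in chunks of `evaluate_epochs` epochs: for example, with
# num_epochs=20 and evaluate_epochs=5 the loop calls the training function four times,
# creating the model directory on the first call and reusing it as `out_dir` afterwards,
# before the final test pass.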
def load_embeddings(tf_path, flags_path):
    """Build an initial embedding matrix, seeding known words with pre-trained word2vec vectors."""
    # Load the vocabulary
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    vocab_dictionary = vocab_processor.vocabulary_._mapping
    sorted_vocab = sorted(vocab_dictionary.items(), key=lambda x: x[1])

    flags = read_flags(flags_path)

    # Load the pre-trained word2vec model
    w2v_path = "/home/mgimenez/Dev/resources/w2v/GoogleNews-vectors-negative300.bin"
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    # Words present in word2vec keep their pre-trained vector; the rest stay random
    init_embedding = np.random.uniform(-1.0, 1.0,
                                       (len(vocab_processor.vocabulary_),
                                        flags.embedding_dim))
    for word, word_idx in sorted_vocab:
        if word in w2v:
            init_embedding[word_idx] = w2v[word]
    return init_embedding
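# Usage sketch (hypothetical paths): the matrix returned by load_embeddings() is meant
# to seed the embedding layer through the init_embeddings argument of quoraTF_default()
# defined above.
#
#     init_emb = load_embeddings('data/quora_tf', 'config/default.flags')
#     quoraTF_default('config/default.flags', 'data/quora_tf', init_embeddings=init_emb)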
def train_siamese_fromtf(tf_path, config_flags, one_hot=False):
    """ Train a Siamese NN using a tfrecords as an input"""
    # Load the records
    train_path = join(tf_path, 'train.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    # Read the configuration flags
    FLAGS = read_flags(config_flags)

    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    n_labels = 2 if one_hot else 1
    print('--------', n_labels)

    # Load the dev records
    dev_path = join(tf_path, 'dev.tfrecords')
    # dev_labels, dev_s1, dev_s2 = get_all_records(dev_path, n_labels, seq_len)

    with tf.Graph().as_default():
        label_batch, sentences_1_batch, sentences_2_batch = input_pipeline(
            filepath=train_path,
            batch_size=FLAGS.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=FLAGS.num_epochs)
        # dev_labels, dev_sentences_1, dev_sentences_2 = input_pipeline(
        #     filepath=dev_path,
        #     batch_size=FLAGS.batch_size,
        #     num_labels=n_labels,
        #     sequence_len=seq_len,
        #     num_epochs=FLAGS.num_epochs)

        # print('HASH TRAIN ----->', FLAGS.hash_size)
        siamese = Siamese(sequence_length=seq_len,
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          margin=FLAGS.margin,
                          threshold=FLAGS.threshold,
                          fully=fully_layer,
                          hash_size=FLAGS.hash_size)

        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.1
        # NOTE: the decayed learning rate is computed but never used; the optimizer
        # below runs with a fixed learning rate of 0.0001.
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                   100000, 0.96, staircase=True)
        train_op = tf.train.MomentumOptimizer(0.0001, 0.95, use_nesterov=True).minimize(
            siamese.loss, global_step=global_step)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)

        with sess.as_default() as sess:
            sess.run(init_op)
            sess.run(init_again)

            # TODO The map function is not detected, so none of this works yet.
            # training_dataset = tf.contrib.data.TFRecordDataset([train_path])
            # # training_dataset = training_dataset.map(lambda x: parse_function(x, n_labels, seq_len))
            # training_dataset = training_dataset.map(lambda x: x)
            # # training_dataset = training_dataset.shuffle(buffer_size=10000)
            # # training_dataset = training_dataset.repeat().batch(100)
            # # validation_dataset = tf.contrib.data.TFRecordDataset([train_path])
            # # validation_dataset = tf.contrib.data.TFRecordDataset([train_path]).map(
            # #     lambda x: parse_function(x, n_labels, seq_len))
            # iterator = tf.contrib.data.Iterator.from_structure(training_dataset.output_types,
            #                                                    training_dataset.output_shapes)
            # next_element = iterator.get_next()
            # training_init_op = iterator.make_initializer(training_dataset)
            # validation_init_op = iterator.make_initializer(training_dataset)
            #
            # # Run 20 epochs in which the training dataset is traversed, followed by the
            # # validation dataset.
            # for _ in range(1):
            #     # Initialize an iterator over the training dataset.
            #     sess.run(training_init_op)
            #     for _ in range(1):
            #         a = sess.run(next_element)
            #         # parse_function(a, n_labels, seq_len)
            #         print(a)
            #
            # # Initialize an iterator over the validation dataset.
            # sess.run(validation_init_op)
            # for _ in range(1):
            #     sess.run(next_element)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            step = 0
            try:
                while not coord.should_stop():
                    label, s1, s2 = sess.run(
                        [label_batch, sentences_1_batch, sentences_2_batch])
                    step += 1
                    print(step, label.shape, step % 1000)

                    _, loss, attraction, repulsion, dis, acc = \
                        sess.run([train_op, siamese.loss, siamese.attraction_loss,
                                  siamese.repulsion_loss, siamese.distance,
                                  siamese.accuracy],
                                 feed_dict={
                                     siamese.left_input: s1,
                                     siamese.right_input: s2,
                                     siamese.label: label,
                                 })
                    log_str = "(#{0: <5} - {6}) - Loss: {1:.4f} - " \
                              "(a: {2:.3f} - r: {3:.3f} - " \
                              "d: {4:.4f}, accuracy:{5:.4f})"
                    # Log the labels already fetched above instead of dequeuing a new batch.
                    print(log_str.format(sess.run(global_step), loss,
                                         np.mean(attraction), np.mean(repulsion),
                                         np.mean(dis), acc, np.mean(label)))

                    # TODO Dev step every N training steps
                    # if not step % 10:
                    #     coord_dev = tf.train.Coordinator()
                    #     threads = tf.train.start_queue_runners(coord=coord_dev, sess=sess)
                    #     devstep = 0
                    #     try:
                    #         while not coord_dev.should_stop():
                    #             label, s1, s2 = sess.run([dev_labels, dev_sentences_1, dev_sentences_2])
                    #             devstep += 1
                    #             print(devstep, label.shape)
                    #     except tf.errors.OutOfRangeError:
                    #         print("Done dev!")
                    #     finally:
                    #         coord.request_stop()
                    #     coord.join(threads)
            except tf.errors.OutOfRangeError:
                print("Done training!")
            finally:
                coord.request_stop()
            coord.join(threads)

            # Save the model
            timestamp = str(int(time()))
            out_dir = abspath(join(curdir, "models", timestamp))
            makedirs(out_dir, exist_ok=True)
            with open(join(out_dir, 'parameters.txt'), 'w') as param_file:
                param_file.write("Default parameters: \n")
                for attr, value in sorted(FLAGS.__flags.items()):
                    param_file.write(" - {}={}\n".format(attr.upper(), value))

            save_path = saver.save(sess, join(out_dir, "model.ckpt"))
            print("Model saved in file: {}".format(save_path))
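# A minimal sketch of the input path the commented-out block above was aiming at,
# assuming TensorFlow >= 1.4 (tf.data instead of tf.contrib.data) and that
# parse_function(serialized, n_labels, seq_len) returns a (label, sentence_1, sentence_2)
# tuple; it is not wired into the training loop.
def _tfrecord_dataset_sketch(train_path, n_labels, seq_len, batch_size, num_epochs):
    dataset = tf.data.TFRecordDataset([train_path])
    # Close over n_labels and seq_len explicitly, which is what the original map() attempt
    # was missing.
    dataset = dataset.map(lambda x: parse_function(x, n_labels, seq_len))
    dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.repeat(num_epochs).batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    # Tensors analogous to label_batch, sentences_1_batch, sentences_2_batch
    return iterator.get_next()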
def train_siamese(train_non_sim, train_sim, dev_non_sim, dev_sim, vocab_processor,
                  sequence_len, config_flags=None):
    """ Train a siamese NN """
    FLAGS = read_flags(config_flags)
    val_left_sentences, val_right_sentences, val_sim_labels = get_dev_data(dev_sim,
                                                                           dev_non_sim)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)

        # TODO Remove this from the siamese class
        if not FLAGS.hash_size:
            fully_layer = False
        else:
            fully_layer = True

        with sess.as_default():
            print('HASH TRAIN ----->', FLAGS.hash_size)
            siamese = Siamese(sequence_len,
                              vocab_size=len(vocab_processor.vocabulary_),
                              embedding_size=FLAGS.embedding_dim,
                              filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                              num_filters=FLAGS.num_filters,
                              margin=FLAGS.margin,
                              threshold=FLAGS.threshold,
                              fully=fully_layer,
                              hash_size=FLAGS.hash_size)

            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.1
            learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                       100000, 0.96, staircase=True)
            train_step = tf.train.MomentumOptimizer(0.0001, 0.95, use_nesterov=True).minimize(
                siamese.loss, global_step=global_step)

            print()
            sess.run(tf.global_variables_initializer())

            data_size = len(train_sim) + len(train_non_sim)
            num_batches_per_epoch = int(data_size / FLAGS.batch_size) + 1
            print("Num batches per epoch: {} ({})\n".format(num_batches_per_epoch, data_size))

            train_sim = np.array(train_sim)
            train_non_sim = np.array(train_non_sim)

            for epoch in range(FLAGS.num_epochs):
                print("-------------------------------- EPOCH {} ---------------------------"
                      .format(epoch))

                # Prepare the batches
                if FLAGS.shuffle_epochs:
                    shuffled_sim_data, shuffled_non_sim_data = shuffle_epochs(train_sim,
                                                                              train_non_sim)
                    batches = batch_iter(shuffled_sim_data, shuffled_non_sim_data,
                                         FLAGS.batch_size, num_batches_per_epoch)
                else:
                    batches = batch_iter(train_sim, train_non_sim,
                                         FLAGS.batch_size, num_batches_per_epoch)

                # TRAIN A BATCH
                sim_distances, non_sim_distances = [], []
                for cur_batch, batch in enumerate(batches):
                    batch_data, batch_type = batch[0], batch[1]
                    right_sentences = [sample.sentence_1 for sample in batch_data]
                    left_sentences = [sample.sentence_2 for sample in batch_data]
                    sim_labels = [sample.label for sample in batch_data]
                    # print(Counter(sim_labels))
                    # print(len(right_sentences))
                    assert len(right_sentences) == len(left_sentences) == len(sim_labels)

                    _, loss, attraction, repulsion, d, accuracy, predictions, correct = sess.run(
                        [train_step, siamese.loss, siamese.attraction_loss,
                         siamese.repulsion_loss, siamese.distance, siamese.accuracy,
                         siamese.predictions, siamese.correct_predictions],
                        feed_dict={
                            siamese.left_input: left_sentences,
                            siamese.right_input: right_sentences,
                            siamese.label: sim_labels
                        })

                    print("(#{0: <7}) - Loss: {1:.4f} (a: {2:.4f} - r: {3:.4f}"
                          "- d: {4:.4f}, accuracy:{5:.4f})".format(
                              batch_type, loss, np.mean(attraction), np.mean(repulsion),
                              np.mean(d), accuracy))

                    if batch_type == 'SIM':
                        sim_distances.append(d)
                    else:
                        non_sim_distances.append(d)

                print('---------------------> sim: {} - non sim: {}'.format(
                    np.array(sim_distances).mean(), np.array(non_sim_distances).mean()))

                print(len(val_sim_labels))
                dev_step(sess, siamese, val_left_sentences, val_right_sentences,
                         val_sim_labels, epoch)
                print('Working dev step')
def test_double(tf_path, model_path, flags_path):
    # Import the parameters binarized
    test_tfrecords = join(tf_path, 'test.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)

    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    # TEST THE SYSTEM
    with tf.Graph().as_default():
        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=test_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)
        print(type(label_batch), type(test_1_batch), type(test_2_batch))

        double_siam = DoubleSiamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            margin=FLAGS.margin)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the test data
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0
            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])

                    # TEST CLASSIFICATION
                    loss, distance, accuracy = sess.run(
                        [double_siam.loss,
                         double_siam.sim_branch.distance,
                         double_siam.sim_branch.accuracy],
                        feed_dict={
                            double_siam.sim_branch.left_input: test_1,
                            double_siam.sim_branch.right_input: test_2,
                            double_siam.sim_branch.labels: test_label,
                            double_siam.sim_branch.is_training: False,
                            double_siam.disim_branch.left_input: test_1,
                            double_siam.disim_branch.right_input: test_2,
                            double_siam.disim_branch.labels: test_label,
                            double_siam.disim_branch.is_training: False
                        })

                    with open(join(model_path, 'test.log'), 'a') as log_file:
                        log_str = "(#{0: <5} - {1}) - Loss: {2:.4f} - (d: {3:.4f})\n"
                        log_file.write(log_str.format(test_sample, test_label[0][0],
                                                      loss, distance[0]))
                    with open(join(model_path, 'distances.log'), 'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(log_str.format(distance[0], test_label[0][0]))

                    test_sample += 1
                    if accuracy == 1:
                        hits += 1
            except tf.errors.OutOfRangeError:
                print("Done testing!")
            finally:
                coord.request_stop()
            coord.join(threads)
            sess.close()

    with open(join(model_path, 'results.txt'), 'w') as results_file:
        results_file.write("Accuracy: {} ({}/{})".format(hits / test_sample,
                                                         hits, test_sample))
    print("Results saved in: {}".format(join(model_path, 'results.txt')))
    plot_distances(model_path)
def input_pipeline(filepath, batch_size, num_labels, sequence_len, num_epochs=None):
    """Queue-based reader that yields shuffled (label, sentence_1, sentence_2) batches."""
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer([filepath], num_epochs=num_epochs)
        pair_id, sentence_1, sentence_2 = read_sample(filename_queue, num_labels,
                                                      sequence_len)
        # Size the shuffle queue from the batch_size argument rather than the global FLAGS.
        pair_batch, sentences_1_batch, sentences_2_batch = tf.train.shuffle_batch(
            [pair_id, sentence_1, sentence_2],
            batch_size=batch_size,
            num_threads=1,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000)
    return pair_batch, sentences_1_batch, sentences_2_batch


if __name__ == "__main__":
    tf_path, model_path, flags_path = get_arguments()
    FLAGS = read_flags(flags_path)

    # Import the parameters binarized
    TEST_PATH = join(tf_path, 'test.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    # TEST THE SYSTEM
    with tf.Graph().as_default():
        label_batch, test_1_batch, test_2_batch = input_pipeline(filepath=TEST_PATH,
                                                                 batch_size=1,
                                                                 num_labels=n_labels,
                                                                 sequence_len=seq_len,
                                                                 num_epochs=1)
def dev_step(tf_path, model_path, flags_path, current_step):
    # Import the parameters binarized
    dev_tfrecords = join(tf_path, 'dev.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)

    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    distances_filename = join(model_path, 'dev_' + str(current_step) + '_distances.log')
    log_filename = join(model_path, 'dev_' + str(current_step) + '.log')

    # EVALUATE THE SYSTEM ON THE DEV SET
    with tf.Graph().as_default():
        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=dev_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)
        print(type(label_batch), type(test_1_batch), type(test_2_batch))

        siamese = Siamese(sequence_length=seq_len,
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          margin=FLAGS.margin)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the dev data
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0
            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])

                    loss, attraction, repulsion, dis = \
                        sess.run([siamese.loss, siamese.attr, siamese.rep,
                                  siamese.distance],
                                 feed_dict={
                                     siamese.left_input: test_1,
                                     siamese.right_input: test_2,
                                     siamese.labels: test_label,
                                     siamese.is_training: False
                                 })

                    with open(log_filename, 'a') as log_file:
                        log_str = "(#{0: <5} - {5}) - Loss: {1:.4f} - " \
                                  "(a: {2:.3f} - r: {3:.3f} - " \
                                  "d: {4:.4f})\n"
                        log_file.write(log_str.format(test_sample, loss,
                                                      attraction[0][0], repulsion[0][0],
                                                      dis[0], test_label[0][0]))
                    with open(distances_filename, 'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(log_str.format(dis[0], test_label[0][0]))

                    test_sample += 1
            except tf.errors.OutOfRangeError:
                print("Done evaluating!")
            finally:
                coord.request_stop()
            coord.join(threads)
            sess.close()

    # NOTE: hits is never updated in the loop above, so the accuracy written below is
    # always 0; the useful output is the distances file, which find_threshold uses to
    # pick a decision threshold.
    with open(join(model_path, 'dev.txt'), 'a') as results_file:
        results_file.write("Accuracy: {} ({}/{})".format(hits / test_sample,
                                                         hits, test_sample))
    print("Results saved in: {}".format(join(model_path, 'dev.txt')))

    find_threshold(model_path, distances_filename)