Example #1
def quoraTF_default(flags_path, tf_path, out_dir=None, init_embeddings=None):
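    """Train the default siamese model on the Quora tfrecords in chunks of
    evaluate_epochs epochs, then store the flags file and test the model."""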
    flags = read_flags(flags_path)
    num_epochs = flags.num_epochs
    evaluate_epochs = flags.evaluate_epochs

    for i in range(0, num_epochs, evaluate_epochs):

        # Train n epochs and then evaluate the system
        if not out_dir:
            out_dir = train_siamese_fromtf(tf_path,
                                           flags,
                                           evaluate_epochs,
                                           init_embeddings=init_embeddings)
        else:
            train_siamese_fromtf(tf_path,
                                 flags,
                                 evaluate_epochs,
                                 out_dir,
                                 init_embeddings=init_embeddings)

        # dev_step(tf_path, out_dir, flags_path, i)
    print(' -----------------> ', out_dir)
    copyfile(flags_path, join(out_dir, 'flags.config'))
    test_model(tf_path, out_dir, flags_path)
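
A tiny aside (plain Python, hypothetical values) on how the loop above splits training into evaluation rounds: each iteration trains evaluate_epochs more epochs, so roughly num_epochs / evaluate_epochs train/evaluate rounds are run before the final test.

num_epochs, evaluate_epochs = 10, 2   # hypothetical values
print(list(range(0, num_epochs, evaluate_epochs)))   # [0, 2, 4, 6, 8] -> 5 rounds
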
Example #2
def quoraTF_double(flags_path, tf_path, out_dir=None):
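    """Same chunked training loop as quoraTF_default, but using the double
    siamese model (train_double_siamese / test_double)."""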
    flags = read_flags(flags_path)
    num_epochs = flags.num_epochs
    evaluate_epochs = flags.evaluate_epochs
    for i in range(0, num_epochs, evaluate_epochs):
        # Train n epochs and then evaluate the system
        if not out_dir:
            out_dir = train_double_siamese(tf_path, flags, evaluate_epochs)
        else:
            train_double_siamese(tf_path, flags, evaluate_epochs, out_dir)

        # dev_step(tf_path, out_dir, flags_path, i)
    copyfile(flags_path, join(out_dir, 'flags.config'))
    test_double(tf_path, out_dir, flags_path)
Example #3
def load_embeddings(tf_path, flags_path):
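    """Build an initial embedding matrix for the training vocabulary: rows for
    words found in the pretrained word2vec model are copied from it, the rest
    keep their random initialisation in [-1, 1]."""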
    # Load the vocabulary
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    vocab_dictionary = vocab_processor.vocabulary_._mapping
    sorted_vocab = sorted(vocab_dictionary.items(), key=lambda x: x[1])

    flags = read_flags(flags_path)

    w2v_path = "/home/mgimenez/Dev/resources/w2v/GoogleNews-vectors-negative300.bin"
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    init_embedding = np.random.uniform(
        -1.0, 1.0, (len(vocab_processor.vocabulary_), flags.embedding_dim))
    for word, word_idx in sorted_vocab:
        if word in w2v:
            init_embedding[word_idx] = w2v[word]

    return init_embedding
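
A minimal usage sketch, not from the original repository, of seeding a TF 1.x embedding variable with the matrix returned above; embedding_W and word_ids are hypothetical names, and tf_path / flags_path are assumed to be the same paths used elsewhere in these examples.

import tensorflow as tf

init_embedding = load_embeddings(tf_path, flags_path)
# One row per vocabulary word: pretrained word2vec vectors where available,
# uniform random values elsewhere (see load_embeddings above).
embedding_W = tf.Variable(tf.constant(init_embedding, dtype=tf.float32),
                          name="embedding_W")
word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
embedded = tf.nn.embedding_lookup(embedding_W, word_ids)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
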
Example #4
def train_siamese_fromtf(tf_path, config_flags, one_hot=False):
    """ Train a Siamese NN using a tfrecords as an input"""
    # Load the records
    train_path = join(tf_path, 'train.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    # Read the configuration flags
    FLAGS = read_flags(config_flags)
    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    n_labels = 2 if one_hot else 1
    print('--------', n_labels)

    # Load the dev records
    dev_path = join(tf_path, 'dev.tfrecords')
    # dev_labels, dev_s1, dev_s2 = get_all_records(dev_path, n_labels, seq_len)

    with tf.Graph().as_default():

        label_batch, sentences_1_batch, sentences_2_batch = input_pipeline(
            filepath=train_path,
            batch_size=FLAGS.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=FLAGS.num_epochs)
        #
        # dev_labels, dev_sentences_1, dev_sentences_2 = input_pipeline(filepath=dev_path,
        #                                                               batch_size=FLAGS.batch_size,
        #                                                               num_labels=n_labels,
        #                                                               sequence_len=seq_len,
        #                                                               num_epochs=FLAGS.num_epochs)
        #
        print('HASH TRAIN  ----->', FLAGS.hash_size)
        siamese = Siamese(sequence_length=seq_len,
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          margin=FLAGS.margin,
                          threshold=FLAGS.threshold,
                          fully=fully_layer,
                          hash_size=FLAGS.hash_size)

        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.1
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   global_step,
                                                   100000,
                                                   0.96,
                                                   staircase=True)
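        # NOTE: the decayed learning_rate defined above is never passed to the
        # optimizer; the MomentumOptimizer below uses a fixed rate of 0.0001.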
        train_op = tf.train.MomentumOptimizer(
            0.0001, 0.95, use_nesterov=True).minimize(siamese.loss,
                                                      global_step=global_step)
        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)

        sess = tf.Session(config=session_conf)
        with sess.as_default() as sess:
            sess.run(init_op)
            sess.run(init_again)

            # TODO the map function is not picked up here, so none of this works
            # training_dataset = tf.contrib.data.TFRecordDataset([train_path])
            # # training_dataset = training_dataset.map(lambda x: parse_function(x, n_labels, seq_len))
            # training_dataset = training_dataset.map(lambda x: x)
            #
            # # training_dataset = training_dataset.shuffle(buffer_size=10000)
            # # training_dataset = training_dataset.repeat().batch(100)
            #
            # validation_dataset = tf.contrib.data.TFRecordDataset([train_path])
            # # training_dataset = tf.contrib.data.TFRecordDataset([train_path]).map(lambda x: )
            # # validation_dataset = tf.contrib.data.TFRecordDataset([train_path]).map(
            # #     lambda x: parse_function(x, n_labels, seq_len))
            # iterator = tf.contrib.data.Iterator.from_structure(training_dataset.output_types,
            #                                                    training_dataset.output_shapes)
            # next_element = iterator.get_next()
            #
            # training_init_op = iterator.make_initializer(training_dataset)
            # validation_init_op = iterator.make_initializer(training_dataset)
            #
            # # Run 20 epochs in which the training dataset is traversed, followed by the
            # # validation dataset.
            # for _ in range(1):
            #     # Initialize an iterator over the training dataset.
            #     sess.run(training_init_op)
            #     for _ in range(1):
            #         a = sess.run(next_element)
            #         # parse_function(a, n_labels, seq_len)
            #         print(a)
            #     #
            #     # # Initialize an iterator over the validation dataset.
            #     # sess.run(validation_init_op)
            #     # for _ in range(1):
            #     #     sess.run(next_element)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            step = 0
            try:
                while not coord.should_stop():
                    # print('--------------------------------------------------------------')
                    label, s1, s2 = sess.run(
                        [label_batch, sentences_1_batch, sentences_2_batch])
                    step += 1
                    print(step, label.shape, step % 1000)
                    # print(sess.run(sentences_1_batch).shape, sess.run(sentences_2_batch).shape,
                    #       sess.run(label_batch).shape)

                    _, loss, attraction, repulsion, dis, acc = \
                        sess.run([train_op, siamese.loss, siamese.attraction_loss,
                                  siamese.repulsion_loss, siamese.distance,
                                  siamese.accuracy],
                                 feed_dict={
                                     siamese.left_input: s1,
                                     siamese.right_input: s2,
                                     siamese.label: label,
                                     })
                    log_str = "(#{0: <5} - {6}) - Loss: {1:.4f} - " \
                              "(a: {2:.3f} - r: {3:.3f} - " \
                              "d: {4:.4f}, accuracy:{5:.4f})"
                    # Use the labels already fetched above; re-running label_batch
                    # here would dequeue (and discard) an extra training batch.
                    print(
                        log_str.format(sess.run(global_step), loss,
                                       np.mean(attraction), np.mean(repulsion),
                                       np.mean(dis), acc, np.mean(label)))

                    # TODO Dev
                    # if not step % 10:
                    #     print('--------------------------------------------------------------')
                    #     coord_dev = tf.train.Coordinator()
                    #     threads = tf.train.start_queue_runners(coord=coord_dev, sess=sess)
                    #     devstep = 0
                    #     try:
                    #         while not coord_dev.should_stop():
                    #             label, s1, s2 = sess.run([dev_labels, dev_sentences_1, dev_sentences_2])
                    #             devstep += 1
                    #             print(devstep, label.shape)
                    #     except tf.errors.OutOfRangeError:
                    #         print("Done dev!")
                    #     finally:
                    #         coord.request_stop()
                    #
                    #     coord.join(threads)

            except tf.errors.OutOfRangeError:
                print("Done training!")
            finally:
                coord.request_stop()

            coord.join(threads)

            # Save the model
            timestamp = str(int(time()))
            out_dir = abspath(join(curdir, "models", timestamp))
            makedirs(out_dir, exist_ok=True)

            with open(join(out_dir, 'parameters.txt'), 'w') as param_file:
                param_file.write("Default parameters: \n")
                for attr, value in sorted(FLAGS.__flags.items()):
                    param_file.write(" - {}={}\n".format(attr.upper(), value))

            save_path = saver.save(sess, join(out_dir, "model.ckpt"))
            print("Model saved in file: {}".format(save_path))
Example #5
def train_siamese(train_non_sim,
                  train_sim,
                  dev_non_sim,
                  dev_sim,
                  vocab_processor,
                  sequence_len,
                  config_flags=None):
    """ Train a siamese NN """
    FLAGS = read_flags(config_flags)
    val_left_sentences, val_right_sentences, val_sim_labels = get_dev_data(
        dev_sim, dev_non_sim)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)

        sess = tf.Session(config=session_conf)
        # TODO Remove this from the siamese class
        if not FLAGS.hash_size:
            fully_layer = False
        else:
            fully_layer = True

        with sess.as_default():
            print('HASH TRAIN  ----->', FLAGS.hash_size)
            siamese = Siamese(sequence_len,
                              vocab_size=len(vocab_processor.vocabulary_),
                              embedding_size=FLAGS.embedding_dim,
                              filter_sizes=list(
                                  map(int, FLAGS.filter_sizes.split(","))),
                              num_filters=FLAGS.num_filters,
                              margin=FLAGS.margin,
                              threshold=FLAGS.threshold,
                              fully=fully_layer,
                              hash_size=FLAGS.hash_size)

            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.1
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step,
                                                       100000,
                                                       0.96,
                                                       staircase=True)
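            # NOTE: as in train_siamese_fromtf, the decayed learning_rate above is
            # not used; the MomentumOptimizer below uses a fixed rate of 0.0001.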
            train_step = tf.train.MomentumOptimizer(
                0.0001, 0.95,
                use_nesterov=True).minimize(siamese.loss,
                                            global_step=global_step)

            print()
            sess.run(tf.global_variables_initializer())
            data_size = len(train_sim) + len(train_non_sim)
            num_batches_per_epoch = int(data_size / FLAGS.batch_size) + 1
            print("Num batches per epoch: {} ({})\n".format(
                num_batches_per_epoch, data_size))

            train_sim = np.array(train_sim)
            train_non_sim = np.array(train_non_sim)

            for epoch in range(FLAGS.num_epochs):
                print(
                    "-------------------------------- EPOCH {} ---------------------------"
                    .format(epoch))
                # Prepare the batches
                if FLAGS.shuffle_epochs:
                    shuffled_sim_data, shuffled_non_sim_data = shuffle_epochs(
                        train_sim, train_non_sim)
                    batches = batch_iter(shuffled_sim_data,
                                         shuffled_non_sim_data,
                                         FLAGS.batch_size,
                                         num_batches_per_epoch)
                else:
                    batches = batch_iter(train_sim, train_non_sim,
                                         FLAGS.batch_size,
                                         num_batches_per_epoch)

                # TRAIN A BATCH
                sim_distances, non_sim_distances = [], []
                for cur_batch, batch in enumerate(batches):
                    batch_data, batch_type = batch[0], batch[1]
                    right_sentences = [
                        sample.sentence_1 for sample in batch_data
                    ]
                    left_sentences = [
                        sample.sentence_2 for sample in batch_data
                    ]
                    sim_labels = [sample.label for sample in batch_data]
                    # print(Counter(sim_labels))
                    # print(len(right_sentences))
                    assert len(right_sentences) == len(left_sentences) == len(
                        sim_labels)

                    _, loss, attraction, repulsion, d, accuracy, predictions, correct = sess.run(
                        [
                            train_step, siamese.loss, siamese.attraction_loss,
                            siamese.repulsion_loss, siamese.distance,
                            siamese.accuracy, siamese.predictions,
                            siamese.correct_predictions
                        ],
                        feed_dict={
                            siamese.left_input: left_sentences,
                            siamese.right_input: right_sentences,
                            siamese.label: sim_labels
                        })

                    print("(#{0: <7}) - Loss: {1:.4f} (a: {2:.4f} - r: {3:.4f}"
                          "- d: {4:.4f}, accuracy:{5:.4f})".format(
                              batch_type, loss, np.mean(attraction),
                              np.mean(repulsion), np.mean(d), accuracy))
                    if batch_type == 'SIM':
                        sim_distances.append(d)
                    else:
                        non_sim_distances.append(d)
                print('---------------------> sim: {} -  non sim: {}'.format(
                    np.array(sim_distances).mean(),
                    np.array(non_sim_distances).mean()))
                print(len(val_sim_labels))
                dev_step(sess, siamese, val_left_sentences,
                         val_right_sentences, val_sim_labels, epoch)
                print('Working dev step')
Example #6
def test_double(tf_path, model_path, flags_path):
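    """Evaluate a trained double-siamese checkpoint on the test tfrecords,
    logging per-sample loss and distance and writing the overall accuracy."""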
    # Import the parameters binarized
    test_tfrecords = join(tf_path, 'test.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)
    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    # TEST THE SYSTEM
    with tf.Graph().as_default():

        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=test_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)

        print(type(label_batch), type(test_1_batch), type(test_2_batch))
        double_siam = DoubleSiamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            margin=FLAGS.margin)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the test data
            coord = tf.train.Coordinator()

            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0

            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])
                    # TEST CLASSIFICATION
                    loss, distance, accuracy = sess.run(
                        [
                            double_siam.loss, double_siam.sim_branch.distance,
                            double_siam.sim_branch.accuracy
                        ],
                        feed_dict={
                            double_siam.sim_branch.left_input: test_1,
                            double_siam.sim_branch.right_input: test_2,
                            double_siam.sim_branch.labels: test_label,
                            double_siam.sim_branch.is_training: False,
                            double_siam.disim_branch.left_input: test_1,
                            double_siam.disim_branch.right_input: test_2,
                            double_siam.disim_branch.labels: test_label,
                            double_siam.disim_branch.is_training: False
                        })

                    with open(join(model_path, 'test.log'), 'a') as log_file:
                        log_str = "(#{0: <5} - {1}) - Loss: {2:.4f} - (d: {3:.4f})\n"
                        log_file.write(
                            log_str.format(test_sample, test_label[0][0], loss,
                                           distance[0]))

                    with open(join(model_path, 'distances.log'),
                              'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(
                            log_str.format(distance[0], test_label[0][0]))
                    test_sample += 1
                    if accuracy == 1:
                        hits += 1
            except tf.errors.OutOfRangeError:
                print("Done testing!")
            finally:
                coord.request_stop()

            coord.join(threads)
            sess.close()

            with open(join(model_path, 'results.txt'), 'w') as results_file:
                results_file.write("Accuracy: {} ({}/{})".format(
                    hits / test_sample, hits, test_sample))

            print("Results saved in: {}".format(join(model_path,
                                                     'results.txt')))
            plot_distances(model_path)
Example #7
def input_pipeline(filepath, batch_size, num_labels, sequence_len, num_epochs=None):
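    """Build a shuffled batch queue over a tfrecords file (TF 1.x queue runners)."""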
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer([filepath], num_epochs=num_epochs)
        pair_id, sentence_1, sentence_2 = read_sample(filename_queue, num_labels, sequence_len)
        pair_batch, sentences_1_batch, sentences_2_batch = tf.train.shuffle_batch(
            [pair_id, sentence_1, sentence_2],
            batch_size=batch_size,
            num_threads=1,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000)
    return pair_batch, sentences_1_batch, sentences_2_batch

if __name__ == "__main__":

    tf_path, model_path, flags_path = get_arguments()
    FLAGS = read_flags(flags_path)

    # Import the parameters binarized
    TEST_PATH = join(tf_path, 'test.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    # TEST THE SYSTEM
    with tf.Graph().as_default():
        label_batch, test_1_batch, test_2_batch = input_pipeline(filepath=TEST_PATH,
Example #8
def dev_step(tf_path, model_path, flags_path, current_step):
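    """Run the dev tfrecords through a restored siamese checkpoint for the given
    training step, logging per-sample losses and distances."""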

    # Import the parameters binarized
    test_tfrecords = join(tf_path, 'dev.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)
    # TODO Remove this from the siamese class
    fully_layer = True if FLAGS.hash_size else False
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    distances_filename = join(model_path,
                              'dev_' + str(current_step) + '_distances.log')
    log_filename = join(model_path, 'dev_' + str(current_step) + '.log')

    # TEST THE SYSTEM
    with tf.Graph().as_default():

        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=test_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)

        print(type(label_batch), type(test_1_batch), type(test_2_batch))
        siamese = Siamese(sequence_length=seq_len,
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          margin=FLAGS.margin)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the test data
            coord = tf.train.Coordinator()

            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0

            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])
                    loss, attraction, repulsion, dis = \
                        sess.run([siamese.loss, siamese.attr,
                                  siamese.rep, siamese.distance],
                                 feed_dict={
                                     siamese.left_input: test_1,
                                     siamese.right_input: test_2,
                                     siamese.labels: test_label,
                                     siamese.is_training: False
                                 })

                    with open(log_filename, 'a') as log_file:
                        log_str = "(#{0: <5} - {5}) - Loss: {1:.4f} - " \
                                  "(a: {2:.3f} - r: {3:.3f} - " \
                                  "d: {4:.4f})\n"
                        log_file.write(
                            log_str.format(
                                test_sample,
                                loss,
                                attraction[0][0],
                                repulsion[0][0],
                                dis[0],
                                test_label[0][0],
                            ))

                    with open(distances_filename, 'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(
                            log_str.format(dis[0], test_label[0][0]))
                    test_sample += 1

            except tf.errors.OutOfRangeError:
                print("Done evaluating!")
            finally:
                coord.request_stop()

            coord.join(threads)
            sess.close()
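            # NOTE: `hits` is never incremented in this loop, so the accuracy
            # written below always reports 0 out of test_sample.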

            with open(join(model_path, 'dev.txt'), 'a') as results_file:
                results_file.write("Accuracy: {} ({}/{})".format(
                    hits / test_sample, hits, test_sample))

            print("Results saved in: {}".format(join(model_path, 'dev.txt')))
            find_threshold(model_path, distances_filename)