Example #1
def load_embeddings(tf_path, flags_path):
    # Load the vocabulary
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    vocab_dictionary = vocab_processor.vocabulary_._mapping
    sorted_vocab = sorted(vocab_dictionary.items(), key=lambda x: x[1])

    flags = read_flags(flags_path)

    # Load the pretrained word2vec vectors (hardcoded local path)
    w2v_path = "/home/mgimenez/Dev/resources/w2v/GoogleNews-vectors-negative300.bin"
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    # Initialize the embedding matrix at random, then overwrite the rows of
    # the words that have a pretrained vector
    init_embedding = np.random.uniform(
        -1.0, 1.0, (len(vocab_processor.vocabulary_), flags.embedding_dim))
    for word, word_idx in sorted_vocab:
        if word in w2v:
            init_embedding[word_idx] = w2v[word]

    return init_embedding
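
A minimal usage sketch, assuming hypothetical data paths and the helpers shown elsewhere on this page (read_flags, and train_siamese_fromtf from Example #4): the matrix returned by load_embeddings is passed as init_embeddings, which the training function copies into siamese.W_embedding.

# Hypothetical paths; adjust to the local data layout
tf_path = 'data/tfrecords'
flags_path = 'data/flags.config'

flags = read_flags(flags_path)
init_embeddings = load_embeddings(tf_path, flags_path)
# Expected shape: (vocabulary size, flags.embedding_dim)
print(init_embeddings.shape)

model_dir = train_siamese_fromtf(tf_path, flags, num_epochs=10,
                                 init_embeddings=init_embeddings)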
Example #2
def train_double_siamese(tf_path,
                         flags,
                         num_epochs,
                         out_dir=None,
                         init_embeddings=None):
    tf.logging.set_verbosity(tf.logging.INFO)

    # Create the directory where the training will be saved
    if not out_dir:
        timestamp = str(int(time()))
        out_dir = abspath(join(curdir, "models", timestamp))
        makedirs(out_dir, exist_ok=True)

    # Load the records
    train_sim_path = join(tf_path, 'train_sim.tfrecords')
    train_dis_path = join(tf_path, 'train_dis.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    n_labels = 1

    with tf.Graph().as_default():
        # Get similar sentences batch
        slabel_batch, s1_batch, s2_batch = input_pipeline(
            filepath=train_sim_path,
            batch_size=flags.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=num_epochs)
        # Get non-similar sentences batch
        dlabel_batch, d1_batch, d2_batch = input_pipeline(
            filepath=train_dis_path,
            batch_size=flags.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=num_epochs)
        double_siam = DoubleSiamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=flags.embedding_dim,
            filter_sizes=list(map(int, flags.filter_sizes.split(","))),
            num_filters=flags.num_filters,
            margin=flags.margin)

        global_step = tf.Variable(0, trainable=False)

        optimizer = tf.train.MomentumOptimizer(0.01, 0.5, use_nesterov=True)
        train_op = optimizer.minimize(double_siam.loss, global_step=global_step)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()
        session_conf = tf.ConfigProto(
            allow_soft_placement=flags.allow_soft_placement,
            log_device_placement=flags.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default() as sess:
            sess.run(init_op)
            sess.run(init_again)

            # Show which variables are going to be trained
            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k, "- Shape: ", v.shape)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            try:
                while not coord.should_stop():
                    slabels, s1, s2 = sess.run(
                        [slabel_batch, s1_batch, s2_batch])
                    dlabels, d1, d2 = sess.run(
                        [dlabel_batch, d1_batch, d2_batch])
                    current_step = tf.train.global_step(sess, global_step)
                    _, loss = sess.run(
                        [train_op, double_siam.loss],
                        feed_dict={
                            double_siam.sim_branch.left_input: s1,
                            double_siam.sim_branch.right_input: s2,
                            double_siam.sim_branch.labels: slabels,
                            double_siam.disim_branch.left_input: d1,
                            double_siam.disim_branch.right_input: d2,
                            double_siam.disim_branch.labels: dlabels,
                            double_siam.sim_branch.is_training: True,
                            double_siam.disim_branch.is_training: True
                        })

            except tf.errors.OutOfRangeError:
                print("Done training!")
            finally:
                coord.request_stop()

            coord.join(threads)

            # Save the model
            if not out_dir:
                timestamp = str(int(time()))
                out_dir = abspath(join(curdir, "models", timestamp))
                makedirs(out_dir, exist_ok=True)

            with open(join(out_dir, 'parameters.txt'), 'w') as param_file:
                param_file.write("Default parameters: \n")
                for attr, value in sorted(flags.__flags.items()):
                    param_file.write(" - {}={}\n".format(attr.upper(), value))

            save_path = saver.save(sess, join(out_dir, "model.ckpt"))
            print("Model saved in file: {}".format(save_path))
            return out_dir
Example #3
def test_double(tf_path, model_path, flags_path):
    # Load the binarized parameters
    test_tfrecords = join(tf_path, 'test.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)
    # TODO Remove this from the siamese class
    fully_layer = bool(FLAGS.hash_size)
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    # TEST THE SYSTEM
    with tf.Graph().as_default():

        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=test_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)

        print(type(label_batch), type(test_1_batch), type(test_2_batch))
        double_siam = DoubleSiamese(
            sequence_length=seq_len,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            margin=FLAGS.margin)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the test data
            coord = tf.train.Coordinator()

            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0

            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])
                    # TEST CLASSIFICATION
                    loss, distance, accuracy = sess.run(
                        [
                            double_siam.loss, double_siam.sim_branch.distance,
                            double_siam.sim_branch.accuracy
                        ],
                        feed_dict={
                            double_siam.sim_branch.left_input: test_1,
                            double_siam.sim_branch.right_input: test_2,
                            double_siam.sim_branch.labels: test_label,
                            double_siam.sim_branch.is_training: False,
                            double_siam.disim_branch.left_input: test_1,
                            double_siam.disim_branch.right_input: test_2,
                            double_siam.disim_branch.labels: test_label,
                            double_siam.disim_branch.is_training: False
                        })

                    with open(join(model_path, 'test.log'), 'a') as log_file:
                        log_str = "(#{0: <5} - {1}) - Loss: {2:.4f} - (d: {3:.4f})\n"
                        log_file.write(
                            log_str.format(test_sample, test_label[0][0], loss,
                                           distance[0]))

                    with open(join(model_path, 'distances.log'),
                              'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(
                            log_str.format(distance[0], test_label[0][0]))
                    test_sample += 1
                    if accuracy == 1:
                        hits += 1
            except tf.errors.OutOfRangeError:
                print("Done testing!")
            finally:
                coord.request_stop()

            coord.join(threads)
            sess.close()

            with open(join(model_path, 'results.txt'), 'w') as results_file:
                results_file.write("Accuracy: {} ({}/{})".format(
                    hits / test_sample, hits, test_sample))

            print("Results saved in: {}".format(join(model_path,
                                                     'results.txt')))
            plot_distances(model_path)
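
A sketch of how Examples #2 and #3 fit together, assuming hypothetical paths: train_double_siamese returns the directory it saved model.ckpt into, and test_double restores that checkpoint and writes test.log, distances.log and results.txt next to it.

# Hypothetical paths; tf_path must contain train_sim.tfrecords,
# train_dis.tfrecords, test.tfrecords, vocab.train and sequence.len
tf_path = 'data/tfrecords'
flags_path = 'data/flags.config'

flags = read_flags(flags_path)
model_dir = train_double_siamese(tf_path, flags, num_epochs=10)
test_double(tf_path, model_dir, flags_path)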
Example #4
def train_siamese_fromtf(tf_path,
                         flags,
                         num_epochs,
                         out_dir=None,
                         one_hot=False,
                         verbose=False,
                         init_embeddings=None):
    """ Train a Siamese NN using a tfrecords as an input"""

    tf.logging.set_verbosity(tf.logging.INFO)

    # Create the directory where the training will be saved
    if not out_dir:
        timestamp = str(int(time()))
        out_dir = abspath(join(curdir, "models", timestamp))
        makedirs(out_dir, exist_ok=True)

    # Load the records
    train_path = join(tf_path, 'train.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)

    # Read the configuration flags

    # TODO Remove this from the siamese class
    n_labels = 2 if one_hot else 1
    print('--------', n_labels)

    with tf.Graph().as_default():

        label_batch, sentences_1_batch, sentences_2_batch = input_pipeline(
            filepath=train_path,
            batch_size=flags.batch_size,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=num_epochs)
        siamese = Siamese(sequence_length=seq_len,
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=flags.embedding_dim,
                          filter_sizes=list(
                              map(int, flags.filter_sizes.split(","))),
                          num_filters=flags.num_filters,
                          margin=flags.margin)

        global_step = tf.Variable(0, trainable=False)

        # learning_rate = tf.placeholder(tf.float32, shape=[])
        # train_op = tf.train.GradientDescentOptimizer(
        #     learning_rate=learning_rate).minimize(siamese.loss)

        # optimizer = tf.train.AdamOptimizer(0.2)
        # grads_and_vars = optimizer.compute_gradients(siamese.loss)
        # train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        starter_learning_rate = 0.01
        learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                   global_step,
                                                   1000000,
                                                   0.95,
                                                   staircase=False)
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               0.5,
                                               use_nesterov=True)

        # optimizer = tf.train.MomentumOptimizer(0.01, 0.5, use_nesterov=True)
        train_op = optimizer.minimize(siamese.loss, global_step=global_step)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()
        session_conf = tf.ConfigProto(
            allow_soft_placement=flags.allow_soft_placement,
            log_device_placement=flags.log_device_placement)

        sess = tf.Session(config=session_conf)
        with sess.as_default() as sess:
            if verbose:
                tf.summary.histogram('embedding', siamese.W_embedding)
                tf.summary.histogram('tensor_left', siamese.left_siamese)
                # tf.summary.histogram('tensor_left_z', tf.nn.zero_fraction(siamese.left_siamese))
                tf.summary.histogram('tensor_right', siamese.right_siamese)
                # tf.summary.histogram('tensor_right_z', tf.nn.zero_fraction(siamese.right_siamese))
                tf.summary.histogram('distance', siamese.distance)

                tf.summary.scalar('loss', siamese.loss)
                tf.summary.scalar('distance', siamese.distance[0])
                tf.summary.scalar('attraction', siamese.attr[0][0])
                tf.summary.scalar('repulsion', siamese.rep[0][0])

                summary_op = tf.summary.merge_all()
                summary_writer = tf.summary.FileWriter('./train', sess.graph)

            sess.run(init_op)
            sess.run(init_again)

            # Show which variables are going to be trained
            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k, "- Shape: ", v.shape)

            # Load embeddings
            if init_embeddings is not None:
                sess.run(siamese.W_embedding.assign(init_embeddings))

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            try:
                while not coord.should_stop():
                    labels, s1, s2 = sess.run(
                        [label_batch, sentences_1_batch, sentences_2_batch])
                    current_step = tf.train.global_step(sess, global_step)
                    if verbose:
                        train_step_verbose(sess, train_op, summary_op,
                                           summary_writer, siamese, s1, s2,
                                           labels, current_step)

                    else:
                        train_step(sess, train_op, siamese, s1, s2, labels,
                                   out_dir, current_step)

            except tf.errors.OutOfRangeError:
                print("Done training!")
            finally:
                coord.request_stop()

            coord.join(threads)

            # Save the model
            if not out_dir:
                timestamp = str(int(time()))
                out_dir = abspath(join(curdir, "models", timestamp))
                makedirs(out_dir, exist_ok=True)

            with open(join(out_dir, 'parameters.txt'), 'w') as param_file:
                param_file.write("Default parameters: \n")
                for attr, value in sorted(flags.__flags.items()):
                    param_file.write(" - {}={}\n".format(attr.upper(), value))

            save_path = saver.save(sess, join(out_dir, "model.ckpt"))
            print("Model saved in file: {}".format(save_path))
            return out_dir
Example #5
def dev_step(tf_path, model_path, flags_path, current_step):

    # Load the binarized parameters
    test_tfrecords = join(tf_path, 'dev.tfrecords')
    vocab_processor_path = join(tf_path, 'vocab.train')
    vocab_processor = load_binarize_data(vocab_processor_path)
    sequence_length_path = join(tf_path, 'sequence.len')
    seq_len = load_binarize_data(sequence_length_path)
    FLAGS = read_flags(flags_path)
    # TODO Remove this from the siamese class
    fully_layer = bool(FLAGS.hash_size)
    # TODO this is a parameter
    one_hot = False
    n_labels = 2 if one_hot else 1

    distances_filename = join(model_path,
                              'dev_' + str(current_step) + '_distances.log')
    log_filename = join(model_path, 'dev_' + str(current_step) + '.log')

    # TEST THE SYSTEM
    with tf.Graph().as_default():

        label_batch, test_1_batch, test_2_batch = input_pipeline_test(
            filepath=test_tfrecords,
            batch_size=1,
            num_labels=n_labels,
            sequence_len=seq_len,
            num_epochs=1)

        print(type(label_batch), type(test_1_batch), type(test_2_batch))
        siamese = Siamese(sequence_length=seq_len,
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          margin=FLAGS.margin)

        init_op = tf.global_variables_initializer()
        init_again = tf.local_variables_initializer()

        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Initialize variables
            sess.run(init_op)
            sess.run(init_again)

            # Restore the model
            saver.restore(sess, join(model_path, "model.ckpt"))

            # Create the coordinators to read the test data
            coord = tf.train.Coordinator()

            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            test_sample, hits = 0, 0

            try:
                while not coord.should_stop():
                    test_1, test_2, test_label = sess.run(
                        [test_1_batch, test_2_batch, label_batch])
                    loss, attraction, repulsion, dis = \
                        sess.run([siamese.loss, siamese.attr,
                                  siamese.rep, siamese.distance],
                                 feed_dict={
                                     siamese.left_input: test_1,
                                     siamese.right_input: test_2,
                                     siamese.labels: test_label,
                                     siamese.is_training: False
                                 })

                    with open(log_filename, 'a') as log_file:
                        log_str = "(#{0: <5} - {5}) - Loss: {1:.4f} - " \
                                  "(a: {2:.3f} - r: {3:.3f} - " \
                                  "d: {4:.4f})\n"
                        log_file.write(
                            log_str.format(
                                test_sample,
                                loss,
                                attraction[0][0],
                                repulsion[0][0],
                                dis[0],
                                test_label[0][0],
                            ))

                    with open(distances_filename, 'a') as dist_file:
                        log_str = "{}\t{}\n"
                        dist_file.write(
                            log_str.format(dis[0], test_label[0][0]))
                    test_sample += 1

            except tf.errors.OutOfRangeError:
                print("Done evaluating!")
            finally:
                coord.request_stop()

            coord.join(threads)
            sess.close()

            # NOTE: hits is never incremented in this dev loop, so the
            # accuracy written here is always 0; thresholds are instead
            # derived from the distances log via find_threshold below.
            with open(join(model_path, 'dev.txt'), 'a') as results_file:
                results_file.write("Accuracy: {} ({}/{})".format(
                    hits / test_sample, hits, test_sample))

            print("Results saved in: {}".format(join(model_path, 'dev.txt')))
            find_threshold(model_path, distances_filename)
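
A sketch of a dev evaluation run, assuming hypothetical paths and an arbitrary step value: dev_step restores model.ckpt from model_path, logs one loss/distance line per dev pair to dev_<step>.log and dev_<step>_distances.log, and then calls find_threshold on the distances file. The current_step argument only tags the output file names.

# Hypothetical paths; the model directory is returned by one of the
# training functions above and contains model.ckpt
tf_path = 'data/tfrecords'
flags_path = 'data/flags.config'
flags = read_flags(flags_path)

model_dir = train_siamese_fromtf(tf_path, flags, num_epochs=10)
dev_step(tf_path, model_dir, flags_path, current_step=0)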