Example #1
def preprocess(file_path_dem, file_path_rep):
    
    # register the positive and negative example files as input flags
    
    del_all_flags(tf.flags.FLAGS)
    tf.flags.DEFINE_string("positive_data_file", file_path_dem, "Data source for the positive data.")
    tf.flags.DEFINE_string("negative_data_file", file_path_rep, "Data source for the negative data.")
    FLAGS = tf.flags.FLAGS

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_train = x[shuffle_indices]
    y_train = y[shuffle_indices]

    del x, y
    
    print('Data Loaded.')
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print('Features Shape:', x_train.shape)
    print('Labels Shape:', y_train.shape, '\n')
    
    return x_train, y_train, vocab_processor
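
None of these snippets include data_helpers itself. For the two-file (positive/negative) variant used above, a minimal sketch of what load_data_and_labels presumably does is shown below; the exact text cleaning and label encoding are assumptions, and the single-argument variants in later examples will differ.

# Assumed sketch of data_helpers.load_data_and_labels for the pos/neg two-file case.
# The real helper in each repository may clean the text differently.
import re
import numpy as np

def load_data_and_labels(positive_data_file, negative_data_file):
    # One example per line; strip whitespace
    positive = [s.strip() for s in open(positive_data_file, "r", encoding="utf-8")]
    negative = [s.strip() for s in open(negative_data_file, "r", encoding="utf-8")]
    x_text = positive + negative
    # Very light cleaning: keep word characters and basic punctuation, lowercase
    x_text = [re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", s).lower() for s in x_text]
    # One-hot labels: [0, 1] for positive, [1, 0] for negative
    positive_labels = [[0, 1] for _ in positive]
    negative_labels = [[1, 0] for _ in negative]
    y = np.concatenate([positive_labels, negative_labels], axis=0)
    return x_text, y
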
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
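
As a quick sanity check of the split arithmetic above (the flag value and dataset size here are assumed, not taken from any of the repositories):

# Worked example of the negative-index train/dev split used above.
dev_sample_percentage = 0.1   # assumed value of FLAGS.dev_sample_percentage
num_examples = 10000          # assumed dataset size
dev_sample_index = -1 * int(dev_sample_percentage * float(num_examples))  # -> -1000
# x_shuffled[:dev_sample_index] keeps the first 9000 shuffled rows for training,
# x_shuffled[dev_sample_index:] keeps the last 1000 rows for the dev set.
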
Example #3
              nb_epoch=num_epochs, validation_split=val_split, verbose=2)

    model_name = 'imdb_' + model_variation + str(num_epochs) + '.h5'
    model.save_weights(model_name)




else:

    print("Loading data test data...")
    neg_test_path = './data/imdb_test.neg'
    pos_test_path = './data/imdb_test.pos'

    # send data through cleaner function
    sentences, labels = data_helpers.load_data_and_labels(pos_test_path, neg_test_path)

    # Model Hyperparameters
    embedding_dim = 20
    filter_sizes = (3, 4)
    num_filters = 3
    dropout_prob = (0.7, 0.8)
    hidden_dims = 100

    # Word2Vec parameters, see train_word2vec
    min_word_count = 1  # Minimum word count
    context = 10  # Context window size


    model_dir = 'word2vec_models'
    model_name = "{:d}features_{:d}minwords_{:d}context_{:s}".format(embedding_dim, min_word_count, context,
Example #4
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
Example #5
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

#load labels
with open('labels.pkl', 'rb') as f:
    all_labels = pickle.load(f)


# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["I love charles dickens book"]
    # y_test = [2, 4]

y_test = None
# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
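
This snippet stops before a session is created. Below is a hedged sketch of how the restored graph and all_labels would typically be used to turn predictions back into label names, following the pattern of the later evaluation examples; the tensor names ("input_x", "dropout_keep_prob", "output/predictions") are assumptions about this particular model.

# Hedged sketch: score x_test with the latest checkpoint and map class indices
# back to the label names loaded from labels.pkl. Tensor names are assumptions.
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    sess = tf.Session()
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess, checkpoint_file)
    input_x = graph.get_operation_by_name("input_x").outputs[0]
    dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
    predictions = graph.get_operation_by_name("output/predictions").outputs[0]
    predicted_indices = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})
    print([all_labels[i] for i in predicted_indices])
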
Example #6
    def __init__(self):

        x_text, y = data_helpers.load_data_and_labels("./train/pos","./train/neg")

        # Build vocabulary
        x_list = [x.split(" ") for x in x_text]
        vocab_processor = data_helpers.n_grams(x_list, max_word_cnt, n_gram)
        print('feed finished')
        x = np.array(data_helpers.fit_transform(vocab_processor, x_list, max_document_length, n_gram))
        # print x[0]
        print('fit transform finished')


        # Randomly shuffle data
        np.random.seed(10)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x_shuffled = x[shuffle_indices]
        y_shuffled = y[shuffle_indices]

        # Split train/test set
        # TODO: This is very crude, should use cross-validation
        x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
        y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
        print("Vocabulary Size: {:d}".format(len(vocab_processor)))
        print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


        # Training
        # ==================================================

        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
              allow_soft_placement=FLAGS.allow_soft_placement,
              log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                cnn = MLP(
                    sequence_length=x_train.shape[1],
                    num_classes=2,
                    vocab_size=len(vocab_processor),
                    embedding_size=FLAGS.embedding_dim,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters = FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda)

                # Define Training procedure
                global_step = tf.Variable(0, name="global_step", trainable=False)
                starter_learning_rate = 1e-3
                learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                           3000, 0.96, staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)  # use the decayed schedule defined above
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

                # Keep track of gradient values and sparsity (optional)
                grad_summaries = []
                for g, v in grads_and_vars:
                    if g is not None:
                        grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                        sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                        grad_summaries.append(grad_hist_summary)
                        grad_summaries.append(sparsity_summary)
                grad_summaries_merged = tf.merge_summary(grad_summaries)

                # Output directory for models and summaries
                timestamp = str(int(time.time()))
                self.out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
                print("Writing to {}\n".format(self.out_dir))

                # Summaries for loss and accuracy
                loss_summary = tf.scalar_summary("loss", cnn.loss)
                acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

                # Train Summaries
                train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
                train_summary_dir = os.path.join(self.out_dir, "summaries", "train")
                train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

                # Dev summaries
                dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
                dev_summary_dir = os.path.join(self.out_dir, "summaries", "dev")
                dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

                # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
                self.checkpoint_dir = os.path.abspath(os.path.join(self.out_dir, "checkpoints"))
                checkpoint_prefix = os.path.join(self.checkpoint_dir, "model")
                if not os.path.exists(self.checkpoint_dir):
                    os.makedirs(self.checkpoint_dir)
                saver = tf.train.Saver(tf.all_variables())
                # Write vocabulary
                pickle.dump(vocab_processor, open(os.path.join(self.out_dir,"vocab"), "wb" ) )
                # Initialize all variables
                sess.run(tf.initialize_all_variables())

                def train_step(x_batch, y_batch):
                    """
                    A single training step
                    """
                    feed_dict = {
                      cnn.input_x: x_batch,
                      cnn.input_y: y_batch,
                      cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, summaries, loss, accuracy = sess.run(
                        [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    train_summary_writer.add_summary(summaries, step)

                def dev_step(x_batch, y_batch, writer=None):
                    """
                    Evaluates model on a dev set
                    """
                    feed_dict = {
                      cnn.input_x: x_batch,
                      cnn.input_y: y_batch,
                      cnn.dropout_keep_prob:1
                    }
                    step, summaries, loss, accuracy = sess.run(
                        [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    if writer:
                        writer.add_summary(summaries, step)

                # Generate batches
                t = list(zip(x_train, y_train))
                batches = data_helpers.batch_iter(
                    t, FLAGS.batch_size, FLAGS.num_epochs)
                # Training loop. For each batch...
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)
                        print("")
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
Example #7
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS(sys.argv)
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data
print("Loading data...")
x_text, y = dh.load_data_and_labels(train_test_text, Y_train_test)
x_1, y_train = dh.load_data_and_labels(train_text, Y_train)
x_2, y_test = dh.load_data_and_labels(test_text, Y_test)

# Build vocabulary
print("building vocab...")
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Fit the vocabulary once on the combined text, then only transform the train/test splits
x_train = np.array(list(vocab_processor.transform(x_1)))
x_test = np.array(list(vocab_processor.transform(x_2)))

g = tf.Graph()
with g.as_default():
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
Example #8
import tensorflow as tf
import data_helpers

# Define Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
tf.flags.DEFINE_string("sentence", "the movie was bad", "sentence to classify")

FLAGS = tf.flags.FLAGS

#######################################################################################################################
# process the raw sentence
new_review = data_helpers.clean_senetnce(FLAGS.sentence)

# load vocabulary
sentences, _ = data_helpers.load_data_and_labels()
sequence_length = max(len(x) for x in sentences)
sentences_padded = data_helpers.pad_sentences(sentences)
vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded)

num_padding = sequence_length - len(new_review)
new_sentence = new_review + ["<PAD/>"] * num_padding

# convert sentence to input matrix
array = []
for word in new_sentence:
    try:
        word_vector=vocabulary[word]
    except KeyError:
        word_vector=vocabulary["<PAD/>"]
    array.append(word_vector)
Example #9
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, word_count, letter_count = data_helpers.load_data_and_labels(
    FLAGS.data_path, FLAGS.vocab_path, FLAGS.max_length)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print(x_train.shape)
print(y_train.shape)
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
Example #10
def main(_):
    # FLAGS._parse_flags()
    # print("\nParameters:")
    # for attr, value in sorted(FLAGS.items()):
    #     print("{}={}".format(attr.upper(), value))
    # print("")

    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.train_file,
                                                  FLAGS.num_class)

    # Build vocabulary
    if FLAGS.embedding_type == "random":
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            FLAGS.max_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    elif FLAGS.embedding_type == "none-static":
        x, w2v = [], KeyedVectors.load_word2vec_format(FLAGS.word2vec_model,
                                                       binary=False)
        vocab, embeddings = w2v.vocab, np.zeros(
            (len(w2v.index2word), w2v.vector_size), dtype=np.float32)

        for k, v in vocab.items():
            embeddings[v.index] = w2v[k]
        for item in x_text:
            x.append([
                w2v.vocab[word].index
                if word in w2v.vocab else w2v.vocab["__UNK__"].index
                for word in item.split(" ")
            ])
        x = np.array(x, dtype=np.int32)
        print("Vocabulary Size: {:d}".format(len(w2v.vocab)))
    else:
        raise RuntimeError("embedding_type is random or none-static")

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            if FLAGS.embedding_type == "random":
                cnn = TextCNN(sequence_length=FLAGS.max_length,
                              num_classes=FLAGS.num_class,
                              vocab_size=len(vocab_processor.vocabulary_),
                              embedding_size=FLAGS.embedding_dim,
                              filter_sizes=list(
                                  map(int, FLAGS.filter_sizes.split(","))),
                              num_filters=FLAGS.num_filters,
                              l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.embedding_type == "none-static":
                cnn = TextCNN(sequence_length=FLAGS.max_length,
                              num_classes=FLAGS.num_class,
                              embedding=embeddings,
                              embedding_size=embeddings.shape[1],
                              filter_sizes=list(
                                  map(int, FLAGS.filter_sizes.split(","))),
                              num_filters=FLAGS.num_filters,
                              l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            global_step,
                                            2500,
                                            0.8,
                                            staircase=True)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints,
                                   save_relative_paths=True)

            # Write vocabulary
            if FLAGS.embedding_type == "random":
                vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #11
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y, seqlen = data_helpers.load_data_and_labels()

# Build vocabulary
max_document_length = max(seqlen)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
seqlen_shuffled = seqlen[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
Example #12
def eval():
    with tf.device('/gpu:0'):
        x_text, y, desc1, desc2, wType, type_index = data_helpers.load_data_and_labels(
            FLAGS.test_path)

    text_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
        text_path)
    x = np.array(list(text_vocab_processor.transform(x_text)))

    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_text = graph.get_operation_by_name("input_text").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            emb_dropout_keep_prob = graph.get_operation_by_name(
                "emb_dropout_keep_prob").outputs[0]
            rnn_dropout_keep_prob = graph.get_operation_by_name(
                "rnn_dropout_keep_prob").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x),
                                              FLAGS.batch_size,
                                              1,
                                              shuffle=False)

            # Collect the predictions here
            preds = []
            for x_batch in batches:
                pred = sess.run(
                    predictions, {
                        input_text: x_batch,
                        emb_dropout_keep_prob: 1.0,
                        rnn_dropout_keep_prob: 1.0,
                        dropout_keep_prob: 1.0
                    })
                preds.append(pred)
            preds = np.concatenate(preds)
            truths = np.argmax(y, axis=1)

            print(truths)
            # 6x6 confusion-matrix accumulator (rows: truth, cols: prediction)
            result = [[0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0]]
            # result = [[0,0,0,0],[0,0,0,0],[0,0,0,0],[0,0,0,0]]
            for i in range(len(preds)):
                result[truths[i]][preds[i]] += 1
            print("===the prediction result===")
            print("\t0\t1\t2\t3\t4\t5")
            count = 0
            for i in range(len(result)):
                print(
                    str(count) + "\t" + str(result[i][0]) + "\t" +
                    str(result[i][1]) + "\t" + str(result[i][2]) + "\t" +
                    str(result[i][3]) + "\t" + str(result[i][4]) + "\t" +
                    str(result[i][5]))
                count += 1
            precision = []
            recall = []
            for j in range(len(result)):
                p = round(result[j][j] / sum(result[j]), 3) * 100
                col = [x[j] for x in result]
                r = round(result[j][j] / sum(col), 3) * 100
                precision.append(p)
                recall.append(r)
            f1_scores = []
            for k in range(len(precision)):
                if (precision[k] + recall[k]) == 0:
                    f1_scores.append(0)
                else:
                    f1 = round((2 * precision[k] * recall[k]) /
                               (precision[k] + recall[k]), 1)
                    f1_scores.append(f1)
            print(precision, recall, f1_scores)
            relationName = [
                "before", "after", "simultaneous", "include", "be_included",
                "vague"
            ]
            for l in range(6):
                print(relationName[l] + " acc:" + str(precision[l]) +
                      "%, recall:" + str(recall[l]) + "%, f1:" +
                      str(f1_scores[l]) + "%")
            precision_ave = round(sum(precision) / 6, 1)
            recall_ave = round(sum(recall) / 6, 1)
            # f1_score_ave = round(sum(f1_scores)/6,1)
            f1_score_ave = f1_score(truths,
                                    preds,
                                    labels=np.array(range(6)),
                                    average="micro")
            print("acc_avg:" + str(precision_ave) + "%,recall_avg:" +
                  str(recall_ave) + "%,f1:" + str(f1_score_ave) + "%")
            print("modelFile:" + str(FLAGS.checkpoint_dir))
def test_cnn():
    """Test CNN model."""

    # Load data
    logger.info("✔ Loading data...")
    logger.info('Recommended padding sequence length is: {}'.format(
        FLAGS.pad_seq_len))

    logger.info('✔︎ Test data processing...')
    test_data = data_helpers.load_data_and_labels(FLAGS.test_data_file,
                                                  FLAGS.num_classes,
                                                  FLAGS.embedding_dim)

    logger.info('✔︎ Test data padding...')
    x_test, y_test = data_helpers.pad_data(test_data, FLAGS.pad_seq_len)
    y_test_bind = test_data.labels_bind

    # Build vocabulary
    VOCAB_SIZE = data_helpers.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = data_helpers.load_word2vec_matrix(
        VOCAB_SIZE, FLAGS.embedding_dim)

    # Load cnn model
    logger.info("✔ Loading model...")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]

            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # pre-trained_word2vec
            pretrained_embedding = graph.get_operation_by_name(
                "embedding/W").outputs[0]

            # Tensors we want to evaluate
            logits = graph.get_operation_by_name("output/logits").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(
                zip(x_test, y_test, y_test_bind)),
                                              FLAGS.batch_size,
                                              1,
                                              shuffle=False)

            # Collect the predictions here
            all_predicitons = []
            eval_loss, eval_rec, eval_acc, eval_counter = 0.0, 0.0, 0.0, 0
            for batch_test in batches:
                x_batch_test, y_batch_test, y_batch_test_bind = zip(
                    *batch_test)
                feed_dict = {input_x: x_batch_test, dropout_keep_prob: 1.0}
                batch_logits = sess.run(logits, feed_dict)

                if FLAGS.use_classbind_or_not == 'Y':
                    predicted_labels = data_helpers.get_label_using_logits_and_classbind(
                        batch_logits,
                        y_batch_test_bind,
                        top_number=FLAGS.top_num)
                if FLAGS.use_classbind_or_not == 'N':
                    predicted_labels = data_helpers.get_label_using_logits(
                        batch_logits, top_number=FLAGS.top_num)

                all_predicitons = np.append(all_predicitons, predicted_labels)
                cur_rec, cur_acc = 0.0, 0.0
                for index, predicted_label in enumerate(predicted_labels):
                    rec_inc, acc_inc = data_helpers.cal_rec_and_acc(
                        predicted_label, y_batch_test[index])
                    cur_rec, cur_acc = cur_rec + rec_inc, cur_acc + acc_inc

                cur_rec = cur_rec / len(y_batch_test)
                cur_acc = cur_acc / len(y_batch_test)

                eval_rec, eval_acc, eval_counter = eval_rec + cur_rec, eval_acc + cur_acc, eval_counter + 1
                logger.info(
                    "✔︎ validation batch {} finished.".format(eval_counter))

            eval_rec = float(eval_rec / eval_counter)
            eval_acc = float(eval_acc / eval_counter)
            logger.info("☛ Recall {:g}, Accuracy {:g}".format(
                eval_rec, eval_acc))
            np.savetxt(SAVE_FILE, list(zip(all_predicitons)), fmt='%s')

    logger.info("✔ Done.")
Example #14
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.test_data_folder)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
Example #15
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
Example #16
def train():
    # parse arguments
    FLAGS(sys.argv)
    print(FLAGS.batch_size)

    # Note: this parameter dump no longer works on newer TF releases because the flags API changed (see the sketch after this example)
    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    # Data Preparation
    # ==================================================
    
    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    
    del x, y, x_shuffled, y_shuffled
    
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # Training
    # ==================================================
    with tf.Graph().as_default():
        sequence_length = x_train.shape[1]
        num_classes = y_train.shape[1]
        input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        session_conf = tf.ConfigProto(
                        allow_soft_placement=FLAGS.allow_soft_placement,
                        log_device_placement=FLAGS.log_device_placement)
        with tf.Session(config=session_conf) as sess:
            cnn_text = TextModel(
                            input_x, input_y,
                            max_sequence_len=sequence_length,
                            num_classes=num_classes,
                            vocab_size=len(vocab_processor.vocabulary_),
                            embedding_size=FLAGS.embedding_dim,
                            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                            num_filters=FLAGS.num_filters,
                            l2_reg_lambda=FLAGS.l2_reg_lambda)

            prediction, loss, optimize, accuracy = cnn_text.get_model_variables()

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", loss)
            acc_summary = tf.summary.scalar("accuracy", accuracy)

            # train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # eval summaries
            eval_summary_op = tf.summary.merge([loss_summary, acc_summary])
            eval_summary_dir = os.path.join(out_dir, "summaries", "eval")
            eval_summary_writer = tf.summary.FileWriter(eval_summary_dir, sess.graph)

            # checkpoint directory
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            # tensorflow assumes this directory already exists, so we need to create it if it does not exist
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # initialize all variables
            init_g = tf.global_variables_initializer()
            init_l = tf.local_variables_initializer()
            sess.run(init_l)
            sess.run(init_g)

            def train_step(x_batch, y_batch):
                """
                train_step
                """
                feed_dict = {
                    cnn_text.data: x_batch,
                    cnn_text.target: y_batch,
                    cnn_text.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, summaries, train_loss, train_accuracy = sess.run(
                            [optimize, train_summary_op, loss, accuracy],
                            feed_dict)
                time_str = datetime.datetime.now().isoformat()
                current_step = tf.train.global_step(sess, cnn_text.global_step)
                print("{0}: step {1},  loss {2:g},  acc {3:g}".format(time_str, current_step, train_loss, train_accuracy))
                train_summary_writer.add_summary(summaries, current_step)

            def eval_step(x_batch, y_batch):
                """
                eval_step
                """
                feed_dict = {
                    cnn_text.data: x_batch,
                    cnn_text.target: y_batch,
                    cnn_text.dropout_keep_prob: 1.0
                }
                step, summaries, eval_loss, eval_accuracy = sess.run(
                            [cnn_text.global_step, eval_summary_op, loss, accuracy],
                            feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("evaluation {0}: step {1},  loss {2:g},  acc {3:g}".format(time_str, step, eval_loss, eval_accuracy))
                eval_summary_writer.add_summary(summaries, step)
                
            
            # generate batches
            batches = data_helpers.batch_iter(
                        list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

            # training loop, for each batch ...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, cnn_text.global_step)
                if 0 == current_step % FLAGS.evaluate_every:
                    eval_step(x_dev, y_dev)
                if 0 == current_step % FLAGS.checkpoint_every:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("saved model checkpoint to {0}".format(path))
Example #17
def train():
    with tf.device('/cpu:0'):
        x_text, y = data_helpers.load_data_and_labels(FLAGS.pos_dir,
                                                      FLAGS.neg_dir)

    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length, min_frequency=FLAGS.min_frequency)
    x = np.array(list(text_vocab_processor.fit_transform(x_text)))
    print("Text Vocabulary Size: {:d}".format(
        len(text_vocab_processor.vocabulary_)))

    print("x = {0}".format(x.shape))
    print("y = {0}".format(y.shape))
    print("")

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            rcnn = HAN(sequence_length=x_train.shape[1],
                       num_classes=y_train.shape[1],
                       vocab_size=len(text_vocab_processor.vocabulary_),
                       word_embedding_size=FLAGS.word_embedding_dim,
                       context_embedding_size=FLAGS.context_embedding_dim,
                       attention_size=FLAGS.attention_size,
                       hidden_size=FLAGS.hidden_size,
                       l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                rcnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rcnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rcnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            text_vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Pre-trained word2vec
            if FLAGS.word2vec:
                # initial matrix with random uniform
                initW = np.random.uniform(
                    -0.25, 0.25, (len(text_vocab_processor.vocabulary_),
                                  FLAGS.word_embedding_dim))
                # load any vectors from the word2vec
                print("Load word2vec file {0}".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in range(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1).decode('latin-1')
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = text_vocab_processor.vocabulary_.get(word)
                        if idx != 0:
                            initW[idx] = np.fromstring(f.read(binary_len),
                                                       dtype='float32')
                        else:
                            f.read(binary_len)
                sess.run(rcnn.W_text.assign(initW))
                print("Success to load pre-trained word2vec model!\n")

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                # Train
                feed_dict = {
                    rcnn.input_text: x_batch,
                    rcnn.input_y: y_batch,
                    rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rcnn.loss,
                    rcnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    feed_dict_dev = {
                        rcnn.input_text: x_dev,
                        rcnn.input_y: y_dev,
                        rcnn.dropout_keep_prob: 1.0
                    }
                    summaries_dev, loss, accuracy = sess.run(
                        [dev_summary_op, rcnn.loss, rcnn.accuracy],
                        feed_dict_dev)
                    dev_summary_writer.add_summary(summaries_dev, step)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}\n".format(
                        time_str, step, loss, accuracy))

                # Model checkpoint
                if step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #18
import sys
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.contrib import learn

from data_helpers import load_data_and_labels, batch_iter
from text_cnn import TextCNN


# Load original data
path = sys.path[0]
pos_filename = path + "/data/rt-polarity.pos"
neg_filename = path + "/data/rt-polarity.neg"

X_data, y_data = load_data_and_labels(pos_filename, neg_filename)
max_document_length = max([len(sen.split(" ")) for sen in X_data])
print("Max_document_length:,", max_document_length)
# Create the vacabulary
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# The idx data
x = np.array(list(vocab_processor.fit_transform(X_data)), dtype=np.int64)  # token ids must be integers for the embedding lookup
y = np.array(y_data, dtype=np.int32)
vocabulary_size = len(vocab_processor.vocabulary_)
print("The size of vocabulary:", vocabulary_size)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1111)
print("X_train shape {0}, y_train shape {1}".format(X_train.shape, y_train.shape))
print("X_test shape {0}, y_test shape {1}".format(X_test.shape, y_test.shape))

# The parameters of RNN
Example #19
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

tasks = ['anger', 'fear', 'joy', 'sadness']

for task in tasks:
    print("running for task", task)

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.data_dir, task,
                                                  'train')

    # Build vocabulary
    #max_document_length = max([len(x.split(" ")) for x in x_text])
    #vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    #x = np.array(list(vocab_processor.fit_transform(x_text)))

    x, vocab_vector = data_helpers.build_vocabulary(x_text)
    #np.save('tmp/x.data', x)
    #np.save('tmp/vocab_vector.data', vocab_vector)
    #x = np.load('tmp/x.data.npy')
    #vocab_vector = np.load('tmp/vocab_vector.data.npy')

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
Example #20
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels()

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
Example #21
    macro_precison = total_precision / len(emotion)
    macro_recall = total_recall / len(emotion)
    if (macro_precison + macro_recall) != 0:
        macro_f1 = 2.0 * macro_precison * macro_recall / (macro_precison +
                                                          macro_recall)

    for e in range(len(emotion)):
        print(emotion[e], '	Precision:', performance[e][0], '	Recall:',
              performance[e][1], '	F1:', performance[e][2])
    print("Micro F1 Score:", micro_f1)
    print("Macro F1 Score:", macro_f1)


# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.test_file,
                                                      FLAGS.gold_labels)
    #y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
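
The macro averaging above is computed by hand from per-emotion precision and recall. An equivalent check with scikit-learn (the label arrays are toy values; the emotion names are carried over for illustration):

import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support

emotion = ['anger', 'fear', 'joy', 'sadness']
y_true = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1])   # toy gold labels
y_pred = np.array([0, 1, 2, 0, 0, 3, 2, 3, 1, 1])   # toy predictions

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=list(range(len(emotion))))
for e in range(len(emotion)):
    print(emotion[e], '\tPrecision:', precision[e], '\tRecall:', recall[e], '\tF1:', f1[e])
print("Micro F1 Score:", f1_score(y_true, y_pred, average='micro'))
print("Macro F1 Score:", f1_score(y_true, y_pred, average='macro'))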
Ejemplo n.º 22
0
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

print("preprocess data...")
# Before loading, the data should already be word-segmented: a .txt file with one "sample \t label" pair per line

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.label_data_file)
# Build vocabulary

max_document_length = max([len(x.split(" ")) for x in x_text])
# print(x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)

x = np.array(list(vocab_processor.fit_transform(x_text)))
y = np.array(y)
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
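
A toy, standalone illustration of what VocabularyProcessor does with the text above (assumes TensorFlow 1.x, where tf.contrib.learn is still available):

import numpy as np
from tensorflow.contrib import learn  # TF 1.x only

texts = ["the movie was great", "terrible plot"]
max_document_length = max(len(t.split(" ")) for t in texts)   # 4
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(texts)))
print(x)                                 # each row is a list of word ids; shorter rows are padded with 0
print(len(vocab_processor.vocabulary_))  # vocabulary size, including the reserved <UNK>/padding id 0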
Ejemplo n.º 23
0
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))

# Data Preparation
# ==================================================

# Load data
print("Loading data...")

x_text, y_label = data_helpers.load_data_and_labels(FLAGS.data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
# Word segmentation (tokenization)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
y = np.array(y_label)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
print(type(x), type(y))

x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
Ejemplo n.º 24
0
print("Using word2vec model file : {}".format(trained_word2vec_model_file))

# validate training params file
training_params_file = os.path.join(FLAGS.checkpoint_dir, "..", "training_params.pickle")
if not os.path.exists(training_params_file):
    print("Training params file \'{}\' is missing!".format(training_params_file))
print("Using training params file : {}".format(training_params_file))

# Load params
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get Embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>', padding_sentence_length = max_document_length)
x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load = trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))


# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
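
Neither data_helpers.padding_sentences nor word2vec_helpers.embedding_sentences is shown in this excerpt. A hypothetical sketch of the padding step only, matching how it is called above (the function name, behaviour, and return values are assumptions):

def padding_sentences(input_sentences, padding_token, padding_sentence_length=None):
    """Pad (or truncate) each whitespace-tokenized sentence to a fixed length."""
    sentences = [s.split(" ") for s in input_sentences]
    max_len = (padding_sentence_length if padding_sentence_length is not None
               else max(len(s) for s in sentences))
    padded = []
    for tokens in sentences:
        if len(tokens) > max_len:
            tokens = tokens[:max_len]
        else:
            tokens = tokens + [padding_token] * (max_len - len(tokens))
        padded.append(tokens)
    return padded, max_len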
Ejemplo n.º 25
0
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:-------------------------------------------------------")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data for CNN classification model--------------------------")
x_text, y = data_helpers.load_data_and_labels(FLAGS.labeled_data_dir)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
#max_document_length = max([len(x) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
Ejemplo n.º 26
0
Archivo: train.py Proyecto: AntNLP/opie
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
domain_dir = os.path.join(os.getenv("OPIE_DIR"), "data", "domains", FLAGS.domain)
multi_opin_expr_dir = os.path.join(domain_dir, "multiopinexpr")


# Data Preparatopn
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y, vocab = data_helpers.load_data_and_labels(
    domain_dir,
    max_document_length=FLAGS.max_document_length,
    shuffle_data=FLAGS.shuffle_data)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
min_document_length = min([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(
    max_document_length,
    vocabulary=vocab,
    tokenizer_fn=data_helpers.tokenizer)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Split train/test set
# TODO: This is very crude, should use cross-validation
test_num = 30000
x_train, x_dev = x[:-test_num], x[-test_num:]
Ejemplo n.º 27
0
def main(unused_argv):

    x_data, y = data_helpers.load_data_and_labels(conf.pos, conf.neg)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_data])
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_data)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    test_sample_index = -1 * int(conf.test_sam * float(len(y)))
    x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[
        test_sample_index:]
    y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[
        test_sample_index:]

    x_train1 = np.asarray(x_train)
    y_train1 = np.asarray(y_train, dtype=np.int32)

    x_test1 = np.asarray(x_test)
    y_test1 = np.asarray(y_test, dtype=np.int32)

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_test)))

    # Create the Estimator
    text_classifier = tf.estimator.Estimator(model_fn=model_fn_1,
                                             model_dir="./models")

    # Set up logging for predictions
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=100)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": x_train1},
        y=y_train1,
        batch_size=conf.batch_size,
        num_epochs=None,
        shuffle=True)
    text_classifier.train(input_fn=train_input_fn,
                          steps=30000,
                          hooks=[logging_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": x_test1},
        y=y_test1,
        batch_size=conf.test_num,
        num_epochs=1,
        shuffle=False)
    eval_results = text_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
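
model_fn_1 is not defined in this excerpt. A minimal sketch of the kind of model_fn a tf.estimator.Estimator expects (TF 1.x APIs; the embedding-average classifier, vocabulary size, and learning rate are assumptions, and integer class labels are assumed rather than one-hot arrays):

import tensorflow as tf  # TF 1.x

def model_fn_sketch(features, labels, mode):
    # Average word embeddings into one sentence vector, then a linear classifier.
    embedded = tf.contrib.layers.embed_sequence(features["x"], vocab_size=20000, embed_dim=128)
    sentence = tf.reduce_mean(embedded, axis=1)
    logits = tf.layers.dense(sentence, units=2)
    predictions = {
        "classes": tf.argmax(logits, axis=1),
        # Named so a LoggingTensorHook({"probabilities": "softmax_tensor"}) can find it.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer(1e-3).minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)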
Ejemplo n.º 28
0
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels()
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off.", 'it is very nice.', 'f**k off.', 'its not bad.']
    y_test = [1, 0, 1, 0, 1]
    yy_test = [[1,0], [0,1], [1,0], [0,1], [1,0]]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
Ejemplo n.º 29
0
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_pos1, train_pos2, train_x_text_clean, train_sentence_len = data_helpers.load_data_and_labels(FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_pos1, test_pos2, test_x_text_clean, test_sentence_len = data_helpers.load_data_and_labels(FLAGS.test_path)
    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = FLAGS.max_sentence_length
    # print("text:",x_text)
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    train_text = np.array(train_text)
    print("train_text",train_text[0:2])
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))  # (8000,90)
    print("train_y = {0}".format(train_y.shape))  # (8000,19)
    print("test_x = {0}".format(test_x.shape))  # (2717, 90)
    print("test_y = {0}".format(test_y.shape))  # (2717,19)

    # Example: pos1[3] = [-2 -1  0  1  2   3   4 999 999 999 ... 999]
    # [95 96 97 98 99 100 101 999 999 999 ... 999]
    # =>
    # [11 12 13 14 15  16  21  17  17  17 ...  17]
    # dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))  # (8000, 90)
    print("test_p1 = {0}".format(test_p1.shape))  # (2717, 90)
    print("")

    # Randomly shuffle data to split into train and test(dev)
    # np.random.seed(10)
    #
    # shuffle_indices = np.random.permutation(np.arange(len(y))) #len(y)=8000
    # x_shuffled = x[shuffle_indices]
    # p1_shuffled = p1[shuffle_indices]
    # p2_shuffled = p2[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # print(x_shuffled, p1_shuffled,p2_shuffled,y_shuffled)

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) #x_train=7200, x_dev =800
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[dev_sample_index:]
    # p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    # print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))
    # print(x_train)
    # print(np.array(x_train))
    # print(x_dev)
    # print(np.array(x_dev))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=FLAGS.max_sentence_length, #90
                num_classes=train_y.shape[1],#19
                text_vocab_size=len(vocab_processor.vocabulary_), #19151
                text_embedding_size=FLAGS.text_embedding_size,#300
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),#162
                pos_embedding_size=FLAGS.pos_embedding_dim,#50
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), #2,3,4,5
                num_filters=FLAGS.num_filters, #128
                l2_reg_lambda=FLAGS.l2_reg_lambda, #1e-5
                use_elmo = (FLAGS.embeddings == 'elmo'))

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            if FLAGS.embeddings == "word2vec":
                pretrain_W = utils.load_word2vec('resource/GoogleNews-vectors-negative300.bin', FLAGS.embedding_size,vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained word2vec model!\n")
            elif FLAGS.embeddings == "glove100":
                pretrain_W = utils.load_glove('resource/glove.6B.100d.txt', FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained glove100 model!\n")
            elif FLAGS.embeddings == "glove300":
                pretrain_W = utils.load_glove('resource/glove.840B.300d.txt', FLAGS.embedding_size, vocab_processor)
                sess.run(cnn.W_text.assign(pretrain_W))
                print("Success to load pre-trained glove300 model!\n")

            # Generate batches
            train_batches = data_helpers.batch_iter(list(zip(train_x, train_y, train_text,
                                                             train_p1, train_p2)),
                                                    FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            best_f1 = 0.0  # For save checkpoint(model)
            for train_batch in train_batches:
                train_bx, train_by, train_btxt,train_bp1, train_bp2 = zip(*train_batch)
                # print("train_bxt",list(train_btxt)[:2])
                # print(np.array(train_be1).shape) #(20, )
                # print(train_be1)
                feed_dict = {
                    cnn.input_text: train_bx,
                    cnn.input_y: train_by,
                    cnn.input_x_text: list(train_btxt),
                    cnn.input_p1: train_bp1,
                    cnn.input_p2: train_bp2,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(list(zip(test_x, test_y, test_text,
                                                                     test_p1, test_p2)),
                                                           FLAGS.batch_size, 1, shuffle=False)
                    # Training loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_btxt, test_bp1, test_bp2 = zip(*test_batch)
                        feed_dict = {
                            cnn.input_text: test_bx,
                            cnn.input_y: test_by,
                            cnn.input_x_text: list(test_btxt),
                            cnn.input_p1: test_bp1,
                            cnn.input_p2: test_bp2,
                            cnn.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels()

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
Ejemplo n.º 31
0
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_files,
                                              FLAGS.negative_data_files)

# Build vocabulary
max_document_length = max([len(nltk.word_tokenize(x)) for x in x_text])
print("Processing vocab...")


def tokenizer(iterator):
    for value in iterator:
        yield nltk.word_tokenize(value)


vocab_processor = learn.preprocessing.VocabularyProcessor(
    max_document_length, tokenizer_fn=tokenizer)
x = np.array(list(vocab_processor.fit_transform(x_text)))
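
One practical note on the NLTK-based tokenizer above: nltk.word_tokenize needs the Punkt models, which are downloaded once (assuming an environment with network access and a standard NLTK install):

import nltk
nltk.download('punkt')   # one-time download of the models behind nltk.word_tokenize

print(nltk.word_tokenize("everything is off."))   # ['everything', 'is', 'off', '.']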
Ejemplo n.º 32
0
def evaluate():
    # parse arguments
    FLAGS(sys.argv)
    print(FLAGS.batch_size)
    
    # map data into vocabulary
    vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    print(vocab_path)
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

    # CHANGE THIS: Load data. Load your own data here
    if FLAGS.eval_train:
        x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
        y_test = np.argmax(y_test, axis=1)
    else:
        x_raw = ["a masterpiece four years in the making", "everything is off."]
        y_test = [1, 0] 

    x_test = np.array(list(vocab_processor.transform(x_raw)))
    print("\nEvaluating...\n")
    
    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
                            allow_soft_placement=FLAGS.allow_soft_placement,
                            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("cnn_output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))

    # Save the evaluation to a csv
    predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    import csv
    with open(out_path, 'w') as f:
        csv.writer(f).writerows(predictions_human_readable)
Ejemplo n.º 33
0
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
# FLAGS._parse_flags()
# print("\nParameters:")
# for attr, value in sorted(FLAGS.__flags.items()):
#     print("{}={}".format(attr.upper(), value))
# print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_dir)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
Ejemplo n.º 34
0
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
Ejemplo n.º 35
0
def test_cnn():
    """Test CNN model."""

    # Load data
    logger.info("✔ Loading data...")
    logger.info('Recommended padding sequence length is: {0}'.format(
        FLAGS.pad_seq_len))

    logger.info('✔︎ Test data processing...')
    test_data = data_helpers.load_data_and_labels(FLAGS.test_data_file,
                                                  FLAGS.embedding_dim)

    logger.info('✔︎ Test data padding...')
    x_test_front, x_test_behind, y_test = data_helpers.pad_data(
        test_data, FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE = data_helpers.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = data_helpers.load_word2vec_matrix(
        VOCAB_SIZE, FLAGS.embedding_dim)

    # Load cnn model
    logger.info("✔ Loading model...")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x_front = graph.get_operation_by_name(
                "input_x_front").outputs[0]
            input_x_behind = graph.get_operation_by_name(
                "input_x_behind").outputs[0]

            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # pre-trained_word2vec
            pretrained_embedding = graph.get_operation_by_name(
                "embedding/embedding").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]
            softmax_scores = graph.get_operation_by_name(
                "output/SoftMax_scores").outputs[0]
            topKPreds = graph.get_operation_by_name(
                "output/topKPreds").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(
                zip(x_test_front, x_test_behind)),
                                              FLAGS.batch_size,
                                              1,
                                              shuffle=False)

            # Collect the predictions here
            all_scores = []
            all_softmax_scores = []
            all_predictions = []
            all_topKPreds = []

            for x_test_batch in batches:
                x_batch_front, x_batch_behind = zip(*x_test_batch)
                feed_dict = {
                    input_x_front: x_batch_front,
                    input_x_behind: x_batch_behind,
                    dropout_keep_prob: 1.0
                }
                batch_scores = sess.run(scores, feed_dict)
                all_scores = np.append(all_scores, batch_scores)

                batch_softmax_scores = sess.run(softmax_scores, feed_dict)
                all_softmax_scores = np.append(all_softmax_scores,
                                               batch_softmax_scores)

                batch_predictions = sess.run(predictions, feed_dict)
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

                batch_topKPreds = sess.run(topKPreds, feed_dict)
                all_topKPreds = np.append(all_topKPreds, batch_topKPreds)

            np.savetxt(SAVE_FILE,
                       list(zip(all_predictions, all_topKPreds)),
                       fmt='%s')

    logger.info("✔ Done.")
Ejemplo n.º 36
0
# nlp1.py
import tensorflow as tf
import numpy as np
import data_helpers
from tensorflow.contrib import learn

dev_sample_percentage = .1
positive_data_file = "./data/rt-polarity.pos"
negative_data_file = "./data/rt-polarity.neg"
embedding_dim = 120
batch_size = 40
num_epochs = 200

x_text, y = data_helpers.load_data_and_labels(positive_data_file, negative_data_file)

max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

tf.reset_default_graph()
Ejemplo n.º 37
0
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    #x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    #y_test = np.argmax(y_test, axis=1)
    x_test, y_test = data_helpers.load_data_and_labels(
        FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
'''vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))'''

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
Ejemplo n.º 38
0
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.datasample_file_1, FLAGS.datasample_file_2, FLAGS.datasample_file_3, FLAGS.datasample_file_4, FLAGS.datasample_file_5)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
Ejemplo n.º 39
0
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.Artificial_Inteligence,
                                              FLAGS.Machine_Learning)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))