print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")

x_text, y, category = data_helpers.load_data_and_labels(FLAGS.data_file)
print(category)
y = np.array(data_helpers.transform_labels(y, category))

# Build vocabulary
max_document_length = max([len(x) for x in x_text])  # computed but unused below; FLAGS.max_data_length is passed to fit_transform instead
# vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(data_helpers.fit_transform(x_text, FLAGS.max_data_length))
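
# NOTE: data_helpers.fit_transform is not shown in this listing. As a rough
# illustration only (a hypothetical stand-in, not the project's actual helper),
# such a function might build a token-to-index vocabulary and pad or truncate
# every document to the requested length:

def fit_transform_sketch(texts, max_length, pad_id=0):
    """Map tokenized documents to fixed-length rows of vocabulary ids (sketch)."""
    vocab = {}
    rows = []
    for tokens in texts:
        ids = [vocab.setdefault(tok, len(vocab) + 1) for tok in tokens[:max_length]]
        rows.append(ids + [pad_id] * (max_length - len(ids)))
    return rows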

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices, :]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

del x, y, x_shuffled, y_shuffled
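
# The TODO above notes that the single holdout split is crude. A hypothetical
# k-fold alternative (sketch only; it assumes scikit-learn, which this listing
# does not otherwise use) could be wrapped like this and driven before the
# del statement above:

def kfold_splits(x_all, y_all, n_splits=10, seed=10):
    """Yield (x_train, y_train, x_dev, y_dev) folds for cross-validation (sketch)."""
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, dev_idx in kf.split(x_all):
        yield x_all[train_idx], y_all[train_idx], x_all[dev_idx], y_all[dev_idx]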
Example 2
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, x_char, y_text, handcraft, bag_of_entity, vocab, w2v, data_pos, data_entity, pos_vocab, entity_vocab, train_test_dev, class_label = data_helpers.load_data_and_labels(FLAGS.Training_Data, FLAGS.Test_data_entity)

# Build vocabulary
# max_document_length = max([len(x.split(" ")) for x in x_text])

max_document_length = 39
char_length = 50

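# NOTE: "vocab_processor" below is just a preallocated index matrix (one row per
# document); data_helpers.fit_transform presumably fills it with vocabulary ids.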
vocab_processor = np.zeros([len(x_text), max_document_length+1])
x = data_helpers.fit_transform(x_text, vocab_processor, vocab)

# Build vocabulary
vocab_processor_pos = np.zeros([len(data_pos), max_document_length+1])
x_pos = data_helpers.fit_transform_pos(data_pos, vocab_processor_pos)


# Build vocabulary
vocab_processor_entity = np.zeros([len(data_entity), max_document_length+1])
x_entity = data_helpers.fit_transform_pos(data_entity, vocab_processor_entity)


# Note: despite the *_shuf names, no shuffling is applied here; these are plain aliases.
x_shuf, x_char_shuf, y_shuf, handcraft_shuf, bag_of_entity_shuf, x_pos_shuf, x_entity_shuf = x, x_char, y_text, handcraft, bag_of_entity, x_pos, x_entity

offset = int(x_shuf.shape[0] * 0)  # the split fraction is 0, so offset is 0 and every row is kept below
x_shuffled, x_char_shuffled, y_shuffled, handcraft_shuffled, x_pos_shuffled, x_entity_shuffled, bag_of_entity_shuffled = x_shuf[offset:], x_char_shuf[offset:], y_shuf[offset:], handcraft_shuf[offset:], x_pos_shuf[offset:], x_entity_shuf[offset:], bag_of_entity_shuf[offset:]
Example 3
# Parameters
# ==================================================

# Eval Parameters
tf.flags.DEFINE_string("checkpoint_dir", T.checkpoint_dir, "Checkpoint directory from training run")
FLAGS = tf.flags.FLAGS


x_raw, y_test = data_helpers.load_data_and_labels("./test/pos","./test/neg")
y_test = np.argmax(y_test, axis=1)


# Map data into vocabulary
vocab_processor = pickle.load(open(os.path.join(T.out_dir, "vocab"), "rb"))
x_list = [x.split(" ") for x in x_raw]
x_test = np.array(list(data_helpers.fit_transform(vocab_processor, x_list, max_document_length, n_gram)))

print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(T.checkpoint_dir)

# Evaluation
# ==================================================
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
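        # The snippet ends after loading the meta graph. The usual next steps are
        # sketched below; the tensor names ("input_x", "dropout_keep_prob",
        # "output/predictions") are assumptions about how the training graph was
        # named, not taken from this listing.
        saver.restore(sess, checkpoint_file)
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        all_predictions = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})
        correct_predictions = float(np.sum(all_predictions == y_test))
        print("Accuracy: {:g}".format(correct_predictions / len(y_test)))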
Example 4
FLAGS = tf.flags.FLAGS

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, x_char, x_ib, x_pos, x_mtopic, x_features, x_spId, x_hub, y_text, handcraft, vocab, w2v, pos_vocab, train_dev_index, data_pred, class_label = data_helpers.load_data_and_labels(
    FLAGS.Training_Data, FLAGS.Test_Data)

# Build vocabulary
max_document_length = max([len(x1.split(" ")) for x1 in x_text[0]])
print(max_document_length)
# max_document_length = 25
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x = data_helpers.fit_transform(x_text[0], vocab_processor, vocab)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_utt1 = data_helpers.fit_transform(x_text[1], vocab_processor, vocab)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_utt2 = data_helpers.fit_transform(x_text[2], vocab_processor, vocab)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_utt3 = data_helpers.fit_transform(x_text[3], vocab_processor, vocab)


x_shuf, x_utt1_shuf, x_utt2_shuf, x_utt3_shuf, x_char_shuf, x_char1_shuf, x_char2_shuf, x_ib1_shuf, x_ib2_shuf, x_pos0_shuf, x_pos1_shuf, x_pos2_shuf, x_pos3_shuf, x_spId0_shuf, x_spId1_shuf, x_spId2_shuf, x_spId3_shuf, x_hub0_shuf, x_hub1_shuf, x_hub2_shuf, x_mtp0_shuf, x_mtp1_shuf, x_mtp2_shuf, x_feat0_shuf, x_feat1_shuf, x_feat2_shuf, x_feat3_shuf, y_shuf, handcraft_shuf = \
x, x_utt1, x_utt2, x_utt3, x_char[0], x_char[1], x_char[2], x_ib[0], x_ib[1], x_pos[0], x_pos[1], x_pos[2], x_pos[3], x_spId[0], x_spId[1], x_spId[2], x_spId[3], x_hub[0], x_hub[1], x_hub[2], x_mtopic[0], x_mtopic[1], x_mtopic[2], x_features[0],  x_features[1], x_features[2], x_features[3], y_text, handcraft

vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_pos0_shuf = data_helpers.fit_transform_pos(x_pos0_shuf, vocab_processor)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_pos1_shuf = data_helpers.fit_transform_pos(x_pos1_shuf, vocab_processor)
    def __init__(self):

        x_text, y = data_helpers.load_data_and_labels("./train/pos","./train/neg")

        # Build vocabulary
        x_list = [x.split(" ") for x in x_text]
        vocab_processor = data_helpers.n_grams(x_list, max_word_cnt, n_gram)
        print("feed finished")
        x = np.array(data_helpers.fit_transform(vocab_processor, x_list, max_document_length, n_gram))
        # print(x[0])
        print("fit transform finished")


        # Randomly shuffle data
        np.random.seed(10)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x_shuffled = x[shuffle_indices]
        y_shuffled = y[shuffle_indices]

        # Split train/test set
        # TODO: This is very crude, should use cross-validation
        x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
        y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
        print("Vocabulary Size: {:d}".format(len(vocab_processor)))
        print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))


        # Training
        # ==================================================

        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
              allow_soft_placement=FLAGS.allow_soft_placement,
              log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                cnn = MLP(
                    sequence_length=x_train.shape[1],
                    num_classes=2,
                    vocab_size=len(vocab_processor),
                    embedding_size=FLAGS.embedding_dim,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda)

                # Define Training procedure
                global_step = tf.Variable(0, name="global_step", trainable=False)
                starter_learning_rate = 1e-3
                learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                           3000, 0.96, staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)  # use the decayed schedule defined above
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

                # Keep track of gradient values and sparsity (optional)
                grad_summaries = []
                for g, v in grads_and_vars:
                    if g is not None:
                        grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                        sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                        grad_summaries.append(grad_hist_summary)
                        grad_summaries.append(sparsity_summary)
                grad_summaries_merged = tf.summary.merge(grad_summaries)

                # Output directory for models and summaries
                timestamp = str(int(time.time()))
                self.out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
                print("Writing to {}\n".format(self.out_dir))

                # Summaries for loss and accuracy
                loss_summary = tf.summary.scalar("loss", cnn.loss)
                acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

                # Train Summaries
                train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
                train_summary_dir = os.path.join(self.out_dir, "summaries", "train")
                train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

                # Dev summaries
                dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
                dev_summary_dir = os.path.join(self.out_dir, "summaries", "dev")
                dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

                # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it
                self.checkpoint_dir = os.path.abspath(os.path.join(self.out_dir, "checkpoints"))
                checkpoint_prefix = os.path.join(self.checkpoint_dir, "model")
                if not os.path.exists(self.checkpoint_dir):
                    os.makedirs(self.checkpoint_dir)
                saver = tf.train.Saver(tf.global_variables())
                # Write vocabulary
                pickle.dump(vocab_processor, open(os.path.join(self.out_dir, "vocab"), "wb"))
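                # (The evaluation example above reloads this file with
                # pickle.load from os.path.join(out_dir, "vocab").)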
                # Initialize all variables
                sess.run(tf.global_variables_initializer())

                def train_step(x_batch, y_batch):
                    """
                    A single training step
                    """
                    feed_dict = {
                      cnn.input_x: x_batch,
                      cnn.input_y: y_batch,
                      cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, summaries, loss, accuracy = sess.run(
                        [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    train_summary_writer.add_summary(summaries, step)

                def dev_step(x_batch, y_batch, writer=None):
                    """
                    Evaluates model on a dev set
                    """
                    feed_dict = {
                      cnn.input_x: x_batch,
                      cnn.input_y: y_batch,
                      cnn.dropout_keep_prob: 1.0
                    }
                    step, summaries, loss, accuracy = sess.run(
                        [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    if writer:
                        writer.add_summary(summaries, step)

                # Generate batches
                t = list(zip(x_train, y_train))
                batches = data_helpers.batch_iter(
                    t, FLAGS.batch_size, FLAGS.num_epochs)
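                # data_helpers.batch_iter is not shown in this listing. A minimal
                # sketch of what such a generator typically does (hypothetical,
                # not the project's actual helper): reshuffle once per epoch and
                # yield batch_size-sized slices.
                def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
                    data = np.array(data, dtype=object)
                    num_batches = int((len(data) - 1) / batch_size) + 1
                    for _ in range(num_epochs):
                        order = np.random.permutation(len(data)) if shuffle else np.arange(len(data))
                        for b in range(num_batches):
                            yield data[order[b * batch_size:(b + 1) * batch_size]]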
                # Training loop. For each batch...
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)
                        print("")
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))