print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y, category = data_helpers.load_data_and_labels(FLAGS.data_file) print(category) y = np.array(data_helpers.transform_labels(y, category)) # Build vocabulary max_document_length = max([len(x) for x in x_text]) # vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(data_helpers.fit_transform(x_text, FLAGS.max_data_length)) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices, :] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] del x, y, x_shuffled, y_shuffled
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
(x_text, x_char, y_text, handcraft, bag_of_entity, vocab, w2v, data_pos, data_entity,
 pos_vocab, entity_vocab, train_test_dev, class_label) = data_helpers.load_data_and_labels(
    FLAGS.Training_Data, FLAGS.Test_data_entity)

# Build vocabulary
# max_document_length = max([len(x.split(" ")) for x in x_text])
max_document_length = 39
char_length = 50
vocab_processor = np.zeros([len(x_text), max_document_length + 1])
x = data_helpers.fit_transform(x_text, vocab_processor, vocab)

# Map POS tags to indices
vocab_processor_pos = np.zeros([len(data_pos), max_document_length + 1])
x_pos = data_helpers.fit_transform_pos(data_pos, vocab_processor_pos)

# Map entity tags to indices
vocab_processor_entity = np.zeros([len(data_entity), max_document_length + 1])
x_entity = data_helpers.fit_transform_pos(data_entity, vocab_processor_entity)

# No shuffling is applied here; the data keeps its original order
x_shuf, x_char_shuf, y_shuf, handcraft_shuf, bag_of_entity_shuf, x_pos_shuf, x_entity_shuf = \
    x, x_char, y_text, handcraft, bag_of_entity, x_pos, x_entity
offset = int(x_shuf.shape[0] * 0)
(x_shuffled, x_char_shuffled, y_shuffled, handcraft_shuffled, x_pos_shuffled,
 x_entity_shuffled, bag_of_entity_shuffled) = (
    x_shuf[offset:], x_char_shuf[offset:], y_shuf[offset:], handcraft_shuf[offset:],
    x_pos_shuf[offset:], x_entity_shuf[offset:], bag_of_entity_shuf[offset:])
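
# --------------------------------------------------
# NOTE: data_helpers.fit_transform is not shown here. Judging from the calls
# above, it fills the preallocated zero matrix (`vocab_processor`) with word
# indices looked up in `vocab`, truncating or zero-padding each sentence to the
# fixed length. The sketch below is an assumed implementation for illustration
# only; the real helper may differ.
# --------------------------------------------------
def fit_transform_sketch(sentences, matrix, vocab, unk_index=0):
    """Fill `matrix` (shape: [num_sentences, max_len]) with vocabulary indices."""
    max_len = matrix.shape[1]
    for row, sentence in enumerate(sentences):
        tokens = sentence.split(" ")[:max_len]
        for col, token in enumerate(tokens):
            matrix[row, col] = vocab.get(token, unk_index)
    return matrix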
# Parameters
# ==================================================

# Eval Parameters
tf.flags.DEFINE_string("checkpoint_dir", T.checkpoint_dir, "Checkpoint directory from training run")

FLAGS = tf.flags.FLAGS

x_raw, y_test = data_helpers.load_data_and_labels("./test/pos", "./test/neg")
y_test = np.argmax(y_test, axis=1)

# Map data into vocabulary
vocab_processor = pickle.load(open(os.path.join(T.out_dir, "vocab"), "rb"))
x_list = [x.split(" ") for x in x_raw]
x_test = np.array(list(data_helpers.fit_transform(vocab_processor, x_list, max_document_length, n_gram)))

print("\nEvaluating...\n")

checkpoint_file = tf.train.latest_checkpoint(T.checkpoint_dir)

# Evaluation
# ==================================================
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
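        # --------------------------------------------------
        # NOTE: the remainder of the evaluation loop is not included above. A
        # typical continuation (as in standard cnn-text-classification-tf eval
        # scripts) restores the checkpoint, looks up the input and prediction
        # tensors by name, and runs inference. The tensor names used below
        # (input_x, dropout_keep_prob, output/predictions) are assumptions
        # about how the trained graph was built, not confirmed by this fragment.
        # --------------------------------------------------
        saver.restore(sess, checkpoint_file)

        # Get the placeholders and the prediction op from the restored graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Run the whole test set in one feed; batched inference via
        # data_helpers.batch_iter would also work, but its signature is not shown here
        all_predictions = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})

        # Compare against gold labels
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))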
FLAGS = tf.flags.FLAGS

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
(x_text, x_char, x_ib, x_pos, x_mtopic, x_features, x_spId, x_hub, y_text, handcraft,
 vocab, w2v, pos_vocab, train_dev_index, data_pred, class_label) = data_helpers.load_data_and_labels(
    FLAGS.Training_Data, FLAGS.Test_Data)

# Build vocabulary
max_document_length = max([len(x1.split(" ")) for x1 in x_text[0]])
print(max_document_length)
# max_document_length = 25
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x = data_helpers.fit_transform(x_text[0], vocab_processor, vocab)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_utt1 = data_helpers.fit_transform(x_text[1], vocab_processor, vocab)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_utt2 = data_helpers.fit_transform(x_text[2], vocab_processor, vocab)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_utt3 = data_helpers.fit_transform(x_text[3], vocab_processor, vocab)

# No shuffling here; keep all modalities aligned in their original order
x_shuf, x_utt1_shuf, x_utt2_shuf, x_utt3_shuf, x_char_shuf, x_char1_shuf, x_char2_shuf, \
    x_ib1_shuf, x_ib2_shuf, x_pos0_shuf, x_pos1_shuf, x_pos2_shuf, x_pos3_shuf, \
    x_spId0_shuf, x_spId1_shuf, x_spId2_shuf, x_spId3_shuf, x_hub0_shuf, x_hub1_shuf, x_hub2_shuf, \
    x_mtp0_shuf, x_mtp1_shuf, x_mtp2_shuf, x_feat0_shuf, x_feat1_shuf, x_feat2_shuf, x_feat3_shuf, \
    y_shuf, handcraft_shuf = \
    x, x_utt1, x_utt2, x_utt3, x_char[0], x_char[1], x_char[2], x_ib[0], x_ib[1], \
    x_pos[0], x_pos[1], x_pos[2], x_pos[3], x_spId[0], x_spId[1], x_spId[2], x_spId[3], \
    x_hub[0], x_hub[1], x_hub[2], x_mtopic[0], x_mtopic[1], x_mtopic[2], \
    x_features[0], x_features[1], x_features[2], x_features[3], y_text, handcraft

# Map POS tags to indices
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_pos0_shuf = data_helpers.fit_transform_pos(x_pos0_shuf, vocab_processor)
vocab_processor = np.zeros([len(x_text[0]), max_document_length])
x_pos1_shuf = data_helpers.fit_transform_pos(x_pos1_shuf, vocab_processor)
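
# --------------------------------------------------
# NOTE: data_helpers.fit_transform_pos is not shown in this fragment. Unlike
# fit_transform above, it is called without an explicit vocabulary, so the
# sketch below assumes it builds a tag-to-index map on the fly while filling
# the preallocated matrix. It illustrates the assumed behaviour only and is
# not the actual helper.
# --------------------------------------------------
def fit_transform_pos_sketch(tag_sequences, matrix, tag_index=None):
    """Fill `matrix` with integer ids for POS/entity tags, growing the map as needed."""
    if tag_index is None:
        tag_index = {"<PAD>": 0}
    max_len = matrix.shape[1]
    for row, tags in enumerate(tag_sequences):
        for col, tag in enumerate(tags[:max_len]):
            if tag not in tag_index:
                tag_index[tag] = len(tag_index)
            matrix[row, col] = tag_index[tag]
    return matrix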
def __init__(self):
    x_text, y = data_helpers.load_data_and_labels("./train/pos", "./train/neg")

    # Build vocabulary
    x_list = [x.split(" ") for x in x_text]
    vocab_processor = data_helpers.n_grams(x_list, max_word_cnt, n_gram)
    print('feed finished')
    x = np.array(data_helpers.fit_transform(vocab_processor, x_list, max_document_length, n_gram))
    # print(x[0])
    print('fit transform finished')

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print("Vocabulary Size: {:d}".format(len(vocab_processor)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = MLP(
                sequence_length=x_train.shape[1],
                num_classes=2,
                vocab_size=len(vocab_processor),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            starter_learning_rate = 1e-3
            # Note: the decayed learning_rate is computed but the optimizer below
            # is constructed with the constant starter_learning_rate
            learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 3000, 0.96, staircase=True)
            optimizer = tf.train.AdamOptimizer(starter_learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.merge_summary(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            self.out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(self.out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.scalar_summary("loss", cnn.loss)
            acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(self.out_dir, "summaries", "train")
            train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(self.out_dir, "summaries", "dev")
            dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            self.checkpoint_dir = os.path.abspath(os.path.join(self.out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(self.checkpoint_dir, "model")
            if not os.path.exists(self.checkpoint_dir):
                os.makedirs(self.checkpoint_dir)
            saver = tf.train.Saver(tf.all_variables())

            # Write vocabulary
            pickle.dump(vocab_processor, open(os.path.join(self.out_dir, "vocab"), "wb"))

            # Initialize all variables
            sess.run(tf.initialize_all_variables())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            t = list(zip(x_train, y_train))
            batches = data_helpers.batch_iter(
                t, FLAGS.batch_size, FLAGS.num_epochs)

            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
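
# --------------------------------------------------
# NOTE: data_helpers.batch_iter is used above but not defined in this fragment.
# The generator below is a minimal sketch of the conventional implementation
# (shuffle once per epoch, then yield fixed-size slices); the real helper may
# differ, for example in its shuffle behaviour or argument order.
# --------------------------------------------------
import numpy as np

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    """Yield batches of `data` (a list of examples) for `num_epochs` epochs."""
    data = list(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        if shuffle:
            indices = np.random.permutation(data_size)
            epoch_data = [data[i] for i in indices]
        else:
            epoch_data = data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min(start + batch_size, data_size)
            yield epoch_data[start:end]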