def preprocess(file_path_dem, file_path_rep):
    # Inputs the positive and negative examples
    del_all_flags(tf.flags.FLAGS)
    tf.flags.DEFINE_string("positive_data_file", file_path_dem, "Data source for the positive data.")
    tf.flags.DEFINE_string("negative_data_file", file_path_rep, "Data source for the negative data.")
    FLAGS = tf.flags.FLAGS

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_train = x[shuffle_indices]
    y_train = y[shuffle_indices]
    del x, y

    print('Data Loaded.')
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print('Features Shape:', x_train.shape)
    print('Labels Shape:', y_train.shape, '\n')
    return x_train, y_train, vocab_processor
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
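# Every snippet in this section calls data_helpers.load_data_and_labels, whose source is not
# included here. The functions below are only a minimal sketch of what such a helper commonly
# looks like for the two-file positive/negative layout; the clean_str normalization and the
# one-hot label convention are assumptions, not the verified implementation.
import re
import numpy as np

def clean_str_sketch(string):
    # Keep word characters and basic punctuation, collapse whitespace, lowercase.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_labels_sketch(positive_data_file, negative_data_file):
    # One example per line, one file per class.
    with open(positive_data_file, encoding="utf-8") as f:
        positive_examples = [line.strip() for line in f]
    with open(negative_data_file, encoding="utf-8") as f:
        negative_examples = [line.strip() for line in f]
    x_text = [clean_str_sketch(s) for s in positive_examples + negative_examples]
    # One-hot labels: [0, 1] for positive, [1, 0] for negative.
    labels = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
    y = np.array(labels)
    return x_text, y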
              nb_epoch=num_epochs, validation_split=val_split, verbose=2)
    model_name = 'imdb_' + model_variation + str(num_epochs) + '.h5'
    model.save_weights(model_name)
else:
    print("Loading test data...")
    neg_test_path = './data/imdb_test.neg'
    pos_test_path = './data/imdb_test.pos'
    # Send data through the cleaner function
    sentences, labels = data_helpers.load_data_and_labels(pos_test_path, neg_test_path)

# Model Hyperparameters
embedding_dim = 20
filter_sizes = (3, 4)
num_filters = 3
dropout_prob = (0.7, 0.8)
hidden_dims = 100

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10        # Context window size
model_dir = 'word2vec_models'
model_name = "{:d}features_{:d}minwords_{:d}context_{:s}".format(embedding_dim, min_word_count, context,
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
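# The TODO above flags the tail-slice dev split as crude. A hedged alternative (not what these
# scripts actually do) is a stratified split so the dev set keeps the class balance; the
# function name and the use of scikit-learn are assumptions for illustration only.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_dev_split(x, y, dev_fraction=0.1, seed=10):
    # y is one-hot in these scripts, so collapse it to class ids for stratification.
    class_ids = np.argmax(y, axis=1)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=dev_fraction, random_state=seed)
    train_idx, dev_idx = next(splitter.split(x, class_ids))
    return x[train_idx], x[dev_idx], y[train_idx], y[dev_idx]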
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load labels
with open('labels.pkl', 'rb') as f:
    all_labels = pickle.load(f)

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["I love charles dickens book"]
    # y_test = [2, 4]
    y_test = None

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
def __init__(self): x_text, y = data_helpers.load_data_and_labels("./train/pos","./train/neg") # Build vocabulary x_list = [x.split(" ") for x in x_text] vocab_processor = data_helpers.n_grams(x_list, max_word_cnt, n_gram) print 'feed finished' x = np.array(data_helpers.fit_transform(vocab_processor, x_list, max_document_length, n_gram)) # print x[0] print 'fit transform finished' # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:] y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:] print("Vocabulary Size: {:d}".format(len(vocab_processor))) print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) # Training # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = MLP( sequence_length=x_train.shape[1], num_classes=2, vocab_size=len(vocab_processor), embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters = FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) starter_learning_rate = 1e-3 learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 3000, 0.96, staircase=True) optimizer = tf.train.AdamOptimizer(starter_learning_rate) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.merge_summary(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) self.out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(self.out_dir)) # Summaries for loss and accuracy loss_summary = tf.scalar_summary("loss", cnn.loss) acc_summary = tf.scalar_summary("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(self.out_dir, "summaries", "train") train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) dev_summary_dir = os.path.join(self.out_dir, "summaries", "dev") dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it self.checkpoint_dir = os.path.abspath(os.path.join(self.out_dir, "checkpoints")) checkpoint_prefix = os.path.join(self.checkpoint_dir, "model") if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) # Write vocabulary pickle.dump(vocab_processor, open(os.path.join(self.out_dir,"vocab"), "wb" ) ) # Initialize all variables sess.run(tf.initialize_all_variables()) def train_step(x_batch, y_batch): """ A single training step """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run( [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def dev_step(x_batch, y_batch, writer=None): """ Evaluates model on a dev set """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob:1 } step, summaries, loss, accuracy = sess.run( [global_step, dev_summary_op, cnn.loss, cnn.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) if writer: writer.add_summary(summaries, step) # Generate batches t = list(zip(x_train, y_train)) batches = data_helpers.batch_iter( t, FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... for batch in batches: x_batch, y_batch = zip(*batch) train_step(x_batch, y_batch) current_step = tf.train.global_step(sess, global_step) if current_step % FLAGS.evaluate_every == 0: print("\nEvaluation:") dev_step(x_dev, y_dev, writer=dev_summary_writer) print("") if current_step % FLAGS.checkpoint_every == 0: path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("Saved model checkpoint to {}\n".format(path))
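# The training loops in this section all consume data_helpers.batch_iter(data, batch_size,
# num_epochs). Its source is not shown; the generator below is a minimal sketch with the same
# call shape, inferred from how it is used (reshuffle each epoch, yield lists of (x, y) pairs)
# rather than the verified implementation.
import random

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    data = list(data)
    num_batches_per_epoch = (len(data) - 1) // batch_size + 1
    for _ in range(num_epochs):
        epoch_data = data[:]
        if shuffle:
            # Reshuffle at the start of every epoch.
            random.shuffle(epoch_data)
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            yield epoch_data[start:start + batch_size]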
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS(sys.argv)
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data
print("Loading data...")
x_text, y = dh.load_data_and_labels(train_test_text, Y_train_test)
x_1, y_train = dh.load_data_and_labels(train_text, Y_train)
x_2, y_test = dh.load_data_and_labels(test_text, Y_test)

# Build vocabulary
print("building vocab...")
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
# The vocabulary is already fit on the combined text above, so only transform the splits.
x_train = np.array(list(vocab_processor.transform(x_1)))
x_test = np.array(list(vocab_processor.transform(x_2)))

g = tf.Graph()
with g.as_default():
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
import tensorflow as tf

import data_helpers

# Define Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
tf.flags.DEFINE_string("sentence", "the movie was bad", "sentence to classify")
FLAGS = tf.flags.FLAGS

#######################################################################################################################

# Process the raw sentence
new_review = data_helpers.clean_senetnce(FLAGS.sentence)

# Load vocabulary
sentences, _ = data_helpers.load_data_and_labels()
sequence_length = max(len(x) for x in sentences)
sentences_padded = data_helpers.pad_sentences(sentences)
vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded)
num_padding = sequence_length - len(new_review)
new_sentence = new_review + ["<PAD/>"] * num_padding

# Convert sentence to input matrix
array = []
for word in new_sentence:
    try:
        word_vector = vocabulary[word]
    except KeyError:
        word_vector = vocabulary["<PAD/>"]
    array.append(word_vector)
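# data_helpers.pad_sentences is called above but not shown. A minimal sketch, assuming it
# right-pads every tokenized sentence to the longest length with a "<PAD/>" token (the same
# token the snippet above falls back to for unknown words):
def pad_sentences_sketch(sentences, padding_word="<PAD/>"):
    sequence_length = max(len(s) for s in sentences)
    return [s + [padding_word] * (sequence_length - len(s)) for s in sentences]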
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, word_count, letter_count = data_helpers.load_data_and_labels(
    FLAGS.data_path, FLAGS.vocab_path, FLAGS.max_length)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print(x_train.shape)
print(y_train.shape)
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
def main(_): # FLAGS._parse_flags() # print("\nParameters:") # for attr, value in sorted(FLAGS.items()): # print("{}={}".format(attr.upper(), value)) # print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = data_helpers.load_data_and_labels(FLAGS.train_file, FLAGS.num_class) # Build vocabulary if FLAGS.embedding_type == "random": vocab_processor = learn.preprocessing.VocabularyProcessor( FLAGS.max_length) x = np.array(list(vocab_processor.fit_transform(x_text))) print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) elif FLAGS.embedding_type == "none-static": x, w2v = [], KeyedVectors.load_word2vec_format(FLAGS.word2vec_model, binary=False) vocab, embeddings = w2v.vocab, np.zeros( (len(w2v.index2word), w2v.vector_size), dtype=np.float32) for k, v in vocab.items(): embeddings[v.index] = w2v[k] for item in x_text: x.append([ w2v.vocab[word].index if word in w2v.vocab else w2v.vocab["__UNK__"].index for word in item.split(" ") ]) x = np.array(x, dtype=np.int32) print("Vocabulary Size: {:d}".format(len(w2v.vocab))) else: raise RuntimeError("embedding_type is random or none-static") # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] del x, y, x_shuffled, y_shuffled print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) # Training # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): if FLAGS.embedding_type == "random": cnn = TextCNN(sequence_length=FLAGS.max_length, num_classes=FLAGS.num_class, vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) elif FLAGS.embedding_type == "none-static": cnn = TextCNN(sequence_length=FLAGS.max_length, num_classes=FLAGS.num_class, embedding=embeddings, embedding_size=embeddings.shape[1], filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) lr = tf.train.exponential_decay(FLAGS.learning_rate, global_step, 2500, 0.8, staircase=True) optimizer = tf.train.GradientDescentOptimizer(lr) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = 
str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints, save_relative_paths=True) # Write vocabulary if FLAGS.embedding_type == "random": vocab_processor.save(os.path.join(out_dir, "vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) def train_step(x_batch, y_batch): """ A single training step """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def dev_step(x_batch, y_batch, writer=None): """ Evaluates model on a dev set """ feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summaries, loss, accuracy = sess.run( [global_step, dev_summary_op, cnn.loss, cnn.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) if writer: writer.add_summary(summaries, step) # Generate batches batches = data_helpers.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... for batch in batches: x_batch, y_batch = zip(*batch) train_step(x_batch, y_batch) current_step = tf.train.global_step(sess, global_step) if current_step % FLAGS.evaluate_every == 0: print("\nEvaluation:") dev_step(x_dev, y_dev, writer=dev_summary_writer) print("") if current_step % FLAGS.checkpoint_every == 0: path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("Saved model checkpoint to {}\n".format(path))
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y, seqlen = data_helpers.load_data_and_labels()

# Build vocabulary
max_document_length = max(seqlen)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
seqlen_shuffled = seqlen[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
def eval(): with tf.device('/gpu:0'): x_text, y, desc1, desc2, wType, type_index = data_helpers.load_data_and_labels( FLAGS.test_path) text_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore( text_path) x = np.array(list(text_vocab_processor.transform(x_text))) checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Get the placeholders from the graph by name input_text = graph.get_operation_by_name("input_text").outputs[0] # input_y = graph.get_operation_by_name("input_y").outputs[0] emb_dropout_keep_prob = graph.get_operation_by_name( "emb_dropout_keep_prob").outputs[0] rnn_dropout_keep_prob = graph.get_operation_by_name( "rnn_dropout_keep_prob").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] # Tensors we want to evaluate predictions = graph.get_operation_by_name( "output/predictions").outputs[0] # Generate batches for one epoch batches = data_helpers.batch_iter(list(x), FLAGS.batch_size, 1, shuffle=False) # Collect the predictions here preds = [] for x_batch in batches: pred = sess.run( predictions, { input_text: x_batch, emb_dropout_keep_prob: 1.0, rnn_dropout_keep_prob: 1.0, dropout_keep_prob: 1.0 }) preds.append(pred) preds = np.concatenate(preds) truths = np.argmax(y, axis=1) print(truths) result = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] # result = [[0,0,0,0],[0,0,0,0],[0,0,0,0],[0,0,0,0]] for i in range(len(preds)): result[truths[i]][preds[i]] += 1 print("===the prediction result===") print("\t0\t1\t2\t3\t4\t5") count = 0 for i in range(len(result)): print( str(count) + "\t" + str(result[i][0]) + "\t" + str(result[i][1]) + "\t" + str(result[i][2]) + "\t" + str(result[i][3]) + "\t" + str(result[i][4]) + "\t" + str(result[i][5])) count += 1 precision = [] recall = [] for j in range(len(result)): p = round(result[j][j] / sum(result[j]), 3) * 100 col = [x[j] for x in result] r = round(result[j][j] / sum(col), 3) * 100 precision.append(p) recall.append(r) f1_scores = [] for k in range(len(precision)): if (precision[k] + recall[k]) == 0: f1_scores.append(0) else: f1 = round((2 * precision[k] * recall[k]) / (precision[k] + recall[k]), 1) f1_scores.append(f1) print(precision, recall, f1_scores) relationName = [ "before", "after", "simultaneous", "include", "be_included", "vague" ] for l in range(6): print(relationName[l] + "acc:" + str(precision[l]) + "%,recall:" + str(recall[l]) + "%,f1:" + str(f1_scores[l]) + "%") precision_ave = round(sum(precision) / 6, 1) recall_ave = round(sum(recall) / 6, 1) # f1_score_ave = round(sum(f1_scores)/6,1) f1_score_ave = f1_score(truths, preds, labels=np.array(range(6)), average="micro") print("acc_avg:" + str(precision_ave) + "%,recall_avg:" + str(recall_ave) + "%,f1:" + str(f1_score_ave) + "%") print("modelFile:" + str(FLAGS.checkpoint_dir))
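# The confusion matrix and per-class precision/recall above are accumulated by hand. A hedged
# equivalent using scikit-learn (already imported there for f1_score); it assumes the same six
# temporal-relation classes and sklearn >= 0.22 for the zero_division argument.
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

def report_sketch(truths, preds, num_classes=6):
    print(confusion_matrix(truths, preds, labels=list(range(num_classes))))
    precision, recall, f1, _ = precision_recall_fscore_support(
        truths, preds, labels=list(range(num_classes)), zero_division=0)
    for label in range(num_classes):
        print("class {}: precision {:.1%}, recall {:.1%}, f1 {:.1%}".format(
            label, precision[label], recall[label], f1[label]))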
def test_cnn(): """Test CNN model.""" # Load data logger.info("✔ Loading data...") logger.info('Recommand padding Sequence length is: {}'.format( FLAGS.pad_seq_len)) logger.info('✔︎ Test data processing...') test_data = data_helpers.load_data_and_labels(FLAGS.test_data_file, FLAGS.num_classes, FLAGS.embedding_dim) logger.info('✔︎ Test data padding...') x_test, y_test = data_helpers.pad_data(test_data, FLAGS.pad_seq_len) y_test_bind = test_data.labels_bind # Build vocabulary VOCAB_SIZE = data_helpers.load_vocab_size(FLAGS.embedding_dim) pretrained_word2vec_matrix = data_helpers.load_word2vec_matrix( VOCAB_SIZE, FLAGS.embedding_dim) # Load cnn model logger.info("✔ Loading model...") checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) logger.info(checkpoint_file) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Get the placeholders from the graph by name input_x = graph.get_operation_by_name("input_x").outputs[0] # input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] # pre-trained_word2vec pretrained_embedding = graph.get_operation_by_name( "embedding/W").outputs[0] # Tensors we want to evaluate logits = graph.get_operation_by_name("output/logits").outputs[0] # Generate batches for one epoch batches = data_helpers.batch_iter(list( zip(x_test, y_test, y_test_bind)), FLAGS.batch_size, 1, shuffle=False) # Collect the predictions here all_predicitons = [] eval_loss, eval_rec, eval_acc, eval_counter = 0.0, 0.0, 0.0, 0 for batch_test in batches: x_batch_test, y_batch_test, y_batch_test_bind = zip( *batch_test) feed_dict = {input_x: x_batch_test, dropout_keep_prob: 1.0} batch_logits = sess.run(logits, feed_dict) if FLAGS.use_classbind_or_not == 'Y': predicted_labels = data_helpers.get_label_using_logits_and_classbind( batch_logits, y_batch_test_bind, top_number=FLAGS.top_num) if FLAGS.use_classbind_or_not == 'N': predicted_labels = data_helpers.get_label_using_logits( batch_logits, top_number=FLAGS.top_num) all_predicitons = np.append(all_predicitons, predicted_labels) cur_rec, cur_acc = 0.0, 0.0 for index, predicted_label in enumerate(predicted_labels): rec_inc, acc_inc = data_helpers.cal_rec_and_acc( predicted_label, y_batch_test[index]) cur_rec, cur_acc = cur_rec + rec_inc, cur_acc + acc_inc cur_rec = cur_rec / len(y_batch_test) cur_acc = cur_acc / len(y_batch_test) eval_rec, eval_acc, eval_counter = eval_rec + cur_rec, eval_acc + cur_acc, eval_counter + 1 logger.info( "✔︎ validation batch {} finished.".format(eval_counter)) eval_rec = float(eval_rec / eval_counter) eval_acc = float(eval_acc / eval_counter) logger.info("☛ Recall {:g}, Accuracy {:g}".format( eval_rec, eval_acc)) np.savetxt(SAVE_FILE, list(zip(all_predicitons)), fmt='%s') logger.info("✔ Done.")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.test_data_folder)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
def train(): # parse arguments FLAGS(sys.argv) print(FLAGS.batch_size) # This is not working any more, because api has been changed!!! print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] del x, y, x_shuffled, y_shuffled print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) # Training # ================================================== with tf.Graph().as_default(): sequence_length = x_train.shape[1] num_classes = y_train.shape[1] input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) with tf.Session(config=session_conf) as sess: cnn_text = TextModel( input_x, input_y, max_sequence_len=sequence_length, num_classes=num_classes, vocab_size=len(vocab_processor.vocabulary_), embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) prediction, loss, optimize, accuracy = cnn_text.get_model_variables() # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", loss) acc_summary = tf.summary.scalar("accuracy", accuracy) # train summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) # eval summaries eval_summary_op = tf.summary.merge([loss_summary, acc_summary]) eval_summary_dir = os.path.join(out_dir, "summaries", "eval") eval_summary_writer = tf.summary.FileWriter(eval_summary_dir, sess.graph) # checkpoint directory checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") # tensorflow assumes this directory already exists, so we need to create it if it not exists if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # write vocabulary vocab_processor.save(os.path.join(out_dir, "vocab")) # initialize all variables init_g = tf.global_variables_initializer() init_l = tf.local_variables_initializer() 
sess.run(init_l) sess.run(init_g) def train_step(x_batch, y_batch): """ train_step """ feed_dict = { cnn_text.data: x_batch, cnn_text.target: y_batch, cnn_text.dropout_keep_prob: FLAGS.dropout_keep_prob } _, summaries, train_loss, train_accuracy = sess.run( [optimize, train_summary_op, loss, accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() current_step = tf.train.global_step(sess, cnn_text.global_step) print("{0}: step {1}, loss {2:g}, acc {3:g}".format(time_str, current_step, train_loss, train_accuracy)) train_summary_writer.add_summary(summaries) def eval_step(x_batch, y_batch): """ eval_step """ feed_dict = { cnn_text.data: x_batch, cnn_text.target: y_batch, cnn_text.dropout_keep_prob: 1.0 } step, summaries, eval_loss, eval_accuracy = sess.run( [cnn_text.global_step, eval_summary_op, loss, accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("evaluation {0}: step {1}, loss {2:g}, acc {3:g}".format(time_str, step, eval_loss, eval_accuracy)) eval_summary_writer.add_summary(summaries) # generate batches batches = data_helpers.batch_iter( list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) # training loop, for each batch ... for batch in batches: x_batch, y_batch = zip(*batch) train_step(x_batch, y_batch) current_step = tf.train.global_step(sess, cnn_text.global_step) if 0 == current_step % FLAGS.evaluate_every: eval_step(x_dev, y_dev) if 0 == current_step % FLAGS.checkpoint_every: path = saver.save(sess, checkpoint_prefix, global_step=current_step) print("saved model checkpoint to {0}".format(path))
def train(): with tf.device('/cpu:0'): x_text, y = data_helpers.load_data_and_labels(FLAGS.pos_dir, FLAGS.neg_dir) text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( FLAGS.max_sentence_length, min_frequency=FLAGS.min_frequency) x = np.array(list(text_vocab_processor.fit_transform(x_text))) print("Text Vocabulary Size: {:d}".format( len(text_vocab_processor.vocabulary_))) print("x = {0}".format(x.shape)) print("y = {0}".format(y.shape)) print("") # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev))) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): rcnn = HAN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(text_vocab_processor.vocabulary_), word_embedding_size=FLAGS.word_embedding_dim, context_embedding_size=FLAGS.context_embedding_dim, attention_size=FLAGS.attention_size, hidden_size=FLAGS.hidden_size, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize( rcnn.loss, global_step=global_step) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", rcnn.loss) acc_summary = tf.summary.scalar("accuracy", rcnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Write vocabulary text_vocab_processor.save(os.path.join(out_dir, "text_vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) # Pre-trained word2vec if FLAGS.word2vec: # initial matrix with random uniform initW = np.random.uniform( -0.25, 0.25, (len(text_vocab_processor.vocabulary_), FLAGS.word_embedding_dim)) # load any vectors from the word2vec print("Load word2vec file {0}".format(FLAGS.word2vec)) with open(FLAGS.word2vec, "rb") as f: header = f.readline() vocab_size, layer1_size = map(int, header.split()) binary_len = np.dtype('float32').itemsize * layer1_size for line in range(vocab_size): word = [] while True: ch = f.read(1).decode('latin-1') if ch == ' ': word = ''.join(word) break if ch != '\n': word.append(ch) idx = text_vocab_processor.vocabulary_.get(word) if idx != 0: initW[idx] = np.fromstring(f.read(binary_len), dtype='float32') else: f.read(binary_len) sess.run(rcnn.W_text.assign(initW)) print("Success to load pre-trained word2vec model!\n") # Generate batches batches = data_helpers.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... for batch in batches: x_batch, y_batch = zip(*batch) # Train feed_dict = { rcnn.input_text: x_batch, rcnn.input_y: y_batch, rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob } _, step, summaries, loss, accuracy = sess.run([ train_op, global_step, train_summary_op, rcnn.loss, rcnn.accuracy ], feed_dict) train_summary_writer.add_summary(summaries, step) # Training log display if step % FLAGS.display_every == 0: time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, accuracy)) # Evaluation if step % FLAGS.evaluate_every == 0: print("\nEvaluation:") feed_dict_dev = { rcnn.input_text: x_dev, rcnn.input_y: y_dev, rcnn.dropout_keep_prob: 1.0 } summaries_dev, loss, accuracy = sess.run( [dev_summary_op, rcnn.loss, rcnn.accuracy], feed_dict_dev) dev_summary_writer.add_summary(summaries_dev, step) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}\n".format( time_str, step, loss, accuracy)) # Model checkpoint if step % FLAGS.checkpoint_every == 0: path = saver.save(sess, checkpoint_prefix, global_step=step) print("Saved model checkpoint to {}\n".format(path))
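# The loop above parses the binary word2vec file byte by byte. A hedged alternative that loads
# the same vectors with gensim (gensim is not used by the original script, and the attribute
# names below follow gensim 3.x, matching the w2v.index2word usage elsewhere in this section):
import numpy as np
from gensim.models import KeyedVectors

def load_word2vec_init_sketch(word2vec_path, text_vocab_processor, embedding_dim):
    w2v = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    vocab_size = len(text_vocab_processor.vocabulary_)
    # Start from the same random-uniform initialization as the script above.
    init_w = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim)).astype(np.float32)
    for word in w2v.index2word:
        idx = text_vocab_processor.vocabulary_.get(word)
        if idx != 0:
            init_w[idx] = w2v[word]
    return init_w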
import sys

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.contrib import learn

from data_helpers import load_data_and_labels, batch_iter
from text_cnn import TextCNN

# Load original data
path = sys.path[0]
pos_filename = path + "/data/rt-polarity.pos"
neg_filename = path + "/data/rt-polarity.neg"
X_data, y_data = load_data_and_labels(pos_filename, neg_filename)
max_document_length = max([len(sen.split(" ")) for sen in X_data])
print("Max_document_length:", max_document_length)

# Create the vocabulary
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# The index data
x = np.array(list(vocab_processor.fit_transform(X_data)), dtype=np.float32)
y = np.array(y_data, dtype=np.int32)
vocabulary_size = len(vocab_processor.vocabulary_)
print("The size of vocabulary:", vocabulary_size)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1111)
print("X_train shape {0}, y_train shape {1}".format(X_train.shape, y_train.shape))
print("X_test shape {0}, y_test shape {1}".format(X_test.shape, y_test.shape))

# The parameters of RNN
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================
tasks = ['anger', 'fear', 'joy', 'sadness']
for task in tasks:
    print('running for task', task)

    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.data_dir, task, 'train')

    # Build vocabulary
    # max_document_length = max([len(x.split(" ")) for x in x_text])
    # vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    # x = np.array(list(vocab_processor.fit_transform(x_text)))
    x, vocab_vector = data_helpers.build_vocabulary(x_text)
    # np.save('tmp/x.data', x)
    # np.save('tmp/vocab_vector.data', vocab_vector)
    # x = np.load('tmp/x.data.npy')
    # vocab_vector = np.load('tmp/vocab_vector.data.npy')

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels()

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
macro_precision = total_precision / len(emotion)
macro_recall = total_recall / len(emotion)
if (macro_precision + macro_recall) != 0:
    macro_f1 = 2.0 * macro_precision * macro_recall / (macro_precision + macro_recall)
for e in range(len(emotion)):
    print(emotion[e], ' Precision:', performance[e][0], ' Recall:', performance[e][1], ' F1:', performance[e][2])
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.test_file, FLAGS.gold_labels)
    # y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================
print("preprocess data...")
# Before loading, the text must already be word-segmented; the .txt file format is: sample1 \t label

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.label_data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
# print(x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
y = np.array(y)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y_label = data_helpers.load_data_and_labels(FLAGS.data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])  # word-segmented text
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
y = np.array(y_label)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
print(type(x), type(y))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
print("Using word2vec model file : {}".format(trained_word2vec_model_file))

# Validate training params file
training_params_file = os.path.join(FLAGS.checkpoint_dir, "..", "training_params.pickle")
if not os.path.exists(training_params_file):
    print("Training params file '{}' is missing!".format(training_params_file))
print("Using training params file : {}".format(training_params_file))

# Load params
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get Embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
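# data_helpers.padding_sentences is called above but not shown. A minimal sketch, assuming it
# splits each raw sentence on whitespace and pads or truncates to padding_sentence_length with
# the given padding token (the '<PADDING>' literal used above):
def padding_sentences_sketch(raw_sentences, padding_token, padding_sentence_length=None):
    tokenized = [s.split(" ") for s in raw_sentences]
    max_len = padding_sentence_length or max(len(s) for s in tokenized)
    padded = [(s + [padding_token] * (max_len - len(s)))[:max_len] for s in tokenized]
    return padded, max_len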
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:-------------------------------------------------------")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data for CNN classification model--------------------------")
x_text, y = data_helpers.load_data_and_labels(FLAGS.labeled_data_dir)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
# max_document_length = max([len(x) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

domain_dir = os.path.join(os.getenv("OPIE_DIR"), "data", "domains", FLAGS.domain)
multi_opin_expr_dir = os.path.join(domain_dir, "multiopinexpr")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y, vocab = data_helpers.load_data_and_labels(
    domain_dir,
    max_document_length=FLAGS.max_document_length,
    shuffle_data=FLAGS.shuffle_data)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
min_document_length = min([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(
    max_document_length,
    vocabulary=vocab,
    tokenizer_fn=data_helpers.tokenizer)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Split train/test set
# TODO: This is very crude, should use cross-validation
test_num = 30000
x_train, x_dev = x[:-test_num], x[-test_num:]
def main(unused_argv):
    x_data, y = data_helpers.load_data_and_labels(conf.pos, conf.neg)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_data])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_data)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    test_sample_index = -1 * int(conf.test_sam * float(len(y)))
    x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
    y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]
    x_train1 = np.asarray(x_train)
    y_train1 = np.asarray(y_train, dtype=np.int32)
    x_test1 = np.asarray(x_test)
    y_test1 = np.asarray(y_test, dtype=np.int32)
    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_test)))

    # Create the Estimator
    text_classifier = tf.estimator.Estimator(model_fn=model_fn_1, model_dir="./models")

    # Set up logging for predictions
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": x_train1},
        y=y_train1,
        batch_size=conf.batch_size,
        num_epochs=None,
        shuffle=True)
    text_classifier.train(input_fn=train_input_fn, steps=30000, hooks=[logging_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": x_test1},
        y=y_test1,
        batch_size=conf.test_num,
        num_epochs=1,
        shuffle=False)
    eval_results = text_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels()
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off.",
             'it is very nice.', 'f**k off.', 'its not bad.']
    y_test = [1, 0, 1, 0, 1]
    yy_test = [[1, 0], [0, 1], [1, 0], [0, 1], [1, 0]]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
def train(): with tf.device('/cpu:0'): train_text, train_y, train_pos1, train_pos2, train_x_text_clean, train_sentence_len = data_helpers.load_data_and_labels(FLAGS.train_path) with tf.device('/cpu:0'): test_text, test_y, test_pos1, test_pos2, test_x_text_clean, test_sentence_len = data_helpers.load_data_and_labels(FLAGS.test_path) # Build vocabulary # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>." # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>'] # => # [27 39 40 41 42 1 43 0 0 ... 0] # dimension = FLAGS.max_sentence_length # print("text:",x_text) vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length) vocab_processor.fit(train_text + test_text) train_x = np.array(list(vocab_processor.transform(train_text))) test_x = np.array(list(vocab_processor.transform(test_text))) train_text = np.array(train_text) print("train_text",train_text[0:2]) test_text = np.array(test_text) print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) print("train_x = {0}".format(train_x.shape)) # (8000,90) print("train_y = {0}".format(train_y.shape)) # (8000,19) print("test_x = {0}".format(test_x.shape)) # (2717, 90) print("test_y = {0}".format(test_y.shape)) # (2717,19) # Example: pos1[3] = [-2 -1 0 1 2 3 4 999 999 999 ... 999] # [95 96 97 98 99 100 101 999 999 999 ... 999] # => # [11 12 13 14 15 16 21 17 17 17 ... 17] # dimension = MAX_SENTENCE_LENGTH pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length) pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2) train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1))) train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2))) test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1))) test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2))) print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_))) print("train_p1 = {0}".format(train_p1.shape)) # (8000, 90) print("test_p1 = {0}".format(test_p1.shape)) # (2717, 90) print("") # Randomly shuffle data to split into train and test(dev) # np.random.seed(10) # # shuffle_indices = np.random.permutation(np.arange(len(y))) #len(y)=8000 # x_shuffled = x[shuffle_indices] # p1_shuffled = p1[shuffle_indices] # p2_shuffled = p2[shuffle_indices] # y_shuffled = y[shuffle_indices] # print(x_shuffled, p1_shuffled,p2_shuffled,y_shuffled) # Split train/test set # TODO: This is very crude, should use cross-validation # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) #x_train=7200, x_dev =800 # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] # p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[dev_sample_index:] # p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[dev_sample_index:] # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] # print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev))) # print(x_train) # print(np.array(x_train)) # print(x_dev) # print(np.array(x_dev)) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN( sequence_length=FLAGS.max_sentence_length, #90 num_classes=train_y.shape[1],#19 
text_vocab_size=len(vocab_processor.vocabulary_), #19151 text_embedding_size=FLAGS.text_embedding_size,#300 pos_vocab_size=len(pos_vocab_processor.vocabulary_),#162 pos_embedding_size=FLAGS.pos_embedding_dim,#50 filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), #2,3,4,5 num_filters=FLAGS.num_filters, #128 l2_reg_lambda=FLAGS.l2_reg_lambda, #1e-5 use_elmo = (FLAGS.embeddings == 'elmo')) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6) gvs = optimizer.compute_gradients(cnn.loss) capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs] train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("\nWriting to {}\n".format(out_dir)) # Logger logger = Logger(out_dir) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) # Write vocabulary vocab_processor.save(os.path.join(out_dir, "vocab")) pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab")) # Initialize all variables sess.run(tf.global_variables_initializer()) if FLAGS.embeddings == "word2vec": pretrain_W = utils.load_word2vec('resource/GoogleNews-vectors-negative300.bin', FLAGS.embedding_size,vocab_processor) sess.run(cnn.W_text.assign(pretrain_W)) print("Success to load pre-trained word2vec model!\n") elif FLAGS.embeddings == "glove100": pretrain_W = utils.load_glove('resource/glove.6B.100d.txt', FLAGS.embedding_size, vocab_processor) sess.run(cnn.W_text.assign(pretrain_W)) print("Success to load pre-trained glove100 model!\n") elif FLAGS.embeddings == "glove300": pretrain_W = utils.load_glove('resource/glove.840B.300d.txt', FLAGS.embedding_size, vocab_processor) sess.run(cnn.W_text.assign(pretrain_W)) print("Success to load pre-trained glove300 model!\n") # Generate batches train_batches = data_helpers.batch_iter(list(zip(train_x, train_y, train_text, train_p1, train_p2)), FLAGS.batch_size, FLAGS.num_epochs) # Training loop. For each batch... 
            best_f1 = 0.0  # Used to decide when to save a model checkpoint
            for train_batch in train_batches:
                train_bx, train_by, train_btxt, train_bp1, train_bp2 = zip(*train_batch)
                # print("train_bxt", list(train_btxt)[:2])
                feed_dict = {
                    cnn.input_text: train_bx,
                    cnn.input_y: train_by,
                    cnn.input_x_text: list(train_btxt),
                    cnn.input_p1: train_bp1,
                    cnn.input_p2: train_bp2,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helpers.batch_iter(
                        list(zip(test_x, test_y, test_text, test_p1, test_p2)),
                        FLAGS.batch_size, 1, shuffle=False)
                    # Evaluation loop. For each batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_btxt, test_bp1, test_bp2 = zip(*test_batch)
                        feed_dict = {
                            cnn.input_text: test_bx,
                            cnn.input_y: test_by,
                            cnn.input_x_text: list(test_btxt),
                            cnn.input_p1: test_bp1,
                            cnn.input_p2: test_bp2,
                            cnn.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    # Log the averaged loss and accuracy over all evaluation batches
                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
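# The train() function above is defined but never invoked in this snippet. A
# minimal entry point is sketched below as an assumption (main/tf.app.run are
# not part of the original code), mirroring the usual TF 1.x convention:
def main(_):
    train()


if __name__ == "__main__":
    tf.app.run()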
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_files, FLAGS.negative_data_files)

# Build vocabulary
max_document_length = max([len(nltk.word_tokenize(x)) for x in x_text])
print("Processing vocab...")


def tokenizer(iterator):
    for value in iterator:
        yield nltk.word_tokenize(value)


vocab_processor = learn.preprocessing.VocabularyProcessor(
    max_document_length, tokenizer_fn=tokenizer)
x = np.array(list(vocab_processor.fit_transform(x_text)))
def evaluate():
    # Parse arguments
    FLAGS(sys.argv)
    print(FLAGS.batch_size)

    # Map data into vocabulary
    vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    print(vocab_path)
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

    # CHANGE THIS: Load data. Load your own data here
    if FLAGS.eval_train:
        x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
        y_test = np.argmax(y_test, axis=1)
    else:
        x_raw = ["a masterpiece four years in the making", "everything is off."]
        y_test = [1, 0]

    x_test = np.array(list(vocab_processor.transform(x_raw)))

    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("cnn_output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

    # Save the evaluation to a CSV
    predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'w') as f:
        import csv
        csv.writer(f).writerows(predictions_human_readable)
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# FLAGS._parse_flags()
# print("\nParameters:")
# for attr, value in sorted(FLAGS.__flags.items()):
#     print("{}={}".format(attr.upper(), value))
# print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_dir)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
def test_cnn():
    """Test CNN model."""
    # Load data
    logger.info("✔ Loading data...")
    logger.info('Recommended padding sequence length is: {0}'.format(FLAGS.pad_seq_len))
    logger.info('✔︎ Test data processing...')
    test_data = data_helpers.load_data_and_labels(FLAGS.test_data_file, FLAGS.embedding_dim)
    logger.info('✔︎ Test data padding...')
    x_test_front, x_test_behind, y_test = data_helpers.pad_data(test_data, FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE = data_helpers.load_vocab_size(FLAGS.embedding_dim)
    pretrained_word2vec_matrix = data_helpers.load_word2vec_matrix(VOCAB_SIZE, FLAGS.embedding_dim)

    # Load CNN model
    logger.info("✔ Loading model...")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x_front = graph.get_operation_by_name("input_x_front").outputs[0]
            input_x_behind = graph.get_operation_by_name("input_x_behind").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Pre-trained word2vec embedding
            pretrained_embedding = graph.get_operation_by_name("embedding/embedding").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            softmax_scores = graph.get_operation_by_name("output/SoftMax_scores").outputs[0]
            topKPreds = graph.get_operation_by_name("output/topKPreds").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(
                list(zip(x_test_front, x_test_behind)), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_scores = []
            all_softmax_scores = []
            all_predictions = []
            all_topKPreds = []

            for x_test_batch in batches:
                x_batch_front, x_batch_behind = zip(*x_test_batch)
                feed_dict = {
                    input_x_front: x_batch_front,
                    input_x_behind: x_batch_behind,
                    dropout_keep_prob: 1.0
                }

                batch_scores = sess.run(scores, feed_dict)
                all_scores = np.append(all_scores, batch_scores)

                batch_softmax_scores = sess.run(softmax_scores, feed_dict)
                all_softmax_scores = np.append(all_softmax_scores, batch_softmax_scores)

                batch_predictions = sess.run(predictions, feed_dict)
                all_predictions = np.concatenate([all_predictions, batch_predictions])

                batch_topKPreds = sess.run(topKPreds, feed_dict)
                all_topKPreds = np.append(all_topKPreds, batch_topKPreds)

            np.savetxt(SAVE_FILE, list(zip(all_predictions, all_topKPreds)), fmt='%s')

    logger.info("✔ Done.")
# nlp1.py
import tensorflow as tf
import numpy as np
import data_helpers
from tensorflow.contrib import learn

dev_sample_percentage = .1
positive_data_file = "./data/rt-polarity.pos"
negative_data_file = "./data/rt-polarity.neg"
embedding_dim = 120
batch_size = 40
num_epochs = 200

x_text, y = data_helpers.load_data_and_labels(positive_data_file, negative_data_file)

max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

tf.reset_default_graph()
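# nlp1.py stops right after tf.reset_default_graph(). The sketch below shows how
# the model graph and session are typically created at this point in the other
# scripts of this collection; the text_cnn module, the TextCNN class, and its
# constructor arguments are assumptions for illustration, not part of nlp1.py.
from text_cnn import TextCNN  # hypothetical companion module

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
with sess.as_default():
    cnn = TextCNN(
        sequence_length=x_train.shape[1],
        num_classes=y_train.shape[1],
        vocab_size=len(vocab_processor.vocabulary_),
        embedding_size=embedding_dim,
        filter_sizes=[3, 4, 5],
        num_filters=128)
    # A plain Adam optimizer stands in for whatever training procedure nlp1.py intended
    train_op = tf.train.AdamOptimizer(1e-3).minimize(cnn.loss)
    sess.run(tf.global_variables_initializer())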
"Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # CHANGE THIS: Load data. Load your own data here if FLAGS.eval_train: #x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) #y_test = np.argmax(y_test, axis=1) x_test, y_test = data_helpers.load_data_and_labels( FLAGS.positive_data_file, FLAGS.negative_data_file) y_test = np.argmax(y_test, axis=1) else: x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [1, 0] # Map data into vocabulary '''vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_raw)))''' print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.datasample_file_1,
                                               FLAGS.datasample_file_2,
                                               FLAGS.datasample_file_3,
                                               FLAGS.datasample_file_4,
                                               FLAGS.datasample_file_5)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.Artificial_Inteligence, FLAGS.Machine_Learning)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
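# This snippet breaks off right after computing dev_sample_index. The companion
# scripts above finish the split as sketched here; this is an assumed
# continuation that mirrors their shared pattern, not text from this snippet.
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
del x, y, x_shuffled, y_shuffled

print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))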