tf.flags.DEFINE_string("training_file_neg", "twitter-datasets/train_neg.txt", "Path and name for the training file (neg examples)") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = data_helper.load_data_and_labels(FLAGS.training_file_pos, FLAGS.training_file_neg) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # It's better to use cross-validation
def train_cnn(dataset_name): """Step 0: load sentences, labels, and training parameters""" dataset = '../dataset/' + dataset_name + '_csv/train.csv' testset = '../dataset/' + dataset_name + '_csv/test.csv' parameter_file = "./parameters.json" params = json.loads(open(parameter_file).read()) learning_rate = params['learning_rate'] filter_sizes = list(int(x) for x in params['filter_sizes'].split(',')) if params['enable_max_len'] == 1: enable_max = True else: enable_max = False if params['watch_rnn_output'] == 1: watch_rnn_output = True else: watch_rnn_output = False if params['is_simple'] == 1: is_simple = True else: is_simple = False x_raw, y_raw, target_raw, df, labels = data_helper.load_data_and_labels( dataset, dataset_name, params['max_length'], params['max_summary_length'], enable_max, True) x_test_raw, y_test_raw, target_test_raw, df_test, labels_test = data_helper.load_data_and_labels( testset, dataset_name, params['max_length'], params['max_summary_length'], enable_max, False) word_counts = {} count_words(word_counts, x_raw) logging.info("Size of Vocabulary: {}".format(len(word_counts))) """Step 1: pad each sentence to the same length and map each word to an id""" max_document_length = max([len(x.split(' ')) for x in x_raw]) min_document_length = min([len(x.split(' ')) for x in x_raw]) logging.info( 'The maximum length of all sentences: {}'.format(max_document_length)) logging.info( 'The minimum length of all sentences: {}'.format(min_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length, min_frequency=params['min_frequency']) vocab_processor.fit_transform(x_raw) vocab_to_int = vocab_processor.vocabulary_._mapping # Special tokens that will be added to our vocab codes = ["UNK", "PAD", "EOS", "GO"] # Add codes to vocab for code in codes: vocab_to_int[code] = len(vocab_to_int) # Dictionary to convert integers to words int_to_vocab = {} for word, value in vocab_to_int.items(): int_to_vocab[value] = word usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100 logging.info("Total number of words: {}".format(len(word_counts))) logging.info("Number of words we will use: {}".format(len(vocab_to_int))) logging.info("Percent of words we will use: {0:.2f}%".format(usage_ratio)) # Apply convert_to_ints to clean_summaries and clean_texts word_count = 0 unk_count = 0 int_summaries, word_count, unk_count = convert_to_ints( target_raw, vocab_to_int, word_count, unk_count) int_texts, word_count, unk_count = convert_to_ints(x_raw, vocab_to_int, word_count, unk_count, eos=True) int_test_summaries, word_count, unk_count = convert_to_ints( target_test_raw, vocab_to_int, word_count, unk_count) int_test_texts, word_count, unk_count = convert_to_ints(x_test_raw, vocab_to_int, word_count, unk_count, eos=True) unk_percent = round(unk_count / word_count, 4) * 100 logging.info("Total number of words in texts: {}".format(word_count)) logging.info("Total number of UNKs in texts: {}".format(unk_count)) logging.info("Percent of words that are UNK: {0:.2f}%".format(unk_percent)) """Step 1: pad each sentence to the same length and map each word to an id""" x_int = pad_sentence_batch(vocab_to_int, int_texts) target_int = pad_sentence_batch(vocab_to_int, int_summaries) x_test_int = pad_sentence_batch(vocab_to_int, int_test_texts) target_test_int = pad_sentence_batch(vocab_to_int, int_test_summaries) x = np.array(x_int) y = np.array(y_raw) x_test = np.array(x_test_int) y_test = np.array(y_test_raw) target = np.array(target_int) target_test = 
np.array(target_test_int) t = np.array(list(len(x) for x in x_int)) t_test = np.array(list(len(x) for x in x_test_int)) s = np.array(list(params['max_summary_length'] for x in x_int)) s_test = np.array(list(params['max_summary_length'] for x in x_test_int)) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] target_shuffled = target[shuffle_indices] t_shuffled = t[shuffle_indices] s_shuffled = s[shuffle_indices] x_train, x_dev, y_train, y_dev, target_train, target_dev, t_train, t_dev, s_train, s_dev = train_test_split( x_shuffled, y_shuffled, target_shuffled, t_shuffled, s_shuffled, test_size=0.1) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) logging.info('target_train: {}, target_dev: {}, target_test: {}'.format( len(target_train), len(target_dev), len(target_test))) logging.info('t_train: {}, t_dev: {}, t_test: {}'.format( len(t_train), len(t_dev), len(t_test))) logging.info('s_train: {}, s_dev: {}, s_test: {}'.format( len(s_train), len(s_dev), len(s_test))) """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = seq2CNN(num_classes=y_train.shape[1], filter_sizes=filter_sizes, max_summary_length=params['max_summary_length'], rnn_size=params['rnn_size'], vocab_to_int=vocab_to_int, num_filters=params['num_filters'], vocab_size=len(vocab_to_int), embedding_size=params['embedding_dim']) global_step = tf.Variable(0, name="global_step", trainable=False) num_batches_per_epoch = int( (len(x_train) - 1) / params['batch_size']) + 1 epsilon = params['epsilon'] learning_rate = tf.train.exponential_decay(params['learning_rate'], global_step, num_batches_per_epoch, 0.95, staircase=True) optimizer = tf.train.AdamOptimizer(learning_rate, epsilon) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) gradients, variables = zip(*optimizer.compute_gradients(cnn.loss)) gradients, _ = tf.clip_by_global_norm(gradients, 5.0) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, dataset_name + "_" + timestamp)) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") #for tensorboard train_writer = tf.summary.FileWriter( '/home/tgisaturday/Workspace/Taehoon/VGG_text_cnn/seq2CNN' + '/graphs/train/' + dataset_name + '_' + timestamp, sess.graph) test_writer = tf.summary.FileWriter( '/home/tgisaturday/Workspace/Taehoon/VGG_text_cnn/seq2CNN' + '/graphs/test/' + dataset_name + '_' + timestamp) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch, target_batch, t_batch, s_batch, seq_lambda): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.targets: target_batch, cnn.text_length: t_batch, 
cnn.summary_length: s_batch, cnn.batch_size: len(x_batch), cnn.dropout_keep_prob: params['dropout_keep_prob'], cnn.seq_lambda: seq_lambda, cnn.is_training: True } summary, _, logits, step, loss, seq_loss, cnn_loss, acc = sess.run( [ cnn.merged, train_op, cnn.training_logits, global_step, cnn.loss, cnn.seq_loss, cnn.cnn_loss, cnn.accuracy ], feed_dict) current_step = tf.train.global_step(sess, global_step) train_writer.add_summary(summary, current_step) return loss, seq_loss, cnn_loss, acc, logits # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch, target_batch, t_batch, s_batch, seq_lambda): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.targets: target_batch, cnn.text_length: t_batch, cnn.summary_length: s_batch, cnn.batch_size: len(x_batch), cnn.dropout_keep_prob: 1.0, cnn.seq_lambda: seq_lambda, cnn.is_training: False } summary, step, loss, seq_loss, acc, num_correct, examples = sess.run( [ cnn.merged, global_step, cnn.loss, cnn.seq_loss, cnn.accuracy, cnn.num_correct, cnn.inference_logits ], feed_dict) if watch_rnn_output == True: pad = vocab_to_int['PAD'] result = " ".join( [int_to_vocab[j] for j in examples[0] if j != pad]) logging.info('{}'.format(result)) current_step = tf.train.global_step(sess, global_step) test_writer.add_summary(summary, current_step) return num_correct # Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter( list(zip(x_train, y_train, target_train, t_train, s_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch = zip( *train_batch) current_step = tf.train.global_step(sess, global_step) seq_lambda = exponential_lambda_decay(params['seq_lambda'], current_step, num_batches_per_epoch, 0.95, staircase=True) #seq_lambda = params['seq_lambda'] train_loss, train_seq_loss, train_cnn_loss, train_acc, examples = train_step( x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch, seq_lambda) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: logging.critical( 'step: {} accuracy: {:0.6f} learning_rate: {:0.6f} seq_lambda: {:0.6f} loss: {:0.6f} seq_loss: {:0.6f} cnn_loss: {:0.6f}' .format(current_step, train_acc, learning_rate.eval(), seq_lambda, train_loss, train_seq_loss, train_cnn_loss)) pad = vocab_to_int['PAD'] result = " ".join( [int_to_vocab[j] for j in examples[0] if j != pad]) logging.info('{}'.format(result)) dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev, target_dev, t_dev, s_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch, target_dev_batch, t_dev_batch, s_dev_batch = zip( *dev_batch) num_dev_correct = dev_step(x_dev_batch, y_dev_batch, target_dev_batch, t_dev_batch, s_dev_batch, seq_lambda) total_dev_correct += num_dev_correct dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical( 'Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy on dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, 
global_step=current_step) logging.critical('Saved model at {} at step {}'.format( path, best_at_step)) logging.critical( 'Best accuracy is {} at step {}'.format( best_accuracy, best_at_step)) """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter( list(zip(x_test, y_test, target_test, t_test, s_test)), params['batch_size'], 1) total_test_correct = 0 watch_rnn_output = True start = time.time() for test_batch in test_batches: x_test_batch, y_test_batch, target_test_batch, t_test_batch, s_test_batch = zip( *test_batch) num_test_correct = dev_step(x_test_batch, y_test_batch, target_test_batch, t_test_batch, s_test_batch, seq_lambda) total_test_correct += num_test_correct path = saver.save(sess, checkpoint_prefix) test_accuracy = float(total_test_correct) / len(y_test) logging.critical( "\nExecution time for testing = {0:.6f}".format(time.time() - start)) logging.critical( 'Accuracy on test set is {} based on the best model {}'.format( test_accuracy, path)) logging.critical('The training is complete')
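# `exponential_lambda_decay`, used above to anneal seq_lambda, is not shown in this excerpt.
# A plausible plain-Python analogue of tf.train.exponential_decay, inferred only from the
# call site exponential_lambda_decay(initial, step, decay_steps, 0.95, staircase=True):
import math

def exponential_lambda_decay(initial_lambda, global_step, decay_steps, decay_rate, staircase=False):
    exponent = global_step / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return initial_lambda * (decay_rate ** exponent)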
tf.flags.DEFINE_integer('ckpt_interval', 1000, 'save the model after the given number of training steps')

# Data Parameters
tf.flags.DEFINE_string('train_pos_file', 'twitter-datasets/train_pos.txt', "the path of positive training data")
tf.flags.DEFINE_string('train_neg_file', 'twitter-datasets/train_neg.txt', "the path of negative training data")
tf.flags.DEFINE_string('embedding_path', 'twitter-datasets/glove.6B.50d.txt', "the path for embeddings")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
for attr, value in FLAGS.__flags.items():
    print("{}={}".format(attr, value))

# Data Preparation
x_text, y = load_data_and_labels(FLAGS.train_pos_file, FLAGS.train_neg_file)

# build dict
max_length = max([len(text.strip().split(' ')) for text in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
print("data prepared and dict built")

# split train and valid set
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.01, random_state=10)

with tf.Session() as sess:
    model = TextCNN(sequence_length=max_length,
                    num_class=2,
                    vocab_size=len(vocab_processor.vocabulary_),
                    emb_dim=FLAGS.emb_dim,
                    filter_size_list=list(map(int, FLAGS.filter_size_list.split(','))),
from __future__ import print_function

import numpy as np

from data_helper import load_data_and_labels
from model import TextCNN

VALIDATION_SPLIT = 0.1
CORPUS_DIR = './data'
BATCH_SIZE = 32
EPOCHS = 10
EMBEDDING_SIZE = 256
NUM_FILTERS = 128
FILTER_SIZES = [3, 4, 5]

# Load data and labels
data, labels, num_words = load_data_and_labels(CORPUS_DIR)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Training model.')
text_cnn = TextCNN(num_class=y_train.shape[1],
                   num_words=num_words,
                   sequence_length=data.shape[1],
def train_cnn():
    path = ''
    """Step 0: load sentences, labels, and training parameters"""
    train_file = sys.argv[1]
    x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file)
    parameter_file = sys.argv[2]
    params = json.loads(open(parameter_file).read())

    """Step 1: pad each sentence to the same length and map each word to an id"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    logging.info('The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)

    """Step 2: split the original dataset into train and test sets"""
    x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=params['embedding_dim'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "trained_model_" + timestamp))
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver()

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']}
                _, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch):
                feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0}
                step, loss, acc, num_correct = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.num_correct], feed_dict)
                return num_correct

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        num_dev_correct = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical('Accuracy on dev set: {}'.format(dev_accuracy))

                    """Step 6.2: save the model if it is the best based on accuracy on dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model at {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step))

            """Step 7: predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                num_test_correct = dev_step(x_test_batch, y_test_batch)
                total_test_correct += num_test_correct
            test_accuracy = float(total_test_correct) / len(y_test)
            logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path))
            logging.critical('The training is complete')
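# `data_helper.batch_iter` is assumed throughout these scripts but never shown. The widely
# used implementation is a simple shuffle-per-epoch generator; a sketch of that contract
# (not necessarily the project's exact helper):
import numpy as np

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        shuffled_data = data[np.random.permutation(np.arange(data_size))] if shuffle else data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]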
def data_preprocess():
    # Data preprocess
    # =======================================================
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper.load_data_and_labels(FLAGS.data_file)
        # Get embedding vector
        x = x[:1000]
        y = y[:1000]
        sentences, max_document_length = data_helper.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
        else:
            print('w2v model found...')
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir, 'trained_word2vec.model'),
                    file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
        y = np.array(y)
        # np.save(os.path.join(out_dir, "data_x.npy"), x)
        # np.save(os.path.join(out_dir, "data_y.npy"), y)
        del sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))

    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper.saveDict(params, training_params_file)

    # Shuffle data randomly
    # np.random.seed(10)
    # shuffle_indices = np.random.permutation(np.arange(len(y)))
    # x_shuffled = x[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # del x, y
    # x_train, x_test, y_train, y_test = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=42)

    # split into training and testing set 80/20 ratio
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    del x, y
    return x_train, x_test, y_train, y_test
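# `word2vec_helpers.embedding_sentences` is assumed above. A sketch of the likely behaviour:
# train (or load) a gensim Word2Vec model and return one vector per token for each padded
# sentence. Uses the pre-4.0 gensim API (`size=` rather than `vector_size=`); the project's
# real helper may differ.
from gensim.models import Word2Vec

def embedding_sentences_sketch(sentences, embedding_size=128, window=5, min_count=5,
                               file_to_save=None, file_to_load=None):
    if file_to_load:
        w2v_model = Word2Vec.load(file_to_load)
    else:
        w2v_model = Word2Vec(sentences, size=embedding_size, window=window, min_count=min_count)
        if file_to_save:
            w2v_model.save(file_to_save)
    zero_vector = [0.0] * embedding_size
    return [[w2v_model.wv[word].tolist() if word in w2v_model.wv else zero_vector
             for word in sentence] for sentence in sentences]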
# TensorFlow parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_train:
    x_raw, y_test = data_helper.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# load the vocabulary built during training
vocab_path = os.path.join(FLAGS.vocab_dir, "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

# map the raw sentences to word-id arrays
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
def train():
    # prepare data
    positive_file = os.path.join(os.path.dirname(__file__), 'data/rt-polaritydata/rt-polarity.pos')
    negative_file = os.path.join(os.path.dirname(__file__), 'data/rt-polaritydata/rt-polarity.neg')
    data_x, data_y, vocab_size = load_data_and_labels(positive_file, negative_file)

    # generate train_data and validate_data
    validate_index = -1 * int(FLAGS.val_percent * len(data_y))
    x_train, x_val = data_x[:validate_index], data_x[validate_index:]
    y_train, y_val = data_y[:validate_index], data_y[validate_index:]

    with tf.Graph().as_default():
        with tf.Session() as sess:
            model = Model(learning_rate=FLAGS.learning_rate,
                          sequence_length=x_train.shape[1],
                          num_classes=FLAGS.num_classes,
                          vocab_size=vocab_size,
                          embedding_size=FLAGS.embedding_size,
                          filter_sizes=list(map(int, FLAGS.filter_sizes.split(','))),
                          num_filters=FLAGS.num_filters,
                          num_checkpoints=FLAGS.num_checkpoints,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # initialize
            init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
            sess.run(init_op)

            def train_op(x_batch, y_batch):
                loss, accuracy, global_step, summaries, _ = sess.run(
                    [model.loss, model.accuracy, model.global_step, model.train_summary, model.train_op],
                    feed_dict={
                        model.input_x: x_batch,
                        model.output_y: y_batch,
                        model.dropout: FLAGS.dropout_keep_prob
                    })
                print("step: {:d}, loss {:g}, acc {:g}".format(global_step, loss, accuracy))
                # model.train_summary_writer.add_summary(summaries, global_step)
                return global_step

            def val_op(val_x, val_y):
                loss, accuracy, summaries = sess.run(
                    [model.loss, model.accuracy, model.val_summary],
                    feed_dict={
                        model.input_x: val_x,
                        model.output_y: val_y,
                        model.dropout: 1.0
                    })
                print("loss {:g}, acc {:g}".format(loss, accuracy))
                # model.train_summary_writer.add_summary(val_summary, step)

            # train and validate
            # generate batches
            batches = batch_iter(list(zip(x_train, y_train)),
                                 batch_size=FLAGS.batch_size,
                                 num_epochs=FLAGS.num_epochs)
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                x_batch = np.array(x_batch, dtype=np.int32)
                y_batch = np.array(y_batch, dtype=np.int32)
                current_step = train_op(x_batch, y_batch)
                if current_step % FLAGS.evaluate_every == 0:
                    print('Evaluate\n')
                    val_op(val_x=x_val, val_y=y_val)
def train_cnn(): """Step 0: load sentences, labels, and training parameters""" train_file = sys.argv[1] x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file) parameter_file = sys.argv[2] params = json.loads(open(parameter_file).read()) """Step 1: pad each sentence to the same length and map each word to an id""" max_document_length = max([len(x.split(' ')) for x in x_raw]) logging.info('The maximum length of all sentences: {}'.format(max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) """Step 2: split the original dataset into train and test sets""" x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y_))) x_shuffled = x_[shuffle_indices] y_shuffled = y_[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test))) """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN( sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=params['embedding_dim'], filter_sizes=list(map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "trained_model_" + timestamp)) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver() # One training step: train the model with one batch def train_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob']} _, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict) # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch): feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0} step, loss, acc, num_correct = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.num_correct], feed_dict) return num_correct # Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in 
train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) num_dev_correct = dev_step(x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical('Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy on dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model at {} at step {}'.format(path, best_at_step)) logging.critical('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step)) """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) num_test_correct = dev_step(x_test_batch, y_test_batch) total_test_correct += num_test_correct test_accuracy = float(total_test_correct) / len(y_test) logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path)) logging.critical('The training is complete')
def compute_accuracy(x_data, y_data):
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    return sess.run(accuracy, feed_dict={input_x: x_data, input_y: y_data})


learning_rate = 0.1
train_steps = 100
pos_file = "pos.txt"
neg_file = "neg.txt"
dev_sample_percentage = .1
display_steps = 1

print("loading data...")
x_text, y = data_helper.load_data_and_labels(pos_file, neg_file)
max_document_length = max([len(line.split(" ")) for line in x_text])
print("max_document_length = ", max_document_length)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
# print(x)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
def train_cnn(dataset_name): """Step 0: load sentences, labels, and training parameters""" dataset = './dataset/' + dataset_name + '_csv/train.csv' parameter_file = "./parameters.json" params = json.loads(open(parameter_file).read()) learning_rate = params['learning_rate'] if params['enable_max_len'] == 1: enable_max = True else: enable_max = False if params['summary_using_keywords'] == 1: enable_keywords = True else: enable_keywords = False if params['layer_norm'] == 1: layer_norm = True else: layer_norm = False if params['watch_rnn_output'] == 1: watch_rnn_output = True else: watch_rnn_output = False if params['use_he_uniform'] == 1: use_he_uniform = True else: use_he_uniform = False if params['optional_shortcut'] == 1: optional_shortcut = True else: optional_shortcut = False x_raw, y_raw, target_raw, df, labels = data_helper.load_data_and_labels( dataset, params['max_length'], params['max_summary_length'], enable_max, enable_keywords) word_counts = {} count_words(word_counts, x_raw) logging.info("Size of Vocabulary: {}".format(len(word_counts))) # Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better # (https://github.com/commonsense/conceptnet-numberbatch) embeddings_index = {} with open('./dataset/embeddings/numberbatch-en.txt', encoding='utf-8') as f: for line in f: values = line.split(' ') word = values[0] embedding = np.asarray(values[1:], dtype='float32') embeddings_index[word] = embedding max_document_length = max([len(x.split(' ')) for x in x_raw]) # Find the number of words that are missing from CN, and are used more than our threshold. missing_words = 0 threshold = params['min_frequency'] for word, count in word_counts.items(): if count > threshold: if word not in embeddings_index: missing_words += 1 missing_ratio = round(missing_words / len(word_counts), 4) * 100 logging.info("Number of words missing from CN: {}".format(missing_words)) logging.info( "Percent of words that are missing from vocabulary: {0:.2f}%".format( missing_ratio)) #dictionary to convert words to integers """Step 1: pad each sentence to the same length and map each word to an id""" value = 0 vocab_to_int = {} for word, count in word_counts.items(): if count >= threshold: vocab_to_int[word] = value value += 1 # Special tokens that will be added to our vocab codes = ["UNK", "PAD", "EOS", "GO"] # Add codes to vocab for code in codes: vocab_to_int[code] = len(vocab_to_int) # Dictionary to convert integers to words int_to_vocab = {} for word, value in vocab_to_int.items(): int_to_vocab[value] = word usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100 logging.info("Total number of words: {}".format(len(word_counts))) logging.info("Number of words we will use: {}".format(len(vocab_to_int))) logging.info("Percent of words we will use: {0:.2f}%".format(usage_ratio)) # Need to use 300 for embedding dimensions to match CN's vectors. 
embedding_dim = 300 nb_words = len(vocab_to_int) logging.info("Size of vocab_to_int: {}".format(len(vocab_to_int))) # Create matrix with default values of zero word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32) for word, i in vocab_to_int.items(): if word in embeddings_index: word_embedding_matrix[i] = embeddings_index[word] else: # If word not in CN, create a random embedding for it new_embedding = np.array( np.random.uniform(-1.0, 1.0, embedding_dim)) embeddings_index[word] = new_embedding word_embedding_matrix[i] = new_embedding # Check if value matches len(vocab_to_int) logging.info("Size of word embedding matrix: {}".format( len(word_embedding_matrix))) # Apply convert_to_ints to clean_summaries and clean_texts word_count = 0 unk_count = 0 logging.info("text_example: {}".format(x_raw[0])) logging.info("helper_example: {}".format(target_raw[0])) int_summaries, word_count, unk_count = convert_to_ints( target_raw, vocab_to_int, word_count, unk_count) int_texts, word_count, unk_count = convert_to_ints(x_raw, vocab_to_int, word_count, unk_count, eos=True) unk_percent = round(unk_count / word_count, 4) * 100 logging.info("Total number of words in texts: {}".format(word_count)) logging.info("Total number of UNKs in texts: {}".format(unk_count)) logging.info("Percent of words that are UNK: {0:.2f}%".format(unk_percent)) """Step 1: pad each sentence to the same length and map each word to an id""" x_int = pad_sentence_batch(vocab_to_int, int_texts) target_int = pad_sentence_batch(vocab_to_int, int_summaries) x = np.array(x_int) y = np.array(y_raw) target = np.array(target_int) t = np.array(list(len(x) for x in x_int)) max_summary_length = max([len(sentence) for sentence in target_int]) s = np.array(list(max_summary_length for x in x_int)) """Step 2: split the original dataset into train and test sets""" x_, x_test, y_, y_test, target_, target_test, t_, t_test, s_, s_test = train_test_split( x, y, target, t, s, test_size=0.1, random_state=42) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y_))) x_shuffled = x_[shuffle_indices] y_shuffled = y_[shuffle_indices] target_shuffled = target_[shuffle_indices] t_shuffled = t_[shuffle_indices] s_shuffled = s_[shuffle_indices] x_train, x_dev, y_train, y_dev, target_train, target_dev, t_train, t_dev, s_train, s_dev = train_test_split( x_shuffled, y_shuffled, target_shuffled, t_shuffled, s_shuffled, test_size=0.1) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) logging.info('target_train: {}, target_dev: {}, target_test: {}'.format( len(target_train), len(target_dev), len(target_test))) logging.info('t_train: {}, t_dev: {}, t_test: {}'.format( len(t_train), len(t_dev), len(t_test))) logging.info('s_train: {}, s_dev: {}, s_test: {}'.format( len(s_train), len(s_dev), len(s_test))) """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = seq2CNN(embeddings=word_embedding_matrix, num_classes=y_train.shape[1], max_summary_length=max_summary_length, 
rnn_size=params['rnn_size'], rnn_num_layers=params['rnn_num_layers'], vocab_to_int=vocab_to_int, num_filters=params['num_filters'], vocab_size=len(vocab_to_int), embedding_size=300, layer_norm=layer_norm, depth=params['VDCNN_depth'], downsampling_type=params['downsampling_type'], use_he_uniform=use_he_uniform, optional_shortcut=optional_shortcut) global_step = tf.Variable(0, name="global_step", trainable=False) num_batches_per_epoch = int( (len(x_train) - 1) / params['batch_size']) + 1 epsilon = params['epsilon'] learning_rate = tf.train.exponential_decay(params['learning_rate'], global_step, params['num_epochs'] * num_batches_per_epoch, 0.95, staircase=True) optimizer = tf.train.AdamOptimizer(learning_rate, epsilon) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) cnn_gradients, cnn_variables = zip( *optimizer.compute_gradients(cnn.loss)) seq_gradients, seq_variables = zip( *optimizer.compute_gradients(cnn.seq_loss)) cnn_gradients, _ = tf.clip_by_global_norm(cnn_gradients, 7.0) seq_gradients, _ = tf.clip_by_global_norm(seq_gradients, 7.0) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(zip( cnn_gradients, cnn_variables), global_step=global_step) seq_train_op = optimizer.apply_gradients( zip(seq_gradients, seq_variables), global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "result_" + timestamp)) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch, target_batch, t_batch, s_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.targets: target_batch, cnn.text_length: t_batch, cnn.summary_length: s_batch, cnn.batch_size: len(x_batch), cnn.dropout_keep_prob: params['dropout_keep_prob'], cnn.is_training: True } _, logits, step, loss, seq_loss, acc = sess.run([ train_op, cnn.training_logits, global_step, cnn.loss, cnn.seq_loss, cnn.accuracy ], feed_dict) return loss, seq_loss, acc def seq_train_step(x_batch, y_batch, target_batch, t_batch, s_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.targets: target_batch, cnn.text_length: t_batch, cnn.summary_length: s_batch, cnn.batch_size: len(x_batch), cnn.dropout_keep_prob: params['dropout_keep_prob'], cnn.is_training: True } _, logits, step, loss, seq_loss, acc = sess.run([ seq_train_op, cnn.training_logits, global_step, cnn.loss, cnn.seq_loss, cnn.accuracy ], feed_dict) return loss, seq_loss, acc # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch, target_batch, t_batch, s_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.targets: target_batch, cnn.text_length: t_batch, cnn.summary_length: s_batch, cnn.batch_size: len(x_batch), cnn.dropout_keep_prob: 1.0, cnn.is_training: False } step, loss, seq_loss, acc, num_correct, examples = sess.run([ global_step, cnn.loss, cnn.seq_loss, cnn.accuracy, cnn.num_correct, cnn.training_logits ], feed_dict) if watch_rnn_output == True: pad = vocab_to_int['PAD'] result = " ".join( [int_to_vocab[j] for j in examples[0] if j != pad]) logging.info('{}'.format(result)) return num_correct sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter( list(zip(x_train, y_train, target_train, t_train, s_train)), 
params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch = zip( *train_batch) train_loss, train_seq_loss, train_acc = train_step( x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch) current_step = tf.train.global_step(sess, global_step) train_loss, train_seq_loss, train_acc = seq_train_step( x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch) if current_step % params['evaluate_every'] == 0: logging.critical( 'step: {} accuracy: {} cnn_loss: {} seq_loss: {}'. format(current_step, train_acc, train_loss, train_seq_loss)) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev, target_dev, t_dev, s_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch, target_dev_batch, t_dev_batch, s_dev_batch = zip( *dev_batch) num_dev_correct = dev_step(x_dev_batch, y_dev_batch, target_dev_batch, t_dev_batch, s_dev_batch) total_dev_correct += num_dev_correct dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical( 'Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy on dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model at {} at step {}'.format( path, best_at_step)) logging.critical( 'Best accuracy is {} at step {}'.format( best_accuracy, best_at_step)) """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter( list(zip(x_test, y_test, target_test, t_test, s_test)), params['batch_size'], 1) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch, target_test_batch, t_test_batch, s_test_batch = zip( *test_batch) num_test_correct = dev_step(x_test_batch, y_test_batch, target_test_batch, t_test_batch, s_test_batch) total_test_correct += num_test_correct path = saver.save(sess, checkpoint_prefix) test_accuracy = float(total_test_correct) / len(y_test) logging.critical( 'Accuracy on test set is {} based on the best model {}'.format( test_accuracy, path)) logging.critical('The training is complete')
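# Both seq2CNN scripts above call `convert_to_ints` and `pad_sentence_batch` without showing
# them. Sketches consistent with the call sites (signatures and behaviour inferred, not the
# originals): map words to ids while counting UNK replacements, and pad a batch with PAD ids.
def convert_to_ints_sketch(texts, vocab_to_int, word_count, unk_count, eos=False):
    ints = []
    for sentence in texts:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["UNK"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["EOS"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

def pad_sentence_batch_sketch(vocab_to_int, sentence_batch):
    max_sentence = max(len(sentence) for sentence in sentence_batch)
    return [sentence + [vocab_to_int["PAD"]] * (max_sentence - len(sentence))
            for sentence in sentence_batch]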
tf.flags.DEFINE_string("training_file_neg", "twitter-datasets/train_neg.txt", "Path and name for the training file (neg examples)") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = data_helper.load_data_and_labels(FLAGS.training_file_pos, FLAGS.training_file_neg) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: It's better to use cross-validation
def train_cnn(): """Step 0: load sentences, labels, and training parameters""" train_file = '../data/iseardataset.csv' x_raw, y_raw, df, labels, embedding_mat = data_helper.load_data_and_labels( train_file) parameter_file = '../training_config.json' params = json.loads(open(parameter_file).read()) """Step 1: pad each sentence to the same length and map each word to an id""" max_document_length = max([len(x.split(' ')) for x in x_raw]) logging.info( 'The maximum length of all sentences: {}'.format(max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) # print x.shape """Step 2: split the original dataset into train and test sets""" x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.2, random_state=42) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y_))) x_shuffled = x_[shuffle_indices] y_shuffled = y_[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.2) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=9000, embedding_size=params['embedding_dim'], filter_sizes=list( map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], embedding_mat=embedding_mat, l2_reg_lambda=params['l2_reg_lambda']) # Optimizing our loss function using Adam's optimizer global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "trained_model_" + timestamp)) print("Writing to {}\n".format(out_dir)) # Summary for predictions # predictions_summary = tf.summary.scalar("predictions", cnn.predictions) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 
dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'] } _, step, summaries, loss, acc = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, acc)) train_summary_writer.add_summary(summaries, step) # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch, writer=None): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summaries, loss, acc, num_correct, predictions = \ sess.run([global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.num_correct, cnn.predictions], feed_dict) if writer: writer.add_summary(summaries, step) return num_correct, predictions # Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) sess.run(tf.global_variables_initializer()) print "Loading Embeddings !" embedding_dimension = 200 embedding_dir = '../embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt' # embedding_dir = '../GoogleNews-vectors-negative300.bin' initW = data_helper.load_embedding_vectors_glove( vocab_processor.vocabulary_, embedding_dir, embedding_dimension) # initW = data_helper.load_embedding_vectors_word2vec(vocab_processor.vocabulary_, embedding_dir, embedding_dimension) sess.run(cnn.W.assign(initW)) print "Loaded Embeddings !" 
# Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: if len(train_batch) == 0: continue x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: print("\nEvaluation:") dev_step(x_dev, y_dev, writer=dev_summary_writer) print("") dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: if len(dev_batch) == 0: continue x_dev_batch, y_dev_batch = zip(*dev_batch) num_dev_correct, y_pred_tre = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical( 'Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy of the dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) classes = [ "joy", "fear", "anger", "sadness", "disgust", "shame", "guilt" ] """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1) total_test_correct = 0 for test_batch in test_batches: if len(test_batch) == 0: continue print "Non Zero Length" x_test_batch, y_test_batch = zip(*test_batch) num_test_correct, y_pred = dev_step(x_test_batch, y_test_batch) total_test_correct += num_test_correct test_accuracy = (float(total_test_correct) / len(y_test)) * 100 train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], 1) total_train_correct = 0 for train_batch in train_batches: if len(train_batch) == 0: continue print "Non Zero Length" x_train_batch, y_train_batch = zip(*train_batch) num_test_correct, y_ = dev_step(x_train_batch, y_train_batch) total_train_correct += num_test_correct train_accuracy = (float(total_train_correct) / len(y_train)) * 100 print 'Accuracy on test set is {} based on the best model'.format( test_accuracy) print 'Accuracy on train set is {} based on the best model'.format( train_accuracy) # logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path)) print(len(y_test_batch)) print(y_test_batch[0]) print(len(y_pred)) print(y_pred[0]) # Y_test = np.argmax(y_test_batch, axis=1) # y_pred_class = np.argmax(y_pred, axis=1) print(classification_report(y_test_batch, y_pred, target_names=classes)) # # Create confusion matrix # cnf_matrix = confusion_matrix(Y_test, y_pred_class) # plt.figure(figsize=(20, 10)) # data_helper.plot_confusion_matrix(cnf_matrix, labels=classes) logging.critical('The training is complete')
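# `data_helper.load_embedding_vectors_glove` (used above to initialise cnn.W) is not shown.
# A sketch of the usual approach: random-initialise the matrix, then overwrite rows for every
# vocabulary word found in the GloVe text file. The signature is inferred from the call site,
# and `vocabulary` is assumed to map word -> id (e.g. the processor's vocabulary_).
import numpy as np

def load_embedding_vectors_glove_sketch(vocabulary, filename, vector_size):
    embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
    with open(filename, encoding="utf-8") as f:
        for line in f:
            values = line.rstrip().split(" ")
            word = values[0]
            idx = vocabulary.get(word)
            if idx:
                embedding_vectors[idx] = np.asarray(values[1:], dtype="float32")
    return embedding_vectors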
import os
import csv

import numpy as np
import data_helper
from tensorflow.contrib import learn

# Change this: load your own data here
positive_data_file = './data/rt-polarity.pos'
negative_data_file = './data/rt-polarity.neg'
if_eval = True
checkpoint_dir = './runs/1548567747'
allow_soft_placement = True
log_device_placement = False
batch_size = 16

if if_eval:
    x_raw, y_test = data_helper.load_data_and_labels(positive_data_file, negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ['a masterpiece four years in the making', 'everything is off.']
    y_test = [1, 0]

# map data into vocabulary
vocab_path = os.path.join(checkpoint_dir, 'vocab')
print(vocab_path)
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print('\n Evaluating...\n')

# Evaluation
# ====================================
batch_size = 16
num_epochs = 10
evaluate_every = 100
checkpoint_every = 100
num_checkpoints = 5
allow_soft_placement = True
log_device_placement = False
filter_sizes = '3,4,5'
num_filters = 128

# Data Preparation
# ==============================================

# Load data: returns the dataset and the labels
print('Loading data...')
x_text, y = data_helper.load_data_and_labels(positive_data_file, negative_data_file)

# Build vocabulary (word-to-id dictionary)
# Take the maximum document length in words; shorter documents are padded with 0
max_document_length = max([len(x.split(' ')) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))  # word-to-id matrix, zero-padded

# Randomly shuffle the dataset
np.random.seed(32)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffle = x[shuffle_indices]
y_shuffle = y[shuffle_indices]
def train_cnn(): """Step 0: load sentences, labels, and training parameters""" train_file = "C:\\Users\\s1761548\\Downloads\\NPS\\nps(New)\\New NPS\\nps_sentiment_training.zip" x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file) parameter_file = "C:\\Code_Sketch\\NPS\\S3134076\\PycharmProjects\\nps\\parameters_sentiment.json" params = json.loads(open(parameter_file).read()) """Step 1: pad each sentence to the same length and map each word to an id""" max_document_length = max([len(x.split(' ')) for x in x_raw]) logging.info( 'The maximum length of all sentences: {}'.format(max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) """Step 2: split the original dataset into train and test sets""" x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y_))) x_shuffled = x_[shuffle_indices] y_shuffled = y_[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1) """Step 4: save the labels into labels.json since predict_fraud.py needs it""" with open('./labels_sentiment.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) for num_epoch in [25]: params['num_epochs'] = num_epoch for batch_size in [30]: params['batch_size'] = batch_size for l2 in [0.0]: params['l2_reg_lambda'] = l2 """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN( sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=params['embedding_dim'], filter_sizes=list( map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients( grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge( grad_summaries) timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "trained_model_" + timestamp)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar( "accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join( out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph_def) # Dev summaries dev_summary_op = tf.summary.merge( [loss_summary, 
acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter( dev_summary_dir, sess.graph_def) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join( checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'] } _, step, summaries, loss, acc = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) train_summary_writer.add_summary(summaries, step) # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch, writer=None): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summaries, loss, acc, num_correct = sess.run( [ global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.num_correct ], feed_dict) if writer: writer.add_summary(summaries, step) return num_correct # Save the word_to_id map since predict_fraud.py needs it vocab_processor.save( os.path.join(out_dir, "vocab_sentiment.pickle")) sess.run(tf.initialize_all_variables()) # Training starts here train_batches = data_helper.batch_iter( list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step( sess, global_step) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" print("hello1") print(current_step) print(params['evaluate_every']) if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) num_dev_correct = dev_step( x_dev_batch, y_dev_batch, writer=dev_summary_writer) total_dev_correct += num_dev_correct dev_accuracy = float(total_dev_correct) / len( y_dev) print("hello2") logging.critical( 'Accuracy on dev set: {}'.format( dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy of the dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical( 'Saved model {} at step {}'.format( path, best_at_step)) logging.critical( 'Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter( list(zip(x_test, y_test)), params['batch_size'], 1) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) num_test_correct = dev_step( x_test_batch, y_test_batch) total_test_correct += num_test_correct test_accuracy = float(total_test_correct) / len(y_test) logging.critical( 'Accuracy on test set is {} based on the best model {}' .format(test_accuracy, path)) logging.critical('The training is complete')
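# Every training loop in this file iterates over data_helper.batch_iter. A hedged,
# generator-style sketch of such a helper (the per-epoch reshuffle is an assumption):
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Hypothetical sketch: yield mini-batches of (x, y) pairs for num_epochs epochs."""
    data = np.array(data, dtype=object)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffled_data = data[np.random.permutation(np.arange(data_size))]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]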
from text_cnn import TextCNN
from config import FLAGS
import tensorflow as tf
import data_helper

x_test_data, y_test = data_helper.load_data_and_labels(FLAGS.test_data_file, FLAGS.test_label_file)
padded_sentences_test, max_padding_length = data_helper.padding_sentence(
    sentences=x_test_data,
    padding_sentence_length=FLAGS.padding_sentence_length,
    padding_move=FLAGS.padding_move)
x_test, vocabulary_len = data_helper.embedding_sentences(
    embedding_file=FLAGS.embedding_file,
    padded_sentences=padded_sentences_test,
    embedding_dimension=FLAGS.embedding_dimension)
print("x_test.shape = {}".format(x_test.shape))
print("y_test.shape = {}".format(y_test.shape))

cnn = TextCNN(sequence_length=FLAGS.padding_sentence_length,
              num_classes=FLAGS.num_classes,
              embedding_dimension=FLAGS.embedding_dimension,
              filter_sizes=list(map(int, FLAGS.filter_size.split(','))),
              num_filters=FLAGS.num_filters,
              l2_reg_lambda=FLAGS.L2_reg_lambda)

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_save_path))
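    # Hedged continuation sketch (not part of the original snippet): run one
    # evaluation pass after the restore. The attribute names input_x, input_y,
    # dropout_keep_prob and accuracy mirror the other TextCNN snippets in this
    # file and are assumptions about this particular text_cnn module.
    feed_dict = {
        cnn.input_x: x_test,
        cnn.input_y: y_test,
        cnn.dropout_keep_prob: 1.0,  # disable dropout at evaluation time
    }
    test_accuracy = sess.run(cnn.accuracy, feed_dict)
    print("Accuracy on test set: {:g}".format(test_accuracy))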
def train_cnn(): """Step 0: 加载数据和参数""" train_file = sys.argv[1] x_raw, y_raw, _, labels = data_helper.load_data_and_labels(train_file) parameter_file = sys.argv[2] params = json.loads(open(parameter_file).read()) """Step 1: 完成单词到ID的映射,一行为一个sequence""" max_document_length = max([len(x.split(' ')) for x in x_raw]) logging.info( 'The maximum length of all sentences: {}'.format(max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) """Step 2: _x和测试集""" x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42) """Step 3: 将_x分为训练集和验证集""" shuffle_indices = np.random.permutation(np.arange(len(y_))) x_shuffled = x_[shuffle_indices] y_shuffled = y_[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) #------------------------------------------------------------------------------------------------------------------- """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): # tf.ConfigProto一般用在创建session的时候。用来对session进行参数配置 #log_device_placement=True : 是否打印设备分配日志 #allow_soft_placement=True : 如果你指定的设备不存在,允许TF自动分配设备 session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=params['embedding_dim'], filter_sizes=list( map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda']) #学习率 global_step = tf.Variable(0, name="global_step", trainable=False) learning_rate = tf.train.exponential_decay(1e-3, global_step, 1000, 0.99, staircase=True) #优化器,选用最优的adam,速度快,效果稳定 可以直接这样用:train_step = tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step) optimizer = tf.train.AdamOptimizer(learning_rate) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) #保存模型和断点的路径 timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "trained_model_" + timestamp)) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch, train_summary_op): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'] } _, step, train_summary = sess.run( [train_op, global_step, train_summary_op], feed_dict) return train_summary # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch, dev_summary_op): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, dev_summary, num_correct = sess.run( [global_step, dev_summary_op, cnn.num_correct], feed_dict) return dev_summary, num_correct # 可视化 
loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) train_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) train_summary_dir = os.path.join(out_dir, "logs", "train") dev_summary_dir = os.path.join(out_dir, "logs", "dev") sess.run(tf.global_variables_initializer()) # 保存单词表用于预测Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) # 可视化 train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: 训练train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_summary = train_step(x_train_batch, y_train_batch, train_summary_op) current_step = tf.train.global_step(sess, global_step) if current_step % 100 == 0: train_summary_writer.add_summary(train_summary, current_step) """Step 6.1: 用验证集评价模型evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) dev_summary, num_dev_correct = dev_step( x_dev_batch, y_dev_batch, dev_summary_op) total_dev_correct += num_dev_correct dev_summary_writer.add_summary(dev_summary, current_step) dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical( 'Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2:保存模型save the model if it is the best based on accuracy of the dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, best_at_step)) logging.critical('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) """Step 7: 预测predict x_test (batch by batch)""" test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1) total_test_correct = 0 for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) _, num_test_correct = dev_step(x_test_batch, y_test_batch, dev_summary_op) total_test_correct += num_test_correct test_accuracy = float(total_test_correct) / len(y_test) logging.critical( 'Accuracy on test set is {} based on the best model {}'.format( test_accuracy, path)) logging.critical('complete!')
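# This variant saves vocab.pickle, labels.json and the best checkpoint under
# trained_model_<timestamp>. A hedged sketch of how the prediction side can load
# those artifacts back; the timestamp path and the tensor names fetched from the
# graph are assumptions, not the project's actual predict.py.
import json
import tensorflow as tf
from tensorflow.contrib import learn

checkpoint_dir = './trained_model_1500000000/checkpoints'   # placeholder path
vocab_path = './trained_model_1500000000/vocab.pickle'      # placeholder path

labels = json.load(open('./labels.json'))
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)

graph = tf.Graph()
with graph.as_default():
    sess = tf.Session()
    with sess.as_default():
        # Rebuild the saved graph and restore the best weights
        saver = tf.train.import_meta_graph('{}.meta'.format(checkpoint_file))
        saver.restore(sess, checkpoint_file)
        # Tensor names below are assumed from the usual TextCNN layout
        input_x = graph.get_operation_by_name('input_x').outputs[0]
        dropout_keep_prob = graph.get_operation_by_name('dropout_keep_prob').outputs[0]
        predictions = graph.get_operation_by_name('output/predictions').outputs[0]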
# validate training params file
training_params_file = os.path.join(FLAGS.checkpoint_dir, "..", "training_params.pickle")
if not os.path.exists(training_params_file):
    # Abort: evaluation cannot proceed without the parameters saved at training time
    raise IOError("Training params file '{}' is missing!".format(training_params_file))
print("Using training params file : {}".format(training_params_file))

# Load params
params = data_helper.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train and FLAGS.single_url is None:
    x_raw, y_test = data_helper.load_data_and_labels(FLAGS.input_text_file)
elif FLAGS.single_url is not None:
    x_raw = [FLAGS.single_url]
    y_test = None
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get Embedding vector x_test
sentences, max_document_length = data_helper.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))
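# data_helper.loadDict is expected to return the parameter dict that was pickled
# at training time; a minimal sketch under that assumption (the helper name and
# file come from the snippet above, the body is a guess):
import pickle

def loadDict(dict_file):
    """Hypothetical sketch: load the pickled training-parameter dict."""
    with open(dict_file, 'rb') as f:
        return pickle.load(f)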
def train_cnn(): FLAGS = tf.flags.FLAGS with open("config.yml", 'r') as ymlfile: cfg = yaml.load(ymlfile) if FLAGS.enable_word_embeddings and cfg['word_embeddings'][ 'default'] is not None: embedding_name = cfg['word_embeddings']['default'] embedding_dimension = cfg['word_embeddings'][embedding_name][ 'dimension'] else: embedding_dimension = 300 filename = "./sun_firefox.csv.zip" x_raw, y_raw, df, labels = data_helper.load_data_and_labels(filename) #print(x_raw[0]) parameter_file = sys.argv[2] params = json.loads(open(parameter_file).read()) """Step 1: pad each sentence to the same length and map each word to an id""" max_document_length = max([len(x.split(' ')) for x in x_raw]) #print(max_document_length) logging.info( 'The maximum length of all sentences: {}'.format(max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) #x=np.array(x_raw) #y = np.array(y_raw) """Step 2: split the original dataset into train and test sets""" #x_, x_test, y_, y_test = train_test_split(x_raw, y_raw, test_size=0.1) #print(x.shape) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1, random_state=1) #print(x_train.shape) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev))) logging.info('y_train: {}, y_dev: {}'.format(len(y_train), len(y_dev))) """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=params['embedding_dim'], filter_sizes=list( map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(cnn.learning_rate) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "netbeans_trained_model_" + timestamp)) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch, learning_rate): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'], cnn.learning_rate: learning_rate } _, step, loss, acc, k_2_accuracy, k_3_accuracy, k_4_accuracy, k_5_accuracy, k_6_accuracy, k_7_accuracy, k_8_accuracy, k_9_accuracy, k_10_accuracy = sess.run( [ train_op, global_step, cnn.loss, cnn.accuracy, cnn.k_2_accuracy, cnn.k_3_accuracy, cnn.k_4_accuracy, cnn.k_5_accuracy, cnn.k_6_accuracy, cnn.k_7_accuracy, cnn.k_8_accuracy, cnn.k_9_accuracy, 
cnn.k_10_accuracy ], feed_dict) print( "Train Step: step {}, loss {:g}, acc {:g},Top-2-Accuracy{:g},Top-3-Accuracy{:g},Top-4-Accuracy{:g}, Top-5-Accuracy{:g}, Top-6-Accuracy{:g}, Top-7-Accuracy{:g}, Top-8-Accuracy{:g}, Top-9-Accuracy{:g}, Top-10-Accuracy{:g}" .format(step, loss, acc, k_2_accuracy, k_3_accuracy, k_4_accuracy, k_5_accuracy, k_6_accuracy, k_7_accuracy, k_8_accuracy, k_9_accuracy, k_10_accuracy)) # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, loss, acc, k_2_accuracy, k_3_accuracy, k_4_accuracy, k_5_accuracy, k_6_accuracy, k_7_accuracy, k_8_accuracy, k_9_accuracy, k_10_accuracy, num_correct, scores, k_2_num_correct, k_3_num_correct, k_4_num_correct, k_5_num_correct, k_6_num_correct, k_7_num_correct, k_8_num_correct, k_9_num_correct, k_10_num_correct = sess.run( [ global_step, cnn.loss, cnn.accuracy, cnn.k_2_accuracy, cnn.k_3_accuracy, cnn.k_4_accuracy, cnn.k_5_accuracy, cnn.k_6_accuracy, cnn.k_7_accuracy, cnn.k_8_accuracy, cnn.k_9_accuracy, cnn.k_10_accuracy, cnn.num_correct, cnn.scores, cnn.k_2_num_correct, cnn.k_3_num_correct, cnn.k_4_num_correct, cnn.k_5_num_correct, cnn.k_6_num_correct, cnn.k_7_num_correct, cnn.k_8_num_correct, cnn.k_9_num_correct, cnn.k_10_num_correct ], feed_dict) #top_k_predications=tf.nn.top_k(scores,5) #print(num_correct) #print(k_num_correct) print( "Dev Step: step {}, loss {:g}, acc {:g},Top-2-Accuracy{:g},Top-3-Accuracy{:g},Top-4-Accuracy{:g}, Top-5-Accuracy{:g}, Top-6-Accuracy{:g}, Top-7-Accuracy{:g}, Top-8-Accuracy{:g}, Top-9-Accuracy{:g}, Top-10-Accuracy{:g}" .format(step, loss, acc, k_2_accuracy, k_3_accuracy, k_4_accuracy, k_5_accuracy, k_6_accuracy, k_7_accuracy, k_8_accuracy, k_9_accuracy, k_10_accuracy)) return num_correct, k_2_num_correct, k_3_num_correct, k_4_num_correct, k_5_num_correct, k_6_num_correct, k_7_num_correct, k_8_num_correct, k_9_num_correct, k_10_num_correct # Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab")) sess.run(tf.global_variables_initializer()) # GLoVE Embedding #if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None: vocabulary = vocab_processor.vocabulary_ # initW = None # if embedding_name == 'word2vec': # print("Load word2vec file {}".format(cfg['word_embeddings']['word2vec']['path'])) # initW = data_helper.load_embedding_vectors_word2vec(vocabulary,cfg['word_embeddings']['word2vec']['path'],cfg['word_embeddings']['word2vec']['binary']) # print("word2vec file has been loaded") # elif embedding_name == 'glove': # print("Load glove file {}".format(cfg['word_embeddings']['glove']['path'])) # initW = data_helper.load_embedding_vectors_glove(vocabulary,cfg['word_embeddings']['glove']['path'],embedding_dimension) # print("glove file has been loaded\n") # elif embedding_name == 'elmo': # print("Loading Elmo Model") #url = "https://tfhub.dev/google/elmo/2" #embed = hub.Module(url, trainable=True) #initW = embed(tf.reshape(tf.cast(x_train, tf.string), [-1]), signature="default", as_dict=True)['default'] #initW = embed(tf.squeeze(tf.cast(vocabulary, tf.string)), signature="default", as_dict=True)['default'] #print (initW) #sess.run(cnn.W.assign(initW)) # It uses dynamic learning rate with a high value at the beginning to speed up the training max_learning_rate = 0.005 min_learning_rate = 0.0001 decay_speed = FLAGS.decay_coefficient * len( y_train) / params['batch_size'] # Training starts here train_batches = 
data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 counter = 0 #start_time=gmtime(); """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: #learning_rate = 0.001 learning_rate = min_learning_rate + ( max_learning_rate - min_learning_rate) * math.exp( -counter / decay_speed) counter += 1 x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch, learning_rate) current_step = tf.train.global_step(sess, global_step) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 k_2_total_dev_correct = 0 k_3_total_dev_correct = 0 k_4_total_dev_correct = 0 k_5_total_dev_correct = 0 k_6_total_dev_correct = 0 k_7_total_dev_correct = 0 k_8_total_dev_correct = 0 k_9_total_dev_correct = 0 k_10_total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) num_dev_correct, k_2_num_dev_correct, k_3_num_dev_correct, k_4_num_dev_correct, k_5_num_dev_correct, k_6_num_dev_correct, k_7_num_dev_correct, k_8_num_dev_correct, k_9_num_dev_correct, k_10_num_dev_correct = dev_step( x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct k_2_total_dev_correct += k_2_num_dev_correct k_3_total_dev_correct += k_3_num_dev_correct k_4_total_dev_correct += k_4_num_dev_correct k_5_total_dev_correct += k_5_num_dev_correct k_6_total_dev_correct += k_6_num_dev_correct k_7_total_dev_correct += k_7_num_dev_correct k_8_total_dev_correct += k_8_num_dev_correct k_9_total_dev_correct += k_9_num_dev_correct k_10_total_dev_correct += k_10_num_dev_correct dev_accuracy = float(total_dev_correct) / len(y_dev) k_2_dev_accuracy = float(k_2_total_dev_correct) / len( y_dev) k_3_dev_accuracy = float(k_3_total_dev_correct) / len( y_dev) k_4_dev_accuracy = float(k_4_total_dev_correct) / len( y_dev) k_5_dev_accuracy = float(k_5_total_dev_correct) / len( y_dev) k_6_dev_accuracy = float(k_6_total_dev_correct) / len( y_dev) k_7_dev_accuracy = float(k_7_total_dev_correct) / len( y_dev) k_8_dev_accuracy = float(k_8_total_dev_correct) / len( y_dev) k_9_dev_accuracy = float(k_9_total_dev_correct) / len( y_dev) k_10_dev_accuracy = float(k_10_total_dev_correct) / len( y_dev) print("\n\n") logging.critical( 'Accuracy on dev set: {}'.format(dev_accuracy)) logging.critical('Top-2 Accuracy on dev set: {}'.format( k_2_dev_accuracy)) logging.critical('Top-3 Accuracy on dev set: {}'.format( k_3_dev_accuracy)) logging.critical('Top-4 Accuracy on dev set: {}'.format( k_4_dev_accuracy)) logging.critical('Top-5 Accuracy on dev set: {}'.format( k_5_dev_accuracy)) logging.critical('Top-6 Accuracy on dev set: {}'.format( k_6_dev_accuracy)) logging.critical('Top-7 Accuracy on dev set: {}'.format( k_7_dev_accuracy)) logging.critical('Top-8 Accuracy on dev set: {}'.format( k_8_dev_accuracy)) logging.critical('Top-9 Accuracy on dev set: {}'.format( k_9_dev_accuracy)) logging.critical('Top-10 Accuracy on dev set: {}'.format( k_10_dev_accuracy)) print("\n\n") """Step 6.2: save the model if it is the best based on accuracy of the dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model {} at step {}'.format( path, best_at_step)) logging.critical('Best accuracy 
{} at step {}'.format( best_accuracy, best_at_step)) # """Step 7: predict x_test (batch by batch)""" # end_time=gmtime(); # print("\n\n") # print("Start Time:",start_time) # print("End Time:",end_time) # test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1) # total_test_correct = 0 # k_2_total_test_correct = 0 # k_3_total_test_correct = 0 # k_4_total_test_correct = 0 # k_5_total_test_correct = 0 # k_6_total_test_correct = 0 # k_7_total_test_correct = 0 # k_8_total_test_correct = 0 # k_9_total_test_correct = 0 # k_10_total_test_correct = 0 # for test_batch in test_batches: # x_test_batch, y_test_batch = zip(*test_batch) # num_test_correct,k_2_num_test_correct,k_3_num_test_correct,k_4_num_test_correct,k_5_num_test_correct,k_6_num_test_correct,k_7_num_test_correct,k_8_num_test_correct,k_9_num_test_correct,k_10_num_test_correct = dev_step(x_test_batch, y_test_batch) # total_test_correct += num_test_correct # k_2_total_test_correct += k_2_num_test_correct # k_3_total_test_correct += k_3_num_test_correct # k_4_total_test_correct += k_4_num_test_correct # k_5_total_test_correct += k_5_num_test_correct # k_6_total_test_correct += k_6_num_test_correct # k_7_total_test_correct += k_7_num_test_correct # k_8_total_test_correct += k_8_num_test_correct # k_9_total_test_correct += k_9_num_test_correct # k_10_total_test_correct += k_10_num_test_correct # test_accuracy = float(total_test_correct) / len(y_test) # k_2_test_accuracy = float(k_2_total_test_correct) / len(y_test) # k_3_test_accuracy = float(k_3_total_test_correct) / len(y_test) # k_4_test_accuracy = float(k_4_total_test_correct) / len(y_test) # k_5_test_accuracy = float(k_5_total_test_correct) / len(y_test) # k_6_test_accuracy = float(k_6_total_test_correct) / len(y_test) # k_7_test_accuracy = float(k_7_total_test_correct) / len(y_test) # k_8_test_accuracy = float(k_8_total_test_correct) / len(y_test) # k_9_test_accuracy = float(k_9_total_test_correct) / len(y_test) # k_10_test_accuracy = float(k_10_total_test_correct) / len(y_test) # print("\n\n") # logging.critical('Accuracy on test set is {} '.format(test_accuracy)) # logging.critical('Top-2 Accuracy on test set is {}'.format(k_2_test_accuracy)) # logging.critical('Top-3 Accuracy on test set is {}'.format(k_3_test_accuracy)) # logging.critical('Top-4 Accuracy on test set is {}'.format(k_4_test_accuracy)) # logging.critical('Top-5 Accuracy on test set is {}'.format(k_5_test_accuracy)) # logging.critical('Top-6 Accuracy on test set is {}'.format(k_6_test_accuracy)) # logging.critical('Top-7 Accuracy on test set is {}'.format(k_7_test_accuracy)) # logging.critical('Top-8 Accuracy on test set is {}'.format(k_8_test_accuracy)) # logging.critical('Top-9 Accuracy on test set is {}'.format(k_9_test_accuracy)) # logging.critical('Top-10 Accuracy on test set is {}'.format(k_10_test_accuracy)) print("\n\n") logging.critical('The training is complete') end_time = gmtime() print("\n\n") print("Start Time:", start_time) print("End Time:", end_time)
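# The cnn.k_2_accuracy ... cnn.k_10_accuracy tensors used by this variant are
# presumably derived from the model's scores. A hedged illustration of how such
# top-k metrics can be defined with tf.nn.in_top_k; this is a sketch, not the
# project's TextCNN code.
import tensorflow as tf

def top_k_metrics(scores, input_y, k):
    """Illustrative only: top-k accuracy and correct count from logits and one-hot labels."""
    true_classes = tf.argmax(input_y, axis=1)
    in_top_k = tf.nn.in_top_k(scores, true_classes, k)
    accuracy_k = tf.reduce_mean(tf.cast(in_top_k, tf.float32))
    num_correct_k = tf.reduce_sum(tf.cast(in_top_k, tf.float32))
    return accuracy_k, num_correct_k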
def train_cnn(): """Step 0: load sentences, labels, and training parameters""" create_test = False # load train, cat and and othe path configurations from parameter file train_file = params['train_file'] cat_file = params['cat_file'] test_set_dir = params['test_set_dir'] desc_col = params["desc_col"] #dev_set_dir = params['dev_set_dir'] x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file, cat_file, desc_col, ispickle=False) """Step 1: pad each sentence to the same length and map each word to an id""" # MAX DOCUMENT LENGTH #max_document_length = max([len(x.split(' ')) for x in x_raw]) max_document_length = params['max_document_length'] logger.debug('The maximum length set for all transactions: {}'.format( max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length, tokenizer_fn=data_helper.tokenizer) #x_raw = x_raw.apply(lambda x: str(x)) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) if create_test: """Step 2: split the original dataset into train and test sets""" loggger.info("preparing test set") x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=42) logger.info("saving test set") x_test.tocsv(os.path.join(test_set_dir, 'x_test.csv'), index=None) y_test.tocsv(os.path.join(test_set_dir, 'y_test.csv'), index=None) logger.debug("x_test: {}, y_test: {}".format(len(x_test), len(y_test))) else: x_ = x y_ = y logger.info("preparing dev set") """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y_))) x_shuffled = x_[shuffle_indices] y_shuffled = y_[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split( x_shuffled, y_shuffled, stratify=y_shuffled, test_size=params['val_set_ratio'], random_state=42) #x_dev.tocsv(os.path.join(dev_set_dir,'x_test.csv'),index=None) #x_dev.tocsv(os.path.join(dev_set_dir,'y_test.csv'),index=None) """Step 4: save the labels into labels.json since predict.py needs it""" logger.info("saving labels into json file") with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logger.debug('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev))) logger.debug('y_train: {}, y_dev: {}'.format(len(y_train), len(y_dev))) """Step 5: build a graph and cnn object""" logger.info("building tensorflow graph") graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=params['embedding_dim'], filter_sizes=list( map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda']) global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(params['learning_rate']) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = 
tf.summary.merge(grad_summaries) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", cnn.loss) acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) timestamp = time.strftime("%m%d-%H%M") output_dir = params['output_dir'] out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", output_dir, timestamp)) # Train Summaries train_summary_op = tf.summary.merge( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, sess.graph) # Dev summaries dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) checkpoint_dir = os.path.abspath( os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'] } _, step, summaries, loss, acc = sess.run([ train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() logger.debug("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, acc)) train_summary_writer.add_summary(summaries, step) # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch, writer=None): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summaries, loss, acc, num_correct = sess.run([ global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.num_correct ], feed_dict) time_str = datetime.datetime.now().isoformat() logger.info("{}: step {}, loss {:g}, acc {:g}".format( time_str, step, loss, acc)) #if writer: # writer.add_summary(summaries, step) dev_summary_writer.add_summary(summaries, step) return num_correct # Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: #dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1) #total_dev_correct = 0 #for dev_batch in dev_batches: # x_dev_batch, y_dev_batch = zip(*dev_batch) # num_dev_correct = dev_step(x_dev_batch, y_dev_batch) # total_dev_correct += num_dev_correct total_dev_correct = dev_step(x_dev, y_dev) dev_accuracy = float(total_dev_correct) / len(y_dev) logger.info('Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy of the dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logger.info('Saved model {} at step {}'.format( path, 
best_at_step)) logger.info('Best accuracy {} at step {}'.format( best_accuracy, best_at_step)) if create_test: """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter( list(zip(x_test, y_test)), params['batch_size'], 1) total_test_correct = 0 #for test_batch in test_batches: # x_test_batch, y_test_batch = zip(*test_batch) # num_test_correct = dev_step(x_test_batch, y_test_batch) # total_test_correct += num_test_correct total_test_correct = dev_step(x_test, y_test) test_accuracy = float(total_test_correct) / len(y_test) print('Accuracy on test set is {} based on the best model {}'. format(test_accuracy, path)) print('The training is complete')
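# This variant reads every setting from a params dict loaded elsewhere. A hedged
# example of the keys it accesses above; all values are placeholders, not the
# project's real configuration.
params = {
    "train_file": "./data/train.csv",        # placeholder path
    "cat_file": "./data/categories.csv",     # placeholder path
    "test_set_dir": "./data/test_set",
    "desc_col": "description",
    "max_document_length": 50,
    "val_set_ratio": 0.1,
    "output_dir": "textcnn_run",
    "embedding_dim": 128,
    "filter_sizes": "3,4,5",
    "num_filters": 128,
    "l2_reg_lambda": 0.0,
    "learning_rate": 1e-3,
    "dropout_keep_prob": 0.5,
    "batch_size": 64,
    "num_epochs": 10,
    "evaluate_every": 100,
}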
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS(sys.argv) print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = data_helper.load_data_and_labels(FLAGS.regret_short_story, FLAGS.drugs_consumption_data_file) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
def train_cnn(): global my_min # by odg # 오리지널 파일로부터 입력과 출력값을 배열로 만들어내기. 또한 CNN의 설정값 또한 파일로서 읽기 """Step 0: load sentences, labels, and training parameters""" # 파라미터로 받은 파일을 로딩해서 문장배열(x_raw)과 각 문장들의 분류값배열(y_raw)을 얻어낸다. x는 뉴럴넷의 input이로, y는 output이다. train_file = sys.argv[1] # CNN네트워크의 각종 세부 설정값(hyper parameter)들을 로딩한다. 이 안에는 num_epochs, batch_size, num_filters등의 값들이 들어있다. x_raw, y_raw, df, labels = data_helper.load_data_and_labels( train_file) # @ # x_raw는 데이터셋, y_raw는 label의 One-hot vector, df는 라벨 포함 데이터셋, labels는 라벨 들어가있음. parameter_file = sys.argv[2] params = json.loads(open(parameter_file).read()) model_dir = sys.argv[3] # 모델 폴더 이름 max_document_length = 0 minimum_frequency = 5 # 단어장에 넣을 단어의 최소한의 빈도수(해당 빈도수 이상 있어야 단어장에 등록) list_max_final_scores = [] # final_scores 들 중 가장 큰 값만 저장한 리스트 by odg if (model_dir == "new"): timestamp = str(int(time.time())) model_name = "./trained_model_" + timestamp # 학습내용을 기록할 디렉토리 정의 out_dir = os.path.abspath(os.path.join(os.path.curdir, model_name)) # !새롭게 생기는 폴더. checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) vectorize_list = list(mytoken.tokenizer(x_raw)) for i in vectorize_list: if max_document_length < len(i): max_document_length = len(i) word2Vec = Word2Vec(vectorize_list, size=params['embedding_dim'] - params['num_of_class'], window=3, min_count=minimum_frequency, workers=4) word2Vec.save(model_name + "/word2Vec.vec") # @ fastText = FastText(vectorize_list, size=params['embedding_dim'] - params['num_of_class'], window=3, min_count=minimum_frequency, workers=4) fastText.save(model_name + "/fastText.vec") # @ vocab_dict, _ = data_helper.build_vocab(max_document_length, word2Vec.wv.index2word, params['num_of_class'], True) # 학습내용중 Tensorflow 내부의 변수상태가 저장됨 (예:AdamOptimizer) else: out_dir = os.path.abspath(os.path.join(model_dir)) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) vocab_dict, max_document_length = data_helper.build_vocab( 0, None, None, False) model_name = model_dir checkpoint_prefix = os.path.join(checkpoint_dir, "model") # 전체 문장을 동일한 크기로 맞추고, 단어마다 ID를 부여해서, ID로 이루어진 문장을 만들기 """Step 1: pad each sentence to the same length and map each word to an id""" # 문장배열의 크기를 pad값을 사용해서 같은 크기로 맞추어 주고, 문장안의 단어들을 ID로 매핑시켜주는 작업을 통해 학습 문장을 숫자 매트릭스형태로 만들어 학습이 가능한 상태로 만든다. logging.info('가장 긴 길이의 문장: {}'.format(max_document_length)) # 21 vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length=max_document_length, vocabulary=vocab_dict, tokenizer_fn=mytoken.tokenizer) # 데이터셋의 단어들에 대해 인덱스를 붙여주는... #! x = np.array(list(vocab_processor.transform(x_raw))) # ! vocab_dictionary = vocab_processor.vocabulary_._mapping # ! y = np.array(y_raw) # y는 라벨에 대한 One-hot vector # 데이터셋을 학습용과 테스트용으로 나누기 """Step 2: split the original dataset into train and test sets""" x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42) # 학습 문장과 결과값을 학습과 테스트 두개의 그룹으로 나눈다.(10%만 검증용으로 사용한다) # 데이터셋을 임의로 배치하고, 학습 데이터를 학습용과 검증용으로 다시 분류하기. """Step 3: shuffle the train set and split the train set into train and dev sets""" # 학습용 문장 배열(x_)의 순서를 그때마다 다르게 하기 위해 random방식으로 배열의 순서를 바꾸는 과정이다. # 학습데이터를 다시 두개의 그룹으로 나누는 것은 학습과 검증을 나눔으로서 overfitting을 줄이고 학습의 효과를 확인하기 쉽게 하기 위해서이다. 전체 데이터셋 구성은 다음과 같다. https://blog.naver.com/2feelus/221005831312 에서 확인. shuffle_indices = np.random.permutation(np.arange(len(y_))) # 인덱스를 섞어줌 x_shuffled = x_[ shuffle_indices] # 데이터셋에서 랜덤으로 셔플된 문장 인덱스 가져옴 ex) [5 69 0 ... 
0] y_shuffled = y_[shuffle_indices] # 해당 데이터셋에 대한 라벨 One-hot vector를 가져옴 x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1) # 카테고리 라벨을 파일로 저장하여, 예측시에 활용할수 있도록 하기 # 전체 카테고리 라벨들이 label.json 파일의 내용으로 저장. 실제 예측시에 이파일에 저장된 카테고리 순서에 따라 예측값을 얻어낸다. with open('./labels.json', 'w', encoding='utf-8-sig') as outfile: # by odg json.dump(labels, outfile, indent=4, ensure_ascii=False) # by odg logging.info('x_train: {}, x_dev: {}, x_test: {}'.format( len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format( len(y_train), len(y_dev), len(y_test))) # 텐서 플로우 그래프생성이후 CNN 객체를 생성하기 """Step 5: build a graph and cnn object""" # 텐서플로우에서 머신러닝의 데이터 흐름을 표현하는 그래프를 새로 생성한다. 그래프는 여러가지 머신러닝용 계산 명령 객체들을 포함하고 있다. graph = tf.Graph() # 파이선의 Context Manager 개념을 사용하여 기존의 기본 그래프 객체를 위에서 선언한 graph 객체로 대체하여 내부 블럭에 적용한다. # 멀티프로세스로 돌아가는 환경에서 이러한 방식을 사용하여 쓰레드에서 각각의 그래프 객체를 사용하도록 한다. with graph.as_default(): # 세션을 새로 생성한다. 세션의 설정옵션으로 GPU를 특정하지 않기(allow_soft_placement=True), # 연산이 어느디바이스로 설정되었는지 보여주여주지 않기(log_device_placement=False) session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.InteractiveSession(config=session_conf) # !! # 세션또한 Context manager를 사용하여 세션의 열고 닫는 처리를 자동으로 해준다. with sess.as_default(): # CNN객체를 생성한다. 파라미터 = 문장의 최대길이(sequence_length):912, 분류 카테고리수(num_classes):11, # 사전에 등록된 단어수(vocab_size):52943, 워드임베딩 사이즈(embedding_size):50, # CNN필터(커널)의 크기는 3x3,4x4,5x5 , 필터의 갯수는 총 32 개, # 오버피팅 방지를 위한 가중치 영향력 감소 수치(l2_reg_lambda):0.0 cnn = TextCNN( sequence_length=x_train.shape[1], # 들어온 문장의 최대 길이 num_classes=y_train.shape[1], # 라벨의 개수 (= One-hot vector의 길이) vocab_size=len(vocab_processor.vocabulary_), # 단어의 수 embedding_size=params['embedding_dim'], filter_sizes=list(map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters'], l2_reg_lambda=params['l2_reg_lambda'], vec_dir=model_name # @ ) global_step = tf.Variable(0, name="global_step", trainable=False) # !!원본 바꾼것 # Cost function으로 Adam Optimizer사용 optimizer = tf.train.AdamOptimizer(1e-3) # cnn의 loss(오차) 값을 파리미터로 받아 점진하강. grads_and_vars = optimizer.compute_gradients(cnn.loss) # 학습에 사용할 함수 정의(session.run에서 사용됨). tf.summary.scalar("cnn_loss", cnn.loss) # @@@ tf.summary.scalar("cnn_accuracy", cnn.accuracy) # @@@ train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # saver를 사용해 학습 내용을 저장 saver = tf.train.Saver() # One training step: train the model with one batch # train_step 은 모델을 학습하는 하나의 묶음(batch)이다. 만약 batch size가 50이라면 50번의 Traning과 그에 따른 50번의 Test가 실행되게 된다. def train_step(x_batch, y_batch): # 입력/예측 출력값을 넣어줌으로서 학습/평가를 할수 있도록 한다. feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: params['dropout_keep_prob'] } # Overfitting을 줄이기 위해, Dropout(신경망 노드 탈락시키기) 확률을 지정. # 위에서 설정한 값들을 사용해 학습을 시작한다. _, step, loss, acc = sess.run( [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict) # One evaluation step: evaluate the model with one batch # dev_step 은 학습 결과 묶음(batch)를 평가(Evaluation)하는 메소드이다. def dev_step(x_batch, y_batch): # 평가시에는 dropout은 사용하지 않는다.(dropout_keep_prob:1.0 => off) feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } # 평가시에는 학습용 train_op 파라미터는 넣지 않는다. 
step, loss, acc, num_correct, summary, scores, final_scores = sess.run( [ global_step, cnn.loss, cnn.accuracy, cnn.num_correct, merged, cnn.scores, cnn.final_scores ], feed_dict) # @@@ for j in final_scores: # 각 final_scores에서 최대값 얻어오기 max_final_scores = max(j) list_max_final_scores.append( max_final_scores) # 각 최대값을 리스트에 추가 min_final_scores = min(list_max_final_scores) # 리스트에서 가장 작은 값 writer.add_summary(summary, step) # @@@ return num_correct, min_final_scores # 사용된 단어들을 ID에 매핑시켜 차후 예측시에 사용한다.(학습시에는 사용하지 않음) vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) # 텐서플로우에서 사용하는 변수들을 초기화 ckpt = tf.train.get_checkpoint_state( model_dir + "/checkpoints") # checkpoint 얻는다.(모델의 Variable값을 얻어옴) #!! if ckpt and tf.train.checkpoint_exists( ckpt.model_checkpoint_path): # 모델 checkpoint가 존재하면 #!! print("다음 파일에서 모델을 읽는 중 입니다..", ckpt.model_checkpoint_path) # !! saver.restore(sess, ckpt.model_checkpoint_path ) # checkpoint파일에서 모델의 변수값을 얻어온다. #!! else: # 모델 checkpoint가 존재하지 않는다면 #!! print("새로운 모델을 생성하는 중 입니다.") # !! sess.run(tf.global_variables_initializer()) # !! # Training starts here # 학습의 총 배치 갯수를 세팅한다. batch_iter 함수는 generator형식으로 작성되어있어서, 아래처럼 초기화를 해놓으면, for문안에서 배치단위로 값을 돌려주게 되어있다. # 한번에 학습단위묶음은 37개(batch_size=37). 학습데이터는 전체 학습에 한번 씩만 사용할것이다. (num_epochs=1) train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) # 최고의 정확성을 저장하기 위한 변수 best_accuracy, best_at_step = 0, 0 merged = tf.summary.merge_all() # @@@ writer = tf.summary.FileWriter("./logs", graph=graph) # @@@ """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: # zip을 사용하여 배치렬로 x(입력)과 y(기대출력)값을 각각 뽑아낸다. x_train_batch, y_train_batch = zip( *train_batch ) # *는 unpack 하는거. https://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean 참고. # 배치단위로 학습 진행 train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) # 현재 학습 회차가 evaluate 할 순서이면 evaluate를 한x_dev다. 기본은 200번 마다. """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: # 개발용 데이터를 배치단위로 가져온다. dev_batches = data_helper.batch_iter( list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) # 학습된 모델에 개발용 배치 데이터를 넣어서 예측 성공 갯수를 누적한다. num_dev_correct, _ = dev_step(x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct # 모델의 정확성을 화면에 출력한다. dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical( 'Accuracy on dev set: {}'.format(dev_accuracy)) # 가장 예측 확률이 좋게 나온 모델을 저장한다. 기준은 dev_accuracy가 가장 좋게 나온 step의 모델이다. """Step 6.2: save the model if it is the best based on accuracy on dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) # !! tf.Print(path, [path], "This is saver : ") logging.critical('Saved model at {} at step {}'.format( path, best_at_step)) logging.critical( 'Best accuracy is {} at step {}'.format( best_accuracy, best_at_step)) # 학습데이터와 Test데이터는 9:1로 나누었다. 
# The test data was never used during training; it checks whether the trained model generalizes.
"""Step 7: predict x_test (batch by batch)"""
test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
total_test_correct = 0
for test_batch in test_batches:
    x_test_batch, y_test_batch = zip(*test_batch)
    num_test_correct, min_final_scores = dev_step(x_test_batch, y_test_batch)
    my_min = min_final_scores  # by odg
    total_test_correct += num_test_correct
f = open(checkpoint_dir + "_min.txt", "w")
f.write(str(my_min))
f.close()
test_accuracy = float(total_test_correct) / len(y_test)
# This line keeps raising errors; it is only a log statement, so it could probably be dropped.
logging.critical('Test set accuracy {}, best model {}'.format(test_accuracy, path))
logging.critical('Training complete')
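# The block above stores the smallest of the per-example maximum softmax scores
# in <checkpoint_dir>_min.txt. A hedged sketch of how that value could be used at
# prediction time as a rejection threshold; the file layout and the final_scores
# variable are assumptions carried over from the snippet above.
with open(checkpoint_dir + "_min.txt") as f:
    confidence_threshold = float(f.read().strip())

best_score = float(max(final_scores[0]))  # top class score for a single input sentence
if best_score < confidence_threshold:
    print("Prediction rejected: confidence {:.4f} is below threshold {:.4f}".format(
        best_score, confidence_threshold))
else:
    print("Prediction accepted with confidence {:.4f}".format(best_score))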
def train_cnn(dataset_name): """Step 0: load sentences, labels, and training parameters""" dataset = '../dataset/'+dataset_name+'_csv/train.csv' testset = '../dataset/'+dataset_name+'_csv/test.csv' parameter_file = "./parameters.json" params = json.loads(open(parameter_file).read()) x_raw, y_raw, df, labels = data_helper.load_data_and_labels(dataset,dataset_name,True) x_test, y_test, df, labels = data_helper.load_data_and_labels(testset,dataset_name,False) """Step 1: pad each sentence to the same length and map each word to an id""" max_document_length = max([len(x.split(' ')) for x in x_raw]) logging.info('The maximum length of all sentences: {}'.format(max_document_length)) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_raw))) y = np.array(y_raw) x_test = np.array(list(vocab_processor.fit_transform(x_test))) y_test = np.array(y_test) """Step 3: shuffle the train set and split the train set into train and dev sets""" shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1) """Step 4: save the labels into labels.json since predict.py needs it""" with open('./labels.json', 'w') as outfile: json.dump(labels, outfile, indent=4) logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test))) logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test))) """Step 5: build a graph and cnn object""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN( sequence_length=x_train.shape[1], num_classes=y_train.shape[1], vocab_size=len(vocab_processor.vocabulary_), embedding_size=params['embedding_dim'], filter_sizes=list(map(int, params['filter_sizes'].split(","))), num_filters=params['num_filters']) global_step = tf.Variable(0, name="global_step", trainable=False) epsilon=params['epsilon'] num_batches_per_epoch = int((len(x_train)-1)/params['batch_size']) + 1 learning_rate = tf.train.exponential_decay(params['learning_rate'], global_step,num_batches_per_epoch, 0.95, staircase=True) optimizer = tf.train.AdamOptimizer(learning_rate,epsilon) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) gradients, variables = zip(*optimizer.compute_gradients(cnn.loss)) gradients, _ = tf.clip_by_global_norm(gradients, 7.0) with tf.control_dependencies(update_ops): train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step) timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "model_" + timestamp)) checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.global_variables()) # One training step: train the model with one batch def train_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.is_training: True, cnn.dropout_keep_prob: params['dropout_keep_prob']} _, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict) return acc,loss # One evaluation step: evaluate the model with one batch def dev_step(x_batch, y_batch): feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, 
cnn.is_training: False, cnn.dropout_keep_prob: 1.0} step, loss, acc, num_correct = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.num_correct], feed_dict) return num_correct # Save the word_to_id map since predict.py needs it vocab_processor.save(os.path.join(out_dir, "vocab.pickle")) sess.run(tf.global_variables_initializer()) # Training starts here train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs']) best_accuracy, best_at_step = 0, 0 """Step 6: train the cnn model with x_train and y_train (batch by batch)""" for train_batch in train_batches: x_train_batch, y_train_batch = zip(*train_batch) train_acc, train_loss = train_step(x_train_batch, y_train_batch) current_step = tf.train.global_step(sess, global_step) """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)""" if current_step % params['evaluate_every'] == 0: logging.critical('step: {} accuracy: {} cnn_loss: {} '.format(current_step, train_acc, train_loss)) dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1) total_dev_correct = 0 for dev_batch in dev_batches: x_dev_batch, y_dev_batch = zip(*dev_batch) num_dev_correct = dev_step(x_dev_batch, y_dev_batch) total_dev_correct += num_dev_correct dev_accuracy = float(total_dev_correct) / len(y_dev) logging.critical('Accuracy on dev set: {}'.format(dev_accuracy)) """Step 6.2: save the model if it is the best based on accuracy on dev set""" if dev_accuracy >= best_accuracy: best_accuracy, best_at_step = dev_accuracy, current_step path = saver.save(sess, checkpoint_prefix, global_step=current_step) logging.critical('Saved model at {} at step {}'.format(path, best_at_step)) logging.critical('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step)) """Step 7: predict x_test (batch by batch)""" test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1) total_test_correct = 0 start=time.time() for test_batch in test_batches: x_test_batch, y_test_batch = zip(*test_batch) num_test_correct = dev_step(x_test_batch, y_test_batch) total_test_correct += num_test_correct #path = saver.save(sess, checkpoint_prefix) logging.critical("\nExecution time for testing = {0:.6f}".format(time.time() - start)) test_accuracy = float(total_test_correct) / len(y_test) logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path)) logging.critical('The training is complete')
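# The control_dependencies(update_ops) wrapper and the cnn.is_training placeholder
# above suggest the model uses batch normalization. A minimal, self-contained
# illustration of that pattern (assumption: the real TextCNN applies
# tf.layers.batch_normalization somewhere in its graph):
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 128], name='inputs')
is_training = tf.placeholder(tf.bool, name='is_training')
hidden = tf.layers.dense(inputs, 64, activation=tf.nn.relu)
# Batch norm keeps moving averages that are updated via ops in UPDATE_OPS
hidden = tf.layers.batch_normalization(hidden, training=is_training)
loss = tf.reduce_mean(tf.square(hidden))

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    # The moving-average updates now run together with every training step
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)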
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helper
from text_cnn import TextCNN
from config import FLAGS

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # pin the run to a single GPU

# Print the parameters before training starts
print('\n----------------Parameters--------------')
for attr, value in (FLAGS.__flags.items()):
    print('{}={}'.format(attr.upper(), value))

# Load data and cut
x_train_data, y = data_helper.load_data_and_labels(FLAGS.train_data_file, FLAGS.train_label_file)

# Padding sentence
padded_sentences_train, max_padding_length = data_helper.padding_sentence(
    sentences=x_train_data,
    padding_sentence_length=FLAGS.padding_sentence_length,
    padding_move=FLAGS.padding_move)
print(padded_sentences_train[:10])

x, vocabulary_len = data_helper.embedding_sentences(
    embedding_file=FLAGS.embedding_file,
    padded_sentences=padded_sentences_train,
    embedding_dimension=FLAGS.embedding_dimension)
print(x[:2])

# Shuffle data randomly
np.random.seed(100)
shuffle_indices = np.random.permutation(np.arange(len(y)))
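# data_helper.padding_sentence is expected to cut or pad each tokenised sentence
# to a fixed length. A minimal sketch under that assumption; the padding token,
# the unused padding_move switch and the return values are guesses.
def padding_sentence(sentences, padding_sentence_length, padding_move=False,
                     padding_token='<PAD>'):
    """Hypothetical sketch: truncate or right-pad every token list to one length.

    Assumes each sentence is already a list of tokens (cut beforehand).
    """
    padded_sentences = []
    for sentence in sentences:
        tokens = list(sentence)
        if len(tokens) > padding_sentence_length:
            tokens = tokens[:padding_sentence_length]
        else:
            tokens = tokens + [padding_token] * (padding_sentence_length - len(tokens))
        padded_sentences.append(tokens)
    return padded_sentences, padding_sentence_length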
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") tf.flags.DEFINE_boolean("use_cached_embeddings", True, "Cache embeddings locally on disk for repeated runs") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") print("Loading Data...") q1, q2, y, x1_length, x2_length = data_helper.load_data_and_labels( FLAGS.training_data_file) max_length = max(max([len(x.split(" ")) for x in q1]), max([len(x.split(" ")) for x in q2])) vocab_processor = learn.preprocessing.VocabularyProcessor(max_length) print("max question length:", max_length) #converting to embedding matrix x_text = q1 + q2 vocab_ids = np.array(list(vocab_processor.fit_transform(x_text))) x1 = vocab_ids[:len(q1)] x2 = vocab_ids[len(q1):] print("Loading Word embeddings") vocab_dict = vocab_processor.vocabulary_._mapping
r"C:\Users\satyasaideepthi\PycharmProjects\DL_LAB2\data\rt-polarity.neg", "Data source for the negative data.") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x_text, y = data_helper.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
tf.flags.DEFINE_string("embedding_file","./glove.6B/glove.6B.100d.txt","pretrained embediing file") tf.flags.DEFINE_integer("batch_size",512,"size of batch") tf.flags.DEFINE_string("checkpoint_dir","","checkpoint file") tf.flags.DEFINE_boolean("allow_soft_placement",True,"Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement",False,"Log placement of ops on devices") FLAGS=tf.flags.FLAGS FLAGS._parse_flags() for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") print("Loading Data...") q1,q2,y_label,q1_length,q2_length=data_helper.load_data_and_labels(FLAGS.testing_file) x_text=q1+q2 vocab_path=os.path.join(FLAGS.checkpoint_dir,"..","vocab") vocab_processor=learn.preprocessing.VocabularyProcessor.restore(vocab_path) vocab_ids=np.array(list(vocab_processor.tranform(x_text))) x1_test=vocab_ids[:len(q1)] x2_test=vocab_ids[len(q1):] y_test=np.argmax(y_label,axis=1) checkpoint_file=tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph=tf.Graph() with graph.as_default(): session_conf=tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement,