tf.flags.DEFINE_string("training_file_neg", "twitter-datasets/train_neg.txt", "Path and name for the training file (neg examples)")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()  # note: this private flags helper only exists in older TF 1.x releases
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helper.load_data_and_labels(FLAGS.training_file_pos, FLAGS.training_file_neg)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))


# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# It's better to use cross-validation
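
# The snippet above relies on data_helper.load_data_and_labels, which is not
# shown here. The sketch below is only an assumption of what that helper
# probably does, inferred from how it is called (two text files in, a list of
# sentences plus a one-hot label array out); the repository's real version may
# clean and tokenise the text differently.
import numpy as np

def load_data_and_labels_sketch(positive_file, negative_file):
    """Read one tweet per line and build [neg, pos] one-hot labels."""
    with open(positive_file, encoding='utf-8') as f:
        positive_examples = [line.strip() for line in f if line.strip()]
    with open(negative_file, encoding='utf-8') as f:
        negative_examples = [line.strip() for line in f if line.strip()]
    x_text = positive_examples + negative_examples
    labels = [[0, 1] for _ in positive_examples] + [[1, 0] for _ in negative_examples]
    y = np.array(labels)
    return x_text, y
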

# Example 2

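
# Example 2 below calls several helpers (count_words, convert_to_ints,
# pad_sentence_batch) that live elsewhere in its repository and are not shown.
# The minimal sketches below are assumptions reconstructed purely from how the
# example calls them; the real implementations may differ in the details.

def count_words(count_dict, text):
    """Count how often each word appears in a list of sentences (in place)."""
    for sentence in text:
        for word in sentence.split():
            count_dict[word] = count_dict.get(word, 0) + 1


def convert_to_ints(text, vocab_to_int, word_count, unk_count, eos=False):
    """Map each sentence to word ids, counting words and UNK replacements."""
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["UNK"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["EOS"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count


def pad_sentence_batch(vocab_to_int, sentence_batch):
    """Pad every sentence with PAD ids up to the longest sentence in the batch."""
    max_sentence = max(len(sentence) for sentence in sentence_batch)
    return [sentence + [vocab_to_int["PAD"]] * (max_sentence - len(sentence))
            for sentence in sentence_batch]
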
def train_cnn(dataset_name):
    """Step 0: load sentences, labels, and training parameters"""
    dataset = '../dataset/' + dataset_name + '_csv/train.csv'
    testset = '../dataset/' + dataset_name + '_csv/test.csv'
    parameter_file = "./parameters.json"
    params = json.loads(open(parameter_file).read())
    learning_rate = params['learning_rate']
    filter_sizes = [int(x) for x in params['filter_sizes'].split(',')]
    enable_max = params['enable_max_len'] == 1
    watch_rnn_output = params['watch_rnn_output'] == 1
    is_simple = params['is_simple'] == 1
    x_raw, y_raw, target_raw, df, labels = data_helper.load_data_and_labels(
        dataset, dataset_name, params['max_length'],
        params['max_summary_length'], enable_max, True)
    x_test_raw, y_test_raw, target_test_raw, df_test, labels_test = data_helper.load_data_and_labels(
        testset, dataset_name, params['max_length'],
        params['max_summary_length'], enable_max, False)
    word_counts = {}
    count_words(word_counts, x_raw)
    logging.info("Size of Vocabulary: {}".format(len(word_counts)))
    """Step 1: pad each sentence to the same length and map each word to an id"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    min_document_length = min([len(x.split(' ')) for x in x_raw])
    logging.info(
        'The maximum length of all sentences: {}'.format(max_document_length))
    logging.info(
        'The minimum length of all sentences: {}'.format(min_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, min_frequency=params['min_frequency'])
    vocab_processor.fit_transform(x_raw)
    vocab_to_int = vocab_processor.vocabulary_._mapping

    # Special tokens that will be added to our vocab
    codes = ["UNK", "PAD", "EOS", "GO"]

    # Add codes to vocab
    for code in codes:
        vocab_to_int[code] = len(vocab_to_int)

    # Dictionary to convert integers to words
    int_to_vocab = {}
    for word, value in vocab_to_int.items():
        int_to_vocab[value] = word
    usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100

    logging.info("Total number of words: {}".format(len(word_counts)))
    logging.info("Number of words we will use: {}".format(len(vocab_to_int)))
    logging.info("Percent of words we will use: {0:.2f}%".format(usage_ratio))

    # Apply convert_to_ints to the summaries (target_raw) and the texts (x_raw)
    word_count = 0
    unk_count = 0
    int_summaries, word_count, unk_count = convert_to_ints(
        target_raw, vocab_to_int, word_count, unk_count)
    int_texts, word_count, unk_count = convert_to_ints(x_raw,
                                                       vocab_to_int,
                                                       word_count,
                                                       unk_count,
                                                       eos=True)
    int_test_summaries, word_count, unk_count = convert_to_ints(
        target_test_raw, vocab_to_int, word_count, unk_count)
    int_test_texts, word_count, unk_count = convert_to_ints(x_test_raw,
                                                            vocab_to_int,
                                                            word_count,
                                                            unk_count,
                                                            eos=True)
    unk_percent = round(unk_count / word_count, 4) * 100

    logging.info("Total number of words in texts: {}".format(word_count))
    logging.info("Total number of UNKs in  texts: {}".format(unk_count))
    logging.info("Percent of words that are UNK: {0:.2f}%".format(unk_percent))
    """Step 1: pad each sentence to the same length and map each word to an id"""

    x_int = pad_sentence_batch(vocab_to_int, int_texts)
    target_int = pad_sentence_batch(vocab_to_int, int_summaries)
    x_test_int = pad_sentence_batch(vocab_to_int, int_test_texts)
    target_test_int = pad_sentence_batch(vocab_to_int, int_test_summaries)
    x = np.array(x_int)
    y = np.array(y_raw)
    x_test = np.array(x_test_int)
    y_test = np.array(y_test_raw)

    target = np.array(target_int)
    target_test = np.array(target_test_int)
    t = np.array(list(len(x) for x in x_int))
    t_test = np.array(list(len(x) for x in x_test_int))
    s = np.array(list(params['max_summary_length'] for x in x_int))
    s_test = np.array(list(params['max_summary_length'] for x in x_test_int))
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    target_shuffled = target[shuffle_indices]
    t_shuffled = t[shuffle_indices]
    s_shuffled = s[shuffle_indices]
    x_train, x_dev, y_train, y_dev, target_train, target_dev, t_train, t_dev, s_train, s_dev = train_test_split(
        x_shuffled,
        y_shuffled,
        target_shuffled,
        t_shuffled,
        s_shuffled,
        test_size=0.1)
    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))
    logging.info('target_train: {}, target_dev: {}, target_test: {}'.format(
        len(target_train), len(target_dev), len(target_test)))
    logging.info('t_train: {}, t_dev: {}, t_test: {}'.format(
        len(t_train), len(t_dev), len(t_test)))
    logging.info('s_train: {}, s_dev: {}, s_test: {}'.format(
        len(s_train), len(s_dev), len(s_test)))
    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            cnn = seq2CNN(num_classes=y_train.shape[1],
                          filter_sizes=filter_sizes,
                          max_summary_length=params['max_summary_length'],
                          rnn_size=params['rnn_size'],
                          vocab_to_int=vocab_to_int,
                          num_filters=params['num_filters'],
                          vocab_size=len(vocab_to_int),
                          embedding_size=params['embedding_dim'])
            global_step = tf.Variable(0, name="global_step", trainable=False)
            num_batches_per_epoch = int(
                (len(x_train) - 1) / params['batch_size']) + 1
            epsilon = params['epsilon']

            learning_rate = tf.train.exponential_decay(params['learning_rate'],
                                                       global_step,
                                                       num_batches_per_epoch,
                                                       0.95,
                                                       staircase=True)

            optimizer = tf.train.AdamOptimizer(learning_rate, epsilon)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            gradients, variables = zip(*optimizer.compute_gradients(cnn.loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(zip(gradients, variables),
                                                     global_step=global_step)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, dataset_name + "_" + timestamp))

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            # For TensorBoard summary writers
            train_writer = tf.summary.FileWriter(
                '/home/tgisaturday/Workspace/Taehoon/VGG_text_cnn/seq2CNN' +
                '/graphs/train/' + dataset_name + '_' + timestamp, sess.graph)
            test_writer = tf.summary.FileWriter(
                '/home/tgisaturday/Workspace/Taehoon/VGG_text_cnn/seq2CNN' +
                '/graphs/test/' + dataset_name + '_' + timestamp)
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch, target_batch, t_batch, s_batch,
                           seq_lambda):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.targets: target_batch,
                    cnn.text_length: t_batch,
                    cnn.summary_length: s_batch,
                    cnn.batch_size: len(x_batch),
                    cnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn.seq_lambda: seq_lambda,
                    cnn.is_training: True
                }
                summary, _, logits, step, loss, seq_loss, cnn_loss, acc = sess.run(
                    [
                        cnn.merged, train_op, cnn.training_logits, global_step,
                        cnn.loss, cnn.seq_loss, cnn.cnn_loss, cnn.accuracy
                    ], feed_dict)
                current_step = tf.train.global_step(sess, global_step)
                train_writer.add_summary(summary, current_step)
                return loss, seq_loss, cnn_loss, acc, logits

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch, target_batch, t_batch, s_batch,
                         seq_lambda):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.targets: target_batch,
                    cnn.text_length: t_batch,
                    cnn.summary_length: s_batch,
                    cnn.batch_size: len(x_batch),
                    cnn.dropout_keep_prob: 1.0,
                    cnn.seq_lambda: seq_lambda,
                    cnn.is_training: False
                }
                summary, step, loss, seq_loss, acc, num_correct, examples = sess.run(
                    [
                        cnn.merged, global_step, cnn.loss, cnn.seq_loss,
                        cnn.accuracy, cnn.num_correct, cnn.inference_logits
                    ], feed_dict)
                if watch_rnn_output:
                    pad = vocab_to_int['PAD']
                    result = " ".join(
                        [int_to_vocab[j] for j in examples[0] if j != pad])
                    logging.info('{}'.format(result))
                current_step = tf.train.global_step(sess, global_step)
                test_writer.add_summary(summary, current_step)
                return num_correct

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))

            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(
                list(zip(x_train, y_train, target_train, t_train, s_train)),
                params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch = zip(
                    *train_batch)
                current_step = tf.train.global_step(sess, global_step)
                seq_lambda = exponential_lambda_decay(params['seq_lambda'],
                                                      current_step,
                                                      num_batches_per_epoch,
                                                      0.95,
                                                      staircase=True)
                #seq_lambda = params['seq_lambda']
                train_loss, train_seq_loss, train_cnn_loss, train_acc, examples = train_step(
                    x_train_batch, y_train_batch, target_train_batch,
                    t_train_batch, s_train_batch, seq_lambda)
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    logging.critical(
                        'step: {} accuracy: {:0.6f} learning_rate: {:0.6f} seq_lambda: {:0.6f} loss: {:0.6f} seq_loss: {:0.6f} cnn_loss: {:0.6f}'
                        .format(current_step, train_acc, learning_rate.eval(),
                                seq_lambda, train_loss, train_seq_loss,
                                train_cnn_loss))
                    pad = vocab_to_int['PAD']
                    result = " ".join(
                        [int_to_vocab[j] for j in examples[0] if j != pad])
                    logging.info('{}'.format(result))
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev, target_dev, t_dev, s_dev)),
                        params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch, target_dev_batch, t_dev_batch, s_dev_batch = zip(
                            *dev_batch)
                        num_dev_correct = dev_step(x_dev_batch, y_dev_batch,
                                                   target_dev_batch,
                                                   t_dev_batch, s_dev_batch,
                                                   seq_lambda)
                        total_dev_correct += num_dev_correct
                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical(
                        'Accuracy on dev set: {}'.format(dev_accuracy))
                    """Step 6.2: save the model if it is the best based on accuracy on dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model at {} at step {}'.format(
                            path, best_at_step))
                        logging.critical(
                            'Best accuracy is {} at step {}'.format(
                                best_accuracy, best_at_step))
            """Step 7: predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(
                list(zip(x_test, y_test, target_test, t_test, s_test)),
                params['batch_size'], 1)
            total_test_correct = 0
            watch_rnn_output = True
            start = time.time()
            for test_batch in test_batches:
                x_test_batch, y_test_batch, target_test_batch, t_test_batch, s_test_batch = zip(
                    *test_batch)
                num_test_correct = dev_step(x_test_batch, y_test_batch,
                                            target_test_batch, t_test_batch,
                                            s_test_batch, seq_lambda)
                total_test_correct += num_test_correct
            path = saver.save(sess, checkpoint_prefix)
            test_accuracy = float(total_test_correct) / len(y_test)
            logging.critical(
                "\nExecution time for testing = {0:.6f}".format(time.time() -
                                                                start))
            logging.critical(
                'Accuracy on test set is {} based on the best model {}'.format(
                    test_accuracy, path))
            logging.critical('The training is complete')
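
# The training loop above also uses exponential_lambda_decay and
# data_helper.batch_iter, neither of which is shown in the example. These are
# hedged sketches of what they presumably do, inferred from the call sites:
# the first mirrors tf.train.exponential_decay in plain Python for the seq2seq
# loss weight, the second is the usual generator that yields shuffled
# mini-batches for a number of epochs. Treat both as assumptions, not the
# original repository code.
import numpy as np


def exponential_lambda_decay(initial_lambda, global_step, decay_steps,
                             decay_rate, staircase=False):
    """Decay a scalar the same way tf.train.exponential_decay decays a rate."""
    exponent = global_step / decay_steps
    if staircase:
        exponent = global_step // decay_steps
    return initial_lambda * decay_rate ** exponent


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of `data` for `num_epochs`, reshuffling every epoch."""
    data = np.array(data, dtype=object)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        shuffled = data[np.random.permutation(data_size)] if shuffle else data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield shuffled[start:end]
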

# Example 3

tf.flags.DEFINE_integer('ckpt_interval', 1000, 'save the model after this many training steps')

# Data Parameters
tf.flags.DEFINE_string('train_pos_file', 'twitter-datasets/train_pos.txt', "the path of positive training data")
tf.flags.DEFINE_string('train_neg_file', 'twitter-datasets/train_neg.txt', "the path of negative training data")
tf.flags.DEFINE_string('embedding_path', 'twitter-datasets/glove.6B.50d.txt', "the path for embeddings")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

for attr, value in FLAGS.__flags.items():
    print("{}={}".format(attr, value))

# Data Preparation
x_text, y = load_data_and_labels(FLAGS.train_pos_file, FLAGS.train_neg_file)

# build dict
max_length = max([len(text.strip().split(' ')) for text in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
print("data prepared and dict built")

# split train and valid set
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.01, random_state=10)

with tf.Session() as sess:
    model = TextCNN(sequence_length=max_length,
                    num_class=2,vocab_size=len(vocab_processor.vocabulary_),
                    emb_dim=FLAGS.emb_dim,
                    filter_size_list=list(map(int, FLAGS.filter_size_list.split(','))),

# Example 4

from __future__ import print_function
import numpy as np
from data_helper import load_data_and_labels
from model import TextCNN

VALIDATION_SPLIT = 0.1
CORPUS_DIR = './data'
BATCH_SIZE = 32
EPOCHS = 10
EMBEDDING_SIZE = 256
NUM_FILTERS = 128
FILTER_SIZES = [3, 4, 5]

# Load data and labels
data, labels, num_words = load_data_and_labels(CORPUS_DIR)
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Training model.')

text_cnn = TextCNN(num_class=y_train.shape[1],
                   num_words=num_words,
                   sequence_length=data.shape[1],

# Example 5

def train_cnn():
	path = ''
	"""Step 0: load sentences, labels, and training parameters"""
	train_file = sys.argv[1]
	x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file)

	parameter_file = sys.argv[2]
	params = json.loads(open(parameter_file).read())

	"""Step 1: pad each sentence to the same length and map each word to an id"""
	max_document_length = max([len(x.split(' ')) for x in x_raw])
	logging.info('The maximum length of all sentences: {}'.format(max_document_length))
	vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
	x = np.array(list(vocab_processor.fit_transform(x_raw)))
	y = np.array(y_raw)

	"""Step 2: split the original dataset into train and test sets"""
	x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

	"""Step 3: shuffle the train set and split the train set into train and dev sets"""
	shuffle_indices = np.random.permutation(np.arange(len(y_)))
	x_shuffled = x_[shuffle_indices]
	y_shuffled = y_[shuffle_indices]
	x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

	"""Step 4: save the labels into labels.json since predict.py needs it"""
	with open('./labels.json', 'w') as outfile:
		json.dump(labels, outfile, indent=4)

	logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
	logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

	"""Step 5: build a graph and cnn object"""
	graph = tf.Graph()
	with graph.as_default():
		session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
		sess = tf.Session(config=session_conf)
		with sess.as_default():
			cnn = TextCNN(
				sequence_length=x_train.shape[1],
				num_classes=y_train.shape[1],
				vocab_size=len(vocab_processor.vocabulary_),
				embedding_size=params['embedding_dim'],
				filter_sizes=list(map(int, params['filter_sizes'].split(","))),
				num_filters=params['num_filters'],
				l2_reg_lambda=params['l2_reg_lambda'])

			global_step = tf.Variable(0, name="global_step", trainable=False)
			optimizer = tf.train.AdamOptimizer(1e-3)
			grads_and_vars = optimizer.compute_gradients(cnn.loss)
			train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

			timestamp = str(int(time.time()))
			out_dir = os.path.abspath(os.path.join(os.path.curdir, "trained_model_" + timestamp))

			checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
			checkpoint_prefix = os.path.join(checkpoint_dir, "model")
			if not os.path.exists(checkpoint_dir):
				os.makedirs(checkpoint_dir)
			saver = tf.train.Saver()

			# One training step: train the model with one batch
			def train_step(x_batch, y_batch):
				feed_dict = {
					cnn.input_x: x_batch,
					cnn.input_y: y_batch,
					cnn.dropout_keep_prob: params['dropout_keep_prob']}
				_, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

			# One evaluation step: evaluate the model with one batch
			def dev_step(x_batch, y_batch):
				feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0}
				step, loss, acc, num_correct = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.num_correct], feed_dict)
				return num_correct

			# Save the word_to_id map since predict.py needs it
			vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
			sess.run(tf.global_variables_initializer())

			# Training starts here
			train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
			best_accuracy, best_at_step = 0, 0

			"""Step 6: train the cnn model with x_train and y_train (batch by batch)"""
			for train_batch in train_batches:
				x_train_batch, y_train_batch = zip(*train_batch)
				train_step(x_train_batch, y_train_batch)
				current_step = tf.train.global_step(sess, global_step)

				"""Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
				if current_step % params['evaluate_every'] == 0:
					dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
					total_dev_correct = 0
					for dev_batch in dev_batches:
						x_dev_batch, y_dev_batch = zip(*dev_batch)
						num_dev_correct = dev_step(x_dev_batch, y_dev_batch)
						total_dev_correct += num_dev_correct

					dev_accuracy = float(total_dev_correct) / len(y_dev)
					logging.critical('Accuracy on dev set: {}'.format(dev_accuracy))

					"""Step 6.2: save the model if it is the best based on accuracy on dev set"""
					if dev_accuracy >= best_accuracy:
						best_accuracy, best_at_step = dev_accuracy, current_step
						path = saver.save(sess, checkpoint_prefix, global_step=current_step)
						logging.critical('Saved model at {} at step {}'.format(path, best_at_step))
						logging.critical('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step))

			"""Step 7: predict x_test (batch by batch)"""
			test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
			total_test_correct = 0
			for test_batch in test_batches:
				x_test_batch, y_test_batch = zip(*test_batch)
				num_test_correct = dev_step(x_test_batch, y_test_batch)
				total_test_correct += num_test_correct

			test_accuracy = float(total_test_correct) / len(y_test)
			logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path))
			logging.critical('The training is complete')
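
# Example 6
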
def data_preprocess():
    # Data preprocess
    # =======================================================
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper.load_data_and_labels(FLAGS.data_file)
        # NOTE: only the first 1000 examples are kept here (presumably to keep the run small)
        x = x[:1000]
        y = y[:1000]
        # Get an embedding vector for every token of every padded sentence
        sentences, max_document_length = data_helper.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        else:
            print('w2v model found...')
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir,
                                              'trained_word2vec.model'),
                    file_to_load=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        y = np.array(y)
        # NOTE: the np.save calls below are commented out, so the cached-data branch
        # further down only triggers if data_x.npy / data_y.npy were produced elsewhere.
        # np.save(os.path.join(out_dir,"data_x.npy"),x)
        # np.save(os.path.join(out_dir,"data_y.npy"),y)
        del sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper.saveDict(params, training_params_file)

    # Shuffle data randomly
    # np.random.seed(10)
    # shuffle_indices = np.random.permutation(np.arange(len(y)))
    # x_shuffled = x[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # del x,y

    # x_train, x_test, y_train, y_test = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=42)  # split into training and testing set 80/20 ratio
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2,
        random_state=42)  # split into training and testing set 80/20 ratio
    del x, y
    return x_train, x_test, y_train, y_test
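
# data_preprocess above depends on word2vec_helpers.embedding_sentences, which
# is not included in the snippet. The sketch below is an assumption of what it
# likely does, inferred from the call: train (or load) a gensim Word2Vec model
# on the padded token lists and return one vector per token. The parameter name
# vector_size is gensim >= 4; older gensim releases call it size.
import numpy as np
from gensim.models import Word2Vec


def embedding_sentences(sentences, embedding_size=128,
                        file_to_save=None, file_to_load=None):
    if file_to_load:
        model = Word2Vec.load(file_to_load)
    else:
        model = Word2Vec(sentences, vector_size=embedding_size,
                         min_count=1, window=5)
        if file_to_save:
            model.save(file_to_save)
    zeros = np.zeros(embedding_size, dtype=np.float32)
    return [[model.wv[word] if word in model.wv else zeros for word in sentence]
            for sentence in sentences]
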

# Example 7

# TensorFlow parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParametros:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_train:
    x_raw, y_test = data_helper.load_data_and_labels(FLAGS.positive_data_file,
                                                     FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Load the vocabulary built during training
vocab_path = os.path.join(FLAGS.vocab_dir, "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
# Map the raw sentences to arrays of word ids
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nAvaliando...\n")

# Avaliacao
# ==================================================

# Example 8

def train():
    # prepare data
    positive_file = os.path.join(os.path.dirname(__file__),
                                 'data/rt-polaritydata/rt-polarity.pos')
    negative_file = os.path.join(os.path.dirname(__file__),
                                 'data/rt-polaritydata/rt-polarity.neg')
    data_x, data_y, vocab_size = load_data_and_labels(positive_file,
                                                      negative_file)
    # generate train_data and validate_data
    validate_index = -1 * int(FLAGS.val_percent * len(data_y))
    x_train, x_val = data_x[:validate_index], data_x[validate_index:]
    y_train, y_val = data_y[:validate_index], data_y[validate_index:]
    with tf.Graph().as_default():
        with tf.Session() as sess:
            model = Model(learning_rate=FLAGS.learning_rate,
                          sequence_length=x_train.shape[1],
                          num_classes=FLAGS.num_classes,
                          vocab_size=vocab_size,
                          embedding_size=FLAGS.embedding_size,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(','))),
                          num_filters=FLAGS.num_filters,
                          num_checkpoints=FLAGS.num_checkpoints,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # initialize
            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())
            sess.run(init_op)

            def train_op(x_batch, y_batch):
                loss, accuracy, global_step, summaries, _ = sess.run(
                    [
                        model.loss, model.accuracy, model.global_step,
                        model.train_summary, model.train_op
                    ],
                    feed_dict={
                        model.input_x: x_batch,
                        model.output_y: y_batch,
                        model.dropout: FLAGS.dropout_keep_prob
                    })

                print("step: {:d}, loss {:g}, acc {:g}".format(
                    global_step, loss, accuracy))
                # model.train_summary_writer.add_summary(summaries, global_step)
                return global_step

            def val_op(val_x, val_y):
                loss, accuracy, summaries = sess.run(
                    [model.loss, model.accuracy, model.val_summary],
                    feed_dict={
                        model.input_x: val_x,
                        model.output_y: val_y,
                        model.dropout: 1.0
                    })
                print("loss {:g}, acc {:g}".format(loss, accuracy))
                # model.train_summary_writer.add_summary(val_summary, step)

            # train and validate
            # generate batches
            batches = batch_iter(list(zip(x_train, y_train)),
                                 batch_size=FLAGS.batch_size,
                                 num_epochs=FLAGS.num_epochs)

            for batch in batches:
                x_batch, y_batch = zip(*batch)
                x_batch = np.array(x_batch, dtype=np.int32)
                y_batch = np.array(y_batch, dtype=np.int32)
                current_step = train_op(x_batch, y_batch)
                if current_step % FLAGS.evaluate_every == 0:
                    print('Evaluate\n')
                    val_op(val_x=x_val, val_y=y_val)
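
# How train() above is launched is not shown in the snippet. A minimal sketch
# of the usual TF 1.x entry point (an assumption, not part of the original):
def main(_):
    train()


if __name__ == '__main__':
    tf.app.run()
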

# Example 10

def compute_accuracy(x_data, y_data):
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    return sess.run(accuracy, feed_dict={input_x: x_data, input_y: y_data})


learning_rate = 0.1
train_steps = 100
pos_file = "pos.txt"
neg_file = "neg.txt"
dev_sample_percentage = .1
display_steps = 1

print("loading data...")
x_text, y = data_helper.load_data_and_labels(pos_file, neg_file)

max_document_length = max([len(line.split(" ")) for line in x_text])
print("max_document_length = ", max_document_length)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
#print(x)
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
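
# Example 11
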
def train_cnn(dataset_name):
    """Step 0: load sentences, labels, and training parameters"""
    dataset = './dataset/' + dataset_name + '_csv/train.csv'
    parameter_file = "./parameters.json"
    params = json.loads(open(parameter_file).read())
    learning_rate = params['learning_rate']
    enable_max = params['enable_max_len'] == 1
    enable_keywords = params['summary_using_keywords'] == 1
    layer_norm = params['layer_norm'] == 1
    watch_rnn_output = params['watch_rnn_output'] == 1
    use_he_uniform = params['use_he_uniform'] == 1
    optional_shortcut = params['optional_shortcut'] == 1

    x_raw, y_raw, target_raw, df, labels = data_helper.load_data_and_labels(
        dataset, params['max_length'], params['max_summary_length'],
        enable_max, enable_keywords)
    word_counts = {}

    count_words(word_counts, x_raw)

    logging.info("Size of Vocabulary: {}".format(len(word_counts)))

    # Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better
    # (https://github.com/commonsense/conceptnet-numberbatch)
    embeddings_index = {}
    with open('./dataset/embeddings/numberbatch-en.txt',
              encoding='utf-8') as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    max_document_length = max([len(x.split(' ')) for x in x_raw])

    # Find the number of words that are missing from CN, and are used more than our threshold.
    missing_words = 0
    threshold = params['min_frequency']

    for word, count in word_counts.items():
        if count > threshold:
            if word not in embeddings_index:
                missing_words += 1

    missing_ratio = round(missing_words / len(word_counts), 4) * 100

    logging.info("Number of words missing from CN: {}".format(missing_words))
    logging.info(
        "Percent of words that are missing from vocabulary: {0:.2f}%".format(
            missing_ratio))

    # Dictionary to convert words to integers
    """Step 1: pad each sentence to the same length and map each word to an id"""
    value = 0
    vocab_to_int = {}
    for word, count in word_counts.items():
        if count >= threshold:
            vocab_to_int[word] = value
            value += 1
    # Special tokens that will be added to our vocab
    codes = ["UNK", "PAD", "EOS", "GO"]

    # Add codes to vocab
    for code in codes:
        vocab_to_int[code] = len(vocab_to_int)

    # Dictionary to convert integers to words
    int_to_vocab = {}
    for word, value in vocab_to_int.items():
        int_to_vocab[value] = word
    usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100

    logging.info("Total number of words: {}".format(len(word_counts)))
    logging.info("Number of words we will use: {}".format(len(vocab_to_int)))
    logging.info("Percent of words we will use: {0:.2f}%".format(usage_ratio))

    # Need to use 300 for embedding dimensions to match CN's vectors.
    embedding_dim = 300
    nb_words = len(vocab_to_int)
    logging.info("Size of vocab_to_int: {}".format(len(vocab_to_int)))
    # Create matrix with default values of zero
    word_embedding_matrix = np.zeros((nb_words, embedding_dim),
                                     dtype=np.float32)
    for word, i in vocab_to_int.items():
        if word in embeddings_index:
            word_embedding_matrix[i] = embeddings_index[word]
        else:
            # If word not in CN, create a random embedding for it
            new_embedding = np.array(
                np.random.uniform(-1.0, 1.0, embedding_dim))
            embeddings_index[word] = new_embedding
            word_embedding_matrix[i] = new_embedding

    # Check if value matches len(vocab_to_int)
    logging.info("Size of word embedding matrix: {}".format(
        len(word_embedding_matrix)))

    # Apply convert_to_ints to the summaries (target_raw) and the texts (x_raw)
    word_count = 0
    unk_count = 0
    logging.info("text_example: {}".format(x_raw[0]))
    logging.info("helper_example: {}".format(target_raw[0]))

    int_summaries, word_count, unk_count = convert_to_ints(
        target_raw, vocab_to_int, word_count, unk_count)
    int_texts, word_count, unk_count = convert_to_ints(x_raw,
                                                       vocab_to_int,
                                                       word_count,
                                                       unk_count,
                                                       eos=True)
    unk_percent = round(unk_count / word_count, 4) * 100

    logging.info("Total number of words in texts: {}".format(word_count))
    logging.info("Total number of UNKs in  texts: {}".format(unk_count))
    logging.info("Percent of words that are UNK: {0:.2f}%".format(unk_percent))
    """Step 1: pad each sentence to the same length and map each word to an id"""

    x_int = pad_sentence_batch(vocab_to_int, int_texts)
    target_int = pad_sentence_batch(vocab_to_int, int_summaries)
    x = np.array(x_int)
    y = np.array(y_raw)
    target = np.array(target_int)
    t = np.array(list(len(x) for x in x_int))
    max_summary_length = max([len(sentence) for sentence in target_int])
    s = np.array(list(max_summary_length for x in x_int))
    """Step 2: split the original dataset into train and test sets"""
    x_, x_test, y_, y_test, target_, target_test, t_, t_test, s_, s_test = train_test_split(
        x, y, target, t, s, test_size=0.1, random_state=42)
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    target_shuffled = target_[shuffle_indices]
    t_shuffled = t_[shuffle_indices]
    s_shuffled = s_[shuffle_indices]
    x_train, x_dev, y_train, y_dev, target_train, target_dev, t_train, t_dev, s_train, s_dev = train_test_split(
        x_shuffled,
        y_shuffled,
        target_shuffled,
        t_shuffled,
        s_shuffled,
        test_size=0.1)
    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))
    logging.info('target_train: {}, target_dev: {}, target_test: {}'.format(
        len(target_train), len(target_dev), len(target_test)))
    logging.info('t_train: {}, t_dev: {}, t_test: {}'.format(
        len(t_train), len(t_dev), len(t_test)))
    logging.info('s_train: {}, s_dev: {}, s_test: {}'.format(
        len(s_train), len(s_dev), len(s_test)))
    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = seq2CNN(embeddings=word_embedding_matrix,
                          num_classes=y_train.shape[1],
                          max_summary_length=max_summary_length,
                          rnn_size=params['rnn_size'],
                          rnn_num_layers=params['rnn_num_layers'],
                          vocab_to_int=vocab_to_int,
                          num_filters=params['num_filters'],
                          vocab_size=len(vocab_to_int),
                          embedding_size=300,
                          layer_norm=layer_norm,
                          depth=params['VDCNN_depth'],
                          downsampling_type=params['downsampling_type'],
                          use_he_uniform=use_he_uniform,
                          optional_shortcut=optional_shortcut)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            num_batches_per_epoch = int(
                (len(x_train) - 1) / params['batch_size']) + 1
            epsilon = params['epsilon']
            learning_rate = tf.train.exponential_decay(params['learning_rate'],
                                                       global_step,
                                                       params['num_epochs'] *
                                                       num_batches_per_epoch,
                                                       0.95,
                                                       staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate, epsilon)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            cnn_gradients, cnn_variables = zip(
                *optimizer.compute_gradients(cnn.loss))
            seq_gradients, seq_variables = zip(
                *optimizer.compute_gradients(cnn.seq_loss))
            cnn_gradients, _ = tf.clip_by_global_norm(cnn_gradients, 7.0)
            seq_gradients, _ = tf.clip_by_global_norm(seq_gradients, 7.0)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(zip(
                    cnn_gradients, cnn_variables),
                                                     global_step=global_step)
                seq_train_op = optimizer.apply_gradients(
                    zip(seq_gradients, seq_variables), global_step=global_step)
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "result_" + timestamp))

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch, target_batch, t_batch, s_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.targets: target_batch,
                    cnn.text_length: t_batch,
                    cnn.summary_length: s_batch,
                    cnn.batch_size: len(x_batch),
                    cnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn.is_training: True
                }
                _, logits, step, loss, seq_loss, acc = sess.run([
                    train_op, cnn.training_logits, global_step, cnn.loss,
                    cnn.seq_loss, cnn.accuracy
                ], feed_dict)
                return loss, seq_loss, acc

            def seq_train_step(x_batch, y_batch, target_batch, t_batch,
                               s_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.targets: target_batch,
                    cnn.text_length: t_batch,
                    cnn.summary_length: s_batch,
                    cnn.batch_size: len(x_batch),
                    cnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn.is_training: True
                }
                _, logits, step, loss, seq_loss, acc = sess.run([
                    seq_train_op, cnn.training_logits, global_step, cnn.loss,
                    cnn.seq_loss, cnn.accuracy
                ], feed_dict)
                return loss, seq_loss, acc

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch, target_batch, t_batch, s_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.targets: target_batch,
                    cnn.text_length: t_batch,
                    cnn.summary_length: s_batch,
                    cnn.batch_size: len(x_batch),
                    cnn.dropout_keep_prob: 1.0,
                    cnn.is_training: False
                }
                step, loss, seq_loss, acc, num_correct, examples = sess.run([
                    global_step, cnn.loss, cnn.seq_loss, cnn.accuracy,
                    cnn.num_correct, cnn.training_logits
                ], feed_dict)
                if watch_rnn_output:
                    pad = vocab_to_int['PAD']
                    result = " ".join(
                        [int_to_vocab[j] for j in examples[0] if j != pad])
                    logging.info('{}'.format(result))

                return num_correct

            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(
                list(zip(x_train, y_train, target_train, t_train, s_train)),
                params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                x_train_batch, y_train_batch, target_train_batch, t_train_batch, s_train_batch = zip(
                    *train_batch)
                train_loss, train_seq_loss, train_acc = train_step(
                    x_train_batch, y_train_batch, target_train_batch,
                    t_train_batch, s_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                train_loss, train_seq_loss, train_acc = seq_train_step(
                    x_train_batch, y_train_batch, target_train_batch,
                    t_train_batch, s_train_batch)
                if current_step % params['evaluate_every'] == 0:
                    logging.critical(
                        'step: {} accuracy: {} cnn_loss: {} seq_loss: {}'.
                        format(current_step, train_acc, train_loss,
                               train_seq_loss))
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev, target_dev, t_dev, s_dev)),
                        params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch, target_dev_batch, t_dev_batch, s_dev_batch = zip(
                            *dev_batch)
                        num_dev_correct = dev_step(x_dev_batch, y_dev_batch,
                                                   target_dev_batch,
                                                   t_dev_batch, s_dev_batch)
                        total_dev_correct += num_dev_correct

                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical(
                        'Accuracy on dev set: {}'.format(dev_accuracy))
                    """Step 6.2: save the model if it is the best based on accuracy on dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model at {} at step {}'.format(
                            path, best_at_step))
                        logging.critical(
                            'Best accuracy is {} at step {}'.format(
                                best_accuracy, best_at_step))
            """Step 7: predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(
                list(zip(x_test, y_test, target_test, t_test, s_test)),
                params['batch_size'], 1)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch, target_test_batch, t_test_batch, s_test_batch = zip(
                    *test_batch)
                num_test_correct = dev_step(x_test_batch, y_test_batch,
                                            target_test_batch, t_test_batch,
                                            s_test_batch)
                total_test_correct += num_test_correct
            path = saver.save(sess, checkpoint_prefix)
            test_accuracy = float(total_test_correct) / len(y_test)
            logging.critical(
                'Accuracy on test set is {} based on the best model {}'.format(
                    test_accuracy, path))
            logging.critical('The training is complete')
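
# Example 13
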
Beispiel #12
0
tf.flags.DEFINE_string("training_file_neg", "twitter-datasets/train_neg.txt", "Path and name for the training file (neg examples)")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helper.load_data_and_labels(FLAGS.training_file_pos, FLAGS.training_file_neg)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))


# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: It's better to use cross-validation
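# The split announced above is not shown in this snippet. A minimal sketch of a
# simple hold-out split (an assumed 90/10 ratio; cross-validation with
# sklearn.model_selection.KFold would be the more thorough alternative):
dev_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))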
def train_cnn():
    """Step 0: load sentences, labels, and training parameters"""
    train_file = '../data/iseardataset.csv'
    x_raw, y_raw, df, labels, embedding_mat = data_helper.load_data_and_labels(
        train_file)

    parameter_file = '../training_config.json'
    params = json.loads(open(parameter_file).read())
    """Step 1: pad each sentence to the same length and map each word to an id"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    logging.info(
        'The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)

    # print x.shape
    """Step 2: split the original dataset into train and test sets"""
    x_, x_test, y_, y_test = train_test_split(x,
                                              y,
                                              test_size=0.2,
                                              random_state=42)
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled,
                                                      y_shuffled,
                                                      test_size=0.2)
    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))
    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=9000,
                          embedding_size=params['embedding_dim'],
                          filter_sizes=list(
                              map(int, params['filter_sizes'].split(","))),
                          num_filters=params['num_filters'],
                          embedding_mat=embedding_mat,
                          l2_reg_lambda=params['l2_reg_lambda'])

            # Optimize the loss function with the Adam optimizer
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "trained_model_" + timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summary for predictions
            # predictions_summary = tf.summary.scalar("predictions", cnn.predictions)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']
                }
                _, step, summaries, loss, acc = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, acc))
                train_summary_writer.add_summary(summaries, step)

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch, writer=None):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, acc, num_correct, predictions = \
                    sess.run([global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.num_correct, cnn.predictions],
                             feed_dict)
                if writer:
                    writer.add_summary(summaries, step)
                return num_correct, predictions

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            sess.run(tf.global_variables_initializer())

            print "Loading Embeddings !"

            embedding_dimension = 200
            embedding_dir = '../embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
            # embedding_dir = '../GoogleNews-vectors-negative300.bin'

            initW = data_helper.load_embedding_vectors_glove(
                vocab_processor.vocabulary_, embedding_dir,
                embedding_dimension)
            # initW = data_helper.load_embedding_vectors_word2vec(vocab_processor.vocabulary_, embedding_dir, embedding_dimension)
            sess.run(cnn.W.assign(initW))

            print "Loaded Embeddings !"

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                if len(train_batch) == 0:
                    continue
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        if len(dev_batch) == 0:
                            continue
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        num_dev_correct, y_pred_tre = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct

                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical(
                        'Accuracy on dev set: {}'.format(dev_accuracy))
                    """Step 6.2: save the model if it is the best based on accuracy of the dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))

            classes = [
                "joy", "fear", "anger", "sadness", "disgust", "shame", "guilt"
            ]
            """Step 7: predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1)
            total_test_correct = 0
            for test_batch in test_batches:
                if len(test_batch) == 0:
                    continue
                print "Non Zero Length"
                x_test_batch, y_test_batch = zip(*test_batch)
                num_test_correct, y_pred = dev_step(x_test_batch, y_test_batch)
                total_test_correct += num_test_correct

            test_accuracy = (float(total_test_correct) / len(y_test)) * 100

            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'], 1)

            total_train_correct = 0
            for train_batch in train_batches:
                if len(train_batch) == 0:
                    continue
                print "Non Zero Length"
                x_train_batch, y_train_batch = zip(*train_batch)
                num_test_correct, y_ = dev_step(x_train_batch, y_train_batch)
                total_train_correct += num_test_correct

            train_accuracy = (float(total_train_correct) / len(y_train)) * 100

        print('Accuracy on test set is {} based on the best model'.format(
            test_accuracy))
        print('Accuracy on train set is {} based on the best model'.format(
            train_accuracy))
        # logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path))

        print(len(y_test_batch))
        print(y_test_batch[0])
        print(len(y_pred))
        print(y_pred[0])
        # Y_test = np.argmax(y_test_batch, axis=1)
        # y_pred_class = np.argmax(y_pred, axis=1)

        print(classification_report(np.argmax(y_test_batch, axis=1), y_pred,
                                    target_names=classes))

        # # Create confusion matrix
        # cnf_matrix = confusion_matrix(Y_test, y_pred_class)
        # plt.figure(figsize=(20, 10))
        # data_helper.plot_confusion_matrix(cnf_matrix, labels=classes)

        logging.critical('The training is complete')
Example #14
import os
import data_helper
from tensorflow.contrib import learn
import csv

# Change this: load the data here (use your own data files)
positive_data_file = './data/rt-polarity.pos'
negative_data_file = './data/rt-polarity.neg'
if_eval = True
checkpoint_dir = './runs/1548567747'
allow_soft_placement = True
log_device_placement = False
batch_size = 16

if if_eval:
    x_raw, y_test = data_helper.load_data_and_labels(positive_data_file,
                                                     negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ['a masterpiece four years in the making', 'everything is off']
    y_test = [1, 0]

# map data into vocabulary
vocab_path = os.path.join(checkpoint_dir, 'vocab')
print(vocab_path)
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print('\n Evaluating...\n')

# Evaluation
# ====================================
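# The evaluation code is cut off at this point. A minimal sketch of the loop it
# leads up to is shown below, assuming numpy/tensorflow are imported as np/tf,
# the checkpoints live in a "checkpoints" subdirectory of checkpoint_dir, the
# graph exposes tensors named input_x, dropout_keep_prob and output/predictions,
# and data_helper.batch_iter accepts a shuffle flag (all of these are assumptions).
checkpoint_file = tf.train.latest_checkpoint(os.path.join(checkpoint_dir, "checkpoints"))
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                  log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Restore the saved graph structure and weights
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Predict batch by batch, then score against y_test when labels are available
        all_predictions = []
        for x_batch in data_helper.batch_iter(list(x_test), batch_size, 1, shuffle=False):
            batch_predictions = sess.run(predictions,
                                         {input_x: x_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions])
        if y_test is not None:
            correct_predictions = float(sum(all_predictions == y_test))
            print("Total number of test examples: {}".format(len(y_test)))
            print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))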
Example #15
batch_size = 16
num_epochs = 10
evaluate_every = 100
checkpoint_every = 100
num_checkpoints = 5
allow_soft_placement = True
log_device_placement = False
filter_sizes = '3,4,5'
num_filters = 128

# Data Preparation
# ==============================================

# Load data: returns the dataset and its labels
print('Loading data...')
x_text, y = data_helper.load_data_and_labels(positive_data_file,
                                             negative_data_file)

# Build vocabulary (word-to-id dictionary)
# Get the maximum document length in words; shorter documents are padded with 0
max_document_length = max([len(x.split(' ')) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(
    vocab_processor.fit_transform(x_text)))  # builds the word-to-id matrix, zero-padding short sentences

# Shuffle the dataset
np.random.seed(32)

shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffle = x[shuffle_indices]
y_shuffle = y[shuffle_indices]
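# load_data_and_labels itself is not part of these snippets. A minimal sketch,
# assuming it reads one example per line from the positive and negative files
# and builds one-hot labels (the real helper most likely also cleans the text):
def load_data_and_labels_sketch(positive_data_file, negative_data_file):
    with open(positive_data_file, "r", encoding="utf-8") as f:
        positive_examples = [line.strip() for line in f]
    with open(negative_data_file, "r", encoding="utf-8") as f:
        negative_examples = [line.strip() for line in f]
    x_text = positive_examples + negative_examples
    # One-hot labels: [0, 1] for positive examples, [1, 0] for negative ones
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return x_text, y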
Example #16
def train_cnn():
    """Step 0: load sentences, labels, and training parameters"""
    train_file = "C:\\Users\\s1761548\\Downloads\\NPS\\nps(New)\\New NPS\\nps_sentiment_training.zip"
    x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file)

    parameter_file = "C:\\Code_Sketch\\NPS\\S3134076\\PycharmProjects\\nps\\parameters_sentiment.json"
    params = json.loads(open(parameter_file).read())
    """Step 1: pad each sentence to the same length and map each word to an id"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    logging.info(
        'The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)
    """Step 2: split the original dataset into train and test sets"""
    x_, x_test, y_, y_test = train_test_split(x,
                                              y,
                                              test_size=0.1,
                                              random_state=42)
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled,
                                                      y_shuffled,
                                                      test_size=0.1)
    """Step 4: save the labels into labels.json since predict_fraud.py needs it"""
    with open('./labels_sentiment.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    for num_epoch in [25]:
        params['num_epochs'] = num_epoch
        for batch_size in [30]:
            params['batch_size'] = batch_size
            for l2 in [0.0]:
                params['l2_reg_lambda'] = l2
                """Step 5: build a graph and cnn object"""
                graph = tf.Graph()
                with graph.as_default():
                    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                                  log_device_placement=False)
                    sess = tf.Session(config=session_conf)
                    with sess.as_default():
                        cnn = TextCNN(
                            sequence_length=x_train.shape[1],
                            num_classes=y_train.shape[1],
                            vocab_size=len(vocab_processor.vocabulary_),
                            embedding_size=params['embedding_dim'],
                            filter_sizes=list(
                                map(int, params['filter_sizes'].split(","))),
                            num_filters=params['num_filters'],
                            l2_reg_lambda=params['l2_reg_lambda'])

                        global_step = tf.Variable(0,
                                                  name="global_step",
                                                  trainable=False)
                        optimizer = tf.train.AdamOptimizer(1e-3)
                        grads_and_vars = optimizer.compute_gradients(cnn.loss)
                        train_op = optimizer.apply_gradients(
                            grads_and_vars, global_step=global_step)

                        # Keep track of gradient values and sparsity (optional)
                        grad_summaries = []
                        for g, v in grads_and_vars:
                            if g is not None:
                                grad_hist_summary = tf.summary.histogram(
                                    "{}/grad/hist".format(v.name), g)
                                sparsity_summary = tf.summary.scalar(
                                    "{}/grad/sparsity".format(v.name),
                                    tf.nn.zero_fraction(g))
                                grad_summaries.append(grad_hist_summary)
                                grad_summaries.append(sparsity_summary)
                        grad_summaries_merged = tf.summary.merge(
                            grad_summaries)

                        timestamp = str(int(time.time()))
                        out_dir = os.path.abspath(
                            os.path.join(os.path.curdir,
                                         "trained_model_" + timestamp))

                        # Summaries for loss and accuracy
                        loss_summary = tf.summary.scalar("loss", cnn.loss)
                        acc_summary = tf.summary.scalar(
                            "accuracy", cnn.accuracy)

                        # Train Summaries
                        train_summary_op = tf.summary.merge(
                            [loss_summary, acc_summary, grad_summaries_merged])
                        train_summary_dir = os.path.join(
                            out_dir, "summaries", "train")
                        train_summary_writer = tf.summary.FileWriter(
                            train_summary_dir, sess.graph_def)

                        # Dev summaries
                        dev_summary_op = tf.summary.merge(
                            [loss_summary, acc_summary])
                        dev_summary_dir = os.path.join(out_dir, "summaries",
                                                       "dev")
                        dev_summary_writer = tf.summary.FileWriter(
                            dev_summary_dir, sess.graph_def)

                        checkpoint_dir = os.path.abspath(
                            os.path.join(out_dir, "checkpoints"))
                        checkpoint_prefix = os.path.join(
                            checkpoint_dir, "model")
                        if not os.path.exists(checkpoint_dir):
                            os.makedirs(checkpoint_dir)
                        saver = tf.train.Saver(tf.all_variables())

                        # One training step: train the model with one batch
                        def train_step(x_batch, y_batch):
                            feed_dict = {
                                cnn.input_x: x_batch,
                                cnn.input_y: y_batch,
                                cnn.dropout_keep_prob:
                                params['dropout_keep_prob']
                            }
                            _, step, summaries, loss, acc = sess.run([
                                train_op, global_step, train_summary_op,
                                cnn.loss, cnn.accuracy
                            ], feed_dict)
                            train_summary_writer.add_summary(summaries, step)

                        # One evaluation step: evaluate the model with one batch
                        def dev_step(x_batch, y_batch, writer=None):
                            feed_dict = {
                                cnn.input_x: x_batch,
                                cnn.input_y: y_batch,
                                cnn.dropout_keep_prob: 1.0
                            }
                            step, summaries, loss, acc, num_correct = sess.run(
                                [
                                    global_step, dev_summary_op, cnn.loss,
                                    cnn.accuracy, cnn.num_correct
                                ], feed_dict)
                            if writer:
                                writer.add_summary(summaries, step)
                            return num_correct

                        # Save the word_to_id map since predict_fraud.py needs it
                        vocab_processor.save(
                            os.path.join(out_dir, "vocab_sentiment.pickle"))
                        sess.run(tf.initialize_all_variables())

                        # Training starts here
                        train_batches = data_helper.batch_iter(
                            list(zip(x_train, y_train)), params['batch_size'],
                            params['num_epochs'])
                        best_accuracy, best_at_step = 0, 0
                        """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
                        for train_batch in train_batches:
                            x_train_batch, y_train_batch = zip(*train_batch)
                            train_step(x_train_batch, y_train_batch)
                            current_step = tf.train.global_step(
                                sess, global_step)
                            """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                            print("hello1")
                            print(current_step)
                            print(params['evaluate_every'])
                            if current_step % params['evaluate_every'] == 0:
                                dev_batches = data_helper.batch_iter(
                                    list(zip(x_dev, y_dev)),
                                    params['batch_size'], 1)
                                total_dev_correct = 0
                                for dev_batch in dev_batches:
                                    x_dev_batch, y_dev_batch = zip(*dev_batch)
                                    num_dev_correct = dev_step(
                                        x_dev_batch,
                                        y_dev_batch,
                                        writer=dev_summary_writer)
                                    total_dev_correct += num_dev_correct

                                dev_accuracy = float(total_dev_correct) / len(
                                    y_dev)
                                print("hello2")
                                logging.critical(
                                    'Accuracy on dev set: {}'.format(
                                        dev_accuracy))
                                """Step 6.2: save the model if it is the best based on accuracy of the dev set"""
                                if dev_accuracy >= best_accuracy:
                                    best_accuracy, best_at_step = dev_accuracy, current_step
                                    path = saver.save(sess,
                                                      checkpoint_prefix,
                                                      global_step=current_step)
                                    logging.critical(
                                        'Saved model {} at step {}'.format(
                                            path, best_at_step))
                                    logging.critical(
                                        'Best accuracy {} at step {}'.format(
                                            best_accuracy, best_at_step))
                        """Step 7: predict x_test (batch by batch)"""
                        test_batches = data_helper.batch_iter(
                            list(zip(x_test, y_test)), params['batch_size'], 1)
                        total_test_correct = 0
                        for test_batch in test_batches:
                            x_test_batch, y_test_batch = zip(*test_batch)
                            num_test_correct = dev_step(
                                x_test_batch, y_test_batch)
                            total_test_correct += num_test_correct

                        test_accuracy = float(total_test_correct) / len(y_test)
                        logging.critical(
                            'Accuracy on test set is {} based on the best model {}'
                            .format(test_accuracy, path))
                        logging.critical('The training is complete')
Example #17
from text_cnn import TextCNN
from config import FLAGS
import tensorflow as tf
import data_helper

x_test_data, y_test = data_helper.load_data_and_labels(FLAGS.test_data_file,
                                                       FLAGS.test_label_file)

padded_sentences_test, max_padding_length = data_helper.padding_sentence(
    sentences=x_test_data,
    padding_sentence_length=FLAGS.padding_sentence_length,
    padding_move=FLAGS.padding_move)

x_test, vocabulary_len = data_helper.embedding_sentences(
    embedding_file=FLAGS.embedding_file,
    padded_sentences=padded_sentences_test,
    embedding_dimension=FLAGS.embedding_dimension)

print("x_test.shape = {}".format(x_test.shape))
print("y_test.shape = {}".format(y_test.shape))

cnn = TextCNN(sequence_length=FLAGS.padding_sentence_length,
              num_classes=FLAGS.num_classes,
              embedding_dimension=FLAGS.embedding_dimension,
              filter_sizes=list(map(int, FLAGS.filter_size.split(','))),
              num_filters=FLAGS.num_filters,
              l2_reg_lambda=FLAGS.L2_reg_lambda)

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_save_path))
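    # The snippet ends right after restoring the weights. A minimal sketch of the
    # evaluation that would typically follow, assuming this TextCNN exposes
    # input_x, input_y, dropout_keep_prob and accuracy tensors like the training
    # examples above (an assumption, since the class itself is not shown here):
    feed_dict = {
        cnn.input_x: x_test,
        cnn.input_y: y_test,
        cnn.dropout_keep_prob: 1.0
    }
    test_accuracy = sess.run(cnn.accuracy, feed_dict)
    print("Accuracy on the test set: {:g}".format(test_accuracy))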
def train_cnn():
    """Step 0: 加载数据和参数"""
    train_file = sys.argv[1]
    x_raw, y_raw, _, labels = data_helper.load_data_and_labels(train_file)

    parameter_file = sys.argv[2]
    params = json.loads(open(parameter_file).read())
    """Step 1: 完成单词到ID的映射,一行为一个sequence"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    logging.info(
        'The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)
    """Step 2: _x和测试集"""
    x_, x_test, y_, y_test = train_test_split(x,
                                              y,
                                              test_size=0.1,
                                              random_state=42)
    """Step 3: 将_x分为训练集和验证集"""
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled,
                                                      y_shuffled,
                                                      test_size=0.1)
    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    #-------------------------------------------------------------------------------------------------------------------
    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        # tf.ConfigProto is usually passed when creating the session to configure it
        # log_device_placement=True: whether to log device placement
        # allow_soft_placement=True: if the specified device does not exist, let TF pick one automatically
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=params['embedding_dim'],
                          filter_sizes=list(
                              map(int, params['filter_sizes'].split(","))),
                          num_filters=params['num_filters'],
                          l2_reg_lambda=params['l2_reg_lambda'])

            # Learning rate (exponentially decayed)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            learning_rate = tf.train.exponential_decay(1e-3,
                                                       global_step,
                                                       1000,
                                                       0.99,
                                                       staircase=True)

            # Optimizer: Adam is fast and stable. A one-liner alternative would be: train_step = tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Paths for saving the model and checkpoints
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "trained_model_" + timestamp))

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch, train_summary_op):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']
                }
                _, step, train_summary = sess.run(
                    [train_op, global_step, train_summary_op], feed_dict)
                return train_summary

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch, dev_summary_op):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, dev_summary, num_correct = sess.run(
                    [global_step, dev_summary_op, cnn.num_correct], feed_dict)
                return dev_summary, num_correct

            # Summaries for TensorBoard visualization
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "logs", "train")
            dev_summary_dir = os.path.join(out_dir, "logs", "dev")

            sess.run(tf.global_variables_initializer())
            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            # Summary writers for TensorBoard visualization
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            """Step 6: 训练train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_summary = train_step(x_train_batch, y_train_batch,
                                           train_summary_op)

                current_step = tf.train.global_step(sess, global_step)

                if current_step % 100 == 0:
                    train_summary_writer.add_summary(train_summary,
                                                     current_step)
                """Step 6.1: 用验证集评价模型evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        dev_summary, num_dev_correct = dev_step(
                            x_dev_batch, y_dev_batch, dev_summary_op)
                        total_dev_correct += num_dev_correct
                        dev_summary_writer.add_summary(dev_summary,
                                                       current_step)

                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical(
                        'Accuracy on dev set: {}'.format(dev_accuracy))
                    """Step 6.2:保存模型save the model if it is the best based on accuracy of the dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            """Step 7: 预测predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                _, num_test_correct = dev_step(x_test_batch, y_test_batch,
                                               dev_summary_op)
                total_test_correct += num_test_correct

            test_accuracy = float(total_test_correct) / len(y_test)
            logging.critical(
                'Accuracy on test set is {} based on the best model {}'.format(
                    test_accuracy, path))
            logging.critical('complete!')
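# None of the snippets on this page include the TextCNN class they instantiate.
# A minimal sketch of the interface they all assume (input_x, input_y,
# dropout_keep_prob, scores, predictions, loss, accuracy, num_correct) is given
# below; it is a simplified reconstruction, not the exact class used above.
class TextCNNSketch(object):
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
                 filter_sizes, num_filters, l2_reg_lambda=0.0):
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Embedding layer: map word ids to dense vectors
        self.W = tf.Variable(
            tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
        embedded = tf.expand_dims(tf.nn.embedding_lookup(self.W, self.input_x), -1)

        # One convolution + max-pooling block per filter size
        pooled_outputs = []
        for filter_size in filter_sizes:
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W_f = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b_f = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded, W_f, strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b_f))
            pooled_outputs.append(
                tf.nn.max_pool(h,
                               ksize=[1, sequence_length - filter_size + 1, 1, 1],
                               strides=[1, 1, 1, 1],
                               padding="VALID"))
        num_filters_total = num_filters * len(filter_sizes)
        h_pool_flat = tf.reshape(tf.concat(pooled_outputs, 3), [-1, num_filters_total])
        h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob)

        # Final scores, predictions, loss and accuracy metrics
        with tf.name_scope("output"):
            W_out = tf.Variable(
                tf.truncated_normal([num_filters_total, num_classes], stddev=0.1))
            b_out = tf.Variable(tf.constant(0.1, shape=[num_classes]))
            self.scores = tf.nn.xw_plus_b(h_drop, W_out, b_out, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores,
                                                         labels=self.input_y)
        l2_loss = tf.nn.l2_loss(W_out) + tf.nn.l2_loss(b_out)
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
        correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")
        self.num_correct = tf.reduce_sum(tf.cast(correct, "float"), name="num_correct")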
Example #19
# validate training params file
training_params_file = os.path.join(FLAGS.checkpoint_dir, "..",
                                    "training_params.pickle")
if not os.path.exists(training_params_file):
    print(
        "Training params file \'{}\' is missing!".format(training_params_file))
print("Using training params file : {}".format(training_params_file))

# Load params
params = data_helper.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])
# Load data
if FLAGS.eval_train and FLAGS.single_url is None:
    x_raw, y_test = data_helper.load_data_and_labels(FLAGS.input_text_file)
elif FLAGS.single_url is not None:
    x_raw = [FLAGS.single_url]
    y_test = None
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get Embedding vector x_test
sentences, max_document_length = data_helper.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))
Example #20
def train_cnn():
    FLAGS = tf.flags.FLAGS
    with open("config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile)
    if FLAGS.enable_word_embeddings and cfg['word_embeddings'][
            'default'] is not None:
        embedding_name = cfg['word_embeddings']['default']
        embedding_dimension = cfg['word_embeddings'][embedding_name][
            'dimension']
    else:
        embedding_dimension = 300
    filename = "./sun_firefox.csv.zip"
    x_raw, y_raw, df, labels = data_helper.load_data_and_labels(filename)
    #print(x_raw[0])

    parameter_file = sys.argv[2]
    params = json.loads(open(parameter_file).read())
    """Step 1: pad each sentence to the same length and map each word to an id"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    #print(max_document_length)

    logging.info(
        'The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)
    #x=np.array(x_raw)
    #y = np.array(y_raw)
    """Step 2: split the original dataset into train and test sets"""
    #x_, x_test, y_, y_test = train_test_split(x_raw, y_raw, test_size=0.1)
    #print(x.shape)
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled,
                                                      y_shuffled,
                                                      test_size=0.1,
                                                      random_state=1)
    #print(x_train.shape)
    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)
    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))
    logging.info('y_train: {}, y_dev: {}'.format(len(y_train), len(y_dev)))
    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=params['embedding_dim'],
                          filter_sizes=list(
                              map(int, params['filter_sizes'].split(","))),
                          num_filters=params['num_filters'],
                          l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir,
                             "netbeans_trained_model_" + timestamp))
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.all_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch, learning_rate):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn.learning_rate: learning_rate
                }

                _, step, loss, acc, k_2_accuracy, k_3_accuracy, k_4_accuracy, k_5_accuracy, k_6_accuracy, k_7_accuracy, k_8_accuracy, k_9_accuracy, k_10_accuracy = sess.run(
                    [
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.k_2_accuracy, cnn.k_3_accuracy, cnn.k_4_accuracy,
                        cnn.k_5_accuracy, cnn.k_6_accuracy, cnn.k_7_accuracy,
                        cnn.k_8_accuracy, cnn.k_9_accuracy, cnn.k_10_accuracy
                    ], feed_dict)
                print(
                    "Train Step: step {}, loss {:g}, acc {:g},Top-2-Accuracy{:g},Top-3-Accuracy{:g},Top-4-Accuracy{:g}, Top-5-Accuracy{:g}, Top-6-Accuracy{:g}, Top-7-Accuracy{:g}, Top-8-Accuracy{:g}, Top-9-Accuracy{:g}, Top-10-Accuracy{:g}"
                    .format(step, loss, acc, k_2_accuracy, k_3_accuracy,
                            k_4_accuracy, k_5_accuracy, k_6_accuracy,
                            k_7_accuracy, k_8_accuracy, k_9_accuracy,
                            k_10_accuracy))

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, loss, acc, k_2_accuracy, k_3_accuracy, k_4_accuracy, k_5_accuracy, k_6_accuracy, k_7_accuracy, k_8_accuracy, k_9_accuracy, k_10_accuracy, num_correct, scores, k_2_num_correct, k_3_num_correct, k_4_num_correct, k_5_num_correct, k_6_num_correct, k_7_num_correct, k_8_num_correct, k_9_num_correct, k_10_num_correct = sess.run(
                    [
                        global_step, cnn.loss, cnn.accuracy, cnn.k_2_accuracy,
                        cnn.k_3_accuracy, cnn.k_4_accuracy, cnn.k_5_accuracy,
                        cnn.k_6_accuracy, cnn.k_7_accuracy, cnn.k_8_accuracy,
                        cnn.k_9_accuracy, cnn.k_10_accuracy, cnn.num_correct,
                        cnn.scores, cnn.k_2_num_correct, cnn.k_3_num_correct,
                        cnn.k_4_num_correct, cnn.k_5_num_correct,
                        cnn.k_6_num_correct, cnn.k_7_num_correct,
                        cnn.k_8_num_correct, cnn.k_9_num_correct,
                        cnn.k_10_num_correct
                    ], feed_dict)
                #top_k_predications=tf.nn.top_k(scores,5)
                #print(num_correct)
                #print(k_num_correct)
                print(
                    "Dev Step: step {}, loss {:g}, acc {:g},Top-2-Accuracy{:g},Top-3-Accuracy{:g},Top-4-Accuracy{:g}, Top-5-Accuracy{:g}, Top-6-Accuracy{:g}, Top-7-Accuracy{:g}, Top-8-Accuracy{:g}, Top-9-Accuracy{:g}, Top-10-Accuracy{:g}"
                    .format(step, loss, acc, k_2_accuracy, k_3_accuracy,
                            k_4_accuracy, k_5_accuracy, k_6_accuracy,
                            k_7_accuracy, k_8_accuracy, k_9_accuracy,
                            k_10_accuracy))
                return num_correct, k_2_num_correct, k_3_num_correct, k_4_num_correct, k_5_num_correct, k_6_num_correct, k_7_num_correct, k_8_num_correct, k_9_num_correct, k_10_num_correct

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            sess.run(tf.global_variables_initializer())
            # GLoVE Embedding

            #if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
            vocabulary = vocab_processor.vocabulary_
            #    initW = None
            #    if embedding_name == 'word2vec':
            #        print("Load word2vec file {}".format(cfg['word_embeddings']['word2vec']['path']))
            #        initW = data_helper.load_embedding_vectors_word2vec(vocabulary,cfg['word_embeddings']['word2vec']['path'],cfg['word_embeddings']['word2vec']['binary'])
            #        print("word2vec file has been loaded")
            #    elif embedding_name == 'glove':
            #        print("Load glove file {}".format(cfg['word_embeddings']['glove']['path']))
            #        initW = data_helper.load_embedding_vectors_glove(vocabulary,cfg['word_embeddings']['glove']['path'],embedding_dimension)
            #        print("glove file has been loaded\n")
            #    elif embedding_name == 'elmo':
            #        print("Loading Elmo Model")
            #url = "https://tfhub.dev/google/elmo/2"
            #embed = hub.Module(url, trainable=True)
            #initW = embed(tf.reshape(tf.cast(x_train, tf.string), [-1]), signature="default", as_dict=True)['default']
            #initW = embed(tf.squeeze(tf.cast(vocabulary, tf.string)), signature="default", as_dict=True)['default']
            #print (initW)

            #sess.run(cnn.W.assign(initW))

            # Use a dynamic learning rate that starts high to speed up early training and decays over time
            max_learning_rate = 0.005
            min_learning_rate = 0.0001
            decay_speed = FLAGS.decay_coefficient * len(
                y_train) / params['batch_size']
            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            counter = 0
            start_time = gmtime()  # recorded here; printed together with end_time after training
            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:

                #learning_rate = 0.001
                learning_rate = min_learning_rate + (
                    max_learning_rate - min_learning_rate) * math.exp(
                        -counter / decay_speed)
                counter += 1
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch, learning_rate)
                current_step = tf.train.global_step(sess, global_step)
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    k_2_total_dev_correct = 0
                    k_3_total_dev_correct = 0
                    k_4_total_dev_correct = 0
                    k_5_total_dev_correct = 0
                    k_6_total_dev_correct = 0
                    k_7_total_dev_correct = 0
                    k_8_total_dev_correct = 0
                    k_9_total_dev_correct = 0
                    k_10_total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        num_dev_correct, k_2_num_dev_correct, k_3_num_dev_correct, k_4_num_dev_correct, k_5_num_dev_correct, k_6_num_dev_correct, k_7_num_dev_correct, k_8_num_dev_correct, k_9_num_dev_correct, k_10_num_dev_correct = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                        k_2_total_dev_correct += k_2_num_dev_correct
                        k_3_total_dev_correct += k_3_num_dev_correct
                        k_4_total_dev_correct += k_4_num_dev_correct
                        k_5_total_dev_correct += k_5_num_dev_correct
                        k_6_total_dev_correct += k_6_num_dev_correct
                        k_7_total_dev_correct += k_7_num_dev_correct
                        k_8_total_dev_correct += k_8_num_dev_correct
                        k_9_total_dev_correct += k_9_num_dev_correct
                        k_10_total_dev_correct += k_10_num_dev_correct

                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    k_2_dev_accuracy = float(k_2_total_dev_correct) / len(
                        y_dev)
                    k_3_dev_accuracy = float(k_3_total_dev_correct) / len(
                        y_dev)
                    k_4_dev_accuracy = float(k_4_total_dev_correct) / len(
                        y_dev)
                    k_5_dev_accuracy = float(k_5_total_dev_correct) / len(
                        y_dev)
                    k_6_dev_accuracy = float(k_6_total_dev_correct) / len(
                        y_dev)
                    k_7_dev_accuracy = float(k_7_total_dev_correct) / len(
                        y_dev)
                    k_8_dev_accuracy = float(k_8_total_dev_correct) / len(
                        y_dev)
                    k_9_dev_accuracy = float(k_9_total_dev_correct) / len(
                        y_dev)
                    k_10_dev_accuracy = float(k_10_total_dev_correct) / len(
                        y_dev)
                    print("\n\n")
                    logging.critical(
                        'Accuracy on dev set: {}'.format(dev_accuracy))
                    logging.critical('Top-2 Accuracy on dev set: {}'.format(
                        k_2_dev_accuracy))
                    logging.critical('Top-3 Accuracy on dev set: {}'.format(
                        k_3_dev_accuracy))
                    logging.critical('Top-4 Accuracy on dev set: {}'.format(
                        k_4_dev_accuracy))
                    logging.critical('Top-5 Accuracy on dev set: {}'.format(
                        k_5_dev_accuracy))
                    logging.critical('Top-6 Accuracy on dev set: {}'.format(
                        k_6_dev_accuracy))
                    logging.critical('Top-7 Accuracy on dev set: {}'.format(
                        k_7_dev_accuracy))
                    logging.critical('Top-8 Accuracy on dev set: {}'.format(
                        k_8_dev_accuracy))
                    logging.critical('Top-9 Accuracy on dev set: {}'.format(
                        k_9_dev_accuracy))
                    logging.critical('Top-10 Accuracy on dev set: {}'.format(
                        k_10_dev_accuracy))
                    print("\n\n")
                    """Step 6.2: save the model if it is the best based on accuracy of the dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))

# """Step 7: predict x_test (batch by batch)"""
# end_time=gmtime();
# print("\n\n")
# print("Start Time:",start_time)
# print("End Time:",end_time)
# test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
# total_test_correct = 0
# k_2_total_test_correct = 0
# k_3_total_test_correct = 0
# k_4_total_test_correct = 0
# k_5_total_test_correct = 0
# k_6_total_test_correct = 0
# k_7_total_test_correct = 0
# k_8_total_test_correct = 0
# k_9_total_test_correct = 0
# k_10_total_test_correct = 0
# for test_batch in test_batches:
# 	x_test_batch, y_test_batch = zip(*test_batch)
# 	num_test_correct,k_2_num_test_correct,k_3_num_test_correct,k_4_num_test_correct,k_5_num_test_correct,k_6_num_test_correct,k_7_num_test_correct,k_8_num_test_correct,k_9_num_test_correct,k_10_num_test_correct = dev_step(x_test_batch, y_test_batch)
# 	total_test_correct += num_test_correct
# 	k_2_total_test_correct += k_2_num_test_correct
# 	k_3_total_test_correct += k_3_num_test_correct
# 	k_4_total_test_correct += k_4_num_test_correct
# 	k_5_total_test_correct += k_5_num_test_correct
# 	k_6_total_test_correct += k_6_num_test_correct
# 	k_7_total_test_correct += k_7_num_test_correct
# 	k_8_total_test_correct += k_8_num_test_correct
# 	k_9_total_test_correct += k_9_num_test_correct
# 	k_10_total_test_correct += k_10_num_test_correct

# test_accuracy = float(total_test_correct) / len(y_test)
# k_2_test_accuracy = float(k_2_total_test_correct) / len(y_test)
# k_3_test_accuracy = float(k_3_total_test_correct) / len(y_test)
# k_4_test_accuracy = float(k_4_total_test_correct) / len(y_test)
# k_5_test_accuracy = float(k_5_total_test_correct) / len(y_test)
# k_6_test_accuracy = float(k_6_total_test_correct) / len(y_test)
# k_7_test_accuracy = float(k_7_total_test_correct) / len(y_test)
# k_8_test_accuracy = float(k_8_total_test_correct) / len(y_test)
# k_9_test_accuracy = float(k_9_total_test_correct) / len(y_test)
# k_10_test_accuracy = float(k_10_total_test_correct) / len(y_test)
# print("\n\n")
# logging.critical('Accuracy on test set is {} '.format(test_accuracy))
# logging.critical('Top-2 Accuracy on test set is {}'.format(k_2_test_accuracy))
# logging.critical('Top-3 Accuracy on test set is {}'.format(k_3_test_accuracy))
# logging.critical('Top-4 Accuracy on test set is {}'.format(k_4_test_accuracy))
# logging.critical('Top-5 Accuracy on test set is {}'.format(k_5_test_accuracy))
# logging.critical('Top-6 Accuracy on test set is {}'.format(k_6_test_accuracy))
# logging.critical('Top-7 Accuracy on test set is {}'.format(k_7_test_accuracy))
# logging.critical('Top-8 Accuracy on test set is {}'.format(k_8_test_accuracy))
# logging.critical('Top-9 Accuracy on test set is {}'.format(k_9_test_accuracy))
# logging.critical('Top-10 Accuracy on test set is {}'.format(k_10_test_accuracy))

            print("\n\n")
            logging.critical('The training is complete')
            end_time = gmtime()
            print("\n\n")
            print("Start Time:", start_time)
            print("End Time:", end_time)
Example #21
def train_cnn():
    """Step 0: load sentences, labels, and training parameters"""

    create_test = False

    # load train, cat and other path configurations from the parameter file
    train_file = params['train_file']
    cat_file = params['cat_file']
    test_set_dir = params['test_set_dir']
    desc_col = params["desc_col"]
    #dev_set_dir = params['dev_set_dir']

    x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file,
                                                                cat_file,
                                                                desc_col,
                                                                ispickle=False)
    """Step 1: pad each sentence to the same length and map each word to an id"""

    # MAX DOCUMENT LENGTH
    #max_document_length = max([len(x.split(' ')) for x in x_raw])
    max_document_length = params['max_document_length']
    logger.debug('The maximum length set for all transactions: {}'.format(
        max_document_length))

    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, tokenizer_fn=data_helper.tokenizer)
    #x_raw = x_raw.apply(lambda x: str(x))
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)

    if create_test:
        """Step 2: split the original dataset into train and test sets"""
        logger.info("preparing test set")
        x_, x_test, y_, y_test = train_test_split(x,
                                                  y,
                                                  test_size=0.1,
                                                  stratify=y,
                                                  random_state=42)
        logger.info("saving test set")
        # x_test / y_test are NumPy arrays (no .to_csv), so write them out with np.savetxt
        np.savetxt(os.path.join(test_set_dir, 'x_test.csv'), x_test, fmt='%d', delimiter=',')
        np.savetxt(os.path.join(test_set_dir, 'y_test.csv'), y_test, fmt='%d', delimiter=',')

        logger.debug("x_test: {}, y_test: {}".format(len(x_test), len(y_test)))

    else:
        x_ = x
        y_ = y

    logger.info("preparing dev set")
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(
        x_shuffled,
        y_shuffled,
        stratify=y_shuffled,
        test_size=params['val_set_ratio'],
        random_state=42)

    #x_dev.tocsv(os.path.join(dev_set_dir,'x_test.csv'),index=None)
    #x_dev.tocsv(os.path.join(dev_set_dir,'y_test.csv'),index=None)
    """Step 4: save the labels into labels.json since predict.py needs it"""
    logger.info("saving labels into json file")
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logger.debug('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))
    logger.debug('y_train: {}, y_dev: {}'.format(len(y_train), len(y_dev)))
    """Step 5: build a graph and cnn object"""
    logger.info("building tensorflow graph")

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=params['embedding_dim'],
                          filter_sizes=list(
                              map(int, params['filter_sizes'].split(","))),
                          num_filters=params['num_filters'],
                          l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(params['learning_rate'])
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            timestamp = time.strftime("%m%d-%H%M")
            output_dir = params['output_dir']
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", output_dir, timestamp))

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']
                }
                _, step, summaries, loss, acc = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                logger.debug("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, acc))
                train_summary_writer.add_summary(summaries, step)

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch, writer=None):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, acc, num_correct = sess.run([
                    global_step, dev_summary_op, cnn.loss, cnn.accuracy,
                    cnn.num_correct
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                logger.info("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, acc))
                #if writer:
                #    writer.add_summary(summaries, step)
                dev_summary_writer.add_summary(summaries, step)

                return num_correct

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    #dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    #total_dev_correct = 0
                    #for dev_batch in dev_batches:
                    #    x_dev_batch, y_dev_batch = zip(*dev_batch)
                    #    num_dev_correct = dev_step(x_dev_batch, y_dev_batch)
                    #    total_dev_correct += num_dev_correct

                    total_dev_correct = dev_step(x_dev, y_dev)

                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logger.info('Accuracy on dev set: {}'.format(dev_accuracy))
                    """Step 6.2: save the model if it is the best based on accuracy of the dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logger.info('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logger.info('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))

            if create_test:
                """Step 7: predict x_test (batch by batch)"""
                test_batches = data_helper.batch_iter(
                    list(zip(x_test, y_test)), params['batch_size'], 1)
                total_test_correct = 0
                #for test_batch in test_batches:
                #    x_test_batch, y_test_batch = zip(*test_batch)
                #    num_test_correct = dev_step(x_test_batch, y_test_batch)
                #    total_test_correct += num_test_correct

                total_test_correct = dev_step(x_test, y_test)

                test_accuracy = float(total_test_correct) / len(y_test)
                print('Accuracy on test set is {} based on the best model {}'.
                      format(test_accuracy, path))
                print('The training is complete')
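# Hedged usage sketch (not part of the original example): train_cnn() above reads a
# module-level `params` dict and a `logger`; a minimal, assumed setup could look like
# this. The parameter file name and the logging configuration are illustrative only.
if __name__ == '__main__':
    import json
    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # assumed parameter file holding the keys referenced above (train_file, cat_file,
    # test_set_dir, desc_col, max_document_length, val_set_ratio, embedding_dim,
    # filter_sizes, num_filters, l2_reg_lambda, learning_rate, dropout_keep_prob,
    # batch_size, num_epochs, evaluate_every, output_dir)
    params = json.loads(open('./parameters.json').read())

    train_cnn()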
Beispiel #22
0
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS(sys.argv)
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helper.load_data_and_labels(FLAGS.regret_short_story,
                                             FLAGS.drugs_consumption_data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
Beispiel #23
0
def train_cnn():
    global my_min  # by odg

    # Build the input and output arrays from the original file, and read the CNN settings from a file as well
    """Step 0: load sentences, labels, and training parameters"""
    # Load the file passed as an argument to get the sentence array (x_raw) and the per-sentence label array (y_raw); x is the network input, y the output.
    train_file = sys.argv[1]
    # Load the CNN hyperparameters, which include values such as num_epochs, batch_size, and num_filters.
    x_raw, y_raw, df, labels = data_helper.load_data_and_labels(
        train_file)  # @
    # x_raw is the dataset, y_raw the one-hot label vectors, df the dataset including labels, and labels the label names.

    parameter_file = sys.argv[2]
    params = json.loads(open(parameter_file).read())

    model_dir = sys.argv[3]  # model directory name
    max_document_length = 0
    minimum_frequency = 5  # minimum frequency a word must reach to be added to the vocabulary
    list_max_final_scores = []  # list holding only the largest value from each final_scores batch (by odg)

    if (model_dir == "new"):
        timestamp = str(int(time.time()))
        model_name = "./trained_model_" + timestamp
        # define the directory where training artifacts are written
        out_dir = os.path.abspath(os.path.join(os.path.curdir,
                                               model_name))  # ! newly created folder
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        vectorize_list = list(mytoken.tokenizer(x_raw))

        for i in vectorize_list:
            if max_document_length < len(i):
                max_document_length = len(i)

        word2Vec = Word2Vec(vectorize_list,
                            size=params['embedding_dim'] -
                            params['num_of_class'],
                            window=3,
                            min_count=minimum_frequency,
                            workers=4)
        word2Vec.save(model_name + "/word2Vec.vec")  # @
        fastText = FastText(vectorize_list,
                            size=params['embedding_dim'] -
                            params['num_of_class'],
                            window=3,
                            min_count=minimum_frequency,
                            workers=4)
        fastText.save(model_name + "/fastText.vec")  # @

        vocab_dict, _ = data_helper.build_vocab(max_document_length,
                                                word2Vec.wv.index2word,
                                                params['num_of_class'], True)
        # internal TensorFlow variable state from training (e.g. AdamOptimizer) is saved here

    else:
        out_dir = os.path.abspath(os.path.join(model_dir))
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        vocab_dict, max_document_length = data_helper.build_vocab(
            0, None, None, False)
        model_name = model_dir
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")

    # Pad all sentences to the same length and assign each word an id so each sentence becomes a sequence of ids
    """Step 1: pad each sentence to the same length and map each word to an id"""
    # Padding the sentences to a uniform length and mapping their words to ids turns the training sentences into a numeric matrix that the model can consume.
    logging.info('Length of the longest sentence: {}'.format(max_document_length))  # 21

    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length=max_document_length,
        vocabulary=vocab_dict,
        tokenizer_fn=mytoken.tokenizer)  # assigns indices to the words in the dataset #!

    x = np.array(list(vocab_processor.transform(x_raw)))  # !
    vocab_dictionary = vocab_processor.vocabulary_._mapping  # !
    y = np.array(y_raw)  # y holds the one-hot label vectors

    # Split the dataset into training and test portions
    """Step 2: split the original dataset into train and test sets"""
    x_, x_test, y_, y_test = train_test_split(x,
                                              y,
                                              test_size=0.1,
                                              random_state=42)
    # Split the sentences and labels into a training group and a test group (only 10% is held out for testing).
    # Shuffle the dataset and split the training data again into training and dev portions.
    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    # Randomly permute the training sentence array (x_) so its order differs on every run.
    # Splitting the training data into train and dev sets reduces overfitting and makes training progress easier to check; see https://blog.naver.com/2feelus/221005831312 for the overall dataset layout.
    shuffle_indices = np.random.permutation(np.arange(len(y_)))  # shuffle the indices
    x_shuffled = x_[
        shuffle_indices]  # fetch the sentences at the shuffled indices, e.g. [5 69 0 ... 0]
    y_shuffled = y_[shuffle_indices]  # fetch the matching one-hot label vectors
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled,
                                                      y_shuffled,
                                                      test_size=0.1)

    # Save the category labels to a file so they can be used at prediction time.
    # All category labels are written to labels.json; at prediction time the category order stored in this file determines how predictions are decoded.

    with open('./labels.json', 'w', encoding='utf-8-sig') as outfile:  # by odg
        json.dump(labels, outfile, indent=4, ensure_ascii=False)  # by odg

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create the TensorFlow graph and then the CNN object
    """Step 5: build a graph and cnn object"""
    # Create a new TensorFlow graph representing the dataflow of the model; the graph holds the various computation ops used for training.
    graph = tf.Graph()
    # Use a Python context manager to make the graph declared above the default graph inside the block below.
    # In a multi-process setting this lets each thread work with its own graph object.
    with graph.as_default():
        # Create a new session; allow_soft_placement=True avoids pinning ops to a specific GPU,
        # and log_device_placement=False suppresses logging of which device each op is placed on.
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.InteractiveSession(config=session_conf)  # !!
        # The session is also used as a context manager so that opening and closing are handled automatically.
        with sess.as_default():
            # Create the CNN object. Parameters: maximum sentence length (sequence_length): 912, number of categories (num_classes): 11,
            # number of words in the vocabulary (vocab_size): 52943, word embedding size (embedding_size): 50,
            # filter (kernel) sizes 3x3, 4x4, 5x5 with 32 filters in total,
            # and the L2 weight-decay strength used to curb overfitting (l2_reg_lambda): 0.0
            cnn = TextCNN(
                sequence_length=x_train.shape[1],  # maximum length of the input sentences
                num_classes=y_train.shape[1],  # number of labels (= length of the one-hot vector)
                vocab_size=len(vocab_processor.vocabulary_),  # number of words
                embedding_size=params['embedding_dim'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'],
                l2_reg_lambda=params['l2_reg_lambda'],
                vec_dir=model_name  # @
            )
            global_step = tf.Variable(0, name="global_step",
                                      trainable=False)  # !! changed from the original
            # Use the Adam optimizer as the optimization algorithm
            optimizer = tf.train.AdamOptimizer(1e-3)
            # Compute gradients of the CNN loss for gradient descent.
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            # Define the training op (used later in session.run).
            tf.summary.scalar("cnn_loss", cnn.loss)  # @@@
            tf.summary.scalar("cnn_accuracy", cnn.accuracy)  # @@@

            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Use a saver to persist the trained model
            saver = tf.train.Saver()

            # One training step: train the model with one batch
            # train_step processes one training batch; with a batch size of 50, each step trains on (and later evaluates) 50 examples.
            def train_step(x_batch, y_batch):
                # Feed the inputs and expected outputs so the model can be trained/evaluated.
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']
                }  # the dropout (node drop-out) probability is set to reduce overfitting
                # Run one training step with the values set above.
                _, step, loss, acc = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

            # One evaluation step: evaluate the model with one batch
            # dev_step evaluates the trained model on one batch.
            def dev_step(x_batch, y_batch):
                # Dropout is not used during evaluation (dropout_keep_prob: 1.0 => off).
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                # The training op (train_op) is not fed during evaluation.
                step, loss, acc, num_correct, summary, scores, final_scores = sess.run(
                    [
                        global_step, cnn.loss, cnn.accuracy, cnn.num_correct,
                        merged, cnn.scores, cnn.final_scores
                    ], feed_dict)  # @@@

                for j in final_scores:  # take the maximum of each final_scores row
                    max_final_scores = max(j)
                    list_max_final_scores.append(
                        max_final_scores)  # append each maximum to the list

                min_final_scores = min(list_max_final_scores)  # smallest value in the list
                writer.add_summary(summary, step)  # @@@

                return num_correct, min_final_scores

            # Map the words used so far to ids and save the mapping for later prediction (not used during training).
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            # Initialize the variables used by TensorFlow

            ckpt = tf.train.get_checkpoint_state(
                model_dir +
                "/checkpoints")  # get the checkpoint state (the model's saved variable values) #!!
            if ckpt and tf.train.checkpoint_exists(
                    ckpt.model_checkpoint_path):  # if a model checkpoint exists #!!
                print("Reading the model from:",
                      ckpt.model_checkpoint_path)  # !!
                saver.restore(sess, ckpt.model_checkpoint_path
                              )  # restore the model's variable values from the checkpoint file #!!
            else:  # if no model checkpoint exists #!!
                print("Creating a new model.")  # !!
                sess.run(tf.global_variables_initializer())  # !!

            # Training starts here
            # Set up the training batches; batch_iter is written as a generator, so once initialized it yields one batch per iteration of the for loop below.
            # Each training batch contains 37 examples (batch_size=37) and the training data is used for a single pass (num_epochs=1).
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            # variables for tracking the best accuracy so far
            best_accuracy, best_at_step = 0, 0

            merged = tf.summary.merge_all()  # @@@
            writer = tf.summary.FileWriter("./logs", graph=graph)  # @@@
            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                # Use zip to pull the x (input) and y (expected output) values out of the batch.
                x_train_batch, y_train_batch = zip(
                    *train_batch
                )  # * unpacks the batch; see https://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean
                # train on this batch
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                # If the current step is due for evaluation (every 200 steps by default), evaluate the model on x_dev.
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    # iterate over the dev data in batches
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        # feed the dev batch into the trained model and accumulate the number of correct predictions
                        num_dev_correct, _ = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    # report the model's accuracy
                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical(
                        'Accuracy on dev set: {}'.format(dev_accuracy))

                    # Save the model with the best prediction performance, i.e. the one from the step with the highest dev_accuracy.
                    """Step 6.2: save the model if it is the best based on accuracy on dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)  # !!
                        tf.Print(path, [path], "This is saver : ")
                        logging.critical('Saved model at {} at step {}'.format(
                            path, best_at_step))
                        logging.critical(
                            'Best accuracy is {} at step {}'.format(
                                best_accuracy, best_at_step))

            # The training and test data were split 9:1.
            # The test data was never used for training and serves to check whether the trained model generalizes.
            """Step 7: predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                num_test_correct, min_final_scores = dev_step(
                    x_test_batch, y_test_batch)
                my_min = min_final_scores  # by odg
                total_test_correct += num_test_correct

            f = open(checkpoint_dir + "_min.txt", "w")
            f.write(str(my_min))
            f.close()

            test_accuracy = float(total_test_correct) / len(y_test)

            logging.critical('Test set accuracy {}, best model {}'.format(
                test_accuracy,
                path))  # An error keeps happening here; it is only a log line, so it could probably be removed.
            logging.critical('Training complete')
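# Hedged usage sketch (not from the original example): this script reads its inputs
# from the command line, e.g.
#     python train.py ./data/train.csv ./parameters.json new
# where the third argument is "new" to train from scratch, or the path of an existing
# trained_model_* directory to resume from. The file names above are illustrative.
if __name__ == '__main__':
    my_min = 0.0  # module-level global updated inside train_cnn()
    train_cnn()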
Beispiel #24
0
def train_cnn(dataset_name):
    """Step 0: load sentences, labels, and training parameters"""
    dataset = '../dataset/'+dataset_name+'_csv/train.csv'
    testset = '../dataset/'+dataset_name+'_csv/test.csv'
    parameter_file = "./parameters.json"
    params = json.loads(open(parameter_file).read())
    x_raw, y_raw, df, labels = data_helper.load_data_and_labels(dataset,dataset_name,True)
    x_test, y_test, df, labels = data_helper.load_data_and_labels(testset,dataset_name,False)

    """Step 1: pad each sentence to the same length and map each word to an id"""
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    logging.info('The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)
    # use transform (not fit_transform) so the test set does not expand the training vocabulary
    x_test = np.array(list(vocab_processor.transform(x_test)))
    y_test = np.array(y_test)

    """Step 3: shuffle the train set and split the train set into train and dev sets"""
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

    """Step 4: save the labels into labels.json since predict.py needs it"""
    with open('./labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    """Step 5: build a graph and cnn object"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=params['embedding_dim'],
                filter_sizes=list(map(int, params['filter_sizes'].split(","))),
                num_filters=params['num_filters'])

            global_step = tf.Variable(0, name="global_step", trainable=False)
            epsilon=params['epsilon']
            num_batches_per_epoch = int((len(x_train)-1)/params['batch_size']) + 1
            learning_rate = tf.train.exponential_decay(params['learning_rate'], global_step,num_batches_per_epoch, 0.95, staircase=True)

            optimizer = tf.train.AdamOptimizer(learning_rate,epsilon)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            gradients, variables = zip(*optimizer.compute_gradients(cnn.loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 7.0)

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)

            timestamp = str(int(time.time()))
            
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "model_" + timestamp))

            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.is_training: True,
                    cnn.dropout_keep_prob: params['dropout_keep_prob']}
                _, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)
                return acc,loss

            # One evaluation step: evaluate the model with one batch
            def dev_step(x_batch, y_batch):
                feed_dict = {cnn.input_x: x_batch, 
                             cnn.input_y: y_batch,
                             cnn.is_training: False,
                             cnn.dropout_keep_prob: 1.0}
                step, loss, acc, num_correct = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.num_correct],
                                                        feed_dict)
                return num_correct

            # Save the word_to_id map since predict.py needs it
            vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0


            """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_acc,  train_loss = train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
                if current_step % params['evaluate_every'] == 0:
                    logging.critical('step: {} accuracy: {} cnn_loss: {} '.format(current_step, train_acc, train_loss))
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        num_dev_correct = dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct

                    dev_accuracy = float(total_dev_correct) / len(y_dev)
                    logging.critical('Accuracy on dev set: {}'.format(dev_accuracy))

                    """Step 6.2: save the model if it is the best based on accuracy on dev set"""
                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model at {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step))

            """Step 7: predict x_test (batch by batch)"""
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
            total_test_correct = 0
            start=time.time()
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                num_test_correct = dev_step(x_test_batch, y_test_batch)
                total_test_correct += num_test_correct
            #path = saver.save(sess, checkpoint_prefix)
            logging.critical("\nExecution time for testing = {0:.6f}".format(time.time() - start))   
            test_accuracy = float(total_test_correct) / len(y_test)
            logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path))
            logging.critical('The training is complete')
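# Hedged usage sketch (not part of the original example): train_cnn(dataset_name)
# expects ../dataset/<dataset_name>_csv/train.csv and test.csv to exist; the dataset
# name below is illustrative only.
if __name__ == '__main__':
    train_cnn('ag_news')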
Beispiel #25
0
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helper
from text_cnn import TextCNN
from config import FLAGS

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # pin the process to a single GPU
print('\n----------------Parameters--------------')  # print the parameters for inspection before training starts
for attr, value in (FLAGS.__flags.items()):
    print('{}={}'.format(attr.upper(), value))

# Load data and cut
x_train_data, y = data_helper.load_data_and_labels(FLAGS.train_data_file,
                                                   FLAGS.train_label_file)

# Padding sentence
padded_sentences_train, max_padding_length = data_helper.padding_sentence(
    sentences=x_train_data,
    padding_sentence_length=FLAGS.padding_sentence_length,
    padding_move=FLAGS.padding_move)
print(padded_sentences_train[:10])
x, vocabulary_len = data_helper.embedding_sentences(
    embedding_file=FLAGS.embedding_file,
    padded_sentences=padded_sentences_train,
    embedding_dimension=FLAGS.embedding_dimension)
print(x[:2])
# Shuffle data randomly
np.random.seed(100)
shuffle_indices = np.random.permutation(np.arange(len(y)))
Beispiel #26
0
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")
tf.flags.DEFINE_boolean("use_cached_embeddings", True,
                        "Cache embeddings locally on disk for repeated runs")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

print("Loading Data...")
q1, q2, y, x1_length, x2_length = data_helper.load_data_and_labels(
    FLAGS.training_data_file)

max_length = max(max([len(x.split(" ")) for x in q1]),
                 max([len(x.split(" ")) for x in q2]))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
print("max question length:", max_length)

#converting to embedding matrix

x_text = q1 + q2
vocab_ids = np.array(list(vocab_processor.fit_transform(x_text)))
x1 = vocab_ids[:len(q1)]
x2 = vocab_ids[len(q1):]

print("Loading Word embeddings")
vocab_dict = vocab_processor.vocabulary_._mapping
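# Hedged sketch of a possible continuation (not from the original snippet): build an
# embedding matrix for vocab_dict from a GloVe-style text file. FLAGS.embedding_file
# and FLAGS.embedding_dim are assumed flag names; words missing from the file keep a
# small random initialization.
embedding_dim = FLAGS.embedding_dim
embedding_matrix = np.random.uniform(-0.25, 0.25,
                                     (len(vocab_dict), embedding_dim)).astype(np.float32)
with open(FLAGS.embedding_file, encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split(' ')
        word, vector = parts[0], parts[1:]
        if word in vocab_dict and len(vector) == embedding_dim:
            embedding_matrix[vocab_dict[word]] = np.asarray(vector, dtype=np.float32)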
Beispiel #27
0
    r"C:\Users\satyasaideepthi\PycharmProjects\DL_LAB2\data\rt-polarity.neg",
    "Data source for the negative data.")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helper.load_data_and_labels(FLAGS.positive_data_file,
                                             FLAGS.negative_data_file)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
Beispiel #28
0
tf.flags.DEFINE_string("embedding_file","./glove.6B/glove.6B.100d.txt","pretrained embediing file")

tf.flags.DEFINE_integer("batch_size",512,"size of batch")
tf.flags.DEFINE_string("checkpoint_dir","","checkpoint file")

tf.flags.DEFINE_boolean("allow_soft_placement",True,"Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement",False,"Log placement of ops on devices")

FLAGS=tf.flags.FLAGS
FLAGS._parse_flags()
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

print("Loading Data...")
q1,q2,y_label,q1_length,q2_length=data_helper.load_data_and_labels(FLAGS.testing_file)

x_text=q1+q2
vocab_path=os.path.join(FLAGS.checkpoint_dir,"..","vocab")
vocab_processor=learn.preprocessing.VocabularyProcessor.restore(vocab_path)
vocab_ids=np.array(list(vocab_processor.transform(x_text)))

x1_test=vocab_ids[:len(q1)]
x2_test=vocab_ids[len(q1):]
y_test=np.argmax(y_label,axis=1)

checkpoint_file=tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph=tf.Graph()
with graph.as_default():
    session_conf=tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,