Example #1
def test_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    model = RNNModel(embeddings, num_classes=5)
    model.load(config.get('data', 'model_dir'))
    test_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'test.txt'))
    numeric_test_samples = data_helper.convert_to_numeric_samples(test_data, vocab, num_classes=5)
    model.eval(numeric_test_samples)
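
A note on the helper used above: `data_helper.load_embeddings` is project code that is not shown here. As a rough sketch of what such a loader typically does for a word2vec-style text file (one token followed by its vector per line), something like the following could be used; the file format and return types are assumptions, not the project's actual implementation.

import numpy as np

def load_embeddings_sketch(path, expected_dim=300):
    """Read a word2vec-style text file into (vocab, embedding matrix)."""
    vocab = {}        # token -> row index in the matrix
    vectors = []      # one np.ndarray row per token
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != expected_dim + 1:
                continue  # skip the header line or malformed rows
            token, values = parts[0], parts[1:]
            vocab[token] = len(vectors)
            vectors.append(np.asarray(values, dtype=np.float32))
    return vocab, np.vstack(vectors)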
Example #2
def train_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    train_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'train.txt'))
    numeric_train_samples = data_helper.convert_to_numeric_samples(train_data, vocab, num_classes=5)
    model = RNNModel(embeddings, num_classes=5, model_config=config['model'])
    dev_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'dev.txt'))
    numeric_dev_samples = data_helper.convert_to_numeric_samples(dev_data, vocab, num_classes=5)
    eval_func = lambda: model.eval(numeric_dev_samples)
    model.train(numeric_train_samples, eval_func)
    model.save(config.get('data', 'model_dir'))
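
Example #2 hands the model a zero-argument `eval_func` closure so training can score the dev set periodically without the trainer knowing anything about the dev data. Purely as an illustration of that callback pattern (the names below are hypothetical, not the RNNModel API):

def train_with_eval_callback(step_fn, batches, num_epochs, eval_func):
    """Run training batches and call eval_func after every epoch."""
    for epoch in range(num_epochs):
        for x_batch, y_batch in batches:
            step_fn(x_batch, y_batch)   # one optimization step
        eval_func()                     # e.g. lambda: model.eval(numeric_dev_samples)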
Example #3
def load_and_train_abstract():
    label2idx = json.load(open('../data/journal2idx.json', 'r'))
    label_num = len(label2idx) + 1
    embedding_path = '../data/glove.42B.300d.50K.w2v.txt'
    embeddings, vocab, embedding_size = load_embeddings(embedding_path, 100000)
    X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label(
        '../data/dataset_abstract_stat_50.npy', embeddings, vocab)
    model = models.baseline_abstract_cnn_model(embeddings, len(vocab),
                                               embedding_size, label_num)
    train_and_val('../models/abstract_cnn_baseline', model, X_train, y_train,
                  X_val, y_val)
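
`train_and_val` is another project helper. Assuming the `models.*` builders return compiled Keras models, a minimal sketch of such a train-and-validate routine with best-weights checkpointing might look like this (signature inferred from the call above, not taken from the project):

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def train_and_val_sketch(model_path, model, X_train, y_train, X_val, y_val,
                         epochs=20, batch_size=64):
    """Fit the model and keep the weights with the best validation loss."""
    callbacks = [
        ModelCheckpoint(model_path + '.h5', monitor='val_loss', save_best_only=True),
        EarlyStopping(monitor='val_loss', patience=3),
    ]
    return model.fit(X_train, y_train,
                     validation_data=(X_val, y_val),
                     epochs=epochs, batch_size=batch_size,
                     callbacks=callbacks)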
Example #4
def load_and_train_ref_abs(has_rank=False):
    label2idx = json.load(open('../data/journal2idx.json', 'r'))
    label_num = len(label2idx) + 1
    embedding_path = '../data/glove.42B.300d.50K.w2v.txt'
    embeddings, vocab, embedding_size = load_embeddings(embedding_path, 100000)
    X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label(
        '../data/dataset_abstract_stat_50.npy', embeddings, vocab)

    journal2idx_all = json.load(open('../data/journal2idx_all.json', 'r'))

    X_train_ref, _, _, X_val_ref, _, y_val_r_ref, X_test_ref, _, _ = load_ref_chain_to_label(
        '../data/dataset_ref_chain_stat_50.npy')

    model = models.reference_abstract_model(
        embeddings,
        word_vocab_size=len(vocab),
        word_embedding_dim=embedding_size,
        ref_vocab_size=len(journal2idx_all),
        ref_embedding_dim=embedding_size,
        label_num=label_num,
        has_rank=has_rank)

    if has_rank:
        y_train = [np.array(y_train), np.array(y_train_r)]
        y_val = [np.array(y_val), np.array(y_val_r)]
        train_and_val(
            '../models/ref_abs_cnn_with_rank', model,
            [np.array(X_train), np.array(X_train_ref)], y_train,
            [np.array(X_val), np.array(X_val_ref)], y_val)
    else:
        train_and_val(
            '../models/ref_abs_cnn_baseline', model,
            [np.array(X_train), np.array(X_train_ref)], y_train,
            [np.array(X_val), np.array(X_val_ref)], y_val)
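
Example #4 feeds the model two input arrays (abstract tokens and reference ids) and, with `has_rank=True`, two target arrays. `reference_abstract_model` is project code; just to illustrate the two-branch input pattern it relies on, a minimal Keras functional sketch (hypothetical layer choices, not the project's architecture):

from tensorflow.keras import layers, Model

def two_input_sketch(word_vocab_size, ref_vocab_size, embedding_dim, label_num):
    """Embed abstract tokens and reference ids separately, then merge for classification."""
    abstract_in = layers.Input(shape=(None,), name='abstract_tokens')
    refs_in = layers.Input(shape=(None,), name='reference_ids')
    a = layers.GlobalAveragePooling1D()(layers.Embedding(word_vocab_size, embedding_dim)(abstract_in))
    r = layers.GlobalAveragePooling1D()(layers.Embedding(ref_vocab_size, embedding_dim)(refs_in))
    merged = layers.concatenate([a, r])
    out = layers.Dense(label_num, activation='softmax')(merged)
    return Model(inputs=[abstract_in, refs_in], outputs=out)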
Example #5
def train_cnn_rnn(input_file, training_config):
	epochs=10
#	input_file = sys.argv[1]
	x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(input_file)

#	training_config = sys.argv[2]
	params = json.loads(open(training_config).read())

	# Assign a 300-dimensional vector to each word
	word_embeddings = data_helper.load_embeddings(vocabulary)
	embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
	embedding_mat = np.array(embedding_mat, dtype = np.float32)

	# Split the original dataset into train set and test set
	x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1, random_state=16)

	# Split the train set into train set and dev set
	x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=16)

	logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
	logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

	# Create a directory, everything related to the training will be saved in this directory
	timestamp = str(int(time.time()))
	trained_dir = './trained_results_' + timestamp + '/'
	if os.path.exists(trained_dir):
		shutil.rmtree(trained_dir)
	os.makedirs(trained_dir)

	graph = tf.Graph()
	with graph.as_default():
		session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
		sess = tf.Session(config=session_conf)
		with sess.as_default():
			cnn_rnn = TextCNNRNN(
				embedding_mat=embedding_mat,
				sequence_length=x_train.shape[1],
				num_classes = y_train.shape[1],
				non_static=params['non_static'],
				hidden_unit=params['hidden_unit'],
				max_pool_size=params['max_pool_size'],
				filter_sizes=list(map(int, params['filter_sizes'].split(","))),
				num_filters = params['num_filters'],
				embedding_size = params['embedding_dim'],
				l2_reg_lambda = params['l2_reg_lambda'])

			global_step = tf.Variable(0, name='global_step', trainable=False)
			optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
			grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
			train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

			# Checkpoint files will be saved in this directory during training
			checkpoint_dir = './checkpoints_' + timestamp + '/'
			if os.path.exists(checkpoint_dir):
				shutil.rmtree(checkpoint_dir)
			os.makedirs(checkpoint_dir)
			checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

			def real_len(batches):
				return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches]

			def train_step(x_batch, y_batch):
				feed_dict = {
					cnn_rnn.input_x: x_batch,
					cnn_rnn.input_y: y_batch,
					cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
					cnn_rnn.batch_size: len(x_batch),
					cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
					cnn_rnn.real_len: real_len(x_batch),
				}
				_, step, loss, accuracy = sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

			def dev_step(x_batch, y_batch):
				feed_dict = {
					cnn_rnn.input_x: x_batch,
					cnn_rnn.input_y: y_batch,
					cnn_rnn.dropout_keep_prob: 1.0,
					cnn_rnn.batch_size: len(x_batch),
					cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
					cnn_rnn.real_len: real_len(x_batch),
				}
				step, loss, accuracy, num_correct, predictions = sess.run(
					[global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
				return accuracy, loss, num_correct, predictions

			saver = tf.train.Saver(tf.global_variables())
			sess.run(tf.global_variables_initializer())

			# Training starts here
			# Materialize one epoch of batches so the outer epoch loop below can
			# iterate them repeatedly; a generator would be exhausted after one pass.
			train_batches = list(data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], 1))
			best_accuracy, best_at_step = 0, 0

			# Train the model with x_train and y_train
			for epoch in range(epochs):
				for train_batch in train_batches:
					x_train_batch, y_train_batch = zip(*train_batch)
					train_step(x_train_batch, y_train_batch)
					current_step = tf.train.global_step(sess, global_step)

					# Evaluate the model with x_dev and y_dev
					if current_step % params['evaluate_every'] == 0:
						dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)

						total_dev_correct = 0
						for dev_batch in dev_batches:
							x_dev_batch, y_dev_batch = zip(*dev_batch)
							acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
							total_dev_correct += num_dev_correct
						accuracy = float(total_dev_correct) / len(y_dev)
						logging.info('Accuracy on dev set: {}'.format(accuracy))

						if accuracy >= best_accuracy:
							best_accuracy, best_at_step = accuracy, current_step
							path = saver.save(sess, checkpoint_prefix, global_step=current_step)
							logging.critical('Saved model {} at step {}'.format(path, best_at_step))
							logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
			logging.critical('Training is complete, testing the best model on x_test and y_test')

			# Save the model files to trained_dir. predict.py needs trained model files. 
			saver.save(sess, trained_dir + "best_model.ckpt")

			# Evaluate x_test and y_test
			saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
			acc, loss, num_test_correct, predictions = dev_step(x_test, y_test)
			from sklearn.metrics import recall_score, f1_score, accuracy_score

			# Convert one-hot test labels to class indices for the sklearn metrics
			y_test = [np.argmax(y_t) for y_t in y_test]
			print(sorted(list(set(y_test))))

			recall_l = recall_score(y_test, predictions, average=None)
			f1_scores = f1_score(y_test, predictions, average=None)
			acc_score = accuracy_score(y_test, predictions)
			total_test_correct = int(num_test_correct)
			logging.critical('Recall on test set: ' + str(recall_l))
			logging.critical('Acc on test set: ' + str(acc_score))
			logging.critical('F1 on test set: ' + str(f1_scores))
			logging.critical('Accuracy on test set: {}'.format(float(total_test_correct) / len(y_test)))
			print(len(labels))
			print(len(recall_l))
			print(len(f1_scores))
			labels_ = [labels[n] for n in sorted(list(set(y_test)))]

			# Write per-label recall and F1 to a CSV report
			df_ = pd.DataFrame()
			df_["labels"] = labels_
			df_["recall"] = recall_l
			df_["f1"] = f1_scores
			df_.to_csv("metrics.csv", index=False)

	# Save trained parameters and files since predict.py needs them
	with open(trained_dir + 'words_index.json', 'w') as outfile:
		json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
	with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
		pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
	with open(trained_dir + 'labels.json', 'w') as outfile:
		json.dump(labels, outfile, indent=4, ensure_ascii=False)

	params['sequence_length'] = x_train.shape[1]
	with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
		json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
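
Example #5 (and several of the examples below) iterates `data_helper.batch_iter(data, batch_size, num_epochs)`. The helper itself is not shown; a typical implementation of this generator in TensorFlow text-classification code looks roughly like the following sketch:

import numpy as np

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    """Yield (optionally shuffled) mini-batches of `data` for `num_epochs` passes."""
    data = np.array(data, dtype=object)
    num_batches = (len(data) - 1) // batch_size + 1
    for _ in range(num_epochs):
        order = np.random.permutation(len(data)) if shuffle else np.arange(len(data))
        for b in range(num_batches):
            idx = order[b * batch_size:(b + 1) * batch_size]
            yield data[idx]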
Example #6
def main(args):

    argc = len(args)
    data_dir = args[1]  #train/reviews
    train_text = args[2]  # 'paper' # paper, review, all
    model_name = args[3]  #rnn cnn dan
    label = int(args[4])

    (x_train, y_train, x_dev, y_dev, x_test, y_test),\
        vocab, vocab_inv, label_scale, aspects = \
        prepare_data(
            data_dir,
            max_vocab_size = 35000,
            max_len_paper = 1000,
            max_len_review = 200)

    # choose only given aspect as label among different aspects
    if label >= 0:
        aspects = [aspects[label]]
        print('Labels:', aspects)

    # extract only data of interest
    x_train,y_train,evaluate_mean_train,evaluate_major_train,mean_aspects_train,major_aspects_train = \
        choose_label(x_train, y_train, size = label_scale, label=label)
    x_dev,y_dev,evaluate_mean_dev,evaluate_major_dev,_,_ = \
        choose_label(x_dev, y_dev, size = label_scale, label=label)
    x_test,y_test,evaluate_mean_test,evaluate_major_test,_,_ = \
        choose_label(x_test, y_test, size = label_scale, label=label)

    # get mean/major from train on test
    evaluate_mean = []
    evaluate_major = []
    for aid, y_aspect in enumerate(y_test.T):
        mean_aspect = mean_aspects_train[aid]
        major_aspect = major_aspects_train[aid]
        evaluate_mean_aspect = evaluate(y_aspect,
                                        [mean_aspect] * len(y_aspect))
        evaluate_major_aspect = evaluate(y_aspect,
                                         [major_aspect] * len(y_aspect))
        evaluate_mean.append(evaluate_mean_aspect)
        evaluate_major.append(evaluate_major_aspect)
    print('Majority (Test)')
    for mean, major, a in zip(evaluate_mean, evaluate_major, aspects):
        print('\t%15s\t%.4f\t%.4f' % (a, mean, major))
    print('\t%15s\t%.4f\t%.4f' %
          ('TOTAL', np.average(evaluate_mean), np.average(evaluate_major)))

    # choose train text
    if train_text == 'paper':
        x_train = x_train[0]
        x_dev = x_dev[0]
        x_test = x_test[0]
    elif train_text == 'review':
        x_train = x_train[1]
        x_dev = x_dev[1]
        x_test = x_test[1]
    elif train_text == 'all':
        x_train = np.concatenate(x_train, axis=1)
        x_dev = np.concatenate(x_dev, axis=1)
        x_test = np.concatenate(x_test, axis=1)
    else:
        print('Unknown train_text option:', train_text)
        sys.exit(1)
    max_len = x_train.shape[1]

    print('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev),
                                                      len(x_test)))
    print('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev),
                                                      len(y_test)))

    timestamp = str(int(time.time()))
    trained_dir = './trained_results/' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    model, config = models(model_name)
    config.seq_length = max_len
    config.vocab_size = len(vocab)
    config.num_classes = len(aspects)

    #load embedding or None
    embedding_mat = load_embeddings(
        vocab, load="/data/word2vec/glove.840B.300d.w2v.bin")  #None

    # loading a model
    model = model(config, embedding=embedding_mat)

    def feed_data(x_batch, y_batch, keep_prob):
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.keep_prob: keep_prob
        }
        return feed_dict

    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    sess = tf.Session(config=session_conf)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    if embedding_mat is not None:
        sess.run([model.embedding_init],
                 feed_dict={model.embedding_placeholder: embedding_mat})

    # Checkpoint files will be saved in this directory during training
    checkpoint_dir = './ckpts/' + timestamp + '/'
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

    ##############################
    # Training starts here
    ##############################
    best_loss = np.inf
    best_at_step = 0
    for epoch in range(config.num_epochs):

        train_batches = batch_iter(list(zip(x_train, y_train)),
                                   config.batch_size)
        train_losses = []
        for train_batch in train_batches:
            x_train_batch, y_train_batch = list(zip(*train_batch))
            feed_dict = feed_data(x_train_batch, y_train_batch,
                                  config.dropout_keep_prob)

            current_step, train_loss, _ = sess.run(
                [model.global_step, model.loss, model._train_op], feed_dict)
            train_losses.append(train_loss)

            if current_step % config.print_per_batch == 0:
                print('[%d/%d] %.4f' %
                      (epoch, current_step, np.average(train_losses)))

            # evaluate the model with x_dev and y_dev
            if current_step % config.save_per_batch == 0:
                dev_batches = batch_iter(list(zip(x_dev, y_dev)),
                                         config.batch_size)
                dev_losses = []
                aspect_all_ys = {i: [] for i in range(len(aspects))}
                aspect_all_ys_ = {i: [] for i in range(len(aspects))}
                for dev_batch in dev_batches:
                    x_dev_batch, y_dev_batch = list(zip(*dev_batch))
                    feed_dict = feed_data(x_dev_batch, y_dev_batch, 1.0)
                    dev_loss, dev_logit = sess.run([model.loss, model.logits],
                                                   feed_dict)
                    dev_losses.append(dev_loss)
                    #import pdb; pdb.set_trace()
                    dev_y = np.array([d for d in dev_batch[:, 1]])
                    for aid, (y, y_) in enumerate(zip(dev_y.T, dev_logit.T)):
                        aspect_all_ys[aid].extend(list(y))
                        aspect_all_ys_[aid].extend(list(y_))
                dev_aspect = []
                for aid in range(len(aspects)):
                    ys = aspect_all_ys[aid]
                    ys_ = aspect_all_ys_[aid]
                    dev_aspect.append(evaluate(ys, ys_))
                #for a,r in zip(aspects, dev_aspect):
                #  print '\t%20s\t%.4f'%(a,r)
                #print '\t%20s\t%.4f'%('TOTAL',np.average(dev_aspect))
                print('[%d] dev loss: %.6f, acc: %.6f' %
                      (current_step, np.average(dev_losses),
                       np.average(dev_aspect)))

                # test
                test_batches = batch_iter(list(zip(x_test, y_test)),
                                          config.batch_size,
                                          shuffle=False)
                aspect_all_ys = {}  #[[]] * len(aspects)
                aspect_all_ys_ = {}  #[[]] * len(aspects)
                for i in range(len(aspects)):
                    aspect_all_ys[i] = []
                    aspect_all_ys_[i] = []
                for test_batch in test_batches:
                    x_test_batch, y_test_batch = list(zip(*test_batch))
                    feed_dict = feed_data(x_test_batch, y_test_batch, 1.0)
                    test_loss, test_logit = sess.run(
                        [model.loss, model.logits], feed_dict)
                    test_y = np.array([d for d in test_batch[:, 1]])
                    for aid, (y, y_) in enumerate(zip(test_y.T, test_logit.T)):
                        aspect_all_ys[aid].extend(list(y))
                        aspect_all_ys_[aid].extend(list(y_))
                test_aspect = []
                for aid in range(len(aspects)):
                    ys = aspect_all_ys[aid]
                    ys_ = aspect_all_ys_[aid]
                    test_aspect.append(evaluate(ys, ys_))
                print('[%d] test loss: %.4f' %
                      (current_step, np.average(test_aspect)))

                if np.average(dev_losses) <= best_loss:
                    best_loss, best_at_step = np.average(
                        dev_losses), current_step
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print('Best loss %.2f at step %d' %
                          (best_loss, best_at_step))
        #print 'Epoch done'
    print('Training is complete, testing the best model on x_test and y_test')

    print('Best epoch', best_at_step)
    saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))

    test_batches = batch_iter(list(zip(x_test, y_test)),
                              config.batch_size,
                              shuffle=False)
    aspect_all_ys = {}  #[[]] * len(aspects)
    aspect_all_ys_ = {}  #[[]] * len(aspects)
    for i in range(len(aspects)):
        aspect_all_ys[i] = []
        aspect_all_ys_[i] = []
    for test_batch in test_batches:
        x_test_batch, y_test_batch = list(zip(*test_batch))
        feed_dict = feed_data(x_test_batch, y_test_batch, 1.0)
        test_loss, test_logit = sess.run([model.loss, model.logits], feed_dict)
        test_y = np.array([d for d in test_batch[:, 1]])
        for aid, (y, y_) in enumerate(zip(test_y.T, test_logit.T)):
            aspect_all_ys[aid].extend(list(y))
            aspect_all_ys_[aid].extend(list(y_))
    evaluate_aspect = []
    for aid in range(len(aspects)):
        ys = aspect_all_ys[aid]
        ys_ = aspect_all_ys_[aid]
        evaluate_aspect.append(evaluate(ys, ys_))
    for a, r in zip(aspects, evaluate_aspect):
        print('\t%20s\t%.4f' % (a, r))
    print('\t%20s\t%.4f' % ('TOTAL', np.average(evaluate_aspect)))
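
Example #6 compares the trained model against per-aspect mean and majority baselines derived from the training labels (returned here by `choose_label`). As a small illustration of how such baselines can be computed from a (samples, aspects) label matrix (names are hypothetical):

import numpy as np
from collections import Counter

def aspect_baselines(y_train):
    """Per-aspect mean and majority value of the training labels."""
    y_train = np.asarray(y_train, dtype=np.float64)
    mean_aspects = y_train.mean(axis=0)
    major_aspects = np.array([Counter(col).most_common(1)[0][0] for col in y_train.T])
    return mean_aspects, major_aspects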
Example #7
def train_cnn_rnn():
    input_file = sys.argv[1]
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info("x_train: {}, x_dev: {}, x_test: {}".format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info("y_train: {}, y_dev: {}, y_test: {}".format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = "./trained_results_" + timestamp + "/"
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params["non_static"],
                hidden_unit=params["hidden_unit"],
                max_pool_size=params["max_pool_size"],
                filter_sizes=list(map(int, params["filter_sizes"].split(","))),
                num_filters=params["num_filters"],
                embedding_size=params["embedding_dim"],
                l2_reg_lambda=params["l2_reg_lambda"],
            )

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = "./checkpoints_" + timestamp + "/"
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params["max_pool_size"])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params["dropout_keep_prob"],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params["embedding_dim"], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params["embedding_dim"], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [
                        global_step,
                        cnn_rnn.loss,
                        cnn_rnn.accuracy,
                        cnn_rnn.num_correct,
                        cnn_rnn.predictions,
                    ],
                    feed_dict,
                )
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params["batch_size"],
                                                   params["num_epochs"])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params["evaluate_every"] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params["batch_size"], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info("Accuracy on dev set: {}".format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical("Saved model {} at step {}".format(
                            path, best_at_step))
                        logging.critical("Best accuracy {} at step {}".format(
                            best_accuracy, best_at_step))
            logging.critical(
                "Training is complete, testing the best model on x_test and y_test"
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + "-" + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params["batch_size"],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical("Accuracy on test set: {}".format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + "words_index.json", "w") as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + "embeddings.pickle", "wb") as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + "labels.json", "w") as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params["sequence_length"] = x_train.shape[1]
    with open(trained_dir + "trained_parameters.json", "w") as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
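
A note on the `real_len` helper that Examples #5, #7 and #11 pass into the feed dict: it estimates, for each padded row, how many time steps survive the pooling stage. `np.argmin(batch + [0])` locates the first padding id 0 in the row (the trailing `[0]` guarantees a zero exists when the row is a plain list), and `np.ceil(... / max_pool_size)` converts that token count into a pooled length; for example, 7 real tokens with `max_pool_size = 4` give `ceil(7 / 4) = 2`. A standalone version of the same logic, outside the TensorFlow graph:

import numpy as np

def real_len_sketch(padded_rows, max_pool_size):
    """Pooled sequence length per row, where 0 is the padding token id."""
    return [int(np.ceil(np.argmin(list(row) + [0]) / max_pool_size))
            for row in padded_rows]

# real_len_sketch([[5, 9, 2, 7, 1, 3, 8, 0, 0, 0]], max_pool_size=4) -> [2]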
Example #8
def train_cnn_rnn():
    x_, y_, x_test, y_test, vocabulary, vocabulary_inv, labels = data_helper.load_data()
    #x_, y_, vocabulary, vocabulary_inv, labels = data_helper.load_data_book()
    training_config = 'training_config.json'
    params = json.loads(open(training_config).read())

    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = []
    for i in range(len(vocabulary_inv)):
        embedding_mat.append(word_embeddings[vocabulary_inv[i]])
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set

    # Split the train set into train set and dev set
    # IMDB style
    # x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)

    # Book data style
    #x_, x_test, y_, y_test = train_test_split(x_, y_, test_size=0.1)
    x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():

            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=[
                                     int(x)
                                     for x in params['filter_sizes'].split(",")
                                 ],
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            #optimizer = tf.train.MomentumOptimizer(0.1, 0.9)
            optimizer = tf.train.AdamOptimizer()
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn_rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn_rnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                summaries, _, step, loss, accuracy = sess.run([
                    train_summary_op, train_op, global_step, cnn_rnn.loss,
                    cnn_rnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)
                # print(accuracy)
                return accuracy

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                summaries, step, loss, accuracy, num_correct, predictions = sess.run(
                    [
                        dev_summary_op, global_step, cnn_rnn.loss,
                        cnn_rnn.accuracy, cnn_rnn.num_correct,
                        cnn_rnn.predictions
                    ], feed_dict)
                dev_summary_writer.add_summary(summaries, step)
                print("step {}, loss {:g}, acc {:g}".format(
                    step, loss, accuracy))
                return accuracy, loss, num_correct, predictions

            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_dev_accuracy, best_at_step = 0, 0
            best_test_accuracy = 0
            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_acc = train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:

                    print("Training Accuracy:", train_acc, end=' ')
                    print("Evaluation:", end=' ')
                    dev_acc, _, _, _ = dev_step(x_dev, y_dev)
                    print("Test:", end=' ')
                    test_acc_tmp, _, _, pred__ = dev_step(x_test, y_test)
                    # with open('results/prediction' + str(current_step), 'bw') as f:
                    #     pickle.dump(pred__, f)
                    if dev_acc > best_dev_accuracy:
                        best_dev_accuracy = dev_acc
                        best_test_accuracy = test_acc_tmp
                        best_at_step = current_step
                        # Checkpoint the best model so the final restore below works
                        saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('best dev accuracy is', best_dev_accuracy,
                          'the test is', best_test_accuracy)
            print(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Evaluate x_test and y_test

            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    # os.rename(path, trained_dir + 'best_model.ckpt')
    # os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
Example #9
def train_cnn():
    input_file = sys.argv[1]
    if os.path.exists('./data/x.p') and \
            os.path.exists('./data/y.p') and \
            os.path.exists('./data/vocabulary.p') and \
            os.path.exists('./data/vocabulary_inv.p') and \
            os.path.exists('./data/labels.p'):
        x_ = pickle.load(open("./data/x.p", "rb"))
        y_ = pickle.load(open("./data/y.p", "rb"))
        vocabulary = pickle.load(open("./data/vocabulary.p", "rb"))
        vocabulary_inv = pickle.load(open("./data/vocabulary_inv.p", "rb"))
        labels = pickle.load(open("./data/labels.p", "rb"))
    else:
        x_, y_, vocabulary, vocabulary_inv, _, labels = data_helper.load_data(
            input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign an n-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary,
                                                  dim=params['embedding_dim'])
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(embedding_mat=embedding_mat,
                          non_static=params['non_static'],
                          sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocabulary),
                          embedding_size=params['embedding_dim'],
                          filter_sizes=list(
                              map(int, params['filter_sizes'].split(","))),
                          num_filters=params['num_filters'],
                          l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            optimizer = tf.train.AdamOptimizer(learning_rate=1e-3,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08,
                                               use_locking=False,
                                               name='Adam')
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: params['dropout_keep_prob'],
                }
                _, step, summaries, loss_, accuracy_ = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss_, accuracy_))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0,
                }
                step, summaries, loss_, accuracy_, predictions_ = sess.run([
                    global_step, dev_summary_op, cnn.loss, cnn.accuracy,
                    cnn.predictions
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("evaluation on test set:")
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss_, accuracy_))
                if writer:
                    writer.add_summary(summaries, step)
                return accuracy_, loss_, predictions_

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])

            # Train the model with x_train and y_train
            i = 0
            for train_batch in train_batches:
                logging.info('Training on batch: {}'.format(i))
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, predictions = dev_step(
                            x_dev_batch,
                            y_dev_batch,
                            writer=dev_summary_writer)

                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    logging.critical('Saved model {} at step {}'.format(
                        path, i))
                i += 1
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Evaluate x_test and y_test
            # Restore the most recently saved checkpoint (checkpoints are only
            # written every `evaluate_every` steps, so `i` may not match one)
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, predictions = dev_step(x_test_batch, y_test_batch)
                # dev_step here returns (accuracy, loss, predictions); derive the
                # correct count from predicted class ids and the one-hot labels
                total_test_correct += int(np.sum(
                    np.argmax(np.array(y_test_batch), axis=1) == predictions))
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    os.rename(path, trained_dir + 'best_model.ckpt')
    os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
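
Every variant above writes words_index.json, embeddings.pickle, labels.json and trained_parameters.json into trained_dir because predict.py relies on them. A minimal sketch of reloading those artifacts on the prediction side (file names follow the training code above; the prediction graph itself is omitted):

import json
import pickle

def load_trained_artifacts(trained_dir):
    """Reload vocabulary, embedding matrix, labels and hyper-parameters saved by training."""
    with open(trained_dir + 'words_index.json') as f:
        vocabulary = json.load(f)
    with open(trained_dir + 'embeddings.pickle', 'rb') as f:
        embedding_mat = pickle.load(f)
    with open(trained_dir + 'labels.json') as f:
        labels = json.load(f)
    with open(trained_dir + 'trained_parameters.json') as f:
        params = json.load(f)
    return vocabulary, embedding_mat, labels, params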
Example #10

if __name__ == "__main__":
    args = set_args()

    vocab_file = '../data/vocab.txt'
    train_file = '../data/LCQMC/LCQMC.train.data'
    valid_file = '../data/LCQMC/LCQMC.valid.data'
    embeddings_file = '../data/token_vec_300.bin'

    print('Loading the training set...')
    train_data = LCQMC_Dataset(train_file, vocab_file, args.max_char_len)
    train_loader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=args.train_batch_size)

    print('Loading the validation set...')
    dev_data = LCQMC_Dataset(valid_file, vocab_file, args.max_char_len)
    dev_loader = DataLoader(dev_data,
                            shuffle=True,
                            batch_size=args.dev_batch_size)

    print('Loading word embeddings...')
    embeddings = load_embeddings(embeddings_file)

    model = Model(embeddings)
    if torch.cuda.is_available():
        model.cuda()

    train()
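
The `train()` called at the end of Example #10 is project code that presumably picks up `model`, `train_loader` and `dev_loader` from the surrounding module. As a rough PyTorch sketch of one training epoch over such a DataLoader (the loss, optimizer and batch layout are assumptions, not the project's actual routine):

import torch.nn as nn

def train_one_epoch(model, loader, optimizer, device='cpu'):
    """One pass over a DataLoader assumed to yield (q1, q2, label) batches."""
    criterion = nn.CrossEntropyLoss()
    model.train()
    total_loss = 0.0
    for q1, q2, labels in loader:
        q1, q2, labels = q1.to(device), q2.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(q1, q2)          # assumed two-sentence forward signature
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / max(len(loader), 1)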
Example #11
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "wcData85_1.csv"

    # 	with open(input_file,"r",encoding="utf8") as datFile:
    # 		jsonDict=json.loads(datFile.readline())
    # 	with open(input_file,"r",encoding="utf8") as datFile:
    # 		jsonDf=pd.DataFrame([],columns=list(jsonDict.keys()))
    # 		rowNO=0
    # 		for row in datFile.readlines():
    # 			try:
    # 				jsonDf.loc[rowNO]=list(json.loads(row).values())
    # 			except json.decoder.JSONDecodeError as ex:
    # 				print(ex.tostring)
    # 			rowNO+=1
    # 		jsonDf.to_csv(output_file)

    print("loading data...")
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data3(
        output_file, ["crit", "err"], 10000)
    # 	print("y_:",y_)
    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(
                                     int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # global_step is incremented automatically each time
            # optimizer.apply_gradients() runs.
            optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3,
                                                  decay=0.9)
            # RMSProp with an initial learning rate of 1e-3; `decay` here is the
            # discounting factor for the moving average of squared gradients,
            # not a learning-rate decay schedule.
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            # Compute gradients of the loss with respect to all trainable variables
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            # Apply the gradients and advance global_step

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)
                print(step, "trainAccuracy", accuracy)
                with open("trainLogCsv.txt", "a+",
                          encoding="utf8") as trainLogFile:
                    trainLogFile.write("=========" + str(step) + "=========\n")
                    trainLogFile.write("acc:" + str(accuracy) + "\n")
                    trainLogFile.write("loss:" + str(loss) + "\n")

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            filter_writer = tf.summary.FileWriter('/path/to/logs', sess.graph)
            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                if len(train_batch) > 0:
                    x_train_batch, y_train_batch = zip(*train_batch)
                    train_step(x_train_batch, y_train_batch)
                    current_step = tf.train.global_step(sess, global_step)

                    # Evaluate the model with x_dev and y_dev
                    if current_step % params['evaluate_every'] == 0:
                        dev_batches = data_helper.batch_iter(
                            list(zip(x_dev, y_dev)), params['batch_size'], 1)

                        total_dev_correct = 0
                        # Accumulate gold labels separately so y_dev stays aligned with x_dev
                        y_dev_all = []
                        y_pre = []
                        for dev_batch in dev_batches:
                            if len(dev_batch) > 0:
                                x_dev_batch, y_dev_batch = zip(*dev_batch)
                                acc, loss, num_dev_correct, predictions = dev_step(
                                    x_dev_batch, y_dev_batch)
                                y_pre += predictions.tolist()
                                y_dev_all += list(y_dev_batch)
                                total_dev_correct += num_dev_correct
                        y_devs = [
                            y_devItem.tolist().index(max(y_devItem.tolist()))
                            for y_devItem in y_dev_all
                        ]
                        # print("y_pre:", y_pre)
                        # print("y_devs:", y_devs)
                        devRecall, devPrecision = getRP(y_pre, y_devs)
                        logging.info(
                            'Recall and precision of dev set: {},{}'.format(
                                devRecall, devPrecision))
                        accuracy = float(total_dev_correct) / len(y_dev)
                        logging.info(
                            'Accuracy on dev set: {}'.format(accuracy))

                        lossItem = loss
                        accuracyItem = accuracy

                        with open("devCsv.csv", "a+",
                                  encoding="utf8") as csvFile:
                            myWriter = csv.writer(csvFile)
                            myWriter.writerow([
                                lossItem, accuracyItem, devRecall, devPrecision
                            ])

                        if accuracy >= best_accuracy:
                            best_accuracy, best_at_step = accuracy, current_step
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=current_step)
                            logging.critical(
                                'Saved model {} at step {}'.format(
                                    path, best_at_step))
                            logging.critical(
                                'Best accuracy {} at step {}'.format(
                                    best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                if len(test_batch) > 0:
                    x_test_batch, y_test_batch = zip(*test_batch)
                    acc, loss, num_test_correct, predictions = dev_step(
                        x_test_batch, y_test_batch)
                    total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
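
A note on the getRP helper used in the dev-evaluation block above: it is not defined anywhere on this page. Judging from the call getRP(y_pre, y_devs) and the two values it unpacks, a minimal sketch could be a macro-averaged recall/precision over flat lists of class ids; the function name and the (predictions, gold) argument order are taken from the call above, while the macro averaging itself is an assumption.

def getRP(y_pre, y_true):
    # Macro-averaged recall and precision over the classes present in the
    # gold labels; both arguments are flat lists of integer class ids.
    classes = sorted(set(y_true))
    recalls, precisions = [], []
    for c in classes:
        tp = sum(1 for p, t in zip(y_pre, y_true) if p == c and t == c)
        fn = sum(1 for p, t in zip(y_pre, y_true) if p != c and t == c)
        fp = sum(1 for p, t in zip(y_pre, y_true) if p == c and t != c)
        recalls.append(tp / (tp + fn) if (tp + fn) else 0.0)
        precisions.append(tp / (tp + fp) if (tp + fp) else 0.0)
    return sum(recalls) / len(recalls), sum(precisions) / len(precisions)
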
Beispiel #12
0
def train_cnn_rnn():
    input_file = sys.argv[1]
    if os.path.exists('./data/x.p') and \
            os.path.exists('./data/y.p') and \
            os.path.exists('./data/vocabulary.p') and \
            os.path.exists('./data/vocabulary_inv.p') and \
            os.path.exists('./data/labels.p'):
        x_ = pickle.load(open("./data/x.p", "rb"))
        y_ = pickle.load(open("./data/y.p", "rb"))
        vocabulary = pickle.load(open("./data/vocabulary.p", "rb"))
        vocabulary_inv = pickle.load(open("./data/vocabulary_inv.p", "rb"))
        labels = pickle.load(open("./data/labels.p", "rb"))
    else:
        x_, y_, vocabulary, vocabulary_inv, _, labels = data_helper.load_data(
            input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign an embedding_dim-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary,
                                                  dim=params['embedding_dim'])
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)
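    # With test_size=0.1 applied twice, roughly 81% of the data ends up in the
    # train set, 9% in the dev set and 10% in the test set.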

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to this training run will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            optimizer = tf.train.AdamOptimizer(learning_rate=0.0005,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08,
                                               use_locking=False,
                                               name='Adam')
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            i = 0
            for train_batch in train_batches:
                logging.info('Training on batch: {}'.format(i))
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
                i += 1
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    os.rename(path, trained_dir + 'best_model.ckpt')
    os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
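
The real_len helper that recurs throughout these examples appears to convert each zero-padded row of word ids into the number of time steps left after max-pooling, which is then fed to cnn_rnn.real_len. A standalone sketch, assuming padding id 0, strictly positive word ids, and the max_pool_size of 4 used in the configs shown on this page:

import numpy as np

def real_len(batches, max_pool_size=4):
    # np.argmin over a row of non-negative ids returns the index of the first
    # padding zero, i.e. the unpadded sequence length; appending [0] guarantees
    # that a zero exists even for a full-length row. Dividing by the pooling
    # factor and rounding up gives the pooled length.
    return [int(np.ceil(np.argmin(list(batch) + [0]) / max_pool_size))
            for batch in batches]

print(real_len([[5, 9, 3, 0, 0], [7, 7, 7, 7, 7]]))  # [1, 2]
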
Beispiel #13
0
def train_cnn_rnn():
    # input_file=sys.argv[1]
    input_file = './data/simple3.csv'
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        input_file)
    #print(x_.shape)#(27404,489)
    #print(y_.shape)#(27404,10)

    #training_config=sys.argv[2]
    training_config = './training_config.json'
    params = json.loads(open(training_config).read())
    #print(params)
    """
    {'num_epochs': 1, 'num_filters': 32, 'max_pool_size': 4, 'l2_reg_lambda': 0.0, 'filter_sizes': '3,4,5', 'dropout_keep_prob': 0.5, 
    'non_static': False, 'evaluate_every': 200, 'hidden_unit': 300, 'batch_size': 128, 'embedding_dim': 300}
    """
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory to store the vocabulary, parameters and other intermediate artifacts
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + 'test' + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()

    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            # Set up the optimizer op and the training op
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Save model checkpoints during training (commented out in this variant)
            # checkpoint_dir = 'checkpoints_' + timestamp + '/'
            # if os.path.exists(checkpoint_dir):
            #     shutil.rmtree(checkpoint_dir)
            # os.makedirs(checkpoint_dir)
            # checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                # batches: a list of zero-padded word-id sequences
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            # Run one training step with dropout enabled
            def train_step(x_batch, y_batch):
                # x_batch: padded word-id sequences; y_batch: one-hot label vectors
                # print(x_batch[1])
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                #print("real_len:", len(real_len(x_batch)))
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            # Evaluate one batch with dropout disabled
            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            #saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Prepare training: batch_iter yields mini-batches of size batch_size
            # for num_epochs passes over the training data
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                #print("y_train_batch:", y_train_batch[0])
                train_step(x_train_batch, y_train_batch)
                #print("train_step", )
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        # path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        # logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        # logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            # saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            #saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))
            print('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

        # Save trained parameters and files since predict.py needs them
        with open(trained_dir + 'words_index.json', 'w') as outfile:
            json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
        with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
            pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
        with open(trained_dir + 'labels.json', 'w') as outfile:
            json.dump(labels, outfile, indent=4, ensure_ascii=False)

        params['sequence_length'] = x_train.shape[1]
        with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
            json.dump(params,
                      outfile,
                      indent=4,
                      sort_keys=True,
                      ensure_ascii=False)
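
data_helper.batch_iter is used in every training loop above but never shown. A minimal generator with the same call signature (data, batch_size, num_epochs, shuffle=True) could look like the sketch below; note that the original apparently yields numpy arrays (Beispiel #19 checks train_batch.shape), whereas this sketch yields plain lists, so it only illustrates the iteration pattern.

import random

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    # Yield mini-batches of (x, y) pairs, batch_size at a time, for num_epochs
    # full passes over the data, reshuffling at the start of each epoch.
    data = list(data)
    for _ in range(num_epochs):
        order = list(range(len(data)))
        if shuffle:
            random.shuffle(order)
        for start in range(0, len(data), batch_size):
            yield [data[i] for i in order[start:start + batch_size]]

# Usage mirroring the loops above:
# for batch in batch_iter(list(zip(x_train, y_train)), 128, 1):
#     x_batch, y_batch = zip(*batch)
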
Beispiel #14
0
max_length = max(max([len(x.split(" ")) for x in q1]),
                 max([len(x.split(" ")) for x in q2]))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
print("max question length:", max_length)

#converting to embedding matrix

x_text = q1 + q2
vocab_ids = np.array(list(vocab_processor.fit_transform(x_text)))
x1 = vocab_ids[:len(q1)]
x2 = vocab_ids[len(q1):]

print("Loading Word embeddings")
vocab_dict = vocab_processor.vocabulary_._mapping
pretrained_embeddings = data_helper.load_embeddings(
    FLAGS.embeddings_file, vocab_dict, FLAGS.embedding_dim,
    FLAGS.use_cached_embeddings)

print("Shuffling Data:")
np.random.seed(10)
shuffled_index = np.random.permutation(np.arange(len(y)))
x1_shuffled = x1[shuffled_index]
x2_shuffled = x2[shuffled_index]
y_shuffled = y[shuffled_index]
q1_lengths_shuffled = x1_length[shuffled_index]
q2_lengths_shuffled = x2_length[shuffled_index]

print("Splitting Training/Validation data")
validation_index = -1 * int(FLAGS.val_sample_percentage * float(len(y)))
x1_training, x1_validation = x1_shuffled[:validation_index], x1_shuffled[
    validation_index:]
                    type=int,
                    help="length of sentence",
                    default=256)
parser.add_argument("--train_data",
                    type=str,
                    help="training data",
                    default="data_use.txt")
parser.add_argument("--config_file",
                    type=str,
                    help="training config",
                    default="training_config.json")

args = parser.parse_args()

vocabulary = Vocabulary(args.train_data)
word_embedding = data_helper.load_embeddings(vocabulary.String2i)
embedding_mat = [
    word_embedding[word] for index, word in enumerate(vocabulary.i2String)
]
embedding_mat = np.array(embedding_mat, np.float32)
'''
print embedding_mat.shape
print embedding_mat[20]
'''
with open(args.config_file) as f:
    params = json.load(f)

#print params
'''
{u'hidden_unit': 300, u'l2_reg_lambda': 0.0, u'dropout_keep_prob': 0.5, u'num_filters': 128, u'max_pool_size': 4, u'embedding_dim': 300, u'batch_size': 256, u'filter_sizes': u'3,4,5', u'evaluate_every': 100, u'non_static': False, u'num_epochs': 1}
Beispiel #16
0
def train_cnn_rnn(input_file, training_config):
    # read data and params
    x_, y_, vocabulary, vocabulary_inv, df, labels=data_helper.load_data(input_file)
    params=json.loads(open(training_config).read())
    
    # create a directory; everything related to this training run will be saved in it
    timestamp=str(int(time.time()))
    output_dir=os.path.join('data_path_save','cnn_rnn_'+timestamp)
    trained_dir=os.path.join(output_dir,'trained_results')
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)
    
    # assign a 300-dimensional vector to each word
    word_embeddings=data_helper.load_embeddings(vocabulary)
    embedding_mat=[word_embeddings[word] for index,word in enumerate(vocabulary_inv)]
    embedding_mat=np.array(embedding_mat, dtype=np.float32)
    
    # split the original dataset into a train set (90%) and a dev set (10%)
    x_train, x_dev, y_train, y_dev=train_test_split(x_, y_, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))
    
    graph=tf.Graph()
    with graph.as_default():
        session_conf=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess=tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn=TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], 
                               non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'],
                               filter_sizes=map(int, params['filter_sizes'].split(",")), num_filters=params['num_filters'],
                               embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda'])
            global_step=tf.Variable(0, name='global_step', trainable=False)
            optimizer=tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars=optimizer.compute_gradients(cnn_rnn.loss)
            train_op=optimizer.apply_gradients(grads_and_vars, global_step=global_step)
            checkpoint_dir=os.path.join(output_dir,'checkpoints')
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix=os.path.join(checkpoint_dir, 'model')
            
            def real_len(batches):
                return [np.ceil(np.argmin(batch+[0])*1.0/params['max_pool_size']) for batch in batches]
            
            def train_step(x_batch, y_batch):
                feed_dict={
                        cnn_rnn.input_x: x_batch, 
                        cnn_rnn.input_y: y_batch,
                        cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                        cnn_rnn.batch_size: len(x_batch),
                        cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                        cnn_rnn.real_len: real_len(x_batch)
                        }
                _, step, loss, accuracy=sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict=feed_dict)
                
            def dev_step(x_batch, y_batch):
                feed_dict={
                        cnn_rnn.input_x: x_batch, 
                        cnn_rnn.input_y: y_batch,
                        cnn_rnn.dropout_keep_prob: 1.0,
                        cnn_rnn.batch_size: len(x_batch),
                        cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                        cnn_rnn.real_len: real_len(x_batch)
                        }
                step, loss, accuracy, num_correct, predictions=sess.run([global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict=feed_dict)
                return accuracy, loss, num_correct, predictions
            
            saver=tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            
            # training starts here
            train_batches=data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step=0, 0
            for train_batch in train_batches:
                x_train_batch, y_train_batch=zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step=tf.train.global_step(sess, global_step)
                
                if current_step%params['evaluate_every']==0:
                    dev_batches=data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct=0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch=zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions=dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct+=num_dev_correct
                    accuracy=float(total_dev_correct)/len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    
                    if accuracy>=best_accuracy:
                        best_accuracy, best_at_step=accuracy, current_step
                        path=saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
            logging.critical('Training is complete; best model checkpointed at step {}'.format(best_at_step))
            
    # save trained params and files
    with open(trained_dir+'/words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir+'/embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir+'/labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    params['sequence_length']=x_train.shape[1]
    with open(trained_dir+'/trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
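
data_helper.load_embeddings(vocabulary) is also not shown on this page. From the way its result is indexed (word_embeddings[word] for every word in vocabulary_inv) it has to return a mapping from word to vector; a minimal stand-in that simply assigns random vectors could look like this. The 300-dimension default and the uniform initialisation range are assumptions; a real implementation would typically look each word up in a pretrained GloVe/word2vec file and only fall back to a random vector for out-of-vocabulary words.

import numpy as np

def load_embeddings(vocabulary, dim=300):
    # Map every word in the vocabulary to a dim-sized float32 vector.
    rng = np.random.RandomState(42)
    return {word: rng.uniform(-0.25, 0.25, dim).astype(np.float32)
            for word in vocabulary}
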
Beispiel #17
0
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "logstash.csv"

    dataList = []
    with open(input_file, 'r', encoding='utf8') as logFile:
        for row in logFile:
            dataList.append(json.loads(row))
    keyList = list(dataList[0].keys())
    csvList = [[keyItem for keyItem in keyList]]
    for row in dataList:
        if "severity" in row:
            # Use row.get so a record missing one of the keys does not shift its columns
            tempRow = [row.get(keyItem, "") for keyItem in keyList]
            csvList.append(tempRow)
    with open(output_file, "w+", encoding="utf8", newline="") as csvFile:
        myWriter = csv.writer(csvFile)
        for row in csvList:
            myWriter.writerow(row)
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        output_file, 20000)
    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to this training run will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(
                                     map(int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        print("Accuracy:", accuracy)
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
Beispiel #18
0
    # load_and_train_ref_neighbor(has_rank=True)
    load_and_train_ref_abs(has_rank=True)

    # embedding_path = '../data/glove.42B.300d.50K.w2v.txt'
    # embeddings, vocab, embedding_size=load_embeddings(embedding_path, 100000)
    # X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label('../data/dataset_abstract_stat_50.npy', embeddings, vocab)
    # produce_test_prediciton('../models/abstract_cnn_baseline', X_test, '../outputs/abstract_cnn_baseline')

    # X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_ref_chain_to_label('../data/dataset_ref_chain_stat_50.npy')
    # produce_test_prediciton('../models/ref_chain_cnn_baseline', X_test, '../outputs/ref_chain_cnn_baseline')
    # produce_test_prediciton('../models/ref_chain_cnn_with_rank', X_test, '../outputs/ref_chain_cnn_with_rank')

    # X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_ref_nb_to_label('../data/dataset_ref_nb_stat_50.npy')
    # produce_test_prediciton('../models/ref_neighbor_cnn_baseline', X_test, '../outputs/ref_neighbor_cnn_baseline')
    # produce_test_prediciton('../models/ref_neighbor_cnn_with_rank', X_test, '../outputs/ref_neighbor_cnn_with_rank')

    embedding_path = '../data/glove.42B.300d.50K.w2v.txt'
    embeddings, vocab, embedding_size = load_embeddings(embedding_path, 100000)
    X_train, y_train, y_train_r, X_val, y_val, y_val_r, X_test, y_test, y_test_r = load_abstract_to_label(
        '../data/dataset_abstract_stat_50.npy', embeddings, vocab)

    label2idx = json.load(open('../data/journal2idx.json', 'r'))
    label_num = len(label2idx) + 1
    journal2idx_all = json.load(open('../data/journal2idx_all.json', 'r'))

    X_train_ref, _, _, X_val_ref, _, _, X_test_ref, _, _ = load_ref_chain_to_label(
        '../data/dataset_ref_chain_stat_50.npy')
    produce_test_prediciton(
        '../models/ref_abs_cnn_with_rank',
        [np.array(X_test), np.array(X_test_ref)],
        '../outputs/ref_abs_with_rank')
Beispiel #19
0
def train_cnn():
    input_dir = sys.argv[1]

    x_train, x_dev, x_test, pos1_train, pos2_train, pos1_dev, pos2_dev, pos1_test, pos2_test, y_train, y_dev, y_test, vocabulary, vocabulary_inv, labels = data_helper.load_data_split_sents(
        input_dir)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    #sentence_length = 200
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        200)
    pos_vocab_processor.fit(pos1_train + pos2_train + pos1_dev + pos2_dev +
                            pos1_test + pos2_test)

    pos1_train_vec = np.array(list(pos_vocab_processor.transform(pos1_train)))
    pos2_train_vec = np.array(list(pos_vocab_processor.transform(pos2_train)))

    pos1_dev_vec = np.array(list(pos_vocab_processor.transform(pos1_dev)))
    pos2_dev_vec = np.array(list(pos_vocab_processor.transform(pos2_dev)))

    pos1_test_vec = np.array(list(pos_vocab_processor.transform(pos1_test)))
    pos2_test_vec = np.array(list(pos_vocab_processor.transform(pos2_test)))

    # Create a directory; everything related to this training run will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextLSTMCNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),
                pos_embedding_size=params['position_embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x1_batch, pos1_batch, pos2_batch, y_batch):
                feed_dict = {
                    cnn.input_x1:
                    x1_batch,
                    cnn.input_pos1:
                    pos1_batch,
                    cnn.input_pos2:
                    pos2_batch,
                    cnn.input_y:
                    y_batch,
                    cnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn.batch_size:
                    len(x1_batch),
                    cnn.pad:
                    np.zeros([len(x1_batch), 1, params['embedding_dim'], 1]),
                    #cnn.pad_pos: np.zeros([len(x1_batch), 1, params['embedding_dim']+2*params['position_embedding_dim'], 1]),
                    cnn.real_len:
                    real_len(x1_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

            def dev_step(x1_batch, pos1_batch, pos2_batch, y_batch):
                feed_dict = {
                    cnn.input_x1:
                    x1_batch,
                    cnn.input_pos1:
                    pos1_batch,
                    cnn.input_pos2:
                    pos2_batch,
                    cnn.input_y:
                    y_batch,
                    cnn.dropout_keep_prob:
                    1.0,
                    cnn.batch_size:
                    len(x1_batch),
                    cnn.pad:
                    np.zeros([len(x1_batch), 1, params['embedding_dim'], 1]),
                    #cnn.pad_pos: np.zeros([len(x1_batch), 1, params['embedding_dim']+2*params['position_embedding_dim'], 1]),
                    cnn.real_len:
                    real_len(x1_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn.loss, cnn.accuracy, cnn.num_correct,
                    cnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(
                list(zip(x_train, pos1_train_vec, pos2_train_vec, y_train)),
                params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            print "train_batches: ", train_batches
            prev_test_set_accuracy = 0.0
            for train_batch in train_batches:
                #print train_batch
                if train_batch.shape[0] > 0:

                    x_train_batch, pos1_train_batch, pos2_train_batch, y_train_batch = zip(
                        *train_batch)

                    train_step(x_train_batch, pos1_train_batch,
                               pos2_train_batch, y_train_batch)
                    current_step = tf.train.global_step(sess, global_step)

                    # Evaluate the model with x_dev and y_dev
                    if current_step % params['evaluate_every'] == 0:
                        dev_batches = data_helper.batch_iter(
                            list(zip(x_dev, pos1_dev_vec, pos2_dev_vec,
                                     y_dev)), params['batch_size'], 1)

                        total_dev_correct = 0
                        count_y_dev = 0
                        for dev_batch in dev_batches:
                            if dev_batch.shape[0] > 0:
                                x_dev_batch, pos1_dev_batch, pos2_dev_batch, y_dev_batch = zip(
                                    *dev_batch)
                                acc, loss, num_dev_correct, predictions = dev_step(
                                    x_dev_batch, pos1_dev_batch,
                                    pos2_dev_batch, y_dev_batch)
                                total_dev_correct += num_dev_correct
                                count_y_dev = count_y_dev + len(dev_batch)
                        accuracy = float(total_dev_correct) / count_y_dev
                        logging.info(
                            'Accuracy on dev set: {}'.format(accuracy))

                        test_batches = data_helper.batch_iter(
                            list(
                                zip(x_test, pos1_test_vec, pos2_test_vec,
                                    y_test)),
                            params['batch_size'],
                            1,
                            shuffle=False)
                        total_test_correct = 0
                        count_y_test = 0
                        for test_batch in test_batches:
                            if test_batch.shape[0] > 0:
                                x_test_batch, pos1_test_batch, pos2_test_batch, y_test_batch = zip(
                                    *test_batch)
                                acc, loss, num_test_correct, predictions = dev_step(
                                    x_test_batch, pos1_test_batch,
                                    pos2_test_batch, y_test_batch)
                                total_test_correct += int(num_test_correct)
                                count_y_test = count_y_test + len(test_batch)

                        test_set_acc = float(total_test_correct) / count_y_test
                        logging.critical('Accuracy on test set: {}'.format(
                            float(total_test_correct) / count_y_test))
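                        # Note: checkpoint selection below is driven by test-set
                        # accuracy, so in this variant the test set also takes part
                        # in model selection rather than being held out.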

                        if test_set_acc > prev_test_set_accuracy:
                            prev_test_set_accuracy = test_set_acc
                            best_accuracy, best_at_step = accuracy, current_step
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=current_step)
                            logging.critical(
                                'Saved model {} at step {}'.format(
                                    path, best_at_step))
                            logging.critical(
                                'Best accuracy {} at step {}'.format(
                                    best_accuracy, best_at_step))
                            logging.critical('Accuracy on test set: {}'.format(
                                float(total_test_correct) / count_y_test))

            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(
                zip(x_test, pos1_test_vec, pos2_test_vec, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            count_y_test = 0
            for test_batch in test_batches:
                if test_batch.shape[0] > 0:
                    x_test_batch, pos1_test_batch, pos2_test_batch, y_test_batch = zip(
                        *test_batch)
                    acc, loss, num_test_correct, predictions = dev_step(
                        x_test_batch, pos1_test_batch, pos2_test_batch,
                        y_test_batch)
                    total_test_correct += int(num_test_correct)
                    count_y_test = count_y_test + len(test_batch)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / count_y_test))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w', encoding='utf-8') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w', encoding='utf-8') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
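
The JSON and pickle files written above are exactly the artifacts that predict.py is said to need. A minimal sketch of how predict.py might reload them, assuming the same trained_dir layout (load_trained_artifacts is a hypothetical helper, not part of the original code):

import json
import pickle

import numpy as np


def load_trained_artifacts(trained_dir):
    # Hypothetical helper: reload the artifacts saved by the training script
    # (words_index.json, labels.json, trained_parameters.json, embeddings.pickle).
    with open(trained_dir + 'words_index.json', encoding='utf-8') as infile:
        vocabulary = json.load(infile)
    with open(trained_dir + 'labels.json', encoding='utf-8') as infile:
        labels = json.load(infile)
    with open(trained_dir + 'trained_parameters.json', encoding='utf-8') as infile:
        params = json.load(infile)
    with open(trained_dir + 'embeddings.pickle', 'rb') as infile:
        embedding_mat = pickle.load(infile)
    return vocabulary, labels, params, np.asarray(embedding_mat, dtype=np.float32)
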
Beispiel #20
0
def train_cnn_rnn():
    print('------------------------------++ ', 'begin training')
    input_file = sys.argv[1]
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config, encoding='utf-8').read())

    # Assign a 300-dimensional vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.4)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.4)
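    # Note: applying test_size=0.4 twice leaves roughly 36% of the data for
    # training, 24% for dev, and 40% for test.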
    print('------------------------------++ ', 'end loading dataset')
    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Everything related to this training run will be saved in this directory
    timestamp = str(int(time.time()))
    #timestamp = '1524692100'
    builder = tf.saved_model.builder.SavedModelBuilder('./SavedModelB')
    trained_dir = './SavedModelB/'
    print('------------------------------++ ', trained_dir, 'created !')
    #os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        print('------------------------------++ ', 'begin building graph')
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print('------------------------------++ ',
                  'begin initializing our cnn object')
            # Build a reverse vocabulary: an index -> word array, passed to
            # TextCNNRNN as `dictionnaire`.
            a = max(vocabulary, key=lambda i: vocabulary[i])
            nv = np.chararray((vocabulary[a] + 1), itemsize=40, unicode=True)
            nv[:] = ''
            for word in vocabulary.keys():
                nv[vocabulary[word]] = word
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=33,
                                 num_classes=y_train.shape[1],
                                 dictionnaire=nv,
                                 filter_sizes=list(
                                     map(int, params['filter_sizes'].split(","))))

            print('------------------------------++ ',
                  'End initializing our cnn object')

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            print('------------------------------++ ', 'variables were initialized')
            # Checkpoint files will be saved in this directory during training

            checkpoint_dir = './SavedModelB/'
            #os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                # Intended to give the number of time steps left after max-pooling:
                # locate the first zero (pad) entry in each sequence and divide its
                # position by max_pool_size.
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):

                print(len(x_batch), len(x_batch[0]))
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            print('------------------------------++ ',
                  'session initialized, all variables initialized')
            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            print('------------------------------++ ', 'training steps begin:')
            i = 0
            rrr = 0  # counts dev evaluations that matched or beat the best accuracy
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                print(
                    '--------------------------------------------------------++ ',
                    'new training step begins, i =', i)
                i = i + 1
                train_step(x_train_batch, y_train_batch)

                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    j = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        print(
                            '--------------------------------------------------------++ ',
                            'new dev step, j =', j)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        j = j + 1
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
                        # Early stop: break out of training once the improvement
                        # counter reaches 10.
                        if rrr == 10:
                            break
                        else:
                            rrr = rrr + 1
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # defining the signature of the graph
            to_input_x = tf.saved_model.utils.build_tensor_info(
                cnn_rnn.string_to_manipulate)
            to_predictions = tf.saved_model.utils.build_tensor_info(
                cnn_rnn.predictions)
            prediction_signature = (
                tf.saved_model.signature_def_utils.build_signature_def(
                    inputs={'to_input_x': to_input_x},
                    outputs={'to_predictions': to_predictions},
                    method_name=tf.saved_model.signature_constants.
                    PREDICT_METHOD_NAME))

            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')

            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    prediction_signature
                },
                assets_collection=None,
                legacy_init_op=legacy_init_op)

            builder.save()
            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w',
              encoding='utf8') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w', encoding='utf8') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w',
              encoding='utf8') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
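
The SavedModel exported by builder.save() above exposes a single serving signature mapping 'to_input_x' (the cnn_rnn.string_to_manipulate tensor) to 'to_predictions'. Below is a minimal, hypothetical client sketch that loads it with the same TensorFlow 1.x API; the expected shape and dtype of raw_input depend on how string_to_manipulate is defined inside TextCNNRNN, which is not shown here.

import tensorflow as tf


def predict_with_saved_model(export_dir, raw_input):
    # Hypothetical client: load the SavedModel written to ./SavedModelB and run
    # the default serving signature ('to_input_x' -> 'to_predictions').
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        signature = meta_graph.signature_def[
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        input_name = signature.inputs['to_input_x'].name
        output_name = signature.outputs['to_predictions'].name
        # raw_input must match the placeholder behind string_to_manipulate.
        return sess.run(output_name, feed_dict={input_name: raw_input})

Calling predict_with_saved_model('./SavedModelB', some_input) would return whatever cnn_rnn.predictions evaluates to for that input (typically predicted class indices).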