Code example #1
def predict():
	#trained_dir = sys.argv[1]
	trained_dir = './trained_results_1575177514/' 
	
	#if not trained_dir.endswith('/'):
	#	trained_dir += '/'
	#test_file = sys.argv[2]
	
	if request.method == 'POST':
		file = request.files['ReceivedFile']
		logging.critical('Received Filename from App: {}'.format(file))
		if file and allowed_file(file.filename):
			filename = secure_filename(file.filename)
			file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
			test_file = './uploads/'+filename
		
		
			df = pd.read_csv(test_file)
			text = df[['Descript']]
				   
			params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
			flag = 0
			x_, y_, df = load_test_data(test_file, labels, flag)
			x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
			x_ = map_word_to_index(x_, words_index)
		else:
			#text = request.json["text"]
			text = request.form["query"]
			params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
			flag = 1
			x_, y_, df = load_test_data(text, labels, flag)
			x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
			x_ = map_word_to_index(x_, words_index)

	x_test, y_test = np.asarray(x_), None
	if y_ is not None:
		y_test = np.asarray(y_)

	timestamp = trained_dir.split('/')[-2].split('_')[-1]
	predicted_dir = './predicted_results_' + timestamp + '/'
	if os.path.exists(predicted_dir):
		shutil.rmtree(predicted_dir)
	os.makedirs(predicted_dir)

	with tf.Graph().as_default():
		session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)
		sess = tf.compat.v1.Session(config=session_conf)
		with sess.as_default():
			cnn_rnn = TextCNNRNN(
				embedding_mat = embedding_mat,
				non_static = params['non_static'],
				hidden_unit = params['hidden_unit'],
				sequence_length = len(x_test[0]),
				max_pool_size = params['max_pool_size'],
				filter_sizes = map(int, params['filter_sizes'].split(",")),
				num_filters = params['num_filters'],
				num_classes = len(labels),
				embedding_size = params['embedding_dim'],
				l2_reg_lambda = params['l2_reg_lambda'])

			def real_len(batches):
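				# Effective length of each padded example: position of the first padding index
				# (assumes the pad token has index 0), scaled down by the max-pooling window.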
				return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches]

			def predict_step(x_batch):
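				# Inference-only pass: dropout is disabled (keep_prob 1.0) and the pad/real_len
				# placeholders are fed so pooling and the RNN see the true sequence lengths.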
				feed_dict = {
					cnn_rnn.input_x: x_batch,
					cnn_rnn.dropout_keep_prob: 1.0,
					cnn_rnn.batch_size: len(x_batch),
					cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
					cnn_rnn.real_len: real_len(x_batch),
				}
				predictions = sess.run([cnn_rnn.predictions], feed_dict)
				return predictions

			checkpoint_file = trained_dir + 'best_model.ckpt'
			saver = tf.compat.v1.train.Saver(tf.compat.v1.all_variables())
			saver = tf.compat.v1.train.import_meta_graph("{}.meta".format(checkpoint_file))
			saver.restore(sess, checkpoint_file)
			logging.critical('{} has been loaded'.format(checkpoint_file))

			batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

			predictions, predict_labels = [], []
			for x_batch in batches:
				batch_predictions = predict_step(x_batch)[0]
				for batch_prediction in batch_predictions:
					predictions.append(batch_prediction)
					predict_labels.append(labels[batch_prediction])
					logging.critical('Prediction is complete. Class belongs to: {}'.format(labels[batch_prediction]))

			#if os.path.exists(test_file):
			#	os.remove(test_file)
				
			return render_template('results.html', prediction=predict_labels[0], name=text)
Code example #2
def train():
    config = Config()
    data_loader = Loader()
    x_raw, y_raw, word_to_id, labels = data_loader.load_data(
        'data/companies.jsons')
    word_embeddings = data_loader.load_embeddings(config.embedding_size)
    id_to_word = data_loader.invert_vocab(word_to_id)
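    # Build the embedding matrix row by row; this assumes iterating word_to_id yields
    # the words in the same order as their integer indices.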
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(word_to_id)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    x, x_test, y, y_test = train_test_split(x_raw, y_raw, test_size=0.1)
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    timestamp = str(int(time.time()))
    trained_dir = 'save/trained_results' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=config.non_static,
                                 hidden_unit=config.hidden_size,
                                 max_pool_size=config.max_pool_size,
                                 filter_sizes=map(
                                     int, config.filter_sizes.split(",")),
                                 num_filters=config.num_filters,
                                 embedding_size=config.embedding_size,
                                 l2_reg_lambda=config.l2_reg_lambda)

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            checkpoint_dir = 'save/checkpoints' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / config.max_pool_size)
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
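                # Single optimization step on one batch; returns how many examples
                # in the batch were classified correctly.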
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    config.dropout_keep_prob,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, config.embedding_size, 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy, num_correct = sess.run([
                    train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct
                ], feed_dict)
                return num_correct

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, config.embedding_size, 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            train_batches = batch_iter(list(zip(x_train, y_train)),
                                       config.batch_size, config.epochs)
            best_accuracy, best_at_step = 0, 0
            total_train_correct = 0
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                num_correct = train_step(x_train_batch, y_train_batch)
                total_train_correct += num_correct
                current_step = tf.train.global_step(sess, global_step)

                accuracy_train = float(num_correct) / len(y_train_batch)
                print("train accuracy ", accuracy_train)
                # Evaluate the model with x_dev and y_dev
                if current_step % config.evaluate_every == 0:
                    dev_batches = batch_iter(list(zip(x_dev, y_dev)),
                                             config.batch_size, 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    print(accuracy)
                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        print('Best accuracy :  ', best_accuracy)

            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = batch_iter(list(zip(x_test, y_test)),
                                      1,
                                      1,
                                      shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            print('Accuracy on test set ',
                  float(total_test_correct) / len(y_test))

    print('a')
Code example #3
def test_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    test_file = sys.argv[2]

    params, words_index, labels, embedding_mat = load_trained_params(
        trained_dir)
    x_, y_, df = load_test_data(test_file, labels)
    x_ = data_helper.pad_sentences(
        x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # logging.info(params)
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def dev_step(x_batch, y_batch):
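                # Evaluation pass with dropout disabled; returns the loss, the number of
                # correct predictions, and the predicted class ids for the batch.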
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                loss, num_correct, predictions = sess.run(
                    [cnn_rnn.loss, cnn_rnn.num_correct, cnn_rnn.predictions],
                    feed_dict)
                return loss, num_correct, predictions

            checkpoint_file = trained_dir + 'model-4'
            saver = tf.train.Saver(tf.global_variables())
            logging.info(checkpoint_file)
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            total_test_correct = 0
            loss, num_test_correct, predictions = dev_step(x_test, y_test)
            logging.info(num_test_correct)
            logging.info(predictions)
            total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))
Code example #4
param_length = 40
trained_dir = "trained_results/" 

params, words_index, labels, embedding_mat = load_trained_params(trained_dir)

with tf.Graph().as_default():
	session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
	sess = tf.Session(config=session_conf)
	with sess.as_default():
		cnn_rnn = TextCNNRNN(
			embedding_mat = embedding_mat,
			non_static = params['non_static'],
			hidden_unit = params['hidden_unit'],
			sequence_length = param_length,
			max_pool_size = params['max_pool_size'],
			filter_sizes = map(int, params['filter_sizes'].split(",")),
			num_filters = params['num_filters'],
			num_classes = len(labels),
			embedding_size = params['embedding_dim'],
			l2_reg_lambda = params['l2_reg_lambda'])

		checkpoint_file = trained_dir + 'model-3100'
		saver = tf.train.Saver(tf.all_variables())
		saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
		saver.restore(sess, checkpoint_file)
		logging.critical('{} has been loaded'.format(checkpoint_file))

		def real_len(batches):
			return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches]
Code example #5
def predict_unseen_data():
	test_x = []
	#test_input = os.environ.get('TEST_X', None)
	test_input = "What time is the class"

	if test_input is None:
		logging.critical(' TEST_X is not found ')
		sys.exit()
	test_x.append(test_input.split(' '))
	trained_dir = "trained_results_1512435063"
	#os.environ.get('TRAINED_RESULTS', None)



	if trained_dir is None:
		logging.critical(' TRAINED_RESULTS is not found ')
		sys.exit()

	if not trained_dir.endswith('/'):
		trained_dir += '/'

	x_ = data_helper.pad_sentences(test_x, forced_sequence_length=params['sequence_length'])
	x_ = map_word_to_index(x_, words_index)

	x_test, y_test = np.asarray(x_), None

	timestamp = trained_dir.split('/')[-2].split('_')[-1]

	with tf.Graph().as_default():
		session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
		sess = tf.Session(config=session_conf)
		with sess.as_default():
			cnn_rnn = TextCNNRNN(
				embedding_mat = embedding_mat,
				non_static = params['non_static'],
				hidden_unit = params['hidden_unit'],
				sequence_length = len(x_test[0]),
				max_pool_size = params['max_pool_size'],
				filter_sizes = map(int, params['filter_sizes'].split(",")),
				num_filters = params['num_filters'],
				num_classes = len(labels),
				embedding_size = params['embedding_dim'],
				l2_reg_lambda = params['l2_reg_lambda'])

			def real_len(batches):
				return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches]

			def predict_step(x_batch):
				feed_dict = {
					cnn_rnn.input_x: x_batch,
					cnn_rnn.dropout_keep_prob: 1.0,
					cnn_rnn.batch_size: len(x_batch),
					cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
					cnn_rnn.real_len: real_len(x_batch),
				}
				scores, predictions = sess.run([cnn_rnn.scores, cnn_rnn.predictions], feed_dict)
				return scores,predictions

			checkpoint_file = trained_dir + 'best_model.ckpt'
			saver = tf.train.Saver(tf.all_variables())
			saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
			saver.restore(sess, checkpoint_file)
			logging.critical('{} has been loaded'.format(checkpoint_file))

			batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)
			response=""
			predictions, predict_labels = [], []
			for x_batch in batches:
				scores, batch_predictions = predict_step(x_batch)
				print(scores)
				score = normalize(scores[0])
				print(score)
				print(score.max())
				mscore = score.max()
				range_perc = 0.01

				max_range = mscore + (mscore * range_perc)
				min_range = mscore - (mscore * range_perc)

				# Band check: scores within ±1% of the best score (the result is not used below).
				for s in score:
					if min_range < s < max_range:
						pass


				max_score = score.max()
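				# Only trust the prediction if the best normalized score clears a 0.1
				# confidence threshold; otherwise return a fallback response.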
				if max_score > 0.1:
					print(scores)
					for batch_prediction in batch_predictions:
						predictions.append(batch_prediction)
						predict_labels.append(labels[batch_prediction])
					response= predict_labels[0]
				else:
					response="Fall back!"
			sys.stdout.write(response)
			print(response)

			os.environ['PRED_LABEL'] = response
Code example #6
File: main.py Project: wyu-du/DeepLearning-NLP
def demo_cnn_rnn(demo_model):
    # load training parameters
    params, words_index, labels, embedding_mat=load_trained_params('data_path_save/cnn_rnn_'+demo_model+'/trained_results/')    
    
    with tf.Graph().as_default():
        session_conf=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess=tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn=TextCNNRNN(embedding_mat=embedding_mat, non_static=params['non_static'], hidden_unit=params['hidden_unit'], sequence_length=params['sequence_length'],
                               max_pool_size=params['max_pool_size'], filter_sizes=map(int, params['filter_sizes'].split(",")), num_filters=params['num_filters'], 
                               num_classes=len(labels),embedding_size=params['embedding_dim'],l2_reg_lambda=params['l2_reg_lambda'])
            
            def real_len(batches):
                return [np.ceil(np.argmin(batch+[0])*1.0/params['max_pool_size']) for batch in batches]
            
            def predict_step(x_batch):
                feed_dict={
                        cnn_rnn.input_x: x_batch,
                        cnn_rnn.dropout_keep_prob: 1.0,
                        cnn_rnn.batch_size: len(x_batch),
                        cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                        cnn_rnn.real_len: real_len(x_batch)
                        }
                predictions=sess.run([cnn_rnn.predictions], feed_dict=feed_dict)
                return predictions
            
            checkpoint_file=tf.train.latest_checkpoint('data_path_save/cnn_rnn_'+demo_model+'/checkpoints/')
            saver=tf.train.Saver(tf.all_variables())
            saver=tf.train.import_meta_graph('{}.meta'.format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))
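            # Interactive demo loop: classify sentences typed on stdin until an empty line is entered.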
            
            while(1):
                print('Please input your sentence:')
                input_sentence = input()
                if input_sentence == '' or input_sentence.isspace():
                    print('See you next time!')
                    break
                else:
                    x_=data_helper.clean_str(input_sentence).split(' ')
                    # Prediction: cut off the sentence if it is longer than the sequence length
                    sequence_length=params['sequence_length']
                    num_padding=sequence_length-len(x_)
                    padded_sentence=[]
                    if num_padding<0:
                        logging.info('This sentence has to be cut off because it is longer than trained sequence length')
                        padded_sentence=x_[0: sequence_length]
                    else:
                        padded_sentence=x_+['<PAD/>']*num_padding
                    # Get word index
                    temp=[]
                    for word in padded_sentence:
                        if word in words_index:
                            temp.append(words_index[word])
                        else:
                            temp.append(0)
                    temp=np.asarray(temp)
                    x_test=np.expand_dims(temp, axis=0)
                    
                    prediction=predict_step(x_test)[0][0]
                    predicted_label=labels[prediction]
                    print('\n疾病类别: '+predicted_label+'\n')  # "疾病类别" = "disease category"
Code example #7
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "logstash.csv"
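    # Convert the JSON-lines Logstash dump to a flat CSV, keeping only records that contain a "severity" field.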

    dataList = []
    with open(input_file, 'r', encoding='utf8') as logFile:
        for row in logFile:
            dataList.append(json.loads(row))
    keyList = list(dataList[0].keys())
    csvList = [[keyItem for keyItem in keyList]]
    for row in dataList:
        if "severity" in list(row.keys()):
            tempRow = [
                row[keyItem] for keyItem in keyList
                if keyItem in list(row.keys())
            ]
            csvList.append(tempRow)
    with open(output_file, "w", encoding="utf8", newline="") as csvFile:
        myWriter = csv.writer(csvFile)  # one writer for the whole file; newline="" avoids blank rows on Windows
        for row in csvList:
            myWriter.writerow(row)
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        output_file, 20000)
    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), \
                                                params['batch_size'], \
                                                params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        print("׼ȷÂÊ£º", accuracy)
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
Code example #8
def train_cnn_rnn():
    x_, y_, x_test, y_test, vocabulary, vocabulary_inv, labels = data_helper.load_data(
    )
    #x_, y_, vocabulary, vocabulary_inv, labels = data_helper.load_data_book()
    training_config = 'training_config.json'
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = []
    for i in range(len(vocabulary_inv)):
        embedding_mat.append(word_embeddings[vocabulary_inv[i]])
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set

    # Split the train set into train set and dev set
    # IMDB style
    # x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)

    # Book data style
    #x_, x_test, y_, y_test = train_test_split(x_, y_, test_size=0.1)
    x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():

            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=[
                                     int(x)
                                     for x in params['filter_sizes'].split(",")
                                 ],
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            #optimizer = tf.train.MomentumOptimizer(0.1, 0.9)
            optimizer = tf.train.AdamOptimizer()
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn_rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn_rnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                summaries, _, step, loss, accuracy = sess.run([
                    train_summary_op, train_op, global_step, cnn_rnn.loss,
                    cnn_rnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)
                # print(accuracy)
                return accuracy

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                summaries, step, loss, accuracy, num_correct, predictions = sess.run(
                    [
                        dev_summary_op, global_step, cnn_rnn.loss,
                        cnn_rnn.accuracy, cnn_rnn.num_correct,
                        cnn_rnn.predictions
                    ], feed_dict)
                dev_summary_writer.add_summary(summaries, step)
                print("step {}, loss {:g}, acc {:g}".format(
                    step, loss, accuracy))
                return accuracy, predictions

            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_dev_accuracy, best_at_step = 0, 0
            best_test_accuracy = 0
            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_acc = train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:

                    print("Training Accuracy:", train_acc, end=' ')
                    print("Evaluation:", end=' ')
                    dev_acc, _ = dev_step(x_dev, y_dev)
                    print("Test:", end=' ')
                    test_acc_tmp, pred__ = dev_step(x_test, y_test)
                    # with open('results/prediction' + str(current_step), 'bw') as f:
                    #     pickle.dump(pred__, f)
                    if dev_acc > best_dev_accuracy:
                        best_dev_accuracy = dev_acc
                        best_test_accuracy = test_acc_tmp
                        best_at_step = current_step
                        # Checkpoint the best model so it can be restored for the final test pass
                        saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('best dev accuracy is', best_dev_accuracy,
                          'the test is', best_test_accuracy)
            print(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Evaluate x_test and y_test

            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    # os.rename(path, trained_dir + 'best_model.ckpt')
    # os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
Code example #9
def train_cnn_rnn():
    input_file = sys.argv[1]
    x_, y_ = data_helper.load_data(input_file)

    config = sys.argv[2]
    params = json.loads(open(config).read())
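    # The embedding dimension is computed from the tick data shape (tick_size * feature_size)
    # rather than being set directly in the config.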
    params['embedding_dim'] = params['tick_size'] * params['feature_size']

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './result/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './result/checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    os.rename(path + '.index', trained_dir + 'best_model.ckpt')
    os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
Code example #10
def train_cnn_rnn():
    input_file = sys.argv[1]
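    # Reuse cached pickled preprocessing artifacts if they all exist; otherwise rebuild them from the input file.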
    if os.path.exists('./data/x.p') and \
            os.path.exists('./data/y.p') and \
            os.path.exists('./data/vocabulary.p') and \
            os.path.exists('./data/vocabulary_inv.p') and \
            os.path.exists('./data/labels.p'):
        x_ = pickle.load(open("./data/x.p", "rb"))
        y_ = pickle.load(open("./data/y.p", "rb"))
        vocabulary = pickle.load(open("./data/vocabulary.p", "rb"))
        vocabulary_inv = pickle.load(open("./data/vocabulary_inv.p", "rb"))
        labels = pickle.load(open("./data/labels.p", "rb"))
    else:
        x_, y_, vocabulary, vocabulary_inv, _, labels = data_helper.load_data(
            input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign a n dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary,
                                                  dim=params['embedding_dim'])
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            optimizer = tf.train.AdamOptimizer(learning_rate=0.0005,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08,
                                               use_locking=False,
                                               name='Adam')
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            i = 0
            for train_batch in train_batches:
                logging.info('Training on batch: {}'.format(i))
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
                i += 1
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    os.rename(path, trained_dir + 'best_model.ckpt')
    os.rename(path + '.meta', trained_dir + 'best_model.meta')
    shutil.rmtree(checkpoint_dir)
    logging.critical('{} has been removed'.format(checkpoint_dir))

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
コード例 #11
0
ファイル: train2.py プロジェクト: zfnow/LogstashAI
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "wcData85_1.csv"

    # 	with open(input_file,"r",encoding="utf8") as datFile:
    # 		jsonDict=json.loads(datFile.readline())
    # 	with open(input_file,"r",encoding="utf8") as datFile:
    # 		jsonDf=pd.DataFrame([],columns=list(jsonDict.keys()))
    # 		rowNO=0
    # 		for row in datFile.readlines():
    # 			try:
    # 				jsonDf.loc[rowNO]=list(json.loads(row).values())
    # 			except json.decoder.JSONDecodeError as ex:
    # 				print(ex.tostring)
    # 			rowNO+=1
    # 		jsonDf.to_csv(output_file)

    print("loading data...")
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data3(
        output_file, ["crit", "err"], 10000)
    # 	print("y_:",y_)
    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # global_step is incremented each time optimizer.apply_gradients() runs and
            # is used below to decide when to evaluate and which checkpoint step to record.
            optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3,
                                                  decay=0.9)
            # RMSProp with a fixed learning rate of 1e-3. Note that `decay` here is the
            # discounting factor for the moving average of squared gradients, not a
            # learning-rate decay schedule.
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            # compute gradients of the loss w.r.t. the trainable variables
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            # apply the gradients to the variables and advance global_step
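            # (A hedged sketch: if an actual learning-rate decay were wanted, a schedule
            #  could be passed instead of the constant 1e-3; the values below are illustrative:
            #  lr = tf.train.exponential_decay(1e-3, global_step,
            #                                  decay_steps=1000, decay_rate=0.9,
            #                                  staircase=True)
            #  optimizer = tf.train.RMSPropOptimizer(lr, decay=0.9))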

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)
                print(step, "trainAccuracy", accuracy)
                with open("trainLogCsv.txt", "a+",
                          encoding="utf8") as trainLogFile:
                    trainLogFile.write("=========" + str(step) + "=========\n")
                    trainLogFile.write("acc:" + str(accuracy) + "\n")
                    trainLogFile.write("loss:" + str(loss) + "\n")

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            filter_writer = tf.summary.FileWriter('/path/to/logs', sess.graph)
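            # '/path/to/logs' is a placeholder; point the FileWriter at a real log
            # directory (for example './logs/') before running so TensorBoard can read it.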
            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), \
                     params['batch_size'], \
                     params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                if len(train_batch) > 0:
                    x_train_batch, y_train_batch = zip(*train_batch)
                    train_step(x_train_batch, y_train_batch)
                    current_step = tf.train.global_step(sess, global_step)

                    # Evaluate the model with x_dev and y_dev
                    if current_step % params['evaluate_every'] == 0:
                        dev_batches = data_helper.batch_iter(
                            list(zip(x_dev, y_dev)), params['batch_size'], 1)

                        total_dev_correct = 0
                        y_dev = []
                        y_pre = []
                        for dev_batch in dev_batches:
                            if len(dev_batch) > 0:
                                x_dev_batch, y_dev_batch = zip(*dev_batch)
                                acc, loss, num_dev_correct, predictions = dev_step(
                                    x_dev_batch, y_dev_batch)
                                y_pre += predictions.tolist()
                                y_dev += list(y_dev_batch)
                                total_dev_correct += num_dev_correct
                        y_devs = [
                            y_devItem.tolist().index(max(y_devItem.tolist()))
                            for y_devItem in y_dev
                        ]
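                        # (equivalent to taking np.argmax over each one-hot dev label)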
                        # 						print("y_pre:",y_pre)
                        # 						print("y_devs:",y_devs)
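                        # getRP is presumably a project-level helper (not shown in this
                        # snippet) that returns (recall, precision) for the predicted and
                        # true label indices.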
                        devRecall, devPrecision = getRP(y_pre, y_devs)
                        logging.info(
                            'Recall and precision of dev set: {},{}'.format(
                                devRecall, devPrecision))
                        accuracy = float(total_dev_correct) / len(y_dev)
                        logging.info(
                            'Accuracy on dev set: {}'.format(accuracy))

                        lossItem = loss
                        accuracyItem = accuracy

                        with open("devCsv.csv", "a+",
                                  encoding="utf8") as csvFile:
                            myWriter = csv.writer(csvFile)
                            myWriter.writerow([
                                lossItem, accuracyItem, devRecall, devPrecision
                            ])

                        if accuracy >= best_accuracy:
                            best_accuracy, best_at_step = accuracy, current_step
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=current_step)
                            logging.critical(
                                'Saved model {} at step {}'.format(
                                    path, best_at_step))
                            logging.critical(
                                'Best accuracy {} at step {}'.format(
                                    best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                if len(test_batch) > 0:
                    x_test_batch, y_test_batch = zip(*test_batch)
                    acc, loss, num_test_correct, predictions = dev_step(
                        x_test_batch, y_test_batch)
                    total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
コード例 #12
0
def train_cnn_rnn():
    # input_file=sys.argv[1]
    input_file = './data/simple3.csv'
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        input_file)
    #print(x_.shape)#(27404,489)
    #print(y_.shape)#(27404,10)

    #training_config=sys.argv[2]
    training_config = './training_config.json'
    params = json.loads(open(training_config).read())
    #print(params)
    """
    {'num_epochs': 1, 'num_filters': 32, 'max_pool_size': 4, 'l2_reg_lambda': 0.0, 'filter_sizes': '3,4,5', 'dropout_keep_prob': 0.5, 
    'non_static': False, 'evaluate_every': 200, 'hidden_unit': 300, 'batch_size': 128, 'embedding_dim': 300}
    """
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    #timestamp = str(int(time.time()))
    # Create a directory to store the parameters, vocabulary and other intermediate files
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + 'test' + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()

    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            # Set up the optimizer op and the training op
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Save the model during training (the checkpoint code below is commented out)
            # checkpoint_dir = 'checkpoints_' + timestamp + '/'
            # if os.path.exists(checkpoint_dir):
            #     shutil.rmtree(checkpoint_dir)
            # os.makedirs(checkpoint_dir)
            # checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                # batches: padded word-index rows; returns the effective length after max-pooling
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            # Training step
            def train_step(x_batch, y_batch):
                # x_batch: padded word-index sequences; y_batch: one-hot labels
                # print(x_batch[1])
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                #print("real_len:", len(real_len(x_batch)))
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            # Evaluation step
            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            #saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())
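            # Note: with the Saver left commented out, no checkpoint is written in this
            # variant, which is why the save/restore calls further below are also disabled.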

            # Prepare for training
            # the size of each train_batch is determined by batch_size
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                #print("y_train_batch:", y_train_batch[0])
                train_step(x_train_batch, y_train_batch)
                #print("train_step", )
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        # path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        # logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        # logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            # saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            #saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))
            print('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

        # Save trained parameters and files since predict.py needs them
        with open(trained_dir + 'words_index.json', 'w') as outfile:
            json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
        with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
            pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
        with open(trained_dir + 'labels.json', 'w') as outfile:
            json.dump(labels, outfile, indent=4, ensure_ascii=False)

        params['sequence_length'] = x_train.shape[1]
        with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
            json.dump(params,
                      outfile,
                      indent=4,
                      sort_keys=True,
                      ensure_ascii=False)
コード例 #13
0
def train_rnn_cnn():
    training_config = "./training_config.json"
    params = json.loads(open(training_config).read())

    config = "./data"
    trainfile = os.path.join(config, "travelogueData")
    stopwordfile = os.path.join(config, "stopwords.txt")
    w2vmodel_file = os.path.join(config, "googleVectorMR.bin")
    x_train, y_train, x_dev, y_dev, embedding_mat = data_help2.load_data2(
        w2vmodel_file, params['dev_size'])

    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))
    logging.info('y_train: {}, y_dev: {}'.format(len(y_train), len(y_dev)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './runs/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 num_layers=params['num_layers'],
                                 max_pool_size=params['max_pool_size'],
                                 hidden_unit=params['hidden_unit'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './runs/checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver(tf.all_variables())
            sess.run(tf.initialize_all_variables())

            # Training starts here
            train_batches = data_help2.batch_iter(list(zip(x_train, y_train)),
                                                  params['batch_size'],
                                                  params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_help2.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    predictList = list()
                    real_label = list()
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                        predictList.extend(list(predictions))
                        real_label.extend(list(y_dev_batch))

                    # TP = len([ i for i in range(len(predictList)) if predictList[i] == 1 and real_label[i][1] == 1])
                    # TN = len([ i for i in range(len(predictList)) if predictList[i] == 0 and real_label[i][1] == 1])
                    # FP = len([ i for i in range(len(predictList)) if predictList[i] == 1 and real_label[i][1] == 0])

                    # right = float(TP) / (TP + FP)
                    # recall = float(TP) / (TP + TN)
                    # F_value = 2 * right * recall / (right + recall)

                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )
コード例 #14
0
ファイル: predict.py プロジェクト: dcap76/powerAI
def predict_unseen_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    filepath = './data/predict'

    params, words_index, labels, embedding_mat = load_trained_params(
        trained_dir)
    x_, filename_ = load_test_data(filepath)
    x_ = parser.pad_sentences(x_,
                              forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)
    # one array of word indices per line
    x_test, filename_test = np.asarray(x_), np.asarray(filename_)
    #x_test, y_test = np.asarray(x_), None
    #if y_ is not None:
    #	y_test = np.asarray(y_)

    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                score, predictions = sess.run(
                    [cnn_rnn.scores, cnn_rnn.predictions], feed_dict)
                return score, predictions

            def proba(x):
                e_x = np.exp(x - np.max(x))
                return e_x / e_x.sum()
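            # proba() is a numerically stable softmax: subtracting the max before
            # exponentiating avoids overflow, and the result sums to 1, so
            # proba(score).max() below can be read as the top-class probability.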

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.Saver(tf.all_variables())
            saver = tf.train.import_meta_graph("{}.meta".format(
                checkpoint_file[:-5]))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))


            predict_labels_index, predict_labels, predict_filename, probabs = [], [], [], []

            # Version that handles batch sizes other than 1
            batches = parser.batch_iter(list(x_test),
                                        params['batch_size'],
                                        1,
                                        shuffle=False)
            for x_batch in batches:
                scores, batch_predictions = predict_step(x_batch)
                for indexOfLabel in batch_predictions:
                    predict_labels_index.append(indexOfLabel)
                    predict_labels.append(labels[indexOfLabel])
                for score in scores:
                    probabs.append(proba(score).max())
            batches2 = parser.batch_iter(list(filename_test),
                                         params['batch_size'],
                                         1,
                                         shuffle=False)
            for tmp in batches2:
                for filename in tmp:
                    predict_filename.append(filename)

            # Simplified version when everything fits in a single batch
            #for indexOfLable in predict_step(x_test)[1]:
            #	predict_labels_index.append(indexOfLable)
            #	predict_labels.append(labels[indexOfLable])
            #for score in predict_step(x_test)[0]:
            #	probabs.append(proba(score).max())
            #for filename in filename_test:
            #	predict_filename.append(filename)

            infoList = []
            for i in range(len(predict_labels_index)):
                info = {}
                info["prob"] = float(probabs[i])
                info["sampleId"] = predict_filename[i]
                info["label"] = predict_labels[i]
                infoList.append(info)

            allJson = {}
            allJson["type"] = "Fiance Product Classifcation"
            allJson["result"] = infoList

            with codecs.open(predicted_dir + 'predictions_all.json',
                             'w',
                             encoding="utf-8") as outfile:
                json.dump(allJson, outfile, indent=4, ensure_ascii=False)

            #df['PREDICTED'] = predict_labels
            #df.to_json(path_or_buf=predicted_dir + 'predictions_all.json', orient='records', lines=True)

            #if y_test is not None:
            #	y_test = np.array(np.argmax(y_test, axis=1))
            #	accuracy = sum(np.array(predictions) == y_test) / float(len(y_test))
            #	logging.critical('The prediction accuracy is: {}'.format(accuracy))

            logging.critical(
                'Prediction is complete, all files have been saved: {}'.format(
                    predicted_dir))
コード例 #15
0
ファイル: train.py プロジェクト: znlinux/StockForecast
def train_cnn_rnn(input_file,training_config):
	epochs=10
#	input_file = sys.argv[1]
	x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(input_file)

#	training_config = sys.argv[2]
	params = json.loads(open(training_config).read())

	# Assign a 300 dimension vector to each word
	word_embeddings = data_helper.load_embeddings(vocabulary)
	embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
	embedding_mat = np.array(embedding_mat, dtype = np.float32)

	# Split the original dataset into train set and test set
	x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1, random_state=16)

	# Split the train set into train set and dev set
	x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=16)

	logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
	logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

	# Create a directory, everything related to the training will be saved in this directory
	timestamp = str(int(time.time()))
	trained_dir = './trained_results_' + timestamp + '/'
	if os.path.exists(trained_dir):
		shutil.rmtree(trained_dir)
	os.makedirs(trained_dir)

	graph = tf.Graph()
	with graph.as_default():
		session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
		sess = tf.Session(config=session_conf)
		with sess.as_default():
			cnn_rnn = TextCNNRNN(
				embedding_mat=embedding_mat,
				sequence_length=x_train.shape[1],
				num_classes = y_train.shape[1],
				non_static=params['non_static'],
				hidden_unit=params['hidden_unit'],
				max_pool_size=params['max_pool_size'],
				filter_sizes=map(int, params['filter_sizes'].split(",")),
				num_filters = params['num_filters'],
				embedding_size = params['embedding_dim'],
				l2_reg_lambda = params['l2_reg_lambda'])

			global_step = tf.Variable(0, name='global_step', trainable=False)
			optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
			grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
			train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

			# Checkpoint files will be saved in this directory during training
			checkpoint_dir = './checkpoints_' + timestamp + '/'
			if os.path.exists(checkpoint_dir):
				shutil.rmtree(checkpoint_dir)
			os.makedirs(checkpoint_dir)
			checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

			def real_len(batches):
				return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size']) for batch in batches]

			def train_step(x_batch, y_batch):
				feed_dict = {
					cnn_rnn.input_x: x_batch,
					cnn_rnn.input_y: y_batch,
					cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
					cnn_rnn.batch_size: len(x_batch),
					cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
					cnn_rnn.real_len: real_len(x_batch),
				}
				_, step, loss, accuracy = sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict)

			def dev_step(x_batch, y_batch):
				feed_dict = {
					cnn_rnn.input_x: x_batch,
					cnn_rnn.input_y: y_batch,
					cnn_rnn.dropout_keep_prob: 1.0,
					cnn_rnn.batch_size: len(x_batch),
					cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
					cnn_rnn.real_len: real_len(x_batch),
				}
				step, loss, accuracy, num_correct, predictions = sess.run(
					[global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
				return accuracy, loss, num_correct, predictions

			saver = tf.train.Saver(tf.all_variables())
			sess.run(tf.initialize_all_variables())

			# Training starts here
			train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
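			# Note: if data_helper.batch_iter is a generator (the usual implementation
			# yields batches), it is exhausted after the first pass of the epoch loop
			# below, so the effective number of passes over the data is
			# params['num_epochs'] rather than epochs * num_epochs.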
			best_accuracy, best_at_step = 0, 0

			# Train the model with x_train and y_train
			for epoch in range(epochs):
				for train_batch in train_batches:
					x_train_batch, y_train_batch = zip(*train_batch)
					train_step(x_train_batch, y_train_batch)
					current_step = tf.train.global_step(sess, global_step)

					# Evaluate the model with x_dev and y_dev
					if current_step % params['evaluate_every'] == 0:
						dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)

						total_dev_correct = 0
						for dev_batch in dev_batches:
							x_dev_batch, y_dev_batch = zip(*dev_batch)
							acc, loss, num_dev_correct, predictions = dev_step(x_dev_batch, y_dev_batch)
							total_dev_correct += num_dev_correct
						accuracy = float(total_dev_correct) / len(y_dev)
						logging.info('Accuracy on dev set: {}'.format(accuracy))

						if accuracy >= best_accuracy:
							best_accuracy, best_at_step = accuracy, current_step
							path = saver.save(sess, checkpoint_prefix, global_step=current_step)
							logging.critical('Saved model {} at step {}'.format(path, best_at_step))
							logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
			logging.critical('Training is complete, testing the best model on x_test and y_test')

			# Save the model files to trained_dir. predict.py needs trained model files. 
			saver.save(sess, trained_dir + "best_model.ckpt")

			# Evaluate x_test and y_test
			saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
			acc, loss, num_test_correct, predictions = dev_step(x_test, y_test)
			from sklearn.metrics import recall_score
			from sklearn.metrics import f1_score
			from sklearn.metrics import accuracy_score

			# Convert the one-hot test labels to class indices
			y_test = [np.argmax(y_t) for y_t in y_test]
			print(sorted(list(set(y_test))))

			# average=None returns one score per class
			recall_l = recall_score(y_test, predictions, average=None)
			f1_l = f1_score(y_test, predictions, average=None)  # renamed so the imported f1_score is not shadowed
			acc_score = accuracy_score(y_test, predictions)
			total_test_correct = int(num_test_correct)
			logging.critical('Recall on test set: ' + str(recall_l))
			logging.critical('Acc on test set: ' + str(acc_score))
			logging.critical('F1 on test set: ' + str(f1_l))
			logging.critical('Accuracy on test set: {}'.format(float(total_test_correct) / len(y_test)))
			print(len(labels))
			print(len(recall_l))
			print(len(f1_l))
			labels_ = [labels[n] for n in sorted(list(set(y_test)))]

			df_ = pd.DataFrame()
			df_["labels"] = labels_
			df_["recall"] = recall_l
			df_["f1"] = f1_l
			df_.to_csv("matrics.csv", index=False)
			# Save trained parameters and files since predict.py needs them
			#print (vocabulary)

	with open(trained_dir + 'words_index.json', 'w') as outfile:
		#jsObj = json.dumps(vocabulary)  
		  
		#outfile.write(jsObj)  
		#outfile.close()  
		json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
	with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
		pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
	with open(trained_dir + 'labels.json', 'w') as outfile:
		json.dump(labels, outfile, indent=4, ensure_ascii=False)

	params['sequence_length'] = x_train.shape[1]
	with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
		json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
コード例 #16
0
def predict_raw_data(argv):
    ###################################################################
    #  Make sure to pass the trained_dir you want to load.            #
    #  predicted_dir will contain predict_labels.txt, which stores    #
    #  the prediction results for the test file.                      #
    ###################################################################

    in_file = ''
    out_file = ''

    try:
        opts, args = getopt.getopt(
            argv, "ht:i:o:",
            ["trained_dir=", "in_filepath=", "out_filepath="])
    except getopt.GetoptError:
        print(
            "python main.py -i <in_filepath> -o <out_filepath> -t <trained_dir>"
        )
        sys.exit(2)

    trained_dir = './trained_results/'

    for opt, arg in opts:
        if opt == '-h':
            print("python main.py -i <in_filepath> -o <out_filepath>")
            sys.exit()
        elif opt in ("-i", "--in_filepath"):
            in_file = arg
        elif opt in ("-o", "--out_filepath"):
            out_file = arg
        elif opt in ("-t", "--trained_dir"):
            trained_dir = arg
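    # Example invocation (paths are illustrative, not from the project):
    #   python main.py -i ./data/test.csv -o ./results/predictions.tsv -t ./trained_results/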

    params, words_index, labels, embedding_mat = load_trained_params(
        trained_dir)
    original_x, x_, y_ = load_test_data(in_file)

    x_ = data_helper.pad_sentences(
        x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                predictions, scores = sess.run(
                    [cnn_rnn.predictions, cnn_rnn.scores], feed_dict)
                return predictions, scores

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.Saver(tf.global_variables())
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test),
                                             params['batch_size'],
                                             1,
                                             shuffle=False)

            predictions, predict_labels, predict_probs = [], [], []
            for x_test_batch in batches:
                # predict_step returns (predictions, scores); run the model once per batch
                batch_predictions, batch_prop_preds = predict_step(x_test_batch)

                for batch_prediction, batch_prop_pred in zip(
                        batch_predictions, batch_prop_preds):
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])
                    predict_probs.append(batch_prop_pred[batch_prediction])

            with open(out_file, "w", encoding='utf-8') as f:
                for original_x_, predict_label, predict_prob in zip(
                        original_x, predict_labels, predict_probs):
                    print_prob = round(predict_prob * 100, 2)
                    f.write(
                        str(original_x_) + '\t' + str(predict_label) + '\t' +
                        str(print_prob) + '\n')
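            # Note: cnn_rnn.scores are typically unnormalized logits in this model
            # family; if that holds here, a softmax would be needed before reporting
            # print_prob above as a percentage.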

            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(
                    len(y_test))
                logging.critical(
                    'The prediction accuracy is: {}'.format(accuracy))

            logging.critical('Prediction is complete')
コード例 #17
0
ファイル: main.py プロジェクト: wyu-du/DeepLearning-NLP
def train_cnn_rnn(input_file, training_config):
    # read data and params
    x_, y_, vocabulary, vocabulary_inv, df, labels=data_helper.load_data(input_file)
    params=json.loads(open(training_config).read())
    
    # create a directory, everything related to the training will be saved in this directory
    timestamp=str(int(time.time()))
    output_dir=os.path.join('data_path_save','cnn_rnn_'+timestamp)
    trained_dir=os.path.join(output_dir,'trained_results')
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)
    
    # assign a 300 dimension vector to each word
    word_embeddings=data_helper.load_embeddings(vocabulary)
    embedding_mat=[word_embeddings[word] for index,word in enumerate(vocabulary_inv)]
    embedding_mat=np.array(embedding_mat, dtype=np.float32)
    
    # split the original dataset into a train set and a dev set
    x_train, x_dev, y_train, y_dev=train_test_split(x_, y_, test_size=0.1)
    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))
    
    graph=tf.Graph()
    with graph.as_default():
        session_conf=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess=tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn=TextCNNRNN(embedding_mat=embedding_mat, sequence_length=x_train.shape[1], num_classes=y_train.shape[1], 
                               non_static=params['non_static'], hidden_unit=params['hidden_unit'], max_pool_size=params['max_pool_size'],
                               filter_sizes=map(int, params['filter_sizes'].split(",")), num_filters=params['num_filters'],
                               embedding_size=params['embedding_dim'], l2_reg_lambda=params['l2_reg_lambda'])
            global_step=tf.Variable(0, name='global_step', trainable=False)
            optimizer=tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars=optimizer.compute_gradients(cnn_rnn.loss)
            train_op=optimizer.apply_gradients(grads_and_vars, global_step=global_step)
            checkpoint_dir=os.path.join(output_dir,'checkpoints')
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix=os.path.join(checkpoint_dir, 'model')
            
            def real_len(batches):
                return [np.ceil(np.argmin(batch+[0])*1.0/params['max_pool_size']) for batch in batches]
            
            def train_step(x_batch, y_batch):
                feed_dict={
                        cnn_rnn.input_x: x_batch, 
                        cnn_rnn.input_y: y_batch,
                        cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                        cnn_rnn.batch_size: len(x_batch),
                        cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                        cnn_rnn.real_len: real_len(x_batch)
                        }
                _, step, loss, accuracy=sess.run([train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy], feed_dict=feed_dict)
                
            def dev_step(x_batch, y_batch):
                feed_dict={
                        cnn_rnn.input_x: x_batch, 
                        cnn_rnn.input_y: y_batch,
                        cnn_rnn.dropout_keep_prob: 1.0,
                        cnn_rnn.batch_size: len(x_batch),
                        cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                        cnn_rnn.real_len: real_len(x_batch)
                        }
                step, loss, accuracy, num_correct, predictions=sess.run([global_step, cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict=feed_dict)
                return accuracy, loss, num_correct, predictions
            
            saver=tf.train.Saver()
            sess.run(tf.global_variables_initializer())
            
            # training starts here
            train_batches=data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step=0, 0
            for train_batch in train_batches:
                x_train_batch, y_train_batch=zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step=tf.train.global_step(sess, global_step)
                
                if current_step%params['evaluate_every']==0:
                    dev_batches=data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_dev_correct=0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch=zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions=dev_step(x_dev_batch, y_dev_batch)
                        total_dev_correct+=num_dev_correct
                    accuracy=float(total_dev_correct)/len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    
                    if accuracy>=best_accuracy:
                        best_accuracy, best_at_step=accuracy, current_step
                        path=saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))
            logging.critical('Training is complete, testing the best model on x_test and y_test')
            
    # save trained params and files
    with open(trained_dir+'/words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir+'/embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir+'/labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)
    params['sequence_length']=x_train.shape[1]
    with open(trained_dir+'/trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
コード例 #18
0
def train_cnn_rnn():
    input_file = sys.argv[1]
    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        input_file)

    training_config = sys.argv[2]
    params = json.loads(open(training_config).read())

    # Assign a 300 dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info("x_train: {}, x_dev: {}, x_test: {}".format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info("y_train: {}, y_dev: {}, y_test: {}".format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = "./trained_results_" + timestamp + "/"
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params["non_static"],
                hidden_unit=params["hidden_unit"],
                max_pool_size=params["max_pool_size"],
                filter_sizes=map(int, params["filter_sizes"].split(",")),
                num_filters=params["num_filters"],
                embedding_size=params["embedding_dim"],
                l2_reg_lambda=params["l2_reg_lambda"],
            )

            global_step = tf.Variable(0, name="global_step", trainable=False)
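            # RMSProp with a fixed 1e-3 learning rate; apply_gradients below also advances global_step.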
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = "./checkpoints_" + timestamp + "/"
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            def real_len(batches):
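                # Estimate each row's true (unpadded) length in max_pool_size units:
                # np.argmin picks the index of the smallest token id, which for zero-padded
                # rows is the first padding position; it is then divided by max_pool_size
                # and rounded up.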
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params["max_pool_size"])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
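                # One training step: feed the batch with dropout enabled and run train_op.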
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params["dropout_keep_prob"],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params["embedding_dim"], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
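                # Evaluation step: dropout disabled (keep_prob = 1.0) and no parameter update;
                # returns accuracy, loss, the number of correct predictions and the predictions.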
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params["embedding_dim"], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run(
                    [
                        global_step,
                        cnn_rnn.loss,
                        cnn_rnn.accuracy,
                        cnn_rnn.num_correct,
                        cnn_rnn.predictions,
                    ],
                    feed_dict,
                )
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params["batch_size"],
                                                   params["num_epochs"])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params["evaluate_every"] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev)), params["batch_size"], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info("Accuracy on dev set: {}".format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical("Saved model {} at step {}".format(
                            path, best_at_step))
                        logging.critical("Best accuracy {} at step {}".format(
                            best_accuracy, best_at_step))
            logging.critical(
                "Training is complete, testing the best model on x_test and y_test"
            )

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + "-" + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params["batch_size"],
                                                  1,
                                                  shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical("Accuracy on test set: {}".format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + "words_index.json", "w") as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + "embeddings.pickle", "wb") as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + "labels.json", "w") as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params["sequence_length"] = x_train.shape[1]
    with open(trained_dir + "trained_parameters.json", "w") as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
コード例 #19
def train_cnn_rnn():  # TRAIN
    print("Entering function train_cnn_rnn")
    x_, y_, vocabulary, vocabulary_inv, df, labels = load_data(TRAIN_FILE_PATH)
    # Assign a 300 dimension vector to each word
    # word_embeddings = load_embeddings(vocabulary)
    # embedding_mat = [word_embeddings[word] for index, word in enumerate(vocabulary_inv)]
    # embedding_mat = np.array(embedding_mat, dtype=np.float32)
    # print(len(embedding_mat))
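    # Instead, load a pre-built embedding matrix (and, presumably, its word-to-row index) from EMB_FILE_PATH.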

    gl_word_to_emb_mat_ind, emb_mat = load_emb(EMB_FILE_PATH)
    embedding_mat = emb_mat

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    print('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev),
                                                      len(x_test)))
    print('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev),
                                                      len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    if USE_TMP_FOLDER:
        timestamp = "temp"
    trained_dir = PRO_FLD + 'trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            # optimizer = tf.train.RMSPropOptimizer(0.001, decay=0.9)
            optimizer = tf.train.AdamOptimizer(0.001,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = PRO_FLD + 'checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, _, l_loss, l_acc = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)
                return l_loss, l_acc

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                loss_l, accuracy_l, num_correct, predictions_l = sess.run([
                    cnn_rnn.loss, cnn_rnn.accuracy, cnn_rnn.num_correct,
                    cnn_rnn.predictions
                ], feed_dict)
                return accuracy_l, loss_l, num_correct, predictions_l

            def print_stats(stat_dict_total, stat_dict_correct):
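                # Print per-class accuracy given two dicts keyed by class label:
                # stat_dict_total counts all examples, stat_dict_correct the correctly predicted ones.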
                longest_key = 0
                for key in stat_dict_total:
                    if len(key) > longest_key:
                        longest_key = len(key)
                for key in stat_dict_total:
                    my_msg = "     Class {:{}s}: ({}/{}) -> accuracy: {:.4f}%"
                    temp = 0
                    if key in stat_dict_correct:
                        temp = stat_dict_correct[key]
                    my_acc_l = (float(temp) /
                                float(stat_dict_total[key])) * 100
                    print(
                        my_msg.format(key, longest_key, temp,
                                      stat_dict_total[key], my_acc_l))
                return

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = batch_iter(list(zip(x_train, y_train)),
                                       params['batch_size'],
                                       params['num_epochs'])
            best_accuracy, best_at_step, current_step = 0, 0, 0
            trn_loss_over_steps, trn_acc_over_steps, dev_loss_over_steps, dev_acc_over_steps = [], [], [], []
            trn_loss_tmp, trn_acc_tmp, dev_loss_tmp, dev_acc_tmp, trn_iters, dev_iters = 0, 0, 0, 0, 0, 0
            number_of_steps_in_total = int(
                (len(x_train) / params['batch_size'] + 1) *
                params['num_epochs'])  # steps
            print("***There will be {} steps total".format(
                number_of_steps_in_total))
            stat_dict_all_total, stat_dict_all_correct = defaultdict(
                int), defaultdict(int)
            # Train the model with x_train and y_train
            temp_start_time = time.time()  # measure epoch time
            for train_batch in train_batches:
                stat_dict_step_total, stat_dict_step_correct = defaultdict(
                    int), defaultdict(int)
                x_train_batch, y_train_batch = zip(*train_batch)
                step_loss, step_acc = train_step(x_train_batch, y_train_batch)
                trn_loss_tmp += step_loss
                trn_acc_tmp += step_acc
                trn_iters += 1
                current_step = tf.train.global_step(sess, global_step)
                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = batch_iter(list(zip(x_dev, y_dev)),
                                             params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        dev_loss_tmp += loss
                        dev_acc_tmp += acc
                        dev_iters += 1
                        ind = 0
                        for p in predictions:
                            real_class_value = int(np.argmax(y_dev_batch[ind]))
                            real_class_label = labels[real_class_value]
                            stat_dict_step_total[real_class_label] += 1
                            if p == real_class_value:
                                stat_dict_step_correct[real_class_label] += 1
                            ind += 1
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)

                    trn_1_eval_loss = float(trn_loss_tmp) / float(trn_iters)
                    trn_1_eval_acc = float(trn_acc_tmp) / float(trn_iters)
                    trn_loss_over_steps.append(trn_1_eval_loss)
                    trn_acc_over_steps.append(trn_1_eval_acc)
                    trn_loss_tmp, trn_acc_tmp, trn_iters = 0, 0, 0

                    dev_1_eval_loss = float(dev_loss_tmp) / float(dev_iters)
                    dev_1_eval_acc = float(dev_acc_tmp) / float(dev_iters)
                    dev_loss_over_steps.append(dev_1_eval_loss)
                    dev_acc_over_steps.append(dev_1_eval_acc)
                    dev_loss_tmp, dev_acc_tmp, dev_iters = 0, 0, 0

                    # Stats prints
                    mes = "STEP {} - ({}/{}) -> accuracy: {:.4f}%"
                    print(
                        mes.format(current_step, int(total_dev_correct),
                                   len(y_dev), accuracy * 100))
                    if current_step % PRINT_CLASSES_STATS_EACH_X_STEPS == 0:
                        print_stats(stat_dict_step_total,
                                    stat_dict_step_correct)
                        temp_end_time = time.time() - temp_start_time
                        temp_start_time = time.time()  # measure epoch time
                        hours, rem = divmod(temp_end_time, 3600)
                        minutes, seconds = divmod(rem, 60)
                        print(
                            "temp run time(from last eval): {:0>2}:{:0>2}:{:0>2}"
                            .format(int(hours), int(minutes), int(seconds)))

                    if accuracy > best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        if SHOULD_SAVE:
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=current_step)
                            logging.info(
                                '    Saved model {} at step {}'.format(
                                    path, best_at_step))
                        msg = '    Best accuracy {:.4f}% at step {}/{} ({}/{})'
                        logging.info(
                            msg.format(best_accuracy * 100, best_at_step,
                                       number_of_steps_in_total,
                                       int(total_dev_correct), len(y_dev)))
                stat_dict_all_total = dict(
                    Counter(stat_dict_all_total) +
                    Counter(stat_dict_step_total))
                stat_dict_all_correct = dict(
                    Counter(stat_dict_all_correct) +
                    Counter(stat_dict_step_correct))
            train_msg = '***Training is complete. Best accuracy {:.4f}% at step {}/{}'
            print(
                train_msg.format(best_accuracy * 100, best_at_step,
                                 current_step))
            # Stats prints
            print_stats(stat_dict_all_total, stat_dict_all_correct)
            # Save the model files to trained_dir. predict.py needs trained model files.
            if SHOULD_SAVE:
                saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            if RUN_TEST_AFTER_TRAIN:
                print('***Testing...')
                saver.restore(sess,
                              checkpoint_prefix + '-' + str(best_at_step))
                test_batches = batch_iter(list(zip(x_test, y_test)),
                                          params['batch_size'],
                                          1,
                                          shuffle=False)
                total_test_correct = 0
                test_stat_dict_total, test_dict_correct = defaultdict(
                    int), defaultdict(int)
                tst_loss_total, tst_iters = 0, 0
                for test_batch in test_batches:
                    x_test_batch, y_test_batch = zip(*test_batch)
                    acc, loss, num_test_correct, predictions = dev_step(
                        x_test_batch, y_test_batch)
                    tst_loss_total += loss
                    tst_iters += 1
                    ind = 0
                    for p in predictions:
                        real_class_value = int(np.argmax(y_test_batch[ind]))
                        real_class_label = labels[real_class_value]
                        test_stat_dict_total[real_class_label] += 1
                        if p == real_class_value:
                            test_dict_correct[real_class_label] += 1
                        ind += 1
                    total_test_correct += int(num_test_correct)
                my_acc = (float(total_test_correct) / float(len(y_test))) * 100
                acc_msg = 'Accuracy on test set - ({}/{}) -> accuracy: {:.4f}%'

                tst_loss_total = float(tst_loss_total) / float(tst_iters)
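                # The test loss/accuracy are single values; repeat them to the length of the dev
                # curves so print_graph can draw them as constant reference lines.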
                tst_loss_total, tst_acc_total = [
                    tst_loss_total
                ] * len(dev_acc_over_steps), [
                    (float(total_test_correct) / float(len(y_test)))
                ] * len(dev_acc_over_steps)
                print(acc_msg.format(total_test_correct, len(y_test), my_acc))

                print_graph(
                    'loss/epochs(train in red, validation in green, test(constant) in blue)',
                    'epochs', 'loss', trn_loss_over_steps, dev_loss_over_steps,
                    tst_loss_total)
                print_graph(
                    'acc/epochs(train in red, validation in green, test(constant) in blue)',
                    'epochs', 'acc', trn_acc_over_steps, dev_acc_over_steps,
                    tst_acc_total)

                # Stats prints
                print_stats(test_stat_dict_total, test_dict_correct)
                if PRINT_WORD_PARAGRAPH:
                    mdiff = 'data file={}. us and spain 45-150 tokens. BasicLSTMCell'.format(
                        CSV_FULL_PATH)
                    last_out = 7
                    print('Difference from out{}: {}'.format(last_out, mdiff))
                    m1 = 'Training best acc {:.4f}% at step {}/{}'
                    print(
                        m1.format(best_accuracy * 100, best_at_step,
                                  current_step))
                    m2 = 'Test results: Accuracy on test set - ({}/{}) -> accuracy: {:.4f}%'
                    print(m2.format(total_test_correct, len(y_test), my_acc))
                    print_stats(test_stat_dict_total, test_dict_correct)

    # # Save trained parameters and files since predict.py needs them
    # with open(trained_dir + 'words_index.json', 'w') as outfile:
    #     json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    # with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
    #     pickle.dump(embedding_mat, outfile)
    # with open(trained_dir + 'labels.json', 'w') as outfile:
    #     json.dump(labels, outfile, indent=4, ensure_ascii=False)
    #
    # params['sequence_length'] = x_train.shape[1]
    # with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
    #     json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
    print("Leaving function train_cnn_rnn")
    return
コード例 #20
File: predict.py  Project: zyq11223/nlp_labeling
def predict_cnn_rnn(x_test, y_test, out_dir='trained_results'):
    # ARG    : x_test / y_test (test data / label matrices)
    # RETURN : predict_labels (vector of predicted labels), accuracy
    # NOTE   : predicted_dir contains predict_labels.txt, which stores
    #          the prediction results for the test file

    if out_dir == '':
        trained_dir = 'trained_results'
    else:
        trained_dir = out_dir

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            params = json.loads(
                open(trained_dir + 'trained_parameters.json',
                     encoding='utf-8').read())
            words_index = json.loads(
                open(trained_dir + 'words_index.json',
                     encoding='utf-8').read())
            labels = json.loads(
                open(trained_dir + 'labels.json', encoding='utf-8').read())
            with open(trained_dir + 'embeddings.pickle', 'rb') as input_file:
                fetched_embedding = pickle.load(input_file)
            embedding_mat = np.array(fetched_embedding, dtype=np.float32)

            cnn_rnn2 = TextCNNRNN(embedding_mat=embedding_mat,
                                  non_static=params['non_static'],
                                  hidden_unit=params['hidden_unit'],
                                  sequence_length=len(x_test[0]),
                                  max_pool_size=params['max_pool_size'],
                                  filter_sizes=map(
                                      int, params['filter_sizes'].split(",")),
                                  num_filters=params['num_filters'],
                                  num_classes=len(labels),
                                  embedding_size=params['embedding_dim'],
                                  l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def predict_step(x_batch):
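                # Inference-only step: dropout disabled and no labels fed; returns the predicted
                # class indices for the batch.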
                feed_dict = {
                    cnn_rnn2.input_x:
                    x_batch,
                    cnn_rnn2.dropout_keep_prob:
                    1.0,
                    cnn_rnn2.batch_size:
                    len(x_batch),
                    cnn_rnn2.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn2.real_len:
                    real_len(x_batch),
                }
                predictions = sess.run([cnn_rnn2.predictions], feed_dict)
                return predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.Saver(tf.global_variables())
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test),
                                             params['batch_size'],
                                             1,
                                             shuffle=False)

            predictions, predict_labels = [], []
            for x_test_batch in batches:
                batch_predictions = predict_step(x_test_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])

            accuracy = None  # stays None when ground-truth labels are not provided
            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(
                    len(y_test))
                logging.critical(
                    'The prediction accuracy is: {}'.format(accuracy))

            logging.critical('Prediction is complete')

    return predict_labels, accuracy
コード例 #21
def train(x_train, y_train, vocab_processor, x_dev, y_dev, x_real_len_train,
          x_real_len_dev, sorted_label):
    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
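            # Build the model selected by FLAGS.model_type; all variants share the training loop
            # below, with the feed dict adjusted per model type in train_step/dev_step.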
            if FLAGS.model_type == "cnnrnn":
                obj = TextCNNRNN(sequence_length=FLAGS.max_document_length,
                                 num_classes=y_train.shape[1],
                                 vocab_size=len(vocab_processor.vocabulary_),
                                 hidden_unit=FLAGS.hidden_unit,
                                 embedding_size=FLAGS.embedding_dim,
                                 filter_sizes=list(
                                     map(int, FLAGS.filter_sizes.split(","))),
                                 num_filters=FLAGS.num_filters,
                                 l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.model_type == "rnncnn":
                obj = TextRNNCNN(sequence_length=FLAGS.max_document_length,
                                 num_classes=y_train.shape[1],
                                 vocab_size=len(vocab_processor.vocabulary_),
                                 hidden_unit=FLAGS.hidden_unit,
                                 embedding_size=FLAGS.embedding_dim,
                                 filter_sizes=list(
                                     map(int, FLAGS.filter_sizes.split(","))),
                                 num_filters=FLAGS.num_filters,
                                 l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.model_type == "rnnandcnn":
                obj = TextRNNandCNN(
                    sequence_length=FLAGS.max_document_length,
                    num_classes=y_train.shape[1],
                    vocab_size=len(vocab_processor.vocabulary_),
                    hidden_unit=FLAGS.hidden_unit,
                    embedding_size=FLAGS.embedding_dim,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.model_type == "rnn":
                obj = TextRNN(sequence_length=FLAGS.max_document_length,
                              num_classes=y_train.shape[1],
                              vocab_size=len(vocab_processor.vocabulary_),
                              hidden_unit=FLAGS.hidden_unit,
                              embedding_size=FLAGS.embedding_dim,
                              l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.model_type == "dan":
                obj = TextDAN(sequence_length=FLAGS.max_document_length,
                              num_classes=y_train.shape[1],
                              vocab_size=len(vocab_processor.vocabulary_),
                              embedding_size=FLAGS.embedding_dim,
                              filter_sizes=list(
                                  map(int, FLAGS.filter_sizes.split(","))),
                              num_filters=FLAGS.num_filters,
                              l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.model_type == "attn_cnn":
                obj = TextAttnCNN(sequence_length=FLAGS.max_document_length,
                                  num_classes=y_train.shape[1],
                                  vocab_size=len(vocab_processor.vocabulary_),
                                  embedding_size=FLAGS.embedding_dim,
                                  num_heads=FLAGS.num_heads,
                                  filter_sizes=list(
                                      map(int, FLAGS.filter_sizes.split(","))),
                                  num_filters=FLAGS.num_filters,
                                  l2_reg_lambda=FLAGS.l2_reg_lambda)
            elif FLAGS.model_type == "dpcnn":
                obj = TextDPCNN(sequence_length=FLAGS.max_document_length,
                                num_classes=y_train.shape[1],
                                vocab_size=len(vocab_processor.vocabulary_),
                                embedding_size=FLAGS.embedding_dim,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                num_blocks=FLAGS.num_blocks,
                                l2_reg_lambda=FLAGS.l2_reg_lambda)
            else:
                obj = TextCNN(sequence_length=FLAGS.max_document_length,
                              num_classes=y_train.shape[1],
                              vocab_size=len(vocab_processor.vocabulary_),
                              embedding_size=FLAGS.embedding_dim,
                              filter_sizes=list(
                                  map(int, FLAGS.filter_sizes.split(","))),
                              num_filters=FLAGS.num_filters,
                              l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                grads_and_vars = optimizer.compute_gradients(obj.loss)
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", FLAGS.model_version))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", obj.loss)
            acc_summary = tf.summary.scalar("accuracy", obj.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Save train params since eval.py needs them
            trained_dir = os.path.abspath(
                os.path.join(out_dir, "trained_results"))
            if not os.path.exists(trained_dir):
                os.makedirs(trained_dir)
            with open(trained_dir + '/sorted_label.json', 'w') as outfile:
                json.dump(sorted_label, outfile, indent=4, ensure_ascii=False)
            with open(trained_dir + '/train_params.json', 'w') as outfile:
                json.dump({"max_document_length": FLAGS.max_document_length},
                          outfile,
                          indent=4,
                          ensure_ascii=False)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch, x_real_len_batch):
                """
                A single training step
                """
                if FLAGS.model_type in ("cnn", "dan", "attn_cnn", "dpcnn"):
                    feed_dict = {
                        obj.input_x: x_batch,
                        obj.input_y: y_batch,
                        obj.dropout_keep_prob: FLAGS.dropout_keep_prob,
                        obj.is_training: True
                    }
                else:
                    feed_dict = {
                        obj.input_x: x_batch,
                        obj.input_y: y_batch,
                        obj.dropout_keep_prob: FLAGS.dropout_keep_prob,
                        obj.real_len: x_real_len_batch
                    }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, obj.loss,
                    obj.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def overfit(dev_loss, eva_num=3):
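                # Early-stopping heuristic: returns True when the tracked metric has not increased
                # over the last eva_num evaluations (here it is called with the dev-accuracy history).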
                n = len(dev_loss)
                if n < eva_num:
                    return False
                for i in range(n - eva_num + 1, n):
                    if dev_loss[i] > dev_loss[i - 1]:
                        return False
                return True

            def dev_step(x_batch, y_batch, x_real_len_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                dev_batches = data_helpers.batch_iter(list(
                    zip(x_batch, y_batch, x_real_len_batch)),
                                                      FLAGS.batch_size,
                                                      1,
                                                      shuffle=False)
                all_pred = []
                correct_total_num = 0
                for batch in dev_batches:
                    x_dev_batch, y_dev_batch, x_real_len_dev_batch = zip(
                        *batch)
                    if FLAGS.model_type in ("cnn", "dan", "attn_cnn", "dpcnn"):
                        feed_dict = {
                            obj.input_x: x_dev_batch,
                            obj.input_y: y_dev_batch,
                            obj.dropout_keep_prob: 1.0,
                            obj.is_training: False
                        }
                    else:
                        feed_dict = {
                            obj.input_x: x_dev_batch,
                            obj.input_y: y_dev_batch,
                            obj.dropout_keep_prob: 1.0,
                            obj.real_len: x_real_len_dev_batch
                        }

                    step, summaries, pred, correct_pred_num = sess.run([
                        global_step, dev_summary_op, obj.predictions,
                        obj.correct_pred_num
                    ], feed_dict)
                    all_pred = np.concatenate([all_pred, pred])
                    correct_total_num += correct_pred_num
                    if writer:
                        writer.add_summary(summaries, step)
                dev_acc = 1.0 * correct_total_num / len(y_batch)
                print("right_sample {}, dev_sample {}, dev_acc {:g}".format(
                    correct_total_num, len(y_batch), dev_acc))
                return dev_acc

            # Generate batches
            batches = data_helpers.batch_iter(
                list(zip(x_train, y_train, x_real_len_train)),
                FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            dev_acc = []
            for batch in batches:
                x_batch, y_batch, x_real_len_batch = zip(*batch)
                train_step(x_batch, y_batch, x_real_len_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:", current_step)
                    cur_acc = dev_step(x_dev,
                                       y_dev,
                                       x_real_len_dev,
                                       writer=dev_summary_writer)
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
                    dev_acc.append(cur_acc)
                    if overfit(dev_acc):
                        print("current accuracy drop and stop train..\n")
                        sys.exit(0)
                    print("")
コード例 #22
def train_cnn_rnn(embedding_mat,
                  embedding_pre,
                  x_train,
                  x_dev,
                  y_train,
                  y_dev,
                  pre_y_train,
                  pre_y_dev,
                  labels,
                  vocabulary,
                  out_dir='./trained_results/'):

    if out_dir == '':
        trained_dir = './trained_results/'
    else:
        trained_dir = out_dir
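    # Hyper-parameters are hard-coded here rather than read from a JSON config file.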
    params = {
        "batch_size": 128,
        "dropout_keep_prob": 0.5,
        "embedding_dim": 64,
        "evaluate_every": 500,
        "filter_sizes": "3,4,5",
        "hidden_unit": 64,
        "l2_reg_lambda": 0.0,
        "max_pool_size": 4,
        "non_static": True,
        "num_epochs": 100,
        "num_filters": 32,
        "attention_size": 66
    }

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():

            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 embedding_pre=embedding_pre,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)

            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch, pre_y_batch):
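                # One training step; cnn_rnn.Word (the current embedding matrix) is also fetched so
                # the fine-tuned embeddings can be pickled after training.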

                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.input_pre_y:
                    pre_y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                _, step, loss, accuracy, embedding_mat = sess.run([
                    train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.Word
                ], feed_dict)

                return embedding_mat

            def dev_step(x_batch, y_batch, pre_y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.input_pre_y:
                    pre_y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(
                list(zip(x_train, y_train, pre_y_train)), params['batch_size'],
                params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch, pre_y_train_batch = zip(
                    *train_batch)
                embedding_mat = train_step(x_train_batch, y_train_batch,
                                           pre_y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(
                        list(zip(x_dev, y_dev, pre_y_dev)),
                        params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch, pre_y_dev_batch = zip(
                            *dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch, pre_y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Save the model files to out_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

    with open(trained_dir + 'words_index.json', 'w',
              encoding='utf-8') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)

    with open(trained_dir + 'embedding_mat.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'embedding_pre.pickle', 'wb') as outfile:
        pickle.dump(embedding_pre, outfile, pickle.HIGHEST_PROTOCOL)

    with open(trained_dir + 'labels.json', 'w', encoding='utf-8') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w',
              encoding='utf-8') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
コード例 #23
def predict_unseen_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    test_file = sys.argv[2]

    params, words_index, labels, embedding_mat = load_trained_params(
        trained_dir)
    x_, y_, df = load_test_data(test_file, labels)
    x_ = data_helper.pad_sentences(
        x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=len(x_test[0]),
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(
                                     int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch),
                }
                predictions = sess.run([cnn_rnn.predictions], feed_dict)
                return predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
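            # Import the saved meta graph for best_model.ckpt and restore its weights; the Saver
            # built from tf.all_variables() is replaced by the one returned by import_meta_graph.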
            saver = tf.train.Saver(tf.all_variables())
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test),
                                             params['batch_size'],
                                             1,
                                             shuffle=False)

            predictions, predict_labels = [], []
            for x_batch in batches:
                batch_predictions = predict_step(x_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])

            df['PREDICTED'] = predict_labels
            columns = sorted(df.columns, reverse=True)
            df.to_csv(predicted_dir + 'predictions_all.csv',
                      index=False,
                      columns=columns,
                      sep='|')

            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(
                    len(y_test))
                logging.critical(
                    'The prediction accuracy is: {}'.format(accuracy))

            logging.critical(
                'Prediction is complete, all files have been saved: {}'.format(
                    predicted_dir))
コード例 #24
def train_model():
    datafile = 'toxic_comments.csv'

    # x_: dataset, y_: labels, vocabulary: word-to-index mapping, vocabulary_inv: index-to-word list, df: pandas DataFrame, labels: label names
    x_, y_, vocabulary, vocabulary_inv, df, labels = Data_preprocess.load_data(
        datafile)
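    # Note: embedding_dim below is set to len(vocabulary), so each word vector is as wide as the vocabulary itself.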
    params = {
        "batch_size": 16,
        "dropout_keep_prob": 0.5,
        "embedding_dim": len(vocabulary),
        "evaluate_every": 200,
        "filter_sizes": "3,4,5",
        "hidden_unit": 300,
        "l2_reg_lambda": 0.0,
        "max_pool_size": 4,
        "non_static": False,
        "num_epochs": 128,
        "num_filters": 32
    }  # "num_epochs" changed from 1 to 128
    # Assign a 149998-dimensional vector to each word.
    word_embeddings = Data_preprocess.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(
        x_, y_, test_size=0.1)  # y_ here holds only one class's labels (the first class, 'toxic')

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    print('x_train', x_train)
    print('y_train', y_train)

    print('y_train.shape', y_train.shape[1])
    #i = input()

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)  # RMSProp optimizer
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
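                # Estimate the effective (non-padded) length of each sequence
                # after max-pooling: np.argmin finds the first padding token
                # (assumed here to be index 0), then the position is divided by
                # max_pool_size and rounded up.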
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = Data_preprocess.batch_iter(
                list(zip(x_train, y_train)), params['batch_size'],
                params['num_epochs'])
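            # batch_iter is assumed to yield shuffled mini-batches and to
            # repeat the training data for params['num_epochs'] passes.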
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = Data_preprocess.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)

                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct
                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

            # Restore the checkpoint with the best dev accuracy, then save it
            # to trained_dir as best_model.ckpt, which predict.py loads.
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate the restored model on x_test and y_test
            test_batches = Data_preprocess.batch_iter(
                list(zip(x_test, y_test)), params['batch_size'], 1,
                shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(
                    x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)
            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

    # Save trained parameters and files since predict.py needs them
    with open(trained_dir + 'words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + 'labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4, ensure_ascii=False)

    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
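
# A minimal usage sketch, added for illustration (not part of the original
# example): assuming train_model() sits in a module such as train.py alongside
# the module-level imports it relies on (tensorflow, numpy, json, pickle,
# logging, shutil, time, os, sklearn's train_test_split, Data_preprocess,
# TextCNNRNN), training can be started by running the script directly. The
# logging configuration below is an assumed default, not taken from the source.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s: %(message)s')
    train_model()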