def predict():
    with open(map_path, "rb") as f:
        word_to_id, cat_to_id, seq_length, num_classes = pickle.load(f)
    id_to_cat = {v: k for k, v in cat_to_id.items()}

    config = TRNNConfig()
    config.num_classes = num_classes
    config.vocab_size = len(word_to_id)
    model = TextRNN(config)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    while True:
        line = input("Enter a test sentence: ")
        data_id = [[word_to_id[x] for x in list(native_content(line)) if x in word_to_id]]
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
        y_pred_cls = session.run(model.y_pred_cls,
                                 feed_dict={model.input_x: x_pad, model.keep_prob: 1.0})
        print('sentence: {}, predicted intent: {}'.format(line, id_to_cat[y_pred_cls[0]]))
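# The snippet above leans on module-level names defined elsewhere. A minimal
# sketch of the assumed context -- the paths and the native_content helper are
# illustrative guesses, not confirmed by the source:
import pickle
import tensorflow as tf
import tensorflow.contrib.keras as kr  # provides kr.preprocessing.sequence.pad_sequences

map_path = 'checkpoints/maps.pkl'          # hypothetical pickle of (word_to_id, cat_to_id, seq_length, num_classes)
save_path = 'checkpoints/best_validation'  # hypothetical checkpoint prefix


def native_content(content):
    # Hypothetical helper: a no-op on Python 3; Python 2 codebases typically
    # decoded the raw input to unicode here.
    return content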
def __init__(self):
    self.config = TRNNConfig()
    self.categories, self.cat_to_id = read_category()
    self.words, self.word_to_id = read_vocab(vocab_dir)
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
def __init__(self):
    self.config = TRNNConfig()
    self.categories, self.cat_to_id = read_category()
    self.words = np.load('./datas/dict_token.npy')
    self.word_to_id = np.load('./datas/token_to_id.npy').tolist()
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
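# np.load(...).tolist() works above because np.save wraps a Python dict in a
# 0-d object array, and .tolist() on a 0-d array returns the wrapped object.
# A minimal round-trip sketch (the vocabulary content is illustrative):
import numpy as np

token_to_id = {'<PAD>': 0, 'the': 1, 'cat': 2}
np.save('./datas/token_to_id.npy', token_to_id)  # stored as a 0-d object array

# NumPy >= 1.16.3 requires allow_pickle=True to load object arrays
restored = np.load('./datas/token_to_id.npy', allow_pickle=True).tolist()
assert restored == token_to_id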
def __init__(self):
    self.config = TRNNConfig()
    self.categories = categories
    self.cat_to_id = cat_to_id
    self.words = words
    self.word_to_id = word_to_id
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = sess  # reuse an externally created session
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=rnn_save_path)  # restore the saved model
def init():
    """Initialize the model."""
    rnn = TextRNN(embedding_dim=FLAGS.embedding_dim,
                  seq_length=FLAGS.seq_length,
                  num_classes=FLAGS.num_classes,
                  vocab_size=FLAGS.vocab_size,
                  num_layers=FLAGS.num_layers,
                  hidden_dim=FLAGS.hidden_dim,
                  rnn=FLAGS.rnn,
                  dropout_keep_prob=FLAGS.dropout_keep_prob,
                  learning_rate=FLAGS.learning_rate,
                  batch_size=FLAGS.batch_size,
                  num_epochs=FLAGS.num_epochs,
                  print_per_batch=FLAGS.print_per_batch,
                  save_per_batch=FLAGS.save_per_batch)
    return rnn
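# init() assumes FLAGS already carries every hyperparameter. One plausible way
# to define them with tf.app.flags -- the default values here are placeholders,
# not taken from the source:
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer('embedding_dim', 64, 'dimension of the word embeddings')
flags.DEFINE_integer('seq_length', 600, 'maximum sequence length')
flags.DEFINE_integer('num_classes', 10, 'number of target classes')
flags.DEFINE_integer('vocab_size', 5000, 'vocabulary size')
flags.DEFINE_integer('num_layers', 2, 'number of RNN layers')
flags.DEFINE_integer('hidden_dim', 128, 'hidden state size')
flags.DEFINE_string('rnn', 'gru', 'RNN cell type: gru or lstm')
flags.DEFINE_float('dropout_keep_prob', 0.8, 'dropout keep probability')
flags.DEFINE_float('learning_rate', 1e-3, 'learning rate')
flags.DEFINE_integer('batch_size', 128, 'training batch size')
flags.DEFINE_integer('num_epochs', 10, 'number of training epochs')
flags.DEFINE_integer('print_per_batch', 100, 'report metrics every N batches')
flags.DEFINE_integer('save_per_batch', 10, 'write summaries every N batches')
FLAGS = flags.FLAGS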
def __init__(self):
    with open(map_path, "rb") as f:
        self.word_to_id, self.cat_to_id, self.seq_length, self.num_classes = pickle.load(f)
    self.id_to_cat = {v: k for k, v in self.cat_to_id.items()}

    if model_type == 'cnn':
        self.config = TCNNConfig()
        self.config.num_classes = self.num_classes
        self.config.vocab_size = len(self.word_to_id)
        self.model = TextCNN(self.config)
    else:
        self.config = TRNNConfig()
        self.config.num_classes = self.num_classes
        self.config.vocab_size = len(self.word_to_id)
        self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
def load_model(self):
    sess = tf.Session()
    print('Configuring RNN model...')
    config = TRNNConfig()
    cnn_model = TextRNN(config)  # despite the name, this is the RNN model
    saver = tf.train.Saver()
    params_file = tf.train.latest_checkpoint(self.model_dir)
    saver.restore(sess, params_file)

    categories, cat_to_id = read_category()
    vocab_dir = 'cnews/cnews.vocab.txt'
    words, word_to_id = read_vocab(vocab_dir)

    self.words = words
    self.word_to_id = word_to_id
    self.categories = categories
    self.cat_to_id = cat_to_id
    self.cnn_model = cnn_model
    self.sess = sess
    print(self.cnn_model)
    print(self.sess)
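# After load_model() has restored the session, prediction could look like the
# sketch below. It assumes the model exposes input_x, keep_prob and y_pred_cls
# (as the other snippets here do) and that seq_length matches the checkpoint:
import tensorflow.contrib.keras as kr

def predict_sentence(self, sentence, seq_length=600):
    # Map characters to ids, dropping anything out of vocabulary
    data_id = [[self.word_to_id[x] for x in sentence if x in self.word_to_id]]
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
    y_pred_cls = self.sess.run(
        self.cnn_model.y_pred_cls,
        feed_dict={self.cnn_model.input_x: x_pad, self.cnn_model.keep_prob: 1.0})
    return self.categories[y_pred_cls[0]]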
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
    # default=path + '/valid2.csv')
    parser.add_argument('--test_data', type=str, default='data/test.csv')
    # parser.add_argument('--tensorboard_dir', type=str,
    #                     default=path + '/tensorboard')
    parser.add_argument('--save_path', type=str, default=path + '/model.ckpt')
    parser.add_argument('--word_file', type=str, default=path + '/words.csv')
    # parser.add_argument('--label_file', type=str,
    #                     default=path + '/labels.csv')
    parser.add_argument('--result', type=str, default=path + '/result.csv')
    FLAGS, unparsed = parser.parse_known_args()

    # contents, labels, _ = read_data(FLAGS.train_data, sep=' ')
    # train_contents, valid_contents, train_labels, valid_labels = train_test_split(
    #     contents, labels, test_size=0.1, random_state=0)
    # valid_contents, valid_labels, _ = read_data(FLAGS.valid_data)
    # words, word2id, labels, label2id = word_to_id(train_contents, train_labels, FLAGS.vocab_size)
    # save_words(word2id, FLAGS.word_file)
    # save_labels(label2id, FLAGS.label_file)

    pred_contents, review_id, texts = read_data(FLAGS.test_data, sep=' ')
    model = TextRNN(FLAGS.embedding_size, FLAGS.hidden_layers, FLAGS.hidden_units,
                    FLAGS.number_classes, FLAGS.learning_rate, FLAGS.sequence_length,
                    FLAGS.vocab_size)
    # train()
    # test()
    predict()
print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_rnn.py [train / test]""") conn = pymysql.connect(host='localhost', user='******', passwd='sasa', db='text_clf', charset='utf8') cur = conn.cursor() print('Configuring RNN model...') config = TRNNConfig() #if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 # build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_vec = read_category() #words, word_to_id = read_vocab(vocab_dir) #config.vocab_size = len(words) model = TextRNN(config, get_embedding()) print('training') if sys.argv[1] == 'train': train() else: test()
def train():
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    start_time = time.time()
    # config.batch_size = 10

    total_batch = 0              # total number of batches processed
    best_mse_val = 99999999      # best validation MSE so far
    best_loss_val = 99999999     # best validation loss so far
    last_improved = 0            # batch index of the last improvement
    require_improvement = 5000   # stop early if no improvement after 5000 batches
    count = 0
    tensorboard_dir = config.tensorboard_dir

    # Configure GPU memory allocation
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        train_dir_list = os.listdir(config.train_dir_tf)
        train_dir_list = [os.path.join(config.train_dir_tf, i) for i in train_dir_list]
        queueTrain = tf.train.string_input_producer(train_dir_list, num_epochs=config.num_epochs)
        title_len, title, label, frame_weight = read_example(queueTrain)
        title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
            [title_len, title, label, frame_weight],
            batch_size=config.batch_size, capacity=100000, num_threads=1)

        with tf.variable_scope("model", initializer=tf.random_uniform_initializer(-1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("mse", model.mse)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

        fetches = [model.loss, model.mse]
        feed_dict = {}

        # Initialize global and local variables
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)

        # Configure the Saver
        saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V1)
        if not config.retraining:
            saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        try:
            while not coord.should_stop():
                # Run training steps or whatever
                # titles, labels = sess.run([title_batch, label_batch])
                if total_batch % config.save_per_batch == 0:
                    # Write training summaries to TensorBoard every save_per_batch batches
                    s = sess.run(merged_summary, feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # Report training/validation performance every print_per_batch batches
                    loss_val, mse_val = sess.run(fetches, feed_dict)
                    if mse_val < best_mse_val or loss_val < best_loss_val:
                        # Save the best result so far
                        best_mse_val = mse_val
                        best_loss_val = loss_val
                        last_improved = total_batch
                        improved_str = '*'
                        # saver.save(sess=sess, save_path=config.save_path)
                        if total_batch % config.save_per_batch == 0:
                            saver.save(sess,
                                       config.save_path + '_%03d' % (total_batch // config.save_per_batch),
                                       write_meta_graph=False)
                    else:
                        improved_str = ''
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Val Loss: {1:>6.5}, Mse: {2:>6.5}, Time: {3} {4}'
                    print(msg.format(total_batch, loss_val, mse_val, time_dif, improved_str))

                # print(embedding_inputs)
                sess.run(model.optim, feed_dict)
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Validation metrics have not improved for too long; stop early
                    print("No optimization for a long time, auto-stopping...")
                    coord.request_stop()  # was coord.should_stop(), which only queries the flag
                    break
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')

        coord.request_stop()
        coord.join(threads)
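# read_example() is not shown in the source. A plausible shape for it, reading
# one serialized example per call from the filename queue -- the feature names,
# dtypes, and fixed lengths below are assumptions inferred from the four tensors
# it returns, not the actual schema:
def read_example(filename_queue):
    reader = tf.TFRecordReader()
    _, serialized = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized,
        features={
            'title_len': tf.FixedLenFeature([1], tf.int64),
            'title': tf.FixedLenFeature([30], tf.int64),
            'label': tf.FixedLenFeature([30], tf.float32),
            'frame_weight': tf.FixedLenFeature([30], tf.float32),
        })
    # Fixed-length features provide the static shapes that tf.train.batch requires
    return (features['title_len'], features['title'],
            features['label'], features['frame_weight'])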
def test():
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    config.num_epochs = 1
    start_time = time.time()
    config.batch_size = 10
    count = 0

    # Configure GPU memory allocation
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    fw = open(config.test_dir_output, "w")  # file() is Python 2 only; open() works everywhere
    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        test_dir_list = os.listdir(config.test_dir_tf)
        test_dir_list = [os.path.join(config.test_dir_tf, i) for i in test_dir_list]
        queueTest = tf.train.string_input_producer(test_dir_list, num_epochs=config.num_epochs)
        text, title_len, title, label, frame_weight = read_example_test(queueTest)
        text_batch, title_len_batch, title_batch, label_batch, frame_weight_batch = tf.train.batch(
            [text, title_len, title, label, frame_weight],
            batch_size=config.batch_size, capacity=50000, num_threads=1)

        with tf.variable_scope("model", initializer=tf.random_uniform_initializer(-1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        fetches = [text_batch, model.input_x_len, model.y_pred, model.input_y]
        feed_dict = {}

        # Initialize global and local variables
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        sess.run(init_op)

        # Configure the Saver and restore the saved model
        saver = tf.train.Saver()
        saver.restore(sess=sess, save_path=config.modelPath)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        try:
            while not coord.should_stop():
                texts, x_len, y_pred, y_test = sess.run(fetches, feed_dict=feed_dict)
                texts = "".join(texts.values).split("\n")
                for i in range(len(texts) - 1):
                    score = [str(int(j * 100)) for j in y_test[i]][:x_len[i][0]]
                    y_test_i = " ".join(score)
                    score = [str(int(j * 100)) for j in y_pred[i]][:x_len[i][0]]
                    y_pred_i = " ".join(score)
                    fw.write(texts[i] + "\ttarget:\t" + y_test_i + "\tpredict\t" + y_pred_i + "\n")
                    count = count + 1
                    if count % 10000 == 0:
                        print(count)
        except tf.errors.OutOfRangeError:
            print('Done testing -- epoch limit reached')

        coord.request_stop()
        coord.join(threads)
    fw.close()
# NOTE: the head of this statement is truncated in the source; np.random.uniform
# is an assumption that matches the (low, high, size) argument pattern.
sub_embeddings = np.random.uniform(
    -0.0, 0.0, (rnn.config.vocab_size, embedding_dim))
count = 0
for i in range(0, rnn.config.vocab_size):
    if rnn.words[i] in word_vector_map:  # was word_vector_map.has_key(...) in Python 2
        sub_embeddings[i] = word_vector_map.get(rnn.words[i])
    else:
        count = count + 1
        missing_words_file.write(rnn.words[i] + '\n')
print('no embedding: ' + str(1.0 * count / len(rnn.words)))
print(str(len(sub_embeddings)) + '\t' + str(len(sub_embeddings[0])))
missing_words_file.close()

rnn.model = TextRNN(rnn.config)
rnn.train()
predict_y = rnn.test()  # predicted results
print(predict_y)
print(len(predict_y))
print(len(test_data_Y))
tf.reset_default_graph()

correct_count = 0
for i in range(len(test_data_Y)):
    if rnn.id_to_cat[predict_y[i]] == test_data_Y[i]:
        correct_count += 1
    doc_node = doc.createElement("doc")
    doc_node.setAttribute("id", test_docs[i].split(',')[0])
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    print('load data...')
    X = pickle.load(open(train_data, 'rb'))
    df, word_vecs, word_cab_num, sentence_max_len, class_num = X[0], X[1], X[2], X[3], X[4]
    config = TRNNConfig(sentence_max_len, class_num, word_cab_num)
    word_ids, W_list = process_data.getWordsVect(config, word_vecs)
    model = TextRNN(config, W_list, False)  # do not train the word vectors by default

    if sys.argv[1] == 'train':
        train()
    else:
        test()
def train():
    # Training procedure
    # ======================================================
    # Let GPU memory grow on demand instead of grabbing it all at once
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        config = RNNConfig()
        rnn = TextRNN(config)
        rnn.prepare_data()
        rnn.setRNN()

        print('Setting Tensorboard and Saver...')
        # Set up the Saver and checkpoints to save the model
        # ===================================================
        checkpoint_dir = os.path.join(os.path.abspath("checkpoints"), "textrnn")
        checkpoint_prefix = os.path.join(checkpoint_dir, rnn.train_mode)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        # =====================================================

        # Configure Tensorboard. When retraining, delete the tensorboard folder,
        # otherwise the new graph will overwrite the old one.
        # ====================================================================
        train_tensorboard_dir = 'tensorboard/textrnn/train/' + config.train_mode
        valid_tensorboard_dir = 'tensorboard/textrnn/valid/' + config.train_mode
        if not os.path.exists(train_tensorboard_dir):
            os.makedirs(train_tensorboard_dir)
        if not os.path.exists(valid_tensorboard_dir):
            os.makedirs(valid_tensorboard_dir)

        # Training log
        log_file = open(valid_tensorboard_dir + '/log.txt', mode='w')

        merged_summary = tf.summary.merge([tf.summary.scalar('loss', rnn.loss),
                                           tf.summary.scalar('accuracy', rnn.accuracy)])
        train_summary_writer = tf.summary.FileWriter(train_tensorboard_dir, sess.graph)
        # =========================================================================

        global_step = tf.Variable(0, trainable=False)
        # Make sure batch normalization statistics are updated
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Ensure train_op runs only after update_ops
            train_op = tf.train.AdamOptimizer(config.learning_rate).minimize(rnn.loss, global_step)

        # Training step
        def train_step(batch_x, batch_y, keep_prob=config.dropout_keep_prob):
            feed_dict = {
                rnn.input_x: batch_x,
                rnn.labels: batch_y,
                rnn.dropout_keep_prob: keep_prob,
                rnn.training: True
            }
            sess.run(train_op, feed_dict=feed_dict)
            step, loss, accuracy, summary = sess.run(
                [global_step, rnn.loss, rnn.accuracy, merged_summary],
                feed_dict={rnn.input_x: batch_x,
                           rnn.labels: batch_y,
                           rnn.dropout_keep_prob: 1.0,
                           rnn.training: False})
            t = datetime.datetime.now().strftime('%m-%d %H:%M')
            print('%s: epoch: %d, step: %d, loss: %f, accuracy: %f' % (t, epoch, step, loss, accuracy))
            # Write the results to Tensorboard
            train_summary_writer.add_summary(summary, step)

        # Validation step
        def valid_step(next_valid_element):
            # Reset the running validation metrics
            valid_loss = 0.0
            valid_accuracy = 0.0
            valid_precision = 0.0
            valid_recall = 0.0
            valid_f1_score = 0.0
            i = 0
            while True:
                try:
                    lines = sess.run(next_valid_element)
                    batch_x, batch_y = rnn.convert_input(lines)
                    feed_dict = {
                        rnn.input_x: batch_x,
                        rnn.labels: batch_y,
                        rnn.dropout_keep_prob: 1.0,
                        rnn.training: False
                    }
                    loss, accuracy, prediction, y_true = sess.run(
                        [rnn.loss, rnn.accuracy, rnn.prediction, rnn.labels], feed_dict)
                    precision = sk.metrics.precision_score(y_true=y_true, y_pred=prediction, average='weighted')
                    recall = sk.metrics.recall_score(y_true=y_true, y_pred=prediction, average='weighted')
                    f1_score = sk.metrics.f1_score(y_true=y_true, y_pred=prediction, average='weighted')
                    valid_loss += loss
                    valid_accuracy += accuracy
                    valid_precision += precision
                    valid_recall += recall
                    valid_f1_score += f1_score
                    i += 1
                except tf.errors.OutOfRangeError:
                    # The whole validation set has been consumed; average the metrics
                    valid_loss /= i
                    valid_accuracy /= i
                    valid_precision /= i
                    valid_recall /= i
                    valid_f1_score /= i
                    t = datetime.datetime.now().strftime('%m-%d %H:%M')
                    log = '%s: epoch %d, validation loss: %0.6f, accuracy: %0.6f' % (
                        t, epoch, valid_loss, valid_accuracy)
                    log = log + '\n' + ('precision: %0.6f, recall: %0.6f, f1_score: %0.6f' % (
                        valid_precision, valid_recall, valid_f1_score))
                    print(log)
                    log_file.write(log + '\n')
                    time.sleep(3)
                    # Write the results to Tensorboard
                    # valid_summary_writer.add_summary(valid_summary, step)
                    return

        print('Start training TextRNN, training mode=' + rnn.train_mode)
        sess.run(tf.global_variables_initializer())

        # Training loop
        for epoch in range(config.epoch_num):
            train_init_op, valid_init_op, next_train_element, next_valid_element = rnn.shuffle_datset()
            sess.run(train_init_op)
            while True:
                try:
                    lines = sess.run(next_train_element)
                    batch_x, batch_y = rnn.convert_input(lines)
                    train_step(batch_x, batch_y, config.dropout_keep_prob)
                except tf.errors.OutOfRangeError:
                    # Initialize the validation-set iterator
                    sess.run(valid_init_op)
                    valid_step(next_valid_element)
                    break

        train_summary_writer.close()
        log_file.close()

        # Save the parameters once training is done
        path = saver.save(sess, checkpoint_prefix, global_step=global_step)
        print("Saved model checkpoint to {}\n".format(path))
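# To reuse the checkpoint written at the end of train(), a later process can
# restore the newest checkpoint file. A minimal sketch, assuming the graph has
# been rebuilt the same way (rnn.setRNN() etc.) before restoring:
import os
import tensorflow as tf

with tf.Session() as sess:
    saver = tf.train.Saver()
    # Pick the most recent checkpoint written by train()
    latest = tf.train.latest_checkpoint(
        os.path.join(os.path.abspath("checkpoints"), "textrnn"))
    saver.restore(sess, latest)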