def predict():
    with open(map_path, "rb") as f:
        word_to_id, cat_to_id, seq_length, num_classes = pickle.load(f)
    id_to_cat = {v: k for k, v in cat_to_id.items()}

    config = TRNNConfig()
    config.num_classes = num_classes
    config.vocab_size = len(word_to_id)
    model = TextRNN(config)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    while True:
        line = input("Enter a test sentence: ")
        data_id = [[word_to_id[x] for x in list(native_content(line))
                    if x in word_to_id]]
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
        y_pred_cls = session.run(model.y_pred_cls,
                                 feed_dict={model.input_x: x_pad,
                                            model.keep_prob: 1.0})
        print('sentence: {}, predicted intent: {}'.format(
            line, id_to_cat[y_pred_cls[0]]))
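# A minimal companion sketch (an assumption, not shown in the snippet above):
# how the map file that predict() loads could be produced at training time.
# predict() unpickles the tuple (word_to_id, cat_to_id, seq_length,
# num_classes) in that order, so a writer must dump it in the same order.
# The function name save_label_maps is hypothetical.
import pickle

def save_label_maps(map_path, word_to_id, cat_to_id, seq_length, num_classes):
    """Persist the vocabulary/label maps that predict() expects."""
    with open(map_path, "wb") as f:
        pickle.dump((word_to_id, cat_to_id, seq_length, num_classes), f)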
def __init__(self):
    self.config = TRNNConfig()
    self.categories, self.cat_to_id = read_category()
    self.words, self.word_to_id = read_vocab(vocab_dir)
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
def __init__(self):
    self.config = TRNNConfig()
    self.categories, self.cat_to_id = read_category()
    self.words = np.load('./datas/dict_token.npy')
    self.word_to_id = np.load('./datas/token_to_id.npy').tolist()
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
def __init__(self):
    self.config = TRNNConfig()
    self.categories = categories
    self.cat_to_id = cat_to_id
    self.words = words
    self.word_to_id = word_to_id
    self.config.vocab_size = len(self.words)
    self.model = TextRNN(self.config)

    self.session = sess
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=rnn_save_path)  # restore the saved model
def read_example(filename_queue):
    """Read one example from filename_queue."""
    config = TRNNConfig()
    reader = tf.TFRecordReader()
    key, value = reader.read(filename_queue)
    features = tf.parse_single_example(
        value,
        features={
            "title_len": tf.FixedLenFeature([1], tf.int64),
            "title": tf.FixedLenFeature([config.seq_length], tf.int64),
            "label": tf.FixedLenFeature([config.seq_length], tf.float32),
            "frame_weight": tf.FixedLenFeature([config.seq_length], tf.float32),
        })
    title_len = tf.cast(features["title_len"], tf.int32)
    title = tf.cast(features["title"], tf.int32)
    label = tf.cast(features["label"], tf.float32)
    frame_weight = tf.cast(features["frame_weight"], tf.float32)
    return title_len, title, label, frame_weight
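# Hedged sketch: a writer that emits records compatible with read_example()'s
# parse spec above. Only the feature names, shapes, and dtypes come from the
# parser; the function name and argument layout are assumptions.
import tensorflow as tf

def write_example(writer, title_len, title_ids, label, frame_weight):
    """title_ids, label, and frame_weight must already be padded to seq_length."""
    example = tf.train.Example(features=tf.train.Features(feature={
        "title_len": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[title_len])),
        "title": tf.train.Feature(
            int64_list=tf.train.Int64List(value=title_ids)),
        "label": tf.train.Feature(
            float_list=tf.train.FloatList(value=label)),
        "frame_weight": tf.train.Feature(
            float_list=tf.train.FloatList(value=frame_weight)),
    }))
    writer.write(example.SerializeToString())

# usage sketch:
# with tf.python_io.TFRecordWriter("train.tfrecord") as writer:
#     write_example(writer, 7, ids, labels, weights)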
def __init__(self):
    with open(map_path, "rb") as f:
        (self.word_to_id, self.cat_to_id,
         self.seq_length, self.num_classes) = pickle.load(f)
    self.id_to_cat = {v: k for k, v in self.cat_to_id.items()}

    if model_type == 'cnn':
        self.config = TCNNConfig()
        self.config.num_classes = self.num_classes
        self.config.vocab_size = len(self.word_to_id)
        self.model = TextCNN(self.config)
    else:
        self.config = TRNNConfig()
        self.config.num_classes = self.num_classes
        self.config.vocab_size = len(self.word_to_id)
        self.model = TextRNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
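# Hedged sketch: a predict() method that would pair with the __init__ above,
# mirroring the standalone predict() in the first snippet. The method itself
# is an assumption; kr follows the kr.preprocessing alias used there.
def predict(self, line):
    """Classify one sentence and return its category label."""
    data_id = [[self.word_to_id[x] for x in list(line)
                if x in self.word_to_id]]
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, self.seq_length)
    y_pred_cls = self.session.run(self.model.y_pred_cls,
                                  feed_dict={self.model.input_x: x_pad,
                                             self.model.keep_prob: 1.0})
    return self.id_to_cat[y_pred_cls[0]]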
def load_model(self):
    sess = tf.Session()
    print('Configuring RNN model...')
    config = TRNNConfig()
    cnn_model = TextRNN(config)

    saver = tf.train.Saver()
    params_file = tf.train.latest_checkpoint(self.model_dir)
    saver.restore(sess, params_file)

    categories, cat_to_id = read_category()
    vocab_dir = 'cnews/cnews.vocab.txt'
    words, word_to_id = read_vocab(vocab_dir)

    self.words = words
    self.word_to_id = word_to_id
    self.categories = categories
    self.cat_to_id = cat_to_id
    self.cnn_model = cnn_model
    self.sess = sess
    print(self.cnn_model)
    print(self.sess)
print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_rnn.py [train / test]""") conn = pymysql.connect(host='localhost', user='******', passwd='sasa', db='text_clf', charset='utf8') cur = conn.cursor() print('Configuring RNN model...') config = TRNNConfig() #if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 # build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_vec = read_category() #words, word_to_id = read_vocab(vocab_dir) #config.vocab_size = len(words) model = TextRNN(config, get_embedding()) print('training') if sys.argv[1] == 'train': train() else: test()
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls,
                                        target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if missing
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
test_dir = os.path.join(base_dir, 'cnewstest.txt')
val_dir = os.path.join(base_dir, 'cnewsval.txt')
vocab_dir = os.path.join(base_dir, 'cnewsvocab.txt')
vector_word_dir = os.path.join(base_dir, 'vector_word.txt')  # word vectors trained by word2vec
vector_word_npz = os.path.join(base_dir, 'vector_word.npz')  # word vectors saved as a numpy file

# save path for the best validation result
save_dir = r'HOME\mydata\lstm\checkpoints'
save_path = os.path.join(save_dir, 'best_validation')

# build the vocabulary
'''build_vocab(train_dir, vocab_dir)
_, word_to_id = read_vocab(vocab_dir)
categories, cat_to_id = read_category()
config = TRNNConfig()
model = TextRNN(config)'''
config = TRNNConfig()
build_vocab(train_dir, vocab_dir)
words, word_to_id = read_vocab(vocab_dir)
categories, cat_to_id = read_category()
config.vocab_size = len(words)
if not os.path.exists(vector_word_npz):
    export_word2vec_vectors(word_to_id, vector_word_dir, vector_word_npz)
config.pre_trianing = get_training_word2vec_vectors(vector_word_npz)
model = TextRNN(config)
init = tf.global_variables_initializer()


def get_time_dif(start_time):
    """Return the elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
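# Hedged sketch of the npz round trip used above: get_training_word2vec_vectors
# plausibly loads the matrix that export_word2vec_vectors saved. The
# 'embeddings' key is an assumption about the archive's layout.
import numpy as np

def get_training_word2vec_vectors(filename):
    """Load the pretrained embedding matrix from a .npz archive."""
    data = np.load(filename)
    return data['embeddings']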
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 3 or sys.argv[1] not in ['train', 'test'] \
            or sys.argv[2] not in ['char', 'word']:
        raise ValueError(
            """usage: python run_rnn.py [train / test] [char / word]""")

    print('Configuring RNN model...')
    if sys.argv[2] == 'char':
        config = TRNNConfig()
        if not os.path.exists(vocab_dir):  # rebuild the vocabulary if missing
            build_vocab(train_dir, vocab_dir, config.vocab_size)
        categories, cat_to_id = read_category()
        words, word_to_id = read_vocab(vocab_dir)
        config.vocab_size = len(words)
        model = TextRNN(config)
        if sys.argv[1] == 'train':
            train()
        else:
            test()
    else:
        print("train on word embedding...")
        config = TRNNConfig()
        config.seq_length = 400
# 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': print('Configuring RNN model...',sys.argv) if len(sys.argv) == 6 and sys.argv[1] in ['train', 'test']: config = TRNNConfig() t_name = sys.argv[3] t_th = sys.argv[2] data_dir = sys.argv[4] base_dir = 'data/' + data_dir + '/' + t_name classes = sys.argv[5].split('-') train_dir = os.path.join(base_dir, 'train.csv') test_dir = os.path.join(base_dir, 'test.csv') val_dir = os.path.join(base_dir, 'dev.csv') vocab_dir = os.path.join('data/data_orginal/'+t_name, 'vocab.csv') if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 print(' vocab_dir not exists: ',vocab_dir) build_vocab('data/data_orginal/'+t_name+'/whole.csv', vocab_dir, config.vocab_size)
    print(len(test_docs))
    train_file.close()

    test_file = open(rnn.test_dir, 'w')
    for test_doc in test_docs:
        temp = test_doc.split(',')
        test_data_X.append(corpus[int(temp[0]) - 1])
        test_data_Y.append(temp[1])
        string = corpus[int(temp[0]) - 1].replace('\n', '').replace('\t', '')
        test_file.write(temp[1] + '\t' + string + '\n')
    print('Configuring RNN model...')
    test_file.close()

    rnn.config = TRNNConfig()
    # if not os.path.exists(rnn.vocab_dir):  # if no vocab, build it
    build_vocab_words(rnn.train_dir, rnn.vocab_dir, rnn.config.vocab_size)
    rnn.categories, rnn.cat_to_id, rnn.id_to_cat = read_category()
    rnn.words, rnn.word_to_id = read_vocab(rnn.vocab_dir)
    rnn.config.vocab_size = len(rnn.words)

    # select a subset of word vectors
    rnn.missing_dir = os.path.join(rnn.base_dir,
                                   key + '.' + sub_key + '.missing.txt')
    missing_words_file = open(rnn.missing_dir, 'w')
    sub_embeddings = np.random.uniform(
        -0.0, 0.0, (rnn.config.vocab_size, embedding_dim))  # zero-initialized
    count = 0
    for i in range(0, rnn.config.vocab_size):
        if rnn.words[i] in word_vector_map:
            # (the snippet is truncated here in the source; a plausible
            # continuation copies the known vector and counts the hit,
            # logging unseen words to missing_words_file)
            sub_embeddings[i] = word_vector_map[rnn.words[i]]
            count += 1
        else:
            missing_words_file.write(rnn.words[i] + '\n')
    for f in F1:
        print('\t'.join(['%0.1f' % f[0], str(f[2]), str(f[3]),
                         str(f[4]), str(f[5])]))
    return auc, F1


if __name__ == '__main__':
    tf.reset_default_graph()
    base_dir = sys.argv[1]
    save_dir = sys.argv[2]
    ckpt_dir = sys.argv[3]
    train_dir = os.path.join(base_dir, 'train.txt')
    test_dir = os.path.join(base_dir, 'test.txt')
    val_dir = os.path.join(base_dir, 'val.txt')
    vocab_dir = os.path.join(base_dir, 'vocab.txt')
    predict_dir = os.path.join(base_dir, 'predict.txt')
    save_path = os.path.join(save_dir, 'best_validation')  # save path for the best validation result

    if len(sys.argv) > 4:
        option = sys.argv[4]
    else:
        option = 'train'

    print('Configuring RNN model...')
    config = TRNNConfig()
    tokenizer = Tokenizer(vocab_dir)
    config.vocab_size = len(tokenizer.vocab)
    model = TextRNN(config)
    print('Total number of parameters: %d' % np.sum(
        [np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]))

    if option == 'train':
        iter = batch_iter(train_dir, tokenizer, epochs=config.num_epochs)
        iter_test = batch_iter_test(val_dir, tokenizer)
        train()
    else:
        test()
# 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: # raise ValueError("""usage: python run_rnn.py [train / test]""") print('Configuring RNN model...') config = TRNNConfig() # 构建英文和中文的词表、字表 if not os.path.exists(vocab_cn): # 中文词汇表2370个,重建 # 中文是分好词的,我需要把他处理成一个一个的字 build_vocab(train_cnx, vocab_cn, config.vocab_size) if not os.path.exists(vocab_en): # 英文词汇表5000个,重建 build_vocab(train_eny, vocab_en, config.vocab_size) # 开始映射words to id words_cn, word_to_id_cn = read_vocab(vocab_cn) words_en, word_to_id_en = read_vocab(vocab_en) config.vocab_size_cn = len(words_cn) config.vocab_size_en = len(words_en)
def test():
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    config.num_epochs = 1
    start_time = time.time()
    config.batch_size = 10
    count = 0

    # configure GPU memory allocation
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    fw = open(config.test_dir_output, "w")
    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        test_dir_list = os.listdir(config.test_dir_tf)
        test_dir_list = [os.path.join(config.test_dir_tf, i)
                         for i in test_dir_list]
        queueTest = tf.train.string_input_producer(
            test_dir_list, num_epochs=config.num_epochs)
        text, title_len, title, label, frame_weight = read_example_test(queueTest)
        text_batch, title_len_batch, title_batch, label_batch, frame_weight_batch = \
            tf.train.batch([text, title_len, title, label, frame_weight],
                           batch_size=config.batch_size,
                           capacity=50000,
                           num_threads=1)

        with tf.variable_scope("model",
                               initializer=tf.random_uniform_initializer(-1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        fetches = [text_batch, model.input_x_len, model.y_pred, model.input_y]
        feed_dict = {}

        # init
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        # configure the Saver
        saver = tf.train.Saver()
        saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)
        try:
            while not coord.should_stop():
                texts, x_len, y_pred, y_test = sess.run(fetches, feed_dict=feed_dict)
                texts = "".join(texts.values).split("\n")
                for i in range(len(texts) - 1):
                    score = [str(int(j * 100)) for j in y_test[i]][:x_len[i][0]]
                    y_test_i = " ".join(score)
                    score = [str(int(j * 100)) for j in y_pred[i]][:x_len[i][0]]
                    y_pred_i = " ".join(score)
                    fw.write(texts[i] + "\ttarget:\t" + y_test_i +
                             "\tpredict\t" + y_pred_i + "\n")
                    count = count + 1
                    if count % 10000 == 0:
                        print(count)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
            coord.request_stop()
        coord.join(threads)
    fw.close()
def train():
    print('Configuring RNN model...')
    config = TRNNConfig()
    config.dropout_keep_prob = 1.0
    start_time = time.time()
    # config.batch_size = 10

    total_batch = 0              # total batch count
    best_mse_val = 99999999      # best validation MSE so far
    best_loss_val = 99999999     # best validation loss so far
    last_improved = 0            # batch index of the last improvement
    require_improvement = 5000   # stop early after 5000 batches without improvement
    count = 0
    tensorboard_dir = config.tensorboard_dir

    # configure GPU memory allocation
    tfconfig = tf.ConfigProto(log_device_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.6

    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        train_dir_list = os.listdir(config.train_dir_tf)
        train_dir_list = [os.path.join(config.train_dir_tf, i)
                          for i in train_dir_list]
        queueTrain = tf.train.string_input_producer(
            train_dir_list, num_epochs=config.num_epochs)
        title_len, title, label, frame_weight = read_example(queueTrain)
        title_len_batch, title_batch, label_batch, frame_weight_batch = \
            tf.train.batch([title_len, title, label, frame_weight],
                           batch_size=config.batch_size,
                           capacity=100000,
                           num_threads=1)

        with tf.variable_scope("model",
                               initializer=tf.random_uniform_initializer(-1, 1)):
            model = TextRNN(config=config,
                            input_x_len=title_len_batch,
                            input_x=title_batch,
                            input_y=label_batch,
                            frame_weight=frame_weight_batch)

        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("mse", model.mse)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

        fetches = [model.loss, model.mse]
        feed_dict = {}

        # init
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        # configure the Saver
        saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V1)
        if not config.retraining:
            saver.restore(sess=sess, save_path=config.modelPath)  # restore the saved model

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)
        try:
            while not coord.should_stop():
                # run training steps
                # titles, labels = sess.run([title_batch, label_batch])
                if total_batch % config.save_per_batch == 0:
                    # write training summaries to TensorBoard every save_per_batch batches
                    s = sess.run(merged_summary, feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # report performance every print_per_batch batches
                    loss_val, mse_val = sess.run(fetches, feed_dict)
                    if mse_val < best_mse_val or loss_val < best_loss_val:
                        # save the best result so far
                        best_mse_val = mse_val
                        best_loss_val = loss_val
                        last_improved = total_batch
                        improved_str = '*'
                        # saver.save(sess=sess, save_path=config.save_path)
                        if total_batch % config.save_per_batch == 0:
                            saver.save(sess,
                                       config.save_path + '_%03d'
                                       % (total_batch / config.save_per_batch),
                                       write_meta_graph=False)
                    else:
                        improved_str = ''
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Val Loss: {1:>6.5}, Mse: {2:>6.5}, Time: {3} {4}'
                    print(msg.format(total_batch, loss_val, mse_val,
                                     time_dif, improved_str))
                    # print(embedding_inputs)

                sess.run(model.optim, feed_dict)
                total_batch += 1
                if total_batch - last_improved > require_improvement:
                    # validation metric has not improved for a long time; stop early
                    print("No optimization for a long time, auto-stopping...")
                    coord.request_stop()
                    break
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
            coord.request_stop()
        coord.join(threads)
# 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_rnn.py [train / test]""") print('Configuring RNN model...') print('load data. . .') X = pickle.load(open(train_data, 'rb')) df, word_vecs, word_cab_num, sentence_max_len, class_num = X[0], X[1], X[2], X[3], X[4] config = TRNNConfig(sentence_max_len, class_num, word_cab_num) word_ids, W_list = process_data.getWordsVect(config, word_vecs) model = TextRNN(config, W_list, False) #默认不训练词向量 if sys.argv[1] == 'train': train() else: test()