def __init__(self):
    """Build the TextCNN graph and restore its weights from ``save_path``.

    After construction the instance exposes ``config``, ``categories``,
    ``cat_to_id``, ``words``, ``word_to_id``, ``model`` and ``session``.
    """
    cnn_config = TCNNConfig()
    self.config = cnn_config

    category_info = read_category()
    self.categories, self.cat_to_id = category_info

    vocab_info = read_vocab(vocab_dir)
    self.words, self.word_to_id = vocab_info

    # vocab_size is overwritten with the number of entries read_vocab returned.
    cnn_config.vocab_size = len(self.words)
    self.model = TextCNN(cnn_config)

    sess = tf.Session()
    self.session = sess
    sess.run(tf.global_variables_initializer())

    # Load the saved model into the freshly initialised session.
    tf.train.Saver().restore(sess=sess, save_path=save_path)
def __init__(self):
    """Create the CNN model and load the checkpoint stored at ``save_path``.

    Sets ``config``, the category and vocabulary lookups, ``model`` and a
    ``session`` holding the restored variables.
    """
    self.config = TCNNConfig()

    categories, cat_to_id = read_category()
    self.categories = categories
    self.cat_to_id = cat_to_id

    words, word_to_id = read_vocab(vocab_dir)
    self.words = words
    self.word_to_id = word_to_id

    # The configured vocabulary size follows the vocabulary that was read.
    self.config.vocab_size = len(words)
    self.model = TextCNN(self.config)

    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Read the saved model.
    checkpoint_loader = tf.train.Saver()
    checkpoint_loader.restore(sess=self.session, save_path=save_path)
def __init__(self):
    """Configure the CNN model using the categories in 'predict_check_data.xls'.

    Sets ``word_to_id``, ``table``, ``config``, ``model`` and ``categories``.
    ``categories`` holds the distinct values of the table's ``name`` column,
    ordered by first appearance in that column.
    """
    print('Configuring CNN model...')
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it is missing
        # NOTE(review): `config` here is not `self.config` (which is only
        # created further down); this relies on a `config` name being defined
        # elsewhere in the module - confirm.
        build_vocab(train_dir, vocab_dir, config.vocab_size)

    # The call is kept, but its results are superseded by the table-derived
    # category list assigned below.
    read_category()
    words, self.word_to_id = read_vocab(vocab_dir)

    self.table = pd.read_excel('predict_check_data.xls')

    # Convert the column once and remember the first row index of each name,
    # instead of re-scanning the list with list.index for every element.
    names = self.table['name'].tolist()
    first_seen = {}
    for position, name in enumerate(names):
        first_seen.setdefault(name, position)

    self.config = TCNNConfig(len(first_seen))
    self.config.vocab_size = len(words)
    self.model = TextCNN(self.config)

    # Distinct names, ordered by where they first occur in the table.
    self.categories = sorted(first_seen, key=first_seen.get)
def __init__(self):
    """Load the exported SavedModel from ``pd_path`` and bind its I/O tensors.

    Sets ``config``, the category and vocabulary lookups, ``session``,
    ``meta_graph_def`` and the tensors ``x`` (signature input 'input_x'),
    ``kp`` (signature input 'keep_prob') and ``y`` (signature output 'output').
    """
    self.config = TextCNNConfig()

    category_info = read_category()
    self.categories, self.cat_to_id = category_info
    vocab_info = read_vocab(vocab_dir)
    self.words, self.word_to_id = vocab_info
    self.config.vocab_size = len(self.words)

    # No TextCNN graph is built and no checkpoint is restored here; the graph
    # comes from the SavedModel loaded below.
    sess = tf.Session()
    self.session = sess
    sess.run(tf.global_variables_initializer())

    self.meta_graph_def = sm.loader.load(
        sess, tags=[sm.tag_constants.SERVING], export_dir=pd_path)

    # Look up the tensor names recorded under the serving signature.
    serving_signature = self.meta_graph_def.signature_def[signature_key]
    input_name = serving_signature.inputs['input_x'].name
    keep_prob_name = serving_signature.inputs['keep_prob'].name
    output_name = serving_signature.outputs['output'].name

    graph = sess.graph
    self.x = graph.get_tensor_by_name(input_name)
    self.kp = graph.get_tensor_by_name(keep_prob_name)
    self.y = graph.get_tensor_by_name(output_name)
print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: # raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(vocab_dir) config.vocab_size = len(words) dataNums = [16, 32, 64, 128, 256] for i in dataNums: if i == 0: continue g1 = tf.Graph() sess1 = tf.Session(graph=g1) with sess1.as_default(): with g1.as_default(): model = TextCNN(config, batch_size=i) train() test() plt.plot(xx, yy1) plt.title('train loss') plt.show()
msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \ + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}' print( msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str)) session.run(model.optim, feed_dict=feed_dict) # 运行优化 total_batch += 1 if total_batch - last_improved > require_improvement: # 验证集正确率长期不提升,提前结束训练 print("No optimization for a long time, auto-stopping...") flag = True break # 跳出循环 if flag: # 同上 break if __name__ == '__main__': print('Configuring CNN model...') config = TCNNConfig() if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() words, word2id = read_vocab(vocab_dir) config.vocab_size = len(words) model = TextCNN(config) train()
rnn.vocab_dir = os.path.join(rnn.base_dir, key + '.' + sub_key + '.vocab.txt') test_docs = test_sub_dic[sub_key] test_data_Y = [] print(len(test_docs)) for test_doc in test_docs: temp = test_doc.split(',') test_data_Y.append(temp[1]) print('Configuring CNN model...') rnn.config = TRNNConfig() #if not os.path.exists(cnn.vocab_dir): #if no vocab, build it build_vocab_words(rnn.all_dir, rnn.vocab_dir, rnn.config.vocab_size) rnn.words, rnn.word_to_id = read_vocab(rnn.vocab_dir) rnn.config.vocab_size = len(rnn.words) #select a subset of word vectors rnn.missing_dir = os.path.join(rnn.base_dir, key + '.' + sub_key + '.missing.txt') missing_words_file = open(rnn.missing_dir, 'w') sub_embeddings = np.random.uniform( -0.0, 0.0, (rnn.config.vocab_size, embedding_dim)) count = 0 for i in range(0, rnn.config.vocab_size): if (rnn.words[i] in word_vector_map ): # word_vector_map.has_key(cnn.words[i]) count = count sub_embeddings[i] = word_vector_map.get(rnn.words[i]) else:
msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \ + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}' print( msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str)) session.run(train_op, feed_dict=feed_dict) # 运行优化 total_batch += 1 if total_batch - last_improved > require_improvement: # 验证集正确率长期不提升,提前结束训练 print("No optimization for a long time, auto-stopping...") flag = True break # 跳出循环 if flag: # 同上 break if __name__ == '__main__': model = TextCNN() config = model.config file_config, _ = Config().parse.parse_known_args() print('Configuring CNN model...') if not os.path.exists(file_config.vocab_path): # 如果不存在词汇表,重建 build_vocab(file_config.train_path, file_config.vocab_path, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(file_config.vocab_path) config.vocab_size = len(words) train(model, file_config)
# 评估 print("Precision, Recall and F1-Score...") print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_rnn.py [train / test]""") print('Configuring RNN model...') config = TRNNConfig() if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(vocab_dir) config.vocab_size = len(words) model = TextRNN(config) if sys.argv[1] == 'train': train() else: test()
metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_rnn.py [train / test]""") print('Configuring RNN model...') config = TRNNConfig() if not os.path.exists(vocab_path): # 如果不存在词汇表,重建 build_vocab(train_path, vocab_path, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(vocab_path) config.vocab_size = len(words) model = TextRNN(config) if sys.argv[1] == 'train': train() else: test()
test_data_X.append(corpus[int(temp[0]) - 1]) test_data_Y.append(temp[1]) string = corpus[int(temp[0]) - 1] #string = expand_abbr(string) str_to_write = string test_file.write(temp[1] + '\t' + str_to_write + '\n') all_file.write(temp[1] + '\t' + str_to_write + '\n') print('Configuring CNN model...') test_file.close() all_file.close() cnn.config = TCNNConfig() #if not os.path.exists(cnn.vocab_dir): #if no vocab, build it build_vocab_words(cnn.all_dir, cnn.vocab_dir, cnn.config.vocab_size) cnn.words, cnn.word_to_id = read_vocab(cnn.vocab_dir) cnn.config.vocab_size = len(cnn.words) #select a subset of word vectors cnn.missing_dir = os.path.join(cnn.base_dir, key + '.' + sub_key + '.missing.txt') missing_words_file = open(cnn.missing_dir, 'w') sub_embeddings = np.random.uniform( -0.0, 0.0, (cnn.config.vocab_size, embedding_dim)) count = 0 for i in range(0, cnn.config.vocab_size): if (cnn.words[i] in word_vector_map ): #word_vector_map.has_key(cnn.words[i]) count = count sub_embeddings[i] = word_vector_map.get(cnn.words[i]) else:
# 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() # 读取配置文件 if not os.path.exists( vocab_dir): # 如果不存在词汇表,用train_dir中频率最高的vocab_size-1个词构建词汇表 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() # 读分类list 和 分类-id 字典 words, word_to_id = read_vocab(vocab_dir) # 读词汇表list 和 words-id 字典 config.vocab_size = len(words) # 词汇表大小重新设定 model = TextCNN(config) if sys.argv[1] == 'train': train() elif sys.argv[1] == 'test': test() else: raise ValueError("""usage: python run_cnn.py [train / test]""")
def train(model=None, data=None):
    """Train the text CNN, saving the checkpoint with the best validation accuracy.

    Scalars for loss and accuracy are written to 'tensorboard/textcnn' every
    ``config.save_per_batch`` batches. Train/validation metrics are printed
    every ``config.print_per_batch`` batches. Training stops early once more
    than 1000 batches pass without a new best validation accuracy.

    Args:
        model: Object exposing ``loss``, ``acc``, ``optim`` and ``keep_prob``.
            Defaults to the module-level ``model`` so that the existing
            zero-argument call ``train()`` keeps working.
        data: Unused; accepted only for signature compatibility.

    Relies on the module-level names ``config``, ``word_to_id``,
    ``cat_to_id``, ``train_dir``, ``val_dir``, ``save_dir`` and ``save_path``.
    """
    if model is None:
        model = globals()['model']

    print("Configuring TensorBoard and Saver...")
    # Delete the tensorboard folder before retraining, otherwise the new
    # curves are drawn over the old ones.
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    session = None
    try:
        # Configure the Saver.
        saver = tf.train.Saver()
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        print("Loading training and validation data...")
        start_time = time.time()
        x_train, y_train = process_file(train_dir, word_to_id, cat_to_id,
                                        config.seq_length)
        x_val, y_val = process_file(val_dir, word_to_id, cat_to_id,
                                    config.seq_length)
        time_dif = get_time_dif(start_time)
        print("Time usage:", time_dif)

        # Create the session.
        session = tf.Session()
        session.run(tf.global_variables_initializer())
        writer.add_graph(session.graph)

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # number of batches processed so far
        best_acc_val = 0.0  # best validation accuracy seen so far
        last_improved = 0  # batch index of the last improvement
        require_improvement = 1000  # stop after this many batches without improvement

        msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'
               ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}')

        stop_training = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

                if total_batch % config.save_per_batch == 0:
                    # Periodically write the scalars to TensorBoard.
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # Report metrics with dropout switched off. A copy of the
                    # feed is used so the optimisation step below still runs
                    # with the configured keep probability.
                    eval_feed = dict(feed_dict)
                    eval_feed[model.keep_prob] = 1.0
                    loss_train, acc_train = session.run(
                        [model.loss, model.acc], feed_dict=eval_feed)
                    loss_val, acc_val = evaluate(session, x_val, y_val)

                    if acc_val > best_acc_val:
                        # Keep the best result.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        improved_str = '*'
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    print(msg.format(total_batch, loss_train, acc_train,
                                     loss_val, acc_val, time_dif, improved_str))

                session.run(model.optim, feed_dict=feed_dict)  # optimisation step
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has stalled; end training early.
                    print("No optimization for a long time, auto-stopping...")
                    stop_training = True
                    break
            if stop_training:
                break
    finally:
        # Close the writer and the session even if training fails part-way.
        writer.close()
        if session is not None:
            session.close()


def test():
    """Restore the checkpoint at ``save_path`` and report test-set metrics.

    Prints test loss and accuracy, a per-class precision/recall/F1 report,
    the confusion matrix and the elapsed time.

    Relies on the module-level names ``model``, ``config``, ``categories``,
    ``word_to_id``, ``cat_to_id``, ``test_dir`` and ``save_path``.
    """
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id,
                                  config.seq_length)

    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # read the saved model

        print('Testing...')
        loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        batch_size = 128
        data_len = len(x_test)

        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=data_len, dtype=np.int32)  # predictions
        # Predict batch by batch; the slice end is clamped by Python slicing.
        for start_id in range(0, data_len, batch_size):
            end_id = min(start_id + batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls,
                                                      feed_dict=feed_dict)
    finally:
        session.close()

    # Evaluation.
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls,
                                        target_names=categories))

    # Confusion matrix.
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # This entry point always trains; the train/test command-line switch is
    # not wired up in this variant of the script.
    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it is missing
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)
    train(model)
print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': # 输入参数 train 和 test 表示训练与测试 # 需要在命令行运行 python run_cnn.py <train>|<test> if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() # 获得TCNNConfig设置,TCNNConfig表示CNN配置参数 if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 单词表长度5000,是train里面出现最频繁的5000个单词 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() # read_category()获取目录,cat_to_id 标签:序号的字典 words, word_to_id = read_vocab(vocab_dir) # 将词汇表的各个单词编号 config.vocab_size = len(words) # 更新词汇表长度 model = TextCNN(config) # 构建CNN模型,很重要 if sys.argv[1] == 'train': train() else: test()
cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) return y_pred_cls if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id, id_to_cat = read_category() words, word_to_id = read_vocab(vocab_dir) entities, entity_to_id = read_vocab(entity_vocab_dir) config.vocab_size = len(words) config.entity_vocab_size = len(entities) word_embeddings = np.random.uniform(-0.0, 0.0, (config.vocab_size, 100)) entity_embeddings = np.random.uniform(-0.0, 0.0, (config.entity_vocab_size, 200)) model = TextCNN(config, word_embeddings, entity_embeddings) if sys.argv[1] == 'train': train() else: test()
test_data_X.append(corpus[int(temp[0]) - 1]) test_data_Y.append(temp[1]) string = corpus[int(temp[0]) - 1] str_to_write = string test_file.write(temp[1] + '\t' + str_to_write + '\n') all_file.write(temp[1] + '\t' + str_to_write + '\n') test_file.close() all_file.close() print('Configuring CNN model...') cnn.config = TCNNConfig() build_vocab_words(cnn.all_dir, cnn.vocab_dir, cnn.config.vocab_size) cnn.words, cnn.word_to_id = read_vocab(cnn.vocab_dir) cnn.config.vocab_size = len(cnn.words) #select a subset of word vectors cnn.missing_dir = os.path.join(cnn.base_dir, key + '.' + sub_key + '.missing.txt') missing_words_file = open(cnn.missing_dir, 'w') word_embeddings = np.random.uniform( -0.0, 0.0, (cnn.config.vocab_size, word_embedding_dim)) count = 0 for i in range(0, cnn.config.vocab_size): if (cnn.words[i] in word_vector_map ): #word_vector_map.has_key(cnn.words[i]) word_embeddings[i] = word_vector_map.get(cnn.words[i]) else: count = count + 1