def predict():
    """Interactively classify sentences typed on stdin with a restored TextCNN.

    Loads the mappings pickled at ``map_path``, builds the model, restores
    the checkpoint at ``save_path`` and then prompts for sentences, printing
    the predicted category for each. The loop ends cleanly on end-of-input
    or Ctrl-C at the prompt, and the session is always closed.
    """
    # NOTE(review): pickle.load can execute arbitrary code; map_path must
    # only ever point at a trusted file.
    with open(map_path, "rb") as f:
        word_to_id, cat_to_id, seq_length, num_classes = pickle.load(f)
    id_to_cat = {v: k for k, v in cat_to_id.items()}

    config = TCNNConfig()
    config.num_classes = num_classes
    config.vocab_size = len(word_to_id)
    # NOTE(review): inputs below are padded to the pickled seq_length while
    # the model is built from TCNNConfig's own seq_length - confirm they agree.
    model = TextCNN(config)

    # The context manager releases the session even if restoring or a
    # prediction raises.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        while True:
            try:
                line = str(input("请输入测试句子:"))
            except (EOFError, KeyboardInterrupt):
                # No more input: leave the loop instead of dying with a traceback.
                break

            # Characters missing from the vocabulary are dropped.
            data_id = [[
                word_to_id[x] for x in native_content(line) if x in word_to_id
            ]]
            x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
            y_pred_cls = session.run(model.y_pred_cls,
                                     feed_dict={
                                         model.input_x: x_pad,
                                         model.keep_prob: 1.0
                                     })
            print('sentence : {}, prdict intent : {}'.format(
                line, id_to_cat[y_pred_cls[0]]))
def __init__(self, LENGTH):
    """Build a TextCNN with ``seq_length`` set to ``LENGTH`` and restore it.

    Args:
        LENGTH: Value assigned to ``config.seq_length`` before the model is
            constructed.
    """
    self.config = TCNNConfig()
    self.config.seq_length = LENGTH
    self.model = TextCNN(self.config)

    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Load the saved model from the module-level ``save_path``.
    checkpoint_saver = tf.train.Saver()
    checkpoint_saver.restore(sess=self.session, save_path=save_path)
def __init__(self):
    """Load categories and vocabulary, build the TextCNN, restore its weights."""
    self.config = TCNNConfig()

    category_info = read_category()
    self.categories, self.cat_to_id = category_info
    vocab_info = read_vocab(vocab_dir)
    self.words, self.word_to_id = vocab_info

    # The model's vocabulary size follows the vocabulary that was read.
    self.config.vocab_size = len(self.words)
    self.model = TextCNN(self.config)

    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Load the saved model from the module-level ``save_path``.
    checkpoint_saver = tf.train.Saver()
    checkpoint_saver.restore(sess=self.session, save_path=save_path)
def __init__(self):
    """Build a TextCNN with pre-trained embeddings and load its checkpoint.

    Raises:
        SystemExit: If ``load_checkpoint`` reports failure. The session is
            closed first and the exit carries a message, so the process
            ends with a non-zero status rather than silently.
    """
    self.config = TCNNConfig()
    self.categories, self.cat_to_id = read_category()
    self.words, self.word_to_id = read_vocab(vocab_dir)
    self.config.vocab_size = len(self.words)
    self.config.pre_training = np.load(pre_training)
    self.model = TextCNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())

    if not load_checkpoint(save_dir, self.session):
        # Release the session before leaving. ``exit()`` is a helper from
        # the ``site`` module meant for interactive use, so raise
        # SystemExit directly (same exception type, explicit failure).
        self.session.close()
        raise SystemExit(
            'No checkpoint could be loaded from {!r}'.format(save_dir))
def load_variable_pb():
    """Serve interactive predictions from the SavedModel stored in ``pb/model``.

    Loads the SavedModel with the SERVING tag, resolves the ``y_pred_cls``
    and ``y_pred_prob`` outputs through the first signature, then prompts
    for sentences and prints the predicted category and the second fetched
    output. The loop ends cleanly on end-of-input or Ctrl-C at the prompt,
    and the session is always closed.
    """
    session = tf.Session(graph=tf.Graph())
    try:
        model_file_path = "pb/model"
        meta_graph = tf.saved_model.loader.load(
            session, [tf.saved_model.tag_constants.SERVING], model_file_path)
        model_graph_signature = list(meta_graph.signature_def.items())[0][1]

        # Map signature keys to concrete tensor names once, up front.
        output_tensor_dict = {
            op_name: tensor_info.name
            for op_name, tensor_info in model_graph_signature.outputs.items()
        }
        input_tensor_names = {
            op_name: tensor_info.name
            for op_name, tensor_info in model_graph_signature.inputs.items()
        }

        output_feed = []
        for name in ('y_pred_cls', 'y_pred_prob'):
            output_feed.append(output_tensor_dict[name])
            print(output_tensor_dict[name])
        print("load model finish!")

        config = TCNNConfig()
        categories, _ = read_category()
        # NOTE(review): the result is used as a mapping below; confirm that
        # read_vocab returns the word->id dict itself here and not a tuple.
        word_to_id = read_vocab(vocab_dir)
        # Characters outside the vocabulary are mapped to the padding id.
        pad_id = word_to_id['<PAD>']

        while True:
            try:
                string = input("请输入测试句子: ").strip()
            except (EOFError, KeyboardInterrupt):
                # No more input: stop instead of dying with a traceback.
                break

            input_x = [[word_to_id.get(x, pad_id) for x in string]]
            input_x = tf.keras.preprocessing.sequence.pad_sequences(
                sequences=input_x, maxlen=config.seq_length)
            inputs = {'input_x': input_x, 'keep_prob': 1.0}
            feed_dict = {
                tensor_name: inputs[op_name]
                for op_name, tensor_name in input_tensor_names.items()
            }

            outputs = session.run(output_feed, feed_dict=feed_dict)
            print(categories[outputs[0][0]])
            print(outputs[1][0])
    finally:
        session.close()
def __init__(self):
    """Wire module-level vocabulary, categories and session into a restored TextCNN.

    Reads ``categories``, ``cat_to_id``, ``words``, ``word_to_id``, ``sess``
    and ``cnn_save_path`` from the enclosing module scope.
    """
    self.config = TCNNConfig()

    # Reuse the mappings prepared at module level.
    self.categories, self.cat_to_id = categories, cat_to_id
    self.words, self.word_to_id = words, word_to_id

    self.config.vocab_size = len(self.words)
    self.model = TextCNN(self.config)

    # The session is shared with the rest of the module rather than owned here.
    self.session = sess
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Load the saved model.
    checkpoint_saver = tf.train.Saver()
    checkpoint_saver.restore(sess=self.session, save_path=cnn_save_path)
def __init__(self):
    """Build a TextCNN with embeddings read from CSV and restore its checkpoint."""
    self.config = TCNNConfig()

    vocab_info = read_vocab(vocab_dir)
    self.words, self.word_to_id = vocab_info
    self.config.vocab_size = len(self.words)

    # The embedding table is a header-less CSV; only its values are used.
    embedding_frame = pd.read_csv(word_vector_dir, header=None, index_col=None)
    self.config.pre_training = embedding_frame.values

    self.model = TextCNN(self.config)

    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
    )
    self.session = tf.Session(config=session_conf)
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Load the saved model from the module-level ``save_path``.
    checkpoint_saver = tf.train.Saver()
    checkpoint_saver.restore(sess=self.session, save_path=save_path)
def __init__(self):
    """Configure a TextCNN whose classes come from ``predict_check_data.xls``.

    ``self.categories`` holds the distinct values of the spreadsheet's
    ``name`` column in order of first appearance, and that count is passed
    to ``TCNNConfig``. The vocabulary is rebuilt first if its file is
    missing.
    """
    # Local import so the block does not depend on the file's import list.
    from collections import OrderedDict

    print('Configuring CNN model...')

    self.table = pd.read_excel('predict_check_data.xls')
    names = self.table['name'].tolist()
    # Distinct names in first-seen order: the same result as sorting a set
    # by ``names.index``, without the quadratic lookups.
    self.categories = list(OrderedDict.fromkeys(names))

    self.config = TCNNConfig(len(self.categories))

    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if missing
        # Use this instance's config rather than a module-level ``config``,
        # which is not defined when the class is used outside the script.
        build_vocab(train_dir, vocab_dir, self.config.vocab_size)

    # Call kept from the original; its categories are superseded by the
    # spreadsheet-derived list above.
    read_category()
    words, self.word_to_id = read_vocab(vocab_dir)

    self.config.vocab_size = len(words)
    self.model = TextCNN(self.config)
def __init__(self):
    """Load pickled id mappings, build the TextCNN and restore its checkpoint."""
    self.map_path = './model/ids.map'
    self.save_path = './model/best_validation'

    # NOTE(review): pickle.load can execute arbitrary code; the map file
    # must come from a trusted source.
    with open(self.map_path, "rb") as f:
        mappings = pickle.load(f)
    (self.word_to_id, self.cat_to_id,
     self.seq_length, self.num_classes) = mappings

    # Inverse lookup: class id -> category.
    self.id_to_cat = {idx: cat for cat, idx in self.cat_to_id.items()}

    self.config = TCNNConfig()
    self.config.num_classes = self.num_classes
    self.config.vocab_size = len(self.word_to_id)
    self.model = TextCNN(self.config)

    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Load the saved model.
    checkpoint_saver = tf.train.Saver()
    checkpoint_saver.restore(sess=self.session, save_path=self.save_path)
def model_convert(model_path, pb_path):
    """Freeze the checkpoint at ``model_path`` and write it to ``pb_path``.

    Args:
        model_path: Checkpoint path handed to ``Saver.restore``.
        pb_path: Destination file for the serialized frozen graph.
    """
    # Constructing the model populates the default graph; the object itself
    # is not needed afterwards.
    TextCNN(TCNNConfig())

    with tf.Session() as session:
        init_op = tf.global_variables_initializer()
        session.run(init_op)

        # Load the saved model.
        tf.train.Saver().restore(sess=session, save_path=model_path)

        node_names = [node.name for node in session.graph.as_graph_def().node]
        print(node_names)

        kept_nodes = ["keep_prob", "input_x", "score/predict"]
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            session, session.graph_def, output_node_names=kept_nodes)

        with open(pb_path, 'wb') as pb_file:
            pb_file.write(frozen_graph_def.SerializeToString())
def read_example(filename_queue):
    """Read one example from filename_queue.

    Returns:
        A ``(text, title, label)`` tuple; ``title`` and ``label`` are cast
        to ``tf.int32``.
    """
    reader = tf.TFRecordReader()
    config = TCNNConfig()

    feature_spec = {
        "text": tf.VarLenFeature(tf.string),
        "title": tf.FixedLenFeature([config.seq_length], tf.int64),
        "label": tf.FixedLenFeature([config.num_classes], tf.int64),
    }

    # The record key is not needed, only the serialized value.
    _, serialized = reader.read(filename_queue)
    parsed = tf.parse_single_example(serialized, features=feature_spec)

    return (
        parsed["text"],
        tf.cast(parsed["title"], tf.int32),
        tf.cast(parsed["label"], tf.int32),
    )
def __init__(self):
    """Build a TextCNN on top of a loaded word2vec model and restore its weights."""
    embedding_path = os.path.join('data', 'word_embedding', 'embeddings.bin')
    embedding_model = word2vec.load(embedding_path)  # type: word2vec.WordVectors

    self.segor = Train()
    self.config = TCNNConfig()
    self.categories, self.cat_to_id = read_category()

    # Vocabulary size and word->id lookup both come from the embedding model.
    vocabulary = list(embedding_model.vocab)
    self.word_to_id = embedding_model.vocab_hash
    self.config.vocab_size = len(vocabulary)

    self.model = TextCNN(self.config, embedding_model)

    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    # Load the saved model from the module-level ``save_path``.
    checkpoint_saver = tf.train.Saver()
    checkpoint_saver.restore(sess=self.session, save_path=save_path)
def __init__(self, stopwords_path, vocab_dir, categories_dir, save_path):
    """Set up segmentation, stop words, mappings and a restored TextCNN.

    The model lives in its own ``tf.Graph`` with a dedicated session.

    Args:
        stopwords_path: Text file with one stop word per line.
        vocab_dir: Path passed to ``read_vocab``.
        categories_dir: Path passed to ``read_category``.
        save_path: Checkpoint path passed to ``Saver.restore``.
    """
    self.thu = thulac.thulac(seg_only=True)

    # Read through a context manager so the file handle is closed promptly.
    with open(stopwords_path) as stopwords_file:
        self.stopwords = [line.strip() for line in stopwords_file]

    _, cat_to_id = read_category(categories_dir)
    self.id_to_cat = {v: k for k, v in cat_to_id.items()}
    words, self.word_to_id = read_vocab(vocab_dir)

    g = tf.Graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    self.sess = tf.Session(graph=g, config=tf_config)

    # Build the model and its Saver inside the private graph so its
    # variables do not end up in the process-wide default graph.
    with self.sess.as_default():
        with g.as_default():
            self.config = TCNNConfig()
            self.config.num_classes = len(cat_to_id)
            self.config.vocab_size = len(words)
            self.model = TextCNN(self.config)
            saver = tf.train.Saver()
            self.sess.run(tf.global_variables_initializer())
            saver.restore(self.sess, save_path=save_path)
def load_model(self):
    """Build the TextCNN, restore the newest checkpoint and load the mappings.

    On success stores ``words``, ``word_to_id``, ``categories``,
    ``cat_to_id``, ``cnn_model`` and ``sess`` on the instance.

    Raises:
        ValueError: If no checkpoint is found under ``self.model_dir``.
    """
    sess = tf.Session()
    try:
        print('Configuring CNN model...')
        config = TCNNConfig()
        cnn_model = TextCNN(config)

        saver = tf.train.Saver()
        params_file = tf.train.latest_checkpoint(self.model_dir)
        if params_file is None:
            # latest_checkpoint returns None when nothing is found; fail
            # here with the directory named instead of inside restore().
            raise ValueError(
                'No checkpoint found in {!r}'.format(self.model_dir))
        saver.restore(sess, params_file)

        categories, cat_to_id = read_category()
        vocab_dir = 'cnews/cnews.vocab.txt'
        words, word_to_id = read_vocab(vocab_dir)
    except Exception:
        # Do not leak the session when loading fails part-way.
        sess.close()
        raise

    self.words = words
    self.word_to_id = word_to_id
    self.categories = categories
    self.cat_to_id = cat_to_id
    self.cnn_model = cnn_model
    self.sess = sess
    print(self.cnn_model)
    print(self.sess)
print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': # 输入参数 train 和 test 表示训练与测试 # 需要在命令行运行 python run_cnn.py <train>|<test> if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() # 获得TCNNConfig设置,TCNNConfig表示CNN配置参数 if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 单词表长度5000,是train里面出现最频繁的5000个单词 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() # read_category()获取目录,cat_to_id 标签:序号的字典 words, word_to_id = read_vocab(vocab_dir) # 将词汇表的各个单词编号 config.vocab_size = len(words) # 更新词汇表长度 model = TextCNN(config) # 构建CNN模型,很重要 if sys.argv[1] == 'train': train() else: test()
print("Confusion Matrix...") print(y_test_cls) print(y_pred_cls) cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: # raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(vocab_dir) config.vocab_size = len(words) dataNums = [16, 32, 64, 128, 256] for i in dataNums: if i == 0: continue g1 = tf.Graph() sess1 = tf.Session(graph=g1) with sess1.as_default(): with g1.as_default(): model = TextCNN(config, batch_size=i) train()
# 评估 print("Precision, Recall and F1-Score...") print( metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': config = TCNNConfig() # 1.加载配置参数; 初始化右边的类之后得到左边的对象config if not os.path.exists(vocab_dir): # 如果cnews.vocab.txt不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() # 制作分类目录 words, word_to_id = read_vocab(vocab_dir) config.vocab_size = len(words) model = TextCNN(config) option = 'test' if option == 'train': train() else: test()
def train(model=None, data=None):
    """Train the TextCNN with periodic validation, checkpointing and early stop.

    Args:
        model: Object exposing ``loss``, ``acc``, ``optim`` and
            ``keep_prob``. Defaults to the module-level ``model`` so the
            zero-argument call in the ``__main__`` block works.
        data: Unused; accepted only to keep the original signature.
    """
    if model is None:
        # NOTE(review): feed_data/evaluate are called without a model
        # argument, so they presumably use the module-level model too.
        model = globals()["model"]

    print("Configuring TensorBoard and Saver...")
    # Delete the tensorboard folder before retraining, otherwise the new
    # curves are drawn over the old ones.
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the Saver.
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id,
                                    config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id,
                                config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        writer.add_graph(session.graph)

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # batches processed so far
        best_acc_val = 0.0  # best validation accuracy seen
        last_improved = 0  # batch index of the last improvement
        require_improvement = 1000  # stop after this many batches without one

        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch,
                                      config.dropout_keep_prob)

                if total_batch % config.save_per_batch == 0:
                    # Periodically write the training scalars to TensorBoard.
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # Report train/validation metrics with dropout disabled.
                    feed_dict[model.keep_prob] = 1.0
                    loss_train, acc_train = session.run(
                        [model.loss, model.acc], feed_dict=feed_dict)
                    loss_val, acc_val = evaluate(session, x_val, y_val)

                    if acc_val > best_acc_val:
                        # Keep only the best model seen so far.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        improved_str = '*'
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                          + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train,
                                     loss_val, acc_val, time_dif,
                                     improved_str))

                # Reporting above may have set keep_prob to 1.0; restore the
                # training value so the optimizer step still uses dropout.
                feed_dict[model.keep_prob] = config.dropout_keep_prob
                session.run(model.optim, feed_dict=feed_dict)  # optimizer step
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has stalled: stop early.
                    print("No optimization for a long time, auto-stopping...")
                    return
    finally:
        writer.close()
        session.close()


def test():
    """Evaluate the saved model on the test set and print the metrics."""
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id,
                                  config.seq_length)

    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        print('Testing...')
        loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        batch_size = 128
        data_len = len(x_test)

        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=data_len, dtype=np.int32)  # predictions
        # Predict batch by batch; an empty test set runs no batches.
        for start_id in range(0, data_len, batch_size):
            end_id = min(start_id + batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls,
                                                      feed_dict=feed_dict)
    finally:
        session.close()

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls,
                                        target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if missing
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    # Command-line mode selection is disabled here; the script always trains.
    train()
f.close() return 0 if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test', 'predict']: raise ValueError( """usage: python run_cnn.py [train / test / predict]""") print('Configuring CNN model...') table = pd.read_excel('predict_check_data.xls') SubCategoryName_list = table['name'].tolist() category_set = list(set(SubCategoryName_list)) category_set.sort(key=SubCategoryName_list.index) config = TCNNConfig(len(category_set)) if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(vocab_dir) config.vocab_size = len(words) #print(config.vocab_size) model = TextCNN(config) if sys.argv[1] == 'train': train() elif sys.argv[1] == 'predict': predict() else: test()
# 评估 print("Precision, Recall and F1-Score...") print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() # 加载配置文件 if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab_to_words(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() # 获取分类类别 words, word_to_id = read_vocab(vocab_dir) # 读取词汇表 config.vocab_size = len(words) # 修改 词汇表大小 model = TextCNN(config) if sys.argv[1] == 'train': train() else: test()
# 评估 print("Precision, Recall and F1-Score...") print( metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) # 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': config = TCNNConfig() # 获取配置参数 if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() words, word_to_id = read_vocab(vocab_dir) config.vocab_size = len(words) model = TextCNN(config) option = 'train' if option == 'train': train() else: test()
test_file = open(cnn.test_dir, 'w') for test_doc in test_docs: temp = test_doc.split(',') test_data_X.append(corpus[int(temp[0]) - 1]) test_data_Y.append(temp[1]) string = corpus[int(temp[0]) - 1] #string = expand_abbr(string) str_to_write = string test_file.write(temp[1] + '\t' + str_to_write + '\n') all_file.write(temp[1] + '\t' + str_to_write + '\n') print('Configuring CNN model...') test_file.close() all_file.close() cnn.config = TCNNConfig() #if not os.path.exists(cnn.vocab_dir): #if no vocab, build it build_vocab_words(cnn.all_dir, cnn.vocab_dir, cnn.config.vocab_size) cnn.words, cnn.word_to_id = read_vocab(cnn.vocab_dir) cnn.config.vocab_size = len(cnn.words) #select a subset of word vectors cnn.missing_dir = os.path.join(cnn.base_dir, key + '.' + sub_key + '.missing.txt') missing_words_file = open(cnn.missing_dir, 'w') sub_embeddings = np.random.uniform( -0.0, 0.0, (cnn.config.vocab_size, embedding_dim)) count = 0 for i in range(0, cnn.config.vocab_size): if (cnn.words[i] in word_vector_map ): #word_vector_map.has_key(cnn.words[i])
# 混淆矩阵 print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() # 读取配置文件 if not os.path.exists( vocab_dir): # 如果不存在词汇表,用train_dir中频率最高的vocab_size-1个词构建词汇表 build_vocab(train_dir, vocab_dir, config.vocab_size) categories, cat_to_id = read_category() # 读分类list 和 分类-id 字典 words, word_to_id = read_vocab(vocab_dir) # 读词汇表list 和 words-id 字典 config.vocab_size = len(words) # 词汇表大小重新设定 model = TextCNN(config) if sys.argv[1] == 'train': train() elif sys.argv[1] == 'test': test() else: raise ValueError("""usage: python run_cnn.py [train / test]""")