def train():
    # Load the training data: character ids for each document and
    # the one-hot form of the corresponding labels
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, 600)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, 600)
    # Use either an LSTM or a CNN
    model = TextRNN()
    model.train()
    # model = TextCNN()
    # Choose a loss function
    Loss = nn.MultiLabelSoftMarginLoss()
    # Loss = nn.BCELoss()
    # Loss = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    best_val_acc = 0
    for epoch in range(100):
        i = 0
        print('epoch:{}'.format(epoch))
        batch_train = batch_iter(x_train, y_train, 64)
        for x_batch, y_batch in batch_train:
            i += 1
            x = torch.LongTensor(np.array(x_batch))
            y = torch.Tensor(np.array(y_batch))
            # y = torch.LongTensor(y)
            out = model(x)
            loss = Loss(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Validate the model every 90 batches
            if i % 90 == 0:
                # The optimizer is not needed here
                los, accuracy = evaluate(model, Loss, x_val, y_val)
                print('loss:{}, accuracy:{}'.format(los, accuracy))
                if accuracy > best_val_acc:
                    torch.save(model.state_dict(), 'model_params.pkl')
                    best_val_acc = accuracy
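# The evaluate() helper called above is not shown in this snippet. Below is a
# minimal sketch of what it might look like, assuming the model returns
# per-class scores and the labels are one-hot encoded; the accuracy
# definition and all names here are assumptions, not the original code.
def evaluate(model, Loss, x_val, y_val):
    model.eval()
    with torch.no_grad():
        x = torch.LongTensor(np.array(x_val))
        y = torch.Tensor(np.array(y_val))
        out = model(x)
        loss = Loss(out, y)
        # A prediction counts as correct when the highest-scoring class
        # matches the position of the 1 in the one-hot label.
        correct = (torch.argmax(out, dim=1) == torch.argmax(y, dim=1)).sum().item()
        accuracy = correct / len(y)
    model.train()
    return loss.item(), accuracy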
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # tf.train.Saver() must be created both when saving and when restoring a
    # model, not only when saving.
    saver.restore(sess=session, save_path=save_path)

    print('Testing...')
    # Returns the average cross-entropy loss and average accuracy over all
    # 10,000 test samples.
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0  # no dropout at test time
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Gives precision, recall and F1-score per class
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, args.SEQ_LENGTH)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predictions
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def test():
    print("loading test data ...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)

    print("testing...")
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = "Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}"
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    # argmax over axis 1: pick the index of the true class for each sample
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation
    print('precision, recall and f1')
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("confusion matrix")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("time usage ", time_dif)
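# All three test() variants above call an evaluate(session, x_, y_) helper
# that is not shown. A plausible sketch follows, assuming the model exposes
# input_x, input_y, keep_prob, loss and acc tensors as in the other snippets,
# and a batch_iter helper like the one used in train(); the batching and
# size-weighted averaging are assumptions, not the original implementation.
def evaluate(session, x_, y_):
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.keep_prob: 1.0  # no dropout during evaluation
        }
        loss, acc = session.run([model.loss, model.acc], feed_dict=feed_dict)
        # Weight each batch by its size so the last, smaller batch
        # does not skew the averages.
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len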
def train():
    print("Configuring TensorBoard and Saver...")
    # Configure TensorBoard. Delete the tensorboard folder before retraining,
    # otherwise the new graph is written on top of the old one.
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # Load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # total number of batches processed
    best_acc_val = 0.0           # best validation accuracy so far
    last_improved = 0            # batch at which the last improvement occurred
    require_improvement = 1000   # stop early after 1000 batches without improvement
    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # Periodically write the training summaries to TensorBoard
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Periodically report performance on the training and validation sets
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # Save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time; stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the batch loop
        if flag:
            break  # break out of the epoch loop as well
def train():
    print("Configuring TensorBoard and Saver...")
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    # tf.summary saves log data for TensorBoard visualization.
    # tf.summary.scalar records scalars, typically the loss, accuracy and learning rate.
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    # tf.summary.merge_all() merges all summary ops;
    # tf.summary.FileWriter writes the data to local disk.
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0
    best_acc_val = 0.0
    last_improved = 0
    # Stop early after 1000 batches without improvement, to guard against overfitting
    require_improvement = 1000
    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # Save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time; stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
def train():
    print("configuring tensorboard and saver")
    tensorboard_dir = "tensorboard/textcnn"
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("loading training and validation data..")
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("time usage: ", time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print("training and evaluating...")
    start_time = time.time()
    total_batch = 0
    best_acc_val = 0.0
    last_improved = 0
    require_improvement = 1000
    flag = False
    for epoch in range(config.num_epochs):
        print("epoch: ", epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
            if total_batch % config.save_per_batch == 0:
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)
            if total_batch % config.print_per_batch == 0:
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                # Save the best result; record the batch of the last improvement
                # so the early-stopping counter is reset correctly
                if acc_val > best_acc_val:
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = "*"
                else:
                    improved_str = ""

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)
            total_batch += 1
            if total_batch - last_improved > require_improvement:
                print("no optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
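# The train() variants above also depend on feed_data() and batch_iter(),
# which are not shown. Minimal sketches follow, under the same assumptions
# about the model's placeholders; the per-epoch shuffling strategy is an
# assumption, not the original implementation.
def feed_data(x_batch, y_batch, keep_prob):
    return {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }

def batch_iter(x, y, batch_size=64):
    # Shuffle once per epoch, then yield consecutive batches.
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = np.array(x)[indices]
    y_shuffle = np.array(y)[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]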
num_classes = 10  # number of classes

# Preparing data is usually the most time-consuming part of machine learning.
base_dir = r'C:\Users\weke\Desktop\语料搜集\cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')

if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
vocab_size = len(words)

X_train, y_train = process_file(train_dir, word_to_id, cat_to_id, seq_length)
X_test, y_test = process_file(val_dir, word_to_id, cat_to_id, seq_length)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

## Padding input data
# Models in Keras (and elsewhere) usually take as input batches of sentences of
# the same length. Since sentences usually have different sizes, we "pad" them
# (we add a dummy "padding" token at the end of the shorter sentences). The
# input thus has shape (batch_size, maxseqlen), where maxseqlen is the maximum
# length of a sentence in the batch.
'''
maxlen = 80  # cut texts after this number of words (among the top vocab_size most common words)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
'''
test_file = 'cnews.test.txt'
val_file = 'cnews.val.txt'

# Dictionary mapping the text categories to their ids
categories, cat_to_id = read_category()
# print(categories)

# Every character that appears in the training text and its id
words, word_to_id = read_vocab('cnews.vocab.txt')
# print(words)
# print(word_to_id)

# Vocabulary size
vocab_size = len(words)

# Data loading and batching:
# character ids for each training document and one-hot labels
x_train, y_train = process_file('cnews.train1.txt', word_to_id, cat_to_id, 600)
# print('x_train=', x_train)
x_val, y_val = process_file('cnews.val.txt', word_to_id, cat_to_id, 600)

# Select the GPU
cuda = torch.device('cuda')

x_train, y_train = torch.LongTensor(x_train), torch.Tensor(y_train)
x_val, y_val = torch.LongTensor(x_val), torch.Tensor(y_val)

train_dataset = Data.TensorDataset(x_train, y_train)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=1280, shuffle=True)
val_dataset = Data.TensorDataset(x_val, y_val)
val_loader = Data.DataLoader(dataset=val_dataset, batch_size=1280)
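# The loaders above are not consumed in this snippet. Below is a minimal
# sketch of how they might drive one training epoch, assuming a TextRNN model
# with one-hot labels and nn.MultiLabelSoftMarginLoss as in the earlier
# PyTorch train(); the model choice and the .to(cuda) placement are
# assumptions, not the original code.
model = TextRNN().to(cuda)
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for x_batch, y_batch in train_loader:
    x_batch, y_batch = x_batch.to(cuda), y_batch.to(cuda)
    out = model(x_batch)
    loss = criterion(out, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()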
    })
    text = features["text"]
    title = tf.cast(features["title"], tf.int32)
    label = tf.cast(features["label"], tf.int32)
    return text, title, label


if __name__ == "__main__":
    # Data format conversion
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    text_test, x_test, y_test = process_file2(test_dir, word_to_id, cat_to_id, config.seq_length)
    convert_to_TFRecords([x_train, y_train], base_dir + "/train")
    # convert_to_TFRecords_withText([text_test, x_test, y_test], base_dir + "/test")

    # Test reading the records back
    # read_TFRecords_test("sexText/train10.tfrecords")
    # queue = tf.train.string_input_producer(["sexText/test.tfrecords"], num_epochs=10)
    # text, title, label = read_example(queue)
    # text_batch, title_batch, label_batch = tf.train.batch(
    #     [text, title, label], batch_size=1, capacity=5000, num_threads=1)
    # count = 0
    # with tf.Session() as sess:
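# convert_to_TFRecords() is called above but not defined in this snippet.
# A minimal sketch using the TF1 tf.python_io API, assuming each example is a
# padded id sequence plus a one-hot label reduced to an integer class index;
# the field names loosely mirror read_example's "text" and "label" features,
# and everything else here is an assumption.
def convert_to_TFRecords(data, path):
    x, y = data
    writer = tf.python_io.TFRecordWriter(path + ".tfrecords")
    for ids, label in zip(x, y):
        example = tf.train.Example(features=tf.train.Features(feature={
            "text": tf.train.Feature(
                int64_list=tf.train.Int64List(value=list(ids))),
            "label": tf.train.Feature(
                int64_list=tf.train.Int64List(value=[int(np.argmax(label))]))
        }))
        writer.write(example.SerializeToString())
    writer.close()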
""" words, word_to_id = read_vocab(vocab_dir) categories, cat_to_id = read_category() # vocab_size = len(words) return words, word_to_id, categories, cat_to_id if __name__ == "__main__": test_dir = 'file/cnews/cnews.test.txt' vocab_dir = 'file/cnews/vocab.txt' train_dir = 'file/cnews/cnews.train.txt' words, word_to_id, categories, cat_to_id = load_data(vocab_dir) x_pad, y_pad = process_file(train_dir, word_to_id, cat_to_id, max_length=5000) x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, max_length=5000) X = x_pad.T Y = y_pad.T x_test = x_test.T y_test = y_test.T learning_rate = 1e-3 num_interantion = 100 print("---------------start------------------------")
train_epochs = 1000
batch_size = 256
lr = 0.001

#########################################
# Dictionary mapping the text categories to their ids
categories, cat_to_id = read_category()
print(categories)

# Every character that appears in the training text and its id
words, word_to_id = read_vocab('cnews.vocab.txt')
vocab_size = len(words)

# Data loading and batching:
# character ids for each training document and one-hot labels
x_train, y_train = process_file('cnews.train.txt', word_to_id, cat_to_id, 600)
x_test, y_test = process_file('cnews.val.txt', word_to_id, cat_to_id, 600)

cuda = torch.device('cuda')
x_train, y_train = torch.LongTensor(x_train), torch.LongTensor(y_train)
x_test, y_test = torch.LongTensor(x_test), torch.LongTensor(y_test)

train_dataset = Data.TensorDataset(x_train, y_train)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = Data.TensorDataset(x_test, y_test)
test_loader = Data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
#################################################################################
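# These loaders are likewise left unconsumed. A minimal sketch of an accuracy
# check over test_loader: since the labels were loaded as one-hot LongTensors,
# both predictions and targets are reduced with argmax. The model variable is
# an assumption (e.g., a trained TextRNN/TextCNN from the earlier snippets).
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        out = model(x_batch)
        pred = torch.argmax(out, dim=1)
        target = torch.argmax(y_batch, dim=1)
        correct += (pred == target).sum().item()
        total += y_batch.size(0)
print('test accuracy: {:.2%}'.format(correct / total))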