def train(config):
    # tag_length should not include start/end tags
    model = BiLSTM_CRF(config)
    optimizer = optim.Adam(model.parameters(), config.lr)
    # f1 score of the validation dataset
    valid_f1 = -1000
    stop = False
    start_t = time.time()
    for epoch in range(config.n_epoch):
        if stop:
            break
        # or model.zero_grad(), since all model parameters are in the optimizer
        optimizer.zero_grad()
        _, batch_inputs, batch_outputs, masks, length = random_batch(
            embeddings, x_train, y_train, config.batch_size)
        loss = model.neg_log_likelihood(batch_inputs, batch_outputs, masks, length)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % config.eval_freq == 0:
            print('Epoch: {:04d}, loss: {:.4f}, seconds: {:.4f}'.format(
                epoch, loss.item(), time.time() - start_t))
            entities, new_valid_f1, prec, recall = get_f1(model, config, test=False)
            print('[Validation] f1 score from {:.6f} to {:.6f}'.format(valid_f1, new_valid_f1))
            print('[Validation] precision: {}, recall: {}\n'.format(prec, recall))
            if epoch > 100000 and (abs(new_valid_f1 - valid_f1) < 0.001 or new_valid_f1 < valid_f1):
                stop = True
            if new_valid_f1 > valid_f1:
                valid_f1 = new_valid_f1
                torch.save(model.state_dict(), config.model_save_path)
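# A minimal sketch (not from the original repo) of the config object train() expects:
# the attribute names below are exactly the ones train() reads above; the values and
# the save path are illustrative assumptions.
from types import SimpleNamespace

config = SimpleNamespace(
    lr=1e-3,                # learning rate passed to Adam
    n_epoch=10000,          # number of training iterations
    batch_size=64,          # samples drawn per random_batch call
    eval_freq=100,          # run validation every eval_freq epochs
    model_save_path='checkpoints/bilstm_crf_best.pt',
)
# train(config)  # also requires the module-level embeddings, x_train, y_train, etc.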
def singel_predict(model_path, content, char_to_id_json_path, batch_size,
                   embedding_dim, hidden_dim, num_layers, sentence_length,
                   offset, target_type_list, tag2id):
    # Load the character-to-id mapping
    char_to_id = json.load(open(char_to_id_json_path, mode="r", encoding="utf-8"))
    # Convert the input string into a list of character ids
    char_ids = content_to_id(content, char_to_id)
    # Build model inputs shaped batch_size * sentence_length
    model_inputs_list, model_input_map_list = build_model_input_list(
        content, char_ids, batch_size, sentence_length, offset)
    # Build the model
    model = BiLSTM_CRF(vocab_size=len(char_to_id), tag_to_ix=tag2id,
                       embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                       batch_size=batch_size, num_layers=num_layers,
                       sequence_length=sentence_length)
    # Load the saved weights
    model.load_state_dict(torch.load(model_path))
    # Keep only the tag ids whose entity type is in target_type_list
    tag_id_dict = {v: k for k, v in tag2id.items() if k[2:] in target_type_list}
    # Entities to return
    entities = []
    with torch.no_grad():
        for step, model_inputs in enumerate(model_inputs_list):
            prediction_value = model(model_inputs)
            # Iterate over the predictions for each line
            for line_no, line_value in enumerate(prediction_value):
                # Entity currently being assembled
                entity = None
                # Iterate over the predicted tag of every character in the line
                for char_idx, tag_id in enumerate(line_value):
                    # The predicted tag id belongs to one of the target entity types
                    if tag_id in tag_id_dict:
                        # First character of the matching tag name, i.e. B or I
                        tag_index = tag_id_dict[tag_id][0]
                        # Character at the current position
                        current_char = model_input_map_list[step][line_no][char_idx]
                        # A B tag starts a new entity
                        if tag_index == "B":
                            entity = current_char
                        # An I tag extends the current entity
                        elif tag_index == "I" and entity:
                            entity += current_char
                    # An O tag after a non-empty entity closes it: the previous
                    # character ended a target entity, so collect it
                    if tag_id == tag2id["O"] and entity:
                        entities.append(entity)
                        # Reset for the next entity
                        entity = None
    return entities
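# Hedged usage sketch: only the keyword names come from singel_predict's signature
# above; the paths, dimensions, tag scheme and sample text are illustrative assumptions.
tag2id = {"O": 0, "B-dis": 1, "I-dis": 2, "B-sym": 3, "I-sym": 4}
entities = singel_predict(model_path="model/bilstm_crf_state_dict.pt",
                          content="患者出现发热、头痛症状",
                          char_to_id_json_path="data/char_to_id.json",
                          batch_size=8, embedding_dim=200, hidden_dim=100,
                          num_layers=1, sentence_length=20, offset=10,
                          target_type_list=["dis", "sym"], tag2id=tag2id)
print(entities)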
def get_f1(model, config, test=True):
    if test:
        model = BiLSTM_CRF(config)
        model.load_state_dict(torch.load(config.model_save_path))
        model.eval()
        x, y = x_test, y_test
    else:
        x, y = x_valid, y_valid
    n_batch = math.ceil(len(x) / config.batch_size)
    entity_pred, entity_true = [], []
    for i in range(n_batch):
        start = i * config.batch_size
        end = (i + 1) * config.batch_size if i != (n_batch - 1) else len(x)
        batch_ids, batch_inputs, batch_outputs, masks, length = random_batch(
            embeddings, x[start:end], y[start:end], end - start, False)
        scores, sequences = model(batch_inputs, masks, length)
        entity_pred += retrieve_entity(batch_ids, sequences, masks, id2tag, id2word)
        entity_true += retrieve_entity(batch_ids, batch_outputs.numpy(), masks, id2tag, id2word)
    union = [i for i in entity_pred if i in entity_true]
    precision = float(len(union)) / len(entity_pred) if entity_pred else 0.0
    recall = float(len(union)) / len(entity_true) if entity_true else 0.0
    f1_score = 2 * precision * recall / (precision + recall) if len(union) != 0 else 0.0
    return entity_pred, f1_score, precision, recall
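# Toy illustration (not from the source) of the entity-level metrics computed above:
# precision = |pred ∩ true| / |pred| and recall = |pred ∩ true| / |true|.
demo_true = ['ent_a', 'ent_b', 'ent_c']
demo_pred = ['ent_a', 'ent_c', 'ent_d']
overlap = [e for e in demo_pred if e in demo_true]
p = len(overlap) / len(demo_pred)   # 2/3
r = len(overlap) / len(demo_true)   # 2/3
f1 = 2 * p * r / (p + r)            # ~0.667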
def main(_):
    word2idx = pickle_load(FLAGS.word2idx_path)
    word2vec = pickle_load(FLAGS.word2vec_path)
    embedding_size = list(word2vec.values())[0].shape[0]
    # index 0 is reserved for the UNK token
    word_embeddings = np.zeros([len(word2idx) + 1, embedding_size])
    for word in word2idx.keys():
        word_embeddings[word2idx[word]] = word2vec[word]
    FLAGS.n_words = word_embeddings.shape[0]
    FLAGS.embedding_size = word_embeddings.shape[1]
    train_data = read_data(FLAGS.train_data_path, word2idx, FLAGS.max_sequence_len)
    valid_data = read_data(FLAGS.valid_data_path, word2idx, FLAGS.max_sequence_len)
    graph = tf.Graph()
    with tf.Session(graph=graph) as sess:
        model = BiLSTM_CRF(FLAGS, sess, word_embeddings)
        model.build_model()
        model.train(train_data, valid_data)
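# Typical TF1 entry point for a main(_) that reads FLAGS (a conventional, assumed
# addition; it is not part of the original snippet):
if __name__ == '__main__':
    tf.app.run()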
if __name__ == '__main__':
    char_embedding_mat = np.load('data/char_embedding_matrix.npy')
    X_test = np.load('data/X_test.npy')
    y_test = np.load('data/y_test.npy')
    true_path = 'res/true.txt'
    predict_path = 'res/predict.txt'
    ner_model = BiLSTM_CRF(n_input=500, n_vocab=char_embedding_mat.shape[0],
                           n_embed=200, embedding_mat=char_embedding_mat,
                           keep_prob=0.5, n_lstm=150, keep_prob_lstm=0.5,
                           n_entity=13, optimizer='adam', batch_size=512,
                           epochs=100)
    model_file = 'checkpoints/bilstm_crf_weights_best.hdf5'
    ner_model.model.load_weights(model_file)
    y_pred = ner_model.model.predict(X_test[:, :])
    char2vec, n_char, n_embed, char2index = p.get_char2object()
    index2char = {i: w for w, i in char2index.items()}
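    # Hedged sketch (assumptions, not the repo's code): y_pred from the Keras CRF
    # layer holds one score vector per character position, so the predicted tag id
    # per position is the argmax over the last axis; index2char (loaded above) maps
    # padded input ids back to characters, with id 0 assumed to be padding.
    pred_tag_ids = np.argmax(y_pred, axis=-1)   # shape: (n_samples, n_input)
    chars = [index2char[int(i)] for i in X_test[0] if int(i) != 0 and int(i) in index2char]
    print(''.join(chars))
    print(pred_tag_ids[0][:len(chars)])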
X = np.load('data/train.npy')
y = np.load('data/y.npy')
X_train = X[:600]
y_train = y[:600]
# ner_model = BiLSTM_CRF(n_input=200, n_vocab=char_embedding_mat.shape[0],
#                        n_embed=100, embedding_mat=char_embedding_mat,
#                        keep_prob=0.5, n_lstm=100, keep_prob_lstm=0.8,
#                        n_entity=7, optimizer='adam', batch_size=64, epochs=500)
ner_model = BiLSTM_CRF(n_input=300, n_vocab=char_embedding_mat.shape[0],
                       n_embed=100, embedding_mat=char_embedding_mat,
                       keep_prob=0.5, n_lstm=256, keep_prob_lstm=0.6,
                       n_entity=3, optimizer='adam', batch_size=16, epochs=500)
cp_folder, cp_file = 'checkpoints', 'bilstm_crf_weights_best_attention_experiment2.hdf5'
log_filepath = 'logs/bilstm_crf_summaries'
cb = [
    ModelCheckpoint(os.path.join(cp_folder, cp_file), monitor='val_loss',
                    verbose=1, save_best_only=True, save_weights_only=True,
x_sent[0, :x_sent_1.shape[0]] = x_sent_1
x_sent[1, :x_sent_2.shape[0]] = x_sent_2
# create a batch of 2 samples with their proper padding
x_tags = torch.full((2, max_sent_size), Const.PAD_TAG_ID, dtype=torch.long)
x_tags[0, :x_tags_1.shape[0]] = x_tags_1
x_tags[1, :x_tags_2.shape[0]] = x_tags_2
# mask tensor with shape (batch_size, max_sent_size)
mask = (x_tags != Const.PAD_TAG_ID).float()
# get a reversed dict mapping int to str
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
# see bilstm_crf.py
model = BiLSTM_CRF(len(word_to_ix), len(tag_to_ix))
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# Check predictions before training
print('Predictions before training:')
with torch.no_grad():
    scores, seqs = model(x_sent, mask=mask)
    for score, seq in zip(scores, seqs):
        str_seq = " ".join(ids_to_tags(seq, ix_to_tag))
        print('%.2f: %s' % (score.item(), str_seq))
# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(300):  # normally you would NOT do 300 epochs, it is toy data
    # Step 1. Remember that Pytorch accumulates gradients.
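    # We need to clear them out before each step (hedged continuation sketch; the
    # neg_log_likelihood call below is an assumption mirroring the other snippets
    # in this section, not this repo's verbatim code).
    model.zero_grad()
    # Step 2. Run the forward pass and compute the mask-aware CRF loss.
    loss = model.neg_log_likelihood(x_sent, x_tags, mask=mask)
    # Step 3. Compute gradients and update the parameters.
    loss.backward()
    optimizer.step()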
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']
    modelName = config['modelName']
    batchSize = config['model']['batchSize']
    epochNum = config['model']['epochNum']
    earlyStop = config['model']['earlyStop']
    learningRate = config['model']['learningRate']
    modelSavePath = config['model']['modelSavePath']
    # GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset, batch_size=batchSize,
                                shuffle=True, num_workers=4, collate_fn=pad)
    validIter = data.DataLoader(dataset=validDataset, batch_size=batchSize,
                                shuffle=False, num_workers=4, collate_fn=pad)
    testIter = data.DataLoader(dataset=testDataset, batch_size=batchSize,
                               shuffle=False, num_workers=4, collate_fn=pad)

    if modelName == 'bilstm':
        net = BiLSTM(config)
        train = bilstmTrain
        eval = bilstmEval
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    if modelName == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        train = bilstmCRFTrain
        eval = bilstmCRFEval
    if modelName == 'transformer_crf':
        net = Transformer_CRF(config)
        train = transformerCRFTrain
        eval = transformerCRFEval
    if modelName == 'cnn':
        net = CNN(config)
        train = cnnTrain
        eval = cnnEval

    net = net.to(DEVICE)
    lossFunction = nn.NLLLoss()
    optimizer = optim.Adam(net.parameters(), lr=learningRate,
                           betas=(0.9, 0.999), eps=1e-08)

    earlyNumber, beforeLoss, maxScore = 0, sys.maxsize, -1
    # Start training
    for epoch in range(epochNum):
        print('Epoch %d:' % epoch)
        totalLoss = train(net, trainIter, optimizer=optimizer,
                          criterion=lossFunction, DEVICE=DEVICE)
        print('Training loss: %f' % totalLoss)
        totalLoss, f1Score = eval(net, validIter, criterion=lossFunction, DEVICE=DEVICE)
        if f1Score > maxScore:
            maxScore = f1Score
            torch.save(net.state_dict(), modelSavePath)
        print('Validation loss: %f  f1Score: %f / %f' % (totalLoss, f1Score, maxScore))
        if f1Score < maxScore:
            earlyNumber += 1
            print('earlyStop: %d/%d' % (earlyNumber, earlyStop))
        else:
            earlyNumber = 0
        if earlyNumber >= earlyStop:
            break
        print('\n')

    # Load the best model
    net.load_state_dict(torch.load(modelSavePath))
    totalLoss, f1Score = eval(net, testIter, criterion=lossFunction, DEVICE=DEVICE)
    print('Test loss: %f, f1Score: %f' % (totalLoss, f1Score))
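# Hedged usage sketch: a config dict carrying the keys main() reads above; the
# concrete paths and hyperparameter values are illustrative assumptions.
config = {
    'modelName': 'bilstm_crf',
    'DEVICE': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'data': {
        'trainDataPath': 'data/train.txt',
        'validDataPath': 'data/valid.txt',
        'testDataPath': 'data/test.txt',
    },
    'model': {
        'batchSize': 64,
        'epochNum': 50,
        'earlyStop': 5,
        'learningRate': 1e-3,
        'modelSavePath': 'checkpoints/bilstm_crf.pt',
    },
}
# main(config)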
def main(config):
    trainDataPath = config['data']['trainDataPath']
    validDataPath = config['data']['validDataPath']
    testDataPath = config['data']['testDataPath']
    batchSize = config['model']['batchSize']
    # GPU/CPU
    DEVICE = config['DEVICE']

    trainDataset = NERDataset(trainDataPath, config)
    validDataset = NERDataset(validDataPath, config)
    testDataset = NERTestDataset(testDataPath, config)

    trainIter = data.DataLoader(dataset=trainDataset, batch_size=batchSize,
                                shuffle=True, num_workers=6, collate_fn=pad)
    validIter = data.DataLoader(dataset=validDataset, batch_size=batchSize,
                                shuffle=False, num_workers=6, collate_fn=pad)
    testIter = data.DataLoader(dataset=testDataset, batch_size=batchSize,
                               shuffle=False, num_workers=6, collate_fn=testPad)

    if config['modelName'] == 'bilstm':
        net = BiLSTM(config)
        config['modelSavePath'] = config['data']['BiLSTMSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMSubmitDataPath']
        train = bilstm_train
        test = bilstm_test
    if config['modelName'] == 'bilstm_crf':
        net = BiLSTM_CRF(config)
        config['modelSavePath'] = config['data']['BiLSTMCRFSavePath']
        modelSavePath = config['modelSavePath']
        config['submitDataPath'] = config['data']['BiLSTMCRFSubmitDataPath']
        train = bilstm_crf_train
        test = bilstm_crf_test
    if config['modelName'] == 'transformer_cnn':
        net = Transformer_CNN(config)
        config['modelSavePath'] = config['data']['TransformerCNNSavePath']
        config['submitDataPath'] = config['data']['TransformerCNNSubmitDataPath']
        modelSavePath = config['modelSavePath']
        train = transformer_cnn_train
        test = transformer_cnn_test

    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)
    net = net.to(DEVICE)
    if os.path.exists(modelSavePath):
        net.load_state_dict(torch.load(modelSavePath))

    # if config['train']:
    #     train(net, trainIter, validIter, config)
    # if config['test']:
    test(net, testIter, config)
def train(data_loader, data_size, batch_size, embedding_dim, hidden_dim,
          sentence_length, num_layers, epochs, learning_rate, tag2id,
          model_saved_path, train_log_path, validate_log_path,
          train_history_image_path):
    '''
    data_loader: dataset loader, built earlier with load_dataset
    data_size: number of samples in the training and validation sets
    batch_size: number of samples per batch
    embedding_dim: dimensionality of the character embeddings
    hidden_dim: dimensionality of the hidden layer
    sentence_length: maximum text length
    num_layers: number of stacked LSTM layers
    epochs: number of training epochs
    learning_rate: learning rate
    tag2id: mapping from tag to id
    model_saved_path: where to save the model
    train_log_path: where to write the training log
    validate_log_path: where to write the validation log
    train_history_image_path: where to save plots of the training history
    '''
    # Load the character-to-id mapping into memory
    char2id = json.load(open("./data/char_to_id.json", mode="r", encoding="utf-8"))
    # Initialize the BiLSTM_CRF model
    model = BiLSTM_CRF(vocab_size=len(char2id), tag_to_ix=tag2id,
                       embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                       batch_size=batch_size, num_layers=num_layers,
                       sequence_length=sentence_length)
    # Define the optimizer (for GPU-accelerated sparse Embedding updates,
    # PyTorch supports SGD and SparseAdam).
    # lr: learning rate
    # momentum: momentum factor that accelerates gradient descent
    # optimizer = optim.SGD(params=model.parameters(), lr=learning_rate,
    #                       momentum=0.85, weight_decay=1e-4)
    optimizer = optim.Adam(params=model.parameters(), lr=learning_rate,
                           betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4)
    # Optional learning-rate schedule:
    # optimizer: the optimizer to adjust
    # step_size: how many epochs between learning-rate updates
    # gamma: decay factor applied to the learning rate relative to the previous
    #   value (default 0.1), e.g. with lr = 0.5, step_size = 20, gamma = 0.1:
    #   lr = 0.5   if epoch < 20
    #   lr = 0.05  if 20 <= epoch < 40
    #   lr = 0.005 if 40 <= epoch < 60
    # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.8)

    # Containers for training loss, accuracy, recall and F1
    train_loss_list = []
    train_acc_list = []
    train_recall_list = []
    train_f1_list = []
    train_log_file = open(train_log_path, mode="w", encoding="utf-8")
    # Containers for validation loss, accuracy, recall and F1
    validate_loss_list = []
    validate_acc_list = []
    validate_recall_list = []
    validate_f1_list = []
    validate_log_file = open(validate_log_path, mode="w", encoding="utf-8")

    # Reverse mapping from id to tag, built from tag2id
    id2tag = {v: k for k, v in tag2id.items()}
    # Reverse mapping from id to character, built from char2id
    id2char = {v: k for k, v in char2id.items()}

    # Loop for the requested number of epochs
    for epoch in range(epochs):
        # Print the current epoch before the progress bar
        tqdm.write("Epoch {}/{}".format(epoch + 1, epochs))
        # Totals of correctly predicted, predicted, and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Step counter, accumulated loss, accuracy and F1 for this epoch
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        # Training phase of the current epoch
        for inputs, labels in tqdm(data_loader["train"]):
            # Wrap the data in Variables
            inputs, labels = Variable(inputs), Variable(labels)
            # Zero the gradients before each sample, otherwise they accumulate
            optimizer.zero_grad()
            # Calls BiLSTM_CRF.neg_log_likelihood()
            loss = model.neg_log_likelihood(inputs, labels)
            # Current step loss, converted from a tensor to a number
            step_loss = loss.data
            # Accumulate the loss
            total_loss += step_loss
            # Best decoded path; this calls BiLSTM_CRF.forward()
            best_path_list = model(inputs)
            # Batch-level accuracy, recall, F1 and the corresponding entity counts
            step_acc, step_recall, f1_score, acc_entities_length, \
            predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                     labels.tolist(),
                                                                     best_path_list,
                                                                     id2char, id2tag)
            # Training log line
            '''
            log_text = "Epoch: %s | Step: %s " \
                       "| loss: %.5f " \
                       "| acc: %.5f " \
                       "| recall: %.5f " \
                       "| f1 score: %.5f" % \
                       (epoch, step, step_loss, step_acc, step_recall, f1_score)
            '''
            # Accumulate correct, predicted and gold entity counts
            total_acc_entities_length += acc_entities_length
            total_predict_entities_length += predict_entities_length
            total_gold_entities_length += gold_entities_length
            # Backpropagate the loss
            loss.backward()
            # Update the parameters with optimizer.step()
            optimizer.step()
            # Write the training log
            # train_log_file.write(log_text + "\n")
            step += 1

        # Mean training loss for this epoch (total loss / number of training samples)
        epoch_loss = total_loss / data_size["train"]
        # Epoch-level precision and recall
        total_acc, total_recall = 0, 0
        if total_predict_entities_length > 0:
            total_acc = total_acc_entities_length / total_predict_entities_length
        if total_gold_entities_length > 0:
            total_recall = total_acc_entities_length / total_gold_entities_length
        # Epoch-level F1
        total_f1 = 0
        if total_acc + total_recall != 0:
            total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                               total_recall, total_f1)
        # Update the learning rate after the epoch; must come after the optimizer update
        # scheduler.step()

        # Record the training loss (for plotting), accuracy, recall and F1
        train_loss_list.append(epoch_loss)
        train_acc_list.append(total_acc)
        train_recall_list.append(total_recall)
        train_f1_list.append(total_f1)
        train_log_file.write(log_text + "\n")

        # Reset the counts of correct, predicted and gold entities
        total_acc_entities_length, \
        total_predict_entities_length, \
        total_gold_entities_length = 0, 0, 0
        # Reset the step counter, accumulated loss, accuracy and F1
        step, total_loss, correct, f1 = 1, 0.0, 0, 0

        # Validation phase of the current epoch
        with torch.no_grad():
            for inputs, labels in tqdm(data_loader["validation"]):
                # Wrap the data in Variables
                inputs, labels = Variable(inputs), Variable(labels)
                # Calls BiLSTM_CRF.neg_log_likelihood(), which returns the
                # CRF negative log-likelihood
                try:
                    loss = model.neg_log_likelihood(inputs, labels)
                except:
                    continue
                # Current step loss, converted from a tensor to a number
                step_loss = loss.data
                # Accumulate the loss
                total_loss += step_loss
                # Best decoded path; this calls BiLSTM_CRF.forward()
                best_path_list = model(inputs)
                # Batch-level accuracy, recall, F1 and the corresponding entity counts
                step_acc, step_recall, f1_score, acc_entities_length, \
                predict_entities_length, gold_entities_length = evaluate(inputs.tolist(),
                                                                         labels.tolist(),
                                                                         best_path_list,
                                                                         id2char, id2tag)
                # Validation log line
                '''
                log_text = "Epoch: %s | Step: %s " \
                           "| loss: %.5f " \
                           "| acc: %.5f " \
                           "| recall: %.5f " \
                           "| f1 score: %.5f" % \
                           (epoch, step, step_loss, step_acc, step_recall, f1_score)
                '''
                # Accumulate correct, predicted and gold entity counts
                total_acc_entities_length += acc_entities_length
                total_predict_entities_length += predict_entities_length
                total_gold_entities_length += gold_entities_length
                # Write the validation log
                # validate_log_file.write(log_text + "\n")
                step += 1

        # Mean validation loss for this epoch (total loss / number of validation samples)
        epoch_loss = total_loss / data_size["validation"]
        # Epoch-level precision and recall
        total_acc, total_recall = 0, 0
        if total_predict_entities_length > 0:
            total_acc = total_acc_entities_length / total_predict_entities_length
        if total_gold_entities_length > 0:
            total_recall = total_acc_entities_length / total_gold_entities_length
        # Epoch-level F1
        total_f1 = 0
        if total_acc + total_recall != 0.0:
            total_f1 = 2 * total_acc * total_recall / (total_acc + total_recall)
        log_text = "Epoch: %s " \
                   "| mean loss: %.5f " \
                   "| total acc: %.5f " \
                   "| total recall: %.5f " \
                   "| total f1 score: %.5f" % (epoch, epoch_loss, total_acc,
                                               total_recall, total_f1)
        # Record the validation loss (for plotting), accuracy, recall and F1
        validate_loss_list.append(epoch_loss)
        validate_acc_list.append(total_acc)
        validate_recall_list.append(total_recall)
        validate_f1_list.append(total_f1)
        validate_log_file.write(log_text + "\n")

    # Save the model
    torch.save(model.state_dict(), model_saved_path)
    # Save the loss history as an image
    save_train_history_image(train_loss_list, validate_loss_list,
                             train_history_image_path, "Loss")
    # Save the accuracy history as an image
    save_train_history_image(train_acc_list, validate_acc_list,
                             train_history_image_path, "Acc")
    # Save the recall history as an image
    save_train_history_image(train_recall_list, validate_recall_list,
                             train_history_image_path, "Recall")
    # Save the F1 history as an image
    save_train_history_image(train_f1_list, validate_f1_list,
                             train_history_image_path, "F1")
    print("train Finished".center(100, "-"))
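# Hedged usage sketch: the keyword names come from train()'s signature above; the
# hyperparameter values, the tag_to_id.json path and the load_dataset call are
# illustrative assumptions (the docstring only says data_loader comes from load_dataset).
if __name__ == '__main__':
    tag2id = json.load(open("./data/tag_to_id.json", mode="r", encoding="utf-8"))
    data_loader, data_size = load_dataset()  # arguments omitted in this sketch
    train(data_loader, data_size,
          batch_size=8, embedding_dim=200, hidden_dim=100, sentence_length=20,
          num_layers=1, epochs=10, learning_rate=1e-3, tag2id=tag2id,
          model_saved_path="model/bilstm_crf_state_dict.pt",
          train_log_path="log/train_log.txt",
          validate_log_path="log/validate_log.txt",
          train_history_image_path="log/")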
        F1 = 2 * P * R / (P + R + 1e-18)
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss_ += (loss.item() / len(sentence))
        count += 1
        average_loss = loss_ / (count + 1e-18)
        print('\rtesting:%d/%d\t loss:%0.5f \t' % (count, len(valdata), average_loss),
              end='', flush=True)
    return F1, average_loss

#%%
# Define the model and the optimizer
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
# model.load_state_dict(torch.load('../model_dict/params.pkl'))
model.cuda()  # move the model to the GPU

#%%
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
# optimizer = optim.Adam(model.parameters(), lr=1e-4, amsgrad=True, weight_decay=1e-5)

#%%
# Start training
for epoch in range(0, 10):  # adjust the number of epochs for real (non-toy) data
    inter = 0
    out_entity_num = 0
    target_entity_num = 0
    loss_ = 0
    for iter, (sentence, tags) in enumerate(train):
        pbar(step, {'loss': loss.item(), "acc": acc.acc})
        step += 1


def test(model, test_data):
    # Check predictions after training
    acc = Acc()
    with torch.no_grad():
        for test_sentence, test_tag in test_data:
            precheck_sent = prepare_sequence(test_sentence, word_to_ix)
            predict = model(precheck_sent)
            predict_tags = np.array(predict[1])
            predict_tags = torch.from_numpy(predict_tags)
            precheck_tags = torch.tensor([tag_to_ix[t] for t in test_tag], dtype=torch.long)
            acc.update(predict_tags, precheck_tags)
        print("acc: %.4f" % acc.acc)
        # We got it!


if __name__ == '__main__':
    tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
    training_data = get_train_data()
    word_to_ix = get_word_to_ix(training_data=training_data)
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
    test_data = training_data
    test(model, test_data)
    train(model, training_data)
    test(model, test_data)
def create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    label_ids,
    num_labels,
    use_one_hot_embeddings,
    label_weights,
    cell="lstm",
    num_layers=1,
    dropout_rate=0.1,
):
    """
    Create a model architecture.

    Args:
        bert_config:
        is_training:
        input_ids:
        input_mask:
        segment_ids:
        label_ids: [batch_size=32, ]
        num_labels: int, number of classes
        use_one_hot_embeddings:
        label_weights: [batch_size=32, num_labels=2]
    """
    # output_layer: [batch_size=32, hidden_size=768]
    # Look up the embeddings; the input is [batch_size, seq_length, embedding_size]
    # with tf.Session():
    #     print("{} input_ids={}".format(input_ids.shape,
    #           tf.reshape(input_ids, [-1, FLAGS.max_token_length]).eval().shape))
    # with tf.Session() as sess:
    #     input_ids = torch.tensor(sess.run(input_ids).eval(), dtype=torch.long)
    #     sess.close()
    input_ids = tf.reshape(input_ids, [-1, FLAGS.max_token_length])
    _, embedding = model(input_ids)
    print("input_ids={}".format(input_ids.shape))
    # hidden_size: 768
    print("{} embedding".format(embedding.shape))
    hidden_size = embedding.shape[-1].value
    max_seq_length = embedding.shape[1].value

    # Compute the true sequence lengths
    used = tf.sign(tf.abs(input_ids))
    # a vector of size [batch_size] holding the true length of each sequence in the batch
    true_lengths = tf.reduce_sum(used, reduction_indices=1)
    print("{} true_lengths".format(true_lengths))

    # Add the Bi-LSTM + CRF layer
    blstm_crf = BiLSTM_CRF(embedding_inputs=embedding,
                           hidden_units_num=hidden_size,
                           cell_type=cell,
                           num_layers=num_layers,
                           dropout_rate=dropout_rate,
                           initializers=initializers,
                           num_labels=num_labels,
                           sequence_length=max_seq_length,
                           tag_indices=label_ids,
                           sequence_lengths=true_lengths,
                           is_training=is_training)
    (loss, per_example_loss, logits, probabilities) = blstm_crf.add_blstm_crf_layer(crf_only=True)
    return (loss, per_example_loss, logits, probabilities)
        if index % log_interval == 0:
            print('Epoch:{0}-{1}/{2}, Batch Loss:{3}'.format(
                epoch, index, batches, loss.item() / sources.size(0)))
        loss.backward()
        total_loss += loss.item()
        op.step()
    return total_loss


if __name__ == '__main__':
    epochs = 1
    embedding_dim = hidden_dim = 300
    dataset = QuestionTag()
    data_loader = DataLoader(dataset, batch_size=1, shuffle=True)
    model = BiLSTM_CRF(len(dataset.word2idx), dataset.tag2idx,
                       embedding_dim, hidden_dim).cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
    start = time.time()
    for epoch in range(1, epochs + 1):
        loss = train(epoch, data_loader, model, optimizer)
        print('Epoch:{0}, Loss:{1}, Elapsed-time:{2}'.format(
            epoch, loss / len(dataset), round(time.time() - start, 2)))
        print('#' * 100)
    torch.save(model, './check_point/model.pkl')
def main():
    EPOCHS = 500
    EARLY_STOP_EPOCHS = 5
    SPLIT_WORDS = 'first'

    # Init Train Dataset
    posdataset = POSTrainDataset(data_dir, dataset_choice=dataset, unk_chance=0)
    loader = DataLoader(posdataset)

    # Criterion for the loss (weighted)
    weighted_loss = torch.ones(len(posdataset.ttoi))
    for key in posdataset.entities:
        weighted_loss[posdataset.ttoi[key]] = posdataset.entities[key]
    weighted_loss = weighted_loss.to(device)
    criterion = nn.CrossEntropyLoss(weight=weighted_loss)

    if model_architecture == "bertlstm":
        model = BERT_BiLSTM_CRF(len(posdataset.wtoi), posdataset.ttoi, HIDDEN_DIM, N_LAYERS)
        # Freeze the BERT layer (no gradients)
        for p in model.model.parameters():
            p.requires_grad = False
    elif model_architecture == "bilstm":
        EMBEDDING_SIZE = 256
        model = BiLSTM_CRF(len(posdataset.wtoi), posdataset.ttoi, EMBEDDING_SIZE,
                           HIDDEN_DIM, N_LAYERS, DROPOUT)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=ADAMEPS)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, SCHEDULER_GAMMA)

    model_name = f"{model_architecture}_h{HIDDEN_DIM}_n{N_LAYERS}_lr{LEARNING_RATE:f}_d{DROPOUT}_{dataset}"
    print(f"Running for {model_name}")

    # Implement early stopping
    early_stop = 0

    # Model training and evaluation
    best_loss = sys.maxsize
    train_losses = []
    eval_losses = []
    eval_recall = []
    eval_precision = []
    for epoch in range(1, EPOCHS + 1):
        start = time.time()
        scheduler.step()

        # Toggle train set
        posdataset.train = True
        trainloss = train(loader, model, optimizer, criterion, device, SPLIT_WORDS)
        train_losses.append(trainloss)

        # Toggle validation set
        posdataset.train = False
        loss, accuracy, precision, recall = eval(loader, model, criterion, device, SPLIT_WORDS)
        eval_losses.append(loss)
        eval_recall.append(recall)
        eval_precision.append(precision)

        time_taken = time.time() - start
        print('Epoch {}, Training Loss: {}, Evaluation Loss: {}, Evaluation Accuracy: {}, '
              'Evaluation Precision: {}, Evaluation Recall: {}, time taken: {:.3f}s'
              .format(epoch, trainloss, loss, accuracy, precision, recall, time_taken),
              flush=True)

        # Check if the current loss is better than the previous best
        if loss < best_loss:
            best_loss = loss
            torch.save(model, output_model_dir / '{}.pt'.format(model_name))
            early_stop = 0
        # If the loss has stagnated, count toward early stopping
        else:
            early_stop += 1
        if early_stop >= EARLY_STOP_EPOCHS:
            print('Early Stopping')
            break

    # Plot the respective graphs for visualisation
    plt.figure()
    plt.title('{} Model Training'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.plot(train_losses)
    plt.savefig('{}_Training.png'.format(model_name))

    plt.figure()
    plt.title('{} Model Evaluation'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.plot(eval_losses)
    plt.savefig('{}_EvalLoss.png'.format(model_name))

    plt.figure()
    plt.title('{} Model Evaluation'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Precision')
    plt.plot(eval_precision)
    plt.savefig('{}_EvalPrec.png'.format(model_name))

    plt.figure()
    plt.title('{} Model Evaluation'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('Recall')
    plt.plot(eval_recall)
    plt.savefig('{}_EvalRecall.png'.format(model_name))
    'Drug': 5,
    'Frequency': 6,
    'Amount': 7,
    'Method': 8,
    'Treatment': 9,
    'Operation': 10,
    'Anatomy': 11,
    'Level': 12,
    'Duration': 13,
    'SideEff': 14,
    'O': 15,
    START_TAG: 16,
    STOP_TAG: 17
}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
model.load_state_dict(
    torch.load('../model_dict/hidim512_1_0.6659692762422823_params.pkl'))
model.cuda()  # move the model to the GPU

base_path = '/home/lingang/chris/knowledge_graph'
# base_path = '/media/chris/D/challenge/knowledge_graph_rename'
file_txt_path = os.path.join(base_path, 'dataset', 'test_data')

result_dict = {}
with torch.no_grad():
    for p in os.listdir(file_txt_path):
        p1 = os.path.join(file_txt_path, p)
        output = []