def train(self, train_word_lists, train_tag_lists, word2id, tag2id, dev_word_lists, dev_tag_lists):
    """Train the model for ``self.epoches`` epochs over length-sorted batches.

    Args:
        train_word_lists: list of token lists (training sentences).
        train_tag_lists:  list of tag lists aligned with the sentences.
        word2id, tag2id:  vocabulary mappings used by ``train_step``.
        dev_word_lists, dev_tag_lists: validation data (currently unused, see NOTE).
    """
    # Sort by sentence length so padded batches waste less computation;
    # train/dev order never needs to be restored.
    train_word_lists, train_tag_lists, _ = sort_by_lengths(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
        dev_word_lists, dev_tag_lists)
    # NOTE(review): the dev lists are sorted but never consumed — the sibling
    # trainer runs self.validate(...) after every epoch; presumably the same
    # per-epoch validation call is missing here. Confirm before adding it.
    B = self.batch_size
    # Fix: the original checked `self.step % BiLSTM_CRF_TrainingConfig.print_step`
    # but divided the loss by `self.print_step`; if the two differ the printed
    # average is wrong. Use one consistent value for both.
    print_step = BiLSTM_CRF_TrainingConfig.print_step
    for e in range(1, self.epoches + 1):  # epoch counter starts at 1
        self.step = 0
        losses = 0.
        for ind in range(0, len(train_tag_lists), B):  # B sentences per step
            batch_sents = train_word_lists[ind:ind + B]
            batch_tags = train_tag_lists[ind:ind + B]
            losses += self.train_step(batch_sents, batch_tags, word2id, tag2id)
            if self.step % print_step == 0:
                # Fix: ceil-divide; `len // B + 1` overcounted by one
                # whenever len(train_word_lists) was divisible by B.
                total_step = (len(train_word_lists) + B - 1) // B
                print(
                    "Epoch {}, step/total_step: {}/{} Average Loss for one batch:{:.4f}"
                    .format(e, self.step, total_step, losses / print_step))
                losses = 0.
def train(self, word_lists, tag_lists, dev_word_lists, dev_tag_lists, word2id, tag2id):
    """Train for ``self.epoches`` epochs and validate on the dev set each epoch.

    Args:
        word_lists, tag_lists: training sentences and aligned tag sequences.
        dev_word_lists, dev_tag_lists: validation split.
        word2id, tag2id: vocabulary mappings used by ``train_step``/``validate``.
    """
    # Sort by sentence length so padded batches waste less computation.
    word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
    dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
        dev_word_lists, dev_tag_lists)
    print("训练数据总量:{}".format(len(word_lists)))
    batch_size = self.batch_size
    # Fix: the original checked `self.step % TrainingConfig.print_step` but
    # divided the accumulated loss by `self.print_step`; if those values
    # differ the printed average is wrong. Use one value consistently.
    print_step = TrainingConfig.print_step
    epoch_iterator = trange(1, self.epoches + 1, desc="Epoch")
    for epoch in epoch_iterator:
        self.step = 0
        losses = 0.
        for idx in trange(0, len(word_lists), batch_size, desc="Iteration"):
            batch_sents = word_lists[idx:idx + batch_size]
            batch_tags = tag_lists[idx:idx + batch_size]
            losses += self.train_step(batch_sents, batch_tags, word2id, tag2id)
            if self.step % print_step == 0:
                # Fix: ceil-divide; `len // batch_size + 1` overcounted by
                # one whenever the dataset size was divisible by batch_size.
                total_step = (len(word_lists) + batch_size - 1) // batch_size
                print(
                    "Epoch {}, step/total_step: {}/{} {:.2f}% Loss:{:.4f}".
                    format(epoch, self.step, total_step,
                           100. * self.step / total_step,
                           losses / print_step))
                losses = 0.
        # After each epoch, measure dev-set loss (validate() is expected to
        # keep track of / save the best model).
        val_loss = self.validate(dev_word_lists, dev_tag_lists, word2id, tag2id)
        print("Epoch {}, Val Loss:{:.4f}".format(epoch, val_loss))
def test(self, test_word_lists, test_tag_lists, word2id, tag2id):
    """Decode the test set with the best model.

    Returns ``(pred_tag_lists, tag_lists)`` — predictions and gold tags,
    both in length-sorted order (this variant does not restore the
    original sentence order; both outputs stay aligned with each other).
    """
    test_word_lists, test_tag_lists, indices = sort_by_lengths(
        test_word_lists, test_tag_lists)
    tensorized_sent, lengths = tensorized(test_word_lists, word2id)
    # Truncate gold tags to the recorded lengths so they line up with preds.
    tag_lists = [
        test_tag_list[:lengths[i]]
        for i, test_tag_list in enumerate(test_tag_lists)
    ]
    self.best_model.eval()
    with torch.no_grad():
        # BUGFIX: the original looped over batch offsets but ran the model
        # on the FULL tensorized set every iteration, concatenating a
        # complete copy of all predictions per batch — so pred_tagid_lists
        # held ceil(N/B) duplicates and `lengths[i]` indexing was
        # misaligned past the first copy. A single full-set pass is what
        # the decode loop below actually expects.
        tensorized_sent = tensorized_sent.to(self.device)
        pred_tagid_lists = self.best_model.test(tensorized_sent, lengths,
                                                tag2id)  # [B, L]
    id2tag = dict((id_, tag) for tag, id_ in tag2id.items())
    pred_tag_lists = []  # [B, L]
    for i, ids in enumerate(pred_tagid_lists):
        # .get() keeps the original's tolerance of unknown tag ids (-> None).
        tag_list = [id2tag.get(ids[j]) for j in range(lengths[i])]
        pred_tag_lists.append(tag_list)
    return pred_tag_lists, tag_lists
def test(self, word_lists, tag_lists, word2id, tag2id):
    """Predict tags for the test set with the best model.

    Returns ``(pred_tag_lists, tag_lists)`` with both lists restored to
    the caller's original sentence order.
    """
    # Length-sort for efficient padding; `indices` remembers how to undo it.
    word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
    sents_tensor, lengths = tensorized(word_lists, word2id)
    sents_tensor = sents_tensor.to(self.device)

    self.best_model.eval()
    with torch.no_grad():
        batch_tagids = self.best_model.test(sents_tensor, lengths, tag2id)

    # Convert predicted ids back to tag strings.
    id2tag = {idx: tag for tag, idx in tag2id.items()}
    pred_tag_lists = []
    for i, ids in enumerate(batch_tagids):
        # CRF decoding drops the trailing <end> token, hence one fewer tag.
        n = lengths[i] - 1 if self.crf else lengths[i]
        pred_tag_lists.append([id2tag[ids[j].item()] for j in range(n)])

    # `indices` maps sorted position -> original index (indices=[1,2,0]
    # means the element originally at index 1 now sits at index 0, etc.).
    # Sorting (position, original_index) pairs by the original index gives
    # the permutation that restores the caller's order.
    ind_maps = sorted(enumerate(indices), key=lambda e: e[1])
    order = [pos for pos, _ in ind_maps]
    pred_tag_lists = [pred_tag_lists[i] for i in order]
    tag_lists = [tag_lists[i] for i in order]

    return pred_tag_lists, tag_lists
def test(self, test_word_lists, test_tag_lists, word2id, tag2id):
    """Predict tags for the test set, restoring the original sentence order.

    Returns ``(pred_tag_lists, tag_lists)`` in the caller's original order.
    """
    test_word_lists, test_tag_lists, indices = sort_by_lengths(
        test_word_lists, test_tag_lists)
    tensorized_sent, lengths = tensorized(test_word_lists, word2id)
    # BUGFIX: removed `tensorized_tag, lengths = tensorized(test_word_lists,
    # tag2id)` — it tensorized the *word* lists with the *tag* vocabulary,
    # its result was never used, and it rebound `lengths`.
    tensorized_sent = tensorized_sent.to(self.device)
    self.best_model.eval()
    with torch.no_grad():
        batch_tagids = self.best_model.test(tensorized_sent, lengths,
                                            tag2id)  # [B, L]

    id2tag = dict((id_, tag) for tag, id_ in tag2id.items())
    pred_tag_lists = []  # [B, L]
    for i, ids in enumerate(batch_tagids):
        tag_list = []  # (L,)
        if self.crf:
            # CRF decoding discards the trailing <end> tag.
            for j in range(lengths[i] - 1):
                # .item() unwraps the tensor scalar so it can key id2tag.
                tag_list.append(id2tag[ids[j].item()])
        else:
            for j in range(lengths[i]):
                tag_list.append(id2tag[ids[j].item()])
        pred_tag_lists.append(tag_list)

    # `indices` maps sorted position -> original index (e.g. indices=[1,2,0]
    # means the element originally at index 1 now sits at index 0). Sorting
    # (position, original_index) pairs by the original index yields the
    # permutation that restores the caller's order.
    ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
    indices, _ = list(zip(*ind_maps))
    pred_tag_lists = [pred_tag_lists[i] for i in indices]
    tag_lists = [test_tag_lists[i] for i in indices]

    return pred_tag_lists, tag_lists