Code example #1
Score: 0
File: bilstm_crf_.py — Project: OpenNLPhub/ChineseNER
    def train(self, train_word_lists, train_tag_lists, word2id, tag2id,
              dev_word_lists, dev_tag_lists):
        """Train the model over mini-batches of length-sorted sentences.

        Both the training and dev sets are sorted by sentence length so
        each batch holds similarly-sized sequences; their original order
        does not need to be restored afterwards.
        """
        # Sort by sentence length (original order need not be restored
        # for training/dev data).
        train_word_lists, train_tag_lists, _ = sort_by_lengths(
            train_word_lists, train_tag_lists)
        dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
            dev_word_lists, dev_tag_lists)

        B = self.batch_size
        # BUG FIX: the original tested
        #   self.step % BiLSTM_CRF_TrainingConfig.print_step == 0
        # but divided the accumulated loss by self.print_step; whenever
        # the two differ the printed "average" is wrong. Use a single
        # interval for both the condition and the divisor.
        print_step = BiLSTM_CRF_TrainingConfig.print_step

        for e in range(1, self.epoches + 1):
            # One epoch over the training data.
            self.step = 0
            losses = 0.
            for ind in range(0, len(train_tag_lists), B):
                # Train on one batch of B sentences.
                batch_sents = train_word_lists[ind:ind + B]
                batch_tag = train_tag_lists[ind:ind + B]

                losses += self.train_step(batch_sents, batch_tag, word2id,
                                          tag2id)

                if self.step % print_step == 0:
                    total_step = (len(train_word_lists) // self.batch_size + 1)
                    print(
                        "Epoch {}, step/total_step: {}/{} Average Loss for one batch:{:.4f}"
                        .format(e, self.step, total_step,
                                losses / print_step))
                    losses = 0.
Code example #2
Score: 0
    def train(self, word_lists, tag_lists, dev_word_lists, dev_tag_lists,
              word2id, tag2id):
        """Train over length-sorted mini-batches, validating each epoch.

        After every epoch the model is evaluated on the dev set via
        ``self.validate`` (which is expected to keep the best model).
        """
        # Sort the datasets by sentence length so batches contain
        # similarly-sized sequences.
        word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
        dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
            dev_word_lists, dev_tag_lists)

        print("训练数据总量:{}".format(len(word_lists)))

        batch_size = self.batch_size
        # BUG FIX: the original tested self.step % TrainingConfig.print_step
        # but divided the accumulated loss by self.print_step; if the two
        # differ the printed average is wrong. Use one interval for both.
        print_step = TrainingConfig.print_step
        epoch_iterator = trange(1, self.epoches + 1, desc="Epoch")
        for epoch in epoch_iterator:
            self.step = 0
            losses = 0.
            for idx in trange(0, len(word_lists), batch_size,
                              desc="Iteration"):
                batch_sents = word_lists[idx:idx + batch_size]
                batch_tags = tag_lists[idx:idx + batch_size]
                losses += self.train_step(batch_sents, batch_tags, word2id,
                                          tag2id)

                if self.step % print_step == 0:
                    total_step = (len(word_lists) // batch_size + 1)
                    print(
                        "Epoch {}, step/total_step: {}/{} {:.2f}% Loss:{:.4f}".
                        format(epoch, self.step, total_step,
                               100. * self.step / total_step,
                               losses / print_step))
                    losses = 0.
            # End of epoch: measure dev-set performance (validate is
            # expected to save the best-performing model).
            val_loss = self.validate(dev_word_lists, dev_tag_lists, word2id,
                                     tag2id)
            print("Epoch {}, Val Loss:{:.4f}".format(epoch, val_loss))
Code example #3
Score: 0
File: bilstm_crf_.py — Project: OpenNLPhub/ChineseNER
    def test(self, test_word_lists, test_tag_lists, word2id, tag2id):
        """Predict tags for the test set in batches.

        Returns ``(pred_tag_lists, tag_lists)``. Note: sentences are
        sorted by length and the original order is NOT restored here
        (``indices`` is computed but unused).
        """
        test_word_lists, test_tag_lists, indices = sort_by_lengths(
            test_word_lists, test_tag_lists)
        tensorized_sent, lengths = tensorized(test_word_lists, word2id)
        # Truncate the gold tags to the tensorized sentence lengths.
        tag_lists = [
            test_tag_list[:lengths[i]]
            for i, test_tag_list in enumerate(test_tag_lists)
        ]
        self.best_model.eval()
        pred_tagid_lists = []
        with torch.no_grad():
            B = self.batch_size
            for ind in range(0, len(test_word_lists), B):
                # BUG FIX: the original passed the FULL tensorized_sent and
                # full lengths on every iteration, so each sentence was
                # predicted ceil(N/B) times, pred_tagid_lists grew beyond N,
                # and lengths[i] below raised IndexError whenever N > B.
                # Slice out the current batch instead.
                batch_sent = tensorized_sent[ind:ind + B].to(self.device)
                batch_lengths = lengths[ind:ind + B]
                batch_tagids = self.best_model.test(batch_sent, batch_lengths,
                                                    tag2id)  #[B,L]
                pred_tagid_lists += batch_tagids

        id2tag = dict((id, tag) for tag, id in tag2id.items())
        pred_tag_lists = []  #[B,L]
        for i, ids in enumerate(pred_tagid_lists):
            tag_list = []
            # NOTE(review): if the model returns tensors here, ids[j] may
            # need .item() before the dict lookup (the sibling test()
            # implementations do this) — verify against best_model.test.
            for j in range(lengths[i]):
                tag_list.append(id2tag.get(ids[j]))
            pred_tag_lists.append(tag_list)

        return pred_tag_lists, tag_lists
Code example #4
Score: 0
    def test(self, word_lists, tag_lists, word2id, tag2id):
        """返回最佳模型在测试集上的预测结果"""
        # Data preparation: sort by length, keeping the permutation in
        # `indices` so the original order can be recovered at the end.
        word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
        tensorized_sents, lengths = tensorized(word_lists, word2id)
        tensorized_sents = tensorized_sents.to(self.device)

        self.best_model.eval()
        with torch.no_grad():
            batch_tagids = self.best_model.test(tensorized_sents, lengths,
                                                tag2id)

        # Map predicted ids back to tag strings. With a CRF head the
        # decoder discards the trailing <end> position, so one fewer tag
        # is read per sentence.
        id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}
        trailing = 1 if self.crf else 0
        pred_tag_lists = []
        for i, ids in enumerate(batch_tagids):
            decoded = [
                id2tag[ids[j].item()] for j in range(lengths[i] - trailing)
            ]
            pred_tag_lists.append(decoded)

        # `indices` maps original position -> sorted position (e.g.
        # indices = [1, 2, 0] means the element originally at index 1 now
        # sits at index 0). Sorting (new_index, old_position) pairs by the
        # old position yields the permutation that restores the original
        # order of both lists.
        ind_maps = sorted(enumerate(indices), key=lambda pair: pair[1])
        restore = [new_idx for new_idx, _ in ind_maps]
        pred_tag_lists = [pred_tag_lists[i] for i in restore]
        tag_lists = [tag_lists[i] for i in restore]

        return pred_tag_lists, tag_lists
Code example #5
Score: 0
File: bilstm_crf.py — Project: OpenNLPhub/ChineseNER
    def test(self, test_word_lists, test_tag_lists, word2id, tag2id):
        """Predict tags for the test set and restore the original order.

        Returns ``(pred_tag_lists, tag_lists)``, both permuted back to
        the callers' original sentence order.
        """
        # Sort by length; `indices` records the permutation so the
        # original sentence order can be restored at the end.
        test_word_lists, test_tag_lists, indices = sort_by_lengths(
            test_word_lists, test_tag_lists)

        tensorized_sent, lengths = tensorized(test_word_lists, word2id)
        # BUG FIX: the original additionally called
        #   tensorized(test_word_lists, tag2id)
        # i.e. looked the WORD lists up in the TAG vocabulary, binding the
        # result to an unused `tensorized_tag`. The call was both wrong and
        # dead, so it is removed; `lengths` is unaffected since the same
        # lists were passed both times.

        tensorized_sent = tensorized_sent.to(self.device)

        self.best_model.eval()
        with torch.no_grad():
            batch_tagids = self.best_model.test(tensorized_sent, lengths,
                                                tag2id)  #[B,L]
        id2tag = dict((id, tag) for tag, id in tag2id.items())
        pred_tag_lists = []  #[B,L]
        for i, ids in enumerate(batch_tagids):
            tag_list = []  #(L,)
            if self.crf:
                # CRF decoding discards the trailing <end> tag, hence -1.
                for j in range(lengths[i] - 1):
                    # .item() extracts the Python int from the tensor —
                    # easy to forget.
                    tag_list.append(id2tag[ids[j].item()])
            else:
                for j in range(lengths[i]):
                    tag_list.append(id2tag[ids[j].item()])

            pred_tag_lists.append(tag_list)

        # indices = [1, 2, 0] means the element originally at index 1 now
        # has index 0: pairs [(0,1), (1,2), (2,0)] sorted by old position
        # give [(2,0), (0,1), (1,2)] — the inverse permutation.
        ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
        indices, _ = list(zip(*ind_maps))
        pred_tag_lists = [pred_tag_lists[i] for i in indices]
        tag_lists = [test_tag_lists[i] for i in indices]

        return pred_tag_lists, tag_lists