# Example 1
    def __init__(self, input_vocab_num, max_seq_len, pad_idx=0):
        """Build the encoder's embedding tables and its stack of layers.

        :param input_vocab_num: vocabulary size over all input sequences
        :param max_seq_len: maximum input-sequence length
        :param pad_idx: index used for padding, defaults to 0
        """
        super(Encoder, self).__init__()
        # Word-embedding table (N*D); the pad row is kept at zero.
        self.word_embedding = nn.Embedding(input_vocab_num, config.d_model,
                                           padding_idx=pad_idx)
        # Positional-encoding table ((N+1)*D).
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)
        # Stack of config.layers identical encoder sub-layers.
        self.encoder_layers = nn.ModuleList(EncoderLayer()
                                            for _ in range(config.layers))

        self.pad_obj = Mask()  # mask helper
        self.tool = Tools()  # misc utilities
# Example 2
    def __init__(self, target_vocab_num, max_seq_len, pad_idx=0):
        """Build the decoder's embedding tables and its stack of layers.

        :param target_vocab_num: vocabulary size over all target sequences
        :param max_seq_len: maximum target-sequence length
        :param pad_idx: padding index, defaults to 0
        """
        super(Decoder, self).__init__()
        # Word-embedding table (M*D).
        self.word_embedding = nn.Embedding(target_vocab_num, config.d_model)
        # Positional-encoding table ((M+1)*D).
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)
        # Stack of config.layers identical decoder sub-layers.
        self.decoder_layers = nn.ModuleList(DecoderLayer()
                                            for _ in range(config.layers))

        self.mask_obj = Mask()  # mask helper
        self.tool = Tools()  # misc utilities
# Example 3
    def __init__(self, transformer, optimizer, criterion):
        """Store the model, optimizer, loss object, and metric accumulators.

        :param transformer: Transformer model
        :param optimizer: optimizer wrapper
        :param criterion: loss function object
        """
        self.transformer = transformer  # the model being trained
        self.optimizer = optimizer  # optimizer
        self.criterion = criterion  # loss function

        self.tool = Tools()  # utility helper

        self.val_acc_all = [0]  # every validation accuracy seen so far
        self.save_checkpoint = 0  # times the best checkpoint has been saved

        # Running batch counters for validation / training.
        self.val_all_batch = 0
        self.train_all_batch = 0

        # Per-step / per-epoch metric histories.
        self.train_loss, self.val_loss = [], []
        self.train_ppl, self.val_ppl = [], []
        self.train_acc, self.val_acc = [], []
# Example 4
class Decoder(nn.Module):
    """
    Decoder built from config.layers stacked DecoderLayer modules.
    """
    def __init__(self, target_vocab_num, max_seq_len, pad_idx=0):
        """
        :param target_vocab_num: vocabulary size over all target sequences
        :param max_seq_len: maximum target-sequence length
        :param pad_idx: padding index, defaults to 0
        """
        super(Decoder, self).__init__()
        # Word-embedding table M*D. Fix: pass padding_idx so the pad row
        # stays zero, consistent with the Encoder's embedding.
        self.word_embedding = nn.Embedding(target_vocab_num,
                                           config.d_model,
                                           padding_idx=pad_idx)
        # Positional-encoding table (M+1)*D.
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model,
                                               pad_idx)
        # Stack of config.layers decoder sub-layers.
        self.decoder_layers = nn.ModuleList(
            [DecoderLayer() for _ in range(config.layers)])

        self.mask_obj = Mask()  # mask helper
        self.tool = Tools()  # misc utilities

    def forward(self, tgt_seq, src_seq, enc_out):
        """
        :param tgt_seq: batch of target sequences, B*L
        :param src_seq: batch of source sequences, B*L
        :param enc_out: encoder output, B*L*(d*h=d_model)
        :return: decoder output, B*L*(d*h)
        """
        tgt_pos = self.tool.seq_2_pos(tgt_seq)  # position indices, B*L
        # Mask of non-pad positions in the target, B*L*1.
        no_pad_mask = self.mask_obj.no_padding_mask(tgt_seq)

        # Pad mask over target-target attention, B*L*L.
        pad_mask = self.mask_obj.padding_mask(tgt_seq, tgt_seq)
        # Upper-triangular subsequent mask (no peeking ahead), B*L*L.
        seq_mask = self.mask_obj.sequence_mask(tgt_seq)
        # Combine both masks for the decoder's self-attention, B*L*L.
        pad_seq_mask = (pad_mask + seq_mask).gt(0)

        # Mask for the second (encoder-decoder) attention, B * tgt_L * src_L.
        dec_enc_mask = self.mask_obj.padding_mask(src_seq, tgt_seq)

        # First decoder input: word embedding + positional encoding,
        # B*L*(h*d=d_model).
        dec_in = self.word_embedding(tgt_seq) + self.pos_encoding(tgt_pos)

        dec_out = 0
        for decoder_layer in self.decoder_layers:  # run each layer in turn
            dec_out = decoder_layer(dec_in, enc_out, no_pad_mask, pad_seq_mask,
                                    dec_enc_mask)
            dec_in = dec_out  # this layer's output feeds the next layer

        return dec_out
# Example 5
class Encoder(nn.Module):
    """
    Encoder built from config.layers stacked EncoderLayer modules.
    """
    def __init__(self, input_vocab_num, max_seq_len, pad_idx=0):
        """
        :param input_vocab_num: vocabulary size over all input sequences
        :param max_seq_len: maximum input-sequence length
        :param pad_idx: padding index, defaults to 0
        """
        super(Encoder, self).__init__()
        # Word-embedding table N*D; the pad row is kept at zero.
        self.word_embedding = nn.Embedding(input_vocab_num, config.d_model,
                                           padding_idx=pad_idx)
        # Positional-encoding table (N+1)*D.
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)
        # Stack of config.layers encoder sub-layers.
        self.encoder_layers = nn.ModuleList(EncoderLayer()
                                            for _ in range(config.layers))

        self.pad_obj = Mask()  # mask helper
        self.tool = Tools()  # misc utilities

    def forward(self, src_seq):
        """
        :param src_seq: batch of input sequences, B*L
        :return: output of the final encoder layer, B*L*d_model
        """
        # Position indices for the batch, B*L.
        src_pos = self.tool.seq_2_pos(src_seq)
        # First-layer input: word embedding + positional encoding, B*L*D.
        layer_in = self.word_embedding(src_seq) + self.pos_encoding(src_pos)

        # Masks derived from the padded input sequence.
        pad_mask = self.pad_obj.padding_mask(src_seq, src_seq)  # B*L*L
        no_pad_mask = self.pad_obj.no_padding_mask(src_seq)  # B*L*1

        layer_out = 0
        for layer in self.encoder_layers:  # run each layer in turn
            layer_out = layer(layer_in, pad_mask, no_pad_mask)
            layer_in = layer_out  # this layer's output feeds the next layer

        return layer_out
# Example 6
def main():
    """Batch-inference entry point: load a trained Transformer checkpoint,
    translate the test set with beam search, and write inputs, predictions
    and references to a text file."""
    print('\n===开始推理测试===\n\n')

    def index_2_word(lang, seq):
        """Convert an index sequence to words, dropping SOS/EOS/PAD tokens."""
        seq = [int(idx.detach()) for idx in seq]
        new_seq = []
        for i in seq:
            if i != config.sos and i != config.eos and i != config.pad:
                new_seq.append(i)
        idx_2_word = [lang['index2word'][i] for i in new_seq]

        return idx_2_word

    # Command-line arguments.
    parser = argparse.ArgumentParser(description='Transformer模型推理测试部分!')

    parser.add_argument('-save_model',
                        default='./results/best_model.chkpt',
                        help='训练好的模型的存放的路径')
    parser.add_argument('-save_data',
                        default='./results/words_data.pt',
                        help='保存训练和验证的文本序列信息')
    parser.add_argument('-infer_data_input',
                        default='./data/test.en',
                        help='推理测试输入数据集的路径')
    parser.add_argument('-infer_data_target',
                        default='./data/test.de',
                        help='推理测试目标数据集的路径')
    parser.add_argument('-pre_target',
                        default='./results/input_target_infer.txt',
                        help='推理预测结果存放的路径')
    parser.add_argument('-infer_batch_size',
                        default=32,
                        type=int,
                        help='推理预测阶段的批大小')
    parser.add_argument('-beam_search_size',
                        default=5,
                        type=int,
                        help='Beam search搜索的宽度')
    parser.add_argument('-infer_n_best',
                        default=1,
                        type=int,
                        help='通过Beam search后,推理预测出top n的句子')

    args = parser.parse_args()  # parse CLI arguments
    config.args_2_variable_infer(args)  # override config defaults for inference

    # Load the vocabulary info saved during training/validation.
    words_data = torch.load(args.save_data)
    source_lang = words_data['src_lang']
    target_lang = words_data['tgt_lang']
    data_obj = words_data['data_obj']

    # Load the checkpoint onto GPU if available, CPU otherwise.
    checkpoint = torch.load(
        args.save_model,
        map_location='cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(input_vocab_num=source_lang['n_words'],
                              target_vocab_num=target_lang['n_words'],
                              src_max_len=data_obj['src_max_len'],
                              tgt_max_len=data_obj['tgt_max_len'])

    transformer.load_state_dict(checkpoint['model'])
    print('加载预训练的模型参数完成!')

    # Beam-search translator wrapped around the restored model.
    infer = Translator(model=transformer,
                       tgt_max_len=data_obj['tgt_max_len'])

    data_obj = DataProcess()
    # Tokenize the test files into index sequences (only the last return
    # value, the paired src/tgt sequences, is kept).
    *_, src_tgt_seq = data_obj.word_2_index(args.infer_data_input,
                                            args.infer_data_target,
                                            source_lang, target_lang)
    # Batch the test data.
    data_loader = DataLoader(dataset=src_tgt_seq,
                             batch_size=args.infer_batch_size,
                             shuffle=True,
                             drop_last=False)

    with open(args.pre_target, 'w', encoding='utf-8') as f:
        for batch_dat in tqdm(data_loader, desc='Inferring...',
                              leave=True):  # iterate over inference batches
            src_seq, tgt_seq = Tools().batch_2_tensor(
                batch_dat)  # source sequences and reference targets
            src_pos = Tools().seq_2_pos(src_seq)  # position vector of the source
            all_pre_seq, all_pre_seq_p = infer.translate_batch(
                src_seq, src_pos)  # all predictions and their probabilities

            for index, pre_seq in enumerate(all_pre_seq):
                src_word_seq = index_2_word(source_lang,
                                            src_seq[index])  # clean + detokenize source
                tgt_word_seq = index_2_word(target_lang,
                                            tgt_seq[index])  # clean + detokenize reference
                for seq in pre_seq:  # clean + detokenize each n-best prediction
                    new_seq = []
                    for i in seq:
                        if i != config.sos and i != config.eos and i != config.pad:
                            new_seq.append(i)
                    # NOTE(review): pre_word_seq is overwritten on every
                    # iteration, so only the LAST n-best candidate is written
                    # below; if pre_seq is ever empty it is unbound — confirm
                    # infer_n_best is always 1 here.
                    pre_word_seq = [
                        target_lang['index2word'][idx] for idx in new_seq
                    ]

                f.write('输入序列->:' + ' '.join(src_word_seq) + '\n')  # source line
                f.write('->预测序列:' + ' '.join(pre_word_seq) + '\n')  # prediction line
                f.write('==目标序列:' + ' '.join(tgt_word_seq) + '\n\n')  # reference line

    print('推理预测序列完毕!')
# Example 7
    def __init__(self, transformer, optimizer, criterion):
        """Store the model, optimizer, loss, metric accumulators, and set up
        Visdom plot windows when visualisation is enabled.

        :param transformer: Transformer model
        :param optimizer: optimizer wrapper
        :param criterion: loss function object
        """
        self.transformer = transformer  # the model being trained
        self.optimizer = optimizer  # optimizer
        self.criterion = criterion  # loss function

        self.tool = Tools()  # utility helper

        self.val_acc_all = [0]  # every validation accuracy seen so far
        self.save_checkpoint = 0  # times the best checkpoint has been saved

        self.val_all_batch = 0
        self.train_all_batch = 0

        self.train_loss = []
        self.val_loss = []
        self.train_ppl = []
        self.val_ppl = []
        self.train_acc = []
        self.val_acc = []

        if config.visual and config.use_visdom:  # Visdom visualisation
            # Loss windows.
            viz_loss = Visdom(env='Transformer_Train_Val_Loss')
            self.viz_loss = viz_loss
            t_loss_x, t_loss_y = 0, 0
            win_loss_1 = viz_loss.line(np.array([t_loss_x]),
                                       np.array([t_loss_y]),
                                       opts=dict(title='训练loss',
                                                 legend=['train'],
                                                 markers=True,
                                                 markersize=5))

            v_loss_x, v_loss_y = 0, 0
            win_loss_2 = viz_loss.line(np.array([v_loss_x]),
                                       np.array([v_loss_y]),
                                       opts=dict(title='验证loss',
                                                 legend=['val'],
                                                 markers=True,
                                                 markersize=5))
            self.win_loss_1 = win_loss_1
            self.win_loss_2 = win_loss_2

            # PPL windows.
            viz_ppl = Visdom(env='Transformer_Train_Val_PPL')
            self.viz_ppl = viz_ppl
            t_ppl_x, t_ppl_y = 0, 0
            win_ppl_1 = viz_ppl.line(np.array([t_ppl_x]),
                                     np.array([t_ppl_y]),
                                     opts=dict(title='训练PPL',
                                               legend=['train'],
                                               markers=True,
                                               markersize=5))

            v_ppl_x, v_ppl_y = 0, 0
            win_ppl_2 = viz_ppl.line(np.array([v_ppl_x]),
                                     np.array([v_ppl_y]),
                                     opts=dict(title='验证PPL',
                                               legend=['val'],
                                               markers=True,
                                               markersize=5))
            self.win_ppl_1 = win_ppl_1
            self.win_ppl_2 = win_ppl_2

            # ACC windows.
            viz_acc = Visdom(env='Transformer_Train_Val_ACC')
            self.viz_acc = viz_acc
            t_acc_x, t_acc_y = 0, 0
            # Fix: this window plots TRAINING accuracy (legend 'train') but
            # was titled '验证ACC' (validation ACC) — copy-paste bug.
            win_acc_1 = viz_acc.line(np.array([t_acc_x]),
                                     np.array([t_acc_y]),
                                     opts=dict(title='训练ACC',
                                               legend=['train'],
                                               markers=True,
                                               markersize=5))
            v_acc_x, v_acc_y = 0, 0

            win_acc_2 = viz_acc.line(np.array([v_acc_x]),
                                     np.array([v_acc_y]),
                                     opts=dict(title='验证ACC',
                                               legend=['val'],
                                               markers=True,
                                               markersize=5))
            self.win_acc_1 = win_acc_1
            self.win_acc_2 = win_acc_2
# Example 8
class Sequence2Sequence():
    """Drives training, validation, and inference for a Transformer model."""

    def __init__(self, transformer, optimizer, criterion):
        """
        :param transformer: Transformer model
        :param optimizer: optimizer wrapper exposing ``step_update_lrate()``
        :param criterion: loss object exposing ``cal_loss()``
        """
        self.transformer = transformer  # the model being trained
        self.optimizer = optimizer  # optimizer
        self.criterion = criterion  # loss function

        self.tool = Tools()  # utility helper

        self.val_acc_all = [0]  # every validation accuracy seen so far
        self.save_checkpoint = 0  # times the best checkpoint has been saved

        self.val_all_batch = 0
        self.train_all_batch = 0

        self.train_loss = []
        self.val_loss = []
        self.train_ppl = []
        self.val_ppl = []
        self.train_acc = []
        self.val_acc = []

        if config.visual and config.use_visdom:  # Visdom visualisation
            # Loss windows.
            viz_loss = Visdom(env='Transformer_Train_Val_Loss')
            self.viz_loss = viz_loss
            t_loss_x, t_loss_y = 0, 0
            win_loss_1 = viz_loss.line(np.array([t_loss_x]),
                                       np.array([t_loss_y]),
                                       opts=dict(title='训练loss',
                                                 legend=['train'],
                                                 markers=True,
                                                 markersize=5))

            v_loss_x, v_loss_y = 0, 0
            win_loss_2 = viz_loss.line(np.array([v_loss_x]),
                                       np.array([v_loss_y]),
                                       opts=dict(title='验证loss',
                                                 legend=['val'],
                                                 markers=True,
                                                 markersize=5))
            self.win_loss_1 = win_loss_1
            self.win_loss_2 = win_loss_2

            # PPL windows.
            viz_ppl = Visdom(env='Transformer_Train_Val_PPL')
            self.viz_ppl = viz_ppl
            t_ppl_x, t_ppl_y = 0, 0
            win_ppl_1 = viz_ppl.line(np.array([t_ppl_x]),
                                     np.array([t_ppl_y]),
                                     opts=dict(title='训练PPL',
                                               legend=['train'],
                                               markers=True,
                                               markersize=5))

            v_ppl_x, v_ppl_y = 0, 0
            win_ppl_2 = viz_ppl.line(np.array([v_ppl_x]),
                                     np.array([v_ppl_y]),
                                     opts=dict(title='验证PPL',
                                               legend=['val'],
                                               markers=True,
                                               markersize=5))
            self.win_ppl_1 = win_ppl_1
            self.win_ppl_2 = win_ppl_2

            # ACC windows.
            viz_acc = Visdom(env='Transformer_Train_Val_ACC')
            self.viz_acc = viz_acc
            t_acc_x, t_acc_y = 0, 0
            # Fix: this window plots TRAINING accuracy (legend 'train') but
            # was titled '验证ACC' (validation ACC) — copy-paste bug.
            win_acc_1 = viz_acc.line(np.array([t_acc_x]),
                                     np.array([t_acc_y]),
                                     opts=dict(title='训练ACC',
                                               legend=['train'],
                                               markers=True,
                                               markersize=5))
            v_acc_x, v_acc_y = 0, 0

            win_acc_2 = viz_acc.line(np.array([v_acc_x]),
                                     np.array([v_acc_y]),
                                     opts=dict(title='验证ACC',
                                               legend=['val'],
                                               markers=True,
                                               markersize=5))
            self.win_acc_1 = win_acc_1
            self.win_acc_2 = win_acc_2

    # Train & validate.
    def train_val(self, train_loader, val_loader):
        """Run config.epochs training epochs, validating after each one.

        :param train_loader: DataLoader yielding training batches
        :param val_loader: DataLoader yielding validation batches
        """
        print('\n===开始训练&验证===\n')

        train_log = None
        val_log = None
        if config.log:  # write model logs to disk
            train_log = open((os.path.abspath('.') + '/results/train.log'),
                             'w',
                             encoding='utf-8')
            train_log.write('轮次:{epoch:3.0f}, '
                            '当前批次:{step:3.0f}, '
                            '累计批次:{total_step:7.0f}, '
                            '批大小:{batch_size:3.0f}, '
                            '损失:{loss:10.6f}, '
                            '该批学习率:{lr:15.10f}\n\n'.format(epoch=0,
                                                           step=0,
                                                           total_step=0,
                                                           batch_size=0,
                                                           loss=0,
                                                           lr=0))

            val_log = open((os.path.abspath('.') + '/results/val.log'),
                           'w',
                           encoding='utf-8')
            val_log.write('轮次:{epoch:3.0f}, '
                          '批大小:{batch_size:3.0f}, '
                          '损失:{loss:10.6f}\n\n, '.format(epoch=0,
                                                         batch_size=0,
                                                         loss=0))

        for epoch in range(config.epochs):  # train epoch by epoch
            print('[训练轮次 {}]'.format(epoch))

            step = 0  # batch counter within this epoch

            each_batch_loss = []  # loss of every batch this epoch
            each_batch_lr = []  # learning rate of every batch this epoch
            self.transformer.train()  # switch the model to training mode
            total_loss = 0  # summed training loss over the epoch
            total_word_num = 0  # total token count over the epoch
            # Fix: start the correct-token counter at 0 (it was 1, which
            # inflated accuracy and disagreed with evaluate()).
            total_word_correct = 0
            epoch_start_time = datetime.datetime.now()  # epoch start time

            for batch_data in tqdm(train_loader,
                                   mininterval=1,
                                   desc='Training...',
                                   leave=False):  # iterate over batches
                self.train_all_batch += 1  # global batch counter
                step += 1  # per-epoch batch counter

                self.optimizer.zero_grad()  # clear optimizer gradients
                src_seq, tgt_seq = self.tool.batch_2_tensor(
                    batch_data)  # source and target sequences, B*L each
                pre_tgt = self.transformer(src_seq,
                                           tgt_seq)  # model predictions
                real_tgt = tgt_seq[:,
                                   1:].contiguous().view(-1)  # gold tokens (shifted)
                loss, correct = self.criterion.cal_loss(real_tgt, pre_tgt)
                loss.backward(retain_graph=True)  # backprop (keep the graph)
                learn_rate = self.optimizer.step_update_lrate()  # lr schedule + step

                total_loss += loss.item()  # accumulate loss
                each_batch_loss.append(loss.detach())  # record batch loss
                each_batch_lr.append(learn_rate)

                non_pad_mask = real_tgt.ne(config.pad)
                word_num = non_pad_mask.sum().item()
                total_word_num += word_num
                total_word_correct += correct

                # Write the training log.
                if train_log:
                    train_log.write('轮次:{epoch:3.0f}, '
                                    '当前批次:{step:3.0f}, '
                                    '累计批次:{total_step:7.0f}, '
                                    '批大小:{batch_size:3.0f}, '
                                    '损失:{loss:10.6f}, '
                                    '该批学习率:{lr:15.10f}\n'.format(
                                        epoch=epoch,
                                        step=step,
                                        total_step=self.train_all_batch,
                                        batch_size=len(batch_data[0]),
                                        loss=loss.detach(),
                                        lr=learn_rate))

                if config.visual and config.use_visdom:
                    self.viz_loss.line(np.array([loss.cpu().detach()]),
                                       np.array([self.train_all_batch]),
                                       win=self.win_loss_1,
                                       update='append')

                self.train_loss.append(loss.cpu().detach())

            loss_per_word = total_loss / total_word_num  # mean loss per token
            ppl = math.exp(min(loss_per_word, 100))  # perplexity (lower is better)
            acc = total_word_correct / total_word_num  # per-token accuracy
            acc = 100 * acc  # as a percentage (higher is better)
            epoch_end_time = datetime.datetime.now()
            self.train_ppl.append(ppl)
            self.train_acc.append(acc)

            if config.visual and config.use_visdom:
                self.viz_ppl.line(np.array([ppl]),
                                  np.array([epoch]),
                                  win=self.win_ppl_1,
                                  update='append')
                self.viz_acc.line(np.array([acc]),
                                  np.array([epoch]),
                                  win=self.win_acc_1,
                                  update='append')

            print('批数 %4.0f' % step, '| 累积批数 %8.0f' % self.train_all_batch,
                  '| 批大小 %3.0f' % config.batch_size, '| 耗时',
                  epoch_end_time - epoch_start_time)

            # The final batch may be smaller than batch_size, so report the
            # second-to-last loss when possible. Fix: fall back to the last
            # (only) batch instead of raising IndexError on 1-batch epochs.
            tail_loss = (each_batch_loss[-2]
                         if len(each_batch_loss) > 1 else each_batch_loss[-1])
            print('训练', '| 困惑度PPL↓ %10.6f' % ppl, '| 准确率ACC↑ %10.5f' % acc,
                  '| 首批损失 %10.5f' % each_batch_loss[0],
                  '| 尾批损失 %10.5f' % tail_loss)

            # Fix: only write when logging is enabled; train_log is None when
            # config.log is False and this call crashed unconditionally.
            if train_log:
                train_log.write(
                    '困惑度PPL↓:{PPL:10.5f}, 准确率ACC↑:{ACC:10.5f}\n\n'.format(
                        PPL=ppl, ACC=acc))

            # Validate once at the end of every training epoch.
            self.evaluate(val_loader, epoch, self.train_all_batch, val_log)

        # Fix: close the log files that were opened above.
        if train_log:
            train_log.close()
        if val_log:
            val_log.close()

        pic.train_loss(self.train_loss)
        pic.val_loss(self.val_loss)
        pic.ppl(self.train_ppl, self.val_ppl)
        pic.acc(self.train_acc, self.val_acc)
        # TODO save the trained model at the end of training
        print('训练&验证结束!')

    # Validate the model at the end of each training epoch.
    def evaluate(self, data_loader, epoch, batch, val_log):
        """Run one validation pass and optionally checkpoint the model.

        :param data_loader: DataLoader yielding validation batches
        :param epoch: current training epoch (for logging/plots)
        :param batch: cumulative training-batch count (saved in checkpoints)
        :param val_log: open validation log file, or None
        """
        self.transformer.eval()  # switch the model to eval mode
        total_loss = 0  # summed validation loss
        total_word_num = 0  # total token count
        total_word_correct = 0  # correctly predicted token count

        with torch.no_grad():  # no gradient updates during validation
            for batch_data in tqdm(data_loader,
                                   mininterval=1,
                                   desc='Validating...',
                                   leave=False):  # iterate over batches
                src_seq, tgt_seq = self.tool.batch_2_tensor(
                    batch_data)  # source and target sequences
                pre_tgt = self.transformer(src_seq, tgt_seq)  # predictions
                real_tgt = tgt_seq[:, 1:].contiguous().view(-1)  # gold tokens
                loss, correct = self.criterion.cal_loss(real_tgt, pre_tgt)

                total_loss += loss.item()  # accumulate loss
                self.val_all_batch += 1  # global validation-batch counter

                non_pad_mask = real_tgt.ne(config.pad)
                word_num = non_pad_mask.sum().item()
                total_word_num += word_num
                total_word_correct += correct

                if val_log:
                    val_log.write('轮次:{epoch:3.0f}, '
                                  '批大小:{batch_size:3.0f}, '
                                  '损失:{loss:10.6f}\n'.format(
                                      epoch=epoch,
                                      batch_size=len(batch_data[0]),
                                      loss=loss.detach()))

                if config.visual and config.use_visdom:  # Visdom plot
                    self.viz_loss.line(np.array([loss.cpu().detach()]),
                                       np.array([self.val_all_batch]),
                                       win=self.win_loss_2,
                                       update='append')

                self.val_loss.append(loss.cpu().detach())

            loss_per_word = total_loss / total_word_num  # mean loss per token
            ppl = math.exp(min(loss_per_word, 100))  # perplexity (lower is better)
            acc = total_word_correct / total_word_num  # per-token accuracy
            acc = 100 * acc  # as a percentage (higher is better)

            self.val_acc_all.append(acc)  # record this validation accuracy
            self.val_ppl.append(ppl)
            self.val_acc.append(acc)

            if config.visual and config.use_visdom:
                self.viz_ppl.line(np.array([ppl]),
                                  np.array([epoch]),
                                  win=self.win_ppl_2,
                                  update='append')
                self.viz_acc.line(np.array([acc]),
                                  np.array([epoch]),
                                  win=self.win_acc_2,
                                  update='append')

            print('验证', '| 困惑度PPL↓ %10.5f' % ppl, '| 准确率ACC↑ %10.5f' % acc)

            if val_log:
                val_log.write(
                    '困惑度PPL↓:{PPL:10.5f}, 准确率ACC↑:{ACC:10.5f}\n\n'.format(
                        PPL=ppl, ACC=acc))

        # Model checkpointing.
        if config.save_trained_model:
            model_state_dict = self.transformer.state_dict()  # model weights
            checkpoint = {  # checkpoint payload
                'model': model_state_dict,
                'settings': config,
                'epoch': epoch,
                'total batch': batch
            }

            if config.save_trained_model_type == 'all':
                # Fix: acc is already a percentage (scaled by 100 above);
                # the original multiplied by 100 again in the filename.
                model_name = os.path.abspath(
                    '.') + '/all' + '_acc_{acc:3.3f}.chkpt'.format(acc=acc)
                torch.save(checkpoint, model_name)
            elif config.save_trained_model_type == 'best':
                model_name = os.path.abspath('.') + '/results/best_model.chkpt'
                if acc >= max(self.val_acc_all):
                    torch.save(checkpoint, model_name)
                    self.save_checkpoint += 1
                    print('已经第{}次更新模型最佳保存点!'.format(self.save_checkpoint))

    # Batch-sequence inference.
    def infer(self, data_loader, source_lang, target_lang, tgt_max_len):
        """Translate every batch in data_loader and write source, prediction,
        and reference lines to ./results/input_target_infer.txt.

        :param data_loader: DataLoader yielding (src, tgt) batches
        :param source_lang: source vocabulary (dict or Lang-like object)
        :param target_lang: target vocabulary (dict or Lang-like object)
        :param tgt_max_len: maximum target length for the translator
        """
        print('\n===开始推理测试===\n\n')

        def index_2_word(lang, seq):
            """Convert an index sequence to words, dropping SOS/EOS/PAD."""
            seq = [int(idx.detach()) for idx in seq]
            new_seq = []
            for i in seq:
                if i != config.sos and i != config.eos and i != config.pad:
                    new_seq.append(i)
            # The vocabulary may be a plain dict or a Lang-like object.
            if type(lang) != dict:
                idx_2_word = [lang.index2word[i] for i in new_seq]
            else:
                idx_2_word = [lang['index2word'][i] for i in new_seq]

            return idx_2_word

        if config.save_trained_model:  # a pretrained checkpoint exists
            # Load the vocabulary info saved during training/validation.
            words_data = torch.load(config.save_data)
            source_lang = words_data['src_lang']
            target_lang = words_data['tgt_lang']
            data_obj = words_data['data_obj']

            checkpoint = torch.load(
                config.checkpoint,
                map_location='cuda' if torch.cuda.is_available() else 'cpu')
            transformer = Transformer(input_vocab_num=source_lang['n_words'],
                                      target_vocab_num=target_lang['n_words'],
                                      src_max_len=data_obj['src_max_len'],
                                      tgt_max_len=data_obj['tgt_max_len'])

            transformer.load_state_dict(checkpoint['model'])
            self.transformer = transformer

            print('加载预训练的模型参数完成!')

        infer = Translator(self.transformer, tgt_max_len)  # batch translator

        with open((os.path.abspath('.') + '/results/input_target_infer.txt'),
                  'w',
                  encoding='utf-8') as f:
            for batch_dat in tqdm(data_loader,
                                  desc='Inferring...',
                                  leave=False):  # iterate over batches
                src_seq, tgt_seq = self.tool.batch_2_tensor(
                    batch_dat)  # source sequences and reference targets
                src_pos = self.tool.seq_2_pos(src_seq)  # source positions
                all_pre_seq, all_pre_seq_p = infer.translate_batch(
                    src_seq, src_pos)  # predictions and their probabilities

                for index, pre_seq in enumerate(all_pre_seq):
                    src_word_seq = index_2_word(source_lang,
                                                src_seq[index])  # source words
                    tgt_word_seq = index_2_word(target_lang,
                                                tgt_seq[index])  # reference words
                    # Fix: initialise so an empty n-best list cannot leave
                    # pre_word_seq unbound below. NOTE: as in the original,
                    # only the LAST n-best candidate is written.
                    pre_word_seq = []
                    for seq in pre_seq:  # clean + detokenize each prediction
                        new_seq = []
                        for i in seq:
                            if i != config.sos and i != config.eos and i != config.pad:
                                new_seq.append(i)
                        if type(target_lang) != dict:
                            pre_word_seq = [
                                target_lang.index2word[idx] for idx in new_seq
                            ]
                        else:
                            pre_word_seq = [
                                target_lang['index2word'][idx]
                                for idx in new_seq
                            ]

                    f.write('输入序列->:' + ' '.join(src_word_seq) +
                            '\n')  # source line
                    f.write('->预测序列:' + ' '.join(pre_word_seq) +
                            '\n')  # prediction line
                    f.write('==目标序列:' + ' '.join(tgt_word_seq) +
                            '\n\n')  # reference line

        print('推理预测序列完毕!')
# Example 9
def main():
    """Special test (validation) mode that generates Concept sequences from
    Diagnosis sequences.

    For the case where the same input sequence is paired with several
    different candidate target sequences, compute the cross-entropy loss
    between the model's prediction and each candidate target, and write the
    per-item losses to ``config.result_loss`` (one ``loss\\ttarget-line`` per
    row).

    All paths and hyper-parameters come from the module-level ``config``
    object; the function takes no arguments and returns nothing.
    """
    print('数据集路径:', config.data_path)
    print('输出路径:', config.save_path)

    # Load the token information of the train/validation source and target
    # sequences that was saved during preprocessing.
    word_data = torch.load(config.save_data)
    src_lang = word_data['src_lang']  # source-sequence token info
    tgt_lang = word_data['tgt_lang']  # target-sequence token info
    data_obj = word_data['data_obj']  # dataset metadata (max lengths, ...)

    # Restore the trained model checkpoint onto GPU 0.
    checkpoint = torch.load(
        config.save_model_checkpoint,
        map_location=lambda storage, loc: storage.cuda(0))
    transformer = Transformer(  # build the transformer model
        input_vocab_num=src_lang['n_words'],
        target_vocab_num=tgt_lang['n_words'],
        src_max_len=data_obj['src_max_len'],
        tgt_max_len=data_obj['tgt_max_len']).cuda()
    transformer.load_state_dict(checkpoint['model'])  # load trained weights
    transformer.eval()  # evaluation mode: keep parameters/dropout fixed
    print('加载预训练的模型参数完成!\n')

    data_object = DataProcess()  # data-processing helper
    *_, src_tgt_seq = data_object.word_2_index(config.infer_input_k,
                                               config.infer_target_k, src_lang,
                                               tgt_lang)  # test data

    data_loader = DataLoader(
        dataset=src_tgt_seq,
        batch_size=config.k,  # may be scaled (e.g. doubled) without affecting results
        shuffle=False,
        drop_last=False)  # batch the data

    all_batch_loss = []  # per-item loss collected over all batches
    with torch.no_grad():  # evaluate without tracking gradients
        for candidate_k_data in tqdm(data_loader,
                                     mininterval=1,
                                     ncols=1,
                                     desc='特殊测试中',
                                     leave=True):
            src_seq, tgt_seq = Tools().batch_2_tensor(
                candidate_k_data)  # the k candidate source/target pairs
            _, pre_seq = transformer.forward(src_seq, tgt_seq)  # model prediction
            tgt_seq = tgt_seq[:, 1:]  # drop the leading token to form the true target
            assert pre_seq.size()[0] == tgt_seq.size(
            )[0], '预测序列和目标序列的条数不一致,无法计算loss!'
            for i in range(pre_seq.size()[0]):  # per-item loss in this batch
                seq_len = 0
                for j in tgt_seq[i]:
                    if j != config.pad:
                        seq_len += 1  # actual (non-pad) length of this target
                # 'mean' replaces the deprecated (and since removed)
                # 'elementwise_mean' spelling; the semantics are identical.
                loss = F.cross_entropy(
                    pre_seq[i],
                    tgt_seq[i],
                    ignore_index=config.pad,
                    reduction='mean')
                loss = loss.detach().cpu().tolist()
                # if config.loss_cal == 'sum':  # with sum-mode loss, divide by the
                #     loss /= seq_len           # actual target length
                # FIXME 'mean' works far better than 'sum' here; find out why.
                all_batch_loss.append(round(loss, 7))

    # Read the candidate-target file; `with` closes the handle promptly
    # (the original left it open).
    with open(config.infer_target_k, 'r', encoding='utf-8') as target_file:
        infer_target_k = target_file.readlines()
    assert len(all_batch_loss) == len(infer_target_k), '语义损失数和条数不一致!!!'

    with open(config.result_loss, 'w', encoding='utf-8') as f:  # write the losses
        # One "loss<TAB>target-line" per row, no trailing newline after the
        # last row — identical bytes to writing row-by-row.
        f.write('\n'.join(
            str(loss_value) + '\t' + infer_target_k[idx].split('\n')[0]
            for idx, loss_value in enumerate(all_batch_loss)))
    end_time = time.time()
    # NOTE(review): start_time is assumed to be set at module level before
    # main() runs — confirm it exists in the enclosing script.
    take_time = end_time - start_time
    per_item_time = round(take_time / (len(all_batch_loss) / config.k), 7)

    print('耗时{},每条耗时{}'.format(take_time, per_item_time))
Ejemplo n.º 10
0
def main(candidate_k):
    """Run inference on the test set and write out the generated sequences.

    For every batch of ``candidate_k`` items the trained Transformer decodes
    predictions; the source, predicted and reference word sequences are
    written to ``config.infer_result`` and the per-sentence generation scores
    to ``config.generate_pro``.

    :param candidate_k: batch size — number of candidate items decoded together
    """
    show_keyword(config, candidate_k)

    print('===开始推理测试===\n')

    def index_2_word(lang, seq):
        """Map a tensor of token indices to words, dropping sos/eos/pad."""
        raw = [int(token.detach()) for token in seq]
        kept = [
            idx for idx in raw
            if idx not in (config.sos, config.eos, config.pad)
        ]
        return [lang['index2word'][idx] for idx in kept]

    # Load the token information saved during training/validation.
    words_data = torch.load(config.save_data)
    source_lang = words_data['src_lang']  # source-sequence token info
    target_lang = words_data['tgt_lang']  # target-sequence token info
    data_obj = words_data['data_obj']

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    checkpoint = torch.load(config.save_model_checkpoint, map_location=device)
    transformer = Transformer(  # build the transformer model
        input_vocab_num=source_lang['n_words'],
        target_vocab_num=target_lang['n_words'],
        src_max_len=data_obj['src_max_len'],
        tgt_max_len=data_obj['tgt_max_len'])

    transformer.load_state_dict(checkpoint['model'])  # restore trained weights
    transformer.eval()  # evaluation mode: parameters stay fixed
    print('加载预训练的模型参数完成!\n')

    infer = Translator(model=transformer,
                       tgt_max_len=data_obj['tgt_max_len'])  # inference wrapper

    processor = DataProcess()
    *_, src_tgt_seq = processor.word_2_index(
        config.test_input,
        config.test_target,
        source_lang,
        target_lang,
        father=config.infer_father if config.enc_father else None)  # test data
    # Batch the test data.
    data_loader = DataLoader(dataset=src_tgt_seq,
                             batch_size=candidate_k,
                             shuffle=False,
                             drop_last=False)

    all_sent_gene_p = []  # beam generation probability of every sentence
    all_sent_scores, all_sent_scores_sf = [], []  # scores of every generated sentence
    with open(config.infer_result, 'w', encoding='utf-8') as f:
        for batch_data in tqdm(data_loader,
                               ncols=1,
                               desc='推理测试中...',
                               leave=True):  # iterate inference batches

            if config.has_father:  # also encode the parent concepts
                src_seq, tgt_seq, father_seq, none_seq = Tools(
                ).batch_2_tensor(batch_data, source_lang['word2index']['none'])
                src_pos = Tools().seq_2_pos(src_seq)  # source position vector
                father_pos = Tools().seq_2_pos(father_seq)  # parent position vector
                (batch_pre_seq, batch_sent_scores, batch_sent_scores_sf,
                 batch_gene_p) = infer.translate_batch(
                     src_seq,
                     src_pos,
                     father_seq=father_seq,
                     father_pos=father_pos,
                     none_mask=none_seq)  # predictions and their probabilities
            else:  # no parent-concept encoding
                src_seq, tgt_seq = Tools().batch_2_tensor(
                    batch_data)  # source and reference target sequences
                src_pos = Tools().seq_2_pos(src_seq)  # source position vector
                (batch_pre_seq, batch_sent_scores, batch_sent_scores_sf,
                 batch_gene_p) = infer.translate_batch(
                     src_seq, src_pos)  # predictions and their probabilities

            all_sent_gene_p += batch_gene_p
            all_sent_scores += [
                round(score.cpu().item(), 3) for score in batch_sent_scores
            ]
            all_sent_scores_sf += [
                round(score.cpu().item(), 3) for score in batch_sent_scores_sf
            ]

            pre_word_seq = None  # word form of the (last) predicted candidate
            for pos, candidates in enumerate(batch_pre_seq):
                src_word_seq = index_2_word(source_lang,
                                            src_seq[pos])  # cleaned source words
                tgt_word_seq = index_2_word(target_lang,
                                            tgt_seq[pos])  # cleaned target words
                for candidate in candidates:  # keep only the last cleaned candidate
                    kept = [
                        tok for tok in candidate
                        if tok not in (config.sos, config.eos, config.pad)
                    ]
                    pre_word_seq = [
                        target_lang['index2word'][tok] for tok in kept
                    ]

                f.write('输入序列->:' + ' '.join(src_word_seq) + '\n')  # source line
                f.write('->预测序列:' + ' '.join(pre_word_seq) + '\n')  # prediction line
                f.write('==目标序列:' + ' '.join(tgt_word_seq) + '\n\n')  # reference line

    with open(config.generate_pro, 'w',
              encoding='utf-8') as gene_f:  # per-sentence generation scores
        for idx, p in enumerate(all_sent_scores_sf):
            gene_f.write('{} {}\n'.format(idx, p))  # same bytes as four writes

    print('推理预测序列完毕!')