Example #1
def main():

    # load the datasets
    trainset, validset, testset = [], [], []
    if args.inference:  # at inference time, load only the test set
        with open(args.testset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                testset.append(json.loads(line))
        print('Loaded %d test examples' % len(testset))
    else:  # at training time, load the training and validation sets
        with open(args.trainset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                trainset.append(json.loads(line))
        print('Loaded %d training examples' % len(trainset))
        with open(args.validset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                validset.append(json.loads(line))
        print('Loaded %d validation examples' % len(validset))

    # load the vocabulary and word embeddings
    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print('Loaded vocabulary: %d words' % len(vocab))
    print('Embedding dimension: %d' % config.embedding_size)

    # build a word2index/index2word helper from the vocabulary
    sentence_processor = SentenceProcessor(vocab, config.pad_id,
                                           config.start_id, config.end_id,
                                           config.unk_id)

    # create the model
    model = Model(config)
    epoch = 0  # number of passes over the training set
    global_step = 0  # number of parameter updates

    # load the model
    if os.path.isfile(args.model_path):  # if a checkpoint exists at this path, load it
        epoch, global_step = model.load_model(args.model_path)
        print('Model loaded')
        # directory where checkpoints are logged
        log_dir = os.path.split(args.model_path)[0]
    elif args.inference:  # no checkpoint exists, but inference was requested: nothing to test
        print('Please test a trained model!')
        return
    else:  # no checkpoint: start training from scratch and load the pretrained word embeddings
        model.embedding.embedding.weight = torch.nn.Parameter(
            torch.FloatTensor(embeds))
        print('Model initialized')
        # directory where checkpoints are logged
        log_dir = os.path.join(args.log_path, 'run' + str(int(time.time())))
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

    if args.gpu:
        model.to('cuda')  # move the model parameters to the GPU

    model.print_parameters()  # print the number of model parameters

    # configure the optimizer
    optim = Optim(config.method, config.lr, config.lr_decay,
                  config.weight_decay, config.max_grad_norm)
    optim.set_parameters(model.parameters())  # hand the model parameters to the optimizer
    optim.update_lr(epoch)  # the learning rate is updated every epoch

    # training
    if not args.inference:

        summary_writer = SummaryWriter(os.path.join(
            log_dir, 'summary'))  # directory for the tensorboard logs

        dp_train = DataProcessor(trainset, config.batch_size,
                                 sentence_processor)  # batch iterator over the data
        dp_valid = DataProcessor(validset,
                                 config.batch_size,
                                 sentence_processor,
                                 shuffle=False)

        while epoch < args.max_epoch:  # maximum number of training epochs

            model.train()  # switch to training mode

            for data in dp_train.get_batch_data():

                start_time = time.time()

                feed_data = prepare_feed_data(data)
                nll_loss, ppl = train(model, feed_data)

                optim.optimizer.zero_grad()  # clear gradients
                nll_loss.mean().backward()  # backpropagate
                optim.step()  # update parameters

                use_time = time.time() - start_time

                global_step += 1  # one more parameter update

                # log the current state
                if global_step % args.print_per_step == 0:
                    print(
                        'epoch: %d, global_step: %d, lr: %g, nll_loss: %.2f, ppl: %.2f, time: %.2fs'
                        %
                        (epoch, global_step, optim.lr, nll_loss.mean().item(),
                         ppl.mean().exp().item(), use_time))
                    summary_writer.add_scalar('train_nll',
                                              nll_loss.mean().item(),
                                              global_step)
                    summary_writer.add_scalar('train_ppl',
                                              ppl.mean().exp().item(),
                                              global_step)
                    summary_writer.flush()  # write the buffer to disk

                if global_step % args.log_per_step == 0:  # save the model

                    log_file = os.path.join(
                        log_dir, '%03d%012d.model' % (epoch, global_step))
                    model.save_model(epoch, global_step, log_file)

                    # compute perplexity on the validation set
                    model.eval()
                    nll_loss, ppl = valid(model, dp_valid)
                    model.train()
                    print('Validation NLL loss: %g, perplexity: %g' %
                          (nll_loss, np.exp(ppl)))
                    summary_writer.add_scalar('valid_nll', nll_loss,
                                              global_step)
                    summary_writer.add_scalar('valid_ppl', np.exp(ppl),
                                              global_step)
                    summary_writer.flush()  # write the buffer to disk

            epoch += 1  # one more pass over the dataset
            optim.update_lr(epoch)  # adjust the learning rate

            # save the model
            log_file = os.path.join(log_dir,
                                    '%03d%012d.model' % (epoch, global_step))
            model.save_model(epoch, global_step, log_file)

            # compute perplexity on the validation set
            model.eval()
            nll_loss, ppl = valid(model, dp_valid)
            print('Validation NLL loss: %g, perplexity: %g' % (nll_loss, np.exp(ppl)))
            summary_writer.add_scalar('valid_nll', nll_loss, global_step)
            summary_writer.add_scalar('valid_ppl', np.exp(ppl), global_step)
            summary_writer.flush()  # write the buffer to disk

        summary_writer.close()

    else:  # inference

        if not os.path.exists(args.result_path):  # create the results directory
            os.makedirs(args.result_path)

        result_file = os.path.join(args.result_path, '%03d%012d.txt' %
                                   (epoch, global_step))  # name the result file
        fw = open(result_file, 'w', encoding='utf8')

        dp_test = DataProcessor(testset,
                                config.batch_size,
                                sentence_processor,
                                shuffle=False)

        model.eval()  # switch to eval mode; this disables dropout, etc.

        nll_loss, ppl = valid(model, dp_test)  # evaluate perplexity
        print('Test NLL loss: %g, perplexity: %g' % (nll_loss, np.exp(ppl)))

        len_results = []  # track the lengths of the generated results

        for data in dp_test.get_batch_data():

            posts = data['str_posts']
            responses = data['str_responses']

            feed_data = prepare_feed_data(data, inference=True)
            results = test(model, feed_data)  # run the model; output shape [batch, len_decoder]

            for idx, result in enumerate(results):
                new_data = {}
                new_data['post'] = posts[idx]
                new_data['response'] = responses[idx]
                new_data['result'] = sentence_processor.index2word(
                    result)  # convert the output indices back to words
                len_results.append(len(new_data['result']))
                fw.write(json.dumps(new_data) + '\n')

        fw.close()
        print('Average generated sentence length: %d' %
              (1.0 * sum(len_results) / len(len_results)))
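
The examples load pretrained vectors by assigning a new torch.nn.Parameter to the embedding weight (and, in Examples 2 and 3, freezing the VAD table via requires_grad = False). Below is a minimal sketch of the equivalent initialization with nn.Embedding.from_pretrained; the file format (one word per line followed by its space-separated vector) matches the parsing loop above, while the function name and the freeze default are illustrative choices, not part of the repository.

import torch
import torch.nn as nn

def load_pretrained_embedding(path, embedding_size, freeze=False):
    # each line of the embedding file: "<word> <v1> <v2> ... <vN>"
    vocab, vectors = [], []
    with open(path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = [float(v) for v in line[line.find(' ') + 1:].split()]
            assert len(vec) == embedding_size  # check the embedding dimension
            vocab.append(word)
            vectors.append(vec)
    # from_pretrained copies the weights into the layer; freeze=True is the
    # idiomatic way to get weight.requires_grad = False, as done for the VAD
    # table in Examples 2 and 3
    embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze=freeze)
    return vocab, embedding
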
Example #2
def main():
    trainset, validset, testset = [], [], []
    if args.inference:  # at inference time, load only the test set
        with open(args.testset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                testset.append(json.loads(line))
        print(f'Loaded {len(testset)} test examples')
    else:  # at training time, load the training and validation sets
        with open(args.trainset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                trainset.append(json.loads(line))
        print(f'Loaded {len(trainset)} training examples')
        with open(args.validset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                validset.append(json.loads(line))
        print(f'Loaded {len(validset)} validation examples')

    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print(f'Loaded vocabulary: {len(vocab)} words')
    print(f'Embedding dimension: {config.embedding_size}')

    vads = []
    with open(args.vad_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            vad = line[line.find(' ') + 1:].split()
            vad = [float(item) for item in vad]
            assert len(vad) == config.affect_embedding_size
            vads.append(vad)
    print(f'Loaded VAD lexicon: {len(vads)} entries')
    print(f'VAD dimension: {config.affect_embedding_size}')

    # build a word2index/index2word helper from the vocabulary
    sentence_processor = SentenceProcessor(vocab, config.pad_id,
                                           config.start_id, config.end_id,
                                           config.unk_id)

    model = Model(config)
    model.print_parameters()  # print the number of model parameters
    epoch = 0  # number of passes over the training set
    global_step = 0  # number of parameter updates

    # load the model
    if os.path.isfile(args.model_path):  # if a checkpoint exists at this path, load it
        epoch, global_step = model.load_model(args.model_path)
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model loaded')
        log_dir = os.path.split(args.model_path)[0]
    elif args.inference:  # no checkpoint exists, but inference was requested: nothing to test
        print('Please test a trained model!')
        return
    else:  # no checkpoint: start training from scratch and load the pretrained word embeddings
        model.embedding.embedding.weight = torch.nn.Parameter(
            torch.FloatTensor(embeds))
        model.affect_embedding.embedding.weight = torch.nn.Parameter(
            torch.tensor(vads).float())
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model initialized')
        log_dir = os.path.join(args.log_path, 'run' + str(int(time.time())))
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
    if args.gpu:
        model.to('cuda')  # move the model parameters to the GPU

    # configure the optimizer
    optim = Optim(config.method, config.lr, config.lr_decay,
                  config.weight_decay, config.max_grad_norm)
    optim.set_parameters(model.parameters())  # hand the model parameters to the optimizer
    optim.update_lr(epoch)  # the learning rate is updated every epoch

    # training
    if not args.inference:
        dp_train = DataProcessor(trainset, config.batch_size,
                                 sentence_processor)  # batch iterator over the data
        dp_valid = DataProcessor(validset,
                                 config.batch_size,
                                 sentence_processor,
                                 shuffle=False)

        while epoch < args.max_epoch:  # maximum number of training epochs
            model.train()  # switch to training mode
            for data in dp_train.get_batch_data():
                start_time = time.time()
                feed_data = prepare_feed_data(data)
                nll_loss, precision = train(model, feed_data)
                nll_loss.mean().backward()  # backpropagate
                optim.step()  # update parameters
                optim.optimizer.zero_grad()  # clear gradients
                use_time = time.time() - start_time

                global_step += 1  # one more parameter update
                if global_step % args.print_per_step == 0:
                    print(
                        'epoch: {:d}, global_step: {:d}, lr: {:g}, nll_loss: {:.2f}, precision: {:.2%},'
                        ' time: {:.2f}s/step'.format(epoch, global_step,
                                                     optim.lr,
                                                     nll_loss.mean().item(),
                                                     precision.mean().item(),
                                                     use_time))

            epoch += 1  # one more pass over the dataset
            optim.update_lr(epoch)  # adjust the learning rate

            log_file = os.path.join(
                log_dir, '{:03d}{:012d}.model'.format(epoch, global_step))
            model.save_model(epoch, global_step, log_file)
            model.eval()
            nll_loss, precision = valid(model, dp_valid)
            print('Validation NLL loss: {:g}, accuracy: {:.2%}'.format(
                nll_loss, precision))

    else:  # inference
        if not os.path.exists(args.result_path):  # create the results directory
            os.makedirs(args.result_path)

        result_file = os.path.join(args.result_path,
                                   '{:03d}{:012d}.txt'.format(
                                       epoch, global_step))  # name the result file
        fw = open(result_file, 'w', encoding='utf8')
        dp_test = DataProcessor(testset,
                                config.batch_size,
                                sentence_processor,
                                shuffle=False)

        model.eval()
        nll_loss, precision = valid(model, dp_test)
        print('Test NLL loss: {:g}, accuracy: {:.2%}'.format(nll_loss, precision))

        for data in dp_test.get_batch_data():
            texts = data['str_texts']
            emotions = data['emotions']
            feed_data = prepare_feed_data(data)
            results = test(model, feed_data)  # run the model; output shape [batch]

            for idx, result in enumerate(results):
                new_data = dict()
                new_data['text'] = texts[idx]
                new_data['emotion'] = emotions[idx]
                new_data['result'] = result  # the model's predicted label
                fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')
        fw.close()
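
In Example #2 the train() and valid() helpers (defined elsewhere in the repository) return a per-example NLL loss and an accuracy value for an emotion classifier. A hedged sketch of what one such training step could compute is shown below; the 'y' key, the logits shape, and the model call signature are assumptions made for illustration, not the repository's actual interface.

import torch
import torch.nn.functional as F

def train_step(model, feed_data):
    # assumed interface: the model maps the feed dict to logits [batch, num_classes]
    logits = model(feed_data)
    labels = feed_data['y']  # hypothetical key holding the gold emotion labels, shape [batch]
    # per-example loss, so the caller can do nll_loss.mean().backward() as above
    nll_loss = F.cross_entropy(logits, labels, reduction='none')
    # per-example hit/miss; .mean() gives the batch accuracy printed in the example
    precision = (logits.argmax(dim=1) == labels).float()
    return nll_loss, precision
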
Example #3
def main():
    trainset, validset, testset = [], [], []
    if args.inference:  # at inference time, load only the test set
        with open(args.testset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                testset.append(json.loads(line))
        print(f'Loaded {len(testset)} test examples')
    else:  # at training time, load the training and validation sets
        with open(args.trainset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                trainset.append(json.loads(line))
        print(f'Loaded {len(trainset)} training examples')
        with open(args.validset_path, 'r', encoding='utf8') as fr:
            for line in fr:
                validset.append(json.loads(line))
        print(f'Loaded {len(validset)} validation examples')

    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print(f'Loaded vocabulary: {len(vocab)} words')
    print(f'Embedding dimension: {config.embedding_size}')

    vads = []
    with open(args.vad_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            vad = line[line.find(' ') + 1:].split()
            vad = [float(item) for item in vad]
            assert len(vad) == config.affect_embedding_size
            vads.append(vad)
    print(f'Loaded VAD lexicon: {len(vads)} entries')
    print(f'VAD dimension: {config.affect_embedding_size}')

    # build a word2index/index2word helper from the vocabulary
    sentence_processor = SentenceProcessor(vocab, config.pad_id,
                                           config.start_id, config.end_id,
                                           config.unk_id)

    model = Model(config)
    model.print_parameters()  # print the number of model parameters
    epoch = 0  # number of passes over the training set
    global_step = 0  # number of parameter updates

    # load the model
    if os.path.isfile(args.model_path):  # if a checkpoint exists at this path, load it
        epoch, global_step = model.load_model(args.model_path)
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model loaded')
        log_dir = os.path.split(args.model_path)[0]
    elif args.inference:  # no checkpoint exists, but inference was requested: nothing to test
        print('Please test a trained model!')
        return
    else:  # no checkpoint: start training from scratch and load the pretrained word embeddings
        model.embedding.embedding.weight = torch.nn.Parameter(
            torch.FloatTensor(embeds))
        model.affect_embedding.embedding.weight = torch.nn.Parameter(
            torch.FloatTensor(vads))
        model.affect_embedding.embedding.weight.requires_grad = False
        print('Model initialized')
        log_dir = os.path.join(args.log_path, 'run' + str(int(time.time())))
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

    if args.gpu:
        model.to('cuda')  # move the model parameters to the GPU

    # configure the optimizer
    optim = Optim(config.method, config.lr, config.lr_decay,
                  config.weight_decay, config.eps, config.max_grad_norm)
    optim.set_parameters(model.parameters())  # hand the model parameters to the optimizer
    optim.update_lr(epoch)  # the learning rate is updated every epoch

    # training
    if not args.inference:
        summary_writer = SummaryWriter(os.path.join(
            log_dir, 'summary'))  # directory for the tensorboard logs
        dp_train = DataProcessor(trainset, config.batch_size,
                                 sentence_processor)  # batch iterator over the data
        dp_valid = DataProcessor(validset,
                                 config.batch_size,
                                 sentence_processor,
                                 shuffle=False)

        while epoch < args.max_epoch:  # maximum number of training epochs
            model.train()  # switch to training mode
            for data in dp_train.get_batch_data():
                start_time = time.time()
                feed_data = prepare_feed_data(data)
                rl_loss, reward, loss, nll_loss, kld_loss, kld_weight, ppl = \
                    train(model, feed_data, global_step)
                if args.reinforce:
                    rl_loss.mean().backward()
                else:
                    loss.mean().backward()  # backpropagate
                optim.step()  # update parameters
                optim.optimizer.zero_grad()  # clear gradients
                use_time = time.time() - start_time

                global_step += 1  # one more parameter update
                if global_step % args.print_per_step == 0:
                    print(
                        'epoch: {:d}, global_step: {:d}, lr: {:g}, rl_loss: {:.2f}, reward: {:.2f}, nll_loss: {:.2f},'
                        ' kld_loss: {:.2f}, kld_weight: {:g}, ppl: {:.2f}, time: {:.2f}s/step'
                        .format(epoch, global_step, optim.lr,
                                rl_loss.mean().item(),
                                reward.mean().item(),
                                nll_loss.mean().item(),
                                kld_loss.mean().item(), kld_weight,
                                ppl.mean().exp().item(), use_time))
                    summary_writer.add_scalar('train_rl',
                                              rl_loss.mean().item(),
                                              global_step)
                    summary_writer.add_scalar('train_reward',
                                              reward.mean().item(),
                                              global_step)
                    summary_writer.add_scalar('train_nll',
                                              nll_loss.mean().item(),
                                              global_step)
                    summary_writer.add_scalar('train_kld',
                                              kld_loss.mean().item(),
                                              global_step)
                    summary_writer.add_scalar('train_weight', kld_weight,
                                              global_step)
                    summary_writer.add_scalar('train_ppl',
                                              ppl.mean().exp().item(),
                                              global_step)
                    summary_writer.flush()  # write the buffer to disk

                if global_step % args.log_per_step == 0 \
                        or (global_step % (2*config.kl_step) - config.kl_step) == config.kl_step // 2:
                    log_file = os.path.join(
                        log_dir,
                        '{:03d}{:012d}.model'.format(epoch, global_step))
                    model.save_model(epoch, global_step, log_file)

                    model.eval()
                    reward, nll_loss, kld_loss, ppl = valid(
                        model, dp_valid, global_step - 1)
                    model.train()
                    print(
                        'Validation reward: {:g}, NLL loss: {:g}, KL loss: {:g}, PPL: {:g}'
                        .format(reward, nll_loss, kld_loss, np.exp(ppl)))
                    summary_writer.add_scalar('valid_reward', reward,
                                              global_step)
                    summary_writer.add_scalar('valid_nll', nll_loss,
                                              global_step)
                    summary_writer.add_scalar('valid_kld', kld_loss,
                                              global_step)
                    summary_writer.add_scalar('valid_ppl', np.exp(ppl),
                                              global_step)
                    summary_writer.flush()  # write the buffer to disk

            epoch += 1  # one more pass over the dataset
            optim.update_lr(epoch)  # adjust the learning rate

            log_file = os.path.join(
                log_dir, '{:03d}{:012d}.model'.format(epoch, global_step))
            model.save_model(epoch, global_step, log_file)

            model.eval()
            reward, nll_loss, kld_loss, ppl = valid(model, dp_valid,
                                                    global_step - 1)
            print('Validation reward: {:g}, NLL loss: {:g}, KL loss: {:g}, PPL: {:g}'.
                  format(reward, nll_loss, kld_loss, np.exp(ppl)))
            summary_writer.add_scalar('valid_reward', reward, global_step)
            summary_writer.add_scalar('valid_nll', nll_loss, global_step)
            summary_writer.add_scalar('valid_kld', kld_loss, global_step)
            summary_writer.add_scalar('valid_ppl', np.exp(ppl), global_step)
            summary_writer.flush()  # write the buffer to disk

        summary_writer.close()
    else:  # inference
        if not os.path.exists(args.result_path):  # create the results directory
            os.makedirs(args.result_path)

        result_file = os.path.join(args.result_path,
                                   '{:03d}{:012d}.txt'.format(
                                       epoch, global_step))  # name the result file
        fw = open(result_file, 'w', encoding='utf8')
        dp_test = DataProcessor(testset,
                                config.batch_size,
                                sentence_processor,
                                shuffle=False)

        model.eval()  # switch to eval mode; this disables dropout, etc.
        reward, nll_loss, kld_loss, ppl = valid(model, dp_test,
                                                global_step - 1)
        print('Test reward: {:g}, NLL loss: {:g}, KL loss: {:g}, PPL: {:g}'.
              format(reward, nll_loss, kld_loss, np.exp(ppl)))

        len_results = []  # track the lengths of the generated results
        for data in dp_test.get_batch_data():
            posts = data['str_posts']
            responses = data['str_responses']
            feed_data = prepare_feed_data(data, inference=True)
            results = test(model, feed_data)  # run the model; output shape [batch, len_decoder]

            for idx, result in enumerate(results):
                new_data = dict()
                new_data['post'] = posts[idx]
                new_data['response'] = responses[idx]
                new_data['result'] = sentence_processor.index2word(
                    result)  # convert the output indices back to words
                len_results.append(len(new_data['result']))
                fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')

        fw.close()
        print(f'Average generated sentence length: {1.0 * sum(len_results) / len(len_results)}')
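
Example #3 trains a CVAE-style generator: train() returns a kld_weight that depends on global_step, and the extra checkpoint condition involving 2 * config.kl_step suggests a KL annealing schedule with a cycle of that length. The sketch below shows one common cyclical annealing rule purely as an illustration of the idea; the actual schedule lives inside the repository's train() and may differ.

def kl_anneal_weight(global_step, kl_step):
    # one cycle lasts 2 * kl_step updates: the weight grows linearly from 0 to 1
    # over the first kl_step updates, then stays at 1 for the rest of the cycle
    step_in_cycle = global_step % (2 * kl_step)
    return min(1.0, step_in_cycle / kl_step)

# e.g. with kl_step = 10000: the weight is 0.5 at step 5000, 1.0 from step 10000
# to 19999, and the schedule restarts from 0 at step 20000
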
Example #4
def filter_by_emotion(args):
    vocab, embeds = [], []
    with open(args.embed_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            word = line[:line.find(' ')]
            vec = line[line.find(' ') + 1:].split()
            embed = [float(v) for v in vec]
            assert len(embed) == config.embedding_size  # check the embedding dimension
            vocab.append(word)
            embeds.append(embed)
    print(f'Loaded vocabulary: {len(vocab)} words')
    print(f'Embedding dimension: {config.embedding_size}')
    vads = []
    with open(args.vad_path, 'r', encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            vad = line[line.find(' ') + 1:].split()
            vad = [float(item) for item in vad]
            assert len(vad) == config.affect_embedding_size
            vads.append(vad)
    print(f'Loaded VAD lexicon: {len(vads)} entries')
    print(f'VAD dimension: {config.affect_embedding_size}')
    sentence_processor = SentenceProcessor(vocab, config.pad_id,
                                           config.start_id, config.end_id,
                                           config.unk_id)
    model = Model(config)
    if args.gpu:
        model.cuda()
    num = 0
    with open(args.data_path, 'r', encoding='utf8') as fr:
        with open(args.save_path, 'w', encoding='utf8') as fw:
            for line in fr:
                data = json.loads(line.strip())
                post = data['post']
                response = data['response']
                id_post, len_post = sentence_processor.word2index(post)
                id_response, len_response = sentence_processor.word2index(
                    response)
                max_len = max(len_post, len_response) + 2
                id_post = sentence_processor.pad_sentence(id_post, max_len)
                id_response = sentence_processor.pad_sentence(
                    id_response, max_len)
                texts = [id_post, id_response]
                lengths = [len_post + 2, len_response + 2]
                feed_data = {
                    'x': torch.tensor(texts).long(),
                    'len_x': torch.tensor(lengths).long()
                }
                if args.gpu:
                    for key, value in feed_data.items():
                        feed_data[key] = value.cuda()
                result = model(feed_data).argmax(1).detach().tolist()
                if sum(result) != 0:
                    new_data = {
                        'post': post,
                        'post_e': result[0],
                        'response': response,
                        'response_e': result[1]
                    }
                    fw.write(json.dumps(new_data, ensure_ascii=False) + '\n')
                    num += 1
    print(f'{num} examples remaining')
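
filter_by_emotion scores each post/response pair with a plain forward pass and relies on .detach() alone. A small hedged sketch of the same scoring wrapped for inference follows, using model.eval() and torch.no_grad() to disable dropout and gradient tracking; classify_pair and its parameters are illustrative names, and only the 'x'/'len_x' keys come from the code above.

import torch

def classify_pair(model, id_post, id_response, lengths, use_gpu=False):
    # batch the padded post and response together, as filter_by_emotion does
    feed_data = {
        'x': torch.tensor([id_post, id_response]).long(),
        'len_x': torch.tensor(lengths).long(),
    }
    if use_gpu:
        feed_data = {key: value.cuda() for key, value in feed_data.items()}
    model.eval()  # disable dropout etc. for deterministic predictions
    with torch.no_grad():  # no computation graph is needed when only filtering data
        logits = model(feed_data)
    return logits.argmax(1).tolist()  # [post_label, response_label]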