Ejemplo n.º 1
0
def eval(model, dataloader):
    """Greedy-decode every batch and return the averaged character error rate.

    NOTE: the name shadows the builtin ``eval``; kept unchanged for caller
    compatibility.

    Args:
        model: acoustic model returning ``(outputs, output_lengths)``.
        dataloader: yields ``(x, y, x_lens, y_lens)`` batches; its dataset
            exposes ``labels_str`` for the decoder.

    Returns:
        float: total edit distance divided by the total number of reference
        characters, then averaged across workers via ``metric_average``.
    """
    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    cer = 0
    refs = 0  # total number of reference characters seen
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            # Unflatten the concatenated targets into per-utterance label
            # sequences using the per-sample lengths.
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref)
                # BUG FIX: was ``refs += ref`` — adding a string to an int
                # raises TypeError; accumulate the reference length instead.
                refs += len(ref)
        # BUG FIX: was ``len(refs)`` on an int (TypeError); normalize by the
        # character count, guarding against an empty dataloader.
        cer /= float(max(refs, 1))
        cer = metric_average(cer, 'cer')
    model.train()
    return cer
Ejemplo n.º 2
0
def evaluate(dataloader, dev_manifest_path):
    """Average the per-utterance character error rate over a dev manifest.

    Each manifest line is ``<audio_path>,<label>``; every utterance is run
    through ``predict`` and scored against its label (normalized by label
    length), then the scores are averaged over all lines.
    """
    decoder1 = GreedyDecoder(dataloader.dataset.labels_str)
    with open(dev_manifest_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    per_utt = []  # normalized CER of each manifest line, in file order
    with torch.no_grad():
        for entry in tqdm(lines):
            path, label = entry.replace('\n', '').split(',')
            transcript = predict(path)
            per_utt.append(decoder1.cer(transcript, label) / float(len(label)))
    return sum(per_utt) / len(lines)
Ejemplo n.º 3
0
    # Validation loader: no shuffle/sampler arguments, plain sequential batches.
    valid_loader = AudioDataLoader(valid_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)

    # Default to CPU tensor types; replaced by the CUDA counterparts below
    # when a GPU is available.
    dtype = torch.FloatTensor
    ltype = torch.LongTensor

    if torch.cuda.is_available():
        print('use gpu')
        dtype = torch.cuda.FloatTensor
        ltype = torch.cuda.LongTensor

    model = WaveNet(args.layer_size, args.stack_size, args.in_channels,
                    args.res_channels)

    decoder = GreedyDecoder(labels)

    # Bookkeeping for training progress; per-epoch result tensors are sized
    # for the full run up front.
    avg_loss, start_epoch, start_iter = 0, 0, 0
    loss_results, cer_results, wer_results = torch.Tensor(
        args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    best_wer = None

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        end = time.time()
        start_epoch_time = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            # NOTE(review): this block is truncated at the chunk boundary —
            # the loop body continues beyond what is visible here.
            if i == len(train_sampler):
Ejemplo n.º 4
0
def train(args):
    """Distributed training entry point for the PPASR CTC speech model.

    Builds train/test loaders, the model, a piecewise-decayed Adam
    optimizer with gradient clipping, optionally restores a pretrained
    checkpoint, then runs the train/eval/save loop.  Only rank 0 logs,
    evaluates and saves checkpoints.
    """
    if dist.get_rank() == 0:
        # VisualDL logger — created on rank 0 only; other ranks never
        # define (or use) ``writer``.
        writer = LogWriter(logdir='log')
    # Initialize the multi-GPU (data-parallel) environment.
    dist.init_parallel_env()
    # Training data.
    train_dataset = PPASRDataset(args.train_manifest,
                                 args.dataset_vocab,
                                 mean=args.data_mean,
                                 std=args.data_std,
                                 min_duration=args.min_duration,
                                 max_duration=args.max_duration)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              collate_fn=collate_fn,
                              num_workers=args.num_workers,
                              use_shared_memory=False)
    # Shuffled variant of the same dataset, swapped in later (see loop body).
    train_loader_shuffle = DataLoader(dataset=train_dataset,
                                      batch_size=args.batch_size,
                                      collate_fn=collate_fn,
                                      num_workers=args.num_workers,
                                      shuffle=True,
                                      use_shared_memory=False)
    # Test data.
    test_dataset = PPASRDataset(args.test_manifest,
                                args.dataset_vocab,
                                mean=args.data_mean,
                                std=args.data_std)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=args.batch_size,
                             collate_fn=collate_fn,
                             num_workers=args.num_workers,
                             use_shared_memory=False)
    # Greedy decoder used for evaluation.
    greedy_decoder = GreedyDecoder(train_dataset.vocabulary)
    # Model; the data mean/std are stored inside it so later inference can
    # normalize inputs the same way.
    model = PPASR(train_dataset.vocabulary,
                  data_mean=paddle.to_tensor(args.data_mean),
                  data_std=paddle.to_tensor(args.data_std))
    if dist.get_rank() == 0:
        print('input_size的第三个参数是变长的,这里为了能查看输出的大小变化,指定了一个值!')
        paddle.summary(model, input_size=(args.batch_size, 128, 500))
    # Wrap for multi-GPU data-parallel training.
    model = paddle.DataParallel(model)
    # Optimizer setup: gradient-norm clipping at 1.0.
    clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
    # Piecewise learning rate: divided by 10 at each boundary epoch.
    boundaries = [10, 20, 50, 100]
    lr = [0.1**l * args.learning_rate for l in range(len(boundaries) + 1)]
    # Resume epoch = last number found in the pretrained-model path,
    # or -1 when training from scratch.
    last_epoch = int(re.findall(r'\d+', args.pretrained_model)
                     [-1]) if args.pretrained_model is not None else -1
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=boundaries,
                                                   values=lr,
                                                   last_epoch=last_epoch,
                                                   verbose=True)
    optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
                                      learning_rate=scheduler,
                                      grad_clip=clip)
    # CTC loss (no frame-level alignments needed).
    ctc_loss = paddle.nn.CTCLoss()
    # Restore pretrained model/optimizer state when a checkpoint is given.
    if args.pretrained_model is not None:
        model.set_state_dict(
            paddle.load(os.path.join(args.pretrained_model, 'model.pdparams')))
        optimizer.set_state_dict(
            paddle.load(os.path.join(args.pretrained_model,
                                     'optimizer.pdopt')))
    train_step = 0
    test_step = 0
    # Training loop (starts at the resumed epoch).
    for epoch in range(last_epoch, args.num_epoch):
        # Keep the unshuffled loader first, then switch to shuffling.
        # NOTE(review): with the default last_epoch of -1 the loop starts at
        # epoch -1, and when resuming from an epoch > 1 this condition never
        # fires so shuffling stays disabled — confirm this is intended.
        if epoch == 1:
            train_loader = train_loader_shuffle
        for batch_id, (inputs, labels, input_lens,
                       label_lens) in enumerate(train_loader()):
            out, out_lens = model(inputs, input_lens)
            # Reorder activations with perm=[2, 0, 1] before the CTC loss.
            out = paddle.transpose(out, perm=[2, 0, 1])
            # Loss + optimizer step.
            loss = ctc_loss(out, labels, out_lens, label_lens)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # Only rank 0 prints/logs, every 100 batches.
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                print('[%s] Train epoch %d, batch %d, loss: %f' %
                      (datetime.now(), epoch, batch_id, loss))
                writer.add_scalar('Train loss', loss, train_step)
                train_step += 1
            # Also checkpoint every 2000 batches (rank 0 only).
            if batch_id % 2000 == 0 and batch_id != 0 and dist.get_rank() == 0:
                # Save a mid-epoch checkpoint.
                save_model(args=args,
                           epoch=epoch,
                           model=model,
                           optimizer=optimizer)
        # Only rank 0 evaluates and saves at the end of each epoch.
        if dist.get_rank() == 0:
            # Run evaluation in eval mode, then restore training mode.
            model.eval()
            cer = evaluate(model, test_loader, greedy_decoder)
            print('[%s] Test epoch %d, cer: %f' % (datetime.now(), epoch, cer))
            writer.add_scalar('Test cer', cer, test_step)
            test_step += 1
            model.train()
            # Log the current learning rate.
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            # Save the end-of-epoch checkpoint.
            save_model(args=args,
                       epoch=epoch,
                       model=model,
                       optimizer=optimizer)
        scheduler.step()
Ejemplo n.º 5
0
def test_model(rank, model, test_data, criterion=nn.NLLLoss(), tokenizer=None):
    """Evaluate ``model`` on ``test_data`` for the task selected by the
    global ``args.task`` and return loss/accuracy statistics.

    Args:
        rank: worker rank, used only for logging.
        model: model under evaluation (switched to eval mode here and not
            switched back).
        test_data: iterable of ``(data, target)`` batches; for the 'voice'
            task ``data`` is itself a 4-tuple.
        criterion: loss module; note the default ``nn.NLLLoss()`` is
            instantiated once at import time and shared across calls.
        tokenizer: used only by the 'nlp' task's ``mask_tokens``.

    Returns:
        ``(test_loss, acc, acc_5, [correct, top_5, sum_loss, test_len])``.
        For 'voice' the "accuracy" slots actually carry WER/CER totals.
    """
    test_loss = 0
    correct = 0
    top_5 = 0

    correct2 = 0
    test_len = 0
    perplexity_loss = 0.

    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0

    model.eval()
    targets_list = []
    preds = []

    decoder = None

    # Greedy decoder for speech recognition ('voice'); '_' is the CTC blank.
    if args.task == 'voice':
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    for data, target in test_data:
        if args.task == 'nlp':

            # Masked-LM: corrupt the inputs and predict masked tokens;
            # otherwise the model is scored against its own input.
            data, target = mask_tokens(data, tokenizer,
                                       args) if args.mlm else (data, data)
            data, target = Variable(data).cuda(), Variable(target).cuda()

            outputs = model(data,
                            masked_lm_labels=target) if args.mlm else model(
                                data, labels=target)

            # The model returns the loss as the first element.
            loss = outputs[0]
            #criterion(outputs[1].view(-1, 30000), target.view(-1))
            test_loss += loss.data.item()
            perplexity_loss += loss.data.item()

            # assumes a 30000-token vocabulary — TODO confirm against model
            acc = accuracy(outputs[1].view(-1, 30000),
                           target.view(-1),
                           topk=(1, 5))

            correct += acc[0].item()
            top_5 += acc[1].item()

        elif args.task == 'tag':
            # Multi-label tagging: collect top-k predictions (k = number of
            # true tags per sample) for scoring after the loop.
            data, target = Variable(data).cuda(), Variable(target).cuda()
            output = model(data)
            loss = criterion(output, target)

            # we have to scan the sample one by one
            for idx, sample in enumerate(output):
                target_index = torch.nonzero(
                    target[idx]).flatten().cpu().numpy().tolist()
                maxk = len(target_index)
                preds += [sample.topk(maxk)[1].cpu().numpy().tolist()]
                targets_list += [target_index]

            test_loss += loss.data.item()

        elif args.task == 'speech':
            # Speech classification: add a channel dimension before the model.
            data, target = Variable(data).cuda(), Variable(target).cuda()
            data = torch.unsqueeze(data, 1)

            output = model(data)
            loss = criterion(output, target)

            test_loss += loss.data.item()  # Variable.data
            acc = accuracy(output, target, topk=(1, 5))

            correct += acc[0].item()
            top_5 += acc[1].item()

        elif args.task == 'text_clf':
            # Text classification with attention masks (BERT-style inputs).
            (inputs, masks) = data
            inputs, masks, target = Variable(inputs).cuda(), Variable(
                masks).cuda(), Variable(target).cuda()
            loss, output = model(inputs,
                                 token_type_ids=None,
                                 attention_mask=masks,
                                 labels=target)

            #loss = torch.mean(loss)
            test_loss += loss.item()  # Variable.data
            acc = accuracy(output, target, topk=(1, 2))

            correct += acc[0].item()
            top_5 += acc[1].item()

        elif args.task == 'voice':
            # Speech recognition: data packs inputs plus CTC bookkeeping.
            (inputs, target, input_percentages, target_sizes) = data

            # Recover absolute input lengths from percentages of max width.
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            inputs = Variable(inputs).cuda()

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(target[offset:offset + size])
                offset += size

            out, output_sizes = model(inputs, input_sizes)

            decoded_output, _ = decoder.decode(out, output_sizes)
            target_strings = decoder.convert_to_strings(split_targets)

            # Accumulate edit distances plus reference sizes for WER/CER.
            for x in range(len(target_strings)):
                transcript, reference = decoded_output[x][0], target_strings[
                    x][0]
                wer_inst = decoder.wer(transcript, reference)
                cer_inst = decoder.cer(transcript, reference)
                total_wer += wer_inst
                total_cer += cer_inst
                num_tokens += len(reference.split())
                num_chars += len(reference.replace(' ', ''))

            # Transpose the first two dims before the CTC criterion.
            outputs = out.transpose(0, 1)
            outputs = outputs.float()
            loss = criterion(outputs, target, output_sizes, target_sizes)
            test_loss += loss.data.item()
        else:
            # Default: plain classification with top-1/top-5 accuracy.
            data, target = Variable(data).cuda(), Variable(target).cuda()

            output = model(data)
            loss = criterion(output, target)

            test_loss += loss.data.item()  # Variable.data
            acc = accuracy(output, target, topk=(1, 5))

            correct += acc[0].item()
            top_5 += acc[1].item()

        test_len += len(target)

    # For voice, repurpose the accuracy slots as WER/CER totals normalized
    # by the number of reference tokens below.
    if args.task == 'voice':
        correct, top_5, test_len = float(total_wer), float(total_cer), float(
            num_tokens)

    # loss function averages over batch size
    test_loss /= len(test_data)
    perplexity_loss /= len(test_data)

    sum_loss = test_loss * test_len

    # in NLP, we care about the perplexity of the model
    # NOTE(review): divides by test_len — raises ZeroDivisionError on an
    # empty test set.
    acc = round(correct / test_len, 4)
    acc_5 = round(top_5 / test_len, 4)
    test_loss = round(test_loss, 4)

    if args.task == 'tag':
        # precision, recall, f1, sup = precision_recall_fscore_support(targets_list, preds, average='samples')
        top_5, correct, test_len = cal_accuracy(targets_list, preds)

    logging.info(
        'Rank {}: Test set: Average loss: {}, Top-1 Accuracy: {}/{} ({}), Top-5 Accuracy: {}'
        .format(rank, test_loss, correct, len(test_data.dataset), acc, acc_5))

    return test_loss, acc, acc_5, [correct, top_5, sum_loss, test_len]
Ejemplo n.º 6
0
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('audio_path',    str,  'dataset/test.wav',       '用于识别的音频路径')
add_arg('dataset_vocab', str,  'dataset/zh_vocab.json',  '数据字典的路径')
add_arg('model_path',    str,  'models/step_final/',     '模型的路径')
args = parser.parse_args()


print_arguments(args)
# Load the vocabulary list from the data dictionary file.
# SECURITY FIX: the file was previously parsed with the builtin eval(),
# which executes arbitrary code found in the file; ast.literal_eval only
# accepts Python literals and yields the same list for well-formed data.
import ast
with open(args.dataset_vocab, 'r', encoding='utf-8') as f:
    labels = ast.literal_eval(f.read())
# Map each label to its index for the decoder and model.
vocabulary = {label: i for i, label in enumerate(labels)}
# Greedy decoder for turning model output into text.
greedy_decoder = GreedyDecoder(vocabulary)

# Build the model and restore the trained weights.
model = PPASR(vocabulary)
model.set_state_dict(paddle.load(os.path.join(args.model_path, 'model.pdparams')))
# Feature mean/std were stored in the model at training time.
data_mean = model.data_mean.numpy()[0]
data_std = model.data_std.numpy()[0]
model.eval()


def infer():
    """Run recognition on ``args.audio_path``.

    NOTE(review): this function is truncated at the chunk boundary — only
    the feature-extraction prefix is visible here.
    """
    # Convert the audio file to Mel-frequency cepstral coefficients (MFCCs),
    # normalized with the mean/std restored from the model.
    mfccs = load_audio_mfcc(args.audio_path, mean=data_mean, std=data_std)

    mfccs = paddle.to_tensor(mfccs, dtype='float32')
Ejemplo n.º 7
0
add_arg('num_workers', int, 8, '读取数据的线程数量')
add_arg('test_manifest', str, 'dataset/manifest.test', '测试数据的数据列表路径')
add_arg('dataset_vocab', str, 'dataset/zh_vocab.json', '数据字典的路径')
add_arg('model_path', str, 'models/step_final/', '模型的路径')
args = parser.parse_args()

print_arguments(args)
# Test data.
test_dataset = PPASRDataset(args.test_manifest, args.dataset_vocab)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=args.batch_size,
                         collate_fn=collate_fn,
                         num_workers=args.num_workers,
                         use_shared_memory=False)
# Greedy decoder used for evaluation.
greedy_decoder = GreedyDecoder(test_dataset.vocabulary)
# Model restored from the trained checkpoint.
model = PPASR(test_dataset.vocabulary)
model.set_state_dict(
    paddle.load(os.path.join(args.model_path, 'model.pdparams')))
# Feature mean/std stored in the model at training time; configure the
# dataset's normalizer with them (must run after the weights are loaded).
test_dataset.mean = model.data_mean.numpy()[0]
test_dataset.std = model.data_std.numpy()[0]
model.eval()


# Evaluate the model.
def evaluate():
    """Compute CER over ``test_loader``.

    NOTE(review): this function is truncated at the chunk boundary — only
    the loop header is visible here.
    """
    # Per-batch CER values.
    cer = []
    for batch_id, (inputs, labels, _, _) in enumerate(tqdm(test_loader())):
        # Run recognition.