Example #1
parser.add_argument("--lr",  # flag name assumed; the snippet begins mid-way through the argparse setup
                    type=float,
                    default=0.0003,
                    help='learning rate for optimization')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
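    # benchmark mode lets cuDNN auto-tune its convolution algorithms; it pays
    # off when input shapes stay roughly constant across batches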
    torch.backends.cudnn.benchmark = True

if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech as SpeechDataset, vocab, idx2char
else:
    from datasets.mb_speech import MBSpeech as SpeechDataset, vocab, idx2char

train_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), SpeedChange(), ExtractSpeechFeatures()]))
valid_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), ExtractSpeechFeatures()]))
indices = list(range(len(train_dataset)))
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])

train_data_loader = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               collate_fn=collate_fn,
                               num_workers=args.dataload_workers_nums,
                               sampler=train_sampler)
valid_data_loader = DataLoader(valid_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               # snippet is cut off here in the source; completed
                               # to mirror the train loader above
                               collate_fn=collate_fn,
                               num_workers=args.dataload_workers_nums,
                               sampler=valid_sampler)
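
Both loaders pass a collate_fn that the snippet never shows. As a rough sketch, assuming each dataset item is a dict carrying a variable-length 'input' feature matrix and an integer-encoded 'target' sequence (field names inferred from this snippet, not confirmed against the original source), a padding collate function might look like:

import torch

def collate_fn(batch):
    # Pad variable-length feature matrices to the longest item in the batch
    # and flatten targets the way CTC-style losses expect.
    max_t = max(item['input'].shape[0] for item in batch)
    n_features = batch[0]['input'].shape[1]
    inputs = torch.zeros(len(batch), max_t, n_features)
    input_lengths, targets, target_lengths = [], [], []
    for i, item in enumerate(batch):
        t = item['input'].shape[0]
        inputs[i, :t] = torch.from_numpy(item['input']).float()
        input_lengths.append(t)
        targets.extend(item['target'])
        target_lengths.append(len(item['target']))
    return (inputs, torch.tensor(input_lengths),
            torch.tensor(targets, dtype=torch.long),
            torch.tensor(target_lengths))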
Example #2
    if args.lm:
        # The start of this call is cut off above; the decoder class and the
        # lm_path keyword are assumed from the deepspeech.pytorch decoder API.
        decoder = BeamCTCDecoder(vocab, lm_path=args.lm, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=40, cutoff_prob=1.0, beam_width=1000)
    else:
        decoder = GreedyDecoder(labels=vocab)

    t = time.time()
    decoded_output, _ = decoder.decode(outputs)
    print("decode time: %.3fs" % (time.time() - t))

    return decoded_output[0][0]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--checkpoint", type=str, required=True, help='checkpoint file to test')
    parser.add_argument("--lm", type=str, required=False, help='link to KenLM 5-gram binary language model')
    parser.add_argument("--alpha", type=float, default=0.3, help='alpha for CTC decode')
    parser.add_argument("--beta", type=float, default=1.85, help='beta for CTC decode')
    parser.add_argument("audio", help='a WAV file')
    args = parser.parse_args()

    data = {
        'fname': args.audio,
        'text': ''
    }
    data = Compose([LoadAudio(), ExtractSpeechFeatures()])(data)

    result = transcribe(data, args)

    print("Predicted:")
    print(result)
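
Assuming the example above is saved as a script such as transcribe.py (the file name is an assumption, not given in the source), it would be invoked along the lines of python transcribe.py --checkpoint=model.pth --lm=lm.binary audio.wav; --lm is optional, since the GreedyDecoder branch covers decoding without a language model.

Example #3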
if args.dataset == 'librispeech':
    # the opening lines are cut off in the source; reconstructed to mirror
    # the branches below
    from datasets.libri_speech import LibriSpeech
    dataset = ConcatDataset([
        LibriSpeech(name='train-clean-100'),
        LibriSpeech(name='train-clean-360'),
        LibriSpeech(name='train-other-500'),
        LibriSpeech(name='dev-clean')
    ])
elif args.dataset == 'backgroundsounds':
    from datasets.background_sounds import BackgroundSounds
    dataset = BackgroundSounds(is_random=False)
elif args.dataset == 'bolorspeech':
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset([
        BolorSpeech(name='train'),
        BolorSpeech(name='train2'),
        BolorSpeech(name='test'),
        BolorSpeech(name='demo'),
        BolorSpeech(name='annotation'),
        BolorSpeech(name='annotation-1111')
    ])
else:
    print("unknown dataset!")
    import sys
    sys.exit(1)


transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    spectrogram = data['input']  # magnitude spectrogram from ComputeMagSpectrogram
    np.save(fname.replace('.wav', '.npy'), spectrogram)
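
The loop above caches each file's spectrogram as a sibling .npy, so later runs can skip feature extraction. A dataset could then prefer the cache, as in this sketch (the naming convention mirrors the np.save call above; the helper name load_cached_features is made up):

import os
import numpy as np

def load_cached_features(fname):
    # Return the spectrogram written by the preprocessing loop above,
    # or None so the caller can fall back to on-the-fly extraction.
    cached = fname.replace('.wav', '.npy')
    if os.path.exists(cached):
        return np.load(cached)
    return None

Example #4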
parser.add_argument("--lr",  # flag name assumed, as in Example #1
                    type=float,
                    default=0.001,
                    help='learning rate for optimization')
args = parser.parse_args()

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech as SpeechDataset, vocab, idx2char
else:
    from datasets.mb_speech import MBSpeech as SpeechDataset, vocab, idx2char

dataset = SpeechDataset(transform=LoadAudio())
indices = list(range(len(dataset)))
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])

train_data_loader = DataLoader(dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               collate_fn=collate_fn,
                               num_workers=args.dataload_workers_nums,
                               sampler=train_sampler)
valid_data_loader = DataLoader(dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               collate_fn=collate_fn,
                               num_workers=args.dataload_workers_nums,
                               sampler=valid_sampler)  # completed: snippet was cut off mid-call
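
Note how the hold-out works in both training examples: the last args.batch_size indices form the single validation batch, and because indices is never shuffled before slicing, that hold-out is always the dataset's final entries. Shuffling once with a fixed seed before carving off the slice would make the validation set a random but reproducible sample, e.g.:

import random

random.seed(42)          # fixed seed keeps the split reproducible across runs
random.shuffle(indices)  # randomize order before carving off the hold-out
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])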