# NOTE(review): this chunk is truncated on both sides — the first line below is
# the tail of a parser.add_argument(...) call whose opening is outside this view,
# and the last line is an unfinished DataLoader(...) call that continues past it.

# Tail of an add_argument call: learning-rate option (default 3e-4).
default=0.0003, help='learning rate for optimization')
args = parser.parse_args()

# Enable the cuDNN autotuner when a GPU is present (picks fastest kernels,
# beneficial for fixed-size inputs).
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

# Select the dataset module; both are expected to expose the same
# SpeechDataset / vocab / idx2char API.
if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech as SpeechDataset, vocab, idx2char
else:
    from datasets.mb_speech import MBSpeech as SpeechDataset, vocab, idx2char

# Training pipeline applies SpeedChange() augmentation; validation does not.
train_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), SpeedChange(), ExtractSpeechFeatures()]))
valid_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), ExtractSpeechFeatures()]))

# Split by index: everything except the final batch is training data; the last
# args.batch_size samples form a (single-batch) validation set.
# Assumes train_dataset and valid_dataset index the same underlying samples —
# TODO confirm against the dataset classes.
indices = list(range(len(train_dataset)))
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])

# shuffle must stay False because a sampler is supplied (the two are mutually
# exclusive in PyTorch's DataLoader).
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False,
                               collate_fn=collate_fn, num_workers=args.dataload_workers_nums,
                               sampler=train_sampler)
# NOTE(review): statement truncated here in this chunk — the remaining kwargs
# (presumably collate_fn, num_workers, sampler=valid_sampler) are outside this view.
valid_data_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False,
# NOTE(review): chunk truncated at the start — the first lines below are the
# closing kwargs of a beam-search decoder constructed in an unseen branch
# (presumably `if args.lm:`) inside a function, presumably `transcribe` given
# the call in __main__ below; indentation here is reconstructed.
        alpha=args.alpha, beta=args.beta,
        cutoff_top_n=40, cutoff_prob=1.0, beam_width=1000)
    else:
        # No language model given: fall back to greedy (best-path) decoding.
        decoder = GreedyDecoder(labels=vocab)

    # Decode the network outputs and report wall-clock decode time.
    t = time.time()
    decoded_output, _ = decoder.decode(outputs)
    print("decode time: %.3fs" % (time.time() - t))
    # Top hypothesis of the first utterance in the batch.
    return decoded_output[0][0]


if __name__ == '__main__':
    # CLI: transcribe a single WAV file with a trained checkpoint, optionally
    # using a KenLM binary language model for CTC beam decoding.
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--checkpoint", type=str, required=True, help='checkpoint file to test')
    parser.add_argument("--lm", type=str, required=False, help='link to KenLM 5-gram binary language model')
    parser.add_argument("--alpha", type=float, default=0.3, help='alpha for CTC decode')
    parser.add_argument("--beta", type=float, default=1.85, help='beta for CTC decode')
    parser.add_argument("audio", help='a WAV file')
    args = parser.parse_args()

    # Run the same feature pipeline used at inference time; 'text' is a
    # placeholder — no transcript exists for the input audio.
    data = {'fname': args.audio, 'text': ''}
    data = Compose([LoadAudio(), ExtractSpeechFeatures()])(data)
    result = transcribe(data, args)
    print("Predicted:")
    print(result)
# NOTE(review): chunk truncated at the start — the lines below close a
# ConcatDataset([...]) list (and the leading `if args.dataset == ...` branch)
# opened outside this view; indentation is reconstructed.
    LibriSpeech(name='train-clean-100'),
    LibriSpeech(name='train-clean-360'),
    LibriSpeech(name='train-other-500'),
    LibriSpeech(name='dev-clean',)
])
elif args.dataset == 'backgroundsounds':
    # Background-sounds corpus, iterated in fixed order (is_random=False).
    from datasets.background_sounds import BackgroundSounds
    dataset = BackgroundSounds(is_random=False)
elif args.dataset == 'bolorspeech':
    # Bolor speech corpus: concatenate every available split for preprocessing.
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset([
        BolorSpeech(name='train'),
        BolorSpeech(name='train2'),
        BolorSpeech(name='test'),
        BolorSpeech(name='demo'),
        BolorSpeech(name='annotation'),
        BolorSpeech(name='annotation-1111')
    ])
else:
    print("unknown dataset!")
    import sys
    sys.exit(1)

# Precompute a spectrogram for every file and cache it next to the source
# audio as .npy. Assumes each fname ends in '.wav' — otherwise the original
# file path would be overwritten by np.save; TODO confirm.
transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for data in tqdm(dataset):
    fname = data['fname']
    data = transform(data)
    # NOTE(review): despite the variable name, this is the output of
    # ComputeMagSpectrogram() — a magnitude spectrogram, not a mel spectrogram.
    mel_spectrogram = data['input']
    np.save(fname.replace('.wav', '.npy'), mel_spectrogram)
# NOTE(review): chunk truncated on both sides — the first line below is the
# tail of a parser.add_argument(...) call, and the last line is an unfinished
# DataLoader(...) call whose remaining kwargs are outside this view.

# Tail of an add_argument call: learning-rate option (default 1e-3).
type=float, default=0.001, help='learning rate for optimization')
args = parser.parse_args()

# Enable the cuDNN autotuner when a GPU is present.
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

# Select the dataset module; both are expected to expose the same
# SpeechDataset / vocab / idx2char API.
if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech as SpeechDataset, vocab, idx2char
else:
    from datasets.mb_speech import MBSpeech as SpeechDataset, vocab, idx2char

# A single dataset instance (audio loading only) is shared by both loaders;
# train/valid are carved out of it by index: all but the last batch train,
# the final args.batch_size samples validate.
dataset = SpeechDataset(transform=LoadAudio())
indices = list(range(len(dataset)))
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])

# shuffle must stay False because a sampler is supplied (the two are mutually
# exclusive in PyTorch's DataLoader).
train_data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
                               collate_fn=collate_fn, num_workers=args.dataload_workers_nums,
                               sampler=train_sampler)
# NOTE(review): statement truncated here in this chunk — presumably continues
# with sampler=valid_sampler.
valid_data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
                               collate_fn=collate_fn, num_workers=args.dataload_workers_nums,