def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    """Render ground-truth and generated audio for the first `samples`
    entries of `test_set`, saving both wavs under `save_path`."""
    ksteps = model.get_step() // 1000
    for idx, (mel, wav) in enumerate(test_set, 1):
        if idx > samples:
            break
        print('\n| Generating: %i/%i' % (idx, samples))
        wav = wav[0].numpy()
        # MOL mode always trains against 16-bit targets; RAW uses hp.bits.
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits
        if hp.mu_law and hp.voc_mode != 'MOL':
            wav = decode_mu_law(wav, 2 ** bits, from_labels=True)
        else:
            wav = label_2_float(wav, bits)
        # Ground-truth reference next to the generated output.
        save_wav(wav, save_path / '%sk_steps_%s_target.wav' % (repr1(ksteps), repr1(idx)))
        if batched:
            batch_str = 'gen_batched_target%s_overlap%s' % (repr1(target), repr1(overlap))
        else:
            batch_str = 'gen_NOT_BATCHED'
        save_str = str(save_path / '%sk_steps_%s_%s.wav' % (repr1(ksteps), repr1(idx), repr1(batch_str)))
        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    """Generate up to `samples` test items: save the decoded ground-truth wav
    and the model's generated wav for each."""
    k = model.get_step() // 1000
    for i, (mel, target_wav) in enumerate(test_set, 1):
        if i > samples:
            break
        print('\n| Generating: %i/%i' % (i, samples))
        target_wav = target_wav[0].numpy()
        # 16 bits for MOL targets, configured bit depth otherwise.
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits
        if hp.mu_law and hp.voc_mode != 'MOL':
            target_wav = decode_mu_law(target_wav, 2 ** bits, from_labels=True)
        else:
            target_wav = label_2_float(target_wav, bits)
        save_wav(target_wav, save_path / f'{k}k_steps_{i}_target.wav')
        if batched:
            batch_str = f'gen_batched_target{target}_overlap{overlap}'
        else:
            batch_str = 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')
        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Generate audio from a single input file.

    `load_path` may be a .wav (re-synthesized via its mel spectrogram) or a
    .npy mel array shaped (n_mels, n_hops) with values in [0, 1].

    Raises:
        ValueError: on a bad mel shape/range or an unsupported extension.
    """
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            # BUG FIX: original formatted `wav.shape`, but `wav` is never
            # defined in the .npy branch (NameError); report mel.shape.
            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")
    mel = torch.tensor(mel).unsqueeze(0)
    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'
    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def synthesize(text):
    """Synthesize `text` into ./temp/result.wav via the multilingual TTS
    front-end followed by the WaveRNN vocoder.

    NOTE(review): relies on a module-level `lang` code and a `write` file
    helper defined elsewhere in this file — confirm both before use.
    """
    # Renamed from `input` to avoid shadowing the builtin.
    tts_input = text + "|00-" + lang + "|" + lang
    # Change to Multi_TTS path
    sys.path.append(
        os.path.join(os.path.dirname(__file__), "dependencies/Multilingual_Text_to_Speech"))
    # Evict a previously imported `utils` so the TTS repo's `utils` wins.
    if "utils" in sys.modules:
        del sys.modules["utils"]
    from synthesize import synthesize
    from utils import build_model
    # Load Multilingual pretrained model
    model = build_model(
        os.path.abspath("./dependencies/checkpoints/generated_switching.pyt"))
    model.eval()
    # generate spectrogram
    spectogram = synthesize(model, "|" + tts_input)
    # Change to WaveRNN Path
    sys.path.append(
        os.path.join(os.path.dirname(__file__), "dependencies/WaveRNN"))
    if "utils" in sys.modules:
        del sys.modules["utils"]
    from models.fatchord_version import WaveRNN
    from utils import hparams as hp
    from gen_wavernn import generate
    import torch
    # Load WaveRNN pretrained model
    hp.configure("hparams.py")
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims,
                    bits=hp.bits, pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate, mode=hp.voc_mode).to(
        torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    model.load(
        os.path.join(os.path.dirname(__file__), "dependencies/checkpoints/wavernn_weight.pyt"))
    # BUG FIX: the vocoder was fed an undefined name `s`; pass the
    # spectrogram produced above.
    waveform = generate(model, spectogram, hp.voc_gen_batched, hp.voc_target,
                        hp.voc_overlap)
    f = write("./temp/result.wav", "x")
    f.write(waveform)
    f.close()
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Vocode every .npy mel file found in `load_path`, writing the wavs
    into the `save_path`/test directory."""
    step_k = model.get_step() // 1000
    os.makedirs(save_path/'test', exist_ok=True)
    # The tag only depends on the generation settings, so build it once.
    if batched:
        batch_str = f'gen_batched_target{target}_overlap{overlap}'
    else:
        batch_str = 'gen_NOT_BATCHED'
    for file_name in tqdm(os.listdir(load_path)):
        if not file_name.endswith('.npy'):
            continue
        mel = torch.tensor(np.load(os.path.join(load_path, file_name))).unsqueeze(0)
        save_str = save_path/f'test/{file_name}__{step_k}k_steps_{batch_str}.wav'
        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def train(self, model: WaveRNN, optimizer: Optimizer, train_gta=False) -> None:
    """Run the configured training schedule, one session at a time,
    skipping sessions whose max step the model has already passed."""
    schedule = parse_schedule(self.train_cfg['schedule'])
    for session_index, (lr, max_step, bs) in enumerate(schedule, 1):
        if model.get_step() >= max_step:
            continue  # this session is already complete
        train_set, val_set, val_set_samples = get_vocoder_datasets(
            path=self.paths.data,
            batch_size=bs,
            train_gta=train_gta,
            max_mel_len=self.train_cfg['max_mel_len'],
            hop_length=self.dsp.hop_length,
            voc_pad=model.pad,
            voc_seq_len=self.train_cfg['seq_len'],
            voc_mode=self.dsp.voc_mode,
            bits=self.dsp.bits,
            num_gen_samples=self.train_cfg['num_gen_samples'])
        session = VocSession(index=session_index, lr=lr, max_step=max_step,
                             bs=bs, train_set=train_set, val_set=val_set,
                             val_set_samples=val_set_samples)
        self.train_session(model, optimizer, session, train_gta)
def evaluate(self, model: WaveRNN, val_set: Dataset) -> float:
    """Return the mean validation loss of `model` over all of `val_set`."""
    model.eval()
    device = next(model.parameters()).device
    total_loss = 0
    for x, y, m in val_set:
        x, m, y = x.to(device), m.to(device), y.to(device)
        with torch.no_grad():
            y_hat = model(x, m)
            # RAW emits class logits that need reshaping; MOL needs float targets.
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)
            loss = self.loss_func(y_hat, y)
            total_loss += loss.item()
    return total_loss / len(val_set)
def wave_rnn(pretrained=True, **kwargs):
    """Build the WaveRNN vocoder from hparams; optionally load the
    published pretrained weights."""
    config = dict(rnn_dims=hp.voc_rnn_dims,
                  fc_dims=hp.voc_fc_dims,
                  bits=hp.bits,
                  pad=hp.voc_pad,
                  upsample_factors=hp.voc_upsample_factors,
                  feat_dims=hp.num_mels,
                  compute_dims=hp.voc_compute_dims,
                  res_out_dims=hp.voc_res_out_dims,
                  res_blocks=hp.voc_res_blocks,
                  hop_length=hp.hop_length,
                  sample_rate=hp.sample_rate,
                  mode=hp.voc_mode)
    model = WaveRNN(**config)
    if pretrained:
        model.load_state_dict(fetch_and_load_state_dict("wavernn"))
    return model
def load_wavernn(checkpoint_path: str) -> Tuple[WaveRNN, Dict[str, Any]]:
    """Restore a WaveRNN vocoder and its config dict from a checkpoint.

    The checkpoint is expected to contain 'config' and 'model' entries;
    tensors are mapped to CPU so loading works without a GPU.
    """
    print(f'Loading voc checkpoint {checkpoint_path}')
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    config = checkpoint['config']
    # Rebuild the architecture from the stored config, then load weights.
    voc_model = WaveRNN.from_config(config)
    voc_model.load_state_dict(checkpoint['model'])
    print(f'Loaded model with step {voc_model.get_step()}')
    return voc_model, config
def evaluate(self, model: WaveRNN, val_set: Dataset) -> float:
    """Compute the average loss of `model` across the validation set."""
    model.eval()
    device = next(model.parameters()).device
    total_loss = 0
    for batch in val_set:
        batch = to_device(batch, device=device)
        x, y, m = batch['x'], batch['y'], batch['mel']
        with torch.no_grad():
            y_hat = model(x, m)
            # RAW emits per-sample class logits; MOL wants float targets.
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)
            loss = self.loss_func(y_hat, y)
            total_loss += loss.item()
    return total_loss / len(val_set)
def generate_samples(self, model: WaveRNN, session: VocSession) -> Tuple[float, list]:
    """
    Generates audio samples to cherry-pick models. To evaluate audio quality
    we calculate the l1 distance between mels of predictions and targets.
    """
    model.eval()
    mel_losses = []
    gen_wavs = []
    device = next(model.parameters()).device
    for i, sample in enumerate(session.val_set_samples, 1):
        m, x = sample['mel'], sample['x']
        if i > self.train_cfg['num_gen_samples']:
            break
        x = x[0].numpy()
        # MOL targets are 16-bit; RAW uses the configured bit depth.
        bits = 16 if self.dsp.voc_mode == 'MOL' else self.dsp.bits
        if self.dsp.mu_law and self.dsp.voc_mode != 'MOL':
            x = DSP.decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = DSP.label_2_float(x, bits)
        gen_wav = model.generate(mels=m,
                                 batched=self.train_cfg['gen_batched'],
                                 target=self.train_cfg['target'],
                                 overlap=self.train_cfg['overlap'],
                                 mu_law=self.dsp.mu_law,
                                 silent=True)
        gen_wavs.append(gen_wav)
        # L1 distance between un-normalized mels of target and prediction.
        y_mel = self.dsp.wav_to_mel(x.squeeze(), normalize=False)
        y_mel = torch.tensor(y_mel).to(device)
        y_hat_mel = self.dsp.wav_to_mel(gen_wav, normalize=False)
        y_hat_mel = torch.tensor(y_hat_mel).to(device)
        loss = F.l1_loss(y_hat_mel, y_mel)
        mel_losses.append(loss.item())
        # Log both target and generated audio for listening checks.
        self.writer.add_audio(tag=f'Validation_Samples/target_{i}',
                              snd_tensor=x, global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag=f'Validation_Samples/generated_{i}',
                              snd_tensor=gen_wav, global_step=model.step,
                              sample_rate=self.dsp.sample_rate)
    # Mean mel L1 across samples, plus the first generated waveform.
    return sum(mel_losses) / len(mel_losses), gen_wavs[0]
def generate_samples(self, model: WaveRNN, session: VocSession) -> Tuple[float, list]:
    """
    Generates audio samples to cherry-pick models. To evaluate audio quality
    we calculate the l1 distance between mels of predictions and targets.
    """
    model.eval()
    mel_losses = []
    gen_wavs = []
    device = next(model.parameters()).device
    for i, (m, x) in enumerate(session.val_set_samples, 1):
        if i > hp.voc_gen_num_samples:
            break
        x = x[0].numpy()
        # MOL targets are 16-bit; RAW uses the configured bit depth.
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits
        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)
        # save_path=None: keep the waveform in memory instead of writing it.
        gen_wav = model.generate(mels=m,
                                 save_path=None,
                                 batched=hp.voc_gen_batched,
                                 target=hp.voc_target,
                                 overlap=hp.voc_overlap,
                                 mu_law=hp.mu_law,
                                 silent=True)
        gen_wavs.append(gen_wav)
        # L1 distance between raw mels of target and generated audio.
        y_mel = raw_melspec(x.squeeze())
        y_mel = torch.tensor(y_mel).to(device)
        y_hat_mel = raw_melspec(gen_wav)
        y_hat_mel = torch.tensor(y_hat_mel).to(device)
        loss = F.l1_loss(y_hat_mel, y_mel)
        mel_losses.append(loss.item())
        # Log both target and generated audio for listening checks.
        self.writer.add_audio(tag=f'Validation_Samples/target_{i}',
                              snd_tensor=x, global_step=model.step,
                              sample_rate=hp.sample_rate)
        self.writer.add_audio(tag=f'Validation_Samples/generated_{i}',
                              snd_tensor=gen_wav, global_step=model.step,
                              sample_rate=hp.sample_rate)
    # Mean mel L1 across samples, plus the first generated waveform.
    return sum(mel_losses) / len(mel_losses), gen_wavs[0]
def get_wavernn_model(model_path):
    """Create the WaveRNN vocoder described by hparams on CUDA and load
    the weights stored at `model_path`."""
    device = torch.device('cuda')
    print()
    cfg = dict(rnn_dims=hp.voc_rnn_dims,
               fc_dims=hp.voc_fc_dims,
               bits=hp.bits,
               pad=hp.voc_pad,
               upsample_factors=hp.voc_upsample_factors,
               feat_dims=hp.num_mels,
               compute_dims=hp.voc_compute_dims,
               res_out_dims=hp.voc_res_out_dims,
               res_blocks=hp.voc_res_blocks,
               hop_length=hp.hop_length,
               sample_rate=hp.sample_rate,
               mode=hp.voc_mode)
    voc_model = WaveRNN(**cfg).to(device)
    voc_model.load(model_path)
    return voc_model
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    """Generate audio for up to `samples` test items, handling the optional
    multi-band (PQMF) variant when hp.voc_multiband is set."""
    k = model.get_step() // 1000
    mypqmf = PQMF()
    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break
        print('\n| Generating: %i/%i' % (i, samples))
        if hp.voc_multiband:
            x = x[0].numpy()
            # MOL targets are 16-bit; RAW uses the configured bit depth.
            bits = 16 if hp.voc_mode == 'MOL' else hp.bits
            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2 ** bits, from_labels=True)
            else:
                x = label_2_float(x, bits)
            # Recombine the sub-bands into a full-band waveform.
            source = mypqmf.synthesis(
                torch.tensor(x, dtype=torch.float).unsqueeze(
                    0)).numpy()  # (1, sub_band, T//sub_band) -> (1, 1, T)
            source = source.squeeze()  # (T,)
            save_wav(source, save_path/f'{k}k_steps_{i}_target.wav')
            # np.save(save_path/f'{k}k_steps_{i}_target.npy', x, allow_pickle=False)
        else:
            x = x[0].numpy()
            bits = 16 if hp.voc_mode == 'MOL' else hp.bits
            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = label_2_float(x, bits)
            save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')
        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')
        # Generated output (after PQMF synthesis inside the model, when multiband).
        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
def train(self, model: WaveRNN, optimizer: Optimizer, train_gta=False) -> None:
    """Iterate the hp.voc_schedule sessions, skipping any whose max step
    the model has already reached."""
    for idx, (lr, max_step, bs) in enumerate(hp.voc_schedule, 1):
        if model.get_step() >= max_step:
            continue  # session already finished in a previous run
        train_set, val_set, val_set_samples = get_vocoder_datasets(
            path=self.paths.data, batch_size=bs, train_gta=train_gta)
        session = VocSession(index=idx, lr=lr, max_step=max_step, bs=bs,
                             train_set=train_set, val_set=val_set,
                             val_set_samples=val_set_samples)
        self.train_session(model, optimizer, session, train_gta)
def __init__(self, tts_path: str, voc_path: str, device='cuda'):
    """Load the ForwardTacotron TTS model, two vocoders (WaveRNN checkpoint
    and MelGAN via torch.hub) and the text-processing helpers.

    Args:
        tts_path: checkpoint containing 'config' and 'model' entries
                  for ForwardTacotron.
        voc_path: WaveRNN vocoder checkpoint path.
        device: torch device string (default 'cuda').
    """
    self.device = torch.device(device)
    tts_checkpoint = torch.load(tts_path, map_location=self.device)
    tts_config = tts_checkpoint['config']
    tts_model = ForwardTacotron.from_config(tts_config)
    tts_model.load_state_dict(tts_checkpoint['model'])
    self.tts_model = tts_model
    self.wavernn = WaveRNN.from_checkpoint(voc_path)
    # NOTE: fetches the pretrained MelGAN from torch.hub — needs network
    # access on first use.
    self.melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
    self.melgan.to(device).eval()
    self.cleaner = Cleaner.from_config(tts_config)
    self.tokenizer = Tokenizer()
    self.dsp = DSP.from_config(tts_config)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Generate audio for one .wav or .npy input, also saving a Griffin-Lim
    reconstruction as a baseline.

    NOTE(review): `prefix` is not defined in this function — presumably a
    module-level variable; confirm. `k` is computed but never used here.
    """
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path / f'{prefix}{file_name}.target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        # Validate shape: expected (n_mels, n_hops).
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        # Mels are expected normalized to [0, 1] (small tolerance).
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")
    m = torch.tensor(mel).unsqueeze(0)
    save_str_wavernn = save_path / f'{prefix}{file_name}.wavernn.wav'
    save_str_griffinlim = save_path / f'{prefix}{file_name}.griffinlim.wav'
    # Griffin-Lim baseline for comparison against the neural vocoder.
    wav = reconstruct_waveform(mel, n_iter=32)
    save_wav(wav, save_str_griffinlim)
    _ = model.generate(m, save_str_wavernn, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Generate audio from a single .wav (via its mel spectrogram) or .npy
    mel file shaped (n_mels, n_hops) with values in [0, 1].

    Raises:
        ValueError: on a bad mel shape/range or an unsupported extension.
    """
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(
            wav, save_path / '__%s__%sk_steps_target.wav' %
            (repr1(file_name), repr1(k)))
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            # BUG FIX: original formatted `wav.shape`, but `wav` is never
            # defined in the .npy branch (NameError); report mel.shape.
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!'
                % (repr1(mel.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0,1] but was instead [%s, %s]'
                % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!'
                         % (repr1(suffix)))
    mel = torch.tensor(mel).unsqueeze(0)
    batch_str = 'gen_batched_target%s_overlap%s' % (
        repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
    save_str = save_path / '__%s__%sk_steps_%s.wav' % (
        repr1(file_name), repr1(k), repr1(batch_str))
    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    '''
    :param model: vocoder model
    :param test_set: test set yielding mel (or sp+f0) features along with the
                     loaded source audio
    :param samples: number of items to generate, i.e. how many audio files
    :param batched: True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: generated audio files
    '''
    k = model.get_step() // 1000
    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break
        print('\n| Generating: %i/%i' % (i, samples))
        x = x[0].numpy()
        # MOL targets are 16-bit; RAW uses the configured bit depth.
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits
        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)
        save_wav(x, save_path / f'{k}k_steps_{i}_target.wav')  # save the ground-truth audio
        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')
        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set_wav, samples, batched, target, overlap, save_path: Path):
    '''
    Vocode every feature file found in `test_set_wav`.

    :param model: vocoder model
    :param test_set_wav: directory of .npy feature files
    :param samples: unused in this variant
    :param batched: batched generation flag
    :param target: batched-generation target length
    :param overlap: batched-generation overlap
    :param save_path: unused — output goes to the hard-coded "/emotion_wav/"
                      directory (NOTE(review): confirm this absolute path is
                      intended; `samples`/`save_path` kept for interface
                      compatibility)
    :return: generated audio files
    '''
    # Cleanup: removed a dead `c = 0` counter and fixed the misspelled
    # local `filenname`.
    wave_dir = "/emotion_wav/"
    for fname in os.listdir(test_set_wav):
        # Transpose the stored features and add a leading batch dimension.
        mel = np.expand_dims(np.load(join(test_set_wav, fname)).T, 0)
        stem = basename(fname)[:-4]  # strip the 4-char extension (".npy")
        save_str = wave_dir + str(stem) + ".wav"
        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Generate audio from a single .wav (via its mel spectrogram) or .npy
    mel file, timing the generation step.

    Raises:
        ValueError: on a bad mel shape/range or an unsupported extension.
    """
    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        # Keep a copy of the ground truth next to the generated output.
        save_wav(
            wav, os.path.join(save_path, "target", os.path.basename(load_path)))
        print("Generating from {0}".format(load_path))
        mel = melspectrogram(wav)
        print("Melspectrograms generated!")
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            # BUG FIX: original formatted `wav.shape`, but `wav` is never
            # defined in the .npy branch (NameError); report mel.shape.
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")
    mel = torch.tensor(mel).unsqueeze(0)
    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = os.path.join(save_path, os.path.basename(load_path))
    beg = time.time()
    print("Start generating... [{0}]".format(beg))
    output = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
    end = time.time()
    print("Done generating... [{0}] -> delta: [{1}]".format(end, end - beg))
    save_wav(output, save_str)
# --- Script section: load WaveRNN on CPU and vocode numbered mel files. ---
print("Cur Dir", os.getcwd())
# Evict any previously imported `utils` so the WaveRNN repo's own `utils`
# package is picked up after the sys.path append below.
if "utils" in sys.modules:
    del sys.modules["utils"]
sys.path.append(WAVERNN_FOLDER)
from gen_wavernn import generate
from utils import hparams as hp
from models.fatchord_version import WaveRNN
hp.configure(WAVERNN_FOLDER+'/hparams.py')
# Build the vocoder exactly as described by the repo's hparams, on CPU.
model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims,
                bits=hp.bits, pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length,
                sample_rate=hp.sample_rate, mode=hp.voc_mode).to('cpu')
model.load(CHECKPOINTS_FOLDER + "/" + wavernn_chpt)
# Collect consecutively numbered spectrogram files 1.npy, 2.npy, ...
# stopping at the first gap.
y = []
ix=1
while os.path.exists(CHR_FOLDER+"/"+str(ix)+".npy"):
    print("Found", CHR_FOLDER+"/"+str(ix)+".npy")
    y.append(np.load(CHR_FOLDER+"/"+str(ix)+".npy"))
    ix+=1
idx=1
# Vocode each loaded spectrogram. NOTE(review): `idx` and `waveform` are
# presumably consumed by code beyond this chunk — this section ends inside
# the loop body.
for s in y:
    waveform = generate(model, s, hp.voc_gen_batched, hp.voc_target,
                        hp.voc_overlap)
def main():
    """CLI entry point for training the WaveRNN vocoder.

    Parses learning-rate/batch-size overrides, loads hparams, builds the
    model and optimizer, restores the latest checkpoint, and runs the
    training loop until total steps are reached.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr', '-l', type=float,
                        help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size', '-b', type=int,
                        help='[int] override hparams.py batch size')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--gta', '-g', action='store_true',
                        help='train wavernn on GTA features')
    parser.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()
    hp.configure(args.hp_file)  # load hparams from file
    # Fall back to hparams defaults when no CLI override was given.
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr
    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        # DataParallel requires the batch to split evenly across GPUs.
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError(
                '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)
    print('\nInitialising Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length
    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc', paths, voc_model, optimizer,
                       create_if_missing=True)
    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)
    total_steps = 10_000_000 if force_train else hp.voc_total_steps
    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size), ('LR', lr),
        ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)
    ])
    # RAW mode is a categorical over quantized samples; MOL uses the
    # discretized mixture-of-logistics loss.
    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss
    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set,
                   test_set, lr, total_steps)
    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    """Main WaveRNN training loop.

    Trains until `total_steps`, streaming progress, generating test audio
    and checkpointing every hp.voc_checkpoint_every steps, and saving the
    latest model/optimizer state after every epoch.
    """
    # Use same device as model parameters
    device = next(model.parameters()).device
    for g in optimizer.param_groups:
        g['lr'] = lr
    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1
    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0.
        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)
            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)
            # RAW emits class logits that need reshaping; MOL needs float targets.
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)
            loss = loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()
            running_loss += loss.item()
            avg_loss = running_loss / i
            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000
            # Periodically render test audio and snapshot the model.
            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)
        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
# --- Script section: device selection and model init (quick-start flow). ---
if not args.force_cpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print('\nInitialising WaveRNN Model...\n')
# Instantiate WaveRNN Model — MOL mode is hard-coded to match the bundled
# quick-start weights.
voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    mode='MOL').to(device)
voc_model.load('quick_start/voc_weights/latest_weights.pyt')
print('\nInitialising Tacotron Model...\n')
# Instantiate Tacotron Model — NOTE: the constructor call continues beyond
# this chunk of the file.
tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
# --- Script section: device selection and vocoder/TTS init (ForwardTacotron). ---
if not args.force_cpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
if args.vocoder == 'wavernn':
    print('\nInitialising WaveRNN Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)
    # Prefer explicitly supplied weights, else the latest checkpoint.
    voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
    voc_model.load(voc_load_path)
print('\nInitialising Forward TTS Model...\n')
# NOTE: the ForwardTacotron constructor call continues beyond this chunk
# of the file.
tts_model = ForwardTacotron(
    embed_dims=hp.forward_embed_dims,
    num_chars=len(phonemes),
    durpred_rnn_dims=hp.forward_durpred_rnn_dims,
    durpred_conv_dims=hp.forward_durpred_conv_dims,
# --- Script section: build the vocoder from hparams and report settings. ---
gta = args.gta
if not args.force_cpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print('\nInitialising Model...\n')
model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                fc_dims=hp.voc_fc_dims,
                bits=hp.bits,
                pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels,
                compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                mode=hp.voc_mode).to(device)
paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
# Prefer explicitly supplied weights, else the latest checkpoint path.
voc_weights = args.voc_weights if args.voc_weights else paths.voc_latest_weights
model.load(voc_weights)
# NOTE(review): `batched`, `target` and `overlap` are defined elsewhere in
# the file, not in this chunk.
simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
              ('Target Samples', target if batched else 'N/A'),
              ('Overlap Samples', overlap if batched else 'N/A')])
def train_session(self, model: WaveRNN, optimizer: Optimizer,
                  session: VocSession, train_gta: bool) -> None:
    """Train `model` for one schedule session.

    Streams progress, periodically generates cherry-pick samples and
    checkpoints, logs scalars to the writer, and evaluates on the
    validation set after each epoch.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1
    simple_table([(f'Steps ', str(training_steps // 1000) + 'k'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr),
                  ('Sequence Length', self.train_cfg['seq_len']),
                  ('GTA Training', train_gta)])
    for g in optimizer.param_groups:
        g['lr'] = session.lr
    loss_avg = Averager()
    duration_avg = Averager()
    device = next(
        model.parameters()).device  # use same device as model parameters
    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            start = time.time()
            model.train()
            batch = to_device(batch, device=device)
            x, y = batch['x'], batch['y']
            y_hat = model(x, batch['mel'])
            # RAW emits class logits needing reshape; MOL wants float targets.
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = batch['y'].float()
            y = y.unsqueeze(-1)
            loss = self.loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), self.train_cfg['clip_grad_norm'])
            optimizer.step()
            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000
            duration_avg.add(time.time() - start)
            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '
            if step % self.train_cfg['gen_samples_every'] == 0:
                stream(msg + 'generating samples...')
                gen_result = self.generate_samples(model, session)
                if gen_result is not None:
                    mel_loss, gen_wav = gen_result
                    self.writer.add_scalar('Loss/generated_mel_l1', mel_loss,
                                           model.get_step())
                    # Keep the best-scoring models for later cherry-picking.
                    self.track_top_models(mel_loss, gen_wav, model)
            if step % self.train_cfg['checkpoint_every'] == 0:
                save_checkpoint(model=model, optim=optimizer,
                                config=self.config,
                                path=self.paths.voc_checkpoints /
                                f'wavernn_step{k}k.pt')
            self.writer.add_scalar('Loss/train', loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs,
                                   model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr,
                                   model.get_step())
            stream(msg)
        val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        # Always refresh the latest-model checkpoint so training can resume.
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=self.paths.voc_checkpoints / 'latest_model.pt')
        loss_avg.reset()
        duration_avg.reset()
        print(' ')
# --- Script section: CLI tail and trainer setup (config-driven WaveRNN). ---
# NOTE(review): `parser` is created earlier in the file, outside this chunk.
parser.add_argument('--gta', '-g', action='store_true',
                    help='train wavernn on GTA features')
parser.add_argument('--config', metavar='FILE', default='config.yaml',
                    help='The config containing all hyperparams.')
args = parser.parse_args()
config = read_config(args.config)
paths = Paths(config['data_path'], config['voc_model_id'],
              config['tts_model_id'])
device = torch.device(
    'cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print('\nInitialising Model...\n')
voc_model = WaveRNN.from_config(config).to(device)
dsp = DSP.from_config(config)
# The upsample factors must multiply out to exactly one hop of audio.
assert np.cumprod(
    config['vocoder']['model']['upsample_factors'])[-1] == dsp.hop_length
optimizer = optim.Adam(voc_model.parameters())
restore_checkpoint(model=voc_model, optim=optimizer,
                   path=paths.voc_checkpoints / 'latest_model.pt',
                   device=device)
voc_trainer = VocTrainer(paths=paths, dsp=dsp, config=config)
voc_trainer.train(voc_model, optimizer, train_gta=args.gta)
def thak():
    """Build generation arguments plus the vocoder and Tacotron models,
    reading settings from environment variables instead of the CLI.

    Returns:
        (args, voc_model, tts_model, batched, target, overlap, save_attn)
        where `voc_model` and the batching settings are None when a
        non-WaveRNN vocoder is selected.
    """
    class Tshamsoo():
        # Environment-driven stand-in for the usual argparse namespace.
        force_cpu = os.getenv('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = os.getenv('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = os.getenv('SAVE_ATTN', False)
        voc_weights = None
        iters = os.getenv('GL_ITERS', 32)
    args = Tshamsoo()
    # Normalize vocoder aliases.
    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        # BUG FIX: argparse.ArgumentError requires an `argument` object as
        # its first parameter, so calling it with only a message raises a
        # confusing TypeError. Plain validation -> ValueError.
        raise ValueError('Must provide a valid vocoder type!')
    hp.configure(args.hp_file)  # Load hparams from file
    tts_weights = args.tts_weights
    save_attn = args.save_attn
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)
    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched
        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)
        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)
        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        # Griffin-Lim path: no neural vocoder, no batching parameters.
        voc_model = None
        batched = None
        target = None
        overlap = None
    print('\nInitialising Tacotron Model...\n')
    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)
    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)
    return args, voc_model, tts_model, batched, target, overlap, save_attn