def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    """Write target and generated wavs for the first `samples` test items.

    Each item of `test_set` is unpacked as a pair: the first element is
    handed to `model.generate`, the second is a label tensor whose first
    entry is decoded back to a waveform and saved as the reference file.

    Args:
        model: vocoder used for generation; its step count names the files.
        test_set: iterable yielding two-element items.
        samples: number of items to process before stopping.
        batched: forwarded to `model.generate`; also selects the file suffix.
        target: forwarded to `model.generate` and embedded in the file name.
        overlap: forwarded to `model.generate` and embedded in the file name.
        save_path: directory (a `Path`) the wav files are written into.
    """
    k = model.get_step() // 1000

    for idx, (mel, labels) in enumerate(test_set, 1):
        if idx > samples:
            break

        print('\n| Generating: %i/%i' % (idx, samples))

        labels = labels[0].numpy()

        # MOL mode always uses 16 bits, otherwise the configured bit depth.
        if hp.voc_mode == 'MOL':
            bits = 16
        else:
            bits = hp.bits

        use_mu_law = hp.mu_law and hp.voc_mode != 'MOL'
        if use_mu_law:
            audio = decode_mu_law(labels, 2**bits, from_labels=True)
        else:
            audio = label_2_float(labels, bits)

        target_file = save_path/f'{k}k_steps_{idx}_target.wav'
        save_wav(audio, target_file)

        if batched:
            batch_str = f'gen_batched_target{target}_overlap{overlap}'
        else:
            batch_str = 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{idx}_{batch_str}.wav')

        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def train(self, model: WaveRNN, optimizer: Optimizer, train_gta=False) -> None:
    """Run every schedule session that the model has not finished yet.

    The schedule stored under ``self.train_cfg['schedule']`` is parsed with
    ``parse_schedule``; each entry unpacks to ``(lr, max_step, bs)``. For
    every entry whose ``max_step`` is still ahead of the model's current
    step, datasets are built with that batch size and the session is handed
    to ``self.train_session``.

    Args:
        model: the vocoder being trained.
        optimizer: optimizer passed through to ``train_session``.
        train_gta: forwarded to dataset creation and to ``train_session``.
    """
    schedule = parse_schedule(self.train_cfg['schedule'])

    for index, (lr, max_step, bs) in enumerate(schedule, 1):
        # Sessions whose step budget is already used up are skipped.
        if model.get_step() >= max_step:
            continue

        train_set, val_set, val_set_samples = get_vocoder_datasets(
            path=self.paths.data,
            batch_size=bs,
            train_gta=train_gta,
            max_mel_len=self.train_cfg['max_mel_len'],
            hop_length=self.dsp.hop_length,
            voc_pad=model.pad,
            voc_seq_len=self.train_cfg['seq_len'],
            voc_mode=self.dsp.voc_mode,
            bits=self.dsp.bits,
            num_gen_samples=self.train_cfg['num_gen_samples'],
        )
        session = VocSession(
            index=index,
            lr=lr,
            max_step=max_step,
            bs=bs,
            train_set=train_set,
            val_set=val_set,
            val_set_samples=val_set_samples,
        )
        self.train_session(model, optimizer, session, train_gta)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Vocode a single ``.wav`` or ``.npy`` file with ``model``.

    A ``.wav`` input is loaded, re-saved as the target reference and
    converted to a mel spectrogram. A ``.npy`` input is loaded directly and
    validated before use. The resulting mel is passed to ``model.generate``,
    which receives the output path.

    Args:
        model: vocoder used for generation; its step count names the files.
        load_path: input file; its suffix selects the loading branch.
        save_path: directory (a ``Path``) the output files are written into.
        batched: forwarded to ``model.generate``; also selects the suffix.
        target: forwarded to ``model.generate`` and embedded in the name.
        overlap: forwarded to ``model.generate`` and embedded in the name.

    Raises:
        ValueError: if the suffix is neither ``.wav`` nor ``.npy``, if the
            ``.npy`` array is not 2-D with ``hp.num_mels`` rows, or if its
            values fall outside the accepted [0, 1] range (with tolerance).
    """
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix

    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            # Report the shape of the array that was actually loaded; `wav`
            # does not exist in this branch.
            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")

    # Add a leading dimension of size one before handing the mel to the model.
    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    """Write target and generated wavs for the first `samples` test items.

    Each item of `test_set` is unpacked as a pair: the first element is
    handed to `model.generate`, the second is a label tensor whose first
    entry is decoded back to a waveform and saved as the reference file.

    File names are built with %-formatting. The formatted name is computed
    first and only then joined to `save_path`: `/` and `%` have the same
    precedence and associate left-to-right, so without the parentheses the
    `%` would be applied to a `Path` object and raise `TypeError`.

    Args:
        model: vocoder used for generation; its step count names the files.
        test_set: iterable yielding two-element items.
        samples: number of items to process before stopping.
        batched: forwarded to `model.generate`; also selects the file suffix.
        target: forwarded to `model.generate` and embedded in the file name.
        overlap: forwarded to `model.generate` and embedded in the file name.
        save_path: directory (a `Path`) the wav files are written into.
    """
    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        # NOTE(review): `repr1` is defined outside this block; it is assumed
        # to return the text to embed for each value — confirm.
        target_name = '%sk_steps_%s_target.wav' % (repr1(k), repr1(i))
        save_wav(x, save_path / target_name)

        if batched:
            batch_str = 'gen_batched_target%s_overlap%s' % (repr1(target), repr1(overlap))
        else:
            batch_str = 'gen_NOT_BATCHED'

        gen_name = '%sk_steps_%s_%s.wav' % (repr1(k), repr1(i), repr1(batch_str))
        save_str = str(save_path / gen_name)

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
def train(self, model: WaveRNN, optimizer: Optimizer, train_gta=False) -> None:
    """Run every entry of ``hp.voc_schedule`` the model has not finished yet.

    Each schedule entry unpacks to ``(lr, max_step, bs)``. For every entry
    whose ``max_step`` is still ahead of the model's current step, datasets
    are built with that batch size and the resulting session is handed to
    ``self.train_session``.

    Args:
        model: the vocoder being trained.
        optimizer: optimizer passed through to ``train_session``.
        train_gta: forwarded to dataset creation and to ``train_session``.
    """
    for index, (lr, max_step, bs) in enumerate(hp.voc_schedule, 1):
        # Sessions whose step budget is already used up are skipped.
        if model.get_step() >= max_step:
            continue

        train_set, val_set, val_set_samples = get_vocoder_datasets(
            path=self.paths.data,
            batch_size=bs,
            train_gta=train_gta,
        )
        session = VocSession(
            index=index,
            lr=lr,
            max_step=max_step,
            bs=bs,
            train_set=train_set,
            val_set=val_set,
            val_set_samples=val_set_samples,
        )
        self.train_session(model, optimizer, session, train_gta)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Vocode every ``.npy`` file found directly inside ``load_path``.

    Outputs are written to a ``test`` sub-directory of ``save_path`` (created
    if missing). Entries that do not end in ``.npy`` are ignored.

    Args:
        model: vocoder used for generation; its step count names the files.
        load_path: directory that is listed for ``.npy`` inputs.
        save_path: directory (a ``Path``) under which ``test/`` is created.
        batched: forwarded to ``model.generate``; also selects the suffix.
        target: forwarded to ``model.generate`` and embedded in the name.
        overlap: forwarded to ``model.generate`` and embedded in the name.
    """
    k = model.get_step() // 1000
    os.makedirs(save_path/'test', exist_ok=True)

    for file_name in tqdm(os.listdir(load_path)):
        if not file_name.endswith('.npy'):
            continue

        loaded = np.load(os.path.join(load_path, file_name))
        # Add a leading dimension of size one before handing it to the model.
        mel = torch.tensor(loaded).unsqueeze(0)

        if batched:
            batch_str = f'gen_batched_target{target}_overlap{overlap}'
        else:
            batch_str = 'gen_NOT_BATCHED'
        save_str = save_path/f'test/{file_name}__{k}k_steps_{batch_str}.wav'

        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    """Write target and generated wavs for the first `samples` test items.

    The label tensor of each item is decoded to float audio. When
    `hp.voc_multiband` is set, the decoded array is additionally passed
    through `PQMF.synthesis` and squeezed before being saved as the target;
    otherwise the decoded array is saved directly. `model.generate` is then
    called with the item's first element.

    Args:
        model: vocoder used for generation; its step count names the files.
        test_set: iterable yielding two-element items.
        samples: number of items to process before stopping.
        batched: forwarded to `model.generate`; also selects the file suffix.
        target: forwarded to `model.generate` and embedded in the file name.
        overlap: forwarded to `model.generate` and embedded in the file name.
        save_path: directory (a `Path`) the wav files are written into.
    """
    k = model.get_step() // 1000
    mypqmf = PQMF()

    for idx, (mel, labels) in enumerate(test_set, 1):
        if idx > samples:
            break

        print('\n| Generating: %i/%i' % (idx, samples))

        multiband = hp.voc_multiband

        # Decoding the labels is the same for both modes.
        decoded = labels[0].numpy()
        if hp.voc_mode == 'MOL':
            bits = 16
        else:
            bits = hp.bits
        if hp.mu_law and hp.voc_mode != 'MOL':
            decoded = decode_mu_law(decoded, 2 ** bits, from_labels=True)
        else:
            decoded = label_2_float(decoded, bits)

        if multiband:
            bands = torch.tensor(decoded, dtype=torch.float).unsqueeze(0)
            # Original author's shape note: (1, sub_band, T//sub_band) -> (1, 1, T)
            source = mypqmf.synthesis(bands).numpy()
            source = source.squeeze()  # original author's shape note: (T,)
            save_wav(source, save_path/f'{k}k_steps_{idx}_target.wav')
            # np.save(save_path/f'{k}k_steps_{i}_target.npy', x, allow_pickle=False)
        else:
            save_wav(decoded, save_path/f'{k}k_steps_{idx}_target.wav')

        if batched:
            batch_str = f'gen_batched_target{target}_overlap{overlap}'
        else:
            batch_str = 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{idx}_{batch_str}.wav')

        # NOTE(review): the original comment here read "returned after PQMF";
        # presumably model.generate applies the PQMF synthesis itself — confirm.
        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Vocode a single ``.wav`` or ``.npy`` file with ``model``.

    A ``.wav`` input is loaded, re-saved as the target reference and
    converted to a mel spectrogram. A ``.npy`` input is loaded directly and
    validated before use. The resulting mel is passed to ``model.generate``,
    which receives the output path.

    File names are built with %-formatting. The formatted name is computed
    first and only then joined to ``save_path``: ``/`` and ``%`` have the
    same precedence and associate left-to-right, so without the parentheses
    the ``%`` would be applied to a ``Path`` object and raise ``TypeError``.

    Args:
        model: vocoder used for generation; its step count names the files.
        load_path: input file; its suffix selects the loading branch.
        save_path: directory (a ``Path``) the output files are written into.
        batched: forwarded to ``model.generate``; also selects the suffix.
        target: forwarded to ``model.generate`` and embedded in the name.
        overlap: forwarded to ``model.generate`` and embedded in the name.

    Raises:
        ValueError: if the suffix is neither ``.wav`` nor ``.npy``, if the
            ``.npy`` array is not 2-D with ``hp.num_mels`` rows, or if its
            values fall outside the accepted [0, 1] range (with tolerance).
    """
    # NOTE(review): `repr1` is defined outside this block; it is assumed to
    # return the text to embed for each value — confirm.
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix

    if suffix == ".wav":
        wav = load_wav(load_path)
        target_name = '__%s__%sk_steps_target.wav' % (repr1(file_name), repr1(k))
        save_wav(wav, save_path / target_name)
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            # Report the shape of the array that was actually loaded; `wav`
            # does not exist in this branch.
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!' % (repr1(mel.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0,1] but was instead [%s, %s]' % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!' % (repr1(suffix)))

    # Add a leading dimension of size one before handing the mel to the model.
    mel = torch.tensor(mel).unsqueeze(0)

    if batched:
        batch_str = 'gen_batched_target%s_overlap%s' % (repr1(target), repr1(overlap))
    else:
        batch_str = 'gen_NOT_BATCHED'

    gen_name = '__%s__%sk_steps_%s.wav' % (repr1(file_name), repr1(k), repr1(batch_str))
    save_str = save_path / gen_name

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    """Vocode one ``.wav`` / ``.npy`` file with both Griffin-Lim and ``model``.

    The mel spectrogram is obtained from the input (computed for ``.wav``,
    loaded and validated for ``.npy``). A waveform reconstructed with
    ``reconstruct_waveform`` is saved as ``*.griffinlim.wav`` and the model
    output is written to ``*.wavernn.wav``. For ``.wav`` input the loaded
    audio is also saved as ``*.target.wav``.

    Args:
        model: vocoder used for generation.
        load_path: input file; its suffix selects the loading branch.
        save_path: directory (a ``Path``) the output files are written into.
        batched, target, overlap: forwarded to ``model.generate``.

    Raises:
        ValueError: if the suffix is neither ``.wav`` nor ``.npy``, if the
            ``.npy`` array is not 2-D with ``hp.num_mels`` rows, or if its
            values fall outside the accepted [0, 1] range (with tolerance).
    """
    # NOTE(review): `prefix` is not defined in this function; presumably a
    # module-level name — confirm. `k` is computed but not used below.
    k = model.get_step() // 1000
    stem = load_path.stem
    suffix = load_path.suffix

    if suffix != ".wav" and suffix != ".npy":
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")

    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path / f'{prefix}{stem}.target.wav')
        mel = melspectrogram(wav)
    else:
        mel = np.load(load_path)
        bad_shape = mel.ndim != 2 or mel.shape[0] != hp.num_mels
        if bad_shape:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )

    # Add a leading dimension of size one for the model input.
    m = torch.tensor(mel).unsqueeze(0)

    save_str_wavernn = save_path / f'{prefix}{stem}.wavernn.wav'
    save_str_griffinlim = save_path / f'{prefix}{stem}.griffinlim.wav'

    # Griffin-Lim reference uses the un-batched mel array.
    griffinlim_wav = reconstruct_waveform(mel, n_iter=32)
    save_wav(griffinlim_wav, save_str_griffinlim)

    _ = model.generate(m, save_str_wavernn, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    '''
    Save the original and the generated audio for the first `samples` items.

    The parameter notes below are translated from the original author's
    docstring; the concrete values describe the calling script and are not
    enforced here.

    :param model: vocoder used for generation
    :param test_set: test set; per the original note it holds mel or sp+f0
                     features together with the loaded original audio
    :param samples: number of samples, i.e. audio files, to generate
    :param batched: per the original note, True in the calling script
    :param target: per the original note, 11000
    :param overlap: per the original note, 550
    :param save_path: output directory (original note: model_outputs_*)
    :return: nothing is returned; the audio files are written to save_path
    '''
    k = model.get_step() // 1000

    for idx, (mel, labels) in enumerate(test_set, 1):
        if idx > samples:
            break

        print('\n| Generating: %i/%i' % (idx, samples))

        labels = labels[0].numpy()

        # MOL mode always uses 16 bits, otherwise the configured bit depth.
        if hp.voc_mode == 'MOL':
            bits = 16
        else:
            bits = hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            audio = decode_mu_law(labels, 2**bits, from_labels=True)
        else:
            audio = label_2_float(labels, bits)

        # Save the original audio file.
        save_wav(audio, save_path / f'{k}k_steps_{idx}_target.wav')

        if batched:
            batch_str = f'gen_batched_target{target}_overlap{overlap}'
        else:
            batch_str = 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{idx}_{batch_str}.wav')

        _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def main():
    """Command-line entry point that trains the WaveRNN vocoder.

    Parses the CLI flags, loads the hyperparameter file, selects the device,
    builds the model and optimizer, restores (or creates) the checkpoint,
    builds the datasets, prints a summary table and runs `voc_train_loop`.

    Raises:
        ValueError: when running on CUDA and the batch size is not evenly
            divisible by the number of visible CUDA devices.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr', '-l', type=float, help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size', '-b', type=int, help='[int] override hparams.py batch size')
    parser.add_argument('--force_train', '-f', action='store_true', help='Forces the model to train past total steps')
    parser.add_argument('--gta', '-g', action='store_true', help='train wavernn on GTA features')
    parser.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # load hparams from file

    # CLI values win; otherwise fall back to the values from the hparams
    # file, which is why this must run after hp.configure().
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        # The batch is checked against the number of CUDA devices up front.
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError(
                '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    # Check to make sure the hop length is correctly factorised
    # NOTE(review): this check is an `assert`, so it is skipped under `-O`.
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc', paths, voc_model, optimizer,
                       create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta)

    # --force_train replaces the configured step limit with a very large one.
    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size),
        ('LR', lr),
        ('Sequence Len', hp.voc_seq_len),
        ('GTA Train', train_gta)
    ])

    # RAW mode trains with cross-entropy; any other mode uses the
    # discretized mixture-of-logistics loss.
    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set, lr, total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer, train_set, test_set, lr, total_steps):
    """Train `model` until roughly `total_steps`, checkpointing along the way.

    The learning rate of every optimizer param group is set to `lr`, then the
    loop runs enough epochs over `train_set` to cover the remaining steps.
    Whenever the model step is a multiple of `hp.voc_checkpoint_every`, test
    samples are generated and a named checkpoint is saved; the latest
    checkpoint is saved and the last progress message logged after every
    epoch.

    Args:
        paths: project paths used for checkpoints, generated audio and logs.
        model: the vocoder being trained.
        loss_func: callable taking `(y_hat, y)` and returning the loss.
        optimizer: optimizer whose param groups receive `lr`.
        train_set: sized iterable yielding `(x, y, m)` batches.
        test_set: passed to `gen_testset` at checkpoint time.
        lr: learning rate applied to all param groups.
        total_steps: step count used to derive the number of epochs.
    """
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                # clip_grad_norm_ may return a tensor living on the model's
                # device; NumPy cannot read a CUDA tensor directly, so the
                # norm is converted to a Python float first. float() also
                # accepts a plain float, so either return type works.
                if np.isnan(float(grad_norm)):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) tts_model.load('quick_start/tts_weights/latest_weights.pyt') if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f ] voc_k = voc_model.get_step() // 1000 tts_k = tts_model.get_step() // 1000 r = tts_model.r simple_table([('WaveRNN', str(voc_k) + 'k'), (f'Tacotron(r={r})', str(tts_k) + 'k'), ('Generation Mode', 'Batched' if batched else 'Unbatched'), ('Target Samples', 11_000 if batched else 'N/A'), ('Overlap Samples', 550 if batched else 'N/A')]) for i, x in enumerate(inputs, 1): print(f'\n| Generating {i}/{len(inputs)}') _, m, attention = tts_model.generate(x)
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, init_lr, final_lr, total_steps):
    """Multi-band training loop with a per-epoch learning-rate adjustment.

    For each epoch `adjust_learning_rate` is called with the epoch index and
    the initial/final learning rates, then every batch of `train_set` is
    processed. Band 0..3 of the target and of the model output are scored
    separately with `loss_func` and the four losses are summed. Test samples
    and a named checkpoint are produced whenever the model step is a multiple
    of `hp.voc_checkpoint_every`; the latest checkpoint is saved after every
    epoch.

    NOTE(review): the original indentation of this block was lost; the
    nesting below is reconstructed. As written, `y_hat0..y_hat3` are only
    assigned in the 'RAW' branch, and `loss` is only assigned when
    `hp.voc_multiband` is truthy — the 'MOL' and non-multiband paths look
    like they would fail on an unbound name. Confirm against the original.
    """
    # Use same device as model parameters
    device = next(model.parameters()).device

    # for g in optimizer.param_groups: g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        adjust_learning_rate(optimizer, e, epochs, init_lr, final_lr)  # initial and final learning rate -Begee
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(
                device)  # x/y: (Batch, sub_bands, T)

            ######################### MultiBand-WaveRNN #########################
            if hp.voc_multiband:
                # NOTE(review): squeeze(0) only has an effect when the first
                # dimension has size 1 — confirm this is intended.
                y0 = y[:, 0, :].squeeze(0).unsqueeze(
                    -1)  # y0/y1/y2/y3: (Batch, T, 1)
                y1 = y[:, 1, :].squeeze(0).unsqueeze(-1)
                y2 = y[:, 2, :].squeeze(0).unsqueeze(-1)
                y3 = y[:, 3, :].squeeze(0).unsqueeze(-1)

                y_hat = model(x, m)  # (Batch, T, num_classes, sub_bands)

                if model.mode == 'RAW':
                    y_hat0 = y_hat[:, :, :, 0].transpose(1, 2).unsqueeze(
                        -1)  # (Batch, num_classes, T, 1)
                    y_hat1 = y_hat[:, :, :, 1].transpose(1, 2).unsqueeze(-1)
                    y_hat2 = y_hat[:, :, :, 2].transpose(1, 2).unsqueeze(-1)
                    y_hat3 = y_hat[:, :, :, 3].transpose(1, 2).unsqueeze(-1)

                elif model.mode == 'MOL':
                    y0 = y0.float()
                    y1 = y1.float()
                    y2 = y2.float()
                    y3 = y3.float()

                # Total loss is the sum of the four per-band losses.
                loss = loss_func(y_hat0, y0) + loss_func(
                    y_hat1, y1) + loss_func(y_hat2, y2) + loss_func(
                        y_hat3, y3)
            ######################### MultiBand-WaveRNN #########################

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                # The norm is moved to the CPU before NumPy inspects it.
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm).cpu()
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched,
                            hp.voc_target, hp.voc_overlap, paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
def train_session(self, model: WaveRNN, optimizer: Optimizer,
                  session: VocSession, train_gta: bool) -> None:
    """Train `model` for one schedule session.

    The number of epochs is derived from the steps remaining until
    `session.max_step` and the length of `session.train_set`. The session's
    learning rate is written into every optimizer param group. Inside the
    batch loop the method periodically generates samples, saves numbered
    checkpoints and writes scalars to `self.writer`; after each epoch it
    evaluates on the validation set and saves `latest_model.pt`.

    Args:
        model: the vocoder being trained.
        optimizer: optimizer stepped once per batch.
        session: holds the datasets, learning rate, batch size and max step.
        train_gta: only shown in the summary table here.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1
    simple_table([(f'Steps ', str(training_steps // 1000) + 'k'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr),
                  ('Sequence Length', self.train_cfg['seq_len']),
                  ('GTA Training', train_gta)])
    for g in optimizer.param_groups:
        g['lr'] = session.lr

    # Running averages for the progress line; both are reset every epoch.
    loss_avg = Averager()
    duration_avg = Averager()
    device = next(
        model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            start = time.time()
            # Switch back to training mode on every batch.
            # NOTE(review): presumably because sample generation/evaluation
            # below may leave the model in eval mode — confirm.
            model.train()
            batch = to_device(batch, device=device)
            x, y = batch['x'], batch['y']
            y_hat = model(x, batch['mel'])
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = batch['y'].float()
            y = y.unsqueeze(-1)

            loss = self.loss_func(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), self.train_cfg['clip_grad_norm'])
            optimizer.step()
            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            # Speed is the inverse of the averaged per-batch duration.
            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % self.train_cfg['gen_samples_every'] == 0:
                stream(msg + 'generating samples...')
                gen_result = self.generate_samples(model, session)
                # generate_samples may return None, in which case nothing
                # is logged or tracked for this step.
                if gen_result is not None:
                    mel_loss, gen_wav = gen_result
                    self.writer.add_scalar('Loss/generated_mel_l1', mel_loss,
                                           model.get_step())
                    self.track_top_models(mel_loss, gen_wav, model)

            if step % self.train_cfg['checkpoint_every'] == 0:
                save_checkpoint(model=model, optim=optimizer,
                                config=self.config,
                                path=self.paths.voc_checkpoints / f'wavernn_step{k}k.pt')

            self.writer.add_scalar('Loss/train', loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs,
                                   model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr,
                                   model.get_step())

            stream(msg)

        # End of epoch: validate, then overwrite the rolling latest checkpoint.
        val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=self.paths.voc_checkpoints / 'latest_model.pt')

        loss_avg.reset()
        duration_avg.reset()
        print(' ')
sample_rate=hp.sample_rate, pad_val=hp.voc_pad_val, mode=hp.voc_mode).cuda() # Check to make sure the hop length is correctly factorised assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) voc_model.restore(paths.voc_latest_weights) optimizer = torch.optim.Adam(voc_model.parameters()) train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta) total_steps = 10_000_000 if force_train else hp.voc_total_steps simple_table([('Remaining', str((total_steps - voc_model.get_step())//1000) + 'k Steps'), ('Batch Size', batch_size), ('Initial learning rate', init_lr), ('Final learnging rate', final_lr), ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)]) loss_func = torch.nn.functional.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss voc_train_loop(voc_model, loss_func, optimizer, train_set, test_set, init_lr, final_lr, total_steps) print('Training Complete.') print('To continue training increase voc_total_steps in hparams.py or use --force_train')
res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, pad_val=hp.voc_pad_val, mode=hp.voc_mode).cuda() paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) restore_path = args.weights if args.weights else paths.voc_latest_weights model.restore(restore_path) model.eval() if hp.amp: model, _ = amp.initialize(model, [], opt_level='O3') simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'), ('Target Samples', target if batched else 'N/A'), ('Overlap Samples', overlap if batched else 'N/A')]) k = model.get_step() // 1000 for file_name in os.listdir(args.dir): if file_name.endswith('.npy'): mel = np.load(os.path.join(args.dir, file_name)) mel = torch.tensor(mel).unsqueeze(0) batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED' save_str = f'{file_name}__{k}k_steps_{batch_str}.wav' model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
# Check to make sure the hop length is correctly factorised assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) voc_model.restore(paths.voc_latest_weights) optimiser = optim.Adam(voc_model.parameters()) train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta) total_steps = 10_000_000 if force_train else hp.voc_total_steps simple_table([ ('Remaining', str( (total_steps - voc_model.get_step()) // 1000) + 'k Steps'), ('Batch Size', batch_size), ('LR', lr), ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta) ]) loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss voc_train_loop(voc_model, loss_func, optimiser, train_set, test_set, lr, total_steps) print('Training Complete.') print( 'To continue training increase voc_total_steps in hparams.py or use --force_train' )
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer, train_set, test_set, lr, total_steps):
    """Train `model` with a per-epoch StepLR decay and TensorBoard logging.

    The learning rate of every optimizer param group is set to `lr`, a
    `StepLR` scheduler (step size 1, gamma 0.983) is attached, and the loop
    runs from the module-level `EPOCH` up to the derived epoch count. The
    per-batch loss is written to TensorBoard; after each epoch the running
    and average losses are written, a checkpoint named after the epoch and
    average loss is saved, and the scheduler is stepped.

    Args:
        paths: project paths used for checkpoints and logs.
        model: the vocoder being trained.
        loss_func: callable taking `(y_hat, y)` and returning the loss.
        optimizer: optimizer whose param groups receive `lr`.
        train_set: sized iterable yielding `(x, y, m)` batches.
        test_set: currently unused; only the disabled evaluation pass
            (kept as comments below) reads it.
        lr: initial learning rate applied to all param groups.
        total_steps: step count used to derive the number of epochs.
    """
    # Use same device as model parameters
    device = next(model.parameters()).device

    # set learning rate
    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1
    total_number_of_batches = len(train_set)

    writer = SummaryWriter("runs/{0}-{1}".format(
        model_name_prefix, datetime.now().strftime("%Y%m%d-%H%M%S")))
    scheduler = StepLR(optimizer, step_size=1, gamma=0.983)

    for e in range(EPOCH, epochs + 1):

        start = time.time()
        running_loss = 0.
        avg_loss = 0

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                # Only the in-place clipping matters; the returned norm is
                # not used.
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)

            optimizer.step()
            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            # Write to tensorboard per batch
            writer.add_scalar('Epoch loss', loss.item(),
                              e * total_number_of_batches + i)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # ----- Disabled per-epoch evaluation pass (kept for reference) -----
        # torch.cuda.empty_cache()
        # loss_test = 0
        # for _, (x_test, y_test, m_test) in enumerate(test_set, 1):
        #     x_test, m_test, y_test = x_test.to(device), m_test.to(device), y_test.to(device)
        #     if device.type == 'cuda' and torch.cuda.device_count() > 1:
        #         raise RuntimeError("Unsupported")
        #     else:
        #         y_test_hat = model(x_test, m_test)
        #     if model.mode == 'RAW':
        #         y_test_hat = y_test_hat.transpose(1, 2).unsqueeze(-1)
        #     elif model.mode == 'MOL':
        #         y_test = y_test.float()
        #     y_test = y_test.unsqueeze(-1)
        #     loss_test += loss_func(y_test_hat, y_test).item()
        # avg_loss_test = loss_test / len(test_set)
        # msg = f'| Epoch: {e}/{epochs} | Test-Loss: {loss_test:.4f} | Test-AvgLoss: {avg_loss_test:.4f} | '
        # stream("\n")
        # stream(msg)
        # writer.add_scalar('Test loss', loss_test, e)
        # writer.add_scalar('Average test loss', avg_loss_test, e)
        # --------------------------------------------------------------------

        # Write to tensorboard per epoch
        writer.add_scalar('Running loss', running_loss, e)
        writer.add_scalar('Average loss', avg_loss, e)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer,
                        name="{0}-epoch-{1}-loss-{2}".format(
                            model_name_prefix, e, avg_loss),
                        is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')

        scheduler.step()
        # Report the learning rate the optimizer will actually use next.
        # Calling scheduler.get_lr() outside of step() is discouraged and,
        # for StepLR, can report a value scaled by an extra gamma factor;
        # the param groups always hold the effective value.
        print('Epoch:', e, 'LR:', [g['lr'] for g in optimizer.param_groups])
def TTS_Wave(self):
    """Synthesize speech for the input text and merge it into one wav file.

    Steps performed:
      1. Extract the pretrained WaveRNN and Tacotron weights into
         ``quick_start/``.
      2. Parse command-line options (generation mode, target/overlap,
         optional input text, CPU override).
      3. Build both models and restore their weights.
      4. For each input line (``--input_text`` or the lines of
         ``final.txt``), generate a mel with Tacotron and a wav with
         WaveRNN at ``./output_audio/<i>.wav``.
      5. From the second utterance on, append each new wav to the running
         result stored at ``./output_audio/<self.path[:-4]>.wav`` and delete
         the per-utterance files.

    With a single input the output stays at ``./output_audio/1.wav``; the
    merged file is only created from the second utterance on.
    """
    os.makedirs('quick_start/tts_weights/', exist_ok=True)
    os.makedirs('quick_start/voc_weights/', exist_ok=True)

    # Context managers close the archives even if extraction fails.
    with zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r') as zip_ref:
        zip_ref.extractall('quick_start/voc_weights/')
    with zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r') as zip_ref:
        zip_ref.extractall('quick_start/tts_weights/')

    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
    parser.add_argument('--input_text', '-i', type=str,
                        help='[string] Type in something here and TTS will generate it!')
    parser.add_argument('--batched', '-b', dest='batched', action='store_true',
                        help='Fast Batched Generation (lower quality)')
    parser.add_argument('--unbatched', '-u', dest='batched', action='store_false',
                        help='Slower Unbatched Generation (better quality)')
    parser.add_argument('--target', '-t', type=int,
                        help='[int] number of samples in each batch index')
    parser.add_argument('--overlap', '-o', type=int,
                        help='[int] number of crossover samples')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.set_defaults(batched=hp.voc_gen_batched)
    parser.set_defaults(target=hp.voc_target)
    parser.set_defaults(overlap=hp.voc_overlap)
    parser.set_defaults(input_text=None)
    parser.set_defaults(weights_path=None)
    args = parser.parse_args()

    batched = args.batched
    target = args.target
    overlap = args.overlap
    input_text = args.input_text

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_device(0)
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising WaveRNN Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode='MOL').to(device)

    voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).to(device)

    tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('final.txt') as f:
            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    r = tts_model.get_r()

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):

        # Real f-string so the counters are interpolated instead of the
        # template text being printed literally.
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)

        save_path = './output_audio/' + str(i) + '.wav'

        # save_attention(attention, save_path)

        m = torch.tensor(m).unsqueeze(0)
        # NOTE(review): rescales the Tacotron output before vocoding;
        # presumably maps a [-4, 4] range onto [0, 1] — confirm.
        m = (m + 4) / 8

        # Use the parsed --target/--overlap values (they default to the
        # hparams values) so the CLI flags and the summary table agree with
        # what is actually generated.
        voc_model.generate(m, save_path, batched, target, overlap, hp.mu_law)

        if i >= 2:
            merged_path = "./output_audio/" + self.path[:-4] + ".wav"
            # The second utterance is joined to the first per-utterance
            # file; later ones are appended to the running merged file.
            if i == 2:
                previous_path = "./output_audio/" + str(i - 1) + ".wav"
            else:
                previous_path = merged_path

            previous_audio = AudioSegment.from_wav(previous_path)
            current_audio = AudioSegment.from_wav(save_path)
            combined_sounds = previous_audio + current_audio

            os.remove(previous_path)
            os.remove(save_path)
            combined_sounds.export(merged_path, format="wav")

    print("Done")