def __init__(self, data_path, voc_id, tts_id):
    """Compute every data, checkpoint and output path used by the project.

    Args:
        data_path: Root directory of the dataset. `~` is expanded and the
            path is resolved.
        voc_id: Identifier of the WaveRNN/vocoder model; used to name its
            checkpoint and output directories (`<voc_id>.wavernn`).
        tts_id: Identifier of the Tacotron/TTS model; used to name its
            checkpoint and output directories (`<tts_id>.tacotron`).

    After all attributes are set, `self.create_paths()` is called.
    """
    # Project root: two directories above the file containing this class.
    self.base = Path(__file__).parent.parent.expanduser().resolve()

    # Data Paths
    self.data = Path(data_path).expanduser().resolve()
    self.quant = self.data / 'quant'
    self.mel = self.data / 'mel'
    self.gta = self.data / 'gta'

    # NOTE: `/` and `%` have the same precedence and associate left-to-right,
    # so the format expression must be parenthesised; otherwise `%` would be
    # applied to the already-joined Path and raise TypeError.

    # WaveRNN/Vocoder Paths
    self.voc_checkpoints = self.base / 'checkpoints' / ('%s.wavernn' % (repr1(voc_id)))
    self.voc_latest_weights = self.voc_checkpoints / 'latest_weights.pyt'
    self.voc_latest_optim = self.voc_checkpoints / 'latest_optim.pyt'
    self.voc_output = self.base / 'model_outputs' / ('%s.wavernn' % (repr1(voc_id)))
    self.voc_step = self.voc_checkpoints / 'step.npy'
    self.voc_log = self.voc_checkpoints / 'log.txt'

    # Tacotron/TTS Paths
    self.tts_checkpoints = self.base / 'checkpoints' / ('%s.tacotron' % (repr1(tts_id)))
    self.tts_latest_weights = self.tts_checkpoints / 'latest_weights.pyt'
    self.tts_latest_optim = self.tts_checkpoints / 'latest_optim.pyt'
    self.tts_output = self.base / 'model_outputs' / ('%s.tacotron' % (repr1(tts_id)))
    self.tts_step = self.tts_checkpoints / 'step.npy'
    self.tts_log = self.tts_checkpoints / 'log.txt'
    self.tts_attention = self.tts_checkpoints / 'attention'
    self.tts_mel_plot = self.tts_checkpoints / 'mel_plots'

    self.create_paths()
def create_gta_features(model: Tacotron, train_set, save_path: Path):
    """Run `model` over `train_set` and save one `.npy` file per item.

    Args:
        model: Model called as `model(x, mels)`; the second of its three
            outputs is the one that gets saved.
        train_set: Iterable of `(x, mels, ids, mel_lens)` batches that also
            supports `len()`.
        save_path: Directory in which `<item_id>.npy` files are written.
    """
    device = next(model.parameters()).device  # use same device as model parameters

    iters = len(train_set)

    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):

        x, mels = x.to(device), mels.to(device)

        # Inference only, so no gradients are needed.
        with torch.no_grad():
            _, gta, _ = model(x, mels)

        gta = gta.cpu().numpy()

        for j, item_id in enumerate(ids):
            # Trim the batch padding back to this item's own length.
            mel = gta[j][:, :mel_lens[j]]
            # Affine rescale; presumably maps [-4, 4] onto [0, 1] -- TODO confirm
            # against the model's output range.
            mel = (mel + 4) / 8
            # The format expression is parenthesised because `/` and `%` share
            # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
            np.save(save_path / ('%s.npy' % (repr1(item_id))), mel, allow_pickle=False)

        bar = progbar(i, iters)
        msg = '%s %s/%s Batches ' % (repr1(bar), repr1(i), repr1(iters))
        stream(msg)
def save_checkpoint(checkpoint_type: str, paths: Paths, model, optimizer, *,
                    name=None, is_silent=False):
    """Saves the training session to disk.

    Args:
        checkpoint_type: Passed to `get_checkpoint_paths` to select which
            set of paths to use (callers in this file pass 'tts' or 'voc').
        paths: Provides information about the different paths to use.
        model: A `Tacotron` or `WaveRNN` model to save the parameters and
            buffers from.
        optimizer: An optimizer to save the state of (momentum, etc).
        name: If provided, additionally saves a checkpoint with the given
            name. Note that regardless of whether this is provided or not,
            this function will always update the files specified in `paths`
            that give the location of the latest weights and optimizer
            state. Saving a named checkpoint happens in addition to this
            update.
        is_silent: If `True`, suppresses the progress messages.

    Raises:
        FileNotFoundError: If exactly one of the two files of a checkpoint
            (weights / optimizer state) already exists.
    """
    def helper(path_dict, is_named):
        s = 'named' if is_named else 'latest'
        num_exist = sum(p.exists() for p in path_dict.values())

        if num_exist not in (0, 2):
            # Checkpoint broken
            raise FileNotFoundError(
                'We expected either both or no files in the %s checkpoint to exist, but instead we got exactly one!'
                % (repr1(s)))

        if num_exist == 0:
            if not is_silent:
                print('Creating %s checkpoint...' % (repr1(s)))
            for p in path_dict.values():
                p.parent.mkdir(parents=True, exist_ok=True)
        else:
            if not is_silent:
                print('Saving to existing %s checkpoint...' % (repr1(s)))

        if not is_silent:
            print('Saving %s weights: %s' % (repr1(s), repr1(path_dict["w"])))
        model.save(path_dict['w'])
        if not is_silent:
            print('Saving %s optimizer state: %s' % (repr1(s), repr1(path_dict["o"])))
        torch.save(optimizer.state_dict(), path_dict['o'])

    weights_path, optim_path, checkpoint_path = \
        get_checkpoint_paths(checkpoint_type, paths)

    latest_paths = {'w': weights_path, 'o': optim_path}
    helper(latest_paths, False)

    if name:
        # The format expression is parenthesised because `/` and `%` share
        # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
        named_paths = {
            'w': checkpoint_path / ('%s_weights.pyt' % (repr1(name))),
            'o': checkpoint_path / ('%s_optim.pyt' % (repr1(name))),
        }
        helper(named_paths, True)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):
    """Generate audio for the first `samples` items of `test_set`.

    For each item the decoded target waveform is written as
    `<k>k_steps_<i>_target.wav` and the model's output is generated to
    `<k>k_steps_<i>_<batch_str>.wav`, where `k` is the model step // 1000.

    Args:
        model: Vocoder used for generation via `model.generate`.
        test_set: Iterable of `(m, x)` pairs.
        samples: Maximum number of items to generate.
        batched: Forwarded to `model.generate`; also selects the file suffix.
        target: Forwarded to `model.generate` (used in the batched suffix).
        overlap: Forwarded to `model.generate` (used in the batched suffix).
        save_path: Directory in which the wav files are written.
    """
    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples:
            break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        # File names are formatted inside parentheses: `/` and `%` share
        # precedence, so without them `%` is applied to a Path -> TypeError.
        save_wav(x, save_path / ('%sk_steps_%s_target.wav' % (repr1(k), repr1(i))))

        batch_str = 'gen_batched_target%s_overlap%s' % (
            repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / ('%sk_steps_%s_%s.wav' % (
            repr1(k), repr1(i), repr1(batch_str))))

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
def helper(path_dict, is_named):
    """Write the weights and optimizer state to the files in `path_dict`.

    `path_dict` maps 'w' to the weights file and 'o' to the optimizer-state
    file. Parent directories are created when neither file exists yet.
    Raises `FileNotFoundError` when the number of already-existing files is
    neither zero nor two. `is_silent`, `model` and `optimizer` come from the
    enclosing scope.
    """
    label = 'named' if is_named else 'latest'
    verbose = not is_silent
    targets = list(path_dict.values())
    found = sum(1 for target in targets if target.exists())

    # A half-written checkpoint is treated as broken.
    if found != 0 and found != 2:
        raise FileNotFoundError(
            'We expected either both or no files in the %s checkpoint to exist, but instead we got exactly one!'
            % (repr1(label)))

    if found:
        if verbose:
            print('Saving to existing %s checkpoint...' % (repr1(label)))
    else:
        if verbose:
            print('Creating %s checkpoint...' % (repr1(label)))
        for target in targets:
            target.parent.mkdir(parents=True, exist_ok=True)

    if verbose:
        print('Saving %s weights: %s' % (repr1(label), repr1(path_dict["w"])))
    model.save(path_dict['w'])

    if verbose:
        print('Saving %s optimizer state: %s' % (repr1(label), repr1(path_dict["o"])))
    torch.save(optimizer.state_dict(), path_dict['o'])
def restore_checkpoint(checkpoint_type: str, paths: Paths, model, optimizer, *,
                       name=None, create_if_missing=False):
    """Restores from a training session saved to disk.

    NOTE: The optimizer's state is placed on the same device as its model
    parameters. Therefore, be sure you have done `model.to(device)` before
    calling this method.

    Args:
        checkpoint_type: Passed to `get_checkpoint_paths` to select which
            set of paths to use (callers in this file pass 'tts').
        paths: Provides information about the different paths to use.
        model: A `Tacotron` or `WaveRNN` model to load the parameters and
            buffers into.
        optimizer: An optimizer to load the state of (momentum, etc).
        name: If provided, will restore from a checkpoint with the given
            name. Otherwise, will restore from the latest weights and
            optimizer state as specified in `paths`.
        create_if_missing: If `True`, will create the checkpoint if it
            doesn't yet exist, as well as update the files specified in
            `paths` that give the location of the current latest weights and
            optimizer state. If `False` and the checkpoint doesn't exist,
            will raise a `FileNotFoundError`.

    Raises:
        FileNotFoundError: If the two checkpoint files are not both present
            and `create_if_missing` is `False`. (`save_checkpoint` also
            raises it when exactly one of the two files exists.)
    """
    weights_path, optim_path, checkpoint_path = \
        get_checkpoint_paths(checkpoint_type, paths)

    if name:
        # The format expression is parenthesised because `/` and `%` share
        # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
        path_dict = {
            'w': checkpoint_path / ('%s_weights.pyt' % (repr1(name))),
            'o': checkpoint_path / ('%s_optim.pyt' % (repr1(name))),
        }
        s = 'named'
    else:
        path_dict = {'w': weights_path, 'o': optim_path}
        s = 'latest'

    num_exist = sum(p.exists() for p in path_dict.values())
    if num_exist == 2:
        # Checkpoint exists
        print('Restoring from %s checkpoint...' % (repr1(s)))
        print('Loading %s weights: %s' % (repr1(s), repr1(path_dict["w"])))
        model.load(path_dict['w'])
        # Previously this message printed the literal text `{path_dict["o"]}`
        # instead of the path; it now mirrors the weights message above.
        print('Loading %s optimizer state: %s' % (repr1(s), repr1(path_dict["o"])))
        optimizer.load_state_dict(torch.load(path_dict['o']))
    elif create_if_missing:
        save_checkpoint(checkpoint_type, paths, model, optimizer,
                        name=name, is_silent=False)
    else:
        raise FileNotFoundError('The %s checkpoint could not be found!' % (repr1(s)))
def main():
    """Command-line entry point for Tacotron training and GTA feature creation.

    Parses the CLI flags, loads the hyperparameters, picks the device,
    builds the model and optimizer, restores (or creates) the latest 'tts'
    checkpoint, runs the sessions of `hp.tts_schedule` unless `--force_gta`
    is given, and finally writes the ground-truth-aligned features.
    """
    # Parse Arguments
    arg_parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    arg_parser.add_argument('--force_train', '-f', action='store_true',
                            help='Forces the model to train past total steps')
    arg_parser.add_argument('--force_gta', '-g', action='store_true',
                            help='Force the model to create GTA features')
    arg_parser.add_argument('--force_cpu', '-c', action='store_true',
                            help='Forces CPU-only training, even when in CUDA capable environment')
    arg_parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                            help='The file to use for the hyperparameters')
    args = arg_parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    # Device selection; on CUDA every session's batch size must split evenly
    # across the visible GPUs.
    if args.force_cpu or not torch.cuda.is_available():
        device = torch.device('cpu')
    else:
        device = torch.device('cuda')
        for _, _, _, batch_size in hp.tts_schedule:
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model_kwargs = dict(
        embed_dims=hp.tts_embed_dims,
        num_chars=len(symbols),
        encoder_dims=hp.tts_encoder_dims,
        decoder_dims=hp.tts_decoder_dims,
        n_mels=hp.num_mels,
        fft_bins=hp.num_mels,
        postnet_dims=hp.tts_postnet_dims,
        encoder_K=hp.tts_encoder_K,
        lstm_dims=hp.tts_lstm_dims,
        postnet_K=hp.tts_postnet_K,
        num_highways=hp.tts_num_highways,
        dropout=hp.tts_dropout,
        stop_threshold=hp.tts_stop_threshold,
    )
    model = Tacotron(**model_kwargs).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        last_index = len(hp.tts_schedule) - 1

        for index, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()
            r, lr, max_step, batch_size = session
            training_steps = max_step - current_step

            # Has this session already been completed?
            if current_step >= max_step:
                if index != last_index:
                    # There is a following session, go to it
                    continue
                if not force_train:
                    # No more sessions and no forced training: we are done.
                    break
                # Last session with forced training - train forever
                training_steps = 999_999_999

            model.r = r

            table_rows = [
                ('Steps with r=%s' % (repr1(r)), str(training_steps // 1000) + 'k Steps'),
                ('Batch Size', batch_size),
                ('Learning Rate', lr),
                ('Outputs/Step (r)', model.r),
            ]
            simple_table(table_rows)

            train_set, attn_example = get_tts_datasets(paths.data, batch_size, r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    # GTA features are produced with a fixed batch size of 8.
    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    """Train `model` on `train_set` for roughly `train_steps` steps.

    The number of epochs is `train_steps // len(train_set) + 1`, so the loop
    always runs whole epochs and may overshoot `train_steps`.

    Args:
        paths: Supplies the checkpoint, log, attention and mel-plot paths.
        model: The Tacotron model being trained.
        optimizer: Optimizer whose learning rate is overwritten with `lr`.
        train_set: Iterable of `(x, m, ids, _)` batches supporting `len()`.
        lr: Learning rate applied to every parameter group.
        train_steps: Number of steps used to derive the epoch count.
        attn_example: Item id; whenever it appears in a batch, its attention
            and predicted spectrogram are plotted.
    """
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                # clip_grad_norm_ may return a tensor that lives on the GPU,
                # which NumPy cannot convert; float() works for both a tensor
                # and a plain Python float.
                if np.isnan(float(grad_norm)):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = 'taco_step%sK' % (repr1(k))
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                # File names are formatted inside parentheses: `/` and `%`
                # share precedence, so without them `%` is applied to a Path
                # and raises TypeError.
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / ('%s' % (repr1(step))))
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / ('%s' % (repr1(step))), 600)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters),
                avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def __getitem__(self, index):
    """Load the two arrays stored for the item at `index`.

    Args:
        index: Position in `self.metadata` of the item id to load.

    Returns:
        A tuple `(m, x)`: the array loaded from `self.mel_path` and the
        array loaded from `self.quant_path`, both from `<item_id>.npy`.
    """
    item_id = self.metadata[index]
    # The format expression is parenthesised because `/` and `%` share
    # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
    m = np.load(self.mel_path / ('%s.npy' % (repr1(item_id))))
    x = np.load(self.quant_path / ('%s.npy' % (repr1(item_id))))
    return m, x
def __getitem__(self, index):
    """Build one training example for the item at `index`.

    Args:
        index: Position in `self.metadata` of the item id to load.

    Returns:
        A tuple `(x, mel, item_id, mel_len)`: the text converted by
        `text_to_sequence`, the array loaded from `mel/<item_id>.npy` under
        `self.path`, the item id, and the size of the mel's last axis.
    """
    item_id = self.metadata[index]
    x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
    # The format expression is parenthesised because `/` and `%` share
    # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
    mel = np.load(self.path / 'mel' / ('%s.npy' % (repr1(item_id))))
    mel_len = mel.shape[-1]
    return x, mel, item_id, mel_len
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):
    """Generate audio with `model` from a `.wav` or `.npy` input file.

    A `.wav` input is loaded, re-saved as the target, and converted with
    `melspectrogram`. A `.npy` input is loaded directly and validated.

    Args:
        model: Vocoder used for generation via `model.generate`.
        load_path: Input file; its suffix must be `.wav` or `.npy`.
        save_path: Directory in which output wav files are written.
        batched: Forwarded to `model.generate`; also selects the file suffix.
        target: Forwarded to `model.generate` (used in the batched suffix).
        overlap: Forwarded to `model.generate` (used in the batched suffix).

    Raises:
        ValueError: If the suffix is unsupported, or a `.npy` input is not a
            2-D array with `hp.num_mels` rows, or its values fall outside
            the accepted range around [0, 1].
    """
    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        # File names are formatted inside parentheses: `/` and `%` share
        # precedence, so without them `%` is applied to a Path -> TypeError.
        save_wav(wav, save_path / ('__%s__%sk_steps_target.wav' % (
            repr1(file_name), repr1(k))))
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            # Report the offending array's shape; `wav` does not exist in
            # this branch, so referencing it would mask the real error.
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!'
                % (repr1(mel.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        # A small tolerance is allowed on either side of [0, 1].
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0,1] but was instead [%s, %s]'
                % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!'
                         % (repr1(suffix)))

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = 'gen_batched_target%s_overlap%s' % (
        repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
    save_str = save_path / ('__%s__%sk_steps_%s.wav' % (
        repr1(file_name), repr1(k), repr1(batch_str)))

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def get_voc_named_optim(self, name):
    """Gets the path for the optimizer state in a named voc checkpoint."""
    # The format expression is parenthesised because `/` and `%` share
    # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
    return self.voc_checkpoints / ('%s_optim.pyt' % (repr1(name)))
def get_voc_named_weights(self, name):
    """Gets the path for the weights in a named voc checkpoint."""
    # The format expression is parenthesised because `/` and `%` share
    # precedence; unparenthesised, `%` is applied to a Path -> TypeError.
    return self.voc_checkpoints / ('%s_weights.pyt' % (repr1(name)))
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    """Train the vocoder until roughly `total_steps` model steps are reached.

    The number of epochs is
    `(total_steps - model.get_step()) // len(train_set) + 1`, so the loop
    always runs whole epochs and may overshoot `total_steps`.

    Args:
        paths: Supplies the checkpoint, log and output paths.
        model: The WaveRNN model being trained.
        loss_func: Callable invoked as `loss_func(y_hat, y)`.
        optimizer: Optimizer whose learning rate is overwritten with `lr`.
        train_set: Iterable of `(x, y, m)` batches supporting `len()`.
        test_set: Passed to `gen_testset` at each named checkpoint.
        lr: Learning rate applied to every parameter group.
        total_steps: Target model step count used to derive the epoch count.
    """
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                # clip_grad_norm_ may return a tensor that lives on the GPU,
                # which NumPy cannot convert; float() works for both a tensor
                # and a plain Python float.
                if np.isnan(float(grad_norm)):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = 'wave_step%sK' % (repr1(k))
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.1f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters),
                avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
def get_files(path: Union[str, Path], extension='.wav'):
    """Recursively collect the files under `path` matching `extension`.

    Args:
        path: Directory to search. A `str` is expanded (`~`) and resolved;
            any other value is used as given.
        extension: Suffix appended to `*` to form the glob pattern.

    Returns:
        A list with every match yielded by `rglob` on the search root.
    """
    if isinstance(path, str):
        root = Path(path).expanduser().resolve()
    else:
        root = path

    pattern = '*%s' % (repr1(extension))
    matches = root.rglob(pattern)
    return list(matches)
if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f ] voc_k = voc_model.get_step() // 1000 tts_k = tts_model.get_step() // 1000 r = tts_model.r simple_table([('WaveRNN', str(voc_k) + 'k'), ('Tacotron(r=%s)' % (repr1(r)), str(tts_k) + 'k'), ('Generation Mode', 'Batched' if batched else 'Unbatched'), ('Target Samples', target if batched else 'N/A'), ('Overlap Samples', overlap if batched else 'N/A')]) for i, x in enumerate(inputs, 1): print('\n| Generating %s/%s' % (repr1(i), repr1(len(inputs)))) _, m, attention = tts_model.generate(x) if input_text: save_path = 'quick_start/__input_%s_%sk.wav' % (repr1( input_text[:10]), repr1(tts_k)) else: save_path = 'quick_start/%s_batched%s_%sk.wav' % ( repr1(i), repr1(str(batched)), repr1(tts_k))