class _SessionCollate:
    """Picklable collate callable bound to one training session.

    DataLoader worker processes that are started with the ``spawn`` method
    must pickle ``collate_fn``. A lambda created inside ``train`` cannot be
    pickled, so the session's ``r`` and ``hparams`` are stored on a
    module-level object instead. The call made is the same positional call
    the lambda used to make.
    """

    def __init__(self, r, hparams):
        self.r = r
        self.hparams = hparams

    def __call__(self, batch):
        return collate_synthesizer(batch, self.r, self.hparams)


def train(run_id: str, metadata_fpath: str, models_dir: str, save_every: int,
          backup_every: int, force_restart: bool, hparams):
    """
    Trains a Tacotron model following the sessions listed in
    hparams.tts_schedule, saving checkpoints and evaluation samples under
    <models_dir>/<run_id>.

    :param run_id: name of the run; used for the run directory and for the
    checkpoint file names
    :param metadata_fpath: path of the training metadata, handed to
    SynthesizerDataset
    :param models_dir: directory under which the run directory is created
    :param save_every: overwrite the main checkpoint every this many steps
    (0 disables periodic saving)
    :param backup_every: write a separate "<run_id>_<k>k.pt" backup every this
    many steps (0 disables backups)
    :param force_restart: if True, start from scratch even when a checkpoint
    already exists
    :param hparams: hyperparameters object. Each entry of hparams.tts_schedule
    is unpacked as (r, lr, max_step, batch_size)
    :raises ValueError: if CUDA is available and a session's batch size is not
    divisible by the number of visible GPUs
    """
    models_dir = Path(models_dir)
    # parents=True so a nested, not yet existing models_dir does not fail
    models_dir.mkdir(parents=True, exist_ok=True)

    model_dir = models_dir.joinpath(run_id)
    plot_dir = model_dir.joinpath("plots")
    wav_dir = model_dir.joinpath("wavs")
    mel_output_dir = model_dir.joinpath("mel-spectrograms")
    meta_folder = model_dir.joinpath("metas")
    for directory in (model_dir, plot_dir, wav_dir, mel_output_dir, meta_folder):
        directory.mkdir(exist_ok=True)

    # NOTE(review): with_suffix() replaces whatever follows the last dot of
    # run_id (e.g. "v1.5" -> "v1.pt"), unlike the backup file names below.
    # Left unchanged so that existing checkpoints are still found.
    weights_fpath = model_dir.joinpath(run_id).with_suffix(".pt")

    print("Checkpoint path: {}".format(weights_fpath))
    print("Loading training data from: {}".format(metadata_fpath))
    print("Using model: Tacotron")

    # Book keeping
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)

    # From WaveRNN/train_tacotron.py
    if torch.cuda.is_available():
        device = torch.device("cuda")

        # Every session's batch must split evenly across the GPUs
        for session in hparams.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError("`batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Using device:", device)

    # Instantiate Tacotron Model
    print("\nInitialising Tacotron Model...\n")
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=hparams.tts_dropout,
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())

    # Load the weights
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of Tacotron from scratch\n")
        model.save(weights_fpath)

        # Embeddings metadata
        char_embedding_fpath = meta_folder.joinpath("CharacterEmbeddings.tsv")
        with open(char_embedding_fpath, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("Tacotron weights loaded from step %d" % model.step)

    # Initialize the dataset
    dataset = SynthesizerDataset(metadata_fpath, hparams)

    schedule = hparams.tts_schedule
    last_session_idx = len(schedule) - 1
    for session_idx, session in enumerate(schedule):
        current_step = model.get_step()

        r, lr, max_step, batch_size = session

        training_steps = max_step - current_step

        # Do we need to change to the next session?
        if current_step >= max_step:
            # Are there no further sessions than the current one?
            if session_idx == last_session_idx:
                # We have completed training. Save the model and exit
                model.save(weights_fpath, optimizer)
                break
            # There is a following session, go to it
            continue

        model.r = r

        # Begin the training
        simple_table([(f"Steps with r={r}", str(training_steps // 1000) + "k Steps"),
                      ("Batch Size", batch_size),
                      ("Learning Rate", lr),
                      ("Outputs/Step (r)", model.r)])

        for p in optimizer.param_groups:
            p["lr"] = lr

        data_loader = DataLoader(dataset,
                                 collate_fn=_SessionCollate(r, hparams),
                                 batch_size=batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)

        total_iters = len(dataset)
        steps_per_epoch = np.ceil(total_iters / batch_size).astype(np.int32)
        epochs = np.ceil(training_steps / steps_per_epoch).astype(np.int32)

        for epoch in range(1, epochs + 1):
            for batch_idx, (texts, mels, embeds, idx) in enumerate(data_loader, 1):
                start_time = time.time()
                start = time.perf_counter()

                # Generate stop tokens for training: 1 everywhere except the
                # frames before the utterance's last one, whose count is read
                # from column 3 of the sample's metadata row
                stop = torch.ones(mels.shape[0], mels.shape[2])
                for row, meta_idx in enumerate(idx):
                    stop[row, :int(dataset.metadata[meta_idx][3]) - 1] = 0

                texts = texts.to(device)
                mels = mels.to(device)
                embeds = embeds.to(device)
                stop = stop.to(device)

                # Forward pass
                # Parallelize model onto GPUS using workaround due to python bug
                if device.type == "cuda" and torch.cuda.device_count() > 1:
                    m1_hat, m2_hat, attention, stop_pred = data_parallel_workaround(
                        model, texts, mels, embeds)
                else:
                    m1_hat, m2_hat, attention, stop_pred = model(texts, mels, embeds)

                # Backward pass
                m1_loss = F.mse_loss(m1_hat, mels) + F.l1_loss(m1_hat, mels)
                m2_loss = F.mse_loss(m2_hat, mels)
                stop_loss = F.binary_cross_entropy(stop_pred, stop)

                loss = m1_loss + m2_loss + stop_loss

                optimizer.zero_grad()
                loss.backward()

                # Gradient clipping is always applied
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           hparams.tts_clip_grad_norm)
                if np.isnan(grad_norm.cpu()):
                    print("grad_norm was NaN!")

                optimizer.step()

                time_window.append(time.time() - start_time)
                loss_window.append(loss.item())

                step = model.get_step()
                step_k = step // 1000

                msg = f"| Epoch: {epoch}/{epochs} ({batch_idx}/{steps_per_epoch}) | Loss: {loss_window.average:#.4} | {1./time_window.average:#.2} steps/s | Step: {step_k}k | "
                stream(msg)

                if step % 10 == 0:
                    good_logger.log_training(reduced_loss=loss.item(),
                                             reduced_mel_loss=loss.item() - stop_loss.item(),
                                             reduced_gate_loss=stop_loss.item(),
                                             grad_norm=grad_norm,
                                             learning_rate=optimizer.param_groups[0]['lr'],
                                             duration=time.perf_counter() - start,
                                             iteration=step)

                # Backup or save model as appropriate
                if backup_every != 0 and step % backup_every == 0:
                    backup_fpath = Path("{}/{}_{}k.pt".format(str(weights_fpath.parent), run_id, step_k))
                    model.save(backup_fpath, optimizer)

                if save_every != 0 and step % save_every == 0:
                    # Must save latest optimizer state to ensure that resuming training
                    # doesn't produce artifacts
                    model.save(weights_fpath, optimizer)

                # Evaluate model to generate samples
                epoch_eval = hparams.tts_eval_interval == -1 and batch_idx == steps_per_epoch  # If epoch is done
                step_eval = hparams.tts_eval_interval > 0 and step % hparams.tts_eval_interval == 0  # Every N steps
                if epoch_eval or step_eval:
                    for sample_idx in range(hparams.tts_eval_num_samples):
                        # At most, generate samples equal to number in the batch
                        if sample_idx + 1 <= len(texts):
                            # Remove padding from mels using frame length in metadata
                            mel_length = int(dataset.metadata[idx[sample_idx]][3])
                            mel_prediction = np_now(m2_hat[sample_idx]).T[:mel_length]
                            target_spectrogram = np_now(mels[sample_idx]).T[:mel_length]
                            attention_len = mel_length // model.r

                            eval_model(attention=np_now(attention[sample_idx][:, :attention_len]),
                                       mel_prediction=mel_prediction,
                                       target_spectrogram=target_spectrogram,
                                       input_seq=np_now(texts[sample_idx]),
                                       step=step,
                                       plot_dir=plot_dir,
                                       mel_output_dir=mel_output_dir,
                                       wav_dir=wav_dir,
                                       sample_num=sample_idx + 1,
                                       loss=loss,
                                       hparams=hparams)

                # Break out of loop to update training schedule
                if step >= max_step:
                    break

            # Add line break after every epoch
            print("")

        # The schedule loop ends right after the last session, so without this
        # the steps taken since the last periodic save would never be written.
        if session_idx == last_session_idx:
            model.save(weights_fpath, optimizer)
class Synthesizer:
    """
    Lazy-loading wrapper around a trained Tacotron model that turns texts and
    speaker embeddings into mel spectrograms, plus audio helpers that use the
    module-level hparams.
    """
    sample_rate = hparams.sample_rate
    hparams = hparams

    def __init__(self, model_fpath: Path, verbose=True):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param model_fpath: path to the trained model file
        :param verbose: if False, prints less information when using the model
        """
        self.model_fpath = model_fpath
        self.verbose = verbose

        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        if self.verbose:
            print("Synthesizer using device:", self.device)

        # Tacotron model will be instantiated later on first use.
        self._model = None

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
                               num_chars=len(symbols),
                               encoder_dims=hparams.tts_encoder_dims,
                               decoder_dims=hparams.tts_decoder_dims,
                               n_mels=hparams.num_mels,
                               fft_bins=hparams.num_mels,
                               postnet_dims=hparams.tts_postnet_dims,
                               encoder_K=hparams.tts_encoder_K,
                               lstm_dims=hparams.tts_lstm_dims,
                               postnet_K=hparams.tts_postnet_K,
                               num_highways=hparams.tts_num_highways,
                               dropout=hparams.tts_dropout,
                               stop_threshold=hparams.tts_stop_threshold,
                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)

        self._model.load(self.model_fpath)
        self._model.eval()

        if self.verbose:
            print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256). A 2D
        array is treated as one embedding per row; any other non-list value is treated as a
        single embedding
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments. The alignments returned
        are those of the last processed batch only (None if texts is empty).
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

            # Print some info about the model when it is loaded
            tts_k = self._model.get_step() // 1000

            simple_table([("Tacotron", str(tts_k) + "k"),
                          ("r", self._model.r)])

        # Preprocess text inputs
        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
        if isinstance(embeddings, np.ndarray) and embeddings.ndim == 2:
            # Split an (N, D) array into N rows. Wrapping it whole in a list would be stacked
            # below into a (1, N, D) array instead of a batch of N embeddings.
            embeddings = list(embeddings)
        elif not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batch_size = hparams.synthesis_batch_size
        batched_inputs = [inputs[i:i + batch_size]
                          for i in range(0, len(inputs), batch_size)]
        batched_embeds = [embeddings[i:i + batch_size]
                          for i in range(0, len(embeddings), batch_size)]

        specs = []
        # Stays None when there is nothing to synthesize, so the return below cannot hit an
        # unbound name.
        alignments = None
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            max_text_len = max(len(text) for text in batch)
            chars = np.stack([pad1d(text, max_text_len) for text in batch])

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference; no gradients are needed, so don't record them
            with torch.no_grad():
                _, mels, alignments = self._model.generate(chars, speaker_embeddings)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram. At least one frame is kept so
                # that a fully silent output does not index into an empty array.
                while m.shape[1] > 1 and np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.
        """
        # sr is passed by keyword: recent librosa releases no longer accept it positionally
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            peak = np.abs(wav).max() if wav.size else 0
            # An empty or all-zero signal has nothing to rescale; dividing would yield NaNs
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.
        """
        if isinstance(fpath_or_wav, (str, Path)):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)