def evaluation(hp: HParams, tier: int, dataloader: torch.utils.data.DataLoader,
               model: Union[Tier1, Tier], criterion: GMMLoss,
               logger: logging.Logger) -> float:
    """
    Evaluates the model (tier) with respect to the data according to the criterion and logs it.

    The model is switched to evaluation mode for the duration of the call and is always switched
    back to training mode before returning, even when the evaluation raises an exception.

    Args:
        hp (HParams): hyperparameters for the model and other parameters (training, dataset, ...).
        tier (int): number of the tier (the model).
        dataloader (torch.utils.data.DataLoader): dataset enclosed as a DataLoader following
                                                  PyTorch guidelines.
        model (Tier): individual tier that will be evaluated.
        criterion (GMMLoss): function to compute the loss of the model.
        logger (logging.Logger): to log general information about the evaluation.

    Returns:
        avg_loss_of_frame (float): the average loss of a frame of a spectrogram, computed as the
                                   mean over batches of (batch loss / frames of the tier input).
                                   NaN when the dataloader yields no batches.
    """
    model.eval()
    loss_of_sample = []
    length_of_sample = []

    try:
        for waveform, utterance in dataloader:
            # 1.1 Transform waveform input to melspectrogram and apply preprocessing to normalize
            waveform = waveform.to(device=hp.device, non_blocking=True)
            spectrogram = transforms.wave_to_melspectrogram(waveform, hp)
            spectrogram = audio_normalizing.preprocessing(spectrogram, hp)

            # 1.2 Get input and output from the original spectrogram for this tier
            input_spectrogram, output_spectrogram = tierutil.split(spectrogram=spectrogram,
                                                                   tier=tier,
                                                                   n_tiers=hp.network.n_tiers)

            with torch.no_grad():
                # 2. Compute the model output
                if tier == 1:
                    # generation is unconditional so there is only one input
                    mu_hat, std_hat, pi_hat = model(spectrogram=input_spectrogram)
                else:
                    # generation is conditional on the spectrogram generated by previous tiers
                    mu_hat, std_hat, pi_hat = model(spectrogram=output_spectrogram,
                                                    spectrogram_prev_tier=input_spectrogram)

                # 3. Calculate the loss
                loss = criterion(mu=mu_hat, std=std_hat, pi=pi_hat, target=output_spectrogram)

            loss_of_sample.append(loss.item())
            length_of_sample.append(input_spectrogram.size(2))  # get FRAMES of input
    finally:
        # Never leave the caller's model stuck in evaluation mode, even if a batch failed.
        model.train()

    if not loss_of_sample:
        # Nothing was evaluated: the mean is undefined, so report NaN instead of crashing with a
        # ZeroDivisionError in the middle of a training run.
        logger.warning("Evaluation - Dataloader yielded no batches, average loss is undefined")
        return float("nan")

    total_loss = sum(loss_of_sample)
    total_length = sum(length_of_sample)
    avg_loss_of_frame_sample = [loss / float(length)
                                for loss, length in zip(loss_of_sample, length_of_sample)]
    avg_loss_of_frame = sum(avg_loss_of_frame_sample) / len(avg_loss_of_frame_sample)

    logger.info(f"Evaluation - Total loss: {total_loss} Total length: {total_length} "
                f"Avg loss of frame: {avg_loss_of_frame}")

    return avg_loss_of_frame
def train_tier(args: argparse.Namespace, hp: HParams, tier: int, extension_architecture: str,
               timestamp: str, tensorboardwriter: TensorboardWriter,
               logger: logging.Logger) -> None:
    """
    Trains one tier of MelNet.

    Builds the tier, optionally resumes it from a checkpoint, and runs the training loop with
    gradient accumulation, gradient clipping, periodic checkpointing, logging and evaluation.
    A final checkpoint is always written once all epochs are done.

    Args:
        args (argparse.Namespace): parameters to set up the training. At least, args must contain:
                                   args = {"path_config": ...,
                                           "tier": ...,
                                           "checkpoint_path": ...}
        hp (HParams): hyperparameters for the model and other parameters (training, dataset, ...)
        tier (int): number of the tier to train.
        extension_architecture (str): information about the network's architecture of this run
                                      (training) to identify the logs and weights of the model.
                                      Not used inside this function.
        timestamp (str): information that identifies completely this run (training).
        tensorboardwriter (TensorboardWriter): to log information about training to tensorboard.
        logger (logging.Logger): to log general information about the training of the model.

    Raises:
        Exception: if the loss becomes NaN or infinite during training.
    """
    logger.info(f"Start training of tier {tier}/{hp.network.n_tiers}")

    # Setup the data ready to be consumed
    train_dataloader, test_dataloader, num_samples = get_dataloader(hp)

    # Setup tier
    # Calculate size of FREQ dimension for this tier
    tier_freq = tierutil.get_size_freqdim_of_tier(n_mels=hp.audio.mel_channels,
                                                  n_tiers=hp.network.n_tiers,
                                                  tier=tier)

    # The first tier is unconditional and uses its own architecture
    tier_class = Tier1 if tier == 1 else Tier
    model = tier_class(tier=tier,
                       n_layers=hp.network.layers[tier - 1],
                       hidden_size=hp.network.hidden_size,
                       gmm_size=hp.network.gmm_size,
                       freq=tier_freq)
    model = model.to(hp.device)
    model.train()

    # Setup loss criterion and optimizer
    criterion = GMMLoss()
    # model.parameters() is a one-shot iterator that the optimizer consumes, so it must not be
    # stored and reused later (e.g. for gradient clipping).
    optimizer = torch.optim.RMSprop(params=model.parameters(),
                                    lr=hp.training.lr,
                                    momentum=hp.training.momentum)

    # Check if training has to be resumed from previous checkpoint
    if args.checkpoint_path is not None:
        model, optimizer = resume_training(args, hp, tier, model, optimizer, logger)
    else:
        logger.info(
            f"Starting new training on dataset {hp.data.dataset} with configuration file "
            f"name {hp.name}")

    # Train the tier
    total_iterations = 0
    loss_logging = 0  # accumulated loss between logging iterations
    loss_save = 0  # accumulated loss between saving iterations
    prev_loss_onesample = 1e8  # used to compare between saving iterations and decide whether
    # or not to save the model
    gradients = []

    for epoch in range(hp.training.epochs):
        logger.info(f"Epoch: {epoch}/{hp.training.epochs} - Starting")
        for i, (waveform, utterance) in enumerate(train_dataloader):

            # 1.1 Transform waveform input to melspectrogram and apply preprocessing to normalize
            waveform = waveform.to(device=hp.device, non_blocking=True)
            spectrogram = transforms.wave_to_melspectrogram(waveform, hp)
            spectrogram = audio_normalizing.preprocessing(spectrogram, hp)
            # 1.2 Get input and output from the original spectrogram for this tier
            input_spectrogram, output_spectrogram = tierutil.split(
                spectrogram=spectrogram, tier=tier, n_tiers=hp.network.n_tiers)
            length_spectrogram = input_spectrogram.size(2)
            # if item is too long, we jump to the next one (it is not counted as an iteration)
            if length_spectrogram > 1000:
                continue

            # 2. Compute the model output
            if tier == 1:
                # generation is unconditional so there is only one input
                mu_hat, std_hat, pi_hat = model(spectrogram=input_spectrogram)
            else:
                # generation is conditional on the spectrogram generated by previous tiers
                mu_hat, std_hat, pi_hat = model(
                    spectrogram=output_spectrogram, spectrogram_prev_tier=input_spectrogram)

            # 3. Calculate the loss
            loss = criterion(mu=mu_hat, std=std_hat, pi=pi_hat, target=output_spectrogram)
            # Drop references that are no longer needed before the backward pass
            del spectrogram
            del mu_hat, std_hat, pi_hat

            # 3.1 Check if loss has exploded
            if torch.isnan(loss) or torch.isinf(loss):
                error_msg = f"Loss exploded at Epoch: {epoch}/{hp.training.epochs} - " \
                            f"Iteration: {i * hp.training.batch_size}/{num_samples}"
                logger.error(error_msg)
                raise Exception(error_msg)

            # 4. Compute gradients
            loss_cpu = loss.item()
            loss = loss / hp.training.accumulation_steps
            loss.backward()

            # 5. Perform backpropagation (using gradient accumulation so effective batch size is
            # the same as in the paper)
            # NOTE(review): the step interval is accumulation_steps / batch_size (true division,
            # possibly a non-integer) while the loss is scaled by 1 / accumulation_steps --
            # confirm this pairing is intended.
            if (total_iterations + 1) % (hp.training.accumulation_steps
                                         / hp.training.batch_size) == 0:
                gradients.append(gradient_norm(model))
                avg_gradient = sum(gradients) / len(gradients)
                logger.info(f"Gradient norm: {gradients[-1]} - "
                            f"Avg gradient: {avg_gradient}")
                # Ask the model for a fresh parameter iterator: reusing the one handed to the
                # optimizer would be exhausted and clipping would silently do nothing.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 2200)
                optimizer.step()
                model.zero_grad()

            # 6. Logging and saving model
            loss_oneframe = loss_cpu / (length_spectrogram * hp.training.batch_size)
            loss_logging += loss_oneframe  # accumulated loss between logging iterations
            loss_save += loss_oneframe  # accumulated loss between saving iterations

            # 6.1 Save model (if it is better than at the previous saving iteration)
            if (total_iterations + 1) % hp.training.save_iterations == 0:
                # Calculate average loss of one sample of a batch
                loss_onesample = loss_save / hp.training.save_iterations
                # if loss_onesample of these iterations is lower, the tier is better and we
                # save it
                if loss_onesample <= prev_loss_onesample:
                    path = f"{hp.training.dir_chkpt}/tier{tier}_{timestamp}_loss{loss_onesample:.2f}.pt"
                    torch.save(obj={
                        'dataset': hp.data.dataset,
                        'tier_idx': tier,
                        'hp': hp,
                        'epoch': epoch,
                        'iterations': i,
                        'total_iterations': total_iterations,
                        'tier': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, f=path)
                    logger.info(f"Model saved to: {path}")
                    prev_loss_onesample = loss_onesample
                loss_save = 0

            # 6.2 Logging
            if (total_iterations + 1) % hp.logging.log_iterations == 0:
                # Calculate average loss of one sample of a batch
                loss_onesample = loss_logging / hp.logging.log_iterations
                tensorboardwriter.log_training(hp, loss_onesample, total_iterations)
                logger.info(
                    f"Epoch: {epoch}/{hp.training.epochs} - "
                    f"Iteration: {i * hp.training.batch_size}/{num_samples} - "
                    f"Loss: {loss_onesample:.4f}")
                loss_logging = 0

            # 6.3 Evaluate
            if (total_iterations + 1) % hp.training.evaluation_iterations == 0:
                evaluation(hp, tier, test_dataloader, model, criterion, logger)
            total_iterations += 1

    # After finishing training: save model, hyperparameters and total loss
    path = f"{hp.training.dir_chkpt}/tier{tier}_{timestamp}_epoch{epoch}_final.pt"
    # NOTE(review): the final checkpoint stores the evaluation loss under the 'iterations' key
    # (periodic checkpoints store the batch index there) -- kept as is, confirm against the
    # code that loads checkpoints.
    final_evaluation_loss = evaluation(hp, tier, test_dataloader, model, criterion, logger)
    torch.save(obj={
        'dataset': hp.data.dataset,
        'tier_idx': tier,
        'hp': hp,
        'epoch': epoch,
        'iterations': final_evaluation_loss,
        'total_iterations': total_iterations,
        'tier': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }, f=path)
    logger.info(f"Model saved to: {path}")
    tensorboardwriter.log_end_training(hp=hp, loss=-1)
    logger.info("Finished training")
def sample(self, hp: HParams, synthesisp: HParams, timestamp: str, logger: logging.Logger,
           n_samples: int, length: int) -> torch.Tensor:
    """
    Generates n_samples of audio of the given length.

    The output of the first tier is not generated: it is taken from a spectrogram of the dataset
    (the item with a specific, hard-coded utterance). The remaining tiers are generated
    autoregressively on top of it. The spectrogram produced at each tier is saved to
    synthesisp.output_path.

    Args:
        hp (HParams): parameters. Parameters needed are hp.device
        synthesisp (HParams): parameters for performing the synthesis. Parameters needed are
                              synthesisp.output_path to save the spectrogram generated at each
                              tier.
        timestamp (str): information that identifies completely this run (synthesis).
        logger (logging.Logger): to log general information about the synthesis.
        n_samples (int): amount of samples to generate.
        length (int): length of the samples to generate (in timesteps). Only used for the initial
                      sanity check: it is then replaced by the number of frames of the
                      spectrogram loaded from the dataset.

    Returns:
        spectrograms (torch.Tensor): samples of audio in spectrogram representation.
                                     Shape: [B=n_samples, FREQ=self.freq, FRAMES=length].
                                     None is returned if the reference utterance is not found in
                                     the dataset.
    """
    assert length >= 2 ** (self.n_tiers / 2), "Length is too short for being generated with the " \
                                              "number of tiers of this model."

    # Load a spectrogram from the dataset (imports are kept local to this method, as in the
    # original code)
    from src.utils.training_batch import get_dataloader
    from src.dataprocessing import transforms as T
    from src.dataprocessing.audio_normalizing import preprocessing

    dataloader, _, _ = get_dataloader(hp)
    wave = None
    for waveform, utterance in dataloader:
        if "building" in utterance[0]:
            logger.info(utterance[0])
        if utterance[0] == "One building, Market Hall, was unavailable for November 22.":
            wave = waveform
            break

    if wave is None:
        logger.info("wave not found")
        return None

    # Synthesis never backpropagates. Without no_grad() the in-place writes into temp_x would
    # chain every forward pass into one autograd graph that keeps growing with each pixel.
    with torch.no_grad():
        waveform = wave.to(device=hp.device, non_blocking=True)
        spectrogram = T.wave_to_melspectrogram(waveform, hp)
        spectrogram = preprocessing(spectrogram, hp)
        # Split the spectrogram to get the spectrogram that would be the output of the first tier
        input_spectrogram, output_spectrogram = tierutil.split(
            spectrogram=spectrogram, tier=1, n_tiers=hp.network.n_tiers)
        # Use the spectrogram from the dataset as output from first tier
        x = output_spectrogram
        length = spectrogram.size(2)

        # Save spectrogram used as the output of tier1
        torch.save(x, f"{synthesisp.output_path}/{timestamp}_tier1.pt")

        # --- TIER >1 ---
        for tier_idx in range(2, self.n_tiers + 1):
            # The spectrogram is generated autoregressively, frame (length, or timestep) by
            # frame.
            logger.info(f"Starting Tier {tier_idx}/{self.n_tiers}")

            freq_of_tierX = tierutil.get_size_freqdim_of_tier(
                n_mels=self.freq, n_tiers=self.n_tiers, tier=tier_idx)
            length_of_tierX = tierutil.get_size_timedim_of_tier(
                timesteps=length, n_tiers=self.n_tiers, tier=tier_idx)
            logger.info(f"Shape of original spectrogram: {spectrogram.size()}")
            logger.info(f"Shape of spectrogram_prev_tier (x): {x.size()}")
            logger.info(f"Freq_of_tierX: {freq_of_tierX}")
            logger.info(f"Length_of_tierX: {length_of_tierX}")
            # Never generate more frames than the conditioning spectrogram provides
            length_of_tierX = min(length_of_tierX, x.size(2))
            x = x[:, :, :length_of_tierX]

            temp_x = None  # temporary spectrogram that will be generated by this tier
            for i in range(length_of_tierX):
                logger.info(f"Tier {tier_idx}/{self.n_tiers} - Frame {i}/{length_of_tierX}")

                # Every step starts from a new frame of all zeros which will be replaced pixel by
                # pixel by the new values.
                # NOTE(review): the batch size of this frame is n_samples while x comes from a
                # dataset batch -- presumably both must match; confirm with the caller.
                new_frame = torch.zeros((n_samples, freq_of_tierX, 1), device=hp.device)
                if temp_x is None:
                    # First frame of this tier
                    temp_x = new_frame
                else:
                    # We change shape from [B, FREQ, FRAMES] to [B, FREQ, FRAMES+1] by adding
                    # the new frame
                    temp_x = torch.cat([temp_x, new_frame], dim=-1)

                # Inside a frame, the spectrogram is generated autoregressively, freq by freq
                for j in range(freq_of_tierX):
                    # we generate the parameters for all the spectrogram (across all samples)
                    mu_hat, std_hat, pi_hat = self.tiers[tier_idx - 1](temp_x, x)
                    # with the parameters we generate the values of the next spectrogram
                    # (across all samples)
                    new_spectrogram = sample_gmm_batch(mu_hat, std_hat, pi_hat)
                    # but only use the value of the new pixel that we are generating
                    # (across all samples) since the spectrogram is generated autoregressively
                    temp_x[:, j, i] = new_spectrogram[:, j, i]

            # After generating the spectrogram of this tier, we interleave it to put it together
            # with the spectrogram generated by previous tiers. In the next iteration, this will
            # be the input to condition the next tier
            x = tierutil.interleave(temp_x, x, tier_idx)
            x = x.to(hp.device)

            # Save the spectrogram generated at this tier and the interleaved result so far
            torch.save(temp_x, f"{synthesisp.output_path}/{timestamp}_tier{tier_idx}.pt")
            torch.save(x, f"{synthesisp.output_path}/{timestamp}_tier1-tier{tier_idx}.pt")

    return x