def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write a training checkpoint to ``filepath``.

    The weights of ``model`` are copied into a freshly constructed
    ``Flowtron`` (built from the module-level ``model_config``) and that copy
    is pickled as a whole object, together with the optimizer state dict, the
    iteration counter and the learning rate.

    Args:
        model: Module whose ``state_dict()`` is snapshotted.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        learning_rate: Value stored under the ``'learning_rate'`` key.
        iteration: Value stored under the ``'iteration'`` key.
        filepath: Destination passed to ``torch.save``.
    """
    message = "Saving model and optimizer state at iteration {} to {}"
    print(message.format(iteration, filepath))

    # Copy the weights into a clean instance; it is this copy that is pickled.
    snapshot = Flowtron(**model_config).cuda()
    snapshot.load_state_dict(model.state_dict())

    checkpoint = {
        'model': snapshot,
        'iteration': iteration,
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    # Legacy (non-zipfile) serialization is requested explicitly.
    torch.save(checkpoint, filepath, _use_new_zipfile_serialization=False)
def infer(flowtron_path, waveglow_path, output_dir, text, speaker_id, n_frames,
          sigma, gate_threshold, seed):
    """Synthesize ``text`` with Flowtron + WaveGlow; save plots and a wav file.

    For every attention layer returned by ``model.infer`` a PNG containing the
    mel-spectrogram and that layer's attention map is written to
    ``output_dir``. The vocoded, peak-normalized audio is written there too.

    Args:
        flowtron_path: Checkpoint containing a ``'state_dict'`` entry.
        waveglow_path: Checkpoint containing a pickled ``'model'`` entry.
        output_dir: Existing directory that receives the PNG and wav files.
        text: Input passed to ``trainset.get_text``.
        speaker_id: Speaker passed to ``trainset.get_speaker_id``.
        n_frames: Length of the sampled residual along its last axis.
        sigma: Scale applied to the sampled residual.
        gate_threshold: Forwarded to ``model.infer``.
        seed: Seed for the CPU and CUDA random generators.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # load waveglow: half precision overall, but the invertible convolutions
    # are cast back to float32.
    waveglow = torch.load(waveglow_path)['model'].cuda().half()
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    waveglow.eval()

    # load flowtron
    model = Flowtron(**model_config).cuda()
    state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict']
    model.load_state_dict(state_dict)
    model.eval()
    print("Loaded checkpoint '{}'".format(flowtron_path))

    # The dataset object is only used for its text / speaker-id encoders.
    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **{k: v for k, v in data_config.items() if k not in ignore_keys})
    # Indexing with None prepends a batch axis of size 1.
    speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()[None]
    text = trainset.get_text(text).cuda()[None]

    with torch.no_grad():
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        mels, attentions = model.infer(residual, speaker_vecs, text,
                                       gate_threshold=gate_threshold)

    if attentions:
        # The mel image is the same for every attention layer; convert once.
        mel_image = mels[0].cpu().numpy()
    for k, layer_attention in enumerate(attentions):
        attention = torch.cat(layer_attention).cpu().numpy()
        fig, axes = plt.subplots(1, 2, figsize=(16, 4))
        # 'lower' is the documented spelling; current matplotlib rejects
        # the legacy 'bottom' value.
        axes[0].imshow(mel_image, origin='lower', aspect='auto')
        axes[1].imshow(attention[:, 0].transpose(), origin='lower',
                       aspect='auto')
        fig.savefig(
            os.path.join(
                output_dir,
                'sid{}_sigma{}_attnlayer{}.png'.format(speaker_id, sigma, k)))
        plt.close("all")

    with torch.no_grad():
        audio = waveglow.infer(mels.half(), sigma=0.8).float()

    audio = audio.cpu().numpy()[0]
    # normalize audio for now; skip when the signal is all zeros to avoid
    # dividing by zero and writing NaNs.
    peak = np.abs(audio).max()
    if peak > 0:
        audio = audio / peak
    print(audio.shape)

    write(
        os.path.join(output_dir,
                     'sid{}_sigma{}.wav'.format(speaker_id, sigma)),
        data_config['sampling_rate'], audio)
def __init__(self):
    """Load the config, Flowtron, WaveGlow and the text/speaker dataset.

    Paths are resolved relative to the current working directory. The model
    file names are read from ``self.models`` and ``self.waveglow``, which
    must already exist as attributes (presumably class-level dicts; confirm
    against the enclosing class) before this runs.
    """
    self.config_path = 'flowtron/config.json'
    self.models_path = os.getcwd() + '/models/'
    self.training_files_path = os.getcwd() + '/filelists/dataset_train.txt'

    with open(self.config_path) as f:
        self.config = json.load(f)
    # The speaker count is overridden regardless of what the config says.
    self.config['model_config']['n_speakers'] = 41

    self.lambd = 0.001
    self.sigma = 0.85
    self.waveglow_sigma = 1
    self.n_frames = 1800
    self.aggregation_type = 'batch'

    self.model = Flowtron(**self.config['model_config']).cuda()

    flowtron_path = self.models_path + self.models['flowtron']
    # Read the file name before self.waveglow is rebound to the model below.
    waveglow_path = self.models_path + self.waveglow['default']

    # Deserialize the checkpoint once; it either carries a raw state dict or
    # a pickled model object.
    checkpoint = torch.load(flowtron_path, map_location='cpu')
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint['model'].state_dict()
    # strict=False: missing / unexpected keys are tolerated.
    self.model.load_state_dict(state_dict, strict=False)
    self.model.eval()

    self.waveglow = torch.load(waveglow_path)['model']
    self.waveglow.cuda().eval()

    # Fixed residual sampled once and kept on the instance.
    self.z_baseline = torch.cuda.FloatTensor(
        1, 80, self.n_frames).normal_() * self.sigma

    ignore_keys = ['training_files', 'validation_files']
    self.trainset = Data(
        self.training_files_path,
        **{k: v for k, v in self.config['data_config'].items()
           if k not in ignore_keys})
def setup():
    """Populate the module-level globals used for inference.

    Reads ``flowtron/infer.json`` and binds ``config``, ``data_config`` and
    ``model_config``; loads the encoder weights; seeds the CPU and CUDA
    generators with 1234; then loads WaveGlow and Flowtron from fixed paths
    and builds the dataset object, binding them to ``waveglow``, ``flowtron``
    and ``trainset``.
    """
    global config, data_config, model_config
    global flowtron, waveglow, trainset

    # Parse configs. Globals nicer in this case
    with open("flowtron/infer.json") as config_file:
        config = json.load(config_file)
    data_config = config["data_config"]
    model_config = config["model_config"]

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    encoder.load_model(encoder_weights)

    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    # Load waveglow: half precision overall, with the invertible convolutions
    # cast back to float32.
    waveglow_checkpoint = torch.load(
        "flowtron/tacotron2/waveglow/saved_models/waveglow_256channels_universal_v5.pt")
    waveglow = waveglow_checkpoint['model'].cuda().eval()
    waveglow.cuda().half()
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    waveglow.eval()

    # Load flowtron: the checkpoint stores a pickled model, from which only
    # the weights are taken.
    flowtron = Flowtron(**model_config).cuda()
    flowtron_checkpoint = torch.load("flowtron/saved_models/pretrained.pt",
                                     map_location='cpu')
    flowtron.load_state_dict(flowtron_checkpoint['model'].state_dict())
    flowtron.eval()

    # Every data_config entry except the file lists is forwarded to Data.
    excluded = ['training_files', 'validation_files']
    dataset_kwargs = dict(
        (key, value) for key, value in data_config.items()
        if key not in excluded)
    trainset = Data(data_config['training_files'], **dataset_kwargs)
def load_models(flowtron_path, waveglow_path):
    """Load Flowtron and WaveGlow onto the GPU in eval mode.

    The Flowtron checkpoint may hold either a ``'state_dict'`` entry (weights
    for a model built from the module-level ``model_config``) or a pickled
    ``'model'`` object. The file is deserialized once and the format is
    chosen by key lookup.

    Args:
        flowtron_path: Path to the Flowtron checkpoint.
        waveglow_path: Path to a checkpoint holding a pickled ``'model'``.

    Returns:
        Tuple ``(model, waveglow)``.

    Raises:
        KeyError: If the Flowtron checkpoint has neither ``'state_dict'`` nor
            ``'model'``.
    """
    # load waveglow (the invertible convolutions are cast to float32)
    waveglow = torch.load(waveglow_path)['model'].cuda()
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    waveglow.eval()

    # load flowtron
    checkpoint = torch.load(flowtron_path, map_location='cpu')
    if 'state_dict' in checkpoint:
        model = Flowtron(**model_config).cuda()
        model.load_state_dict(checkpoint['state_dict'])
    else:
        # Pickled-model checkpoints were deserialized on the CPU above, so
        # move the model to the GPU explicitly.
        model = checkpoint['model'].cuda()
    model.eval()

    print("Loaded model '{}'".format(flowtron_path))
    return model, waveglow
def train(n_gpus, rank, output_directory, epochs, optim_algo, learning_rate,
          weight_decay, sigma, iters_per_checkpoint, batch_size, seed,
          checkpoint_path, ignore_layers, include_layers, finetune_layers,
          warmstart_checkpoint_path, with_tensorboard, grad_clip_val,
          fp16_run, tensorboard_path=None):
    """Train Flowtron with optional mixed precision and multi-GPU reduction.

    Args:
        n_gpus: Number of processes; values above 1 enable distributed init,
            the gradient all-reduce wrapper and loss reduction.
        rank: Process rank; only rank 0 prints, logs and saves checkpoints.
        output_directory: Where checkpoints (and default logs) are written.
        epochs: Upper bound (exclusive) of the epoch range.
        optim_algo: ``'Adam'`` or ``'RAdam'``; anything else exits with 1.
        learning_rate: Learning rate; forced onto every param group even
            after a checkpoint restore.
        weight_decay: Passed to the optimizer.
        sigma: Passed to ``FlowtronLoss``.
        iters_per_checkpoint: Validate (and, on rank 0, checkpoint) whenever
            ``iteration`` is a multiple of this value.
        batch_size: Passed to the dataloader and validation helpers.
        seed: Seed for the CPU and CUDA generators.
        checkpoint_path: If non-empty, resume model/optimizer/iteration.
        ignore_layers: Forwarded to ``load_checkpoint``.
        include_layers: Not used in this function body.
        finetune_layers: If non-empty, only parameters whose names are listed
            keep ``requires_grad=True``; all others are frozen.
        warmstart_checkpoint_path: If non-empty, passed to ``warmstart``.
        with_tensorboard: Enable ``FlowtronLogger`` on rank 0.
        grad_clip_val: If > 0, clip the gradient norm to this value.
        fp16_run: Truthy to enable autocast and gradient scaling.
        tensorboard_path: Log directory; defaults to
            ``<output_directory>/logs/run1`` when ``None``.
    """
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

    # When fine-tuning, freeze every parameter not explicitly listed.
    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    # With enabled=False the scaler calls below reduce to plain
    # backward/step, so one code path serves both precisions.
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("Output directory", output_directory)

    # `logger` only exists on rank 0 with tensorboard enabled; every later
    # use is guarded by the same condition.
    if with_tensorboard and rank == 0:
        tboard_out_path = tensorboard_path
        if tensorboard_path is None:
            tboard_out_path = os.path.join(output_directory, "logs/run1")
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    # Resume at the epoch implied by the restored iteration count.
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None

            # Forward pass and loss run under autocast when fp16 is enabled.
            with amp.autocast(enabled=fp16_run):
                z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                    mel, speaker_vecs, text, in_lens, out_lens, attn_prior)

                loss_nll, loss_gate = criterion(
                    (z, log_s_list, gate_pred, mean, log_var, prob),
                    gate_target, out_lens)
                loss = loss_nll + loss_gate

            # Scalars for logging only; reduced across processes when
            # running on several GPUs.
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(loss_gate.data, n_gpus).item()
                reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = loss_gate.item()
                reduced_nll_loss = loss_nll.item()

            scaler.scale(loss).backward()
            if grad_clip_val > 0:
                # Unscale first so the clip threshold applies to the real
                # gradient magnitudes, not the loss-scaled ones.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               grad_clip_val)

            scaler.step(optimizer)
            scaler.update()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('training_loss_gate', reduced_gate_loss,
                                  iteration)
                logger.add_scalar('training_loss_nll', reduced_nll_loss,
                                  iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            # Validation runs on every process; printing, logging and
            # checkpointing happen on rank 0 only.
            if iteration % iters_per_checkpoint == 0:
                val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f} ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, val_loss_nll,
                                              val_loss_gate, attns,
                                              gate_pred, gate_target,
                                              iteration)

                    # NOTE: rebinds the `checkpoint_path` parameter, which is
                    # no longer needed once resuming is done.
                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path,
          ignore_layers, include_layers, warmstart_checkpoint_path,
          with_tensorboard, fp16_run):
    """Train Flowtron with Adam, optionally using apex mixed precision.

    Args:
        n_gpus: Number of processes; values above 1 enable distributed init,
            the gradient all-reduce wrapper and loss reduction.
        rank: Process rank; only rank 0 prints, logs and saves checkpoints.
        output_directory: Where checkpoints and the ``logs`` folder go.
        epochs: Upper bound (exclusive) of the epoch range.
        learning_rate: Adam learning rate (also logged each iteration).
        weight_decay: Passed to Adam.
        sigma: Passed to ``FlowtronLoss``.
        iters_per_checkpoint: Validate (and, on rank 0, checkpoint) whenever
            ``iteration`` is a multiple of this value.
        batch_size: Passed to the dataloader and validation helpers.
        seed: Seed for the CPU and CUDA generators.
        checkpoint_path: If non-empty, resume model/optimizer/iteration.
        ignore_layers: Forwarded to ``load_checkpoint``.
        include_layers: Not used in this function body.
        warmstart_checkpoint_path: If non-empty, passed to ``warmstart``.
        with_tensorboard: Enable ``FlowtronLogger`` on rank 0.
        fp16_run: Truthy to initialize apex ``amp`` at opt level ``O1``.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             model_config['use_gate_layer'])
    model = Flowtron(**model_config).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)

    # apex is imported lazily so it is only required for fp16 runs; `amp` is
    # therefore a local name that exists only when fp16_run is truthy.
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    # `logger` only exists on rank 0 with tensorboard enabled; every later
    # use is guarded by the same condition.
    if with_tensorboard and rank == 0:
        logger = FlowtronLogger(os.path.join(output_directory, 'logs'))

    model.train()
    # Resume at the epoch implied by the restored iteration count.
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()

            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)
            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            # Scalar for logging only; reduced across processes when running
            # on several GPUs.
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            # In fp16 mode the backward pass goes through apex's loss scaler.
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            # Validation runs on every process; printing, logging and
            # checkpointing happen on rank 0 only.
            if (iteration % iters_per_checkpoint == 0):
                val_loss, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f} ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, attns, gate_pred,
                                              gate_target, iteration)

                    # NOTE: rebinds the `checkpoint_path` parameter, which is
                    # no longer needed once resuming is done.
                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
def infer(flowtron_path, waveglow_path, text, speaker_id, n_frames, sigma,
          seed):
    """Synthesize ``text`` once while timing and profiling each stage.

    Runs a warm-up Flowtron pass, then a profiled pass (NVTX ranges emitted,
    bracketed by ``profiler.start()`` / ``profiler.stop()``), vocodes with
    WaveGlow, prints a timing report, and writes the attention plots and the
    wav file into the current working directory.

    Args:
        flowtron_path: Checkpoint with either a pickled ``'model'`` or a
            ``'state_dict'`` entry.
        waveglow_path: Checkpoint containing a pickled ``'model'`` entry.
        text: Input string; also used for the char/word counts in the report.
        speaker_id: Speaker passed to ``trainset.get_speaker_id``.
        n_frames: Length of the sampled residual along its last axis.
        sigma: Scale applied to the sampled residual.
        seed: Seed for the CPU and CUDA random generators.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # load waveglow: half precision overall, but the invertible convolutions
    # are cast back to float32.
    waveglow = torch.load(waveglow_path)['model'].cuda().half()
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    waveglow.eval()

    # load flowtron
    model = Flowtron(**model_config).cuda()
    cpt_dict = torch.load(flowtron_path)
    if 'model' in cpt_dict:
        dummy_dict = cpt_dict['model'].state_dict()
    else:
        dummy_dict = cpt_dict['state_dict']
    model.load_state_dict(dummy_dict)
    model.eval()
    print("Loaded checkpoint '{}'".format(flowtron_path))

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **{k: v for k, v in data_config.items() if k not in ignore_keys})

    # ---- text / speaker preparation (timed) ----
    tic_prep = time.time()
    str_text = text
    num_char = len(str_text)
    num_word = len(str_text.split())
    # Indexing with None prepends a batch axis of size 1.
    speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()[None]
    text = trainset.get_text(text).cuda()[None]
    toc_prep = time.time()

    # ---- warm up, so the measured pass excludes one-time start-up cost ----
    with torch.no_grad():
        tic_warmup = time.time()
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        mels, attentions = model.infer(residual, speaker_vecs, text)
        toc_warmup = time.time()

    # ---- measured + profiled flowtron pass ----
    with torch.no_grad(), torch.autograd.profiler.emit_nvtx():
        tic_residual = time.time()
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        toc_residual = time.time()
        profiler.start()
        mels, attentions = model.infer(residual, speaker_vecs, text)
        profiler.stop()
        toc_flowtron = time.time()

    if attentions:
        # The mel image is the same for every attention layer; convert once.
        mel_image = mels[0].cpu().numpy()
    for k, layer_attention in enumerate(attentions):
        attention = torch.cat(layer_attention).cpu().numpy()
        fig, axes = plt.subplots(1, 2, figsize=(16, 4))
        # 'lower' is the documented spelling; current matplotlib rejects
        # the legacy 'bottom' value.
        axes[0].imshow(mel_image, origin='lower', aspect='auto')
        axes[1].imshow(attention[:, 0].transpose(), origin='lower',
                       aspect='auto')
        fig.savefig('sid{}_sigma{}_attnlayer{}.png'.format(
            speaker_id, sigma, k))
        plt.close("all")

    # ---- vocoder (timed); no autograd graph is needed for inference ----
    tic_waveglow = time.time()
    with torch.no_grad():
        audio = waveglow.infer(mels.half(), sigma=0.8).float()
    toc_waveglow = time.time()

    audio = audio.cpu().numpy()[0]
    # normalize audio for now; skip when the signal is all zeros to avoid
    # dividing by zero and writing NaNs.
    peak = np.abs(audio).max()
    if peak > 0:
        audio = audio / peak

    # ---- report ----
    # Use the configured rate so the reported duration matches the wav that
    # is written below (previously a hard-coded 22050).
    sampling_rate = data_config['sampling_rate']
    # Falls back to the previously hard-coded 256 when the config has no
    # 'hop_length' entry.
    hop_length = data_config.get('hop_length', 256)
    len_audio = len(audio)
    dur_audio = len_audio / sampling_rate
    num_frames = int(len_audio / hop_length)

    dur_prep = toc_prep - tic_prep
    dur_residual = toc_residual - tic_residual
    dur_flowtron_in = toc_flowtron - toc_residual
    dur_warmup = toc_warmup - tic_warmup
    dur_flowtron_out = toc_flowtron - tic_residual
    dur_waveglow = toc_waveglow - tic_waveglow
    dur_total = dur_prep + dur_flowtron_out + dur_waveglow

    # Audio seconds produced per wall-clock second (higher is faster).
    RTF = dur_audio / dur_total

    str_text = "\n text : " + str_text
    str_num = "\n text {:d} char {:d} words ".format(num_char, num_word)
    str_audio = "\n generated audio : {:2.3f} samples {:2.3f} sec with {:d} mel frames ".format(
        len_audio, dur_audio, num_frames)
    str_perf = "\n total time {:2.3f} = text prep {:2.3f} + flowtron{:2.3f} + wg {:2.3f} ".format(
        dur_total, dur_prep, dur_flowtron_out, dur_waveglow)
    str_flow = "\n total flowtron {:2.3f} = residual cal {:2.3f} + flowtron {:2.3f} ".format(
        dur_flowtron_out, dur_residual, dur_flowtron_in)
    str_rtf = "\n RTF is {:2.3f} x with warm up {:2.3f} ".format(
        RTF, dur_warmup)
    print(str_text, str_num, str_audio, str_perf, str_flow, str_rtf)

    write("sid{}_sigma{}.wav".format(speaker_id, sigma), sampling_rate, audio)
class AudioGeneratorFlowtron:
    """Text-to-audio generator built from a Flowtron model and WaveGlow.

    All models and the text/speaker dataset are loaded in ``__init__`` from
    paths relative to the current working directory; ``generate`` then maps
    ``(text, speaker)`` to a NumPy waveform.
    """

    # File names (inside ``<cwd>/models/``) of the Flowtron checkpoints.
    models = {
        'flowtron': 'flowtron_model.pt',
    }
    # File names of the WaveGlow checkpoints. On instances this name is
    # rebound to the loaded WaveGlow model at the end of ``__init__``.
    waveglow = {'default': 'waveglow_256channels_universal_v5.pt'}

    def __init__(self):
        """Load the config, Flowtron, WaveGlow and the dataset helper."""
        self.config_path = 'flowtron/config.json'
        self.models_path = os.getcwd() + '/models/'
        self.training_files_path = os.getcwd() + '/filelists/dataset_train.txt'

        with open(self.config_path) as f:
            self.config = json.load(f)
        # The speaker count is overridden regardless of what the config says.
        self.config['model_config']['n_speakers'] = 41

        self.lambd = 0.001
        self.sigma = 0.85
        self.waveglow_sigma = 1
        self.n_frames = 1800
        self.aggregation_type = 'batch'

        self.model = Flowtron(**self.config['model_config']).cuda()

        flowtron_path = self.models_path + self.models['flowtron']
        # Read the file name before self.waveglow is rebound to the model.
        waveglow_path = self.models_path + self.waveglow['default']

        # Deserialize the checkpoint once; it either carries a raw state dict
        # or a pickled model object.
        checkpoint = torch.load(flowtron_path, map_location='cpu')
        if 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint['model'].state_dict()
        # strict=False: missing / unexpected keys are tolerated.
        self.model.load_state_dict(state_dict, strict=False)
        self.model.eval()

        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda().eval()

        # Fixed residual sampled once, so repeated generate() calls start
        # from the same latent.
        self.z_baseline = torch.cuda.FloatTensor(
            1, 80, self.n_frames).normal_() * self.sigma

        self.trainset = Data(self.training_files_path,
                             **self._dataset_kwargs())

    def _dataset_kwargs(self):
        """Return the ``data_config`` entries to forward to ``Data``."""
        ignore_keys = ('training_files', 'validation_files')
        return {k: v for k, v in self.config['data_config'].items()
                if k not in ignore_keys}

    def generate(self, text: str, speaker: int):
        """Synthesize ``text`` for ``speaker`` and return it as a NumPy array.

        Args:
            text: Input passed to ``trainset.get_text``.
            speaker: Speaker passed to ``trainset.get_speaker_id``.

        Returns:
            The first item of the WaveGlow output batch, moved to the CPU.
        """
        # Indexing with None prepends a batch axis of size 1.
        speaker_vecs = self.trainset.get_speaker_id(speaker).cuda()[None]
        encoded_text = self.trainset.get_text(text).cuda()[None]

        with torch.no_grad():
            # model.infer returns a sequence; only its first element (the
            # mel output) is used here.
            mel_baseline = self.model.infer(
                self.z_baseline, speaker_vecs, encoded_text)[0]
            audio_base = self.waveglow.infer(mel_baseline,
                                             sigma=self.waveglow_sigma)

        return audio_base[0].detach().cpu().numpy()

    def prepare_dataset(self, dataset_path):
        """Build a ``Data`` object for ``dataset_path`` using this config."""
        return Data(dataset_path, **self._dataset_kwargs())