def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write a training checkpoint to ``filepath``.

    The weights of ``model`` are copied into a freshly constructed
    ``WaveGlow`` instance placed on the CPU. That module object itself (not
    only its state dict) is stored under ``'model'``, next to the iteration
    counter, the optimizer state dict and the learning rate.
    """
    message = "Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath)
    print(message)

    # Copy the current weights into a fresh CPU instance; that copy is what
    # gets serialized.
    cpu_copy = WaveGlow(**waveglow_config).cpu()
    cpu_copy.load_state_dict(model.state_dict())

    payload = {}
    payload['model'] = cpu_copy
    payload['iteration'] = iteration
    payload['optimizer'] = optimizer.state_dict()
    payload['learning_rate'] = learning_rate
    torch.save(payload, filepath)
def save_checkpoint(model, optimizer, epoch, filepath):
    """Write an epoch-indexed training checkpoint to ``filepath``.

    A new ``WaveGlow`` instance is created on the GPU, filled with the
    weights of ``model`` and stored as a whole module under ``'model'``,
    together with ``epoch`` and the optimizer state dict.
    """
    print(f'Saving model and optimizer state at epoch {epoch} to {filepath}')

    # Fresh GPU instance that receives a copy of the current weights.
    snapshot = WaveGlow(**waveglow_config).cuda()
    snapshot.load_state_dict(model.state_dict())

    payload = {}
    payload['model'] = snapshot
    payload['epoch'] = epoch
    payload['optimizer'] = optimizer.state_dict()
    torch.save(payload, filepath)
def __init__(self, tacotron2_path, waveglow_path, **kwargs):
    """Load a Tacotron2 model and, optionally, a WaveGlow vocoder.

    Args:
        tacotron2_path: Checkpoint read with ``torch.load``; its
            ``"state_dict"`` entry is loaded into ``Tacotron2``.
        waveglow_path: WaveGlow checkpoint. When falsy no vocoder is loaded
            and ``synth_speech`` wraps ``_synth_speech_fast`` instead of
            ``_synth_speech``.
        **kwargs: Forwarded to ``HParams``.
    """
    super(TTSModel, self).__init__()
    hparams = HParams(**kwargs)
    self.hparams = hparams

    use_cuda = torch.cuda.is_available()
    # ``None`` is torch.load's default; without CUDA everything is remapped
    # to the CPU.
    map_location = None if use_cuda else "cpu"

    self.model = Tacotron2(hparams)
    self.model.load_state_dict(
        torch.load(tacotron2_path, map_location=map_location)["state_dict"])
    if use_cuda:
        self.model.cuda()
    self.model.eval()

    self.k_cache = klepto.archives.file_archive(cached=False)
    if waveglow_path:
        wave_params = torch.load(waveglow_path, map_location=map_location)
        try:
            # First try the checkpoint as a plain state dict.
            self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
            self.waveglow.load_state_dict(wave_params)
        except Exception:
            # Fall back to a checkpoint that stores the whole module under
            # "model". ``Exception`` (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            # NOTE(review): remove_weightnorm is applied on this fallback
            # path only -- confirm this matches the intended behaviour.
            self.waveglow = wave_params["model"]
            self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
        if use_cuda:
            self.waveglow.cuda()
        self.waveglow.eval()
        # workaround from
        # https://github.com/NVIDIA/waveglow/issues/127
        for m in self.waveglow.modules():
            if "Conv" in str(type(m)):
                setattr(m, "padding_mode", "zeros")
        for k in self.waveglow.convinv:
            k.float().half()
        self.denoiser = Denoiser(self.waveglow,
                                 n_mel_channels=hparams.n_mel_channels)
        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
            self._synth_speech)
    else:
        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
            self._synth_speech_fast)

    self.taco_stft = TacotronSTFT(
        hparams.filter_length,
        hparams.hop_length,
        hparams.win_length,
        n_mel_channels=hparams.n_mel_channels,
        sampling_rate=hparams.sampling_rate,
        mel_fmax=4000,
    )
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write a training checkpoint to ``filepath``.

    The weights of ``model`` are copied into a new ``WaveGlow`` instance on
    the GPU; that module object is saved under ``"model"`` along with the
    iteration counter, the optimizer state dict and the learning rate.
    """
    banner = "Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath)
    print(banner)

    # Fresh GPU instance that receives a copy of the current weights.
    snapshot = WaveGlow(**waveglow_config).cuda()
    snapshot.load_state_dict(model.state_dict())

    payload = {}
    payload["model"] = snapshot
    payload["iteration"] = iteration
    payload["optimizer"] = optimizer.state_dict()
    payload["learning_rate"] = learning_rate
    torch.save(payload, filepath)
def save_checkpoint(model, optimizer, amp, iteration, filepath):
    """Write a resumable training checkpoint to ``filepath``.

    Besides a GPU ``WaveGlow`` copy of ``model``, the iteration counter and
    the optimizer state dict, the checkpoint records the CUDA RNG state of
    all devices and the ``torch.random`` RNG state. When ``amp`` is not
    ``None`` its state dict is stored under ``'amp'`` as well.
    """
    banner = "Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath)
    print(banner)

    # Fresh GPU instance that receives a copy of the current weights.
    snapshot = WaveGlow(**waveglow_config).cuda()
    snapshot.load_state_dict(model.state_dict())

    state = {}
    state['model'] = snapshot
    state['iteration'] = iteration
    state['optimizer'] = optimizer.state_dict()
    state['cuda_rng_state_all'] = torch.cuda.get_rng_state_all()
    state['random_rng_state'] = torch.random.get_rng_state()
    if amp is not None:
        state['amp'] = amp.state_dict()

    torch.save(state, filepath)
def main():
    """Restore a trained WaveGlow graph and synthesize one waveform.

    Reads the local-condition features named by the command-line arguments,
    builds the inference graph, restores the trainable variables from
    ``args.restore_from`` and writes the generated audio to
    ``args.wave_name``.
    """
    args = get_arguments()

    lc = read_binary_lc(args.lc, hparams.num_mels)
    if not (hparams.lc_encode or hparams.transposed_upsampling):
        # upsampling local condition
        lc = np.tile(lc, [1, 1, hparams.upsampling_rate])
    lc = np.reshape(lc, [1, -1, hparams.num_mels])
    print(lc.shape)

    glow = WaveGlow(lc_dim=hparams.num_mels,
                    n_flows=hparams.n_flows,
                    n_group=hparams.n_group,
                    n_early_every=hparams.n_early_every,
                    n_early_size=hparams.n_early_size)

    lc_placeholder = tf.placeholder(tf.float32,
                                    shape=[1, None, hparams.num_mels],
                                    name='lc')
    audio = glow.infer(lc_placeholder, sigma=args.sigma)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    # The context manager closes the session (and frees its resources) even
    # when restoring or running the graph raises.
    with tf.Session(config=session_config) as sess:
        print("restore model")
        saver = tf.train.Saver(var_list=tf.trainable_variables())
        saver.restore(sess, args.restore_from)
        print('restore model successfully!')

        audio_output = sess.run(audio, feed_dict={lc_placeholder: lc})

    audio_output = audio_output.flatten()
    print(audio_output)
    write_wav(audio_output, hparams.sample_rate, args.wave_name)
def test(sigma, batch_size, seed, checkpoint_path):
    """Print the mean WaveGlow loss over the configured testing files.

    Args:
        sigma: Passed to ``WaveGlowLoss``.
        batch_size: Batch size of the test loader (incomplete final batches
            are dropped).
        seed: Seed for the CPU and CUDA torch RNGs.
        checkpoint_path: Checkpoint handed to ``load_checkpoint``.

    Raises:
        ValueError: If the loader yields no batch, e.g. because the test set
            holds fewer than ``batch_size`` items.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda().eval()

    model, iteration = load_checkpoint(checkpoint_path, model)
    model.eval()

    testset = Mel2Samp(data_config['testing_files'],
                       data_config['segment_length'],
                       data_config['filter_length'],
                       data_config['hop_length'],
                       data_config['win_length'],
                       data_config['sampling_rate'],
                       data_config['mel_fmin'],
                       data_config['mel_fmax'])
    test_loader = DataLoader(testset, num_workers=1, shuffle=False,
                             sampler=None,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for mel, audio in test_loader:
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            total_loss += criterion(outputs).item()
            num_batches += 1
    model.train()

    # Count batches explicitly: with an empty loader the previous loop index
    # was never bound and the average failed with a NameError.
    if num_batches == 0:
        raise ValueError(
            "test loader produced no batches; the test set is smaller than "
            "batch_size={}".format(batch_size))

    val_loss = total_loss / num_batches
    print("test loss: {}:\t{:.9f}".format(iteration, val_loss))
def waveglow_infer(mel, config):
    """Vocode ``mel`` with WaveGlow and return a peak-normalised waveform.

    Args:
        mel: Mel input passed to ``waveglow.infer``.
        config: Object providing ``vocoder_path``, ``device``, ``sigma`` and
            ``denoising_strength``; it is also handed to the ``WaveGlow``
            and ``Denoiser`` constructors.

    Returns:
        The denoised waveform on the CPU, divided by its peak absolute
        value. A waveform whose peak is 0 is returned unscaled.
    """
    print(
        colored('Running WaveGlow with ', 'blue', attrs=['bold']) +
        config.vocoder_path)

    waveglow = WaveGlow(config)
    waveglow, _, _ = load_checkpoint(config.vocoder_path, waveglow)
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = set_device(waveglow, config.device)
    waveglow.eval()

    denoiser = Denoiser(waveglow, config)
    denoiser = set_device(denoiser, config.device)

    with torch.no_grad():
        wave = waveglow.infer(mel, config.sigma).float()
        wave = denoiser(wave, strength=config.denoising_strength)
        peak = torch.max(torch.abs(wave))
        # Dividing an all-zero waveform by its peak would fill it with NaNs.
        if peak > 0:
            wave = wave / peak
    return wave.cpu()
if not os.path.exists("results"): os.mkdir("results") audio.save_wav(wav[0].data.cpu().numpy(), os.path.join("results", str(num) + ".wav")) if __name__ == "__main__": # Test device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch.manual_seed(hp.seed) torch.cuda.manual_seed(hp.seed) model = WaveGlow().cuda() checkpoint = torch.load('test/TTSglow_130000') model.load_state_dict(checkpoint['model'].state_dict()) dataset = FastSpeechDataset() testing_loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, drop_last=True, num_workers=4) model = model.eval() for i, data_of_batch in enumerate(testing_loader): src_seq = data_of_batch["texts"] src_pos = data_of_batch["pos"]
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, num_workers=4):
    """Train WaveGlow with a per-epoch StepLR decay and per-epoch evaluation.

    Args:
        num_gpus: Number of processes/GPUs; values above 1 enable the
            distributed setup.
        rank: Rank of this process; rank 0 creates the output directory,
            logs to tensorboard and writes checkpoints.
        group_name: Passed to ``init_distributed``.
        output_directory: Destination of checkpoints and tensorboard logs.
        epochs: Upper bound (exclusive) of the epoch range.
        learning_rate: Initial Adam learning rate; also stored in
            checkpoints.
        sigma: Passed to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is written whenever the global
            iteration is a multiple of this value.
        batch_size: Batch size of both loaders.
        seed: Seed for the CPU and CUDA torch RNGs.
        fp16_run: Use apex amp (opt level O1) when true.
        checkpoint_path: Checkpoint to resume from, or ``""``.
        with_tensorboard: Log the training loss with tensorboardX on rank 0.
        num_workers: Worker processes per DataLoader.
    """
    print("num_workers", num_workers)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Resume from a checkpoint when one is given.
    # NOTE(review): the scheduler state is not restored here, so a resumed
    # run restarts the decay schedule -- confirm this is intended.
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    evalset = Mel2Samp(**eval_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=num_workers,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset, num_workers=num_workers,
                             shuffle=False,
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    epoch_offset = max(1, int(iteration / len(train_loader)))
    start_time = datetime.datetime.now()
    use_multi_speaker = \
        waveglow_config["multi_speaker_config"]["use_multi_speaker"]

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        # get_last_lr() reports the rate actually in use; get_lr() is only
        # meant to be called from inside scheduler.step().
        print('Epoch:', epoch, 'LR:', scheduler.get_last_lr())
        elapsed = datetime.datetime.now() - start_time
        print("Epoch: [{}][els: {}] {}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch))

        model.train()
        total_loss = 0.
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            if use_multi_speaker:
                mel, audio, spk_embed_or_id = batch
                spk_embed_or_id = torch.autograd.Variable(
                    spk_embed_or_id.cuda())
            else:
                mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())

            if use_multi_speaker:
                outputs = model((mel, audio, spk_embed_or_id))
            else:
                outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            total_loss += reduced_loss

            if i > 0 and i % 10 == 0:
                elapsed = datetime.datetime.now() - start_time
                print(
                    "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}"
                    .format(
                        datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
                        elapsed, epoch, iteration, i, len(train_loader),
                        reduced_loss))

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1

        elapsed = datetime.datetime.now() - start_time
        print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch, total_loss / len(train_loader)))

        scheduler.step()
        eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch,
                  use_multi_speaker)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    """Train WaveGlow with optional validation and checkpoint rotation.

    Args:
        num_gpus: Number of processes/GPUs; values above 1 enable the
            distributed setup.
        rank: Rank of this process; rank 0 creates the output directory,
            owns the tensorboard logger and writes checkpoints.
        group_name: Passed to ``init_distributed``.
        output_directory: Destination of checkpoints and tensorboard logs.
        epochs: Upper bound (exclusive) of the epoch range.
        learning_rate: Adam learning rate; also stored in checkpoints.
        sigma: Passed to ``WaveGlowLoss``.
        iters_per_checkpoint: Validation/checkpoint period in iterations.
        batch_size: Training batch size (also handed to ``validate``).
        seed: Seed for the CPU and CUDA torch RNGs.
        fp16_run: Use apex amp (opt level O1) and gradient clipping.
        checkpoint_path: Checkpoint to resume from, or ``""``.
        with_tensorboard: Log loss and step duration on rank 0.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(data_config['training_files'],
                        data_config['segment_length'],
                        data_config['filter_length'],
                        data_config['hop_length'],
                        data_config['win_length'],
                        data_config['sampling_rate'],
                        data_config['mel_fmin'],
                        data_config['mel_fmax'],
                        debug=False)

    if 'testing_files' in data_config:
        testset = Mel2Samp(data_config['testing_files'],
                           data_config['segment_length'],
                           data_config['filter_length'],
                           data_config['hop_length'],
                           data_config['win_length'],
                           data_config['sampling_rate'],
                           data_config['mel_fmin'],
                           data_config['mel_fmax'],
                           debug=True)
    else:
        testset = None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
    else:
        logger = None

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()

            model.zero_grad()

            print("train batch loaded, {} ({} of {})".format(
                iteration, i, len(train_loader)))

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # A NaN gradient norm marks an fp16 overflow step; such steps
            # are neither validated nor checkpointed below.
            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
                is_overflow = math.isnan(grad_norm)

            optimizer.step()

            duration = time.perf_counter() - start

            print(
                "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format(
                    iteration, i, len(train_loader), reduced_loss, duration))

            if logger:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
                logger.add_scalar('duration', duration,
                                  i + len(train_loader) * epoch)

            # Only validation depends on a test set. Checkpoints must be
            # written without one too; gating the whole branch on
            # ``testset`` meant such runs never saved anything.
            if not is_overflow and (iteration % iters_per_checkpoint == 0):
                if testset:
                    validate(model, criterion, testset, iteration,
                             batch_size, num_gpus, logger)

                if rank == 0:
                    rotate_checkpoints(output_directory)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path, hparams):
    """Train the Tacotron2-conditioned WaveGlow variant.

    A pretrained Tacotron2 loaded from ``'tacotron2.pt'`` supplies encoder
    outputs and alignments for every batch (computed without gradients);
    WaveGlow is trained on the rescaled mel targets.

    Args:
        num_gpus: Number of processes/GPUs; values above 1 enable the
            distributed setup.
        rank: Rank of this process; rank 0 creates the output directory,
            owns the logger and writes checkpoints.
        group_name: Passed to ``init_distributed``.
        output_directory: Destination of checkpoints.
        log_directory: Passed to ``prepare_directories_and_logger``.
        checkpoint_path: Checkpoint to resume from; falsy to start fresh.
        hparams: Hyper-parameter object (``seed``, ``sigma``,
            ``learning_rate``, ``batch_size``, ``epochs``, ``fp16_run``,
            ``grad_clip_thresh``, ``iters_per_checkpoint``, ``n_position``,
            ``training_files``, ``with_tensorboard``).
    """
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hparams.sigma)
    model = WaveGlow(hparams).cuda()

    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = TextMelLoader(hparams.training_files, hparams)
    collate_fn = TextMelCollate()
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    batch_size = hparams.batch_size
    train_loader = DataLoader(trainset, num_workers=0, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    # Always bind ``logger``: the checkpoint branch below runs on rank 0
    # even when tensorboard logging is disabled.
    logger = None
    if hparams.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    print("Total Epochs: {}".format(hparams.epochs))
    print("Batch Size: {}".format(hparams.batch_size))
    print("learning rate: {}".format(hparams.learning_rate))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            text_padded, input_lengths, mel_padded, max_len, output_lengths = \
                parse_batch(batch)
            # The pretrained Tacotron2 only supplies conditioning; no
            # gradients are needed for it.
            with torch.no_grad():
                enc_outputs, alignments = Taco2(
                    (text_padded, input_lengths, mel_padded, max_len,
                     output_lengths))

            mel_pos = torch.arange(1000)
            mel_pos = to_gpu(mel_pos).long().unsqueeze(0)
            mel_pos = mel_pos.expand(hparams.batch_size, -1)
            src_pos = torch.arange(hparams.n_position)
            src_pos = to_gpu(src_pos).long().unsqueeze(0)
            src_pos = src_pos.expand(hparams.batch_size, -1)

            mel_padded = (mel_padded + 5) / 10

            z, log_s_list, log_det_w_list, dec_enc_attn = model(
                mel_padded, enc_outputs, mel_pos, src_pos, input_lengths)
            outputs = (z, log_s_list, log_det_w_list, dec_enc_attn)
            loss = criterion(outputs, alignments)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if logger is not None:
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    iteration)

            if (iteration % hparams.iters_per_checkpoint == 0):
                if rank == 0:
                    if logger is not None:
                        # The test pass only feeds the alignment log, so it
                        # is skipped together with it.
                        mel_predict, test_attn = model.test(
                            mel_padded, enc_outputs, mel_pos, src_pos,
                            input_lengths)
                        logger.log_alignment(model, dec_enc_attn, alignments,
                                             mel_padded, mel_predict,
                                             test_attn, iteration)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1
def main(files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16, denoiser_strength, args):
    """Encode an audio file to WaveGlow latents and decode it back to a wav.

    For each file the forward pass yields the latent ``outputs[0]``; the
    flow steps are then inverted by hand, conditioned on the up-sampled mel,
    and the result is written to ``<output_dir>/fuyangz_synthesis.wav``.

    NOTE(review): several parameters are overridden or unused below:
    ``files`` and ``sigma`` are replaced by hard-coded values, the
    checkpoint and config paths are literals rather than ``waveglow_path``,
    and the ``denoiser`` built from ``denoiser_strength`` is not applied
    anywhere in this function. Confirm whether that is intended.
    """
    #mel_files = files_to_list(mel_files)
    #print(mel_files)
    # Hard-coded input; the ``files`` argument is discarded here.
    files = ['/local-scratch/fuyang/cmpt726/final_project/cremad/1091_WSI_SAD_XX.wav']
    #files = ['/local-scratch/fuyang/cmpt726/waveglow/data/LJSpeech-1.1/LJ001-0001.wav']
    # Model hyper-parameters come from ./config.json (relative to the CWD).
    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)
    # The checkpoint stores a whole module under 'model'; only its weights
    # are copied into the freshly built model.
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt', map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    #waveglow = torch.load(waveglow_path)['model']
    #waveglow = waveglow.remove_weightnorm(waveglow)
    #waveglow.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")
    if denoiser_strength > 0:
        # NOTE(review): created but never used later in this function.
        denoiser = Denoiser(waveglow).cuda()
    # The sample rate here comes from ``args``, not the ``sampling_rate``
    # parameter used for resampling below -- presumably equal; verify.
    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)
    for i, file_path in enumerate(files):
        audio, rate = load_wav_to_torch(file_path)
        if rate != sampling_rate:
            audio = resampy.resample(audio.numpy(), rate, sampling_rate)
            audio = torch.from_numpy(audio).float()
        #if audio.size(0) >= args.segment_length:
        #    max_audio_start = audio.size(0) - args.segment_length
        #    audio_start = random.randint(0, max_audio_start)
        #    audio = audio[audio_start:audio_start+args.segment_length]
        #else:
        #    audio = torch.nn.functional.pad(audio, (0, args.segment_length-audio.size(0)), 'constant').data
        # The mel is computed before the audio is scaled by MAX_WAV_VALUE.
        mel = mel_extractor.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
        audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
        audio = audio.half() if is_fp16 else audio
        mel = mel.half() if is_fp16 else mel
        # Forward pass; outputs[0] is used as the latent below.
        outputs = waveglow((mel, audio))
        # Channels 4 and up seed the inversion; channels 0-3 are
        # re-injected inside the loop -- presumably the early outputs.
        z = outputs[0][:,4:]
        print(outputs)
        # Up-sample the mel and trim the tail the upsampling layer adds.
        mel_up = waveglow.upsample(mel)
        time_cutoff = waveglow.upsample.kernel_size[0]-waveglow.upsample.stride[0]
        mel_up = mel_up[:,:,:-time_cutoff]
        #mel_up = mel_up[:,:,:-(time_cutoff+128)]
        # Group the up-sampled mel into windows of n_group along time.
        mel_up = mel_up.unfold(2, waveglow.n_group, waveglow.n_group).permute(0,2,1,3)
        mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1), -1).permute(0, 2, 1)
        audio = z
        # Crop the conditioning to the latent's length along the last axis.
        mel_up = mel_up[:,:,:audio.size(2)]
        # Overrides the ``sigma`` argument.
        sigma = 0.7
        # NOTE(review): z_i is never modified, so every slice taken in the
        # loop below is channels 2:4 -- confirm this is intended.
        z_i = 0
        # Invert the flow steps from last to first.
        for k in reversed(range(waveglow.n_flows)):
            n_half = int(audio.size(1)/2)
            audio_0 = audio[:,:n_half, :]
            audio_1 = audio[:, n_half:, :]
            # Undo the coupling: WN returns the shift in its first n_half
            # channels and the log-scale in the rest.
            output = waveglow.WN[k]((audio_0, mel_up))
            s = output[:,n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1-b)/torch.exp(s)
            audio = torch.cat([audio_0, audio_1],1)
            audio = waveglow.convinv[k](audio, reverse=True)
            if k % waveglow.n_early_every == 0 and k > 0:
                # Re-attach latent channels taken from the forward pass
                # instead of sampling fresh noise.
                z = outputs[0][:, 2-z_i:4-z_i]
                #if mel_up.type() == 'torch.cuda.HalfTensor':
                #    z = torch.cuda.HalfTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                #else:
                #    z = torch.cuda.FloatTensor(mel_up.size(0), waveglow.n_early_size, mel_up.size(2)).normal_()
                audio = torch.cat((sigma*z, audio),1)
        # Flatten back to one waveform per batch item and convert to int16.
        audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
        audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        # The output name is fixed, so every file overwrites the same wav.
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format('fuyangz'))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path):
    """Continue WaveGlow training from ``checkpoint_path`` (fp32 only).

    The checkpoint is always loaded. Every ``iters_per_checkpoint``
    iterations the loss is printed and rank 0 overwrites
    ``<output_directory>/waveglow``.

    Args:
        num_gpus: Number of processes/GPUs; values above 1 enable the
            distributed setup.
        rank: Rank of this process.
        group_name: Passed to ``init_distributed``.
        output_directory: Destination of the checkpoint file.
        epochs: Upper bound (exclusive) of the epoch range.
        learning_rate: Adam learning rate; also stored in checkpoints.
        sigma: Passed to ``WaveGlowLoss``.
        iters_per_checkpoint: Reporting/checkpoint period in iterations.
        batch_size: Training batch size.
        seed: Seed for the CPU and CUDA torch RNGs.
        checkpoint_path: Checkpoint to resume from.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Resume from the given checkpoint (loaded unconditionally).
    iteration = 0
    print("checkpoint path", checkpoint_path)
    model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                  optimizer)
    iteration += 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    # DataLoader rejects shuffle=True together with a sampler, so shuffling
    # is left to the DistributedSampler when one is in use.
    train_loader = DataLoader(trainset, num_workers=1,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            if (iteration % iters_per_checkpoint == 0):
                print("{}:\t{:.9f}".format(iteration, reduced_loss))
                # Only rank 0 writes: every rank targets the same file, and
                # concurrent writers could corrupt it.
                if rank == 0:
                    checkpoint_path = "{}/waveglow".format(output_directory)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    """Collect mean WaveGlow latent vectors per emotion style.

    For each of the styles ``happy``, ``sad`` and ``angry`` up to 201
    randomly ordered files are pushed through the WaveGlow forward pass; the
    time-averaged latent of each file is collected and the resulting
    ``{style: [vector, ...]}`` dict is written with ``np.save`` to
    ``all_style_vector``.

    NOTE(review): ``style``, ``waveglow_path``, ``sigma`` and ``output_dir``
    are not used; the config, checkpoint and dataset paths are literals.
    Files that fail to process are reported and skipped.
    """
    dataset = voice_dataset(dataBase={
        'ravdess': './our_data/ravdess',
        'cremad': './our_data/cremad'
    }, style=('happy', 'sad', 'angry'))

    styles = ['happy', 'sad', 'angry']

    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]

    model = WaveGlow(**waveglow_config)
    # The checkpoint stores a whole module under 'model'; only its weights
    # are copied into the freshly built model.
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()

    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        # NOTE(review): created but never used later in this function.
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    vector_all = {}
    # A separate loop name keeps the ``style`` parameter from being shadowed.
    for style_name in styles:
        files = dataset.final_data[style_name].copy()
        random.shuffle(files)
        vectors = []
        for i, (_, file_path) in enumerate(files):
            if i > 200:
                break
            try:
                audio, rate = load_wav_to_torch(file_path)
                if rate != sampling_rate:
                    audio = resampy.resample(audio.numpy(), rate,
                                             sampling_rate)
                    audio = torch.from_numpy(audio).float()

                # The mel is computed before the audio is scaled.
                mel = mel_extractor.get_mel(audio)
                audio = audio / MAX_WAV_VALUE

                mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
                audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
                audio = audio.half() if is_fp16 else audio
                mel = mel.half() if is_fp16 else mel

                outputs = waveglow((mel, audio))
                # Average the latent over dim 1 of the squeezed tensor.
                vectors.append(
                    outputs[0].squeeze(0).mean(1).detach().cpu().numpy())
                print(style_name, i)
            except Exception as err:
                # Best-effort: skip unusable files, but say which one failed
                # and let KeyboardInterrupt/SystemExit propagate.
                print("skipping {}: {!r}".format(file_path, err))
                continue
        vector_all[style_name] = vectors

    np.save('all_style_vector', vector_all)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, num_workers=2):
    """Train WaveGlow with a per-epoch evaluation pass and audio logging.

    After every training epoch the eval set is scored without gradients; on
    rank 0 with tensorboard enabled the mean eval loss and up to five
    original/generated audio pairs from the first eval batch are logged.

    Args:
        num_gpus: Number of processes/GPUs; values above 1 enable the
            distributed setup.
        rank: Rank of this process; rank 0 creates the output directory,
            logs to tensorboard and writes checkpoints.
        group_name: Passed to ``init_distributed``.
        output_directory: Destination of checkpoints and tensorboard logs.
        epochs: Upper bound (exclusive) of the epoch range.
        learning_rate: Adam learning rate; also stored in checkpoints.
        sigma: Passed to ``WaveGlowLoss``.
        iters_per_checkpoint: Checkpoint period in iterations.
        batch_size: Batch size of both loaders.
        seed: Seed for the CPU and CUDA torch RNGs.
        fp16_run: Use apex amp (opt level O1) when true.
        checkpoint_path: Checkpoint to resume from, or ``""``.
        with_tensorboard: Enable tensorboardX logging on rank 0.
        num_workers: Worker processes per DataLoader.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # HACK: setup separate training and eval sets.
    # NOTE(review): this mutates the shared ``data_config`` dict in place,
    # so calling train() twice in one process would raise KeyError here.
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======

    print("Creating dataloaders with " + str(num_workers) + " workers")
    # DataLoader rejects shuffle=True together with a sampler, so shuffling
    # is left to the DistributedSampler when one is in use.
    train_loader = DataLoader(trainset, num_workers=num_workers,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset, num_workers=num_workers,
                             shuffle=(eval_sampler is None),
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))

    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()

                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))

                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()

                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()

                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()

                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)

                iteration += 1
                train_pbar.update(1)

        # Eval
        model.eval()
        torch.cuda.empty_cache()

        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()

                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))

                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None

                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)

            # Skip reporting when the eval loader yielded nothing: the mean
            # of an empty list raises and there is no batch to synthesize.
            if with_tensorboard and rank == 0 and loss_accum:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)

                # log audio samples to tensorboard
                tensorboard_audio_generated = model.infer(tensorboard_mel)
                # Never index past the batch; batch_size may be below 5.
                num_samples = min(5, tensorboard_audio.size(0))
                for i in range(num_samples):
                    ta = tensorboard_audio[i].cpu().numpy()
                    tag = tensorboard_audio_generated[i].cpu().numpy()
                    logger_eval.add_audio(
                        "sample " + str(i) + "/orig", ta, epoch,
                        sample_rate=data_config['sampling_rate'])
                    logger_eval.add_audio(
                        "sample " + str(i) + "/gen", tag, epoch,
                        sample_rate=data_config['sampling_rate'])
                logger_eval.flush()
def train(num_gpus, rank, group_name, output_directory, epochs, init_lr,
          final_lr, sigma, epochs_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    """Train WaveGlow epoch by epoch, keeping a single rolling checkpoint.

    Args:
        num_gpus: Number of GPUs; values above 1 enable the distributed
            initialisation, gradient all-reduce and distributed sampler.
        rank: Rank of this process. Only rank 0 logs and writes checkpoints.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving checkpoints and ``logs``.
        epochs: Last epoch index to run (the loop is inclusive).
        init_lr: Learning rate used to build the optimizer; also forwarded
            to ``adjust_learning_rate``.
        final_lr: Forwarded to ``adjust_learning_rate``.
        sigma: Forwarded to ``WaveGlowLoss``.
        epochs_per_checkpoint: A checkpoint is written whenever
            ``epoch % epochs_per_checkpoint == 0``.
        batch_size: Batch size of the training ``DataLoader``.
        seed: Seed for the CPU and CUDA torch generators.
        fp16_run: When true, train through apex ``amp`` (opt level O1).
        checkpoint_path: Checkpoint to resume from, or ``""`` to start fresh.
        with_tensorboard: When true, rank 0 logs the loss with tensorboardX.
    """
    os.makedirs(output_directory, exist_ok=True)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    epoch_offset = 1
    if checkpoint_path != "":
        model, optimizer, epoch_offset = load_checkpoint(
            checkpoint_path, model, optimizer)
        epoch_offset += 1  # next epoch is epoch_offset + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=8,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs + 1):
        print(f'Epoch: {epoch}')
        if train_sampler is not None:
            # Without this the distributed sampler yields the same ordering
            # in every epoch.
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs)

        for i, batch in enumerate(tqdm.tqdm(train_loader)):
            optimizer.zero_grad()
            batch = model.pre_process(batch)
            outputs = model(batch)
            loss = criterion(outputs)

            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + 1 + len(train_loader) * epoch)

        if epoch % epochs_per_checkpoint == 0 and rank == 0:
            checkpoint_path = os.path.join(output_directory,
                                           f'waveglow_{epoch:06d}.pt')
            save_checkpoint(model, optimizer, epoch, checkpoint_path)

            # Keep only one checkpoint. The previous one is removed only
            # after the new one has been written, so a failed save never
            # leaves the run without any checkpoint.
            last_chkpt = os.path.join(
                output_directory,
                f'waveglow_{epoch - epochs_per_checkpoint:06d}.pt')
            if os.path.exists(last_chkpt):
                os.remove(last_chkpt)
def train(
    num_gpus,
    rank,
    group_name,
    output_directory,
    epochs,
    learning_rate,
    sigma,
    iters_per_checkpoint,
    batch_size,
    seed,
    fp16_run,
    checkpoint_path,
    with_tensorboard,
):
    """Train WaveGlow and periodically log real/generated samples.

    Besides the scalar loss, rank 0 logs spectrogram images and audio for
    the first eight dataset items every 500 steps when tensorboard logging
    is enabled.

    Args:
        num_gpus: Number of GPUs; values above 1 enable the distributed paths.
        rank: Rank of this process. Only rank 0 logs and writes checkpoints.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving checkpoints and ``logs``.
        epochs: Exclusive upper bound of the epoch loop.
        learning_rate: Adam learning rate; also stored in checkpoints.
        sigma: Forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is written whenever
            ``iteration % iters_per_checkpoint == 0``.
        batch_size: Batch size of the training ``DataLoader``.
        seed: Seed for the CPU and CUDA torch generators.
        fp16_run: When true, train through apex ``amp`` (opt level O1).
        checkpoint_path: Checkpoint to resume from, or ``""`` to start fresh.
        with_tensorboard: When true, rank 0 logs with tensorboardX.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=False,
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=True,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, "logs"))

        # Fixed samples used for visualization (first eight dataset items).
        real_mels, real_audios = zip(*[trainset[i] for i in range(8)])
        real_mel = torch.cat(real_mels, dim=-1)
        real_audio = torch.cat(real_audios, dim=0)
        # Use the configured sample rate when available instead of assuming
        # 22050 Hz; the old value remains the fallback.
        sample_rate = data_config.get("sampling_rate", 22050)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            # Without this the distributed sampler yields the same ordering
            # in every epoch.
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                step = i + len(train_loader) * epoch
                logger.add_scalar("training_loss", reduced_loss, step)
                if step % 500 == 0:
                    # Synthesize audio for the eight fixed mels and compare
                    # it with the ground truth.
                    model.eval()
                    with torch.no_grad():
                        device = mel.device
                        fake_audio = (model.infer(
                            torch.stack(real_mels).to(device)).flatten(
                                0, 1).cpu())
                    model.train()
                    fake_mel = trainset.get_mel(fake_audio)

                    logger.add_image(
                        "training_mel_real",
                        plot_spectrogram_to_numpy(real_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_real",
                        real_audio,
                        step,
                        sample_rate,
                    )
                    logger.add_image(
                        "training_mel_fake",
                        plot_spectrogram_to_numpy(fake_mel),
                        step,
                        dataformats="HWC",
                    )
                    logger.add_audio(
                        "training_audio_fake",
                        fake_audio,
                        step,
                        sample_rate,
                    )
                    logger.flush()

            if iteration % iters_per_checkpoint == 0:
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1
def train(output_directory, epochs, learning_rate, sigma,
          iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path,
          with_tensorboard):
    """Run one pass of WaveGlow training over the data loader.

    The printed and logged loss is a running average that restarts after
    every checkpoint interval, as does the per-iteration timing.

    Args:
        output_directory: Directory receiving checkpoints and ``logs``.
        epochs: Accepted for interface compatibility; not read here.
        learning_rate: Adam learning rate; also stored in checkpoints.
        sigma: Forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: Checkpoint / averaging-window length.
        batch_size: Batch size of the training ``DataLoader``.
        seed: Seed for the CPU and CUDA torch generators.
        fp16_run: When true, train through apex ``amp`` (opt level O1).
        checkpoint_path: Checkpoint to resume from, or ``""`` to start fresh.
        with_tensorboard: When true, the loss is logged with tensorboardX.
    """
    # NOTE(review): ``rank`` is read below but is not a parameter of this
    # function; presumably it is a module-level global - confirm.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config,
                     filter_length=data_config["filter_length"],
                     hop_length=data_config["hop_length"]).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Resume from a checkpoint when one is given.
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)

    trainset = Mel2Samp(**data_config)
    train_loader = DataLoader(trainset,
                              num_workers=6,
                              sampler=RandomSampler(0, 14),
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=False)

    # Prepare the shared output directory.
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    model = model.cuda()

    window_start = time()
    loss_sum = 0
    for batch_idx, batch in enumerate(train_loader):
        model.zero_grad()

        mel, audio = batch
        mel = torch.autograd.Variable(mel.cuda())
        audio = torch.autograd.Variable(audio.cuda())

        loss = criterion(model((mel, audio)))
        loss_sum += loss.item()

        if not fp16_run:
            loss.backward()
        else:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        optimizer.step()

        # Number of batches accumulated in the current averaging window.
        window_len = batch_idx % iters_per_checkpoint + 1
        print("iteration:{}, loss:{:.4f}, time:{:.2f} "
              "".format(iteration + 1, loss_sum / window_len,
                        (time() - window_start) / window_len),
              end="\r")

        if with_tensorboard and rank == 0:
            logger.add_scalar('training_loss', loss_sum / window_len,
                              iteration + 1)

        if (iteration + 1) % iters_per_checkpoint == 0:
            # Start a new averaging window, then checkpoint.
            window_start = time()
            loss_sum = 0
            if rank == 0:
                checkpoint_path = "{}/waveglow_it{}.pt".format(
                    output_directory, iteration + 1)
                save_checkpoint(model, optimizer, learning_rate,
                                iteration + 1, checkpoint_path)

        iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, warm_start):
    """Train WaveGlow with Over9000, gradient clipping and LR-on-plateau.

    Args:
        num_gpus: Number of GPUs; values above 1 enable the distributed paths.
        rank: Rank of this process. Only rank 0 logs and writes checkpoints.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving checkpoints and ``logs``.
        epochs: Exclusive upper bound of the epoch loop.
        learning_rate: Initial learning rate of the optimizer.
        sigma: Forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is written whenever
            ``iteration % iters_per_checkpoint == 0``.
        batch_size: Batch size of the training ``DataLoader``.
        seed: Seed for the CPU and CUDA torch generators.
        fp16_run: When true, train through apex ``amp`` (opt level O1).
        checkpoint_path: Checkpoint to resume from, or ``""`` to start fresh.
        with_tensorboard: When true, rank 0 logs the loss with tensorboardX.
        warm_start: Forwarded to ``load_checkpoint``; when true the saved
            ``amp`` state is not restored.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = Over9000(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, warm_start)
        if fp16_run and not warm_start:
            # Only the amp state is needed here, so keep the rest of the
            # checkpoint off the GPU.
            amp.load_state_dict(
                torch.load(checkpoint_path, map_location='cpu')['amp'])
        iteration += 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    # DataLoader rejects shuffle=True together with a sampler, so shuffling
    # is delegated to the distributed sampler whenever one is used.
    train_loader = DataLoader(trainset,
                              num_workers=16,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.999,
                                                           patience=250,
                                                           cooldown=250,
                                                           verbose=True,
                                                           min_lr=1e-5)
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            # Without this the distributed sampler yields the same ordering
            # in every epoch.
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = mel.cuda()
            audio = audio.cuda()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 1.0)

            optimizer.step()
            if epoch > 1:
                # Step on the reduced scalar: it is identical on every rank
                # (so learning rates stay in sync) and carries no autograd
                # graph.
                scheduler.step(reduced_loss)

            print("{}:\t{:.9f}\t{:.9f}".format(iteration, reduced_loss,
                                               grad_norm))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, amp, iteration,
                                    checkpoint_path)

            iteration += 1
class TTSModel(object):
    """Text-to-speech pipeline: Tacotron2 mels plus a vocoder.

    When a WaveGlow checkpoint is supplied, ``synth_speech`` uses WaveGlow
    followed by a denoiser; otherwise it falls back to Griffin-Lim. In both
    cases ``synth_speech`` is wrapped in a klepto cache.
    """

    def __init__(self, tacotron2_path, waveglow_path, **kwargs):
        """Load the models.

        Args:
            tacotron2_path: Tacotron2 checkpoint containing a ``state_dict``
                entry.
            waveglow_path: WaveGlow checkpoint, or a falsy value to use the
                Griffin-Lim vocoder only.
            **kwargs: Forwarded to ``HParams``.
        """
        super(TTSModel, self).__init__()
        hparams = HParams(**kwargs)
        self.hparams = hparams
        self.model = Tacotron2(hparams)

        use_cuda = torch.cuda.is_available()
        # ``None`` keeps torch.load's default device mapping on CUDA hosts.
        map_location = None if use_cuda else "cpu"

        self.model.load_state_dict(
            torch.load(tacotron2_path,
                       map_location=map_location)["state_dict"])
        if use_cuda:
            self.model.cuda().eval()
        else:
            self.model.eval()

        self.k_cache = klepto.archives.file_archive(cached=False)
        if waveglow_path:
            wave_params = torch.load(waveglow_path, map_location=map_location)
            try:
                # First interpret the checkpoint as a plain state dict.
                self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
                self.waveglow.load_state_dict(wave_params)
            except Exception:
                # Otherwise treat it as a checkpoint that stores the whole
                # module under "model". ``Exception`` (not a bare except)
                # lets KeyboardInterrupt/SystemExit propagate.
                self.waveglow = wave_params["model"]
                self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
            if use_cuda:
                self.waveglow.cuda().eval()
            else:
                self.waveglow.eval()
            # workaround from
            # https://github.com/NVIDIA/waveglow/issues/127
            for m in self.waveglow.modules():
                if "Conv" in str(type(m)):
                    setattr(m, "padding_mode", "zeros")
            for k in self.waveglow.convinv:
                k.float().half()
            self.denoiser = Denoiser(self.waveglow,
                                     n_mel_channels=hparams.n_mel_channels)
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech)
        else:
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech_fast)
        self.taco_stft = TacotronSTFT(
            hparams.filter_length,
            hparams.hop_length,
            hparams.win_length,
            n_mel_channels=hparams.n_mel_channels,
            sampling_rate=hparams.sampling_rate,
            mel_fmax=4000,
        )

    def _generate_mel_postnet(self, text):
        """Return the Tacotron2 post-net mel output for ``text``."""
        sequence = np.array(text_to_sequence(text,
                                             ["english_cleaners"]))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence))
        if torch.cuda.is_available():
            sequence = sequence.cuda().long()
        else:
            sequence = sequence.long()
        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = self.model.inference(sequence)
        return mel_outputs_postnet

    def synth_speech_array(self, text, vocoder):
        """Synthesize ``text`` and return the audio as a numpy array.

        Args:
            text: Input text.
            vocoder: ``VOCODER_WAVEGLOW`` or ``VOCODER_GL``.

        Raises:
            ValueError: If ``vocoder`` is neither of the supported values.
        """
        mel_outputs_postnet = self._generate_mel_postnet(text)

        if vocoder == VOCODER_WAVEGLOW:
            with torch.no_grad():
                audio_t = self.waveglow.infer(mel_outputs_postnet,
                                              sigma=0.666)
                audio_t = self.denoiser(audio_t, 0.1)[0]
            audio = audio_t[0].data
        elif vocoder == VOCODER_GL:
            mel_decompress = self.taco_stft.spectral_de_normalize(
                mel_outputs_postnet)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0],
                                     self.taco_stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            if torch.cuda.is_available():
                spec_from_mel = spec_from_mel.cuda()
            audio = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                self.taco_stft.stft_fn,
                GL_ITERS,
            )
            audio = audio.squeeze()
        else:
            raise ValueError("vocoder arg should be one of [wavglow|gl]")
        audio = audio.cpu().numpy()
        return audio

    def _synth_speech(self,
                      text,
                      speed: float = 1.0,
                      sample_rate: int = OUTPUT_SAMPLE_RATE):
        """Synthesize with WaveGlow and post-process to ``sample_rate``."""
        audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)

        return postprocess_audio(
            audio,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
            tempo=speed,
        )

    def _synth_speech_fast(self,
                           text,
                           speed: float = 1.0,
                           sample_rate: int = OUTPUT_SAMPLE_RATE):
        """Synthesize with Griffin-Lim and post-process to ``sample_rate``."""
        audio = self.synth_speech_array(text, VOCODER_GL)

        return postprocess_audio(
            audio,
            tempo=speed,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
        )
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path):
    """Train the text-conditioned WaveGlow model on ``FastSpeechDataset``.

    Hyper-parameters (seed, sigma, learning rate, batch size, epochs, fp16,
    logging and checkpoint cadence) are read from the module-level ``hp``.

    Args:
        num_gpus: Number of GPUs; values above 1 enable the distributed paths.
        rank: Rank of this process. Only rank 0 logs and writes checkpoints.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving ``TTSglow_<iteration>`` files.
        log_directory: Forwarded to ``prepare_directories_and_logger``.
        checkpoint_path: Checkpoint to resume from; falsy to start fresh.
    """
    # Pick the device the batches are moved to.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)

    distributed = num_gpus > 1
    if distributed:
        init_distributed(rank, num_gpus, group_name, **dist_config)

    criterion = WaveGlowLoss(hp.sigma)
    model = WaveGlow().cuda()
    if distributed:
        model = apply_gradient_allreduce(model)

    learning_rate = hp.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hp.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Resume from a checkpoint when one is given.
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # continue with the iteration after the saved one

    dataset = FastSpeechDataset()

    print("Get Training Loader")
    training_loader = DataLoader(dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn,
                                 drop_last=True,
                                 num_workers=cpu_count())

    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hp.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model = model.train()
    epoch_offset = max(0, int(iteration / len(training_loader)))
    beta = hp.batch_size

    print("Total Epochs: {}".format(hp.epochs))
    print("Batch Size: {}".format(hp.batch_size))

    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, hp.epochs):
        print("Epoch: {}".format(epoch))
        for i, data_of_batch in enumerate(training_loader):
            model.zero_grad()

            # Inputs shared by both alignment modes.
            src_seq = torch.from_numpy(
                data_of_batch["texts"]).long().to(device)
            src_pos = torch.from_numpy(data_of_batch["pos"]).long().to(device)
            mel_tgt = torch.from_numpy(
                data_of_batch["mels"]).float().to(device)

            if hp.pre_target:
                # Alignments were precomputed and shipped with the batch.
                alignment_target = torch.from_numpy(
                    data_of_batch["alignment"]).float().to(device)
            else:
                # NOTE(review): ``tacotron2`` is not defined in this
                # function; presumably a module-level model - confirm.
                alignment_target = get_alignment(
                    src_seq, tacotron2).float().to(device)

            # For Data Parallel
            mel_max_len = mel_tgt.size(1)

            outputs = model(src_seq, src_pos, mel_tgt, mel_max_len,
                            alignment_target)
            _, _, _, duration_predictor = outputs
            mel_tgt = mel_tgt.transpose(1, 2)
            max_like, dur_loss = criterion(outputs, alignment_target, mel_tgt)

            if beta > 1 and iteration % 10000 == 0:
                beta = beta // 2

            loss = max_like + dur_loss

            if distributed:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if not hp.fp16_run:
                loss.backward()
            else:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hp.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, dur_loss, learning_rate,
                                    iteration)

            if iteration % hp.save_step == 0 and rank == 0:
                checkpoint_path = "{}/TTSglow_{}".format(
                    output_directory, iteration)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)

            iteration += 1
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    """Estimate the mean latent ``z`` of one style and resynthesize its audio.

    Up to 51 recordings of ``style`` are encoded with WaveGlow. The
    per-channel mean of the latent is accumulated and finally stored with
    ``np.save(style, ...)``. Each recording is also decoded again from its
    own latent and written below ``output_dir``.

    Args:
        style: Key into the dataset's ``final_data`` and name of the saved
            array.
        waveglow_path: Not read here; the checkpoint path is hard-coded below.
        sigma: Not effective; overwritten with 0.7 before decoding.
        output_dir: Root directory for the ``*_synthesis.wav`` files.
        sampling_rate: Target sample rate; inputs are resampled to it.
        is_fp16: When true, run the model through apex ``amp`` in half
            precision.
        denoiser_strength: A ``Denoiser`` is constructed when this is > 0.
        args: Namespace providing ``sampling_rate`` for the mel extractor.
    """
    dataset = voice_dataset(dataBase={
        'ravdess': './our_data/ravdess',
        'cremad': './our_data/cremad'
    },
                            style=('happy', 'sad', 'angry'))
    files = dataset.final_data[style]

    with open('config.json') as f:
        data = f.read()
    config = json.loads(data)
    waveglow_config = config["waveglow_config"]
    model = WaveGlow(**waveglow_config)

    # NOTE(review): the checkpoint path is hard-coded and ``waveglow_path``
    # is ignored - confirm this is intended.
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    model.cuda()
    waveglow = model
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)
    avg_z = np.zeros(8)
    _count = 0
    for i, (_, file_path) in enumerate(files):
        if i > 50:
            break
        try:
            audio, rate = load_wav_to_torch(file_path)
            if rate != sampling_rate:
                audio = resampy.resample(audio.numpy(), rate, sampling_rate)
                audio = torch.from_numpy(audio).float()

            mel = mel_extractor.get_mel(audio)
            audio = audio / MAX_WAV_VALUE

            mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
            audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
            audio = audio.half() if is_fp16 else audio
            mel = mel.half() if is_fp16 else mel

            # Forward pass: audio -> latent.
            outputs = waveglow((mel, audio))
            avg_z += outputs[0].squeeze(0).mean(1).detach().cpu().numpy()
            _count += 1

            # Reverse pass: latent -> audio, starting from the last four
            # latent channels.
            z = outputs[0][:, 4:]
            mel_up = waveglow.upsample(mel)
            time_cutoff = waveglow.upsample.kernel_size[
                0] - waveglow.upsample.stride[0]
            mel_up = mel_up[:, :, :-time_cutoff]

            mel_up = mel_up.unfold(2, waveglow.n_group,
                                   waveglow.n_group).permute(0, 2, 1, 3)
            mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1),
                                              -1).permute(0, 2, 1)

            audio = z
            mel_up = mel_up[:, :, :audio.size(2)]

            # NOTE(review): this overrides the ``sigma`` argument.
            sigma = 0.7
            # NOTE(review): ``z_i`` is never advanced, so every early-output
            # step re-injects channels 2:4 - confirm this is intended.
            z_i = 0
            for k in reversed(range(waveglow.n_flows)):
                n_half = int(audio.size(1) / 2)
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]

                output = waveglow.WN[k]((audio_0, mel_up))

                s = output[:, n_half:, :]
                b = output[:, :n_half, :]
                audio_1 = (audio_1 - b) / torch.exp(s)
                audio = torch.cat([audio_0, audio_1], 1)

                audio = waveglow.convinv[k](audio, reverse=True)

                if k % waveglow.n_early_every == 0 and k > 0:
                    # Re-inject the encoded latent rather than sampling
                    # fresh noise.
                    z = outputs[0][:, 2 - z_i:4 - z_i]
                    audio = torch.cat((sigma * z, audio), 1)

            audio = audio.permute(0, 2,
                                  1).contiguous().view(audio.size(0), -1).data

            audio = audio * MAX_WAV_VALUE
            audio = audio.squeeze()
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            audio_path = os.path.join(
                output_dir, "{}_synthesis.wav".format(file_path[:-4]))
            # dirname keeps the leading "/" of absolute paths, which the
            # previous split('/')-and-join reconstruction dropped.
            audio_dir = os.path.dirname(audio_path)
            if audio_dir:
                os.makedirs(audio_dir, exist_ok=True)
            write(audio_path, sampling_rate, audio)
            print(audio_path)
        except Exception as err:
            # Best effort: skip files that fail, but say so instead of
            # failing silently (and let KeyboardInterrupt propagate).
            print("Skipping {}: {!r}".format(file_path, err))
            continue

    if _count == 0:
        # Avoid saving an array of NaNs produced by a 0/0 division.
        print("No file could be processed for style {!r}; "
              "nothing saved".format(style))
        return

    avg_z = avg_z / _count
    np.save(style, avg_z)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, weight_sharing, optimizer_type,
          dataloader_type):
    """Train WaveGlow with a selectable optimizer and dataset flavour.

    The run name, used for the tensorboard directory and for checkpoint
    file names, encodes the weight-sharing setting, optimizer, dataloader
    type, batch size and, when they differ from the defaults, the learning
    rate and GPU count.

    Args:
        num_gpus: Number of GPUs; values above 1 enable the distributed paths.
        rank: Rank of this process. Only rank 0 logs and writes checkpoints.
        group_name: Forwarded to ``init_distributed``.
        output_directory: Directory receiving checkpoints.
        epochs: Exclusive upper bound of the epoch loop.
        learning_rate: Optimizer learning rate; also stored in checkpoints.
        sigma: Forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is written whenever
            ``iteration % iters_per_checkpoint == 0``.
        batch_size: Batch size of the training ``DataLoader``.
        seed: Seed for the CPU and CUDA torch generators.
        fp16_run: When true, train through apex ``amp`` (opt level O1).
        checkpoint_path: Checkpoint to resume from, or ``""`` to start fresh.
        with_tensorboard: When true, rank 0 logs the loss with tensorboardX.
        weight_sharing: Integer only used in the run name here.
        optimizer_type: ``"sgd"`` or ``"adam"`` (case-insensitive).
        dataloader_type: ``"vanilla"`` or ``"split"`` (case-insensitive).

    Returns:
        None. An unsupported optimizer or dataloader type prints a message
        and returns early.
    """
    ws = weight_sharing
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer_type = optimizer_type.lower()
    if optimizer_type == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer_type == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    else:
        print("Unsupported optimizer: %s. Aborting." % optimizer_type)
        return None

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    dataloader_type = dataloader_type.lower()
    if dataloader_type == "vanilla":
        trainset = Mel2Samp(**data_config)
    elif dataloader_type == "split":
        trainset = Mel2SampSplit(**data_config)
    else:
        print("Unsupported dataloader type: %s. Aborting." % dataloader_type)
        return None

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=(num_gpus == 1),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    name = "waveglow_ws%d_%s_%s_batch%d" % (ws, optimizer_type,
                                            dataloader_type, batch_size)
    if learning_rate != 1e-4:
        name = name + "_lr{:.0e}".format(learning_rate)
    if num_gpus > 1:
        name = name + "_x%d" % num_gpus

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join("./logs", name))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    stime2 = None  # start of the current 100-iteration timing window
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            # Without this the distributed sampler yields the same ordering
            # in every epoch.
            train_sampler.set_epoch(epoch)
        stime = time()
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if (iteration % 100 == 0):
                if stime2 is not None:
                    tot_time2 = time() - stime2
                    print("{}:\t{:.9f}, time: {}".format(
                        iteration, reduced_loss, int(tot_time2)))
                stime2 = time()

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}_{}".format(
                        output_directory, name, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1

        tot_time = time() - stime
        print("Epoch %d completed. Time: %d seconds" % (epoch, int(tot_time)))
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath,
                    drive_fid):
    """Save a checkpoint locally and mirror it to a Google Drive folder.

    After a successful upload the Drive folder is listed. If the uploaded
    file looks complete (larger than 4,000,000 bytes), every *other* file in
    that folder is deleted so only the newest checkpoint is kept; otherwise
    the uploaded file itself is deleted. Drive failures are reported but
    never raised, so training continues with the local checkpoint.

    Args:
        model: Model whose ``state_dict`` is copied into a fresh ``WaveGlow``.
        optimizer: Optimizer whose ``state_dict`` is stored.
        learning_rate: Stored under ``'learning_rate'``.
        iteration: Stored under ``'iteration'``.
        filepath: Local destination of the checkpoint.
        drive_fid: Id of the Google Drive folder to upload into.
    """
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save(
        {
            'model': model_for_saving,
            'iteration': iteration,
            'optimizer': optimizer.state_dict(),
            'learning_rate': learning_rate
        }, filepath)

    min_valid_size = 4000000  # bytes; smaller uploads count as truncated
    uploaded = False
    attempt = 0
    # Drive title: everything after the first "/" of the local path.
    file_title = filepath[filepath.find("/") + 1:]
    while not uploaded and attempt < 10:
        attempt += 1
        try:
            if gauth.credentials is None:
                # Authenticate if they're not there
                gauth.LocalWebserverAuth()
            elif gauth.access_token_expired:
                # Refresh them if expired
                print("Google Drive Token Expired, Refreshing")
                gauth.Refresh()
            else:
                # Initialize the saved creds
                gauth.Authorize()

            f = drive.CreateFile({
                'title': file_title,
                "parents": [{
                    "kind": "drive#fileLink",
                    "id": drive_fid
                }]
            })
            f.SetContentFile(filepath)
            f.Upload()
            uploaded = True
            break
        except Exception as err:
            # ``Exception`` instead of a bare except so Ctrl-C still stops
            # the retry loop.
            print("Failed uploading to drive at attempt #{}: {!r}".format(
                attempt, err))
            sleep(30)

    if uploaded:
        folder_query = {'q': "'" + drive_fid + "' in parents"}
        try:
            ok = False
            for entry in drive.ListFile(folder_query).GetList():
                if entry['title'] == file_title:
                    # The Drive API reports fileSize as a string; comparing
                    # it to an int raised TypeError, which the old bare
                    # except swallowed.
                    if int(entry["fileSize"]) > min_valid_size:
                        ok = True
                        print("File was successfully uploaded")
                    else:
                        entry.Delete()
                        uploaded = False
                        print("File was not uploaded normally. Deleting")
                        sleep(30)
                    break

            if ok:
                # Keep only the checkpoint that was just uploaded.
                for entry in drive.ListFile(folder_query).GetList():
                    if entry['title'] != file_title:
                        entry.Delete()
                        # make sure the file is deleted from drive first
                        sleep(30)
        except Exception as err:
            print("Failed verifying the drive upload: {!r}".format(err))
def train(num_gpus, rank, group_name, prj_name, run_name, output_directory,
          epochs, learning_rate, sigma, iters_per_checkpoint, batch_size,
          seed, fp16_run, grad_clip_thresh, checkpoint_path, pretrained_path,
          with_tensorboard, with_wandb):
    """Train WaveGlow with gradient clipping and optional wandb logging.

    Args:
        num_gpus: Number of GPUs; values above 1 enable the distributed paths.
        rank: Rank of this process. Only rank 0 logs and writes checkpoints.
        group_name: Forwarded to ``init_distributed``.
        prj_name: Sub-directory of ``output_directory`` for checkpoints.
        run_name: Sub-directory of ``prj_name`` for checkpoints.
        output_directory: Root directory for checkpoints and ``logs``.
        epochs: Exclusive upper bound of the epoch loop.
        learning_rate: Adam learning rate; also stored in checkpoints.
        sigma: Forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: A checkpoint is written whenever
            ``iteration % iters_per_checkpoint == 0``.
        batch_size: Batch size of the training ``DataLoader``.
        seed: Seed for the CPU and CUDA torch generators.
        fp16_run: When true, train through apex ``amp`` (opt level O1).
        grad_clip_thresh: Max norm passed to ``clip_grad_norm_``.
        checkpoint_path: Checkpoint to resume from, or ``""`` to start fresh.
        pretrained_path: Weights loaded through ``load_pretrained``, or ``""``.
        with_tensorboard: When true, rank 0 logs the loss with tensorboardX.
        with_wandb: When true, rank 0 logs metrics through ``wandb``.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    if pretrained_path != "":
        model = load_pretrained(pretrained_path, model)

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
        shuffle_at_dataloader = False
    else:
        train_sampler = None
        shuffle_at_dataloader = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle_at_dataloader,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            # Without this the distributed sampler yields the same ordering
            # in every epoch.
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            iter_start = time.perf_counter()
            float_epoch = float(iteration) / len(train_loader)

            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss, etc = criterion(outputs)
            (z_L2_normalized, neg_log_s_total, neg_log_det_W_total) = etc
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                # A NaN norm marks an fp16 overflow step; gradient stats
                # are not logged for it.
                is_overflow = math.isnan(grad_norm)
                if not is_overflow:
                    clipped_grad_norm = get_clip_grad_norm(
                        grad_norm, grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
                clipped_grad_norm = get_clip_grad_norm(grad_norm,
                                                       grad_clip_thresh)

            optimizer.step()

            iter_duration = time.perf_counter() - iter_start

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
            if with_wandb and rank == 0:
                wandb.log(
                    {
                        'iteration': iteration,
                        'epoch': float_epoch,
                        'iter_duration': iter_duration,
                        'training_loss': reduced_loss,
                        'training_loss/z_L2_normalized': z_L2_normalized,
                        'training_loss/neg_log_s_total': neg_log_s_total,
                        'training_loss/neg_log_det_W_total':
                        neg_log_det_W_total,
                    },
                    step=iteration)
                if not is_overflow:
                    wandb.log(
                        {
                            'grad_norm': grad_norm,
                            'clipped_grad_norm': clipped_grad_norm,
                        },
                        step=iteration)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/{}/{}/waveglow_{}".format(
                        output_directory, prj_name, run_name, iteration)
                    # Only output_directory itself is created above; make
                    # sure the project/run sub-directories exist too.
                    os.makedirs(os.path.dirname(checkpoint_path),
                                exist_ok=True)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1
def main():
    """Build a multi-tower WaveGlow training graph and run the training loop.

    Reads command-line options via ``get_arguments()`` and model settings
    from the module-level ``hparams``. One tower is created per GPU
    (``args.ngpu``); each tower consumes its own ``hparams.batch_size`` slice
    of the shared placeholders, and the per-tower gradients are combined with
    ``average_gradients`` before a single Adam update.

    Summaries are written to ``<hparams.logdir_root>/<args.run_name>``.
    Checkpoints are saved every ``hparams.save_model_every`` steps and once
    more on exit if steps were run since the last save.

    Raises:
        AssertionError: if ``hparams.upsampling_rate != hparams.hop_length``.
        Exception: re-raised unchanged when restoring a checkpoint fails.
    """
    args = get_arguments()
    args.logdir = os.path.join(hparams.logdir_root, args.run_name)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.logdir, exist_ok=True)

    assert hparams.upsampling_rate == hparams.hop_length, 'upsamling rate should be same as hop_length'

    # Create coordinator.
    coord = tf.train.Coordinator()
    global_step = tf.get_variable(
        "global_step", [],
        initializer=tf.constant_initializer(0),
        trainable=False)
    learning_rate = tf.train.exponential_decay(
        hparams.lr, global_step, hparams.decay_steps, 0.95, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    with tf.device('/cpu:0'):
        with tf.name_scope('inputs'):
            reader = DataReader(coord, args.filelist, args.wave_dir, args.lc_dir)

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    reader.start_threads()

    # The placeholders hold the batches of all towers concatenated along
    # axis 0; each tower slices out its own batch_size rows below.
    audio_placeholder = tf.placeholder(
        tf.float32, shape=[None, None, 1], name='audio')
    lc_placeholder = tf.placeholder(
        tf.float32, shape=[None, None, hparams.num_mels], name='lc')

    tower_losses = []
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        for i in range(args.ngpu):
            with tf.device('/gpu:%d' % i), tf.name_scope('tower_%d' % i):
                glow = WaveGlow(lc_dim=hparams.num_mels,
                                n_flows=hparams.n_flows,
                                n_group=hparams.n_group,
                                n_early_every=hparams.n_early_every,
                                n_early_size=hparams.n_early_size)
                print('create network %i' % i)

                tower_begin = i * hparams.batch_size
                tower_end = (i + 1) * hparams.batch_size
                local_audio_placeholder = audio_placeholder[tower_begin:tower_end, :, :]
                local_lc_placeholder = lc_placeholder[tower_begin:tower_end, :, :]

                output_audio, log_s_list, log_det_W_list = glow.create_forward_network(
                    local_audio_placeholder, local_lc_placeholder)
                loss = compute_waveglow_loss(
                    output_audio, log_s_list, log_det_W_list, sigma=hparams.sigma)
                grads = optimizer.compute_gradients(
                    loss, var_list=tf.trainable_variables())

                tower_losses.append(loss)
                tower_grads.append(grads)
                tf.summary.scalar('loss_tower_%d' % i, loss)

    print("create network finished")
    loss = tf.reduce_mean(tower_losses)
    averaged_gradients = average_gradients(tower_grads)

    # Disabled alternative kept for reference: global-norm gradient clipping.
    # gradients = [grad for grad, var in averaged_gradients]
    # params = [var for grad, var in averaged_gradients]
    # clipped_gradients, norm = tf.clip_by_global_norm(gradients, 1.0)
    #
    # with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    #     train_ops = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)
    train_ops = optimizer.apply_gradients(averaged_gradients, global_step=global_step)

    tf.summary.scalar('loss', loss)

    # Set up logging for TensorBoard.
    writer = tf.summary.FileWriter(args.logdir)
    writer.add_graph(tf.get_default_graph())
    run_metadata = tf.RunMetadata()
    summaries = tf.summary.merge_all()

    # Set up session
    init = tf.global_variables_initializer()
    sess.run(init)
    print('parameters initialization finished')

    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=30)

    saved_global_step = 0
    if args.restore_from is not None:
        try:
            saved_global_step = load(saver, sess, args.restore_from)
            if saved_global_step is None:
                # No usable step was reported by load(); the first training
                # step is saved_global_step + 1, so start counting from 0.
                saved_global_step = 0
        except Exception:
            print("Something went wrong while restoring checkpoint. "
                  "We will terminate training to avoid accidentally overwriting "
                  "the previous model.")
            raise
        print("restore model successfully!")

    print('start training.')
    last_saved_step = saved_global_step
    # Bind `step` before the loop: if the range below is empty (restored step
    # already at/after train_steps) the `finally` clause would otherwise hit
    # a NameError when comparing `step` with `last_saved_step`.
    step = saved_global_step
    try:
        for step in range(saved_global_step + 1, hparams.train_steps):
            audio, lc = reader.dequeue(num_elements=hparams.batch_size * args.ngpu)

            if hparams.lc_encode or hparams.transposed_upsampling:
                # With bi-LSTM local-condition encoding or transposed-conv
                # upsampling, the upsampling happens inside the TF graph, so
                # only the reshape is needed here.
                lc = np.reshape(lc, [hparams.batch_size * args.ngpu, -1, hparams.num_mels])
            else:
                # Upsample by directly repeating each conditioning frame.
                lc = np.tile(lc, [1, 1, hparams.upsampling_rate])
                lc = np.reshape(lc, [hparams.batch_size * args.ngpu, -1, hparams.num_mels])

            feed = {audio_placeholder: audio, lc_placeholder: lc}
            fetches = [summaries, loss, train_ops, learning_rate]

            start_time = time.time()
            if step % 50 == 0 and args.store_metadata:
                # Slow run that stores extra information for debugging.
                print('Storing metadata')
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                summary, loss_value, _, lr = sess.run(
                    fetches,
                    feed_dict=feed,
                    options=run_options,
                    run_metadata=run_metadata)
                writer.add_summary(summary, step)
                writer.add_run_metadata(run_metadata,
                                        'step_{:04d}'.format(step))
                tl = timeline.Timeline(run_metadata.step_stats)
                timeline_path = os.path.join(args.logdir, 'timeline.trace')
                with open(timeline_path, 'w') as f:
                    f.write(tl.generate_chrome_trace_format(show_memory=True))
            else:
                summary, loss_value, _, lr = sess.run(fetches, feed_dict=feed)
                writer.add_summary(summary, step)

            duration = time.time() - start_time
            step_log = 'step {:d} - loss = {:.3f}, lr={:.8f}, time cost={:4f}'\
                .format(step, loss_value, lr, duration)
            print(step_log)

            if step % hparams.save_model_every == 0:
                save(saver, sess, args.logdir, step)
                last_saved_step = step
    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        # Persist progress made since the last periodic checkpoint.
        if step > last_saved_step:
            save(saver, sess, args.logdir, step)
        coord.request_stop()
        coord.join()
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    """Train a WaveGlow model on CPU tensors, optionally across processes.

    Args:
        num_gpus: number of participating processes; values > 1 enable the
            distributed init, gradient all-reduce and a DistributedSampler.
        rank: index of this process; only rank 0 creates the output
            directory, logs to TensorBoard and writes checkpoints.
        group_name: forwarded to ``init_distributed``.
        output_directory: where checkpoints (``waveglow_<iteration>``) and
            the ``logs`` sub-directory are written.
        epochs: exclusive upper bound of the epoch counter.
        learning_rate: Adam learning rate; also stored in each checkpoint.
        sigma: forwarded to ``WaveGlowLoss``.
        iters_per_checkpoint: a checkpoint is written whenever
            ``iteration % iters_per_checkpoint == 0``.
        batch_size: DataLoader batch size (incomplete batches are dropped).
        seed: seed passed to ``torch.manual_seed``.
        fp16_run: when true, wrap model/optimizer with apex ``amp`` (O1).
        checkpoint_path: checkpoint to resume from, or ``""`` to start fresh.
        with_tensorboard: when true, rank 0 logs ``training_loss``.

    Raises:
        ValueError: if the data loader yields no batches (for example the
            dataset holds fewer than ``batch_size`` items while
            ``drop_last=True``).
    """
    torch.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    batches_per_epoch = len(train_loader)
    if batches_per_epoch == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError(
            "train_loader is empty: the dataset yields no full batch of "
            "size {} (drop_last=True)".format(batch_size))

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            # exist_ok guards against another process creating the directory
            # between the isdir() check and makedirs().
            os.makedirs(output_directory, exist_ok=True)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, iteration // batches_per_epoch)
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            # Without set_epoch() a DistributedSampler reuses the same
            # ordering in every epoch.
            train_sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            # Plain tensors participate in autograd directly; the deprecated
            # Variable wrapper is not needed.
            mel = mel.cpu()
            audio = audio.cpu()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + batches_per_epoch * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    # Separate name so the `checkpoint_path` argument (the
                    # resume source) is not overwritten.
                    save_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    save_path)

            iteration += 1
from dataset import FastSpeechDataset, collate_fn, DataLoader from scipy.io.wavfile import write import sys device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') MAX_WAV_VALUE = 32768.0 if __name__ == "__main__": # Test device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') sampling_rate = 22050 torch.manual_seed(hp.seed) torch.cuda.manual_seed(hp.seed) model = WaveGlow().cuda() checkpoint = torch.load('test/TTSglow_67000') model.load_state_dict(checkpoint['model'].state_dict()) model = model.remove_weightnorm(model) dataset = FastSpeechDataset() testing_loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, drop_last=True, num_workers=4) model = model.train() for i, data_of_batch in enumerate(testing_loader): audio_tgt = data_of_batch["audios"]