def log(self, epoch, flag, audio_gen, f0, audio):
    """Write one epoch's diagnostics to TensorBoard (and sample WAVs to disk).

    Logs, under the current scale's tag prefix:
      * head/tail weight-norm gradient magnitudes of G (train only) and of D
        (train only, once adversarial training has started),
      * every accumulated loss meter for the given phase,
      * optionally (``self.log_audio``) the generated audio, the f0 sine input
        and the reference audio: each is written as a WAV file, logged as a
        16 kHz audio clip, and rendered as a spectrogram figure.

    Args:
        epoch: global step used as the TensorBoard x-axis.
        flag: phase selector, 'train' or 'val'.
        audio_gen: generated audio batch; only sample 0 is logged.
        f0: f0 (sine) input batch; only sample 0 is logged.
        audio: reference audio batch; only sample 0 is logged.
    """
    # log gradients
    if flag == 'train':
        # Gradient magnitudes at the output (tail) and input (head) convs of G.
        # .weight_v is the direction tensor of weight-norm parametrization.
        self.log_writer.add_scalar(
            f"{self.scale_tb_prefix}_grads/G/grad/tail",
            self.netG.module.last_conv_layers[-1].weight_v.grad.abs().mean(),
            epoch,
        )
        self.log_writer.add_scalar(
            f"{self.scale_tb_prefix}_grads/G/grad/head",
            self.netG.module.first_conv.weight_v.grad.abs().mean(),
            epoch,
        )
        # D only receives gradients after the adversarial phase kicks in.
        if self.hp_adv and epoch > self.disc_start:
            self.log_writer.add_scalar(
                f"{self.scale_tb_prefix}_grads/D/grad/tail",
                self.netD.module.conv_layers[-1].weight_v.grad.abs().mean(),
                epoch)
            self.log_writer.add_scalar(
                f"{self.scale_tb_prefix}_grads/D/grad/head",
                self.netD.module.conv_layers[0].weight_v.grad.abs().mean(),
                epoch,
            )
    # log losses
    for key in self.loss_meter_keys:
        if flag == 'train':
            self.log_writer.add_scalar(
                f"{self.scale_tb_prefix}_{flag}/{key}",
                self.loss_meter_train[key].summarize_epoch(),
                epoch)
        elif flag == 'val':
            self.log_writer.add_scalar(
                f"{self.scale_tb_prefix}_{flag}/{key}",
                self.loss_meter_val[key].summarize_epoch(),
                epoch)
    # log audios
    if self.log_audio:
        audios = [audio_gen, f0, audio]
        names = ['audio_gen', 'f0', 'audio']
        # f0 lives at its own sample rate; generated/reference share self.sr.
        srs = [self.sr, self.sr_f0, self.sr]
        # NOTE(review): the loop variable deliberately reuses (and shadows)
        # the `audio` parameter; the original value is already captured in
        # `audios` above, so this is safe but easy to misread.
        for audio, name, sr in zip(audios, names, srs):
            # Clamp sample 0 to [-1, 1] before writing to disk.
            audio_numpy = audio[0].clamp(
                -1, 1).squeeze().detach().cpu().numpy()
            sf.write(
                f"{self.scale_output_dirpath}/{name}_sample_{flag}.wav",
                audio_numpy, sr)
            # TensorBoard audio is always logged resampled to 16 kHz.
            audio = sampling.resample_torch(audio[0], sr, 16000).clamp(
                -1, 1).detach().squeeze()
            self.log_writer.add_audio(
                f"{self.scale_tb_prefix}_{flag}/{name}",
                audio, epoch, sample_rate=16000)
            # Scale nfft with sr so the FFT window spans a constant duration
            # (4096 samples at 48 kHz ≈ 85 ms).
            nfft = int(sr / (48000 / 4096))
            sample_stft = writers.show_spec(audio_numpy, nfft=nfft, sr=sr)
            self.log_writer.add_figure(
                f'{self.scale_tb_prefix}_{flag}/stft_{name}',
                sample_stft, epoch)
def draw_f0(Gs, samplers, in_s, max_val, loudness_list):
    """Run the generator pyramid once, coarse to fine, starting from ``in_s``.

    At every scale the current signal is fed (detached) to that scale's
    generator together with its loudness conditioning, then upsampled to the
    next scale with the paired sampler. Returns the finest-scale output.
    """
    current = in_s
    with torch.no_grad():
        for generator, up_sampler, loud in zip(Gs, samplers, loudness_list):
            generated = generator(current.detach(), loud.detach())
            current = resample_torch(
                generated, None, None, max_val=max_val, sampler=up_sampler)
    return current
def f0_transfer(real_audio, loudness_list, Gs, samplers, max_val, save_all=False):
    """Push ``real_audio`` through the generator pyramid for f0 transfer.

    Each intermediate scale generates from the previous scale's (cropped,
    upsampled) output; the final generator is applied once more at the end.
    Returns a list holding the last scale's output, preceded by every
    intermediate output when ``save_all`` is True.
    """
    with torch.no_grad():
        collected = []
        carry = real_audio
        for generator, up_sampler, loud in zip(Gs, samplers, loudness_list):
            produced = generator(carry.detach(), loud.detach())
            # Crop to the carried length before upsampling to the next scale.
            carry = produced[:, :, :carry.shape[-1]]
            if save_all:
                collected.append(produced.detach())
            carry = resample_torch(
                carry, None, None, max_val=max_val, sampler=up_sampler)
        # Final pass through the last generator at the finest scale.
        final = Gs[-1](carry.detach(), loudness_list[-1].detach())
        collected.append(final.detach())
        return collected
def main(args):
    """Inference entry point: f0-transfer every WAV in the input directory.

    Loads a trained generator pyramid, then for each input file and each
    requested octave shift: extracts an f0 sine excitation (via CREPE),
    computes per-scale loudness conditioning, runs the pyramid, and saves the
    resulting audio (plus the sine input) to the experiment output directory.

    NOTE(review): this file defines ``main`` three times; only the last
    definition is visible at module level.
    """
    # Hydra changes cwd per run; restore the original working directory so
    # relative paths in args resolve as the user wrote them.
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    # Load model args
    trained_dirpath = Path(args.trained_dirpath)
    run_args = torch.load(trained_dirpath / 'args.pth')
    # define args from trained model
    sr = run_args.sr
    num_scales = run_args.num_scales
    scale_factor = run_args.scale_factor
    max_value = run_args.max_val
    max_value_f0 = run_args.max_val_f0
    cond_freq = run_args.cond_freq
    # Convert filepaths
    input_dirpath = Path(args.input_dirpath)
    input_files = input_dirpath.glob('*.wav')
    output_dirpath = trained_dirpath.joinpath(args.exp_name)
    try:
        output_dirpath.mkdir()
    except FileExistsError:
        print('Directory already exists')
    # Pytorch device
    device = torch.device("cuda")
    # load input file
    base_audio = BaseAudio(args.crepe_path, device, args.unvoiced_flag)
    # One sample rate (and one sampler) per pyramid scale.
    srs = create_srs(sr, num_scales, scale_factor)
    samplers = create_samplers(srs, device=device)
    if args.norm_loudness_flag:
        norm_dicts = load_norm_dicts(trained_dirpath / 'loudness.json')
    else:
        norm_dicts = None
    # args.octaves are exponents; convert to multiplicative pitch factors.
    octave_shifts = [2**x for x in args.octaves]
    # Load Trained models
    Gs = load_trained_pyramid(trained_dirpath,
                              network_params=run_args.generator_params,
                              device=device,
                              srs=srs)
    for filepath in tqdm(input_files, desc='Generating audio file'):
        for octave in tqdm(octave_shifts, desc='Octave'):
            real_audio = load_audio(filepath, sr, max_value)
            # Trim to a whole number of loudness hops so conditioning frames
            # align exactly with the audio.
            loudness_hop = 8 * sr // cond_freq
            real_audio = real_audio[:len(real_audio)
                                    // loudness_hop * loudness_hop]
            loudness_list = calc_loudness_list(audio=real_audio,
                                               srs=srs,
                                               device=device,
                                               sr_in=sr,
                                               norm_dicts=norm_dicts)
            # Replace the waveform by its f0 sine excitation (octave-shifted).
            real_audio = base_audio.forward(real_audio, sr, max_value_f0,
                                            numpy_flag=True, octave=octave)
            real_audio_orig = real_audio[None, None, ...].to(device)
            # resample input to the wanted scale
            real_audio = resample_torch(real_audio_orig, sr, srs[0],
                                        max_val=max_value_f0)
            audio_outputs = f0_transfer(real_audio, loudness_list, Gs,
                                        samplers, max_val=max_value,
                                        save_all=False)
            # add f0 sine input
            audio_outputs.append(real_audio_orig)
            save_audios(output_dirpath, audio_outputs,
                        f'{filepath.stem}_{octave}', [srs[-1]])
def main(args):
    """Benchmark f0_transfer GPU latency across input buffer sizes.

    Loads the trained pyramid, prepares one real input (first WAV, first
    octave shift), then times ``num_iters`` forward passes for each buffer
    size and dumps the raw measurements to ``htp_gpu_rtf.csv``.

    Fix over the original: CUDA kernels execute asynchronously, so wrapping
    the call in ``time.time()`` alone measures only kernel-launch overhead.
    ``torch.cuda.synchronize()`` is now called before and after the timed
    region so wall-clock time covers the actual GPU work.

    NOTE(review): this file defines ``main`` three times; only the last
    definition is visible at module level.
    """
    # Hydra changes cwd per run; restore the original working directory.
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    # Load model args
    trained_dirpath = Path(args.trained_dirpath)
    run_args = torch.load(trained_dirpath / 'args.pth')
    # define args from trained model
    sr = run_args.sr
    num_scales = run_args.num_scales
    scale_factor = run_args.scale_factor
    max_value = run_args.max_val
    max_value_f0 = run_args.max_val_f0
    cond_freq = run_args.cond_freq
    # Convert filepaths
    input_dirpath = Path(args.input_dirpath)
    input_files = input_dirpath.glob('*.wav')
    output_dirpath = trained_dirpath.joinpath(args.exp_name)
    try:
        output_dirpath.mkdir()
    except FileExistsError:
        print('Directory already exists')
    # Pytorch device
    device = torch.device("cuda")
    # load input file
    base_audio = BaseAudio(args.crepe_path, device, args.unvoiced_flag)
    srs = create_srs(sr, num_scales, scale_factor)
    samplers = create_samplers(srs, device=device)
    if args.norm_loudness_flag:
        norm_dicts = load_norm_dicts(trained_dirpath / 'loudness.json')
    else:
        norm_dicts = None
    octave_shifts = [2**x for x in args.octaves]
    # Load Trained models
    Gs = load_trained_pyramid(trained_dirpath,
                              network_params=run_args.generator_params,
                              device=device,
                              srs=srs)
    # Benchmark on a single representative input: first file, first octave.
    filepath = next(iter(input_files))
    octave = next(iter(octave_shifts))
    real_audio = load_audio(filepath, sr, max_value)
    # Trim to a whole number of loudness hops so conditioning frames align.
    loudness_hop = 8 * sr // cond_freq
    real_audio = real_audio[:len(real_audio) // loudness_hop * loudness_hop]
    loudness_list = calc_loudness_list(audio=real_audio,
                                       srs=srs,
                                       device=device,
                                       sr_in=sr,
                                       norm_dicts=norm_dicts)
    real_audio = base_audio.forward(real_audio, sr, max_value_f0,
                                    numpy_flag=True, octave=octave)
    real_audio_orig = real_audio[None, None, ...].to(device)
    # resample input to the wanted scale
    real_audio = resample_torch(real_audio_orig, sr, srs[0],
                                max_val=max_value_f0)
    BUFFER_SIZES = [256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
    num_iters = 100
    times = []
    for bs in BUFFER_SIZES:
        # Crop the audio and each per-scale loudness track to the length
        # corresponding to this buffer size.
        this_ra = real_audio[..., :bs // 8]
        this_ll = [l[..., :bs // (64000 // l.shape[-1])]
                   for l in loudness_list]
        with torch.no_grad():
            for i in trange(num_iters):
                # Drain pending GPU work, then time a single forward pass;
                # the trailing synchronize makes the elapsed wall-clock time
                # include the asynchronous CUDA kernels.
                torch.cuda.synchronize()
                start_time = time.time()
                audio_outputs = f0_transfer(this_ra, this_ll, Gs, samplers,
                                            max_val=max_value,
                                            save_all=False)
                torch.cuda.synchronize()
                time_elapsed = time.time() - start_time
                times.append(["htp", "gpu", bs, time_elapsed])
    df = pd.DataFrame(times)
    df.to_csv("htp_gpu_rtf.csv")
def main(args):
    """Measure the real-time factor (RTF) of f0_transfer on one input.

    Loads the trained pyramid, prepares one input (first WAV, first octave
    shift), times ``num_iters`` full forward passes, and prints the mean and
    90th-percentile RTF.

    Fix over the original: CUDA kernels execute asynchronously, so wrapping
    the call in ``time.time()`` alone measures only kernel-launch overhead.
    ``torch.cuda.synchronize()`` is now called before and after the timed
    region so wall-clock time covers the actual GPU work.

    NOTE(review): this file defines ``main`` three times; only the last
    definition is visible at module level.
    """
    # Hydra changes cwd per run; restore the original working directory.
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    # Load model args
    trained_dirpath = Path(args.trained_dirpath)
    run_args = torch.load(trained_dirpath / 'args.pth')
    # define args from trained model
    sr = run_args.sr
    num_scales = run_args.num_scales
    scale_factor = run_args.scale_factor
    max_value = run_args.max_val
    max_value_f0 = run_args.max_val_f0
    cond_freq = run_args.cond_freq
    # Convert filepaths
    input_dirpath = Path(args.input_dirpath)
    input_files = input_dirpath.glob('*.wav')
    output_dirpath = trained_dirpath.joinpath(args.exp_name)
    try:
        output_dirpath.mkdir()
    except FileExistsError:
        print('Directory already exists')
    # Pytorch device
    device = torch.device("cuda")
    # load input file
    base_audio = BaseAudio(args.crepe_path, device, args.unvoiced_flag)
    srs = create_srs(sr, num_scales, scale_factor)
    samplers = create_samplers(srs, device=device)
    if args.norm_loudness_flag:
        norm_dicts = load_norm_dicts(trained_dirpath / 'loudness.json')
    else:
        norm_dicts = None
    octave_shifts = [2**x for x in args.octaves]
    # Load Trained models
    Gs = load_trained_pyramid(trained_dirpath,
                              network_params=run_args.generator_params,
                              device=device,
                              srs=srs)
    # Benchmark on a single representative input: first file, first octave.
    filepath = next(iter(input_files))
    octave = next(iter(octave_shifts))
    real_audio = load_audio(filepath, sr, max_value)
    # Trim to a whole number of loudness hops so conditioning frames align.
    loudness_hop = 8 * sr // cond_freq
    real_audio = real_audio[:len(real_audio) // loudness_hop * loudness_hop]
    loudness_list = calc_loudness_list(audio=real_audio,
                                       srs=srs,
                                       device=device,
                                       sr_in=sr,
                                       norm_dicts=norm_dicts)
    real_audio = base_audio.forward(real_audio, sr, max_value_f0,
                                    numpy_flag=True, octave=octave)
    real_audio_orig = real_audio[None, None, ...].to(device)
    # resample input to the wanted scale
    real_audio = resample_torch(real_audio_orig, sr, srs[0],
                                max_val=max_value_f0)
    num_iters = 100
    times = []
    with torch.no_grad():
        for i in trange(num_iters):
            # Drain pending GPU work, then time a single forward pass; the
            # trailing synchronize makes the elapsed wall-clock time include
            # the asynchronous CUDA kernels.
            torch.cuda.synchronize()
            start_time = time.time()
            audio_outputs = f0_transfer(real_audio, loudness_list, Gs,
                                        samplers, max_val=max_value,
                                        save_all=False)
            torch.cuda.synchronize()
            time_elapsed = time.time() - start_time
            times.append(time_elapsed)
    # NOTE(review): clip duration is hardcoded to 4 s — confirm this matches
    # the (trimmed) input length, otherwise the reported RTF is scaled wrong.
    length_in_seconds = 4
    rtfs = np.array(times) / length_in_seconds
    print("Mean RTF: %.4f" % np.mean(rtfs))
    print("90th percentile RTF: %.4f" % np.percentile(rtfs, 90))