Example #1
0
    def log(self, epoch, flag, audio_gen, f0, audio):
        """Write gradient stats, epoch losses, and audio samples to TensorBoard.

        Args:
            epoch: current epoch index, used as the TensorBoard global step.
            flag: 'train' or 'val' — selects the loss meter and whether
                gradient statistics are logged (train only).
            audio_gen: generated audio batch (tensor).
            f0: f0/sine excitation batch (tensor).
            audio: ground-truth audio batch (tensor).
        """
        # log gradients — only meaningful after a backward pass, i.e. in train
        if flag == 'train':
            self.log_writer.add_scalar(
                f"{self.scale_tb_prefix}_grads/G/grad/tail",
                self.netG.module.last_conv_layers[-1].weight_v.grad.abs().mean(
                ),
                epoch,
            )
            self.log_writer.add_scalar(
                f"{self.scale_tb_prefix}_grads/G/grad/head",
                self.netG.module.first_conv.weight_v.grad.abs().mean(),
                epoch,
            )

            # Discriminator gradients exist only once adversarial training
            # has started.
            if self.hp_adv and epoch > self.disc_start:
                self.log_writer.add_scalar(
                    f"{self.scale_tb_prefix}_grads/D/grad/tail",
                    self.netD.module.conv_layers[-1].weight_v.grad.abs().mean(
                    ), epoch)
                self.log_writer.add_scalar(
                    f"{self.scale_tb_prefix}_grads/D/grad/head",
                    self.netD.module.conv_layers[0].weight_v.grad.abs().mean(),
                    epoch,
                )
        # log losses
        for key in self.loss_meter_keys:
            if flag == 'train':
                self.log_writer.add_scalar(
                    f"{self.scale_tb_prefix}_{flag}/{key}",
                    self.loss_meter_train[key].summarize_epoch(), epoch)
            elif flag == 'val':
                self.log_writer.add_scalar(
                    f"{self.scale_tb_prefix}_{flag}/{key}",
                    self.loss_meter_val[key].summarize_epoch(), epoch)
        # log audios
        if self.log_audio:
            audios = [audio_gen, f0, audio]
            names = ['audio_gen', 'f0', 'audio']
            srs = [self.sr, self.sr_f0, self.sr]

            # FIX: iterate with a distinct name — the original loop reused
            # `audio`, shadowing the parameter and then overwriting it again
            # with the resampled tensor mid-loop.
            for sample, name, sr in zip(audios, names, srs):
                sample_numpy = sample[0].clamp(
                    -1, 1).squeeze().detach().cpu().numpy()
                sf.write(
                    f"{self.scale_output_dirpath}/{name}_sample_{flag}.wav",
                    sample_numpy, sr)
                resampled = sampling.resample_torch(sample[0], sr, 16000).clamp(
                    -1, 1).detach().squeeze()
                self.log_writer.add_audio(
                    f"{self.scale_tb_prefix}_{flag}/{name}",
                    resampled,
                    epoch,
                    sample_rate=16000)
                # nfft scales with sr so frequency resolution matches a
                # 4096-point FFT at 48 kHz.
                nfft = int(sr / (48000 / 4096))
                sample_stft = writers.show_spec(sample_numpy, nfft=nfft, sr=sr)
                self.log_writer.add_figure(
                    f'{self.scale_tb_prefix}_{flag}/stft_{name}', sample_stft,
                    epoch)
Example #2
0
def draw_f0(Gs, samplers, in_s, max_val, loudness_list):
    """Run `in_s` through the generator pyramid, resampling after each scale.

    Each generator is conditioned on its matching loudness track; the result
    is resampled with the paired sampler before feeding the next scale.
    Runs under `no_grad`, so the returned tensor carries no graph.
    """
    output = in_s
    with torch.no_grad():
        for generator, sampler, loudness in zip(Gs, samplers, loudness_list):
            output = generator(output.detach(), loudness.detach())
            output = resample_torch(
                output, None, None, max_val=max_val, sampler=sampler)
    return output
def f0_transfer(real_audio, loudness_list, Gs, samplers, max_val, save_all=False):
    """Transfer `real_audio` through the generator pyramid.

    Each scale's output is cropped to the previous input length and resampled
    for the next scale. Returns a list of detached tensors: every scale's raw
    output when `save_all` is True, otherwise only the final output.

    NOTE(review): the zip loop truncates to the shortest of
    Gs/samplers/loudness_list, and `Gs[-1]` is applied again after the loop —
    if the three lists are equal length the last generator runs twice.
    Presumably samplers (or loudness_list) is one element shorter; confirm
    against the callers.
    """

    with torch.no_grad():

        prev_audios = []
        prev_in = real_audio

        for it, (G, sampler, loudness) in enumerate(zip(Gs, samplers, loudness_list)):

            audio_curr = G(prev_in.detach(), loudness.detach())
            # crop to the previous input's length before resampling upward
            prev_in = audio_curr[:, :, :prev_in.shape[-1]]
            if save_all:
                prev_audios.append(audio_curr.detach())
            prev_in = resample_torch(prev_in, None, None, max_val=max_val, sampler=sampler)

        # final scale: uses the last generator/loudness explicitly
        audio_curr = Gs[-1](prev_in.detach(), loudness_list[-1].detach())
        prev_audios.append(audio_curr.detach())

    return prev_audios
def main(args):
    """Generate octave-shifted f0-transfer audio for every .wav in a folder.

    Loads a trained generator pyramid plus the argument namespace saved next
    to it, then for each input file and octave shift extracts an f0 excitation
    signal, runs it through the pyramid, and saves the resulting audio.

    Args (hydra/argparse namespace): trained_dirpath, input_dirpath, exp_name,
    crepe_path, unvoiced_flag, norm_loudness_flag, octaves.
    """
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    # Load the argument namespace saved alongside the trained model.
    trained_dirpath = Path(args.trained_dirpath)
    run_args = torch.load(trained_dirpath / 'args.pth')

    # define args from trained model
    sr = run_args.sr
    num_scales = run_args.num_scales
    scale_factor = run_args.scale_factor
    max_value = run_args.max_val
    max_value_f0 = run_args.max_val_f0
    cond_freq = run_args.cond_freq

    # Convert filepaths
    input_dirpath = Path(args.input_dirpath)
    input_files = input_dirpath.glob('*.wav')

    output_dirpath = trained_dirpath.joinpath(args.exp_name)
    try:
        output_dirpath.mkdir()
    except FileExistsError:
        print('Directory already exists')

    # FIX: fall back to CPU so inference still runs on CUDA-less machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # f0 extractor and per-scale resamplers
    base_audio = BaseAudio(args.crepe_path, device, args.unvoiced_flag)
    srs = create_srs(sr, num_scales, scale_factor)
    samplers = create_samplers(srs, device=device)

    if args.norm_loudness_flag:
        norm_dicts = load_norm_dicts(trained_dirpath / 'loudness.json')
    else:
        norm_dicts = None

    octave_shifts = [2**x for x in args.octaves]

    # Load Trained models
    Gs = load_trained_pyramid(trained_dirpath, network_params=run_args.generator_params, device=device, srs=srs)

    for filepath in tqdm(input_files, desc='Generating audio file'):
        for octave in tqdm(octave_shifts, desc='Octave'):
            real_audio = load_audio(filepath, sr, max_value)
            # Trim so the length is a whole number of loudness hops.
            loudness_hop = 8 * sr // cond_freq
            real_audio = real_audio[:len(real_audio) // loudness_hop * loudness_hop]
            loudness_list = calc_loudness_list(audio=real_audio, srs=srs, device=device,
                                               sr_in=sr, norm_dicts=norm_dicts)
            real_audio = base_audio.forward(real_audio, sr, max_value_f0, numpy_flag=True, octave=octave)

            real_audio_orig = real_audio[None, None, ...].to(device)
            # resample input to the wanted scale
            real_audio = resample_torch(real_audio_orig, sr, srs[0], max_val=max_value_f0)

            audio_outputs = f0_transfer(real_audio, loudness_list, Gs, samplers,
                                        max_val=max_value, save_all=False)

            # add f0 sine input
            audio_outputs.append(real_audio_orig)
            save_audios(output_dirpath, audio_outputs,
                        f'{filepath.stem}_{octave}', [srs[-1]])
Example #5
0
def main(args):
    """Benchmark f0_transfer latency across buffer sizes and dump a CSV.

    Loads a trained generator pyramid, prepares one input file at one octave
    shift, then times `num_iters` forward passes per buffer size and writes
    the raw timings to 'htp_gpu_rtf.csv'.
    """
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    # Load the argument namespace saved alongside the trained model.
    trained_dirpath = Path(args.trained_dirpath)
    run_args = torch.load(trained_dirpath / 'args.pth')

    # define args from trained model
    sr = run_args.sr
    num_scales = run_args.num_scales
    scale_factor = run_args.scale_factor
    max_value = run_args.max_val
    max_value_f0 = run_args.max_val_f0
    cond_freq = run_args.cond_freq

    # Convert filepaths
    input_dirpath = Path(args.input_dirpath)
    input_files = input_dirpath.glob('*.wav')

    output_dirpath = trained_dirpath.joinpath(args.exp_name)
    try:
        output_dirpath.mkdir()
    except FileExistsError:
        print('Directory already exists')

    # FIX: fall back to CPU so the benchmark still runs on CUDA-less machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # f0 extractor and per-scale resamplers
    base_audio = BaseAudio(args.crepe_path, device, args.unvoiced_flag)
    srs = create_srs(sr, num_scales, scale_factor)
    samplers = create_samplers(srs, device=device)

    if args.norm_loudness_flag:
        norm_dicts = load_norm_dicts(trained_dirpath / 'loudness.json')
    else:
        norm_dicts = None

    octave_shifts = [2**x for x in args.octaves]

    # Load Trained models
    Gs = load_trained_pyramid(trained_dirpath, network_params=run_args.generator_params, device=device, srs=srs)

    # Benchmark on the first file / first octave only.
    filepath = next(iter(input_files))
    octave = next(iter(octave_shifts))

    real_audio = load_audio(filepath, sr, max_value)
    # Trim so the length is a whole number of loudness hops.
    loudness_hop = 8 * sr // cond_freq
    real_audio = real_audio[:len(real_audio) // loudness_hop * loudness_hop]
    loudness_list = calc_loudness_list(audio=real_audio, srs=srs, device=device,
                                       sr_in=sr, norm_dicts=norm_dicts)
    real_audio = base_audio.forward(real_audio, sr, max_value_f0, numpy_flag=True, octave=octave)

    real_audio_orig = real_audio[None, None, ...].to(device)
    # resample input to the wanted scale
    real_audio = resample_torch(real_audio_orig, sr, srs[0], max_val=max_value_f0)

    BUFFER_SIZES = [256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
    num_iters = 100
    device_label = "gpu" if device.type == "cuda" else "cpu"
    times = []
    for bs in BUFFER_SIZES:
        # Crop the input (and each loudness track) to the current buffer size.
        this_ra = real_audio[..., :bs // 8]
        this_ll = [loud[..., :bs // (64000 // loud.shape[-1])]
                   for loud in loudness_list]
        with torch.no_grad():
            for _ in trange(num_iters):
                # FIX: synchronize around the timed region — CUDA kernels
                # launch asynchronously, so without this the wall clock only
                # measures launch overhead, not the actual compute.
                if device.type == "cuda":
                    torch.cuda.synchronize()
                start_time = time.time()
                f0_transfer(this_ra, this_ll, Gs, samplers,
                            max_val=max_value, save_all=False)
                if device.type == "cuda":
                    torch.cuda.synchronize()
                time_elapsed = time.time() - start_time
                times.append(
                    ["htp", device_label, bs, time_elapsed]
                )

    # Name the columns so the CSV is self-describing.
    df = pd.DataFrame(times, columns=["model", "device", "buffer_size", "seconds"])
    df.to_csv("htp_gpu_rtf.csv")
Example #6
0
def main(args):
    """Benchmark f0_transfer and print mean / 90th-percentile real-time factor.

    Loads a trained generator pyramid, prepares one input file at one octave
    shift, then times `num_iters` full forward passes and reports the RTF
    (elapsed time divided by the clip length in seconds).
    """
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    # Load the argument namespace saved alongside the trained model.
    trained_dirpath = Path(args.trained_dirpath)
    run_args = torch.load(trained_dirpath / 'args.pth')

    # define args from trained model
    sr = run_args.sr
    num_scales = run_args.num_scales
    scale_factor = run_args.scale_factor
    max_value = run_args.max_val
    max_value_f0 = run_args.max_val_f0
    cond_freq = run_args.cond_freq

    # Convert filepaths
    input_dirpath = Path(args.input_dirpath)
    input_files = input_dirpath.glob('*.wav')

    output_dirpath = trained_dirpath.joinpath(args.exp_name)
    try:
        output_dirpath.mkdir()
    except FileExistsError:
        print('Directory already exists')

    # FIX: fall back to CPU so the benchmark still runs on CUDA-less machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # f0 extractor and per-scale resamplers
    base_audio = BaseAudio(args.crepe_path, device, args.unvoiced_flag)
    srs = create_srs(sr, num_scales, scale_factor)
    samplers = create_samplers(srs, device=device)

    if args.norm_loudness_flag:
        norm_dicts = load_norm_dicts(trained_dirpath / 'loudness.json')
    else:
        norm_dicts = None

    octave_shifts = [2**x for x in args.octaves]

    # Load Trained models
    Gs = load_trained_pyramid(trained_dirpath, network_params=run_args.generator_params, device=device, srs=srs)

    # Benchmark on the first file / first octave only.
    filepath = next(iter(input_files))
    octave = next(iter(octave_shifts))

    real_audio = load_audio(filepath, sr, max_value)
    # Trim so the length is a whole number of loudness hops.
    loudness_hop = 8 * sr // cond_freq
    real_audio = real_audio[:len(real_audio) // loudness_hop * loudness_hop]
    loudness_list = calc_loudness_list(audio=real_audio, srs=srs, device=device,
                                       sr_in=sr, norm_dicts=norm_dicts)
    real_audio = base_audio.forward(real_audio, sr, max_value_f0, numpy_flag=True, octave=octave)

    real_audio_orig = real_audio[None, None, ...].to(device)
    # resample input to the wanted scale
    real_audio = resample_torch(real_audio_orig, sr, srs[0], max_val=max_value_f0)

    num_iters = 100
    times = []
    with torch.no_grad():
        for _ in trange(num_iters):
            # FIX: synchronize around the timed region — CUDA kernels launch
            # asynchronously, so without this the wall clock only measures
            # launch overhead, not the actual compute.
            if device.type == "cuda":
                torch.cuda.synchronize()
            start_time = time.time()
            f0_transfer(real_audio, loudness_list, Gs, samplers,
                        max_val=max_value, save_all=False)
            if device.type == "cuda":
                torch.cuda.synchronize()
            times.append(time.time() - start_time)

    # NOTE(review): assumes the benchmark clip is exactly 4 s long — derive
    # the duration from the loaded audio if input files vary.
    length_in_seconds = 4
    rtfs = np.array(times) / length_in_seconds
    print("Mean RTF: %.4f" % np.mean(rtfs))
    print("90th percentile RTF: %.4f" % np.percentile(rtfs, 90))