Example 1
def infer_wavenet(args):
    import sys
    sys.path.append('thirdparty/wavenet_vocoder')

    from train import build_model
    from synthesis import wavegen
    from tqdm import tqdm
    target_sample_rate = 22050

    hparams, model = load_model(args.model_name)
    meller = MelSpectrogram()
    files = [
        item for item in os.listdir(args.folder_in) if item.endswith('.wav')
    ]
    for idx, audio in enumerate(files):
        wav_path = os.path.join(args.folder_in, audio)
        wav = load_wav(wav_path, target_sample_rate)
        c = meller(wav)[0]
        if c.shape[1] != hparams.num_mels:
            c = c.transpose(0, 1)
        # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
        # c = np.interp(c, (0, 4), (0, 1))

        # Generate
        waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
        path = os.path.join(args.folder_out, audio)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        # torchaudio.save expects a (channels, time) float tensor
        torchaudio.save(path, torch.as_tensor(waveform, dtype=torch.float32).unsqueeze(0),
                        hparams.sample_rate)
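If the script is run standalone, a minimal driver sketch could look like the following; the flag names simply mirror the args attributes infer_wavenet reads and are otherwise an assumption:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='WaveNet vocoder inference')
    parser.add_argument('--model_name', required=True)
    parser.add_argument('--folder_in', required=True)
    parser.add_argument('--folder_out', required=True)
    infer_wavenet(parser.parse_args())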
Example 2
def genspec(pkl_path, write_name, save_dir="./result_wav/"):
    spect_vc = pickle.load(open(pkl_path, "rb"))
    # `model` is expected to be a module-level WaveNet vocoder (see the sketch below)
    for i, spect in enumerate(spect_vc, start=1):
        c = spect[1]
        waveform = wavegen(model, c=c)
        librosa.output.write_wav(save_dir + write_name + '_' + str(i) + '.wav',
                                 waveform,
                                 sr=16000)
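genspec relies on a module-level WaveNet vocoder named model; a minimal sketch of how it might be prepared, following the build_model / load_state_dict pattern the other examples use (the checkpoint path is a placeholder):

import torch
from synthesis import build_model  # same vocoder helpers the other snippets import

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = build_model().to(device)
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])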
def generateAudioGroup(original_audio, ref_audios,
                       autovc_checkpoint='checkpoints_fully/autovc_700000.pt',
                       vocoder_checkpoint="../checkpoint_step001000000_ema.pth"):

    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0])/base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

    device = 'cuda:0'
    G = Generator(32, 256, 512, 32).eval().to(device)

    # the checkpoint is expected to hold a full pickled Generator, which replaces
    # the freshly built one above; a plain state dict would instead need
    # G.load_state_dict(g_checkpoint['model'])
    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))

    G = g_checkpoint.eval()

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    emb_refs = []

    ref_files = os.listdir(ref_audios)
    for i, file in enumerate(ref_files, start=1):
        print("{}/{}".format(i, len(ref_files)))

        emb_ref = get_verification_pytorch_1000(os.path.join(ref_audios, file), 1)
        if emb_ref is not None:
            emb_refs.append(emb_ref)

    # average the reference embeddings into a single target-speaker embedding
    emb_refs = np.mean(emb_refs, axis=0)
    
    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    emb_refs = torch.FloatTensor(emb_refs).unsqueeze(0).cuda()
    
    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_refs)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()


    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)   
    return waveform
def generateAudio(original_audio, ref_audio, autovc_checkpoint, vocoder_checkpoint,
                  english=False):

    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0])/base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

    device = 'cuda:0'
    G = Generator(32,256,512,32).eval().to(device)

    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    
    G = g_checkpoint.eval()

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    
    if not english:
        emb_ref = get_verification_pytorch_1000(ref_audio)
    else:
        emb_ref = get_verification_eng(ref_audio)
        
    if emb_org is None or emb_ref is None: return None
   
    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    if not english:
        emb_ref = torch.FloatTensor(emb_ref).unsqueeze(0).cuda()
    else:
        emb_ref = emb_ref.type(torch.cuda.FloatTensor)
    
    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_ref)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()


    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)   
    return waveform
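A minimal usage sketch for generateAudio; the checkpoint paths are the defaults generateAudioGroup declares, the file names are placeholders, and soundfile handles the final write:

import soundfile as sf

waveform = generateAudio("source.wav", "reference.wav",
                         autovc_checkpoint='checkpoints_fully/autovc_700000.pt',
                         vocoder_checkpoint="../checkpoint_step001000000_ema.pth")
if waveform is not None:
    sf.write("converted.wav", waveform, 16000)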
Example 5
    def __decode__(self):

        spect_vc = pickle.load(open('results.pkl', 'rb'))
        #device = torch.device("cuda")
        model = build_model()#.to(device)
        checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint["state_dict"])

        for spect in spect_vc:
            name = spect[0]
            c = spect[1]
            print(name)
            waveform = wavegen(model, c=c)

            # each pass overwrites the same file; the last spectrogram's path is returned
            save_path = os.path.join("audio", "download", "audio.wav")
            librosa.output.write_wav(save_path, waveform, sr=16000)

        return save_path
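librosa.output.write_wav was deprecated in librosa 0.7 and removed in 0.8; on newer installs the same write can be done with soundfile, a minimal sketch using the variables from __decode__:

import soundfile as sf
sf.write(save_path, waveform, samplerate=16000)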
Example 6
    wav = load_wav(src_wav_path)
    emb = np.load(src_emb_path)
    emb_tgt = np.load(tgt_emb_path)

    mel = melspectrogram(wav)

    pad_len = math.ceil(mel.shape[1] / 32) * 32 - mel.shape[1]
    mel = np.pad(mel, ((0,0), (0, pad_len)), mode='constant')

    mel = torch.FloatTensor(mel)
    emb = torch.FloatTensor(emb)
    emb_tgt = torch.FloatTensor(emb_tgt)

    model = Generator(dim_neck, dim_emb, dim_pre, freq)

    checkpoint = torch.load(autovc_checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model'])
    model.eval()

    x = mel.unsqueeze(0).transpose(2,1) 
    e = emb.unsqueeze(0)
    et = emb_tgt.unsqueeze(0)

    mel_outputs, mel_outputs_postnet, codes = model(x, e, et)
    mel_rec = mel_outputs_postnet.transpose(2,1).cpu().detach().numpy()[0]

    if pad_len > 0:
        mel_rec = mel_rec[:, :-pad_len]

    c = np.transpose(mel_rec, (1, 0))
    waveform = wavegen(wavnet, device, c=c)
    librosa.output.write_wav(output_path, waveform, sr=16000)
Example 7
            if g is not None:
                print("Global conditioned by speaker id {}".format(g))

        # Paths
        dst_wav_path = join(
            dst_dir, "{}_{}{}_predicted.wav".format(idx, checkpoint_name,
                                                    file_name_suffix))
        target_wav_path = join(
            dst_dir, "{}_{}{}_target.wav".format(idx, checkpoint_name,
                                                 file_name_suffix))

        # Generate
        waveform = wavegen(model,
                           length,
                           c=c,
                           g=g,
                           initial_value=initial_value,
                           fast=True,
                           tqdm=_tqdm)

        # save
        librosa.output.write_wav(dst_wav_path,
                                 waveform,
                                 sr=hparams.sample_rate)
        librosa.output.write_wav(target_wav_path,
                                 P.inv_mulaw_quantize(x),
                                 sr=hparams.sample_rate)

        # log
        if output_html:
            print("""

Example 8
device = 'cuda:0'

g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
G = g_checkpoint

x_org = mel_org
x_org, len_pad = pad_seq(x_org)
uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

with torch.no_grad():
    _, x_identic_psnt, _ = G(uttr_org, emb_ref)

if len_pad == 0:
    uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
else:
    uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

device = torch.device("cuda")
model = build_model().to(device)
checkpoint = torch.load(
    "../drive/MyDrive/MultiSpeaker_Tacotron2/checkpoint_step001000000_ema.pth",
    map_location=torch.device('cuda'))
model.load_state_dict(checkpoint["state_dict"])

waveform = wavegen(model, c=uttr_trg)
sf.write('{}-{}.wav'.format(original_name, ref_name),
         waveform,
         16000,
         subtype='PCM_24')
Example 9
                                              condition), uttr_trg))

    # %%

# spectrogram to waveform
import torch
import librosa
import pickle
import os
from synthesis import build_model
from synthesis import wavegen

if not os.path.exists('results_L1'):
    os.makedirs('results_L1')

model = build_model().to(device)
checkpoint = torch.load(
    "/datapool/home/zxt20/JieWang2020ICASSP/speechflow_plus-grl11/pre-trained-model/wave_netcheckpoint_step001000000_ema.pth"
)
# pre-trained WaveNet vocoder
model.load_state_dict(checkpoint["state_dict"])

for spect in spect_vc:
    name = spect[0]
    c = spect[1]
    print(name)
    waveform = wavegen(model, c=c)
    librosa.output.write_wav('results_L1/' + name + '.wav', waveform, sr=16000)

# %%
Example 10
if in_path[-1] == str(os.sep):
    in_path = in_path[:-1]

model = build_model().to(device)
model.load_state_dict(checkpoint["state_dict"])

wav_paths = [in_path + os.sep + "{}".format(fi) for fi in os.listdir(in_path) if ".wav" in fi]
out_dir = in_path + "_mel"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

for wp in wav_paths:
    print("Saving mels for {}".format(wp))
    _process_utterance(wp, out_dir)

mel_dir = out_dir
wav_out_dir = mel_dir + "_wavenet_render"
if not os.path.exists(wav_out_dir):
    os.mkdir(wav_out_dir)
sample_rate = 22050
mel_paths = [mel_dir + os.sep + "{}".format(fi) for fi in os.listdir(mel_dir) if "mel" in fi]
for mel_path in mel_paths:
    c = np.load(mel_path)
    if c.shape[1] != hparams.num_mels:
        c = np.swapaxes(c, 0, 1)
    waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
    fname = mel_path.split(os.sep)[-1].split(".")[0]
    fpath = wav_out_dir + str(os.sep) + '{}.wav'.format(fname)
    wavfile.write(fpath, sample_rate, waveform)
    print("Saved HD audio {}".format(fpath))
Example 11
        plt.subplot(1, len(all_spmels), j + 1)
        if j == 0: plt.title('original_' + name)
        elif j == 1: plt.title('resynthOrg_' + name)
        else:
            try:
                plt.title(name + '_to_' +
                          str(style_names[j - num_unconv_styles]))
            except:
                pdb.set_trace()
        plt.imshow(np.rot90(all_spmels[j]))
    plt.savefig(subdir_for_wavs + '/example' + str(counter) + '_spmels')

    # synthesize a waveform for each mel-spectrogram
    for k, spmel in enumerate(all_spmels):
        # x_identic_psnt = tensor.squeeze(0).squeeze(0).detach().cpu().numpy()
        waveform = wavegen(model, config.which_cuda, c=spmel)
        #     librosa.output.write_wav(name+'.wav', waveform, sr=16000)
        #        if k == 0:
        #            sf.write(subdir_for_wavs +f'/example{counter}_{name}_ORG.wav', waveform, samplerate=16000)
        if k == 0:
            sf.write(subdir_for_wavs +
                     f'/example{counter}_{name}_synthed_from_org.wav',
                     waveform,
                     samplerate=16000)
        else:
            sf.write(subdir_for_wavs +
                     f'/example{counter}_{name}_to_{style_names[k-1]}.wav',
                     waveform,
                     samplerate=16000)
    counter += 2
Example 12
# if not os.path.exists('results_p'):
#     os.makedirs('results_p')
if not os.path.exists('results_nop_1_rhym'):
    os.makedirs('results_nop_1_rhym')
model = build_model().to(device)
# checkpoint = torch.load("/home/jie-wang19/Speechsplitexp/speech_split_baseline/Base_origin/pre-trained-model/checkpoint_step001000000_ema.pth")
checkpoint = torch.load(
    "/datapool/home/zxt20/JieWang2020ICASSP/SpeechFlow-master_ordin/pre-trained-model/wave_netcheckpoint_step001000000_ema.pth"
)
# pre-trained WaveNet vocoder
model.load_state_dict(checkpoint["state_dict"])
# i = 0
# for spect in spect_vc:
#     # i += 1
#     name = spect[0]
#     c = spect[1]
#     #waveform = audio.inv_mel_spectrogram(c.T, hparams)
#     print(name)
#     waveform = wavegen(model, c=c)
#     librosa.output.write_wav('results_p/' + name + '.wav', waveform, sr=16000)

for sp in spect_vc_NOP:
    nn = sp[0]
    c = sp[1]
    print(nn)
    waveform_NOP = wavegen(model, c=c)
    librosa.output.write_wav('results_nop_1_rhym/' + nn + '.wav',
                             waveform_NOP,
                             sr=16000)
# %%
Example 13
def vocode_spec(spec, model, out_name):
    c = spec
    waveform = wavegen(model, c=c)
    librosa.output.write_wav(out_name, waveform, sr=16000)
    def step(self, spect):
        waveform = wavegen(self.model, c=spect)
        return waveform
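The step method above presumes a wrapper object whose self.model holds the pre-trained WaveNet vocoder; a hypothetical constructor for such a class (the class name and checkpoint path are placeholders):

class Vocoder:
    def __init__(self, checkpoint_path="checkpoint_step001000000_ema.pth"):
        # load the vocoder once and reuse it for every step() call
        self.model = build_model()
        ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(ckpt["state_dict"])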
Example 15
f0_onehot = torch.from_numpy(f0_onehot).to(device)

# concat pitch contour to freq axis (cols)
S = S[np.newaxis, :192, :]
S, _ = pad_seq_to_2(S, 192)
uttr = torch.from_numpy(S.astype(np.float32)).to(device)

#f0_onehot = tr.zeros_like(f0_onehot)
uttr_f0 = torch.cat((uttr, f0_onehot), dim=-1)

# Generate back from components
emb = torch.zeros(1, 82).to(device)
print(uttr_f0.shape, uttr.shape, emb.shape)

# uttr_f0 = tr.zeros_like(uttr_f0)
with torch.no_grad():
    out = G(uttr_f0, uttr, emb)

# Synthesize wav back
model = build_model().to(device)
checkpoint = torch.load("assets/checkpoint_step001000000_ema.pth",
                        map_location=device)
model.load_state_dict(checkpoint["state_dict"])

print(out.shape)
waveform = wavegen(model, c=out.squeeze().cpu().numpy())
# librosa.output.write_wav('results/'+name+'.wav', waveform, sr=16000)
sf.write('results/back_synthesized-zeros-pitch.wav',
         waveform,
         16000,
         subtype='PCM_24')