Exemple #1
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16):
    """Synthesize audio from saved mel tensors with WaveGlow and report timings.

    Every path returned by ``files_to_list(mel_files)`` is loaded with
    ``torch.load``, moved to the GPU, run through ``waveglow.infer``, scaled by
    ``MAX_WAV_VALUE``, cast to int16 and handed to ``write`` as
    ``<output_dir>/<name>_synthesis.wav``.  One timing line is printed per file.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to ``waveglow.infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write`` and used to convert the
            number of generated samples into seconds.
        is_fp16: When true, the model and the mel input are cast to half
            precision (the ``convinv`` layers are cast back to float).
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        waveglow.half()
        for k in waveglow.convinv:
            k.float()

    for file_path in mel_files:
        stime = time.time()
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        # CUDA kernels run asynchronously: synchronize on both sides of the
        # timed region so inf_time covers the actual GPU work rather than just
        # the kernel launches.
        torch.cuda.synchronize()
        stime2 = time.time()
        with torch.no_grad():
            audio = MAX_WAV_VALUE * waveglow.infer(mel, sigma=sigma)[0]
        torch.cuda.synchronize()
        inf_time = time.time() - stime2
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        # Use the caller's sampling rate instead of a hard-coded 22050 so the
        # reported duration is right for any rate.
        len_audio = len(audio) / float(sampling_rate)
        print(
            "{}: (audio length {:.2f} sec), (total computing time {:.2f} sec), (inference time: {:.2f} sec) "
            .format(audio_path, len_audio,
                    time.time() - stime, inf_time))
Exemple #2
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16):
    """Run WaveGlow inference over a list of mel files and save the results.

    For each path from ``files_to_list(mel_files)`` the tensor is loaded,
    moved to the GPU, given a leading dimension, passed to ``infer`` and the
    first output is scaled by ``MAX_WAV_VALUE``, cast to int16 and handed to
    ``write`` as ``<output_dir>/<name>_synthesis.wav``.  The output path is
    printed after each file.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to the model's ``infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, model and input are cast to half precision.
    """
    mel_paths = files_to_list(mel_files)

    vocoder = torch.load(waveglow_path)['model']
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.cuda().eval()
    if is_fp16:
        vocoder.half()
        # The convinv layers are cast back to float after the half() cast.
        for layer in vocoder.convinv:
            layer.float()

    for mel_path in mel_paths:
        stem, _ = os.path.splitext(os.path.basename(mel_path))

        spec = torch.autograd.Variable(torch.load(mel_path).cuda())
        spec = torch.unsqueeze(spec, 0)
        if is_fp16:
            spec = spec.half()

        with torch.no_grad():
            waveform = MAX_WAV_VALUE * vocoder.infer(spec, sigma=sigma)[0]

        samples = waveform.cpu().numpy().astype('int16')
        out_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(out_path, sampling_rate, samples)
        print(out_path)
Exemple #3
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Generate audio from mel files with WaveGlow, optionally denoised.

    Each mel tensor is loaded, moved to the GPU, given a leading dimension and
    passed to ``infer``.  When ``denoiser_strength`` is positive the result is
    passed through a ``Denoiser`` built from the model.  The audio is scaled
    by ``MAX_WAV_VALUE``, squeezed, cast to int16 and handed to ``write`` as
    ``<output_dir>/<name>_synthesis.wav``.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to the model's ``infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0; the
            value is forwarded to the denoiser.
    """
    mel_paths = files_to_list(mel_files)

    vocoder = torch.load(waveglow_path)['model']
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.cuda().eval()
    if is_fp16:
        from apex import amp
        vocoder, _ = amp.initialize(vocoder, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(vocoder).cuda()

    for mel_path in mel_paths:
        stem, _ = os.path.splitext(os.path.basename(mel_path))

        spec = torch.autograd.Variable(torch.load(mel_path).cuda())
        spec = torch.unsqueeze(spec, 0)
        if is_fp16:
            spec = spec.half()

        with torch.no_grad():
            waveform = vocoder.infer(spec, sigma=sigma)
            if denoiser_strength > 0:
                waveform = denoiser(waveform, denoiser_strength)
            waveform = waveform * MAX_WAV_VALUE

        samples = waveform.squeeze().cpu().numpy().astype('int16')
        out_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(out_path, sampling_rate, samples)
        print(out_path)
Exemple #4
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Generate audio from mel files with WaveGlow, logging value ranges.

    Before the model is used, every module whose type name contains ``'Conv'``
    gets ``padding_mode`` set to ``'zeros'``.  For each mel file the min/max
    of the input are printed, inference (and optional denoising) is run, the
    peak absolute output value is appended to ``k``, and the scaled int16
    audio is handed to ``write`` as
    ``<output_dir>/<name>_synthesis_sig0.7_d_0.1.wav``.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to the model's ``infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0.
    """
    mel_paths = files_to_list(mel_files)

    vocoder = torch.load(waveglow_path)['model']
    # presumably a compatibility patch for checkpoints whose conv modules lack
    # this attribute -- TODO confirm
    for module in vocoder.modules():
        if 'Conv' in str(type(module)):
            setattr(module, 'padding_mode', 'zeros')
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.cuda().eval()
    if is_fp16:
        from apex import amp
        vocoder, _ = amp.initialize(vocoder, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(vocoder).cuda()

    for mel_path in mel_paths:
        stem, _ = os.path.splitext(os.path.basename(mel_path))

        spec = torch.autograd.Variable(torch.load(mel_path).cuda())
        spec = torch.unsqueeze(spec, 0)
        if is_fp16:
            spec = spec.half()

        print(torch.min(spec),torch.max(spec))
        with torch.no_grad():
            waveform = vocoder.infer(spec, sigma=sigma)
            if denoiser_strength > 0:
                waveform = denoiser(waveform, denoiser_strength)

            # NOTE(review): `k` is not defined in this function; it must be a
            # list-like object at module level, otherwise this line raises
            # NameError -- confirm against the rest of the file.
            k.append(abs(waveform).max().item())
            waveform = waveform * MAX_WAV_VALUE

        samples = waveform.squeeze().cpu().numpy().astype('int16')
        out_path = os.path.join(
            output_dir, "{}_synthesis_sig0.7_d_0.1.wav".format(stem))
        write(out_path, sampling_rate, samples)
        print(out_path)
Exemple #5
0
 def __init__(self, training_files, num_frame, filter_length, hop_length,
              win_length, sampling_rate, mel_fmin, mel_fmax):
     """Collect the audio file list and build the STFT front end.

     The list obtained from ``ms.files_to_list(training_files)`` is shuffled
     in place after seeding ``random`` with a fixed value, so the order is
     the same on every run.  ``segment`` is set to
     ``num_frame * hop_length + win_length``.

     Args:
         training_files: Argument forwarded to ``ms.files_to_list``.
         num_frame: Multiplier used when computing ``segment``.
         filter_length: Forwarded to ``ms.TacotronSTFT``.
         hop_length: Stored and forwarded to ``ms.TacotronSTFT``.
         win_length: Stored and forwarded to ``ms.TacotronSTFT``.
         sampling_rate: Stored and forwarded to ``ms.TacotronSTFT``.
         mel_fmin: Forwarded to ``ms.TacotronSTFT``.
         mel_fmax: Forwarded to ``ms.TacotronSTFT``.
     """
     audio_files = ms.files_to_list(training_files)
     # Fixed seed: the shuffle is deterministic across runs.
     random.seed(4321)
     random.shuffle(audio_files)

     self.audio_files = audio_files
     self.all_length = torch.tensor(0, dtype=torch.long)
     self.hop_length = hop_length
     self.win_length = win_length
     self.sampling_rate = sampling_rate

     stft_kwargs = dict(filter_length=filter_length,
                        hop_length=hop_length,
                        win_length=win_length,
                        sampling_rate=sampling_rate,
                        mel_fmin=mel_fmin,
                        mel_fmax=mel_fmax)
     self.stft = ms.TacotronSTFT(**stft_kwargs)
     self.segment = num_frame * hop_length + win_length
Exemple #6
0
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Benchmark SqueezeWave inference over a list of mel files.

    The model preparation time is printed once.  For every mel file the
    inference (plus optional denoising and scaling by ``MAX_WAV_VALUE``) is
    timed, the int16 audio is handed to ``write`` as
    ``<output_dir>/<name>_s<sigma>.wav`` and a line with the duration,
    real-time factor and throughput is printed.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        squeezewave_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to ``squeezewave.infer`` and embedded in the output
            file name.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write`` and used to convert
            sample counts to seconds.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0.
    """
    tic_prepare = time.time()
    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path)['model']
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.cuda().eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).cuda()

    dur_prepare = time.time() - tic_prepare
    # `{:3.2}` (no type code) switches to exponent notation for larger values;
    # `{:3.2f}` always prints fixed-point seconds.
    print("prepare model {:3.2f}sec".format(dur_prepare))

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        # CUDA kernels run asynchronously: synchronize on both sides of the
        # timed region so `dur` covers the actual GPU work rather than just
        # the kernel launches.
        torch.cuda.synchronize()
        tic = time.time()
        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        torch.cuda.synchronize()
        dur = time.time() - tic

        audio = audio.squeeze()
        audio = audio.cpu().numpy()

        len_wav = len(audio)
        sec_wav = len_wav / sampling_rate
        samples_sec = len_wav / dur
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_s{}.wav".format(file_name, sigma))
        write(audio_path, sampling_rate, audio)
        print("{} it took {:4.3f}sec  for  {:4.3f}sec {:4.2f}K sample 22Khz Audio files :   RTF {:4.3f} {:4.3f}X  {:4.2f}Ksamples/sec  "
              .format(audio_path, dur, sec_wav, len_wav / 1000,
                      dur / sec_wav, sec_wav / dur, samples_sec / 1000))
Exemple #7
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Generate audio from mel files with WaveGlow, optionally denoised.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to the model's ``infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0.
    """
    # List of mel files to synthesize.
    mel_paths = files_to_list(mel_files)

    # Load the model, strip weight normalisation, move it to the GPU and
    # switch it to evaluation mode.
    vocoder = torch.load(waveglow_path)['model']
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.cuda().eval()

    # Optional apex mixed-precision initialisation.
    if is_fp16:
        from apex import amp
        vocoder, _ = amp.initialize(vocoder, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(vocoder).cuda()

    for mel_path in mel_paths:
        # Base name (without extension) used for the output file.
        stem, _ = os.path.splitext(os.path.basename(mel_path))

        # Load the saved mel tensor and move it to the GPU.
        spec = torch.autograd.Variable(torch.load(mel_path).cuda())
        # Add a leading dimension of size 1.
        spec = torch.unsqueeze(spec, 0)
        # Match the half-precision model when fp16 is requested.
        if is_fp16:
            spec = spec.half()

        # Inference only: gradient tracking is disabled.
        with torch.no_grad():
            waveform = vocoder.infer(spec, sigma=sigma)
            if denoiser_strength > 0:
                waveform = denoiser(waveform, denoiser_strength)
            # Scale before the int16 cast below.
            waveform = waveform * MAX_WAV_VALUE

        # Drop size-1 dimensions, move to the CPU and convert to int16 numpy.
        samples = waveform.squeeze().cpu().numpy().astype('int16')

        # Write the audio next to the requested output directory.
        out_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(out_path, sampling_rate, samples)
        print(out_path)
Exemple #8
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Generate audio with WaveGlow from ``.pt`` or ``.npy`` mel files.

    ``.pt`` files are read with ``torch.load``; ``.npy`` files are read with
    ``np.load`` and converted with ``torch.from_numpy``.  The audio is scaled
    by ``MAX_WAV_VALUE``, cast to int16 and handed to ``write`` as
    ``<output_dir>/<name>_synthesis.wav``.  ``output_dir`` is created if it
    does not exist.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to ``waveglow.infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0.

    Raises:
        ValueError: If a listed file has an extension other than ``.pt`` or
            ``.npy``.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    # Create the output directory once (including missing parents) instead of
    # re-checking it on every iteration.
    os.makedirs(output_dir, exist_ok=True)

    for file_path in mel_files:
        file_name, extension = os.path.splitext(os.path.basename(file_path))
        print('Loading file: ', file_path)
        # Dispatch on the real extension: a substring search over the whole
        # path also matches directory names such as "exp.pt_runs/".
        if extension == '.pt':
            print('load by torch')
            mel = torch.load(file_path)
        elif extension == '.npy':
            print('load by numpy')
            mel = np.load(file_path)
            mel = torch.from_numpy(mel)
        else:
            # Without this branch the previous iteration's `mel` would be
            # silently reused (or NameError raised on the first file).
            raise ValueError(
                "Unsupported mel file extension {!r}: {}".format(
                    extension, file_path))
        print(f"original mel shape: {mel.shape}")
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        print(f"mel shape right before using waveglow: {mel.shape}")
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
Exemple #9
0
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Generate audio with WaveGlow from raw binary mel files.

    Each input file is read as: two int32 values giving the array shape,
    followed (from byte offset 8) by float32 data of that shape.  The array
    is transposed before being passed to the model.  The audio is scaled by
    ``MAX_WAV_VALUE``, cast to int16 and handed to ``write`` as
    ``<output_dir>/<name>.wav``.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to ``waveglow.infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        # The former `if True: ... else: torch.load(...)` had an unreachable
        # else branch; only the raw-binary path was ever executed.
        # Header: two int32 values holding the stored array's shape.
        shape = tuple(np.fromfile(file_path, count=2, dtype=np.int32))
        # Payload: float32 data starting right after the 8-byte header.
        mel = np.memmap(file_path, offset=8, dtype=np.float32, shape=shape)
        mel = mel.transpose()
        mel = torch.from_numpy(mel)

        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir, "{}.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate,
         is_fp16, denoiser_strength):
    """Run SqueezeWave on the CPU over a list of mel files and time the loop.

    Checkpoint and mel tensors are loaded with ``map_location`` set to the
    CPU device.  Each result is scaled by ``MAX_WAV_VALUE``, cast to int16 and
    handed to ``write`` as ``<output_dir>/<name>_synthesis.wav``.  The total
    wall-clock time of the loop is printed at the end.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        squeezewave_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to the model's ``infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``).
        denoiser_strength: Denoising is applied only when this is > 0.
    """
    mel_paths = files_to_list(mel_files)

    cpu = torch.device('cpu')
    vocoder = torch.load(squeezewave_path, map_location=cpu)['model']
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.eval()
    if is_fp16:
        from apex import amp
        vocoder, _ = amp.initialize(vocoder, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(vocoder)

    started = time.time()
    for mel_path in mel_paths:
        stem, _ = os.path.splitext(os.path.basename(mel_path))

        spec = torch.autograd.Variable(torch.load(mel_path, map_location=cpu))
        # NOTE(review): the input is cast to half regardless of `is_fp16`, and
        # no leading dimension is added here -- confirm this is intended.
        spec = spec.half()

        with torch.no_grad():
            waveform = vocoder.infer(spec, sigma=sigma).float()
            if denoiser_strength > 0:
                waveform = denoiser(waveform, denoiser_strength)
            waveform = waveform * MAX_WAV_VALUE

        samples = waveform.squeeze().cpu().numpy().astype('int16')
        out_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(out_path, sampling_rate, samples)
        print(out_path)
    finished = time.time()

    print("Squeezewave vocoder time")
    print(finished - started)
Exemple #11
0
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate,
         is_fp16, denoiser_strength, device):
    """Run SqueezeWave on a chosen device over a list of mel files.

    Mel tensors with more than two dimensions are squeezed first; the result
    must be 2-D.  The audio is scaled by ``MAX_WAV_VALUE``, cast to int16 and
    handed to ``write`` as ``<output_dir>/<name>_synthesis.wav``.

    Args:
        mel_files: Argument forwarded to ``files_to_list``.
        squeezewave_path: Checkpoint path; its ``'model'`` entry is the model.
        sigma: Forwarded to ``squeezewave.infer``.
        output_dir: Directory that receives the generated files.
        sampling_rate: Sample rate passed to ``write``.
        is_fp16: When true, the model is initialised through apex ``amp``
            (``opt_level="O3"``) and the input is cast to half precision.
        denoiser_strength: Denoising is applied only when this is > 0.
        device: Device used as ``map_location`` when loading and as the
            target of every ``.to(device=...)`` call.

    Raises:
        AssertionError: If a mel tensor is not 2-D after squeezing.
    """
    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path, map_location=device)['model']
    squeezewave.device = device  # hack for loading model trained on gpu to cpu
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.to(device=device).eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).to(device=device)

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # Remap storages while loading, exactly as for the checkpoint above:
        # a mel saved from a CUDA tensor cannot otherwise be deserialised on
        # a CPU-only machine.
        mel = torch.load(file_path, map_location=device)
        if len(mel.shape) > 2:
            mel = mel.squeeze()
            print(f"squeezed to {mel.shape}")
        assert len(mel.shape) == 2
        mel = torch.autograd.Variable(mel.to(device=device))
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
Exemple #12
0
import argparse
import os
import subprocess

from mel2samp import files_to_list

if __name__ == "__main__":
    # Convert every file listed in --filelist_path with sox
    # (effects: `remix 1,2 rate 22050`) and store the result under
    # --output_dir using the same base name.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--filelist_path", required=True)
    # Required: without it the destination used to become "None/<file>".
    parser.add_argument('-o',
                        '--output_dir',
                        type=str,
                        required=True,
                        help='Output directory')
    args = parser.parse_args()

    for source_wav in files_to_list(args.filelist_path):
        dest_wav = os.path.join(args.output_dir, os.path.basename(source_wav))
        # Pass an argument list without a shell: paths containing spaces or
        # shell metacharacters are handed to sox verbatim instead of being
        # split or interpreted.
        command = ['sox', source_wav, dest_wav, 'remix', '1,2', 'rate', '22050']
        try:
            result = subprocess.run(command)
        except FileNotFoundError:
            raise SystemExit("sox executable not found on PATH")
        # Keep going on a failed file, but report it instead of staying silent.
        if result.returncode != 0:
            print("sox failed with exit code {} for {}".format(
                result.returncode, source_wav))