Example #1
    def synthesize(self, step, wav, wavpath,
            sampling_rate, frame_period, num_mcep, cpsyn_flag):
        wav_name = basename(wavpath)
        # print(wav_name)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
        f0_converted = pitch_conversion(f0=f0,
            mean_log_src=self.test_loader.logf0s_mean_src, std_log_src=self.test_loader.logf0s_std_src,
            mean_log_target=self.test_loader.logf0s_mean_trg, std_log_target=self.test_loader.logf0s_std_trg)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep)  # to dim 36

        coded_sp_norm = (coded_sp - self.test_loader.mcep_mean_src) / self.test_loader.mcep_std_src
        coded_sp_norm_tensor = torch.FloatTensor(coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(self.device) # [1, 1, D, T]
        conds = torch.FloatTensor(self.test_loader.spk_c_trg).to(self.device) # [1, C]
        # print(conds.size())
        coded_sp_converted_norm = self.G(coded_sp_norm_tensor, conds).data.cpu().numpy()
        coded_sp_converted = np.squeeze(coded_sp_converted_norm).T * self.test_loader.mcep_std_trg + self.test_loader.mcep_mean_trg
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)  # pyworld's synthesis functions require C-contiguous arrays
        # decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted, coded_sp=coded_sp_converted,
                                                ap=ap, fs=sampling_rate, frame_period=frame_period)

        librosa.output.write_wav(
            join(self.sample_dir,
                '{}-{}_{}-vcto-{}.wav'.format(
                    str(step),
                    self.test_loader.src_spk,
                    wav_name.split('.')[0],
                    self.test_loader.trg_spk
                    )
                ),
             wav_transformed,
             sampling_rate)
        if cpsyn_flag:
            wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp,
                                        ap=ap, fs=sampling_rate, frame_period=frame_period)
            librosa.output.write_wav(
                    join(self.sample_dir,
                        'cpsyn-{}_{}'.format(
                            self.test_loader.src_spk,
                            wav_name
                            )
                        ),
                    wav_cpsyn,
                    sampling_rate)
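
The synthesize() method above, like the examples that follow, relies on a small set of WORLD helpers (world_decompose, world_encode_spectral_envelop, world_speech_synthesis, pitch_conversion) that are not shown on this page. Below is a minimal sketch of what they presumably look like, assuming they wrap the pyworld bindings the way the usual CycleGAN-VC/StarGAN-VC utility modules do; treat it as illustrative rather than the project's actual code.

import numpy as np
import pyworld


def world_decompose(wav, fs, frame_period=5.0):
    # pyworld expects a C-contiguous float64 waveform
    wav = np.ascontiguousarray(wav, dtype=np.float64)
    f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)  # F0 contour
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)  # smoothed spectral envelope
    ap = pyworld.d4c(wav, f0, timeaxis, fs)         # aperiodicity
    return f0, timeaxis, sp, ap


def world_encode_spectral_envelop(sp, fs, dim=36):
    # Code the spectral envelope down to `dim` mel-cepstral coefficients
    return pyworld.code_spectral_envelope(sp, fs, dim)


def world_decode_spectral_envelop(coded_sp, fs):
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    return pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)


def world_speech_synthesis(f0, coded_sp, ap, fs, frame_period=5.0):
    # Decode the coded envelope and resynthesize a waveform with WORLD
    coded_sp = np.ascontiguousarray(coded_sp, dtype=np.float64)
    sp = world_decode_spectral_envelop(coded_sp, fs)
    return pyworld.synthesize(f0.astype(np.float64), sp, ap, fs, frame_period)


def pitch_conversion(f0, mean_log_src, std_log_src, mean_log_target, std_log_target):
    # Log-Gaussian normalization of F0; unvoiced frames (f0 == 0) stay at 0
    f0_converted = np.zeros_like(f0)
    voiced = f0 > 0
    f0_converted[voiced] = np.exp(
        (np.log(f0[voiced]) - mean_log_src) / std_log_src
        * std_log_target + mean_log_target)
    return f0_converted
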
Example #2
    def collect_features(self,file_path):
        """PyWorld analysis"""
        sr = 16000

        save_path = os.path.join(self.preprocess_dir, self.speaker, os.path.basename(file_path))

        if os.path.exists(save_path):
            # .dump() below writes a pickle, so allow_pickle is needed on NumPy >= 1.16.3
            features = np.load(save_path, allow_pickle=True)
        else:

            wav, _ = librosa.load(file_path, sr=sr, mono=True)
            wav_padded = wav_padding(wav, sr=sr, frame_period=5, multiple=4)
            f0, _, sp, ap = world_decompose(wav_padded, sr)

            mcep = world_encode_spectral_envelop(sp, sr, dim=24)

            # Extend f0 to 2-D so it can be stacked with mcep/ap; np.ma.log masks
            # the zero (unvoiced) frames instead of producing -inf. TODO: better solution
            f0 = np.ma.log(f0[:, None])
            #f0[f0 == -np.inf] = 1e-16

            features = np.hstack((f0, mcep, ap))
            features.dump(save_path)

        return features
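
wav_padding, called above, is another helper that is not listed here. A plausible sketch, under the assumption that the downstream model downsamples the time axis by a factor of 4 and therefore needs the WORLD frame count to be divisible by the multiple argument:

import numpy as np


def wav_padding(wav, sr, frame_period, multiple=4):
    # Zero-pad the waveform so the number of WORLD frames is a multiple of
    # `multiple`; padding is split between the two ends (hypothetical sketch).
    assert wav.ndim == 1
    samples_per_frame = sr * frame_period / 1000.0  # e.g. 80 samples at 16 kHz / 5 ms
    num_frames = int(np.floor(len(wav) / samples_per_frame)) + 1
    num_frames_padded = int(np.ceil(num_frames / multiple)) * multiple
    pad = max(int(num_frames_padded * samples_per_frame) - len(wav), 0)
    return np.pad(wav, (pad // 2, pad - pad // 2), mode='constant')
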
Example #3
def convert(src_wav_dir, trg_wav_file):
    all_src_wav_files = glob.glob(f'{src_wav_dir}/*.wav')
    # This glob pattern for src_wav_files selects about 20 files, enough for a good
    # sample without taking too much time or memory. It can be altered (including
    # setting it to a single file or to all_src_wav_files) to create fewer or more output files.
    src_wav_files = glob.glob(f'{src_wav_dir}/p???_0[01][0-9].wav')
    src_wavs = [
        utils.load_wav(src_wav_file, utils.SAMPLING_RATE)
        for src_wav_file in src_wav_files
    ]
    trg_wav = utils.load_wav(trg_wav_file, utils.SAMPLING_RATE)
    trg_wav_name = splitext(basename(trg_wav_file))[0]
    converted_dir = VCTK_PATH.joinpath('converted_audio',
                                       'trg_' + trg_wav_name)
    os.makedirs(converted_dir, exist_ok=True)

    src_stats = get_stats(all_src_wav_files)
    trg_stats = get_stats([trg_wav_file])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = get_model(device)

    _, _, trg_sp, _ = utils.world_decompose(wav=trg_wav,
                                            fs=utils.SAMPLING_RATE,
                                            frame_period=utils.FRAME_PERIOD)
    trg_coded_sp = utils.world_encode_spectral_envelop(sp=trg_sp,
                                                       fs=utils.SAMPLING_RATE,
                                                       dim=utils.NUM_MCEP)
    trg_coded_sp_norm = (trg_coded_sp - trg_stats['coded_sps_mean']
                         ) / trg_stats['coded_sps_std']
    assert trg_coded_sp_norm.shape[0] >= 8192
    trg_coded_sp_norm = trg_coded_sp_norm[:8192, :]
    trg_coded_sp_norm_tensor = torch.FloatTensor(
        trg_coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)

    trg_embed = G.trg_downsample(trg_coded_sp_norm_tensor)

    with torch.no_grad():
        for i, src_wav in enumerate(tqdm(src_wavs)):
            f0, _, sp, ap = utils.world_decompose(
                wav=src_wav,
                fs=utils.SAMPLING_RATE,
                frame_period=utils.FRAME_PERIOD)
            coded_sp = utils.world_encode_spectral_envelop(
                sp=sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP)

            f0_converted = utils.pitch_conversion(
                f0=f0,
                mean_log_src=src_stats['log_f0s_mean'],
                std_log_src=src_stats['log_f0s_std'],
                mean_log_target=trg_stats['log_f0s_mean'],
                std_log_target=trg_stats['log_f0s_std'])

            coded_sp_norm = (coded_sp - src_stats['coded_sps_mean']
                             ) / src_stats['coded_sps_std']
            coded_sp_norm_tensor = torch.FloatTensor(
                coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)

            # coded_sp_converted_norm = G(coded_sp_norm_tensor, trg_embed).data.cpu().numpy()
            coded_sp_converted_norm = G.forward_with_trg_embed(
                coded_sp_norm_tensor, trg_embed)
            coded_sp_converted_norm = coded_sp_converted_norm.data.cpu().numpy()
            coded_sp_converted = np.squeeze(coded_sp_converted_norm).T
            coded_sp_converted = coded_sp_converted * trg_stats[
                'coded_sps_std'] + trg_stats['coded_sps_mean']
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            coded_sp_converted = coded_sp_converted.astype('double')
            wav_transformed = utils.world_speech_synthesis(
                f0=f0_converted,
                coded_sp=coded_sp_converted,
                ap=ap,
                fs=utils.SAMPLING_RATE,
                frame_period=utils.FRAME_PERIOD)

            output_path = converted_dir.joinpath(
                'src_' + os.path.basename(src_wav_files[i]))
            print(f'Saving to {output_path}')
            librosa.output.write_wav(output_path, wav_transformed,
                                     utils.SAMPLING_RATE)
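
get_stats is also assumed rather than shown. It presumably accumulates the voiced log-F0 values and the coded spectral envelopes over a list of wav files and returns the normalization statistics consumed above; the sketch below makes that assumption and reuses the same utils module as the example.

import numpy as np

import utils  # the project's helper module assumed by the example above


def get_stats(wav_files):
    log_f0s, coded_sps = [], []
    for wav_file in wav_files:
        wav = utils.load_wav(wav_file, utils.SAMPLING_RATE)
        f0, _, sp, _ = utils.world_decompose(wav=wav,
                                             fs=utils.SAMPLING_RATE,
                                             frame_period=utils.FRAME_PERIOD)
        log_f0s.append(np.log(f0[f0 > 0]))  # voiced frames only
        coded_sps.append(utils.world_encode_spectral_envelop(
            sp=sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP))
    log_f0s = np.concatenate(log_f0s)
    coded_sps = np.concatenate(coded_sps, axis=0)  # [total_frames, NUM_MCEP]
    return {
        'log_f0s_mean': log_f0s.mean(),
        'log_f0s_std': log_f0s.std(),
        'coded_sps_mean': coded_sps.mean(axis=0, keepdims=True),
        'coded_sps_std': coded_sps.std(axis=0, keepdims=True),
    }
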
Example #4
def convert(config):
    os.makedirs(join(config.convert_dir, config.resume_model), exist_ok=True)
    sampling_rate, num_mcep, frame_period = config.sampling_rate, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Restore model
    print(f'Loading the trained models from step {config.resume_model}...')
    generator = Generator(num_speakers=config.num_speakers).to(device)
    g_path = join(config.model_save_dir, f'{config.resume_model}-G.ckpt')
    generator.load_state_dict(
        torch.load(g_path, map_location=lambda storage, loc: storage))

    # for all possible speaker pairs in config.speakers
    for i in range(0, len(config.speakers)):
        for j in range(0, len(config.speakers)):
            if i != j:
                target_dir = join(
                    config.convert_dir, str(config.resume_model),
                    f'{config.speakers[i]}_to_{config.speakers[j]}')

                os.makedirs(target_dir, exist_ok=True)

                # Load speakers
                data_loader = ConvertDataset(config,
                                             src_spk=config.speakers[i],
                                             trg_spk=config.speakers[j])
                print('---------------------------------------')
                print('Source: ', config.speakers[i], ' Target: ',
                      config.speakers[j])
                print('---------------------------------------')

                # Read a batch of test data
                src_test_wavfiles = data_loader.get_batch_test_data(
                    batch_size=config.num_converted_wavs)
                src_test_wavs = [
                    load_wav(wavfile, sampling_rate)
                    for wavfile in src_test_wavfiles
                ]

                with torch.no_grad():
                    for idx, wav in enumerate(src_test_wavs):
                        print(f'({idx}), file length: {len(wav)}')
                        wav_name = basename(src_test_wavfiles[idx])

                        # convert wav to mceps
                        f0, _, sp, ap = world_decompose(
                            wav=wav,
                            fs=sampling_rate,
                            frame_period=frame_period)
                        f0_converted = pitch_conversion(
                            f0=f0,
                            mean_log_src=data_loader.logf0s_mean_src,
                            std_log_src=data_loader.logf0s_std_src,
                            mean_log_target=data_loader.logf0s_mean_trg,
                            std_log_target=data_loader.logf0s_std_trg)
                        coded_sp = world_encode_spectral_envelop(
                            sp=sp, fs=sampling_rate, dim=num_mcep)
                        print("Before being fed into G: ", coded_sp.shape)
                        coded_sp_norm = (coded_sp - data_loader.mcep_mean_src
                                         ) / data_loader.mcep_std_src
                        coded_sp_norm_tensor = torch.FloatTensor(
                            coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(
                                device)
                        spk_conds = torch.FloatTensor(
                            data_loader.spk_c_trg).to(device)

                        # Include org_conds if using src and target domain codes.
                        org_conds = torch.FloatTensor(
                            data_loader.spk_c_org).to(device)

                        # generate converted speech
                        coded_sp_converted_norm = generator(
                            coded_sp_norm_tensor,
                            spk_conds).data.cpu().numpy()
                        coded_sp_converted = np.squeeze(
                            coded_sp_converted_norm
                        ).T * data_loader.mcep_std_trg + data_loader.mcep_mean_trg
                        coded_sp_converted = np.ascontiguousarray(
                            coded_sp_converted)
                        print("After being fed into G: ",
                              coded_sp_converted.shape)

                        # convert back to wav
                        wav_transformed = world_speech_synthesis(
                            f0=f0_converted,
                            coded_sp=coded_sp_converted,
                            ap=ap,
                            fs=sampling_rate,
                            frame_period=frame_period)
                        wav_id = wav_name.split('.')[0]

                        # SAVE TARGET SYNTHESIZED
                        soundfile.write(
                            join(target_dir,
                                 f'{wav_id}-vcto-{data_loader.trg_spk}.wav'),
                            wav_transformed, sampling_rate)

                        # SAVE COPY OF TARGET REFERENCE
                        wav_num = wav_name.split('.')[0].split('_')[1]
                        copy(
                            f'{config.wav_dir}/{config.speakers[j]}/{config.speakers[j]}_{wav_num}.wav',
                            target_dir)
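
The config object is only ever read through its attributes, so a small driver can be reconstructed around convert(). The argparse sketch below is hypothetical: the option names mirror the attributes the function accesses, while the defaults are purely illustrative.

import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--resume_model', type=str, required=True,
                        help='checkpoint step to load, e.g. 100000')
    parser.add_argument('--num_converted_wavs', type=int, default=8)
    parser.add_argument('--sampling_rate', type=int, default=16000)
    parser.add_argument('--num_speakers', type=int, default=4)
    parser.add_argument('--speakers', type=str, nargs='+', required=True)
    parser.add_argument('--wav_dir', type=str, default='./data/wav16')
    parser.add_argument('--model_save_dir', type=str, default='./models')
    parser.add_argument('--convert_dir', type=str, default='./converted')
    config = parser.parse_args()
    convert(config)
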
Example #5
def test(config):
    os.makedirs(join(config.convert_dir, str(config.resume_iters)),
                exist_ok=True)
    sampling_rate, num_mcep, frame_period = 16000, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    G = Generator().to(device)
    test_loader = TestDataset(config)
    # Restore model
    print(f'Loading the trained models from step {config.resume_iters}...')
    G_path = join(config.model_save_dir, f'{config.resume_iters}-G.ckpt')
    G.load_state_dict(
        torch.load(G_path, map_location=lambda storage, loc: storage))

    # Read a batch of test data
    test_wavfiles = test_loader.get_batch_test_data(
        batch_size=config.num_converted_wavs)
    test_wavs = [load_wav(wavfile, sampling_rate) for wavfile in test_wavfiles]

    with torch.no_grad():
        for idx, wav in enumerate(test_wavs):
            print(len(wav))
            wav_name = basename(test_wavfiles[idx])
            # print(wav_name)
            f0, timeaxis, sp, ap = world_decompose(wav=wav,
                                                   fs=sampling_rate,
                                                   frame_period=frame_period)
            f0_converted = pitch_conversion(
                f0=f0,
                mean_log_src=test_loader.logf0s_mean_src,
                std_log_src=test_loader.logf0s_std_src,
                mean_log_target=test_loader.logf0s_mean_trg,
                std_log_target=test_loader.logf0s_std_trg)
            coded_sp = world_encode_spectral_envelop(sp=sp,
                                                     fs=sampling_rate,
                                                     dim=num_mcep)
            print("Before being fed into G: ", coded_sp.shape)
            coded_sp_norm = (coded_sp - test_loader.mcep_mean_src
                             ) / test_loader.mcep_std_src
            coded_sp_norm_tensor = torch.FloatTensor(
                coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
            spk_conds = torch.FloatTensor(test_loader.spk_c_trg).to(device)
            # print(spk_conds.size())
            coded_sp_converted_norm = G(coded_sp_norm_tensor,
                                        spk_conds).data.cpu().numpy()
            coded_sp_converted = np.squeeze(
                coded_sp_converted_norm
            ).T * test_loader.mcep_std_trg + test_loader.mcep_mean_trg
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            print("After being fed into G: ", coded_sp_converted.shape)
            wav_transformed = world_speech_synthesis(
                f0=f0_converted,
                coded_sp=coded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period)
            wav_id = wav_name.split('.')[0]
            librosa.output.write_wav(
                join(config.convert_dir, str(config.resume_iters),
                     f'{wav_id}-vcto-{test_loader.trg_spk}.wav'),
                wav_transformed, sampling_rate)
            cpsyn_flag = True  # set to False to skip writing the copy-synthesis reference
            if cpsyn_flag:
                wav_cpsyn = world_speech_synthesis(f0=f0,
                                                   coded_sp=coded_sp,
                                                   ap=ap,
                                                   fs=sampling_rate,
                                                   frame_period=frame_period)
                librosa.output.write_wav(
                    join(config.convert_dir, str(config.resume_iters),
                         f'cpsyn-{wav_name}'), wav_cpsyn, sampling_rate)
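
Note that librosa.output.write_wav, used in Examples #1, #3, #5 and #6, was removed in librosa 0.8.0; soundfile.write, as in Example #4, is the usual replacement on recent installations. A small drop-in shim, as a sketch:

import numpy as np
import soundfile


def write_wav(path, wav, sr):
    # Same (path, data, samplerate) argument order as librosa.output.write_wav
    soundfile.write(path, wav.astype(np.float32), sr)
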
Example #6
def convert(config):
    os.makedirs(join(config.convert_dir, config.resume_model), exist_ok=True)
    sampling_rate, num_mcep, frame_period = config.sampling_rate, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Restore the model
    print(f'Loading the trained models from step {config.resume_model}...')
    generator = Generator(num_speakers=config.num_speakers).to(device)
    g_path = join(config.model_save_dir, f'{config.resume_model}-G.ckpt')
    generator.load_state_dict(
        torch.load(g_path, map_location=lambda storage, loc: storage))

    # for all possible speaker pairs in config.speakers
    for i in range(0, len(config.speakers)):
        for j in range(0, len(config.speakers)):
            if i != j:
                target_dir = join(
                    config.convert_dir, str(config.resume_model),
                    f'{config.speakers[i]}_to_{config.speakers[j]}')

                os.makedirs(target_dir, exist_ok=True)

                # Load speakers
                data_loader = ConvertDataset(config,
                                             src_spk=config.speakers[i],
                                             trg_spk=config.speakers[j])
                print('---------------------------------------')
                print('Source:', config.speakers[i], 'Target:', config.speakers[j])
                print('---------------------------------------')

                # Read a batch of test data
                src_test_wavfiles = data_loader.get_batch_test_data(
                    batch_size=config.num_converted_wavs)
                src_test_wavs = [
                    load_wav(wavfile, sampling_rate)
                    for wavfile in src_test_wavfiles
                ]

                with torch.no_grad():
                    for idx, wav in enumerate(src_test_wavs):
                        print(f'({idx}), file length: {len(wav)}')
                        wav_name = basename(src_test_wavfiles[idx])

                        # convert wav to MCEPs
                        f0, _, sp, ap = world_decompose(
                            wav=wav,
                            fs=sampling_rate,
                            frame_period=frame_period)
                        f0_converted = pitch_conversion(
                            f0=f0,
                            mean_log_src=data_loader.logf0s_mean_src,
                            std_log_src=data_loader.logf0s_std_src,
                            mean_log_target=data_loader.logf0s_mean_trg,
                            std_log_target=data_loader.logf0s_std_trg)
                        coded_sp = world_encode_spectral_envelop(
                            sp=sp, fs=sampling_rate, dim=num_mcep)
                        print("在喂入数据到G前:", coded_sp.shape)
                        coded_sp_norm = (coded_sp - data_loader.mcep_mean_src
                                         ) / data_loader.mcep_std_src
                        coded_sp_norm_tensor = torch.FloatTensor(
                            coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(
                                device)
                        spk_conds = torch.FloatTensor(
                            data_loader.spk_c_trg).to(device)

                        # generate converted speech
                        coded_sp_converted_norm = generator(
                            coded_sp_norm_tensor,
                            spk_conds).data.cpu().numpy()
                        coded_sp_converted = np.squeeze(
                            coded_sp_converted_norm
                        ).T * data_loader.mcep_std_trg + data_loader.mcep_mean_trg
                        coded_sp_converted = np.ascontiguousarray(
                            coded_sp_converted)
                        print("在喂入数据到G后:", coded_sp_converted.shape)

                        # convert back to wav
                        wav_transformed = world_speech_synthesis(
                            f0=f0_converted,
                            coded_sp=coded_sp_converted,
                            ap=ap,
                            fs=sampling_rate,
                            frame_period=frame_period)
                        wav_id = wav_name.split('.')[0]

                        # SAVE TARGET SYNTHESIZED
                        librosa.output.write_wav(
                            join(target_dir,
                                 f'{wav_id}-vcto-{data_loader.trg_spk}.wav'),
                            wav_transformed, sampling_rate)

                        # SAVE COPY OF TARGET REFERENCE
                        wav_num = wav_name.split('.')[0].split('_')[1]
                        copy(
                            f'{config.wav_dir}/{config.speakers[j]}/{config.speakers[j]}_{wav_num}.wav',
                            target_dir)