def synthesize(self, step, wav, wavpath, sampling_rate, frame_period, num_mcep, cpsyn_flag):
    wav_name = basename(wavpath)
    f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
    f0_converted = pitch_conversion(f0=f0,
                                    mean_log_src=self.test_loader.logf0s_mean_src,
                                    std_log_src=self.test_loader.logf0s_std_src,
                                    mean_log_target=self.test_loader.logf0s_mean_trg,
                                    std_log_target=self.test_loader.logf0s_std_trg)
    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep)  # reduce envelope to num_mcep (36) dims
    coded_sp_norm = (coded_sp - self.test_loader.mcep_mean_src) / self.test_loader.mcep_std_src
    coded_sp_norm_tensor = torch.FloatTensor(coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(self.device)  # [1, 1, D, T]
    conds = torch.FloatTensor(self.test_loader.spk_c_trg).to(self.device)  # [1, C]
    coded_sp_converted_norm = self.G(coded_sp_norm_tensor, conds).data.cpu().numpy()
    coded_sp_converted = np.squeeze(coded_sp_converted_norm).T * self.test_loader.mcep_std_trg + self.test_loader.mcep_mean_trg
    # The transpose above breaks contiguity; WORLD's synthesis routines expect
    # C-contiguous float64 arrays, hence the explicit copy here.
    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
    wav_transformed = world_speech_synthesis(f0=f0_converted, coded_sp=coded_sp_converted, ap=ap,
                                             fs=sampling_rate, frame_period=frame_period)
    librosa.output.write_wav(
        join(self.sample_dir,
             '{}-{}_{}-vcto-{}.wav'.format(str(step),
                                           self.test_loader.src_spk,
                                           wav_name.split('.')[0],
                                           self.test_loader.trg_spk)),
        wav_transformed, sampling_rate)
    if cpsyn_flag:
        # Copy-synthesis baseline: resynthesize the source features unchanged.
        wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp, ap=ap,
                                           fs=sampling_rate, frame_period=frame_period)
        librosa.output.write_wav(
            join(self.sample_dir,
                 'cpsyn-{}_{}'.format(self.test_loader.src_spk, wav_name)),
            wav_cpsyn, sampling_rate)
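# A minimal sketch of what pitch_conversion is assumed to compute: the
# log-Gaussian normalized F0 transformation that is standard in WORLD-based
# voice conversion. This is an illustration, not this repo's confirmed
# implementation; in particular the handling of unvoiced (f0 == 0) frames is
# an assumption. Assumes numpy is imported as np, as elsewhere in this file.
def pitch_conversion_sketch(f0, mean_log_src, std_log_src,
                            mean_log_target, std_log_target):
    f0_converted = np.zeros_like(f0)
    voiced = f0 > 0  # leave unvoiced frames at zero; log() is undefined there
    f0_converted[voiced] = np.exp(
        (np.log(f0[voiced]) - mean_log_src) / std_log_src
        * std_log_target + mean_log_target)
    return f0_converted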
def convert(src_wav_dir, trg_wav_file):
    all_src_wav_files = glob.glob(f'{src_wav_dir}/*.wav')
    # This glob pattern for src_wav_files selects about 20 files, enough for a
    # good sample without taking too much time or memory. It can be altered
    # (including setting it to a single file or to all_src_wav_files) to
    # create fewer or more output files.
    src_wav_files = glob.glob(f'{src_wav_dir}/p???_0[01][0-9].wav')
    src_wavs = [
        utils.load_wav(src_wav_file, utils.SAMPLING_RATE)
        for src_wav_file in src_wav_files
    ]
    trg_wav = utils.load_wav(trg_wav_file, utils.SAMPLING_RATE)
    trg_wav_name = splitext(basename(trg_wav_file))[0]
    converted_dir = VCTK_PATH.joinpath('converted_audio', 'trg_' + trg_wav_name)
    os.makedirs(converted_dir, exist_ok=True)

    src_stats = get_stats(all_src_wav_files)
    trg_stats = get_stats([trg_wav_file])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = get_model(device)

    # Encode the target utterance and compute its speaker embedding once.
    _, _, trg_sp, _ = utils.world_decompose(wav=trg_wav, fs=utils.SAMPLING_RATE,
                                            frame_period=utils.FRAME_PERIOD)
    trg_coded_sp = utils.world_encode_spectral_envelop(sp=trg_sp, fs=utils.SAMPLING_RATE,
                                                       dim=utils.NUM_MCEP)
    trg_coded_sp_norm = (trg_coded_sp - trg_stats['coded_sps_mean']) / trg_stats['coded_sps_std']
    assert trg_coded_sp_norm.shape[0] >= 8192
    trg_coded_sp_norm = trg_coded_sp_norm[:8192, :]  # fixed-length crop for the target encoder
    trg_coded_sp_norm_tensor = torch.FloatTensor(
        trg_coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
    trg_embed = G.trg_downsample(trg_coded_sp_norm_tensor)

    with torch.no_grad():
        for i, src_wav in enumerate(tqdm(src_wavs)):
            f0, _, sp, ap = utils.world_decompose(
                wav=src_wav, fs=utils.SAMPLING_RATE, frame_period=utils.FRAME_PERIOD)
            coded_sp = utils.world_encode_spectral_envelop(
                sp=sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP)
            f0_converted = utils.pitch_conversion(
                f0=f0,
                mean_log_src=src_stats['log_f0s_mean'],
                std_log_src=src_stats['log_f0s_std'],
                mean_log_target=trg_stats['log_f0s_mean'],
                std_log_target=trg_stats['log_f0s_std'])
            coded_sp_norm = (coded_sp - src_stats['coded_sps_mean']) / src_stats['coded_sps_std']
            coded_sp_norm_tensor = torch.FloatTensor(
                coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
            coded_sp_converted_norm = G.forward_with_trg_embed(
                coded_sp_norm_tensor, trg_embed).data.cpu().numpy()
            coded_sp_converted = np.squeeze(coded_sp_converted_norm).T
            coded_sp_converted = (coded_sp_converted * trg_stats['coded_sps_std']
                                  + trg_stats['coded_sps_mean'])
            # WORLD synthesis needs C-contiguous float64 input.
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted).astype('double')
            wav_transformed = utils.world_speech_synthesis(
                f0=f0_converted, coded_sp=coded_sp_converted, ap=ap,
                fs=utils.SAMPLING_RATE, frame_period=utils.FRAME_PERIOD)
            output_path = converted_dir.joinpath('src_' + os.path.basename(src_wav_files[i]))
            print(f'Saving to {output_path}')
            librosa.output.write_wav(output_path, wav_transformed, utils.SAMPLING_RATE)
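# A minimal sketch of the get_stats() helper used above, inferred from the
# dictionary keys it is indexed with; the repo's real implementation may
# differ (for example, it may cache the statistics to disk).
def get_stats_sketch(wav_files):
    log_f0s, coded_sps = [], []
    for path in wav_files:
        wav = utils.load_wav(path, utils.SAMPLING_RATE)
        f0, _, sp, _ = utils.world_decompose(wav=wav, fs=utils.SAMPLING_RATE,
                                             frame_period=utils.FRAME_PERIOD)
        log_f0s.append(np.log(f0[f0 > 0]))  # log-F0 over voiced frames only
        coded_sps.append(utils.world_encode_spectral_envelop(
            sp=sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP))
    log_f0s = np.concatenate(log_f0s)
    coded_sps = np.concatenate(coded_sps, axis=0)  # [total_frames, NUM_MCEP]
    return {
        'log_f0s_mean': log_f0s.mean(),
        'log_f0s_std': log_f0s.std(),
        'coded_sps_mean': coded_sps.mean(axis=0, keepdims=True),
        'coded_sps_std': coded_sps.std(axis=0, keepdims=True),
    }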
def convert(config):
    os.makedirs(join(config.convert_dir, config.resume_model), exist_ok=True)
    sampling_rate, num_mcep, frame_period = config.sampling_rate, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Restore model
    print(f'Loading the trained models from step {config.resume_model}...')
    generator = Generator(num_speakers=config.num_speakers).to(device)
    g_path = join(config.model_save_dir, f'{config.resume_model}-G.ckpt')
    generator.load_state_dict(
        torch.load(g_path, map_location=lambda storage, loc: storage))

    # Iterate over all ordered speaker pairs in config.speakers
    for i in range(len(config.speakers)):
        for j in range(len(config.speakers)):
            if i == j:
                continue
            target_dir = join(config.convert_dir, str(config.resume_model),
                              f'{config.speakers[i]}_to_{config.speakers[j]}')
            os.makedirs(target_dir, exist_ok=True)

            # Load speakers
            data_loader = ConvertDataset(config,
                                         src_spk=config.speakers[i],
                                         trg_spk=config.speakers[j])
            print('---------------------------------------')
            print('Source:', config.speakers[i], ' Target:', config.speakers[j])
            print('---------------------------------------')

            # Read a batch of test data
            src_test_wavfiles = data_loader.get_batch_test_data(
                batch_size=config.num_converted_wavs)
            src_test_wavs = [
                load_wav(wavfile, sampling_rate) for wavfile in src_test_wavfiles
            ]

            with torch.no_grad():
                for idx, wav in enumerate(src_test_wavs):
                    print(f'({idx}), file length: {len(wav)}')
                    wav_name = basename(src_test_wavfiles[idx])

                    # Convert wav to MCEPs
                    f0, _, sp, ap = world_decompose(wav=wav, fs=sampling_rate,
                                                    frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=data_loader.logf0s_mean_src,
                        std_log_src=data_loader.logf0s_std_src,
                        mean_log_target=data_loader.logf0s_mean_trg,
                        std_log_target=data_loader.logf0s_std_trg)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                             dim=num_mcep)
                    print('Before being fed into G:', coded_sp.shape)
                    coded_sp_norm = (coded_sp - data_loader.mcep_mean_src) / data_loader.mcep_std_src
                    coded_sp_norm_tensor = torch.FloatTensor(
                        coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
                    spk_conds = torch.FloatTensor(data_loader.spk_c_trg).to(device)
                    # Include org_conds if using both source and target domain codes.
                    org_conds = torch.FloatTensor(data_loader.spk_c_org).to(device)

                    # Generate converted speech
                    coded_sp_converted_norm = generator(
                        coded_sp_norm_tensor, spk_conds).data.cpu().numpy()
                    coded_sp_converted = np.squeeze(
                        coded_sp_converted_norm).T * data_loader.mcep_std_trg + data_loader.mcep_mean_trg
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    print('After being fed into G:', coded_sp_converted.shape)

                    # Convert back to wav
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, coded_sp=coded_sp_converted, ap=ap,
                        fs=sampling_rate, frame_period=frame_period)
                    wav_id = wav_name.split('.')[0]

                    # Save the target-synthesized output
                    soundfile.write(
                        join(target_dir, f'{wav_id}-vcto-{data_loader.trg_spk}.wav'),
                        wav_transformed, sampling_rate)

                    # Save a copy of the target reference utterance
                    wav_num = wav_name.split('.')[0].split('_')[1]
                    copy(f'{config.wav_dir}/{config.speakers[j]}/{config.speakers[j]}_{wav_num}.wav',
                         target_dir)
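# The config fields convert() reads, collected as a hypothetical argparse
# setup for reference; the flag names match the attribute accesses above,
# but the defaults are illustrative, not this repo's.
import argparse

def build_convert_parser_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--convert_dir', type=str, default='./converted')
    parser.add_argument('--model_save_dir', type=str, default='./models')
    parser.add_argument('--resume_model', type=str, required=True)  # checkpoint step, e.g. '100000'
    parser.add_argument('--sampling_rate', type=int, default=16000)
    parser.add_argument('--num_speakers', type=int, default=10)
    parser.add_argument('--speakers', type=str, nargs='+', required=True)
    parser.add_argument('--num_converted_wavs', type=int, default=8)
    parser.add_argument('--wav_dir', type=str, default='./data/wav16')
    return parser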
def test(config):
    os.makedirs(join(config.convert_dir, str(config.resume_iters)), exist_ok=True)
    sampling_rate, num_mcep, frame_period = 16000, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = Generator().to(device)
    test_loader = TestDataset(config)

    # Restore model
    print(f'Loading the trained models from step {config.resume_iters}...')
    G_path = join(config.model_save_dir, f'{config.resume_iters}-G.ckpt')
    G.load_state_dict(
        torch.load(G_path, map_location=lambda storage, loc: storage))

    # Read a batch of test data
    test_wavfiles = test_loader.get_batch_test_data(
        batch_size=config.num_converted_wavs)
    test_wavs = [load_wav(wavfile, sampling_rate) for wavfile in test_wavfiles]

    cpsyn_flag = True  # set False to skip the copy-synthesis baseline output

    with torch.no_grad():
        for idx, wav in enumerate(test_wavs):
            print(len(wav))
            wav_name = basename(test_wavfiles[idx])
            f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate,
                                                   frame_period=frame_period)
            f0_converted = pitch_conversion(
                f0=f0,
                mean_log_src=test_loader.logf0s_mean_src,
                std_log_src=test_loader.logf0s_std_src,
                mean_log_target=test_loader.logf0s_mean_trg,
                std_log_target=test_loader.logf0s_std_trg)
            coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                     dim=num_mcep)
            print('Before being fed into G:', coded_sp.shape)
            coded_sp_norm = (coded_sp - test_loader.mcep_mean_src) / test_loader.mcep_std_src
            coded_sp_norm_tensor = torch.FloatTensor(
                coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
            spk_conds = torch.FloatTensor(test_loader.spk_c_trg).to(device)
            coded_sp_converted_norm = G(coded_sp_norm_tensor,
                                        spk_conds).data.cpu().numpy()
            coded_sp_converted = np.squeeze(
                coded_sp_converted_norm).T * test_loader.mcep_std_trg + test_loader.mcep_mean_trg
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            print('After being fed into G:', coded_sp_converted.shape)
            wav_transformed = world_speech_synthesis(
                f0=f0_converted, coded_sp=coded_sp_converted, ap=ap,
                fs=sampling_rate, frame_period=frame_period)
            wav_id = wav_name.split('.')[0]
            librosa.output.write_wav(
                join(config.convert_dir, str(config.resume_iters),
                     f'{wav_id}-vcto-{test_loader.trg_spk}.wav'),
                wav_transformed, sampling_rate)
            if cpsyn_flag:
                wav_cpsyn = world_speech_synthesis(f0=f0, coded_sp=coded_sp, ap=ap,
                                                   fs=sampling_rate,
                                                   frame_period=frame_period)
                librosa.output.write_wav(
                    join(config.convert_dir, str(config.resume_iters),
                         f'cpsyn-{wav_name}'),
                    wav_cpsyn, sampling_rate)
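# A minimal sketch of the WORLD helpers used throughout this file, assuming
# they are thin wrappers around pyworld as in typical WORLD-based VC repos;
# the actual utils may pin different analysis parameters. pyworld requires
# C-contiguous float64 input, which is why the conversion code above calls
# np.ascontiguousarray (and astype('double')) before synthesis.
import numpy as np
import pyworld

def world_decompose_sketch(wav, fs, frame_period=5.0):
    wav = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)  # F0 contour
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)  # smoothed spectral envelope
    ap = pyworld.d4c(wav, f0, timeaxis, fs)         # aperiodicity
    return f0, timeaxis, sp, ap

def world_speech_synthesis_sketch(f0, coded_sp, ap, fs, frame_period=5.0):
    fft_size = (ap.shape[1] - 1) * 2                # ap has fft_size // 2 + 1 bins
    sp = pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)  # MCEPs back to envelope
    return pyworld.synthesize(f0, sp, ap, fs, frame_period)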
def convert(config):
    os.makedirs(join(config.convert_dir, config.resume_model), exist_ok=True)
    sampling_rate, num_mcep, frame_period = config.sampling_rate, 36, 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Restore the model
    print(f'Loading the trained models from step {config.resume_model}...')
    generator = Generator(num_speakers=config.num_speakers).to(device)
    g_path = join(config.model_save_dir, f'{config.resume_model}-G.ckpt')
    generator.load_state_dict(
        torch.load(g_path, map_location=lambda storage, loc: storage))

    # Iterate over all speaker pairs in config.speakers
    for i in range(len(config.speakers)):
        for j in range(len(config.speakers)):
            if i == j:
                continue
            target_dir = join(config.convert_dir, str(config.resume_model),
                              f'{config.speakers[i]}_to_{config.speakers[j]}')
            os.makedirs(target_dir, exist_ok=True)

            # Load the speakers
            data_loader = ConvertDataset(config,
                                         src_spk=config.speakers[i],
                                         trg_spk=config.speakers[j])
            print('---------------------------------------')
            print('Source:', config.speakers[i], 'Target:', config.speakers[j])
            print('---------------------------------------')

            # Read a batch of test data
            src_test_wavfiles = data_loader.get_batch_test_data(
                batch_size=config.num_converted_wavs)
            src_test_wavs = [
                load_wav(wavfile, sampling_rate) for wavfile in src_test_wavfiles
            ]

            with torch.no_grad():
                for idx, wav in enumerate(src_test_wavs):
                    print(f'({idx}), file length: {len(wav)}')
                    wav_name = basename(src_test_wavfiles[idx])

                    # Convert the wav to MCEPs
                    f0, _, sp, ap = world_decompose(wav=wav, fs=sampling_rate,
                                                    frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=data_loader.logf0s_mean_src,
                        std_log_src=data_loader.logf0s_std_src,
                        mean_log_target=data_loader.logf0s_mean_trg,
                        std_log_target=data_loader.logf0s_std_trg)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                             dim=num_mcep)
                    print('Before being fed into G:', coded_sp.shape)
                    coded_sp_norm = (coded_sp - data_loader.mcep_mean_src) / data_loader.mcep_std_src
                    coded_sp_norm_tensor = torch.FloatTensor(
                        coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
                    spk_conds = torch.FloatTensor(data_loader.spk_c_trg).to(device)

                    # Generate the converted speech
                    coded_sp_converted_norm = generator(
                        coded_sp_norm_tensor, spk_conds).data.cpu().numpy()
                    coded_sp_converted = np.squeeze(
                        coded_sp_converted_norm).T * data_loader.mcep_std_trg + data_loader.mcep_mean_trg
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    print('After being fed into G:', coded_sp_converted.shape)

                    # Convert back to a waveform
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, coded_sp=coded_sp_converted, ap=ap,
                        fs=sampling_rate, frame_period=frame_period)
                    wav_id = wav_name.split('.')[0]

                    # Save the target-synthesized output
                    librosa.output.write_wav(
                        join(target_dir, f'{wav_id}-vcto-{data_loader.trg_spk}.wav'),
                        wav_transformed, sampling_rate)

                    # Save a copy of the target reference utterance
                    wav_num = wav_name.split('.')[0].split('_')[1]
                    copy(f'{config.wav_dir}/{config.speakers[j]}/{config.speakers[j]}_{wav_num}.wav',
                         target_dir)
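# Note: librosa.output.write_wav was deprecated in librosa 0.7 and removed in
# 0.8, so the save calls above fail on current installs. soundfile.write, as
# already used in the other convert() variant in this file, is a drop-in
# replacement:
#
#     import soundfile
#     soundfile.write(output_path, wav_transformed, sampling_rate)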