def save_world_wav(feats, model_name, filename):
    # feats = [f0, sp, ap, sp_coded, labels]
    if isinstance(feats[3], torch.Tensor):
        feats[3] = feats[3].cpu().numpy()
    if hp.normalise_mels:
        feats[3] = _unnormalise_coded_sp(feats[3])

    path = os.path.join(hp.sample_set_dir, model_name)
    if not os.path.exists(path):
        os.makedirs(path)
    path = os.path.join(path, filename)

    # pyworld requires a C-contiguous float64 array
    feats[3] = np.ascontiguousarray(feats[3], dtype=np.float64)
    decoded_sp = decode_spectral_envelope(feats[3], hp.sr, fft_size=hp.n_fft)
    wav = synthesize(feats[0], decoded_sp, feats[1], hp.sr)
    save_wav(wav, path)
def world_decode_spectral_envelop(coded_sp, fs):
    # Restore the dimension-reduced (encoded) spectral envelope to its original dimensionality
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
    return decoded_sp
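For context, a minimal round-trip sketch showing where a helper like this sits in the WORLD pipeline; the 220 Hz test tone, 16 kHz rate, and 36 coding dimensions are illustrative assumptions, not values from the snippet:

import numpy as np
import pyworld

fs = 16000
x = np.sin(2 * np.pi * 220.0 * np.arange(fs) / fs)     # 1 s test tone, float64

f0, sp, ap = pyworld.wav2world(x, fs)                  # sp: (frames, fft_size/2 + 1)
coded_sp = pyworld.code_spectral_envelope(sp, fs, 36)  # compress to 36 dims (assumed)
decoded_sp = world_decode_spectral_envelop(coded_sp, fs)
print(sp.shape, coded_sp.shape, decoded_sp.shape)      # decoded_sp regains sp's width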
def world_decode_spectral_envelop(coded_sp, fs):
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    # pyworld expects coded_sp as a C-contiguous float64 array; cast upstream if needed
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
    return decoded_sp
def world_decode_spectral_env(spectral_env_mel, settings):
    mfcc = dct(spectral_env_mel) / np.sqrt(settings['coded_dim'] * 2)
    fftlen = pyworld.get_cheaptrick_fft_size(settings['sample_rate'])
    spectral_env = pyworld.decode_spectral_envelope(mfcc, settings['sample_rate'], fftlen)
    return spectral_env
def synthesis(ori_path, aim_sp, aim_spkid):
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)
    # fft_size=1024 compresses the spectrum to 513 bins
    sp_per_timeaxis_before = pw.cheaptrick(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    # Decode the converted sp back to full resolution
    aim_decoded_sp = pw.decode_spectral_envelope(aim_sp, hp.SR, fft_size=hp.N_FFT)
    print('decoded 513-dim aim_decoded_sp =')
    print(aim_decoded_sp.shape)
    print(aim_decoded_sp[399][:])
    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav', synwav, sr=hp.SR)
def convertFeaturesIntoWav(f0seq, MCEPseq, APseq, fs, frame_period=5.0):
    contNumpy_MCEPseq = np.ascontiguousarray(MCEPseq.T, dtype=np.float64)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pyworld.decode_spectral_envelope(contNumpy_MCEPseq, fs, fftlen)
    wav = pyworld.synthesize(f0seq, spectrogram, APseq, fs, frame_period)
    return wav.astype(np.float32)
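A hypothetical call, assuming (from the transpose inside the function) that the MCEP sequence is laid out as (num_mceps, num_frames), and that f0 and ap are float64 arrays from a WORLD analysis at the same rate, with ap's width matching get_cheaptrick_fft_size(fs)/2 + 1:

# mcep: (num_mceps, num_frames); f0: (num_frames,); ap: (num_frames, fft/2 + 1)
wav = convertFeaturesIntoWav(f0, mcep, ap, fs=16000)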
def test(self):
    """Translate speech using StarGAN."""
    # Load the trained generator.
    self.restore_model(self.test_iters)
    norm = Normalizer()

    # Set data loader.
    d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
    targets = self.trg_speaker
    for target in targets:
        print(target)
        assert target in speakers
        label_t = self.spk_enc.transform([target])[0]
        if label_t == [0]:
            label_t = [1, 0]
        elif label_t == [1]:
            label_t = [0, 1]
        label_t = np.asarray([label_t])

        with torch.no_grad():
            for filename, content in d.items():
                f0 = content['f0']
                ap = content['ap']
                sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                convert_result = []
                for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                    one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                    one_seg = torch.FloatTensor(one_seg).to(self.device)
                    one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                    l = torch.FloatTensor(label_t).to(self.device)
                    one_set_return = self.G(one_seg, l).data.cpu().numpy()
                    one_set_return = np.squeeze(one_set_return)
                    one_set_return = norm.backward_process(one_set_return, target)
                    convert_result.append(one_set_return)

                convert_con = np.concatenate(convert_result, axis=1)
                convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE, fft_size=FFTSIZE)
                f0_converted = norm.pitch_conversion(f0, speaker, target)
                wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                name = f'{speaker}-{target}_iter{self.test_iters}_{filename}'
                path = os.path.join(self.result_dir, name)
                print(f'[save]:{path}')
                librosa.output.write_wav(path, wav, SAMPLE_RATE)
def inv_world_spectrogram(f0, sp, ap, sr=_sr, **kwargs):
    """Convert WORLD vocoder features back to a waveform."""
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    sp_dec = pw.decode_spectral_envelope(sp, sr, fft_size=fft_size)
    ap_dec = pw.decode_aperiodicity(ap, sr, fft_size=fft_size)
    y = pw.synthesize(f0, sp_dec, ap_dec, sr, frame_period=frame_period)
    return y
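Note that this helper expects *coded* sp and ap, since it decodes both before synthesis. A sketch of the matching forward pass, reusing the snippet's `pw` and `_sr` names and assuming a float64 signal `x`; the 60 coding dimensions are an illustrative choice:

f0, sp, ap = pw.wav2world(x, _sr)
sp_coded = pw.code_spectral_envelope(sp, _sr, 60)
ap_coded = pw.code_aperiodicity(ap, _sr)
y = inv_world_spectrogram(f0, sp_coded, ap_coded, sr=_sr)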
def mcep2wav(mcep, f0, ap):
    f0 = f0.astype(np.float64)
    ap = ap.astype(np.float64)
    mcep = mcep.astype(np.float64)
    decoded_sp = pyworld.decode_spectral_envelope(mcep, sampling_rate, fft_size=n_fft)
    wav = pyworld.synthesize(f0, decoded_sp, ap, sampling_rate)
    return wav
def worldDecodeSpectralEnvelop(coded_sp: np.ndarray, fs: int = SAMPLE_RATE) -> np.ndarray:
    '''
    Convert MCEPs back into a spectral envelope

    Parameters
    ----------
    coded_sp: np.ndarray
        MCEP data
    fs: int, default SAMPLE_RATE
        Sampling frequency

    Returns
    -------
    decoded_sp: np.ndarray
        Spectral envelope
    '''
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
    return decoded_sp
def save_world_wav(feats, filename):
    # feats = [f0, sp, ap, sp_coded, labels]
    if isinstance(feats[3], torch.Tensor):
        feats[3] = feats[3].cpu().numpy()
    if hp.normalise:
        feats[3] = _unnormalise_coded_sp(feats[3])

    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))

    feats[3] = np.ascontiguousarray(feats[3], dtype=np.float64)
    decoded_sp = decode_spectral_envelope(feats[3], hp.sr, fft_size=hp.n_fft)
    wav = synthesize(feats[0], decoded_sp, feats[1], hp.sr)
    save_wav(wav, filename)
def synthesis(ori_path, aim_sp, aim_spkid):
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR, frame_period=10)
    # fft_size=1024 compresses the spectrum to 513 bins
    sp_per_timeaxis_before = pw.cheaptrick(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    # Decoded sp: dimensionality goes from 60 back to 513
    aim_decoded_sp = pw.decode_spectral_envelope(aim_sp, hp.SR, fft_size=hp.N_FFT)
    print('line23: f0.shape = ' + str(f0.shape) +
          ' aim_decoded_sp.shape = ' + str(aim_decoded_sp.shape) +
          ' ap.shape = ' + str(ap.shape))
    print('\n line26 : aim_sp.shape = ' + str(aim_sp.shape))
    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav', synwav, sr=hp.SR)
def world_decode_spectral_envelop(coded_sp, fs):
    # Decode mel-cepstrum back to sp
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, fs, fftlen)
    return decoded_sp
def train(self):
    # Learning-rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr

    # Start from iteration 0; skip if resuming from a checkpoint.
    start_iters = 0
    if self.resume_iters:
        pass

    # Custom feature normalizer.
    norm = Normalizer()
    # Build an iterator over the data loader.
    data_iter = iter(self.data_loader)
    print('Start training......')
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):
        # =================================================================================== #
        #                             1. Preprocess input data                                #
        # =================================================================================== #
        # Fetch real frames, speaker indices, and source labels.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except:
            # Restart the iterator once the loader is exhausted.
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target-domain labels randomly by permuting the source labels.
        rand_idx = torch.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)                    # Input features.
        label_org = label_org.to(self.device)              # Source-domain one-hot labels.
        label_trg = label_trg.to(self.device)              # Target-domain one-hot labels.
        speaker_idx_org = speaker_idx_org.to(self.device)  # Source-domain speaker indices.
        speaker_idx_trg = speaker_idx_trg.to(self.device)  # Target-domain speaker indices.

        # =================================================================================== #
        #                             2. Train the discriminator                              #
        # =================================================================================== #
        # Domain-classification loss on real audio frames (cross entropy).
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        # Reset gradient buffers, backpropagate, and update the classifier.
        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss['C/C_loss'] = cls_loss_real.item()

        # D's output on real samples with source labels.
        out_r = self.D(x_real, label_org)
        # Adversarial loss on fake audio frames; detach() stops the gradient
        # from flowing back into the generator during the D update.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        # binary_cross_entropy_with_logits applies the sigmoid internally,
        # so D can output raw logits of any shape.
        d_loss_t = F.binary_cross_entropy_with_logits(input=out_f, target=torch.zeros_like(out_f, dtype=torch.float)) + \
            F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))

        # Domain-classification loss on fake samples.
        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Gradient-penalty loss: mix real and fake samples with a random alpha.
        alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True)
        out_src = self.D(x_hat, label_trg)
        d_loss_gp = self.gradient_penalty(out_src, x_hat)

        # Total discriminator loss.
        d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        # loss['D/d_loss_t'] = d_loss_t.item()
        # loss['D/loss_cls'] = d_loss_cls.item()
        # loss['D/D_gp'] = d_loss_gp.item()
        loss['D/D_loss'] = d_loss.item()

        # =================================================================================== #
        #                               3. Train the generator                                #
        # =================================================================================== #
        if (i + 1) % self.n_critic == 0:
            # Source-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = F.binary_cross_entropy_with_logits(
                input=g_out_src, target=torch.ones_like(g_out_src, dtype=torch.float))

            # Classification loss on real samples against their source speakers.
            out_cls = self.C(x_real)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

            # Target-to-source domain: cycle-consistency loss (L1, mean absolute error);
            # the reconstruction should match the original input.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = F.l1_loss(x_reconst, x_real)

            # Source-to-source domain: identity loss; generating with the source
            # label should reproduce x_real.
            x_fake_iden = self.G(x_real, label_org)
            id_loss = F.l1_loss(x_fake_iden, x_real)

            # Backward and optimize the total generator loss.
            g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec + \
                self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss

            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss['G/loss_fake'] = g_loss_fake.item()
            loss['G/loss_rec'] = g_loss_rec.item()
            loss['G/loss_cls'] = g_loss_cls.item()
            loss['G/loss_id'] = id_loss.item()
            loss['G/g_loss'] = g_loss.item()

        # =================================================================================== #
        #                                 4. Miscellaneous                                    #
        # =================================================================================== #
        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

            if self.use_tensorboard:
                for tag, value in loss.items():
                    self.logger.scalar_summary(tag, value, i + 1)

        # Translate fixed samples for debugging; no_grad() disables gradient
        # tracking, which saves memory during inference.
        if (i + 1) % self.sample_step == 0:
            with torch.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                # Pick a random target speaker different from the source.
                target = random.choice([x for x in speakers if x != speaker])
                # Binarized one-hot label for the target speaker.
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = torch.FloatTensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                        l = torch.FloatTensor(label_t).to(self.device)
                        one_set_return = self.G(one_seg, l).data.cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE, fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{i+1}_{filename}'
                    path = os.path.join(self.sample_dir, name)
                    print(f'[save]:{path}')
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i + 1))
            D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i + 1))
            C_path = os.path.join(self.model_save_dir, '{}-C.ckpt'.format(i + 1))
            torch.save(self.G.state_dict(), G_path)
            torch.save(self.D.state_dict(), D_path)
            torch.save(self.C.state_dict(), C_path)
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (self.num_iters - self.num_iters_decay):
            g_lr -= (self.g_lr / float(self.num_iters_decay))
            d_lr -= (self.d_lr / float(self.num_iters_decay))
            c_lr -= (self.c_lr / float(self.num_iters_decay))
            self.update_lr(g_lr, d_lr, c_lr)
            print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # x, fs = sf.read('utterance/vaiueo2d.wav')
    x, fs = sf.read('utterance/p226_002.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print("fft size: {:d}".format(fft_size))
    print("coded sp shape: ({:d}, {:d})".format(code_sp.shape[0], code_sp.shape[1]))
    print("coded ap shape: ({:d}, {:d})".format(code_ap.shape[0], code_ap.shape[1]))

    # 2-5 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    #     frame_shift: 12.5 ms, frame_length: 50.0 ms
    f0_xx, t_xx = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                         channels_in_octave=2,
                         frame_period=12.5,
                         speed=args.speed)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, 12.5)
    sf.write('test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav', y_r_xx, fs)
    print("coded sp_xx shape: ({:d}, {:d})".format(code_sp_xx.shape[0], code_sp_xx.shape[1]))
    print("coded ap_xx shape: ({:d}, {:d})".format(code_ap_xx.shape[0], code_ap_xx.shape[1]))

    # Comparison
    savefig('test/wavform.png', [x, _y, y, y_h, y_r, y_r_xx])
    savefig('test/sp.png', [_sp, sp, sp_h, rest_sp, rest_sp_xx])
    savefig('test/ap.png', [_ap, ap, ap_h, rest_ap, rest_ap_xx], log=False)
    savefig('test/f0.png', [_f0, f0, f0_h, f0_xx])

    print('Please check "test" directory for output files')
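One detail worth noting in steps 2-4 and 2-5: for envelopes produced by cheaptrick with default settings, the FFT size recovered from the spectrum width equals the one CheapTrick's helper reports, so either form can be passed to decode_spectral_envelope. A quick sanity check, assuming the default f0_floor:

assert (sp.shape[1] - 1) * 2 == pw.get_cheaptrick_fft_size(fs)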
def decode_spectral_envelop(coded_spect, sampling_rate):
    fftlen = pyworld.get_cheaptrick_fft_size(sampling_rate)
    decoded_spect = pyworld.decode_spectral_envelope(coded_spect, sampling_rate, fftlen)
    return decoded_spect
sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
ap = pw.d4c(x, f0, t, fs)         # extract aperiodicity
end = timer()
print('Feature Extraction:', end - start, 'seconds')

# f0_new
from copy import deepcopy  # to avoid call by reference!!
f0_new = deepcopy(f0)

# segments: 1-58, 59-138, 139-198 // 269-360 // 429-522
f0_new[1:198] = np.flip(f0_new[1:198], 0)  # reverse pitch
f0_new[269:360] = f0_new[269:360] + 62     # E (330 Hz) -> G (392 Hz)
f0_new[429:522] = f0_new[429:522] + 193    # E (330 Hz) -> G (523 Hz)

#%% reduce dimension of spectral envelope and aperiodicity.
enc_sp = pw.code_spectral_envelope(sp, fs, number_of_dimensions=32)
dec_sp = pw.decode_spectral_envelope(enc_sp, fs, fft_size=(sp.shape[1] - 1) * 2)
enc_ap = pw.code_aperiodicity(ap, fs)
dec_ap = pw.decode_aperiodicity(enc_ap, fs, fft_size=(ap.shape[1] - 1) * 2)

#%%
y = pw.synthesize(f0, sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis.wav', y, fs)

#%%
y = pw.synthesize(f0, dec_sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis_sp_decode_32.wav', y, fs)

#%% synthesis using new f0
y = pw.synthesize(f0_new, sp, ap, fs)
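A side note on the additive offsets above: adding a fixed number of Hz changes the musical interval as the underlying pitch moves, because pitch intervals are ratios. A multiplicative semitone shift keeps the interval constant across the span; a minimal sketch, with the semitone count inferred from the Hz values in the comment:

n_semitones = 3  # roughly E -> G for the 330 Hz region
f0_new[269:360] = f0_new[269:360] * 2 ** (n_semitones / 12)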
def train(self):
    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr

    start_iters = 0
    if self.resume_iters:
        pass

    norm = Normalizer()
    data_iter = iter(self.data_loader)
    print("Start training......")
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):
        # Preprocess input data: fetch real frames and labels.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target domain labels randomly.
        rand_idx = flow.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)
        label_org = label_org.to(self.device)  # Original domain one-hot labels.
        label_trg = label_trg.to(self.device)  # Target domain one-hot labels.
        speaker_idx_org = speaker_idx_org.to(self.device)
        speaker_idx_trg = speaker_idx_trg.to(self.device)

        # Train the discriminator.
        # Compute loss with real audio frames.
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss["C/C_loss"] = cls_loss_real.item()

        out_r = self.D(x_real, label_org)
        # Compute loss with fake audio frames.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        d_loss_t = nn.BCEWithLogitsLoss()(input=out_f, target=flow.zeros_like(out_f).float()) + \
            nn.BCEWithLogitsLoss()(input=out_r, target=flow.ones_like(out_r).float())

        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Compute loss for gradient penalty.
        alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = ((alpha * x_real + (1 - alpha) * x_fake).detach().requires_grad_(True))
        out_src = self.D(x_hat, label_trg)
        # TODO: Second-order derivation is not currently supported in oneflow,
        # so gradient penalty cannot be used temporarily.
        if self.use_gradient_penalty:
            d_loss_gp = self.gradient_penalty(out_src, x_hat)
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
        else:
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        loss["D/D_loss"] = d_loss.item()

        # Train the generator.
        if (i + 1) % self.n_critic == 0:
            # Original-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = nn.BCEWithLogitsLoss()(input=g_out_src,
                                                 target=flow.ones_like(g_out_src).float())

            out_cls = self.C(x_real)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

            # Target-to-original domain.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = nn.L1Loss()(x_reconst, x_real)

            # Original-to-Original domain (identity).
            x_fake_iden = self.G(x_real, label_org)
            id_loss = nn.L1Loss()(x_fake_iden, x_real)

            # Backward and optimize.
            g_loss = (g_loss_fake + self.lambda_cycle * g_loss_rec +
                      self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss)

            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss["G/loss_fake"] = g_loss_fake.item()
            loss["G/loss_rec"] = g_loss_rec.item()
            loss["G/loss_cls"] = g_loss_cls.item()
            loss["G/loss_id"] = id_loss.item()
            loss["G/g_loss"] = g_loss.item()

        # Miscellaneous: print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

        # Translate fixed samples for debugging.
        if (i + 1) % self.sample_step == 0:
            with flow.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                target = random.choice([x for x in speakers if x != speaker])
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                for filename, content in d.items():
                    f0 = content["f0"]
                    ap = content["ap"]
                    sp_norm_pad = self.pad_coded_sp(content["coded_sp_norm"])

                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = flow.Tensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                        l = flow.Tensor(label_t).to(self.device)
                        one_set_return = self.G(one_seg, l).detach().cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content["coded_sp_norm"].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE, fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f"{speaker}-{target}_iter{i+1}_{filename}"
                    path = os.path.join(self.sample_dir, name)
                    print(f"[save]:{path}")
                    sf.write(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, "{}-G".format(i + 1))
            D_path = os.path.join(self.model_save_dir, "{}-D".format(i + 1))
            C_path = os.path.join(self.model_save_dir, "{}-C".format(i + 1))
            flow.save(self.G.state_dict(), G_path)
            flow.save(self.D.state_dict(), D_path)
            flow.save(self.C.state_dict(), C_path)
            print("Saved model checkpoints into {}...".format(self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (self.num_iters - self.num_iters_decay):
            g_lr -= self.g_lr / float(self.num_iters_decay)
            d_lr -= self.d_lr / float(self.num_iters_decay)
            c_lr -= self.c_lr / float(self.num_iters_decay)
            self.update_lr(g_lr, d_lr, c_lr)
            print("Decayed learning rates, g_lr: {}, d_lr: {}.".format(g_lr, d_lr))
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment (multi-hop attention)
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                             global_step)

            # Save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)
        tag = "averaged_alignment"
        writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                         global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        if hparams.vocoder != "world":
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            writer.add_image("Predicted mel spectrogram", mel_output, global_step)
        else:
            mel_output_prep = mel_output
            try:
                writer.add_image("Predicted WORLD output", mel_output_prep, global_step)
            except:
                pass

            # Decode the predicted WORLD features (f0, coded sp, coded ap) and resynthesize
            mel_output = denormalize(mel_output)
            nfft = pw.get_cheaptrick_fft_size(hparams.sample_rate)
            f0 = mel_output[:, 0].astype(np.float64)
            sp = pw.decode_spectral_envelope(
                mel_output[:, 1:(hparams.coded_env_dim + 1)].astype(np.float64),
                hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(
                mel_output[:, (hparams.coded_env_dim + 1):hparams.num_mels].astype(np.float64),
                hparams.sample_rate, nfft)
            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            path = join(checkpoint_dir, "step{:09d}_out.wav".format(global_step))
            audio.save_wav(signal, path)

            try:
                signal /= np.max(np.abs(signal))
                # `fs` was undefined here in the original; use hparams.sample_rate
                writer.add_audio("Target audio signal", signal, global_step,
                                 sample_rate=hparams.sample_rate)
            except:
                print("Unexpected error :", sys.exc_info())

            # Resynthesize the target WORLD features for comparison
            mel_tgt = mel[idx].cpu().data.numpy()
            mel_tgt = denormalize(mel_tgt)
            f0 = mel_tgt[:, 0].astype(np.float64)
            sp = pw.decode_spectral_envelope(
                mel_tgt[:, 1:(hparams.coded_env_dim + 1)].astype(np.float64),
                hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(
                mel_tgt[:, (hparams.coded_env_dim + 1):hparams.num_mels].astype(np.float64),
                hparams.sample_rate, nfft)
            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step,
                                 sample_rate=hparams.sample_rate)
            except:
                print("Unexpected error :", sys.exc_info())

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram, global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
        try:
            # `fs` was undefined here as well; use hparams.sample_rate
            writer.add_audio("Predicted audio signal", signal, global_step,
                             sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)

    # Dump target / predicted mel features as .npy
    path = join(checkpoint_dir, "step{:09d}_mel_target.npy".format(global_step))
    mel_output = mel[idx].cpu().data.numpy()
    np.save(path, denormalize(mel_output))

    path = join(checkpoint_dir, "step{:09d}_mel_out.npy".format(global_step))
    mel_output = denormalize(mel_outputs[idx].cpu().data.numpy())
    np.save(path, mel_output)
def train(self):
    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr

    start_iters = 0
    if self.resume_iters:
        pass

    norm = Normalizer()
    data_iter = iter(self.data_loader)
    print('Start training......')
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):
        # =================================================================================== #
        #                             1. Preprocess input data                                #
        # =================================================================================== #
        # Fetch real images and labels.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target domain labels randomly.
        rand_idx = torch.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)                    # Input images.
        label_org = label_org.to(self.device)              # Original domain one-hot labels.
        label_trg = label_trg.to(self.device)              # Target domain one-hot labels.
        speaker_idx_org = speaker_idx_org.to(self.device)  # Original domain labels.
        speaker_idx_trg = speaker_idx_trg.to(self.device)  # Target domain labels.

        # =================================================================================== #
        #                             2. Train the discriminator                              #
        # =================================================================================== #
        # Compute loss with real audio frame.
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss['C/C_loss'] = cls_loss_real.item()

        out_r = self.D(x_real, label_org)
        # Compute loss with fake audio frame.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        d_loss_t = F.binary_cross_entropy_with_logits(input=out_f, target=torch.zeros_like(out_f, dtype=torch.float)) + \
            F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))

        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Compute loss for gradient penalty.
        alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True)
        out_src = self.D(x_hat, label_trg)
        d_loss_gp = self.gradient_penalty(out_src, x_hat)

        d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        # loss['D/d_loss_t'] = d_loss_t.item()
        # loss['D/loss_cls'] = d_loss_cls.item()
        # loss['D/D_gp'] = d_loss_gp.item()
        loss['D/D_loss'] = d_loss.item()

        # =================================================================================== #
        #                               3. Train the generator                                #
        # =================================================================================== #
        if (i + 1) % self.n_critic == 0:
            # Original-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = F.binary_cross_entropy_with_logits(
                input=g_out_src, target=torch.ones_like(g_out_src, dtype=torch.float))

            out_cls = self.C(x_fake)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Target-to-original domain.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = F.l1_loss(x_reconst, x_real)

            # Original-to-Original domain (identity).
            x_fake_iden = self.G(x_real, label_org)
            id_loss = F.l1_loss(x_fake_iden, x_real)

            # Backward and optimize.
            g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec + \
                self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss

            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss['G/loss_fake'] = g_loss_fake.item()
            loss['G/loss_rec'] = g_loss_rec.item()
            loss['G/loss_cls'] = g_loss_cls.item()
            loss['G/loss_id'] = id_loss.item()
            loss['G/g_loss'] = g_loss.item()

        # =================================================================================== #
        #                                 4. Miscellaneous                                    #
        # =================================================================================== #
        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

            if self.use_tensorboard:
                for tag, value in loss.items():
                    self.logger.scalar_summary(tag, value, i + 1)

        # Translate fixed images for debugging.
        if (i + 1) % self.sample_step == 0:
            with torch.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                target = random.choice([x for x in speakers if x != speaker])
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = torch.FloatTensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                        l = torch.FloatTensor(label_t).to(self.device)
                        one_set_return = self.G(one_seg, l).data.cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE, fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{i+1}_{filename}'
                    path = os.path.join(self.sample_dir, name)
                    print(f'[save]:{path}')
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i + 1))
            D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i + 1))
            C_path = os.path.join(self.model_save_dir, '{}-C.ckpt'.format(i + 1))
            torch.save(self.G.state_dict(), G_path)
            torch.save(self.D.state_dict(), D_path)
            torch.save(self.C.state_dict(), C_path)
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (self.num_iters - self.num_iters_decay):
            g_lr -= (self.g_lr / float(self.num_iters_decay))
            d_lr -= (self.d_lr / float(self.num_iters_decay))
            c_lr -= (self.c_lr / float(self.num_iters_decay))
            self.update_lr(g_lr, d_lr, c_lr)
            print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))
# output_dir = './data/processed'
#
# parser.add_argument('--input_dir', type=str, help='the directory containing the data to be processed', default=input_dir)
# parser.add_argument('--output_dir', type=str, help='the directory storing the processed data', default=output_dir)
#
# argv = parser.parse_args()
# input_dir = argv.input_dir
# output_dir = argv.output_dir
#
# os.makedirs(output_dir, exist_ok=True)
#
# wav_to_mcep_file(input_dir, SAMPLE_RATE, processed_filepath=output_dir)
#
# # input_dir is the train dataset. We need to calculate and save the speech
# # statistics for each speaker.
# generator = GenerateStatistics(output_dir)
# generator.generate_stats()
# generator.normalize_dataset()
# end = datetime.now()
# print(f"[Running Time]: {end-start}")

data_dir = '../data/audio/'
sample = data_dir + 'Ses01F_impro01_F000.wav'
wav = librosa.load(sample, sr=sr, mono=True, dtype=np.float64)[0]
f0, ap, sp, coded_sp = call_mcep(wav)
decoded_sp = decode_spectral_envelope(coded_sp, SAMPLE_RATE, fft_size=FFTSIZE)
# f0_converted = norm.pitch_conversion(f0, speaker, target)
# Resynthesize with the original f0, since no pitch conversion is applied here
wav2 = synthesize(f0, decoded_sp, ap, SAMPLE_RATE)
audio_utils.save(wav2, './')
    out_name = key.replace("/", "_")
else:
    out_name = key
out_wavfile = tspk_dir / f"{out_name}.wav"
out_wavfile.parent.mkdir(exist_ok=True, parents=True)

# test generator
mcep_T = np.asarray(mcep.T, dtype=np.float32)
mcep_T = mcep_T.reshape((1, *mcep_T.shape))
gen_mcep_var = generator(mcep_T, tspk_lab)
gen_mcep = gen_mcep_var[0].data.T
denorm_gen_mcep = denorm_mcep(gen_mcep, f0, mcep_mean[tspk], mcep_std[tspk])
denorm_gen_mcep = signal.medfilt(denorm_gen_mcep, (5, 1))
conved_f0 = conv_f0(f0, logf0_mean[tspk], logf0_std[tspk])
specenv = pw.decode_spectral_envelope(denorm_gen_mcep, args.samplerate, args.fftsize)
x = pw.synthesize(conved_f0, specenv, ap, args.samplerate,
                  frame_period=args.frame_period * 1000)
x = x / max(abs(x)) * 30000
x = x.astype(np.int16)
wavfile.write(out_wavfile, args.samplerate, x)

# test discriminator
# test real data
if fspk not in real_flags:
    real_datas[key] = np.squeeze(
        adverserial_discriminator(mcep_T, fspk_lab, dp_ratio=0.0)[1].data)
# test fake data
fake_datas[key] = np.squeeze(
    adverserial_discriminator(gen_mcep_var, tspk_lab, dp_ratio=0.0)[1].data)

# save values of discriminator of fake data (fspk -> tspk)
plt.clf()
import pyworld

IN_WAVE_FILE = "in.wav"    # input speech
OUT_WAVE_FILE = "out.wav"  # analysis/resynthesis result
SP_DIM = 50                # dimensionality of the compressed spectral envelope

# Load the speech
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Analyze the speech (fundamental frequency, spectral envelope, aperiodicity)
f0, sp, ap = pyworld.wav2world(x, fs)
fft_size = pyworld.get_cheaptrick_fft_size(fs)

# Encode / decode the spectral envelope
# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
code_sp = pyworld.code_spectral_envelope(sp, fs, SP_DIM)
decode_sp = pyworld.decode_spectral_envelope(code_sp, fs, fft_size)

# Encode / decode the aperiodicity
code_ap = pyworld.code_aperiodicity(ap, fs)
decode_ap = pyworld.decode_aperiodicity(code_ap, fs, fft_size)

# Resynthesize the speech
y = pyworld.synthesize(f0, decode_sp, decode_ap, fs)
y = y.astype(np.int16)

# Write the speech
wavfile.write(OUT_WAVE_FILE, fs, y)
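Since the input is read as int16, `x` keeps the ±32768 scale through analysis and the resynthesized `y` is cast straight back. If resynthesis overshoots that range, the bare cast wraps around; clipping first is a cheap safeguard (an addition, not part of the original snippet):

y = np.clip(y, -32768, 32767).astype(np.int16)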
def inference(self, dataset, rank_size=1):
    if dataset is None:
        print("convert dataset error!")
        return

    for i, samples in enumerate(dataset):
        samples = self.model.prepare_samples(samples)
        src_coded_sp, src_speaker_onehot, src_f0, src_ap = samples["src_coded_sp"], \
            samples["src_speaker"], samples["src_f0"], samples["src_ap"]
        tar_speaker_onehot = samples["tar_speaker"]

        # Map the ids to the speakers' names
        src_id, tar_id = samples["src_id"], samples["tar_id"]
        src_id, tar_id = int(src_id), int(tar_id)
        src_speaker = self.speakers_ids_dict[src_id]
        tar_speaker = self.speakers_ids_dict[tar_id]

        src_wav_filename = samples["src_wav_filename"]
        src_filename = src_wav_filename.numpy()[0].decode().replace(".npz", "")

        gen_coded_sp = self.model.convert(src_coded_sp, tar_speaker_onehot)
        gen_coded_sp = tf.transpose(tf.squeeze(gen_coded_sp), [1, 0])
        coded_sp = self.feature_normalizer(gen_coded_sp, str(tar_speaker), reverse=True)

        def apply_f0_cmvn(cmvn_dict, feat_data, src_speaker, tar_speaker):
            if tar_speaker not in cmvn_dict:
                print("tar_speaker not in cmvn_dict!")
                return feat_data
            f0 = feat_data.numpy()
            src_mean = cmvn_dict[src_speaker][2]
            src_var = cmvn_dict[src_speaker][3]
            tar_mean = cmvn_dict[tar_speaker][2]
            tar_var = cmvn_dict[tar_speaker][3]
            f0_converted = np.exp((np.ma.log(f0) - src_mean) / np.sqrt(src_var)
                                  * np.sqrt(tar_var) + tar_mean)
            return f0_converted

        f0 = apply_f0_cmvn(self.feature_normalizer.cmvn_dict, src_f0,
                           str(src_speaker), str(tar_speaker))

        # Restoration of sp characteristics
        c = []
        for one_slice in coded_sp:
            one_slice = np.ascontiguousarray(one_slice, dtype=np.float64).reshape(1, -1)
            decoded_sp = pyworld.decode_spectral_envelope(one_slice, self.fs,
                                                          fft_size=self.fft_size)
            c.append(decoded_sp)
        sp = np.concatenate(c, axis=0)

        f0 = np.squeeze(f0, axis=(0,)).astype(np.float64)
        src_ap = np.squeeze(src_ap.numpy(), axis=(0,)).astype(np.float64)
        # Remove the extra padding at the end of the sp feature
        sp = sp[:src_ap.shape[0], :]

        # sp: (T, fft_size//2+1), f0: (T,), ap: (T, fft_size//2+1)
        synwav = pyworld.synthesize(f0, sp, src_ap, self.fs)

        wavname = src_speaker + "_" + tar_speaker + "_" + src_filename + ".wav"
        wavfolder = os.path.join(self.hparams.output_directory)
        if not os.path.exists(wavfolder):
            os.makedirs(wavfolder)
        wavpath = os.path.join(wavfolder, wavname)
        librosa.output.write_wav(wavpath, synwav, sr=self.fs)
        print("generate wav:", wavpath)
else:
    sentences = args.sentences
print(f"sentences: {sentences}")

for s, snt in enumerate(sentences):
    feature, gen_letter_stateseq = feat_generator.generate(snt)
    mcep = mcep_generator.generate(feature, args.target_speaker)
    ap = ap_generator.generate(gen_letter_stateseq)
    f0 = f0_generator.generate(gen_letter_stateseq)
    f0[f0 < 0] = 0

    mcep = denorm_mcep(mcep, mcep_min, mcep_max)
    mcep = signal.medfilt(mcep, (5, 1))
    mcep = mcep.astype(float, order="C")
    decoded_sp = pw.decode_spectral_envelope(mcep, args.samplerate, args.fftsize)

    synthesized = pw.synthesize(f0, decoded_sp, ap, args.samplerate,
                                frame_period=args.frame_period * 1000)
    synthesized = synthesized / max(abs(synthesized)) * 30000

    args.output_prefix.parent.mkdir(parents=True, exist_ok=True)
    out_file = args.output_prefix.with_name(
        f"{args.output_prefix.name}_{s:02d}_({'_'.join(map(str, snt))}).wav")
    wavfile.write(out_file, args.samplerate, synthesized.astype(np.int16))