def _prepare_sp(self, path):
    sp = np.load(str(path))
    sp = self._normalize(encode_sp(sp, sr=22050, mel_bins=36))
    sp = self._crop(sp, upper_bound=128)

    return sp
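# encode_sp, the normalization, and the crop are not defined in this file.
# Below is a minimal sketch of plausible implementations, assuming a librosa
# mel filterbank, per-band standardization, and a random fixed-length window;
# the repository's actual helpers may differ. _normalize/_crop would delegate
# to functions like these, and the training loop appears to use a normalize
# variant that returns only the array, without the statistics.
import numpy as np
import librosa


def encode_sp(sp, sr=22050, mel_bins=36):
    # Project a WORLD spectral envelope (frames x fft_bins) onto mel_bins
    # mel bands; the FFT size is recovered from the envelope width.
    fft_size = (sp.shape[1] - 1) * 2
    mel_basis = librosa.filters.mel(sr=sr, n_fft=fft_size, n_mels=mel_bins)

    return np.dot(sp, mel_basis.T)


def normalize(sp):
    # Standardize each mel band; the inference script also keeps the
    # statistics so the normalization can be undone after conversion.
    mean, std = sp.mean(axis=0), sp.std(axis=0) + 1e-8

    return (sp - mean) / std, mean, std


def crop(sp, upper_bound=128):
    # Random window of upper_bound consecutive frames (assumes the clip
    # is at least upper_bound frames long).
    start = np.random.randint(0, sp.shape[0] - upper_bound + 1)

    return sp[start:start + upper_bound]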
content_discriminator = ContentDiscriminator()
content_discriminator.to_gpu()
dis_con_opt = set_optimizer(content_discriminator, alpha=0.0001)

for epoch in range(epochs):
    sum_gen_loss = 0
    sum_dis_loss = 0
    for batch in range(0, Ntrain, batchsize):
        x_sp_box = []
        y_sp_box = []
        for _ in range(batchsize):
            # sp loading -> mel conversion -> normalization -> crop
            rnd_x = np.random.randint(x_len)
            sp_x = np.load(x_path + x_list[rnd_x])
            sp_x = normalize(encode_sp(sp_x, mel_bins=36))
            rnd_y = np.random.randint(y_len)
            sp_y = np.load(y_path + y_list[rnd_y])
            sp_y = normalize(encode_sp(sp_y, mel_bins=36))

            sp_x = crop(sp_x, upper_bound=128)
            sp_y = crop(sp_y, upper_bound=128)

            x_sp_box.append(sp_x[np.newaxis, :])
            y_sp_box.append(sp_y[np.newaxis, :])

        x = chainer.as_variable(xp.array(x_sp_box).astype(xp.float32))
        y = chainer.as_variable(xp.array(y_sp_box).astype(xp.float32))

        # Discriminator update
        a_out, b_out = generator(x, y)
        a_enc, _, a_fake, _, a_infer = a_out
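# set_optimizer is not shown here. A minimal sketch using the standard
# Chainer API is given below; chainer.optimizers.Adam and the setup() call
# are real Chainer, but beta1=0.5 (a common choice for GAN training) is an
# assumption, since only alpha appears in the calling code.
import chainer


def set_optimizer(model, alpha=0.0002, beta1=0.5):
    optimizer = chainer.optimizers.Adam(alpha=alpha, beta1=beta1)
    optimizer.setup(model)

    return optimizer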
serializers.load_npz('./generator_xy.model', generator_xy)
#serializers.load_npz('./generator_mtok.model', generator_xy)

x_sp_box = []
rnd = np.random.randint(x_len)
wav = x_wav_path + x_list[rnd]
#wav = './Dataset/jsut_ver1.1/basic5000/wav16/BASIC5000_3895.wav'
print(wav)

xx = load(wav, sampling_rate=16000)
f0, sp, ap = audio2af(xx)

# Log-Gaussian F0 conversion: standardize log F0 with the source speaker's
# statistics (mean 5.35, std 0.24), then rescale to the target speaker's
# (std 0.27, mean 5.67). The masked log skips unvoiced (f0 == 0) frames.
f0_tmp = np.exp((np.ma.log(f0) - 5.35) / 0.24 * 0.27 + 5.67)
#f0_tmp = np.exp((np.ma.log(f0) - 6.20) / 0.29 * 0.27 + 5.67)

sp, mean, std = normalize(encode_sp(sp))
sp_y = np.zeros_like(sp, dtype=np.float64)
length = sp.shape[0]

# Split the utterance into 128-frame chunks, zero-padding the final one.
# ceil(length / 128) iterations, so no extra all-zero chunk is appended
# when length is an exact multiple of 128.
for index in range((length + 127) // 128):
    if 128 * (index + 1) > length:
        sp_tmp = sp[128 * index:length]
        sp_tmp = np.pad(sp_tmp, ((0, 128 - sp_tmp.shape[0]), (0, 0)),
                        'constant', constant_values=0)
    else:
        sp_tmp = sp[128 * index:128 * (index + 1)]
    x_sp_box.append(sp_tmp[np.newaxis, :])
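# What typically follows this chunking step, sketched under assumptions:
# run the loaded generator over the stacked chunks, undo the normalization,
# invert the mel projection with a pseudo-inverse filterbank, and resynthesize
# with WORLD. pyworld.synthesize() and librosa.filters.mel() are real APIs;
# the generator call signature, the 1024-point FFT (pyworld's default for
# 16 kHz), and the pseudo-inverse mel decoding are assumptions.
import librosa
import numpy as np
import pyworld as pw

x = chainer.as_variable(np.array(x_sp_box).astype(np.float32))
y = generator_xy(x)
y = y.data.reshape(-1, y.data.shape[-1])[:length]  # drop the chunk padding
y = y * std + mean                                 # undo the normalization

mel_basis = librosa.filters.mel(sr=16000, n_fft=1024, n_mels=y.shape[1])
sp_y = np.maximum(np.dot(y, np.linalg.pinv(mel_basis.T)), 1e-16)
wav_out = pw.synthesize(
    np.ascontiguousarray(f0_tmp.filled(0.0), dtype=np.float64),
    np.ascontiguousarray(sp_y, dtype=np.float64),
    np.ascontiguousarray(ap, dtype=np.float64),
    16000)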