def test(self):
    """Translate speech using StarGAN."""
    # Load the trained generator.
    self.restore_model(self.test_iters)
    norm = Normalizer()

    # Set data loader.
    d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
    targets = self.trg_speaker

    for target in targets:
        print(target)
        assert target in speakers
        # Expand the binarized label into an explicit two-class one-hot
        # vector (two-speaker case).
        label_t = self.spk_enc.transform([target])[0]
        if label_t == [0]:
            label_t = [1, 0]
        elif label_t == [1]:
            label_t = [0, 1]
        label_t = np.asarray([label_t])

        with torch.no_grad():
            for filename, content in d.items():
                f0 = content['f0']
                ap = content['ap']
                sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                # Convert the utterance in fixed-size windows of FRAMES frames.
                convert_result = []
                for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                    one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                    one_seg = torch.FloatTensor(one_seg).to(self.device)
                    one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                    l = torch.FloatTensor(label_t).to(self.device)

                    one_set_return = self.G(one_seg, l).data.cpu().numpy()
                    one_set_return = np.squeeze(one_set_return)
                    one_set_return = norm.backward_process(one_set_return, target)
                    convert_result.append(one_set_return)

                # Stitch the converted windows back together and trim the padding.
                convert_con = np.concatenate(convert_result, axis=1)
                convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE,
                                                      fft_size=FFTSIZE)
                f0_converted = norm.pitch_conversion(f0, speaker, target)
                wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                name = f'{speaker}-{target}_iter{self.test_iters}_{filename}'
                path = os.path.join(self.result_dir, name)
                print(f'[save]:{path}')
                librosa.output.write_wav(path, wav, SAMPLE_RATE)
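# The segmentation loop in test() assumes self.pad_coded_sp pads the frame
# axis of the normalized coded spectrogram up to a multiple of FRAMES, so the
# fixed-size windows tile the utterance exactly. A minimal sketch of such a
# helper (an assumption; the real method may pad with a different strategy):
def pad_coded_sp(coded_sp_norm, frames=FRAMES):
    """Right-pad a (features, n_frames) array with zeros to a multiple of `frames`."""
    pad_len = (frames - coded_sp_norm.shape[1] % frames) % frames
    return np.pad(coded_sp_norm, ((0, 0), (0, pad_len)), 'constant')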
class TestSet(object):
    """Loads WORLD features for one source speaker's test utterances."""

    def __init__(self, datadir: str):
        super(TestSet, self).__init__()
        self.datadir = datadir
        self.norm = Normalizer()

    def choose(self):
        '''choose one speaker for test'''
        r = random.choice(speakers)
        return r

    def test_data(self, src_speaker=None):
        '''choose one speaker for conversion'''
        if src_speaker:
            r_s = src_speaker
        else:
            r_s = self.choose()

        p = os.path.join(self.datadir, r_s)
        wavfiles = librosa.util.find_files(p, ext='wav')

        res = {}
        for f in wavfiles:
            filename = os.path.basename(f)
            wav, _ = librosa.load(f, sr=SAMPLE_RATE, dtype=np.float64)
            f0, timeaxis, sp, ap, coded_sp = world_features(
                wav, SAMPLE_RATE, FFTSIZE, FEATURE_DIM)
            coded_sp_norm = self.norm.forward_process(coded_sp.T, r_s)

            if filename not in res:
                res[filename] = {}
            res[filename]['coded_sp_norm'] = np.asarray(coded_sp_norm)
            res[filename]['f0'] = np.asarray(f0)
            res[filename]['ap'] = np.asarray(ap)

        return res, r_s
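# Example usage of the loader above (assumes `speakers` and the feature
# constants are configured as in the rest of the project; the directory path
# is hypothetical):
if __name__ == '__main__':
    testset = TestSet('./data/test')
    data, src = testset.test_data()  # random source speaker
    for fname, feats in data.items():
        print(fname, feats['coded_sp_norm'].shape, feats['f0'].shape)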
class TestSet(object):
    """Variant loader: MCEP features for one source speaker's test utterances."""

    def __init__(self, data_dir: str, sr: int):
        super(TestSet, self).__init__()
        self.data_dir = data_dir
        self.norm = Normalizer()
        self.sample_rate = sr

    def choose(self):
        """Choose one speaker at random for testing."""
        r = choice(speakers)
        return r

    def test_data(self, src_speaker=None):
        """Collect features for the given (or a randomly chosen) source speaker."""
        if src_speaker:
            r_s = src_speaker
        else:
            r_s = self.choose()

        p = os.path.join(self.data_dir, r_s)
        wavfiles = librosa.util.find_files(p, ext='wav')

        res = {}
        for f in wavfiles:
            filename = os.path.basename(f)
            wav, _ = librosa.load(f, sr=self.sample_rate, dtype=np.float64)
            f0, ap, mcep = cal_mcep(wav, self.sample_rate, FEATURE_DIM,
                                    FFTSIZE, SHIFTMS, ALPHA)
            mcep_norm = self.norm.forward_process(mcep, r_s)

            if filename not in res:
                res[filename] = {}
            res[filename]['mcep_norm'] = np.asarray(mcep_norm)
            res[filename]['f0'] = np.asarray(f0)
            res[filename]['ap'] = np.asarray(ap)

        return res, r_s
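# Both TestSet variants delegate feature scaling to Normalizer. A minimal
# sketch of a per-speaker z-score normalizer (an assumption; the real class
# presumably loads precomputed statistics from disk rather than taking them
# as a constructor argument):
class Normalizer(object):
    def __init__(self, stats):
        # stats: {speaker: (mean, std)} computed over that speaker's features.
        self.stats = stats

    def forward_process(self, x, speaker):
        mean, std = self.stats[speaker]
        return (x - mean) / std

    def backward_process(self, x, speaker):
        mean, std = self.stats[speaker]
        return x * std + mean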
class TestSet(object):
    """Annotated walkthrough of the test-data loader."""

    def __init__(self, datadir: str):
        super(TestSet, self).__init__()
        self.datadir = datadir
        # The norm attribute is the data-normalization helper.
        self.norm = Normalizer()

    def choose(self):
        """Choose one speaker for testing."""
        # Pick a random speaker from the global `speakers` sequence.
        r = random.choice(speakers)
        return r

    def test_data(self, src_speaker=None):
        """Choose one speaker for conversion."""
        # If a source speaker was passed in, use it; otherwise pick one at
        # random via choose().
        if src_speaker:
            r_s = src_speaker
        else:
            r_s = self.choose()

        # Join the speaker name onto the dataset path: the dataset is organized
        # into per-speaker subdirectories, so this is how each speaker's subset
        # is located. A differently organized dataset would need different
        # handling here.
        p = os.path.join(self.datadir, r_s)
        # Collect all wav files under that path.
        wavfiles = librosa.util.find_files(p, ext='wav')

        res = {}
        for f in wavfiles:
            filename = os.path.basename(f)
            # librosa.load takes the file path and a sample rate `sr` (pass
            # None to keep the native rate); `dtype` sets the precision. It
            # returns the audio time series and the sample rate; the rate is
            # unused here, so it is discarded into `_`. Other options include
            # mono (convert to a single channel), offset (start reading after
            # this many seconds), and duration (load only this many seconds).
            wav, _ = librosa.load(f, sr=SAMPLE_RATE, dtype=np.float64)
            # Extract WORLD features with the custom world_features helper.
            f0, timeaxis, sp, ap, coded_sp = world_features(
                wav, SAMPLE_RATE, FFTSIZE, FEATURE_DIM)
            # Normalize the coded spectral envelope with forward_process.
            coded_sp_norm = self.norm.forward_process(coded_sp.T, r_s)

            # Store the features in a dict keyed by filename.
            if filename not in res:
                res[filename] = {}
            res[filename]['coded_sp_norm'] = np.asarray(coded_sp_norm)
            res[filename]['f0'] = np.asarray(f0)
            res[filename]['ap'] = np.asarray(ap)

        # Return the processed feature dict and the source-speaker label.
        return res, r_s
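# The annotated loader above calls the custom world_features helper. A
# minimal sketch built on the pyworld package (an assumption: that this
# project uses the pyworld WORLD-vocoder binding, which also provides the
# decode_spectral_envelope and synthesize calls used in test()):
import pyworld

def world_features(wav, fs, fft_size, dim):
    f0, timeaxis = pyworld.harvest(wav, fs)                            # F0 contour
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)  # spectral envelope
    ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=fft_size)         # aperiodicity
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim)             # (n_frames, dim)
    return f0, timeaxis, sp, ap, coded_sp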
def train(self):
    # Learning-rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr

    # Start from iteration 0 (resuming from a checkpoint is not implemented).
    start_iters = 0
    if self.resume_iters:
        pass

    # Custom per-speaker normalizer.
    norm = Normalizer()
    # Wrap the data loader in an iterator so batches can be pulled manually.
    data_iter = iter(self.data_loader)

    print('Start training......')
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):

        # =================================================================================== #
        #                             1. Preprocess input data                                #
        # =================================================================================== #

        # Fetch real frames and labels: the batch, the speaker indices, and the
        # binarized source labels. If the iterator is exhausted, rebuild it and
        # pull the next batch.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except StopIteration:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target-domain labels randomly: torch.randperm returns a
        # random permutation of 0..n-1, used here to shuffle the source labels
        # (and, with the same indices, the source speaker indices).
        rand_idx = torch.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        # .to(...) moves tensors onto the CPU or GPU device.
        x_real = x_real.to(self.device)                    # Input frames.
        label_org = label_org.to(self.device)              # Source-domain one-hot labels.
        label_trg = label_trg.to(self.device)              # Target-domain one-hot labels.
        speaker_idx_org = speaker_idx_org.to(self.device)  # Source-domain speaker indices.
        speaker_idx_trg = speaker_idx_trg.to(self.device)  # Target-domain speaker indices.

        # =================================================================================== #
        #                             2. Train the discriminator                              #
        # =================================================================================== #

        # Domain-classification loss on real audio. nn.CrossEntropyLoss is not
        # a plain cross entropy: it fuses softmax, log, and NLLLoss into one
        # criterion.
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        # Reset the gradient buffers (see reset_grad below), backpropagate with
        # tensor.backward(), and let optimizer.step() update the classifier
        # parameters from the computed gradients.
        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging. item() extracts the Python scalar from a one-element tensor.
        loss = {}
        loss['C/C_loss'] = cls_loss_real.item()

        # D's verdict on real frames with their source labels.
        out_r = self.D(x_real, label_org)

        # Loss on fake audio frames: generate from the real batch and the
        # target labels. detach() cuts the gradient flow, so this loss does not
        # propagate back into G.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)

        # F.binary_cross_entropy_with_logits measures the binary cross entropy
        # between target and output logits. It accepts input of any shape, with
        # target of the same shape, and applies the sigmoid internally, so raw
        # logits can be passed directly; binary_cross_entropy, by contrast,
        # expects inputs already mapped to [0, 1]. torch.ones_like(input) /
        # torch.zeros_like(input) build constant targets of the same size as
        # the input, equivalent to torch.ones(input.size(), dtype=input.dtype,
        # layout=input.layout, device=input.device).
        d_loss_t = F.binary_cross_entropy_with_logits(input=out_f, target=torch.zeros_like(out_f, dtype=torch.float)) + \
                   F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))

        # Domain-classification loss (cross entropy) on the generated frames.
        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Gradient-penalty loss: mix real and fake samples with a random weight
        # to get x_hat, mark it as requiring gradients via requires_grad_(True),
        # score the mixture with D, and penalize the gradient norm at that
        # point (see gradient_penalty below).
        alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True)
        out_src = self.D(x_hat, label_trg)
        d_loss_gp = self.gradient_penalty(out_src, x_hat)

        # Total discriminator loss.
        d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp

        # Reset the gradient buffers, backpropagate D's loss, and update D.
        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        # loss['D/d_loss_t'] = d_loss_t.item()
        # loss['D/loss_cls'] = d_loss_cls.item()
        # loss['D/D_gp'] = d_loss_gp.item()
        loss['D/D_loss'] = d_loss.item()

        # =================================================================================== #
        #                               3. Train the generator                                #
        # =================================================================================== #

        # Update G only every n_critic discriminator steps.
        if (i + 1) % self.n_critic == 0:
            # Source-to-target domain: generate from the real samples and the
            # target labels, score the fakes against the target labels, and
            # compare with an all-ones target to get G's adversarial loss.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = F.binary_cross_entropy_with_logits(
                input=g_out_src,
                target=torch.ones_like(g_out_src, dtype=torch.float))

            # Classify the real samples with the domain classifier C and take
            # the cross entropy against the source speakers as G's
            # classification loss.
            out_cls = self.C(x_real)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)

            # Target-to-source domain: cycle-consistency loss. Converting the
            # fakes back with the source label should reconstruct the input;
            # l1_loss is the mean absolute error.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = F.l1_loss(x_reconst, x_real)

            # Source-to-source domain (identity-consistency loss): generating
            # from the real samples with their own source label should also
            # reproduce x_real; again measured with the L1 loss.
            x_fake_iden = self.G(x_real, label_org)
            id_loss = F.l1_loss(x_fake_iden, x_real)

            # Backward and optimize the total generator loss.
            g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec + \
                     self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss

            # Reset the gradient buffers, backpropagate G's loss, and update G.
            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Log the individual losses.
            loss['G/loss_fake'] = g_loss_fake.item()
            loss['G/loss_rec'] = g_loss_rec.item()
            loss['G/loss_cls'] = g_loss_cls.item()
            loss['G/loss_id'] = id_loss.item()
            loss['G/g_loss'] = g_loss.item()

        # =================================================================================== #
        #                                  4. Miscellaneous                                   #
        # =================================================================================== #

        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            et = str(et)[:-7]  # Drop the microseconds.
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

            # If tensorboard is enabled, record the losses there as well.
            if self.use_tensorboard:
                for tag, value in loss.items():
                    self.logger.scalar_summary(tag, value, i + 1)

        # Translate fixed samples for debugging. torch.no_grad() is a context
        # manager that disables gradient tracking inside the block: no graph is
        # stored for the backward pass, which saves a large amount of memory.
        if (i + 1) % self.sample_step == 0:
            with torch.no_grad():
                # Load a random source speaker's test data, then pick a target
                # speaker that differs from it (random.choice returns a random
                # element of its argument).
                d, speaker = TestSet(self.test_dir).test_data()
                target = random.choice([x for x in speakers if x != speaker])
                # LabelBinarizer.transform turns the class label into a binary
                # label; take the first row as the target label, then wrap it
                # in a numpy array with np.asarray.
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                # Walk the filename -> features dict.
                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    # Pad the normalized coded spectrogram to a multiple of FRAMES.
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = torch.FloatTensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                        l = torch.FloatTensor(label_t).to(self.device)

                        one_set_return = self.G(one_seg, l).data.cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    # Stitch the converted windows together, trim the padding,
                    # decode the spectral envelope, convert the pitch, and
                    # resynthesize the waveform.
                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE,
                                                          fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{i+1}_{filename}'
                    path = os.path.join(self.sample_dir, name)
                    print(f'[save]:{path}')
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i + 1))
            D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i + 1))
            C_path = os.path.join(self.model_save_dir, '{}-C.ckpt'.format(i + 1))
            torch.save(self.G.state_dict(), G_path)
            torch.save(self.D.state_dict(), D_path)
            torch.save(self.C.state_dict(), C_path)
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                self.num_iters - self.num_iters_decay):
            g_lr -= (self.g_lr / float(self.num_iters_decay))
            d_lr -= (self.d_lr / float(self.num_iters_decay))
            c_lr -= (self.c_lr / float(self.num_iters_decay))
            self.update_lr(g_lr, d_lr, c_lr)
            print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))
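# Both train() versions call self.gradient_penalty(out_src, x_hat). A minimal
# WGAN-GP style sketch (an assumption: the method follows the standard
# Gulrajani et al. form, penalizing (||dD(x_hat)/dx_hat||_2 - 1)^2):
def gradient_penalty(self, y, x):
    """Mean squared distance of the gradient norm from 1, over the batch."""
    weight = torch.ones_like(y)
    dydx = torch.autograd.grad(outputs=y,
                               inputs=x,
                               grad_outputs=weight,
                               retain_graph=True,
                               create_graph=True,
                               only_inputs=True)[0]
    dydx = dydx.view(dydx.size(0), -1)
    dydx_l2norm = torch.sqrt(torch.sum(dydx ** 2, dim=1))
    return torch.mean((dydx_l2norm - 1) ** 2)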
def __init__(self, datadir: str):
    super(TestSet, self).__init__()
    self.datadir = datadir
    self.norm = Normalizer()
def __init__(self, datadir: str):
    super(TestSet, self).__init__()
    self.datadir = datadir
    # The norm attribute is the data-normalization helper.
    self.norm = Normalizer()
def train(self):
    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr
    c_lr = self.c_lr
    start_iters = 0
    if self.resume_iters:
        pass

    norm = Normalizer()
    data_iter = iter(self.data_loader)

    print('Start training......')
    start_time = datetime.now()

    for i in range(start_iters, self.num_iters):

        # =================================================================================== #
        #                             1. Preprocess input data                                #
        # =================================================================================== #

        # Fetch real frames and labels; rebuild the iterator when exhausted.
        try:
            x_real, speaker_idx_org, label_org = next(data_iter)
        except StopIteration:
            data_iter = iter(self.data_loader)
            x_real, speaker_idx_org, label_org = next(data_iter)

        # Generate target domain labels randomly.
        rand_idx = torch.randperm(label_org.size(0))
        label_trg = label_org[rand_idx]
        speaker_idx_trg = speaker_idx_org[rand_idx]

        x_real = x_real.to(self.device)                    # Input frames.
        label_org = label_org.to(self.device)              # Original domain one-hot labels.
        label_trg = label_trg.to(self.device)              # Target domain one-hot labels.
        speaker_idx_org = speaker_idx_org.to(self.device)  # Original domain labels.
        speaker_idx_trg = speaker_idx_trg.to(self.device)  # Target domain labels.

        # =================================================================================== #
        #                             2. Train the discriminator                              #
        # =================================================================================== #

        # Compute loss with real audio frames.
        CELoss = nn.CrossEntropyLoss()
        cls_real = self.C(x_real)
        cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)

        self.reset_grad()
        cls_loss_real.backward()
        self.c_optimizer.step()

        # Logging.
        loss = {}
        loss['C/C_loss'] = cls_loss_real.item()

        out_r = self.D(x_real, label_org)

        # Compute loss with fake audio frames.
        x_fake = self.G(x_real, label_trg)
        out_f = self.D(x_fake.detach(), label_trg)
        d_loss_t = F.binary_cross_entropy_with_logits(input=out_f, target=torch.zeros_like(out_f, dtype=torch.float)) + \
                   F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))

        out_cls = self.C(x_fake)
        d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

        # Compute loss for gradient penalty.
        alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
        x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True)
        out_src = self.D(x_hat, label_trg)
        d_loss_gp = self.gradient_penalty(out_src, x_hat)

        d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp

        self.reset_grad()
        d_loss.backward()
        self.d_optimizer.step()

        # loss['D/d_loss_t'] = d_loss_t.item()
        # loss['D/loss_cls'] = d_loss_cls.item()
        # loss['D/D_gp'] = d_loss_gp.item()
        loss['D/D_loss'] = d_loss.item()

        # =================================================================================== #
        #                               3. Train the generator                                #
        # =================================================================================== #

        if (i + 1) % self.n_critic == 0:
            # Original-to-target domain.
            x_fake = self.G(x_real, label_trg)
            g_out_src = self.D(x_fake, label_trg)
            g_loss_fake = F.binary_cross_entropy_with_logits(
                input=g_out_src,
                target=torch.ones_like(g_out_src, dtype=torch.float))

            out_cls = self.C(x_fake)
            g_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Target-to-original domain.
            x_reconst = self.G(x_fake, label_org)
            g_loss_rec = F.l1_loss(x_reconst, x_real)

            # Original-to-original domain (identity).
            x_fake_iden = self.G(x_real, label_org)
            id_loss = F.l1_loss(x_fake_iden, x_real)

            # Backward and optimize.
            g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec + \
                     self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss

            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss['G/loss_fake'] = g_loss_fake.item()
            loss['G/loss_rec'] = g_loss_rec.item()
            loss['G/loss_cls'] = g_loss_cls.item()
            loss['G/loss_id'] = id_loss.item()
            loss['G/g_loss'] = g_loss.item()

        # =================================================================================== #
        #                                  4. Miscellaneous                                   #
        # =================================================================================== #

        # Print out training information.
        if (i + 1) % self.log_step == 0:
            et = datetime.now() - start_time
            et = str(et)[:-7]
            log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
            for tag, value in loss.items():
                log += ", {}: {:.4f}".format(tag, value)
            print(log)

            if self.use_tensorboard:
                for tag, value in loss.items():
                    self.logger.scalar_summary(tag, value, i + 1)

        # Translate fixed samples for debugging.
        if (i + 1) % self.sample_step == 0:
            with torch.no_grad():
                d, speaker = TestSet(self.test_dir).test_data()
                target = random.choice([x for x in speakers if x != speaker])
                label_t = self.spk_enc.transform([target])[0]
                label_t = np.asarray([label_t])

                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    convert_result = []
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = torch.FloatTensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0), one_seg.size(1))
                        l = torch.FloatTensor(label_t).to(self.device)

                        one_set_return = self.G(one_seg, l).data.cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T, dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu, SAMPLE_RATE,
                                                          fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{i+1}_{filename}'
                    path = os.path.join(self.sample_dir, name)
                    print(f'[save]:{path}')
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)

        # Save model checkpoints.
        if (i + 1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i + 1))
            D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i + 1))
            C_path = os.path.join(self.model_save_dir, '{}-C.ckpt'.format(i + 1))
            torch.save(self.G.state_dict(), G_path)
            torch.save(self.D.state_dict(), D_path)
            torch.save(self.C.state_dict(), C_path)
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))

        # Decay learning rates.
        if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                self.num_iters - self.num_iters_decay):
            g_lr -= (self.g_lr / float(self.num_iters_decay))
            d_lr -= (self.d_lr / float(self.num_iters_decay))
            c_lr -= (self.c_lr / float(self.num_iters_decay))
            self.update_lr(g_lr, d_lr, c_lr)
            print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))
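# train() also relies on self.reset_grad() and self.update_lr(...). Minimal
# sketches consistent with how they are called above (assumptions: one
# optimizer per network, each exposing standard PyTorch param_groups):
def reset_grad(self):
    """Zero the gradient buffers of all three optimizers."""
    self.g_optimizer.zero_grad()
    self.d_optimizer.zero_grad()
    self.c_optimizer.zero_grad()

def update_lr(self, g_lr, d_lr, c_lr):
    """Write the decayed learning rates back into the optimizers."""
    for param_group in self.g_optimizer.param_groups:
        param_group['lr'] = g_lr
    for param_group in self.d_optimizer.param_groups:
        param_group['lr'] = d_lr
    for param_group in self.c_optimizer.param_groups:
        param_group['lr'] = c_lr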
def __init__(self, data_dir: str, sr: int):
    super(TestSet, self).__init__()
    self.data_dir = data_dir
    self.norm = Normalizer()
    self.sample_rate = sr
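# Conversion in test() and train() relies on norm.pitch_conversion(f0, src,
# trg). A common choice is log-Gaussian normalized F0 transformation; a
# sketch assuming the Normalizer keeps per-speaker log-F0 mean/std in a
# hypothetical f0_stats mapping:
def pitch_conversion(self, f0, source_speaker, target_speaker):
    mean_src, std_src = self.f0_stats[source_speaker]  # hypothetical storage
    mean_trg, std_trg = self.f0_stats[target_speaker]
    f0_converted = np.zeros_like(f0)
    voiced = f0 > 0  # leave unvoiced frames (f0 == 0) untouched
    f0_converted[voiced] = np.exp(
        (np.log(f0[voiced]) - mean_src) / std_src * std_trg + mean_trg)
    return f0_converted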