    def test(self):
        """Translate speech using StarGAN."""
        # Load the trained generator.
        self.restore_model(self.test_iters)
        norm = Normalizer()

        # Set data loader.
        d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
        targets = self.trg_speaker

        for target in targets:
            print(target)
            assert target in speakers
            label_t = self.spk_enc.transform([target])[0]
            # With only two speakers, LabelBinarizer emits a single column;
            # expand it to an explicit two-element one-hot vector.
            if label_t == [0]:
                label_t = [1, 0]
            elif label_t == [1]:
                label_t = [0, 1]
            # Add a batch dimension.
            label_t = np.asarray([label_t])

            with torch.no_grad():

                for filename, content in d.items():
                    f0 = content['f0']
                    ap = content['ap']
                    sp_norm_pad = self.pad_coded_sp(content['coded_sp_norm'])

                    convert_result = []
                    for start_idx in range(0,
                                           sp_norm_pad.shape[1] - FRAMES + 1,
                                           FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]

                        one_seg = torch.FloatTensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0),
                                               one_seg.size(1))
                        l = torch.FloatTensor(label_t).to(self.device)
                        one_set_return = self.G(one_seg, l).data.cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(
                            one_set_return, target)
                        convert_result.append(one_set_return)

                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content['coded_sp_norm'].
                                              shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T,
                                                   dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu,
                                                          SAMPLE_RATE,
                                                          fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)

                    name = f'{speaker}-{target}_iter{self.test_iters}_{filename}'
                    path = os.path.join(self.result_dir, name)
                    print(f'[save]:{path}')
                    # librosa.output.write_wav requires librosa < 0.8; on
                    # newer versions, soundfile.write is the usual replacement.
                    librosa.output.write_wav(path, wav, SAMPLE_RATE)
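
    # Neither restore_model nor pad_coded_sp appears in this excerpt. The
    # sketches below show plausible minimal implementations, assuming the
    # checkpoint naming used by the save step in train() and a coded
    # spectrogram of shape (n_features, n_frames); they are illustrative
    # reconstructions, not the repository's exact code.
    def restore_model(self, resume_iters):
        # Load the generator weights saved at the given iteration.
        print('Loading the trained model from step {}...'.format(resume_iters))
        G_path = os.path.join(self.model_save_dir,
                              '{}-G.ckpt'.format(resume_iters))
        self.G.load_state_dict(
            torch.load(G_path, map_location=lambda storage, loc: storage))

    @staticmethod
    def pad_coded_sp(coded_sp_norm):
        # Zero-pad the time axis up to the next multiple of FRAMES so the
        # segment loops above cover the whole utterance.
        f_len = coded_sp_norm.shape[1]
        pad_len = (FRAMES - f_len % FRAMES) % FRAMES
        return np.pad(coded_sp_norm, ((0, 0), (0, pad_len)),
                      'constant', constant_values=0)
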
    def train(self):
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr
        # Start from iteration 0.
        start_iters = 0
        # Resuming from a saved iteration is not implemented here; skip.
        if self.resume_iters:
            pass
        # Custom normalizer, used later to undo feature normalization.
        norm = Normalizer()
        # iter() builds an iterator over the data loader so batches can be
        # fetched one at a time with next().
        data_iter = iter(self.data_loader)
        print('Start training......')
        # Record the start time; datetime.now() returns the current time.
        start_time = datetime.now()
        # Loop over the total number of training iterations.
        for i in range(start_iters, self.num_iters):
            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #
            # Fetch the real samples and their corresponding labels.
            try:
                # next() fetches the next batch from the iterator: real
                # features, speaker indices, and one-hot source labels.
                x_real, speaker_idx_org, label_org = next(data_iter)
            except StopIteration:
                # The iterator is exhausted; rebuild it and fetch again.
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)

            # Generate target domain labels randomly.
            # torch.randperm returns a random permutation of 0..n-1; it is
            # used here to shuffle the batch of source labels.
            rand_idx = torch.randperm(label_org.size(0))
            # Index the source labels with the permutation to get the
            # target labels.
            label_trg = label_org[rand_idx]
            # Likewise for the target speaker indices.
            speaker_idx_trg = speaker_idx_org[rand_idx]
            # Move everything to the configured device (CPU or GPU).
            x_real = x_real.to(self.device)  # Input features.
            label_org = label_org.to(self.device)  # Source-domain one-hot labels.
            label_trg = label_trg.to(self.device)  # Target-domain one-hot labels.
            speaker_idx_org = speaker_idx_org.to(self.device)  # Source speaker indices.
            speaker_idx_trg = speaker_idx_trg.to(self.device)  # Target speaker indices.

            # =================================================================================== #
            #                             2. Train the discriminator                              #
            # =================================================================================== #
            # Compute loss with real audio frames.
            # nn.CrossEntropyLoss combines log-softmax and the negative
            # log-likelihood loss in a single module.
            CELoss = nn.CrossEntropyLoss()
            # Classify the real samples with the domain classifier C.
            cls_real = self.C(x_real)
            # Domain-classification loss on real data (cross-entropy).
            cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)
            # Reset the gradient buffers (see reset_grad below).
            self.reset_grad()
            # Backpropagate through the classification loss.
            cls_loss_real.backward()
            # optimizer.step() updates the parameters from the gradients
            # computed by backward().
            self.c_optimizer.step()
            # Logging.
            loss = {}
            # item() extracts the Python scalar from a one-element tensor.
            loss['C/C_loss'] = cls_loss_real.item()

            # Discriminator output on real data with the source label.
            out_r = self.D(x_real, label_org)
            # Compute loss with fake audio frames.
            # Generate fake samples from the real samples and target labels.
            x_fake = self.G(x_real, label_trg)
            # detach() cuts the graph here so that gradients do not flow back
            # into G while the discriminator is being trained.
            out_f = self.D(x_fake.detach(), label_trg)
            # F.binary_cross_entropy_with_logits measures the binary
            # cross-entropy between target and output logits. Unlike
            # binary_cross_entropy, it applies the sigmoid internally, so raw
            # logits of any shape can be passed in; the target must have the
            # same shape. torch.zeros_like(x)/torch.ones_like(x) build
            # constant targets with the same size, dtype, layout, and device
            # as x. Real frames are pushed toward 1 and fake frames toward 0.
            d_loss_t = F.binary_cross_entropy_with_logits(input=out_f,target=torch.zeros_like(out_f, dtype=torch.float)) + \
                F.binary_cross_entropy_with_logits(input=out_r, target=torch.ones_like(out_r, dtype=torch.float))
            # Classifier output on the fake samples.
            out_cls = self.C(x_fake)
            # Domain-classification loss on fake data (cross-entropy).
            d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Compute loss for the gradient penalty (WGAN-GP).
            alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
            # Mix real and fake samples with a random weight to obtain x_hat;
            # requires_grad_(True) marks it so gradients with respect to
            # x_hat can be taken inside gradient_penalty.
            x_hat = (alpha * x_real.data +
                     (1 - alpha) * x_fake.data).requires_grad_(True)
            # Discriminator output on the interpolated samples.
            out_src = self.D(x_hat, label_trg)
            # Penalty on the gradient norm of D at the interpolated points.
            d_loss_gp = self.gradient_penalty(out_src, x_hat)
            # Total discriminator loss (5 is the hard-coded penalty weight).
            d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
            # Reset the gradient buffers.
            self.reset_grad()
            # Backpropagate the discriminator loss.
            d_loss.backward()
            # Update the discriminator parameters.
            self.d_optimizer.step()

            # loss['D/d_loss_t'] = d_loss_t.item()
            # loss['D/loss_cls'] = d_loss_cls.item()
            # loss['D/D_gp'] = d_loss_gp.item()
            # Log the total discriminator loss.
            loss['D/D_loss'] = d_loss.item()

            # =================================================================================== #
            #                               3. Train the generator                                #
            # =================================================================================== #
            # Update G only once every n_critic discriminator updates.
            if (i + 1) % self.n_critic == 0:
                # Original-to-target domain.
                # Generate fake samples from real samples and target labels.
                x_fake = self.G(x_real, label_trg)
                # Discriminator output on the fakes with the target label.
                g_out_src = self.D(x_fake, label_trg)
                # Adversarial loss for G: push D's output on the fakes toward
                # an all-ones target via binary cross-entropy.
                g_loss_fake = F.binary_cross_entropy_with_logits(
                    input=g_out_src,
                    target=torch.ones_like(g_out_src, dtype=torch.float))
                # Classify the generated samples; G's classification loss
                # pushes them to be recognized as the target speaker.
                out_cls = self.C(x_fake)
                g_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

                # Target-to-original domain.
                # Map the generated samples back to the source domain with G.
                x_reconst = self.G(x_fake, label_org)
                # Cycle-consistency loss: the round trip should reproduce the
                # original sample. F.l1_loss is the mean absolute error.
                g_loss_rec = F.l1_loss(x_reconst, x_real)

                # Original-to-original domain (identity loss).
                # Mapping a real sample to its own label should leave it
                # unchanged; penalize the difference with L1.
                x_fake_iden = self.G(x_real, label_org)
                id_loss = F.l1_loss(x_fake_iden, x_real)

                # Backward and optimize.
                # Total generator loss.
                g_loss = g_loss_fake + self.lambda_cycle * g_loss_rec +\
                 self.lambda_cls * g_loss_cls + self.lambda_identity * id_loss
                # Reset the gradient buffers.
                self.reset_grad()
                # Backpropagate the generator loss.
                g_loss.backward()
                # Update the generator parameters.
                self.g_optimizer.step()

                # Logging.
                loss['G/loss_fake'] = g_loss_fake.item()
                loss['G/loss_rec'] = g_loss_rec.item()
                loss['G/loss_cls'] = g_loss_cls.item()
                loss['G/loss_id'] = id_loss.item()
                loss['G/g_loss'] = g_loss.item()
            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #
            # Print out training information.
            if (i + 1) % self.log_step == 0:
                # Elapsed training time.
                et = datetime.now() - start_time
                # Drop the microseconds from the string.
                et = str(et)[:-7]
                # Elapsed time and iteration count.
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
                # Append each logged loss to the line.
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)
                # Optionally record the losses with TensorBoard.
                if self.use_tensorboard:
                    for tag, value in loss.items():
                        # Write one scalar summary per logged loss.
                        self.logger.scalar_summary(tag, value, i + 1)

            # Convert fixed test data for debugging.
            if (i + 1) % self.sample_step == 0:
                # torch.no_grad() is a context manager: operations inside it
                # are not tracked by autograd, so no computation graph is
                # kept and no gradients are stored, which saves a lot of
                # memory during inference.
                with torch.no_grad():
                    # Load the test data; a random speaker is the source.
                    d, speaker = TestSet(self.test_dir).test_data()
                    # random.choice picks a target speaker different from
                    # the source.
                    target = random.choice(
                        [x for x in speakers if x != speaker])
                    # LabelBinarizer.transform converts the class label into
                    # a binary label vector; take the first (only) row.
                    label_t = self.spk_enc.transform([target])[0]
                    # np.asarray wraps the label in a batch dimension.
                    label_t = np.asarray([label_t])
                    # Iterate over filenames and their extracted features.
                    for filename, content in d.items():
                        f0 = content['f0']
                        ap = content['ap']
                        # Pad the normalized coded spectral envelope so it
                        # splits evenly into FRAMES-long segments.
                        sp_norm_pad = self.pad_coded_sp(
                            content['coded_sp_norm'])

                        convert_result = []
                        for start_idx in range(
                                0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                            one_seg = sp_norm_pad[:,
                                                  start_idx:start_idx + FRAMES]

                            one_seg = torch.FloatTensor(one_seg).to(
                                self.device)
                            one_seg = one_seg.view(1, 1, one_seg.size(0),
                                                   one_seg.size(1))
                            l = torch.FloatTensor(label_t).to(self.device)
                            one_set_return = self.G(one_seg,
                                                    l).data.cpu().numpy()
                            one_set_return = np.squeeze(one_set_return)
                            one_set_return = norm.backward_process(
                                one_set_return, target)
                            convert_result.append(one_set_return)

                        convert_con = np.concatenate(convert_result, axis=1)
                        convert_con = convert_con[:,
                                                  0:content['coded_sp_norm'].
                                                  shape[1]]
                        contigu = np.ascontiguousarray(convert_con.T,
                                                       dtype=np.float64)
                        decoded_sp = decode_spectral_envelope(contigu,
                                                              SAMPLE_RATE,
                                                              fft_size=FFTSIZE)
                        f0_converted = norm.pitch_conversion(
                            f0, speaker, target)
                        wav = synthesize(f0_converted, decoded_sp, ap,
                                         SAMPLE_RATE)

                        name = f'{speaker}-{target}_iter{i+1}_{filename}'
                        path = os.path.join(self.sample_dir, name)
                        print(f'[save]:{path}')
                        # librosa.output.write_wav requires librosa < 0.8; on
                        # newer versions, soundfile.write is the usual
                        # replacement.
                        librosa.output.write_wav(path, wav, SAMPLE_RATE)

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir,
                                      '{}-G.ckpt'.format(i + 1))
                D_path = os.path.join(self.model_save_dir,
                                      '{}-D.ckpt'.format(i + 1))
                C_path = os.path.join(self.model_save_dir,
                                      '{}-C.ckpt'.format(i + 1))
                torch.save(self.G.state_dict(), G_path)
                torch.save(self.D.state_dict(), D_path)
                torch.save(self.C.state_dict(), C_path)
                print('Saved model checkpoints into {}...'.format(
                    self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                    self.num_iters - self.num_iters_decay):
                g_lr -= (self.g_lr / float(self.num_iters_decay))
                d_lr -= (self.d_lr / float(self.num_iters_decay))
                c_lr -= (self.c_lr / float(self.num_iters_decay))
                self.update_lr(g_lr, d_lr, c_lr)
                print('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(
                    g_lr, d_lr))
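
    # train() also calls gradient_penalty and reset_grad, neither of which is
    # part of this excerpt. Minimal sketches follow, assuming the standard
    # StarGAN/WGAN-GP formulation; treat them as illustrative rather than the
    # repository's exact code.
    def gradient_penalty(self, y, x):
        # Penalize deviation of the gradient norm ||dy/dx||_2 from 1.
        weight = torch.ones(y.size()).to(self.device)
        dydx = torch.autograd.grad(outputs=y,
                                   inputs=x,
                                   grad_outputs=weight,
                                   retain_graph=True,
                                   create_graph=True,
                                   only_inputs=True)[0]
        dydx = dydx.view(dydx.size(0), -1)
        dydx_l2norm = torch.sqrt(torch.sum(dydx**2, dim=1))
        return torch.mean((dydx_l2norm - 1)**2)

    def reset_grad(self):
        # Zero the gradient buffers of all three optimizers.
        self.g_optimizer.zero_grad()
        self.d_optimizer.zero_grad()
        self.c_optimizer.zero_grad()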
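
    # update_lr, referenced by the decay step in train(), is likewise
    # external to this excerpt; a plausible sketch that writes the decayed
    # rates into each optimizer's parameter groups:
    def update_lr(self, g_lr, d_lr, c_lr):
        # Set the learning rate of every parameter group on each optimizer.
        for param_group in self.g_optimizer.param_groups:
            param_group['lr'] = g_lr
        for param_group in self.d_optimizer.param_groups:
            param_group['lr'] = d_lr
        for param_group in self.c_optimizer.param_groups:
            param_group['lr'] = c_lr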