def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):

    for epoch in range(8000):
        mse_sum = 0
        mse_n = 0

        for i, (audio, name) in enumerate(loader):

            cluster_size = audio.size(1)
            audio = audio.cuda()
            audio = (audio * 25 + 50) / 50

            time_step = audio.size(2)
            factor = 32
            audio_shuffle = [[] for i in range(time_step // factor)]
            nums = [x for x in range(time_step // factor)]
            random.shuffle(nums)

            for i_n, n in enumerate(nums):
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(audio[..., factor * n:factor *
                                                       (n + 1)],
                                                 scale_factor=sf,
                                                 mode='nearest')

            audio_shuffle = torch.cat(audio_shuffle, dim=2)

            audio = audio_shuffle  #F.interpolate(audio, scale_factor= audio_shuffle.size(2)/time_step)
            audio = audio[..., :audio.size(2) // 16 * 16]

            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]

            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]

            audio_list = [audio_low, audio_middle, audio]

            out, out_conversion, enc_content, latent_loss = model(audio, name)

            recon_loss = 0

            for num in range(3):
                recon_loss += criterion(out[num], audio_list[num])

            latent_loss = latent_loss.mean()
            #print ("recon_loss:", recon_loss)
            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)],
                      3)  # True),

            if i % 50 == 0:

                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

                model.eval()

                audio, name = next(inf_iterator_test)
                audio = audio.cuda()
                audio = (audio * 25 + 50) / 50

                out, out_conversion, enc_content, latent_loss = model(
                    audio, name)

                a = torch.stack([audio[0], out[-1][0], out_conversion[-1][0]],
                                dim=0)

                a = (a * 50 - 50) / 25
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(),
                               out[-1][0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(),
                                    out_conversion[-1][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_conversion=("audio", 22050, a[2]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )

                model.train()
            iteration += 1
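All of the trainers on this page step their optimizers through an OptimStep([(module, optimizer, loss, retain_graph), ...], clip) helper that is not listed here. A minimal sketch of such a helper, assuming the trailing argument is a gradient-norm clipping threshold (the project's real implementation may differ), could be:

import torch


def OptimStep(groups, max_grad_norm):
    # Hypothetical sketch: back-propagate each loss, clip the gradients of the
    # corresponding module, and step its optimizer.
    for module, optimizer, loss, retain_graph in groups:
        optimizer.zero_grad()
        loss.backward(retain_graph=retain_graph)
        torch.nn.utils.clip_grad_norm_(module.parameters(), max_grad_norm)
        optimizer.step()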
Example #2
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs, inf_iterator_test, logger, iteration):
    vctk_mean = torch.tensor(np.load("/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/mean.npy")).unsqueeze(0).unsqueeze(2).cuda()
    vctk_std = torch.tensor(np.load("/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/std.npy")).unsqueeze(0).unsqueeze(2).cuda()    
    for epoch in range(epochs):
        mse_sum = 0
        mse_n = 0
        
        for i, audio in enumerate(loader):
            cluster_size = audio.size(1)
            audio = audio.cuda()
            audio = (audio - vctk_mean)/vctk_std
            factor = 32

            time_step = audio.size(2)
            
            audio_shuffle = [[] for i in range(time_step // factor)]
            nums = [x for x in range(time_step // factor)]
            random.shuffle(nums)

            for i_n, n in enumerate(nums):
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(audio[..., factor * n:factor * (n + 1)],
                                                 scale_factor=sf,
                                                 mode='nearest')

            audio_shuffle = torch.cat(audio_shuffle, dim=2)

            audio = audio_shuffle  #F.interpolate(audio, scale_factor= audio_shuffle.size(2)/time_step)
            audio = audio[..., :audio.size(2) // 32 * 32]
            
            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]

            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]

            audio_list = [audio_low, audio_middle, audio]
            
            out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)
            
            recon_loss = 0

            for num in range(3):
                recon_loss += criterion(out[num], audio_list[num])
            
            latent_loss = latent_loss.mean()  
            #print ("recon_loss:", recon_loss)
            OptimStep([(model, opt,  recon_loss + latent_loss_weight*latent_loss , False)], 3)# True),
            


            if i % 200 == 0:

                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

                model.eval()

                audio = next(inf_iterator_test)
                audio = audio.cuda()
                audio = (audio - vctk_mean)/vctk_std
                
                out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)
                
                
                audio = audio * vctk_std + vctk_mean
                out[-1] = out[-1] * vctk_std + vctk_mean
                out_conversion[-1] = out_conversion[-1] * vctk_std + vctk_mean
                a = torch.stack(
                    [audio[0], audio[idx[0]], out[-1][0], out_conversion[-1][0]],
                    dim=0)

                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(),
                                audio[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(),
                               out[-1][0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(),
                                    out_conversion[-1][0]),
                    mel_recon_middle=("image", plot_spectrogram_to_numpy(),
                                      out[-2][0]),
                    mel_conversion_middle=("image",
                                           plot_spectrogram_to_numpy(),
                                           out_conversion[-2][0]),
                    mel_recon_low=("image", plot_spectrogram_to_numpy(),
                                   out[-3][0]),
                    mel_conversion_low=("image", plot_spectrogram_to_numpy(),
                                        out_conversion[-3][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(model, opt, iteration, f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen')
                
                model.train()
            iteration += 1
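Example #2 normalizes the mels with per-bin statistics loaded from mean.npy and std.npy. Those files are not produced on this page; a small sketch of how such statistics could be computed, assuming each utterance is stored as an (n_mels, T) NumPy array, is:

import glob

import numpy as np


def compute_mel_stats(mel_dir, out_dir):
    # Hypothetical helper: per-bin mean/std over all mel frames, saved with
    # shape (n_mels,), which train_() unsqueezes to (1, n_mels, 1).
    mels = np.concatenate(
        [np.load(p) for p in glob.glob(f"{mel_dir}/*.npy")], axis=1)
    np.save(f"{out_dir}/mean.npy", mels.mean(axis=1))
    np.save(f"{out_dir}/std.npy", mels.std(axis=1))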
Example #3
    optimize_model(policy_net, batch_log_prob, batch_rewards, optimizer, GAMMA, device=device)

    # Clear trajectories batch
    batch_log_prob = []
    batch_rewards = []

    # Reset Flags
    if not(render_each_episode):
        finished_rendering_this_epoch = False

    # Record stats
    training_info["epoch mean durations"].append(sum(epoch_durations) / batch_size)
    training_info["epoch mean rewards"].append(sum(epoch_rewards) / batch_size)
    if (i_epoch + 1) % num_avg_epoch == 0:
        training_info["past %d epochs mean reward" %  (num_avg_epoch)] = \
            (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
                if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0


    # Plot stats
    plot_durations(training_info["epoch mean rewards"])

    # Update counter
    i_epoch += 1

    # Every save_ckpt_interval, save a checkpoint according to current i_episode.
    if (i_epoch) % save_ckpt_interval == 0:
        save_checkpoint(ckpt_dir, policy_net, optimizer, i_epoch, learning_rate=learning_rate,
                        **training_info)

Example #4
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):

    for epoch in range(epochs):
        factor = 32
        for i, audio in enumerate(loader):
            time_step = audio.size(2)
            audio = audio.cuda()
            audio_shuffle = [[] for i in range(time_step // factor)]
            nums = [x for x in range(time_step // factor)]
            random.shuffle(nums)

            for i_n, n in enumerate(nums):
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(audio[..., factor * n:factor *
                                                       (n + 1)],
                                                 scale_factor=sf,
                                                 mode='nearest')

            audio_shuffle = torch.cat(audio_shuffle, dim=2)
            audio = F.interpolate(audio,
                                  scale_factor=audio_shuffle.size(2) /
                                  time_step)
            audio = audio[..., :audio.size(2) // 16 * 16]
            audio_shuffle = audio_shuffle[..., :audio_shuffle.size(2) // 16 *
                                          16]
            out, out_conversion, enc_content, spk, latent_loss, idx = model(
                audio, audio_shuffle)

            recon_loss = criterion(
                out, audio)  #+ criterion(out_conversion, audio_shuffle)
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)],
                      3)  # True),

            if i % 50 == 0:

                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

            if i % 200 == 0:
                model.eval()

                audio = next(inf_iterator_test)
                audio = audio.cuda()
                audio_shuffle = [[] for i in range(time_step // factor)]

                for i_n, n in enumerate(nums):
                    sf = random.uniform(0.5, 1.5)
                    audio_shuffle[n] = F.interpolate(audio[..., factor *
                                                           n:factor * (n + 1)],
                                                     scale_factor=sf,
                                                     mode='nearest')

                audio_shuffle = torch.cat(audio_shuffle, dim=2)
                audio = F.interpolate(audio,
                                      scale_factor=audio_shuffle.size(2) /
                                      time_step)
                audio = audio[..., :audio.size(2) // 16 * 16]
                audio_shuffle = audio_shuffle[..., :audio_shuffle.size(2) //
                                              16 * 16]
                out, out_conversion, enc_content, spk, latent_loss, idx = model(
                    audio, audio_shuffle)
                a = torch.stack([
                    audio[0], audio_shuffle[idx[0]], out[0], out_conversion[0]
                ],
                                dim=0)

                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.close()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(),
                                audio_shuffle[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(),
                                    out_conversion[0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )

                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )
                model.train()
            iteration += 1
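Most of these trainers repeat the same augmentation: the mel tensor is cut into 32-frame segments and each segment is time-stretched by a random factor. Note that the shuffled index list only changes the iteration order, not the segment positions, so the output keeps the original segment order. Factored out, the augmentation amounts to roughly:

import random

import torch
import torch.nn.functional as F


def random_segment_stretch(audio, factor=32, lo=0.5, hi=2.0):
    # audio: (batch, n_mels, time). Each factor-frame segment is resampled
    # along time by a random rate in [lo, hi]; segment order is preserved.
    segments = []
    for n in range(audio.size(2) // factor):
        sf = random.uniform(lo, hi)
        seg = audio[..., factor * n:factor * (n + 1)]
        segments.append(F.interpolate(seg, scale_factor=sf, mode='nearest'))
    return torch.cat(segments, dim=2)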
Example #5
                  training_info["max reward achieved"])
            print("Max TD loss recorded: %f" %
                  training_info["max TD loss recorded"])
            print("Max episode loss recorded: %f" %
                  training_info["max episode loss recorded"])
            print("Past 100 episodes avg reward: %f \n\n" %
                  training_info["past 100 episodes mean reward"])

            # Check if the problem is solved
            #  CartPole standard: average reward for the past 100 episode above 195
            if training_info["past 100 episodes mean reward"] > 195:
                print("\n\n\t Problem Solved !!!\n\n\n")

            break
    i_episode += 1

    # Update the target network, copying all weights and biases in DQN
    if i_episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    # Every save_ckpt_interval, save a checkpoint according to current i_episode.
    # Note that we use i_episode + 1
    if (i_episode + 1) % save_ckpt_interval == 0:
        save_checkpoint(ckpt_dir,
                        policy_net,
                        target_net,
                        optimizer,
                        i_episode + 1,
                        learning_rate=learning_rate,
                        **training_info)
Example #6
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration, inf_iterator_enc):
    dis = NetD(80).cuda()
    opt_dis = optim.Adam(dis.parameters())
    '''
    gamma = 1.0
    lambda_k = 0.01
    init_k = 0.0
    recorder = BEGANRecorder(lambda_k, init_k, gamma)
    k = recorder.k.item()
    '''
    opt_dec = optim.Adam(model.dec.parameters())
    lj_mean = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/LJSpeech/mean.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    lj_std = torch.tensor(
        np.load("/home/ericwudayi/nas189/homes/ericwudayi/LJSpeech/std.npy")
    ).unsqueeze(0).unsqueeze(2).cuda()
    vctk_mean = torch.tensor(
        np.load(
            "/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/mean.npy"
        )).unsqueeze(0).unsqueeze(2).cuda()
    vctk_std = torch.tensor(
        np.load(
            "/home/ericwudayi/nas189/homes/ericwudayi/VCTK-Corpus/mel3/std.npy"
        )).unsqueeze(0).unsqueeze(2).cuda()
    lj_mean = vctk_mean
    lj_std = vctk_std
    if args.load_checkpoint == True:
        dis, opt_dis, iteration = load_checkpoint(
            f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis',
            dis, opt_dis)

    for epoch in range(80000):

        for i, audio in enumerate(loader):

            audio = audio.cuda()
            audio = (audio - lj_mean) / lj_std
            #audio = (audio*25 + 50) / 50
            factor = 32

            time_step = audio.size(2)

            audio_shuffle = [[] for i in range(time_step // factor)]
            nums = [x for x in range(time_step // factor)]
            random.shuffle(nums)

            for i_n, n in enumerate(nums):
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(audio[..., factor * n:factor *
                                                       (n + 1)],
                                                 scale_factor=sf,
                                                 mode='nearest')

            audio_shuffle = torch.cat(audio_shuffle, dim=2)

            audio = audio_shuffle  #F.interpolate(audio, scale_factor= audio_shuffle.size(2)/time_step)

            audio = audio[..., :audio.size(2) // 32 * 32]

            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]

            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]

            audio_list = [audio_low, audio_middle, audio]

            out, latent_loss, index_list = model(audio)

            recon_loss = 0
            for num in range(3):
                recon_loss += criterion(out[num], audio_list[num])

            latent_loss = latent_loss.mean()

            if iteration % 1 == 0:
                model.zero_grad()

                audio_enc = next(inf_iterator_enc)
                audio_enc = audio_enc.cuda()
                audio_enc = (audio_enc - vctk_mean) / vctk_std
                if audio_enc.size(0) > audio.size(0):
                    audio_enc = audio_enc[:audio.size(0)]
                else:
                    audio = audio[:audio_enc.size(0)]
                audio_enc = F.interpolate(audio_enc,
                                          scale_factor=audio.size(2) /
                                          audio_enc.size(2))
                out_code, latent_loss_enc, index_list = model(audio_enc)
                #latent_loss += latent_loss_enc.mean()
                #latent_loss *= 0
                #loss_dis, loss_gan = GANLOSS(dis, audio, out_code[-1])
                #if iteration%4==0:
                #    OptimStep([(dis, opt_dis, loss_dis, False)],3)
                #else:
                #    OptimStep([(model, opt, recon_loss + latent_loss_weight*latent_loss + 0.1*loss_gan , False)],3)
                OptimStep([(model, opt, recon_loss +
                            latent_loss_weight * latent_loss, False)], 3)

            #else:
            #latent_loss *= 0
            #OptimStep([(model, opt,  recon_loss + latent_loss_weight*latent_loss , True)], 3)# True),

            #################################
            # BEGAN TRAINING PHASE          #
            #################################
            model.zero_grad()

            if iteration % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

            if iteration % 200 == 0:
                model.eval()
                a = torch.stack(
                    [audio[0], out[-1][0], out_code[-1][0], audio_enc[0]],
                    dim=0)
                a = a * lj_std + lj_mean
                a[3] = (a[3] - lj_mean) / lj_std * vctk_std + vctk_mean
                image = a
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), image[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), image[1]),
                    mel_code=("image", plot_spectrogram_to_numpy(), image[2]),
                    mel_target=("image", plot_spectrogram_to_numpy(),
                                image[3]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_code=("audio", 22050, a[2]),
                    audio_enc=("audio", 22050, a[3]),
                )

                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )
                save_checkpoint(
                    dis, opt_dis, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis'
                )

                model.train()
                logger.close()
            iteration += 1
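Example #6 both restores and saves checkpoints through load_checkpoint(path, model, opt) and save_checkpoint(model, opt, iteration, path). Neither helper is shown on this page; a minimal pair consistent with those call signatures (an assumption, not the project's actual code) could be:

import os

import torch


def save_checkpoint(model, optimizer, iteration, path):
    # Hypothetical counterpart to the calls above; path is used as a prefix.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save({"model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "iteration": iteration}, f"{path}.pt")


def load_checkpoint(path, model, optimizer):
    # Matches the unpacking  dis, opt_dis, iteration = load_checkpoint(...).
    ckpt = torch.load(f"{path}.pt", map_location="cpu")
    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    return model, optimizer, ckpt["iteration"]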
Example #7
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):

    for epoch in range(epochs):
        mse_sum = 0
        mse_n = 0

        for i, (audio, pitch) in enumerate(loader):

            audio = audio.cuda().float()
            pitch = pitch.cuda().float()

            audio = (audio * 25 + 50) / 50

            #Normalize pitch
            #print (pitch.size())
            pitch_non_sil = (pitch > 20)
            pitch_sil = pitch < 20
            pitch_mean_non_sil = torch.sum(
                pitch * pitch_non_sil) / torch.sum(pitch_non_sil)
            pitch -= pitch_mean_non_sil  #torch.mean(pitch,dim = 1, keepdim = True)
            pitch = (pitch + 20) / 50
            pitch[pitch_sil] = 0.0
            #print (pitch[0,:50])

            pitch = pitch.unsqueeze(1)
            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]
            pitch_middle = F.interpolate(pitch, scale_factor=1 / 2)

            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]
            pitch_low = F.interpolate(pitch_middle, scale_factor=1 / 2)

            audio_list = [audio_low, audio_middle, audio]
            pitch_list = [pitch, pitch_middle, pitch_low]
            out, out_conversion, enc_content, spk, latent_loss, idx = model(
                audio, pitch_list)

            recon_loss = 0
            #print (i)
            for num in range(3):
                recon_loss += criterion(out[num], audio_list[num])

            latent_loss = latent_loss.mean()
            #print ("recon_loss:", recon_loss)
            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)],
                      3)  # True),

            if i % 100 == 0:

                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

                model.eval()

                audio, pitch = next(inf_iterator_test)

                audio = audio.cuda().float()
                pitch = pitch.cuda().float()

                audio = (audio * 25 + 50) / 50
                pitch_non_sil = (pitch > 20)
                pitch_sil = pitch < 20
                pitch_mean_non_sil = torch.sum(
                    pitch * pitch_non_sil) / torch.sum(pitch_non_sil)
                pitch -= pitch_mean_non_sil  #torch.mean(pitch,dim = 1, keepdim = True)
                pitch = (pitch + 20) / 50
                pitch[pitch_sil] = 0.0
                pitch = pitch.unsqueeze(1)
                pitch_middle = F.interpolate(pitch, scale_factor=1 / 2)

                pitch_low = F.interpolate(pitch_middle, scale_factor=1 / 2)

                pitch_list = [pitch, pitch_middle, pitch_low]
                out, out_conversion, enc_content, spk, latent_loss, idx = model(
                    audio, pitch_list)

                a = torch.stack([
                    audio[0], audio[idx[0]], out[-1][0], out_conversion[-1][0]
                ],
                                dim=0)

                a = (a * 50 - 50) / 25
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(),
                                audio[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(),
                               out[-1][0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(),
                                    out_conversion[-1][0]),
                    mel_recon_middle=("image", plot_spectrogram_to_numpy(),
                                      out[-2][0]),
                    mel_conversion_middle=("image",
                                           plot_spectrogram_to_numpy(),
                                           out_conversion[-2][0]),
                    mel_recon_low=("image", plot_spectrogram_to_numpy(),
                                   out[-3][0]),
                    mel_conversion_low=("image", plot_spectrogram_to_numpy(),
                                        out_conversion[-3][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )
                model.train()
            iteration += 1
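The pitch handling in Example #7 centers voiced frames on their mean F0, rescales, and zeros out silent frames. Pulled into a helper with the same arithmetic (the thresholds are the literal constants used above), it reads:

import torch


def normalize_pitch(pitch, sil_threshold=20.0, shift=20.0, scale=50.0):
    # pitch: (batch, time); frames below the threshold count as silence.
    voiced = pitch > sil_threshold
    silent = pitch < sil_threshold
    voiced_mean = torch.sum(pitch * voiced) / torch.sum(voiced)
    pitch = (pitch - voiced_mean + shift) / scale
    pitch[silent] = 0.0
    return pitch.unsqueeze(1)  # (batch, 1, time), matching the unsqueeze above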
Example #8
    if hp.loss == 'BEGAN':
        loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(dis_high, singing, fake_singing, k)
        loss_cycle = criterion(speech_2x, fake_speech).mean()
        OptimStep([(m, opt, loss_gan + 0.2 * loss_cycle, True),
            (dis_high, opt_dis, loss_dis, False)], 3)
        
        k, convergence = recorder(real_dloss, fake_dloss, update_k=True)
    
    if iteration % 5 == 0:
        if hp.loss == "BEGAN":
            logger.log_training(iteration = iteration, loss_gan = loss_gan, 
            loss_dis = loss_dis, loss_cycle = loss_cycle, k = k, convergence = convergence)

    if (iteration % 50 == 0):

        save_checkpoint(m, opt, iteration, f'checkpoint/{args.checkpoint_path}/gen')
        save_checkpoint(dis_high, opt_dis, iteration, f'checkpoint/{args.checkpoint_path}/dis')

        
        idx = random.randint(0, fake_singing.size(0) - 1)

        #mel = (mel * std) +mean
        #z = (z * std) + mean
        real_audio = melblock.inverse(singing).detach().cpu().numpy()
        fake_audio = melblock.inverse(fake_singing).detach().cpu().numpy()
        real_speech_audio = vocoder_speech.inverse(speech).detach().cpu().numpy()
        #mel = (mel -mean)/ std
        #z = (z - mean ) / std
        """
        logger work like this:
            logger only accept image, audio ,scalars type.
def main():
    # Directory of Image Data
    data_dir = in_arg.data_dir
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'

    # transforms for the training, validation, and testing sets
    data_transforms = transforms.Compose([
        transforms.Resize(255),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    ## transforms for the training set using data augmentation
    train_transforms = transforms.Compose([
        transforms.RandomRotation(30),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    ## transforms for validation set
    valid_transforms = data_transforms

    ## transforms for testing set
    test_transforms = data_transforms

    # Load the Datasets with ImageFolder
    image_datasets = datasets.ImageFolder(data_dir, transform=data_transforms)

    # Load training Dataset
    train_datasets = datasets.ImageFolder(train_dir,
                                          transform=train_transforms)

    # Load validation Dataset
    valid_datasets = datasets.ImageFolder(valid_dir,
                                          transform=valid_transforms)

    # Load test Dataset
    test_datasets = datasets.ImageFolder(test_dir, transform=test_transforms)

    # Dataloader and batch size
    dataloaders = torch.utils.data.DataLoader(image_datasets,
                                              batch_size=64,
                                              shuffle=True)

    #Trainloader
    trainloaders = torch.utils.data.DataLoader(train_datasets,
                                               batch_size=64,
                                               shuffle=True)

    #Validloader
    validloaders = torch.utils.data.DataLoader(valid_datasets,
                                               batch_size=64,
                                               shuffle=True)

    #Testloader
    testloaders = torch.utils.data.DataLoader(test_datasets,
                                              batch_size=64,
                                              shuffle=True)

    # Use two pretrained models
    models_dict = {}
    vgg13 = models.vgg13(pretrained=True)
    vgg16 = models.vgg16(pretrained=True)
    print("OK1")
    models_dict = {'vgg13': vgg13, 'vgg16': vgg16}
    print("OK2")
    model = models_dict[in_arg.arch]
    print("OK3")
    # Freeze the parameters so they are not updated during training
    # (turn off gradients)
    for param in model.parameters():
        param.requires_grad = False

    # Define our new Classifier using only 1 hidden Layer

    classifier = nn.Sequential(
        OrderedDict([('fc1', nn.Linear(25088, in_arg.hidden_units)),
                     ('relu', nn.ReLU()), ('dou', nn.Dropout(p=0.2)),
                     ('fc2', nn.Linear(in_arg.hidden_units, 102)),
                     ('output', nn.LogSoftmax(dim=1))]))

    ## Update Classifier and check model again
    model.classifier = classifier

    # Define the loss: since the model outputs log-softmax, use negative log-likelihood loss

    criterion = nn.NLLLoss()

    # define optimizer to update the weights with gradients

    optimizer = optim.Adam(model.classifier.parameters(),
                           lr=in_arg.learning_rate)

    epochs = in_arg.epochs

    # Use GPU if available
    device = torch.device(
        "cuda" if torch.cuda.is_available() and in_arg.gpu else "cpu")

    for epoch in range(epochs):

        # Train model
        train_loss, train_accuracy = train_test(0, model, criterion, optimizer,
                                                trainloaders, device,
                                                in_arg.gpu)

        # Validate model
        with torch.no_grad():
            valid_loss, valid_accuracy = train_test(1, model, criterion,
                                                    optimizer, validloaders,
                                                    device, in_arg.gpu)

        # print description
        print("Epoch  :{}/{} \n ".format(epoch + 1, epochs))
        print("Traning Loss :{} \n ".format(train_loss))
        print("Validation Loss :{} \n ".format(valid_loss))
        print("Validation Accuracy :{} \n ".format(valid_accuracy))

    model.class_to_idx = train_datasets.class_to_idx

    save_checkpoint(model, optimizer, in_arg)
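Example #9 relies on a train_test(mode, model, criterion, optimizer, loader, device, gpu) helper that is not listed here; judging from the calls, mode 0 trains for one epoch, mode 1 only evaluates, and it returns a (loss, accuracy) pair. A hypothetical sketch with that behavior:

import torch


def train_test(mode, model, criterion, optimizer, loader, device, gpu):
    # Hypothetical stand-in; the gpu flag is unused because device already
    # encodes the choice.
    model.to(device)
    if mode == 0:
        model.train()
    else:
        model.eval()
    total_loss, total_acc = 0.0, 0.0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        log_probs = model(images)
        loss = criterion(log_probs, labels)
        if mode == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
        total_acc += (log_probs.argmax(dim=1) == labels).float().mean().item()
    return total_loss / len(loader), total_acc / len(loader)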
Example #10
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):

    for epoch in range(epochs):
        mse_sum = 0
        mse_n = 0

        for i, audio in enumerate(loader):
            cluster_size = audio.size(1)
            audio = audio.cuda()
            audio = (audio - mean) / std / 3
            out, out_conversion, enc_content, spk, latent_loss, idx = model(
                audio)
            recon_loss = criterion(out, audio)
            latent_loss = latent_loss.mean()

            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, False)],
                      3)  # True),

            mse_sum += recon_loss.item() * audio.shape[0]
            mse_n += audio.shape[0]
            if i % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss)

            if i % 200 == 0:
                model.eval()

                audio = next(inf_iterator_test)
                audio = audio.cuda()

                audio = (audio - mean) / std / 3

                out, out_conversion, enc_content, spk, latent_loss, idx = model(
                    audio)
                a = torch.stack(
                    [audio[0], audio[idx[0]], out[0], out_conversion[0]],
                    dim=0)

                a = a * std * 3 + mean
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_target=("image", plot_spectrogram_to_numpy(),
                                audio[idx[0]]),
                    mel_recon=("image", plot_spectrogram_to_numpy(), out[0]),
                    mel_conversion=("image", plot_spectrogram_to_numpy(),
                                    out_conversion[0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_target=("audio", 22050, a[1]),
                    audio_recon=("audio", 22050, a[2]),
                    audio_conversion=("audio", 22050, a[3]),
                )
                logger.close()
                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )
                model.train()
            iteration += 1
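Every example logs through logger.log_training(...) and logger.log_validation(...), where each validation item is an ("image", plotter, mel) or ("audio", sample_rate, waveform) tuple; the truncated docstring in Example #8 notes that the logger only accepts image, audio, and scalar types. A minimal TensorBoard-backed sketch of that interface, assuming the image plotter returns an HWC NumPy array, is:

import torch
from torch.utils.tensorboard import SummaryWriter


class Logger:
    # Hypothetical sketch of the logging interface used above.
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)

    def log_training(self, iteration, **scalars):
        for name, value in scalars.items():
            self.writer.add_scalar(name, float(value), iteration)

    def log_validation(self, iteration, **items):
        for name, (kind, helper, value) in items.items():
            if kind == "image":    # helper is e.g. plot_spectrogram_to_numpy()
                self.writer.add_image(name, helper(value), iteration,
                                      dataformats="HWC")
            elif kind == "audio":  # helper is the sample rate
                self.writer.add_audio(name, torch.as_tensor(value), iteration,
                                      sample_rate=helper)

    def close(self):
        self.writer.flush()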
Example #11
        training_info["past %d epochs mean reward" %  (num_avg_epoch)] = \
            (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
                if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0

    # Print stats
    print("\n\n=============  Epoch: %d  =============" % (i_epoch + 1))
    print("epoch mean durations: %f" % (epoch_durations[-1]))
    print("epoch mean rewards: %f" % (epoch_rewards[-1]))
    print("Max reward achieved: %f" % training_info["max reward achieved"])
    print("value net loss: %f" % value_net_mse)

    # Plot stats
    if plot:
        plot_durations(training_info["epoch mean rewards"],
                       training_info["value net loss"])

    # Update counter
    i_epoch += 1

    # Every save_ckpt_interval, save a checkpoint according to current i_episode.
    if i_epoch % save_ckpt_interval == 0:
        save_checkpoint(ckpt_dir,
                        policy_net,
                        value_net,
                        policynet_optimizer,
                        valuenet_optimizer,
                        i_epoch,
                        policy_lr=policy_lr,
                        valuenet_lr=valuenet_lr,
                        **training_info)
Example #12
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    dis = NetD(80).cuda()
    opt_dis = optim.Adam(dis.parameters())
    gamma = 1.0
    lambda_k = 0.01
    init_k = 0.0
    recorder = BEGANRecorder(lambda_k, init_k, gamma)
    k = recorder.k.item()
    opt_dec = optim.Adam(model.dec.parameters())
    for epoch in range(epochs):
        mse_sum = 0
        mse_n = 0

        for i, audio in enumerate(loader):

            audio = audio.cuda()
            audio = (audio * 25 + 50) / 50
            factor = 32

            time_step = audio.size(2)

            audio_shuffle = [[] for i in range(time_step // factor)]
            nums = [x for x in range(time_step // factor)]
            random.shuffle(nums)

            for i_n, n in enumerate(nums):
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(audio[..., factor * n:factor *
                                                       (n + 1)],
                                                 scale_factor=sf,
                                                 mode='nearest')

            audio_shuffle = torch.cat(audio_shuffle, dim=2)

            audio = audio_shuffle  #F.interpolate(audio, scale_factor= audio_shuffle.size(2)/time_step)
            audio = audio[..., :audio.size(2) // 16 * 16]

            audio_middle = F.interpolate(audio, scale_factor=1 / 2)
            audio_middle = audio_middle[:, :audio_middle.size(1) // 2, :]

            audio_low = F.interpolate(audio_middle, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]

            audio_list = [audio_low, audio_middle, audio]

            out, latent_loss, index_list = model(audio)

            recon_loss = 0
            for num in range(3):
                recon_loss += criterion(out[num], audio_list[num])

            latent_loss = latent_loss.mean()

            #OptimStep([(model, opt,  recon_loss + latent_loss_weight*latent_loss , True)], 3)# True),

            #################################
            # BEGAN TRAINING PHASE          #
            #################################
            model.zero_grad()
            index_list_ = []
            for l in index_list:
                idx = torch.randperm(l.size(0))
                index_list_ += [l[idx]]
            out_code = model.index_to_decode(index_list_)
            loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
                dis, audio, out_code[-1], k)
            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, True),
                       (model.dec, opt_dec, 0.2 * (loss_gan), True),
                       (dis, opt_dis, loss_dis, False)], 3)

            k, convergence = recorder(real_dloss, fake_dloss, update_k=True)
            iteration += 1
            print(iteration)
            model.zero_grad()

            if i % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_gan=loss_gan,
                                    loss_dis=loss_dis,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss,
                                    k=k,
                                    convergence=convergence)

            if i % 50 == 0:
                model.eval()
                a = torch.stack([audio[0], out[-1][0], out_code[-1][0]], dim=0)
                a = (a * 50 - 50) / 25
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(),
                               out[-1][0]),
                    mel_code=("image", plot_spectrogram_to_numpy(),
                              out_code[-1][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_code=("audio", 22050, a[2]),
                )

                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )
                save_checkpoint(
                    dis, opt_dis, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis'
                )

                model.train()
                logger.close()
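Examples #6, #8, and #12 rely on BEGAN components (BEGANLoss, BEGANRecorder, and the NetD discriminator) that this page does not show. Matching the call signatures above and the standard BEGAN formulation, a sketch could look like the following; it assumes the discriminator is an auto-encoder scored by L1 reconstruction error, which may differ from the project's actual code:

import torch


def BEGANLoss(dis, real, fake, k):
    # Auto-encoding discriminator: reconstruction error acts as the energy.
    real_dloss = torch.mean(torch.abs(dis(real) - real))
    fake_dloss = torch.mean(torch.abs(dis(fake.detach()) - fake.detach()))
    loss_dis = real_dloss - k * fake_dloss               # discriminator loss
    loss_gan = torch.mean(torch.abs(dis(fake) - fake))   # generator loss
    return loss_gan, loss_dis, real_dloss, fake_dloss


class BEGANRecorder:
    # Tracks the BEGAN balance term k and the convergence measure.
    def __init__(self, lambda_k, init_k, gamma):
        self.lambda_k, self.gamma = lambda_k, gamma
        self.k = torch.tensor(init_k)

    def __call__(self, real_dloss, fake_dloss, update_k=True):
        diff = self.gamma * real_dloss.detach() - fake_dloss.detach()
        if update_k:
            self.k = torch.clamp(self.k + self.lambda_k * diff, 0.0, 1.0)
        convergence = real_dloss.detach() + torch.abs(diff)
        return self.k.item(), convergence.item()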