Example #1
 def loss(self, x, x_rec, z_mu, z_logvar, scaling):
     
     # Flatten the batch into a 1-D tensor of target indices
     list_in = torch.cat([i for i in x.squeeze(1)]).long()
     norm_const, scaling, log_scaling = scaling[0], scaling[1], scaling[2]
     # Undo the dataset scaling to recover the original integer classes
     list_in = unscale_array(list_in, norm_const, scaling, log_scaling)
     list_in = list_in.int().long()
     # Stack the per-sample log-probabilities into an (N, C) matrix for nll_loss
     list_out = torch.cat([i for i in x_rec], dim=1).t()
     recon = F.nll_loss(list_out, list_in, reduction='mean')  # size_average is deprecated
     
     # Closed-form KL divergence to the unit Gaussian prior, summed over latent dims
     kl_loss = 0.5 * (-z_logvar + torch.exp(z_logvar) + z_mu.pow(2) - 1.)
     kl_loss = torch.mean(torch.sum(kl_loss, 1))
     
     return recon, kl_loss
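
The loss returns the reconstruction and KL terms separately, so the caller decides how to weight them. Below is a minimal training-step sketch, assuming the model's forward pass returns (x_rec, z_mu, z_logvar); the names vae, optimizer and beta are hypothetical and do not appear in the example above.

def train_step(vae, optimizer, x, scaling, beta=1.0):
    # Hypothetical sketch: forward pass, beta-weighted loss terms, one optimizer step.
    # Assumes vae(x) returns (x_rec, z_mu, z_logvar); the real model may return more values.
    x_rec, z_mu, z_logvar = vae(x)
    recon, kl_loss = vae.loss(x, x_rec, z_mu, z_logvar, scaling)
    total = recon + beta * kl_loss
    optimizer.zero_grad()
    total.backward()
    optimizer.step()
    return recon.item(), kl_loss.item()
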
Example #2
        #4. Backpropagate
        loss.backward()
        optimizer.step()

    #5. EPOCH FINISHED:

    epoch_size = i + 1

    # Saving sounds/waveforms
    if np.mod(epoch, 50) == 0:
        #from training set
        fig = plt.figure(figsize=(12, 8))
        for idx in range(1, 5):
            plt.subplot(4, 2, 2 * idx - 1)
            inputs = mulaw.decode(
                unscale_array(raw_inputs[idx], norm_const, normalize,
                              log_scaling))
            plt.plot(inputs)
            plt.subplot(4, 2, 2 * idx)
            output = mulaw.to_int(x_rec[idx])
            output = mulaw.decode(output)
            plt.plot(output.clone().cpu().numpy())  # still a tensor; move to CPU for plotting
        fig.savefig(results_folder + '/images/reconstructions/train_epoch' +
                    str(epoch) + '.png',
                    bbox_inches='tight')

    raw_inputs, pre_process, x, x_rec = None, None, None, None
    gc.collect()

    #Compute validation loss and scheduler.step()

    valid_loss, valid_in, valid_out = vae.valid_loss(testloader,
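
The call above is truncated in the example. The preceding comment refers to the usual pattern of stepping a learning-rate scheduler on the validation loss; the sketch below assumes a ReduceLROnPlateau scheduler, which the snippet itself does not show.

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Hypothetical scheduler setup and per-epoch step (assumed names: optimizer, valid_loss)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)
# ... after the validation loss has been computed for the epoch:
scheduler.step(valid_loss)  # reduces the learning rate when the validation loss plateaus
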
Example #3
            # drop the singleton dimensions and transpose before regenerating audio
            nsgt = nsgt[0].T

            #compute the resize needed
            nbFreq, nbFrames = regenerateAudio(np.zeros((1, 1)),
                                               testSize=True,
                                               targetLen=targetLen)

            # RE-UPSAMPLE the distribution
            factor = np.max(np.abs(nsgt))
            nsgt = resize(nsgt / factor, (nbFreq, nbFrames), mode='constant')
            nsgt *= factor

            #rescale
            nsgt = unscale_array(nsgt, norm_const, normalize, log_scaling)

            for it in [100, 200, 300]:
                for ph in [False, True]:
                    # Seed the inversion with the estimated phase when requested
                    phase = nnPhase if ph else False

                    regenerateAudio(nsgt,
                                    sr=22050,
                                    targetLen=int(1.15583 * 22050),
                                    iterations=it,
                                    initPhase=phase,
                                    curName=soundPath + str(n) + '_' + str(i) +
                                    '_' + str(it) + '_ph' + str(ph))
Example #4
def regenerate(VAE,
               dataset,
               nb,
               it,
               scale_param,
               scaling,
               log_scaling,
               downFactor,
               soundPath,
               crop=None,
               initPhase=True,
               nameExtension=''):

    targetLen = 25486

    for i, raw_input in enumerate(dataset.data):
        if i > nb:
            break

        pre_process = torch.from_numpy(raw_input).float()
        if torch.cuda.is_available():
            pre_process = pre_process.cuda()
        # Add batch and channel dimensions before forwarding through the VAE
        pre_process = pre_process.unsqueeze(0).unsqueeze(0)
        x = Variable(pre_process)  # Variable is a no-op wrapper in modern PyTorch

        #2. Forward data
        rec_mu, rec_logvar, z_mu, z_logvar = VAE(x)  # calling the module invokes forward()

        # drop the singleton dimensions and transpose before regenerating audio
        originalNSGT = pre_process.data.cpu()[0, 0, :, :].numpy().T
        recNSGT = rec_mu.data.cpu()[0, 0, :, :].numpy().T

        #compute the resize needed
        nbFreq, nbFrames = regenerateAudio(np.zeros((1, 1)),
                                           testSize=True,
                                           targetLen=targetLen)

        # RE-UPSAMPLE the distribution
        oriFactor = np.max(np.abs(originalNSGT))
        recFactor = np.max(np.abs(recNSGT))

        originalNSGT = resize(originalNSGT / oriFactor, (nbFreq, nbFrames),
                              mode='constant')
        recNSGT = resize(recNSGT / recFactor, (nbFreq, nbFrames),
                         mode='constant')

        originalNSGT *= oriFactor
        recNSGT *= recFactor

        #rescale
        originalNSGT = unscale_array(originalNSGT, scale_param, scaling,
                                     log_scaling)
        recNSGT = unscale_array(recNSGT, scale_param, scaling, log_scaling)

        # Now invert (with upsampled version)
        if initPhase:
            phase = get_phase(dataset.files[i], targetLen)
        else:
            phase = None

        filename = str(i) + nameExtension  # to test various parameter sets
        regenerateAudio(originalNSGT,
                        targetLen=targetLen,
                        iterations=it,
                        curName=soundPath + filename,
                        initPhase=phase,
                        crop=crop)
        regenerateAudio(recNSGT,
                        targetLen=targetLen,
                        iterations=it,
                        curName=soundPath + filename + '_rec',
                        initPhase=phase,
                        crop=crop)
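
For reference, a hypothetical call to regenerate(): every concrete value below is an assumption for illustration, and names such as testset and downFactor=2 are not taken from the examples above.

regenerate(vae,
           testset,
           nb=5,                         # items 0..nb of the dataset are processed
           it=200,                       # iterations passed through to regenerateAudio
           scale_param=norm_const,
           scaling=normalize,
           log_scaling=log_scaling,
           downFactor=2,
           soundPath=results_folder + '/sounds/',
           initPhase=True,
           nameExtension='_final')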