def loss(self, x, x_rec, z_mu, z_logvar, scaling):
    """Return the reconstruction and KL terms of the VAE objective.

    The reconstruction term is the negative log-likelihood of the un-scaled,
    integer-valued input under ``x_rec``. The KL term is the divergence
    between the diagonal Gaussian posterior ``(z_mu, z_logvar)`` and a unit
    Gaussian prior, summed over dimension 1 and averaged over the rest.

    Args:
        x: Scaled input batch; dimension 1 is squeezed away before use.
        x_rec: Model outputs, one entry per batch item. They are fed to
            ``F.nll_loss`` and are therefore presumably log-probabilities
            laid out as (batch, classes, time) -- TODO confirm.
        z_mu: Posterior means.
        z_logvar: Posterior log-variances.
        scaling: Indexable holding ``(norm_const, scaling, log_scaling)``,
            forwarded to ``unscale_array``.

    Returns:
        Tuple ``(recon, kl_loss)``.
    """
    # Use a distinct local name instead of rebinding the ``scaling`` argument.
    norm_const, scale, log_scaling = scaling[0], scaling[1], scaling[2]

    # Concatenate the per-item inputs into one sequence of class targets.
    # NOTE(review): the input is truncated to integers *before* being
    # un-scaled -- confirm that unscale_array expects integer-valued input.
    targets = torch.cat(list(x.squeeze(1))).long()
    targets = unscale_array(targets, norm_const, scale, log_scaling)
    targets = targets.int().long()

    # Concatenate the per-item outputs along dim 1, then transpose so that
    # each row lines up with one entry of ``targets``.
    log_probs = torch.cat(list(x_rec), dim=1).t()

    # Mean reduction is the default of F.nll_loss; the explicit
    # ``size_average=True`` keyword is deprecated, so it is omitted.
    recon = F.nll_loss(log_probs, targets)

    # Closed-form KL divergence against a unit Gaussian prior.
    kl_per_dim = 0.5 * (-z_logvar + torch.exp(z_logvar) + z_mu.pow(2) - 1.)
    kl_loss = torch.mean(torch.sum(kl_per_dim, 1))

    return recon, kl_loss
#4. Backpropagate loss.backward() optimizer.step() #4. EPOCH FINISHED : epoch_size = i + 1 # Saving sounds/waveforms if np.mod(epoch, 50) == 0: #from training set fig = plt.figure(figsize=(12, 8)) for idx in range(1, 5): plt.subplot(4, 2, 2 * idx - 1) inputs = mulaw.decode( unscale_array(raw_inputs[idx], norm_const, normalize, log_scaling)) plt.plot(inputs) plt.subplot(4, 2, 2 * idx) output = mulaw.to_int(x_rec[idx]) output = mulaw.decode(output) plt.plot(output.clone().cpu().numpy()) #still a variable fig.savefig(results_folder + '/images/reconstructions/train_epoch' + str(epoch) + '.png', bbox_inches='tight') raw_inputs, pre_process, x, x_rec = None, None, None, None gc.collect() #Compute validation loss and scheduler.step() valid_loss, valid_in, valid_out = vae.valid_loss(testloader,
# Keep the first entry along the leading axis and transpose it into the
# layout used for regeneration.
nsgt = nsgt[0].T

# Query the transform size for ``targetLen`` (dummy input, testSize=True).
nbFreq, nbFrames = regenerateAudio(np.zeros((1, 1)),
                                   testSize=True,
                                   targetLen=targetLen)

# Re-upsample the distribution: normalise by the peak magnitude, resize,
# then restore the original amplitude.
factor = np.max(np.abs(nsgt))
if factor == 0:
    # All-zero input: dividing by the peak would give 0/0 = NaN everywhere.
    factor = 1.0
nsgt = resize(nsgt / factor, (nbFreq, nbFrames), mode='constant')
nsgt *= factor

# Undo the dataset scaling.
nsgt = unscale_array(nsgt, norm_const, normalize, log_scaling)

# Regenerate audio for every (iteration count, phase initialisation) pair.
for it in [100, 200, 300]:
    for ph in [False, True]:
        # NOTE(review): the "no initial phase" case passes False here;
        # confirm regenerateAudio treats False like "no phase given".
        phase = nnPhase if ph else False
        regenerateAudio(nsgt,
                        sr=22050,
                        targetLen=int(1.15583 * 22050),
                        iterations=it,
                        initPhase=phase,
                        curName=soundPath + str(n) + '_' + str(i) + '_' +
                        str(it) + '_ph' + str(ph))
def _resize_keep_peak(array, shape):
    """Resize *array* to *shape*, normalising by its peak around the resize."""
    peak = np.max(np.abs(array))
    if peak == 0:
        # All-zero input: dividing by the peak would give 0/0 = NaN everywhere.
        peak = 1.0
    resized = resize(array / peak, shape, mode='constant')
    resized *= peak
    return resized


def regenerate(VAE,
               dataset,
               nb,
               it,
               scale_param,
               scaling,
               log_scaling,
               downFactor,
               soundPath,
               crop=None,
               initPhase=True,
               nameExtension=''):
    """Regenerate audio for dataset items and for their VAE reconstructions.

    For each processed item, two ``regenerateAudio`` calls are made: one for
    the original representation (named ``soundPath + str(i) + nameExtension``)
    and one for the model reconstruction (same name with ``'_rec'`` appended).

    Args:
        VAE: Model whose ``forward(x)`` returns four values; only the first
            one is used, as the reconstruction.
        dataset: Object exposing ``data`` (iterable of NumPy arrays) and
            ``files`` (indexable, read only when ``initPhase`` is truthy).
        nb: Highest item index processed. The bound is inclusive, so
            ``nb + 1`` items are handled.
        it: Iteration count forwarded to ``regenerateAudio``.
        scale_param, scaling, log_scaling: Forwarded to ``unscale_array``.
        downFactor: Unused; kept so existing callers keep working.
        soundPath: Prefix of the output names.
        crop: Forwarded to ``regenerateAudio``.
        initPhase: When truthy, the phase returned by ``get_phase`` for the
            item's file is passed as ``initPhase``; otherwise ``None`` is.
        nameExtension: Suffix appended to the item index in output names.
    """
    targetLen = 25486

    # The transform size depends only on ``targetLen``, so query it once
    # instead of once per item.
    nbFreq, nbFrames = regenerateAudio(np.zeros((1, 1)),
                                       testSize=True,
                                       targetLen=targetLen)
    target_shape = (nbFreq, nbFrames)

    for i, raw_input in enumerate(dataset.data):
        if i > nb:
            break

        pre_process = torch.from_numpy(raw_input).float()
        if torch.cuda.is_available():
            pre_process = pre_process.cuda()
        # Add two leading dimensions before forwarding into the VAE.
        pre_process = pre_process.unsqueeze(0).unsqueeze(0)
        x = Variable(pre_process)

        # Only the first output (used as the reconstruction) is needed.
        rec_mu, _rec_logvar, _z_mu, _z_logvar = VAE.forward(x)

        # Drop the two leading dimensions and transpose for regeneration.
        originalNSGT = pre_process.data.cpu()[0, 0, :, :].numpy().T
        recNSGT = rec_mu.data.cpu()[0, 0, :, :].numpy().T

        # Re-upsample both representations to the transform size.
        originalNSGT = _resize_keep_peak(originalNSGT, target_shape)
        recNSGT = _resize_keep_peak(recNSGT, target_shape)

        # Undo the dataset scaling.
        originalNSGT = unscale_array(originalNSGT, scale_param, scaling,
                                     log_scaling)
        recNSGT = unscale_array(recNSGT, scale_param, scaling, log_scaling)

        # Invert the upsampled versions, optionally starting from the
        # phase obtained for the source file.
        phase = get_phase(dataset.files[i], targetLen) if initPhase else None

        filename = str(i) + nameExtension  # lets runs with other settings coexist
        regenerateAudio(originalNSGT,
                        targetLen=targetLen,
                        iterations=it,
                        curName=soundPath + filename,
                        initPhase=phase,
                        crop=crop)
        regenerateAudio(recNSGT,
                        targetLen=targetLen,
                        iterations=it,
                        curName=soundPath + filename + '_rec',
                        initPhase=phase,
                        crop=crop)