def forward(self, X):
    G1 = Variable(torch.randn(1, 3, 10, 10))
    G2 = Variable(torch.randn(1, 3, 10, 10))
    G3 = Variable(torch.randn(1, 3, 10, 10))

    X = X.cuda()
    G1 = G1.cuda()
    G2 = G2.cuda()
    G3 = G3.cuda()

    sum_abs = G1.abs() + G2.abs() + G3.abs()
    mask_need_norm = sum_abs.ge(1)
    mask_need_norm = mask_need_norm.float()
    G1_norm = torch.div(G1, sum_abs)
    G2_norm = torch.div(G2, sum_abs)
    G3_norm = torch.div(G3, sum_abs)

    G1 = torch.add(-mask_need_norm, 1) * G1 + mask_need_norm * G1_norm
    G2 = torch.add(-mask_need_norm, 1) * G2 + mask_need_norm * G2_norm
    G3 = torch.add(-mask_need_norm, 1) * G3 + mask_need_norm * G3_norm

    output = self.Propagator.forward(X, G1, G2, G3)
    return output
def test_basic_op_grad(self):
    """Grad output might need to be reshaped to match the second argument."""
    x = Variable(torch.randn(4, 6), requires_grad=True)
    b = Variable(torch.rand(12, 1) + 1e-2, requires_grad=True)

    def y():
        # .mm() depends on the grad_output being of correct size
        return b.mm(Variable(torch.rand(1, 2) + 1e-2))

    (x + y()).sum().backward()
    (x - y()).sum().backward()
    (x * y()).sum().backward()
    (x / y()).sum().backward()
    (x.abs() ** y()).sum().backward()
def mse(nanobots):
    import numpy as np

    coordinates = np.asarray([[n.x, n.y, n.z] for n in nanobots], dtype=int)  # coordinates.shape = (#nanobots, 3)
    radius = np.asarray([n.radius for n in nanobots], dtype=int)              # radius.shape = (#nanobots,)
    me = np.asarray([12, 12, 12], dtype=int)                                  # me.shape = (3,)
    me = np.expand_dims(me, axis=0)                                           # me.shape = (1, 3)

    distances = np.sum(np.abs(coordinates - me), axis=1)  # distances.shape = (#nanobots,)
    t = radius - distances.T
    in_range = np.sum((t > 0).astype(int))  # in_range.shape = ()
    dist = np.sum(np.abs(me))               # dist.shape = ()
    loss = -in_range - dist
    print(coordinates)
    print(radius)
    print(distances)
    print(t)
    print(in_range)
    print(loss)

    import torch
    from torch.autograd import Variable

    coordinates = Variable(torch.Tensor(coordinates))
    radius = Variable(torch.Tensor(radius))
    me = Variable(torch.Tensor(me), requires_grad=True)

    for step in range(10):
        distances = (coordinates - me).abs().sum(1)
        t = radius - distances
        in_range = (t > 0).sum()
        dist = me.abs().sum()
        loss = -in_range - dist - distances.mean()
        print('loss:', loss)
        loss.backward()
        print(distances)
        print(t)
        print(in_range)
        print(dist)
        print('me:', me)
        print(me.grad.data)
        me.data -= torch.round(me.grad.data)
        me.grad.data.zero_()
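In the PyTorch loop above, the `(t > 0).sum()` count is piecewise constant, so it contributes no gradient to `me`; only the `dist` and `distances.mean()` terms actually move the parameters. If a differentiable version of the in-range count were wanted, a sigmoid-smoothed margin is one option. A minimal sketch, with a hypothetical temperature `tau` controlling the sharpness (not part of the original function):

import torch

def soft_in_range_count(coordinates, radius, me, tau=1.0):
    # coordinates: (N, 3), radius: (N,), me: (1, 3) with requires_grad=True.
    # Manhattan distance from `me` to every nanobot, as in the loop above.
    distances = (coordinates - me).abs().sum(1)
    margin = radius - distances
    # Smooth approximation of (margin > 0).sum(); tau -> 0 recovers the hard count.
    return torch.sigmoid(margin / tau).sum()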
def test_local_var_unary_methods(self):
    ''' Unit tests for methods mentioned on issue 1385
        https://github.com/OpenMined/PySyft/issues/1385'''
    x = Var(torch.FloatTensor([1, 2, -3, 4, 5]))
    assert torch.equal(x.abs(), Var(torch.FloatTensor([1, 2, 3, 4, 5])))
    assert torch.equal(x.abs_(), Var(torch.FloatTensor([1, 2, 3, 4, 5])))
    x = Var(torch.FloatTensor([1, 2, -3, 4, 5]))
    assert torch.equal(x.cos().int(), Var(torch.IntTensor([0, 0, 0, 0, 0])))
    x = Var(torch.FloatTensor([1, 2, -3, 4, 5]))
    assert torch.equal(x.cos_().int(), Var(torch.IntTensor([0, 0, 0, 0, 0])))
    x = Var(torch.FloatTensor([1, 2, -3, 4, 5]))
    assert torch.equal(x.ceil(), x)
    assert torch.equal(x.ceil_(), x)
    assert torch.equal(x.cpu(), x)
def test_remote_var_unary_methods(self):
    ''' Unit tests for methods mentioned on issue 1385
        https://github.com/OpenMined/PySyft/issues/1385'''
    hook = TorchHook(verbose=False)
    local = hook.local_worker
    remote = VirtualWorker(id=2, hook=hook)
    local.add_worker(remote)

    x = Var(torch.FloatTensor([1, 2, -3, 4, 5])).send(remote)
    assert torch.equal(x.abs().get(), Var(torch.FloatTensor([1, 2, 3, 4, 5])))
    assert torch.equal(x.abs_().get(), Var(torch.FloatTensor([1, 2, 3, 4, 5])))
    assert torch.equal(x.cos().int().get(), Var(torch.IntTensor([0, 0, 0, 0, 0])))
    assert torch.equal(x.cos_().int().get(), Var(torch.IntTensor([0, 0, 0, 0, 0])))

    x = Var(torch.FloatTensor([1, 2, -3, 4, 5])).send(remote)
    assert torch.equal(x.ceil().get(), Var(torch.FloatTensor([1, 2, -3, 4, 5])))
    assert torch.equal(x.ceil_().get(), Var(torch.FloatTensor([1, 2, -3, 4, 5])))
    assert torch.equal(x.cpu().get(), Var(torch.FloatTensor([1, 2, -3, 4, 5])))
def doForwardPass(numericAndLineNumbers, surprisalTable=None, doDropout=True, batchSizeHere=1): global counter global crossEntropy global printHere global devLosses global hidden global beginning if hidden is not None: hidden = Variable(hidden.data).detach() forRestart = bernoulli.sample() #print(forRestart) sampled = startHidden(zeroHidden) hiddenNew = sampleToHidden(sampled).unsqueeze(0) # hidden = forRestart.unsqueeze(0).unsqueeze(2) * hiddenNew + (1-forRestart).unsqueeze(0).unsqueeze(2) * hidden # print(torch.where) hidden = torch.where( forRestart.unsqueeze(0).unsqueeze(2) == 1, hiddenNew, hidden) beginning = torch.where( forRestart.unsqueeze(0) == 1, zeroBeginning, beginning) # beginning = forRestart.unsqueeze(0).unsqueeze(2) * zeroBeginning + (1-forRestart).unsqueeze(0).unsqueeze(2) * beginning else: sampled = startHidden(zeroHidden) hiddenNew = sampleToHidden(sampled).unsqueeze(0) hidden = hiddenNew beginning = zeroBeginning numeric, lineNumbers = numericAndLineNumbers numeric = torch.cat([beginning, numeric], dim=0) beginning = numeric[numeric.size()[0] - 1].view(1, args.batchSize) loss = 0 wordNum = 0 lossWords = 0 policyGradientLoss = 0 baselineLoss = 0 optimizer.zero_grad() for c in components: c.zero_grad() # for q in parameters_made: # for p in q: # if p.grad is not None: # p.grad.fill_(0) totalQuality = 0.0 if True: inputTensor = numeric # so it will be sequence_length x args.batchSizeHere # print inputTensor # quit() inputTensorIn = inputTensor[:-1] inputTensorOut = inputTensor[1:] inputEmbeddings = word_pos_morph_embeddings( inputTensorIn.view(args.horizon, batchSizeHere)) if doDropout: inputEmbeddings = inputDropout(inputEmbeddings) if args.dropout_rate > 0: inputEmbeddings = dropout(inputEmbeddings) lossesWordTotal = [] sampled_vectors = [] logProbConditionals = [] for i in range(inputEmbeddings.size()[0]): # print(i, hidden.abs().max()) torch.clamp(hidden, min=-10, max=10) output1, hidden = rnn_both(inputEmbeddings[i].unsqueeze(0), hidden) assert args.rnn_layers == 1 meanHidden = cellToMean(hidden[0]) klLoss = [None for _ in inputEmbeddings] logStandardDeviationHidden = hiddenToLogSDHidden(hidden[0]) # print(torch.exp(logStandardDeviationHidden)) scaleForDist = torch.log(1 + torch.exp(logStandardDeviationHidden)) memoryDistribution = torch.distributions.Normal(loc=meanHidden, scale=scaleForDist) # sampled = memoryDistribution.rsample() encodedEpsilon = standardNormalPerStep.sample() sampled = meanHidden + scaleForDist * encodedEpsilon sampled_vectors.append(sampled) logProbConditional = memoryDistribution.log_prob(sampled).sum( dim=1) # TODO not clear whether back-prob through sampled? 
logProbConditionals.append(logProbConditional) hiddenNew = sampleToHidden(sampled).unsqueeze(0) # this also serves as the output for prediction hidden = hiddenNew # print(hidden.abs().max()) # output, _ = rnn_both(torch.cat([word_pos_morph_embeddings(torch.cuda.LongTensor([[2 for _ in range(args.batchSizeHere)]])), inputEmbeddings[halfSeqLen+1:]], dim=0), (hiddenNew, cellNew)) # output = torch.cat([output1[:halfSeqLen], output], dim=0) output = hiddenNew if doDropout: output = dropout(output) word_logits = decoder(output) word_logits = word_logits.view(batchSizeHere, outVocabSize) word_softmax = logsoftmax(word_logits) lossesWord = lossModuleTest(word_softmax, inputTensorOut[i].view(batchSizeHere)) lossesWordTotal.append(lossesWord) lossesWord = torch.stack(lossesWordTotal, dim=0) lossWords = lossesWord.sum(dim=0).sum(dim=0) loss = lossesWord.sum() klLoss = 0 batchSizeInflatedHere = args.batchSize * len(sampled_vectors) sampledTotal = torch.stack(sampled_vectors, dim=0).view(batchSizeInflatedHere, -1) logProbConditionalsTotal = torch.stack( logProbConditionals, dim=0).view(batchSizeInflatedHere) # print(sampledTotal.size(), logProbConditionalsTotal.size()) #for sampled, logProbConditional in zip(sampled_vectors, logProbConditionals): adjustment = [] epsilon = sampledTotal logdet = torch.autograd.Variable( torch.from_numpy( np.zeros(batchSizeInflatedHere).astype('float32')).cuda()) # n=1 context = torch.autograd.Variable( torch.from_numpy( np.zeros((batchSizeInflatedHere, context_dim)).astype('float32')).cuda()) for flowStep in range(args.flow_length): epsilon, logdet, context = flows[flowStep]( (epsilon, logdet, context)) if flowStep + 1 < args.flow_length: epsilon, logdet, context = torchkit.flows.FlipFlow(1)( (epsilon, logdet, context)) plainPriorLogProb = standardNormal.log_prob(epsilon).sum( dim=1) #- (0.5 * torch.sum(sampled * sampled, dim=1)) logProbMarginal = plainPriorLogProb + logdet klLoss = (logProbConditionalsTotal - logProbMarginal) # print(logProbConditional, logProbMarginal) # print(logStandardDeviationHidden) # klLoss = 0.5 * (-1 - 2 * (logStandardDeviationHidden) + torch.pow(meanHidden, 2) + torch.exp(2*logStandardDeviationHidden)) # klLoss = klLoss.sum(1) klLossSum = klLoss.sum() if counter % 10 == 0: klLossMean = klLoss.mean() print(args.beta, args.flow_length, klLossMean, lossesWord.mean(), args.beta * klLoss.mean() + lossesWord.mean()) if float(klLossMean) != float(klLossMean): print(hidden.abs().max()) assert False, "got NA, abort" loss = loss + args.beta * klLossSum # print lossesWord if surprisalTable is not None or True: lossesCPU = lossesWord.data.cpu().view((args.horizon), batchSizeHere).numpy() if True: for i in range(0, args.horizon ): #range(1,maxLength+1): # don't include i==0 j = 0 lineNum = int(lineNumbers[i][j]) print(i, itos_total[numeric[i + 1][j]], lossesCPU[i][j], lineNum) while lineNum >= len(completeData): completeData.append([[], 0]) completeData[lineNum][0].append(itos_total[numeric[i + 1][j]]) completeData[lineNum][1] += lossesCPU[i][j] if surprisalTable is not None: if printHere: print surprisalTable for j in range(batchSizeHere): for r in range(args.horizon): surprisalTable[r] += lossesCPU[ r, j] #.data.cpu().numpy()[0] wordNum = (args.horizon - 1) * batchSizeHere if wordNum == 0: print input_words print batchOrdered return 0, 0, 0, 0, 0 if printHere: print loss / wordNum print lossWords / wordNum print["CROSS ENTROPY", crossEntropy, exp(crossEntropy)] print("beta", args.beta) crossEntropy = 0.99 * crossEntropy + 0.01 * (lossWords / 
wordNum).data.cpu().numpy() totalQuality = loss.data.cpu().numpy( ) # consists of lossesWord + lossesPOS numberOfWords = wordNum # probabilities = torch.sigmoid(dhWeights) # neg_entropy = torch.sum( probabilities * torch.log(probabilities) + (1-probabilities) * torch.log(1-probabilities)) # policy_related_loss = lr_policy * (entropy_weight * neg_entropy + policyGradientLoss) # lives on CPU loss = loss / batchSizeHere return loss, None, None, totalQuality, numberOfWords, klLoss.mean()
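The commented-out lines near the end of the loss computation reference the closed-form KL between a diagonal Gaussian posterior and the standard normal prior. As a small sketch of that analytic term (assuming a plain mean/log-standard-deviation parametrisation; the snippet above actually passes its log-SD through a softplus before sampling, so this closed form would only apply if the scale were exp(log_sd)):

import torch

def gaussian_kl_to_standard_normal(mean, log_sd):
    # KL( N(mean, exp(log_sd)^2) || N(0, I) ), summed over the latent dimension.
    # Mirrors the commented-out formula 0.5 * (-1 - 2*log_sd + mean^2 + exp(2*log_sd)).
    return 0.5 * (-1.0 - 2.0 * log_sd + mean.pow(2) + torch.exp(2.0 * log_sd)).sum(dim=1)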
Note:
    1. G1~G3 constitute the affinity; they can be a bunch of output maps coming from any CNN,
       taking any useful known information (e.g., RGB images) as input.
    2. For any pixel i, |G1(i)| + |G2(i)| + |G3(i)| <= 1 is a sufficient condition for model
       stability (see paper).
"""
import torch
from torch.autograd import Variable
from pytorch_spn.modules.gaterecurrent2dnoind import GateRecurrent2dnoind

Propagator = GateRecurrent2dnoind(True, False)

X = Variable(torch.randn(1, 3, 10, 10))
G1 = Variable(torch.randn(1, 3, 10, 10))
G2 = Variable(torch.randn(1, 3, 10, 10))
G3 = Variable(torch.randn(1, 3, 10, 10))

sum_abs = G1.abs() + G2.abs() + G3.abs()
mask_need_norm = sum_abs.ge(1)
mask_need_norm = mask_need_norm.float()
G1_norm = torch.div(G1, sum_abs)
G2_norm = torch.div(G2, sum_abs)
G3_norm = torch.div(G3, sum_abs)

G1 = torch.add(-mask_need_norm, 1) * G1 + mask_need_norm * G1_norm
G2 = torch.add(-mask_need_norm, 1) * G2 + mask_need_norm * G2_norm
G3 = torch.add(-mask_need_norm, 1) * G3 + mask_need_norm * G3_norm

X = X.cuda()
G1 = G1.cuda()
G2 = G2.cuda()
G3 = G3.cuda()
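The masked blend above leaves pixels with |G1| + |G2| + |G3| < 1 untouched and rescales the rest, which enforces the sufficient condition from the note. An equivalent standalone helper written with torch.where (a sketch, not part of the original pytorch_spn example) might look like:

import torch

def normalize_gates(G1, G2, G3, eps=1e-8):
    # Rescale the three gate maps so that |G1| + |G2| + |G3| <= 1 per pixel.
    sum_abs = G1.abs() + G2.abs() + G3.abs()
    need_norm = sum_abs.ge(1)
    scale = torch.where(need_norm, sum_abs + eps, torch.ones_like(sum_abs))
    return G1 / scale, G2 / scale, G3 / scale

Dividing by sum_abs only where it is at least 1 keeps the stability condition while avoiding division by very small sums elsewhere.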
def evaluate_fnet_multiround(inet, fnet, dataloader, nactors, nchannels=3, targets=None, concat_input=False):
    print('Evaluate fusion network')
    inet.eval()
    fnet.eval()
    total = 0
    nbatches = 0
    corrects = [0, 0, 0]
    runtimes = [0, 0, 0]
    sum_mae = 0
    sum_mse = 0
    criterionMSE = nn.MSELoss(reduction='mean')

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        nbatches += 1
        targets = Variable(targets).cuda()
        inputs = Variable(inputs).cuda()
        batch_size, C, H, W = inputs.shape
        total += batch_size

        x = inputs.unsqueeze(0).expand(nactors, batch_size, C, H, W).contiguous().view(
            nactors * batch_size, C, H, W)
        x = x[torch.randperm(x.shape[0]), ...]
        _, z_c, _ = inet(x)
        z_c = z_c.view(nactors, batch_size, z_c.shape[1], z_c.shape[2], z_c.shape[3])
        sum_z = z_c.sum(dim=0)

        x = x.view(nactors, batch_size, C, H, W)
        x_g = x[torch.randperm(nactors), ...].permute(1, 0, 2, 3, 4)
        img_fused = fnet(x_g)
        _, z_fused, _ = inet(img_fused)

        for k in range(nactors):
            z = sum_z - z_c[k, ...]
            z_hat = nactors * z_fused - z
            out_hat = inet.module.classifier(z_hat)
            inputs = x[k, ...]
            out, _, _ = inet(inputs)
            _, y = torch.max(out.data, 1)
            _, y_hat = torch.max(out_hat.data, 1)
            corrects[2] += y_hat.eq(y.data).sum().cpu().item() / nactors

            input_hat = inet.module.inverse(z_hat)
            mae = torch.norm((input_hat - inputs).view(batch_size, -1), dim=1) / torch.norm(
                inputs.view(batch_size, -1), dim=1)
            sum_mae += mae.mean().cpu().item() / nactors
            sum_mse += criterionMSE(input_hat, inputs).mean().cpu().item() / nactors
            max = inputs.abs().max().cpu().item()
            min = (input_hat - inputs).abs().max().cpu().item()

        del _, z_c, z_hat, z_fused, inputs, targets, img_fused, out, out_hat, y, y_hat

    # Evaluate time and classification performance
    corrects = 100 * np.asarray(corrects) / (total)
    print('\t == Correctly classified: X {:.4f} X_hat {:.4f} Match {:.4f}'.format(
        corrects[0], corrects[1], corrects[2]))
    print('\t == MAE: {:.4f}, MSE: {:.4f} in {:.4f} -> {:.4f}'.format(
        sum_mae / nbatches, sum_mse / nbatches, min, max))
    runtimes = np.asarray(runtimes) / nbatches
    print('Average time G: {:.4f}, E: {:.4f}, C: {:.4f} over {} batches of size {}'.format(
        runtimes[0], runtimes[1], runtimes[2], nbatches, batch_size))
def main (): # Generator if generator_name == 'sine': generator = generate_sine elif generator_name == 'spikes': generator = generate_spikes else: raise "Generator {} not supported.".format(generator_name) # Reproducibility np.random.seed(seed) torch.manual_seed(seed) # Learnable, universal filter coefficients params = np.random.randn(num_params) params = transform_params(params) # Pytorch variables params = Variable(torch.FloatTensor(params), requires_grad=True) indices = Variable(torch.arange(num_params).type(dtype) / np.float(num_params - 1), requires_grad=False) x = Variable(torch.randn(input_shape).type(dtype), requires_grad=False) # Wavenet instance w = Wavenet(params, input_shape) # Optimiser optimiser = torch.optim.Adam([params], lr=1e-02) lambda_reg = 1.0E+02 num_steps = 5000 # Regularisation reg = Regularisation(params) # Training loop print "Initial parameters:", params.data.numpy() loss_dict = lambda : {'sparsity': 0, 'regularisation': 0, 'combined': 0, 'compactness': 0} losses = {'sparsity': [0], 'regularisation': [0], 'combined': [0], 'compactness': [0]} print "=" * 80 print "START RUNNING" print "-" * 80 start = time.time() for step, x_ in enumerate(generator(**gen_opts)): # Stop condition if step >= num_steps: break # Set input x.data = torch.from_numpy(x_).type(dtype) # Get wavelet coefficients c = w.forward(x) # Sparsity loss sparsity = 1. - gini(c) # Regularisation loss regularisation = reg.forward() # Compactness loss compactness = torch.sum(torch.dot(indices - 0.5, params.abs() / params.abs().sum())) # Combined loss combined = sparsity + lambda_reg * (regularisation) + compactness # Perform backpropagation combined.backward() # Parameter update if step % batch_size == 0: optimiser.step() optimiser.zero_grad() pass # Non-essential stuff below # ------------------------------------------------------------------------- # Log if step % 1000 == 0: print "Step {}/{}".format(step, num_steps) pass # Logging loss history losses['sparsity'][-1] += np.float(sparsity) losses['regularisation'][-1] += np.float(regularisation) losses['compactness'][-1] += np.float(compactness) losses['combined'][-1] += np.float(combined) if step % batch_size == 0: for key in losses: losses[key][-1] /= float(batch_size) losses[key].append(0.) pass pass # Draw model diagram if step == 0: from torchviz import make_dot dot = make_dot(sparsity, params={'params': params, 'input': x}) dot.format = 'pdf' dot.render('output/model') pass pass end = time.time() print "-" * 80 print "Took {:.1f} sec.".format(end - start) print "=" * 80 # Clean-up for key in losses: losses[key].pop(-1) pass print "Final parameters:", params.data.numpy() # Save to file tag = '{}__N{}__{}'.format('x'.join(map(str, input_shape)), num_params, generator_name) # -- Model torch.save(w, 'output/model__{}.pt'.format(tag)) # -- Loss with open('output/loss__{}.json'.format(tag), 'w') as f: json.dump(losses, f) pass return
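The sparsity loss in the training loop above calls a `gini` helper that is defined elsewhere in the script. One common formulation of the Gini sparsity index over the absolute coefficients (Hurley & Rickard style) is sketched below purely as an illustration of what such a helper could compute; the actual implementation may differ:

import torch

def gini(c, eps=1e-12):
    # Gini index of the absolute coefficients: near 0 for a flat vector,
    # approaching 1 as the energy concentrates in a few coefficients.
    x, _ = torch.sort(c.abs().view(-1))  # ascending
    n = x.numel()
    k = torch.arange(1, n + 1, dtype=x.dtype, device=x.device)
    return 1.0 - 2.0 * torch.sum(x * (n - k + 0.5) / n) / (x.sum() + eps)

Under this definition, `1. - gini(c)` is small exactly when the wavelet coefficients are sparse, which is what the combined loss in main() rewards.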
def train(self): # Set up training. real_o = Variable(torch.FloatTensor(self.batch_size, 3, 64, 64).cuda(), requires_grad=False) real_o_next = Variable(torch.FloatTensor(self.batch_size, 3, 64, 64).cuda(), requires_grad=False) label = Variable(torch.FloatTensor(self.batch_size).cuda(), requires_grad=False) z = Variable(torch.FloatTensor(self.batch_size, self.rand_z_dim).cuda(), requires_grad=False) criterionD = nn.BCELoss().cuda() optimD = optim.Adam([{ 'params': self.D.parameters() }], lr=self.lr_d, betas=(0.5, 0.999)) optimG = optim.Adam([{ 'params': self.G.parameters() }, { 'params': self.Q.parameters() }, { 'params': self.T.parameters() }], lr=self.lr_g, betas=(0.5, 0.999)) ############################################ # Load rope dataset and apply transformations rope_path = os.path.realpath(self.data_dir) def filter_background(x): x[:, (x < 0.3).any(dim=0)] = 0.0 return x def dilate(x): x = x.squeeze(0).numpy() x = grey_dilation(x, size=3) x = x[None, :, :] return torch.from_numpy(x) trans = [ transforms.Resize(64), transforms.CenterCrop(64), transforms.ToTensor(), filter_background, # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ] if not self.fcn: # If fcn it will do the transformation to gray # and normalize in the loop. # trans.append(transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))) if self.gray: # Apply grayscale transformation. trans.append(lambda x: x.mean(dim=0)[None, :, :]) trans.append(dilate) trans.append(transforms.Normalize((0.5, ), (0.5, ))) trans_comp = transforms.Compose(trans) # Image 1 and image 2 are k steps apart. dataset = ImagePairs(root=rope_path, transform=trans_comp, n_frames_apart=self.k) dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, drop_last=True) from torchvision.utils import save_image imgs = next(iter(dataloader))[0][0] save_image(imgs * 0.5 + 0.5, 'train_img.png') ############################################ # Load eval plan dataset planning_data_dir = self.planning_data_dir dataset_start = dset.ImageFolder(root=os.path.join( planning_data_dir, 'start'), transform=trans_comp) dataset_goal = dset.ImageFolder(root=os.path.join( planning_data_dir, 'goal'), transform=trans_comp) data_start_loader = torch.utils.data.DataLoader(dataset_start, batch_size=1, shuffle=False, num_workers=1, drop_last=True) data_goal_loader = torch.utils.data.DataLoader(dataset_goal, batch_size=1, shuffle=False, num_workers=1, drop_last=True) ############################################ for epoch in range(self.n_epochs + 1): self.G.train() self.D.train() self.Q.train() self.T.train() for num_iters, batch_data in enumerate(dataloader, 0): # Real data o = batch_data[0] o_next = batch_data[1] bs = o.size(0) real_o.data.resize_(o.size()) real_o_next.data.resize_(o_next.size()) label.data.resize_(bs) real_o.data.copy_(o) real_o_next.data.copy_(o_next) if self.fcn: real_o = self.apply_fcn_mse(o) real_o_next = self.apply_fcn_mse(o_next) if real_o.abs().max() > 1: import ipdb ipdb.set_trace() assert real_o.abs().max() <= 1 if epoch == 0: break ############################################ # D Loss (Update D) optimD.zero_grad() # Real data probs_real = self.D(real_o, real_o_next) label.data.fill_(1) loss_real = criterionD(probs_real, label) loss_real.backward() # Fake data z, c, c_next = self._noise_sample(z, bs) fake_o, fake_o_next = self.G(z, c, c_next) probs_fake = self.D(fake_o.detach(), fake_o_next.detach()) label.data.fill_(0) loss_fake = criterionD(probs_fake, label) loss_fake.backward() D_loss = loss_real + 
loss_fake optimD.step() ############################################ # G loss (Update G) optimG.zero_grad() probs_fake_2 = self.D(fake_o, fake_o_next) label.data.fill_(1) G_loss = criterionD(probs_fake_2, label) # Q loss (Update G, T, Q) ent_loss = -self.P.log_prob(c).mean(0) crossent_loss = -self.Q.log_prob(fake_o, c).mean(0) crossent_loss_next = -self.Q.log_prob(fake_o_next, c_next).mean(0) # trans_prob = self.T.get_prob(Variable(torch.eye(self.dis_c_dim).cuda())) ent_loss_next = -self.T.log_prob(c, None, c_next).mean(0) mi_loss = crossent_loss - ent_loss mi_loss_next = crossent_loss_next - ent_loss_next Q_loss = mi_loss + mi_loss_next # T loss (Update T) Q_c_given_x, Q_c_given_x_var = ( i.detach() for i in self.Q.forward(real_o)) t_mu, t_variance = self.T.get_mu_and_var(c) t_diff = t_mu - c # Keep the variance small. # TODO: add loss on t_diff T_loss = (t_variance**2).sum(1).mean(0) (G_loss + self.infow * Q_loss + self.transw * T_loss).backward() optimG.step() ############################################# # Logging (iteration) if num_iters % 100 == 0: self.log_dict['Dloss'] = D_loss.item() self.log_dict['Gloss'] = G_loss.item() self.log_dict['Qloss'] = Q_loss.item() self.log_dict['Tloss'] = T_loss.item() self.log_dict['mi_loss'] = mi_loss.item() self.log_dict['mi_loss_next'] = mi_loss_next.item() self.log_dict['ent_loss'] = ent_loss.item() self.log_dict['ent_loss_next'] = ent_loss_next.item() self.log_dict['crossent_loss'] = crossent_loss.item() self.log_dict[ 'crossent_loss_next'] = crossent_loss_next.item() self.log_dict['D(real)'] = probs_real.data.mean() self.log_dict['D(fake)_before'] = probs_fake.data.mean() self.log_dict['D(fake)_after'] = probs_fake_2.data.mean() write_stats_from_var(self.log_dict, Q_c_given_x, 'Q_c_given_real_x_mu') write_stats_from_var(self.log_dict, Q_c_given_x, 'Q_c_given_real_x_mu', idx=0) write_stats_from_var(self.log_dict, Q_c_given_x_var, 'Q_c_given_real_x_variance') write_stats_from_var(self.log_dict, Q_c_given_x_var, 'Q_c_given_real_x_variance', idx=0) write_stats_from_var(self.log_dict, t_mu, 't_mu') write_stats_from_var(self.log_dict, t_mu, 't_mu', idx=0) write_stats_from_var(self.log_dict, t_diff, 't_diff') write_stats_from_var(self.log_dict, t_diff, 't_diff', idx=0) write_stats_from_var(self.log_dict, t_variance, 't_variance') write_stats_from_var(self.log_dict, t_variance, 't_variance', idx=0) print('\n#######################' '\nEpoch/Iter:%d/%d; ' '\nDloss: %.3f; ' '\nGloss: %.3f; ' '\nQloss: %.3f, %.3f; ' '\nT_loss: %.3f; ' '\nEnt: %.3f, %.3f; ' '\nCross Ent: %.3f, %.3f; ' '\nD(x): %.3f; ' '\nD(G(z)): b %.3f, a %.3f;' '\n0_Q_c_given_rand_x_mean: %.3f' '\n0_Q_c_given_rand_x_std: %.3f' '\n0_Q_c_given_fixed_x_std: %.3f' '\nt_diff_abs_mean: %.3f' '\nt_std_mean: %.3f' % ( epoch, num_iters, D_loss.item(), G_loss.item(), mi_loss.item(), mi_loss_next.item(), T_loss.item(), ent_loss.item(), ent_loss_next.item(), crossent_loss.item(), crossent_loss_next.item(), probs_real.data.mean(), probs_fake.data.mean(), probs_fake_2.data.mean(), Q_c_given_x[:, 0].cpu().numpy().mean(), Q_c_given_x[:, 0].cpu().numpy().std(), np.sqrt(Q_c_given_x_var[:, 0].cpu().numpy().mean()), t_diff.data.abs().mean(), t_variance.data.sqrt().mean(), )) ############################################# # Start evaluation from here. 
self.G.eval() self.D.eval() self.Q.eval() self.T.eval() ############################################# # Save images # Plot fake data x_save, x_next_save = self.G(*self.eval_input, self.get_c_next(epoch)) save_image(x_save.data, os.path.join(self.out_dir, 'gen', 'curr_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(x_next_save.data, os.path.join(self.out_dir, 'gen', 'next_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image((x_save - x_next_save).data, os.path.join(self.out_dir, 'gen', 'diff_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) # Plot real data. if epoch % 10 == 0: save_image(real_o.data, os.path.join(self.out_dir, 'real', 'real_samples_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(real_o_next.data, os.path.join(self.out_dir, 'real', 'real_samples_next_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) ############################################# # Save parameters if epoch % 5 == 0: if not os.path.exists('%s/var' % self.out_dir): os.makedirs('%s/var' % self.out_dir) for i in [self.G, self.D, self.Q, self.T]: torch.save( i.state_dict(), os.path.join(self.out_dir, 'var', '%s_%d' % ( i.__class__.__name__, epoch, ))) ############################################# # Logging (epoch) for k, v in self.log_dict.items(): log_value(k, v, epoch) if epoch > 0: # tf logger # log_value('avg|x_next - x|', (x_next_save.data - x_save.data).abs().mean(dim=0).sum(), epoch + 1) # self.logger.histo_summary("Q_c_given_x", Q_c_given_x.data.cpu().numpy().reshape(-1), step=epoch) # self.logger.histo_summary("Q_c0_given_x", Q_c_given_x[:, 0].data.cpu().numpy(), step=epoch) # self.logger.histo_summary("Q_c_given_x_var", Q_c_given_x_var.cpu().numpy().reshape(-1), step=epoch) # self.logger.histo_summary("Q_c0_given_x_var", Q_c_given_x_var[:, 0].data.cpu().numpy(), step=epoch) # csv log with open(os.path.join(self.out_dir, 'progress.csv'), 'a') as csv_file: writer = csv.writer(csv_file) if epoch == 1: writer.writerow(["epoch"] + list(self.log_dict.keys())) writer.writerow([ "%.3f" % _tmp for _tmp in [epoch] + list(self.log_dict.values()) ]) ############################################# # Do planning? if self.plan_length <= 0 or epoch not in self.planning_epoch: continue print("\n#######################" "\nPlanning") ############################################# # Showing plans on real images using best code. # Min l2 distance from start and goal real images. self.plan_hack(data_start_loader, data_goal_loader, epoch, 'L2') # Min classifier distance from start and goal real images. self.plan_hack(data_start_loader, data_goal_loader, epoch, 'classifier')
def train(self): # Set up training. real_o = Variable(torch.FloatTensor(self.batch_size, 3, 64, 64).cuda(), requires_grad=False) real_o_next = Variable(torch.FloatTensor(self.batch_size, 3, 64, 64).cuda(), requires_grad=False) label = Variable(torch.FloatTensor(self.batch_size).cuda(), requires_grad=False) z = Variable(torch.FloatTensor(self.batch_size, self.rand_z_dim).cuda(), requires_grad=False) criterionD = nn.BCELoss().cuda() optimD = optim.Adam([{'params': self.D.parameters()}], lr=self.lr_d, betas=(0.5, 0.999)) optimG = optim.Adam([{'params': self.G.parameters()}, {'params': self.Q.parameters()}, {'params': self.T.parameters()}], lr=self.lr_g, betas=(0.5, 0.999)) ############################################ # Load rope dataset and apply transformations rope_path = os.path.realpath(self.data_dir) trans = [ transforms.Resize(64), transforms.CenterCrop(64), transforms.ToTensor(), # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ] if not self.fcn: # If fcn it will do the transformation to gray # and normalize in the loop. trans.append(transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))) if self.gray: # Apply grayscale transformation. trans.append(lambda x: x.mean(dim=0)[None, :, :]) trans_comp = transforms.Compose(trans) # Image 1 and image 2 are k steps apart. dataset = ImagePairs(root=rope_path, transform=trans_comp, n_frames_apart=self.k) dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, drop_last=True) ############################################ # Load eval plan dataset planning_data_dir = self.planning_data_dir dataset_start = dset.ImageFolder(root=os.path.join(planning_data_dir, 'start'), transform=trans_comp) dataset_goal = dset.ImageFolder(root=os.path.join(planning_data_dir, 'goal'), transform=trans_comp) data_start_loader = torch.utils.data.DataLoader(dataset_start, batch_size=1, shuffle=False, num_workers=1, drop_last=True) data_goal_loader = torch.utils.data.DataLoader(dataset_goal, batch_size=1, shuffle=False, num_workers=1, drop_last=True) ############################################ for epoch in range(self.n_epochs + 1): self.G.train() self.D.train() self.Q.train() self.T.train() for num_iters, batch_data in enumerate(dataloader, 0): #print('going to sleep') #time.sleep(2) #print('waking up') # Real data o, _ = batch_data[0] o_next, _ = batch_data[1] bs = o.size(0) real_o.data.resize_(o.size()) real_o_next.data.resize_(o_next.size()) label.data.resize_(bs) real_o.data.copy_(o) real_o_next.data.copy_(o_next) # Plot real data: if epoch % 10 == 0: save_image(real_o.data, os.path.join(self.out_dir, 'real', 'real_preFcn_samples_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(real_o_next.data, os.path.join(self.out_dir, 'real', 'real_preFcn_samples_next_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) if self.fcn: real_o = self.apply_fcn_mse(o) # a grey scale img [-1,1]. 
each pixel has the probability of being a part of the object real_o_next = self.apply_fcn_mse(o_next) if real_o.abs().max() > 1: import ipdb; ipdb.set_trace() assert real_o.abs().max() <= 1 if epoch == 0: break ############################################ # D Loss (Update D) optimD.zero_grad() # Real data probs_real = self.D(real_o, real_o_next) label.data.fill_(1) # label of real data is 1 loss_real = criterionD(probs_real, label) loss_real.backward() # weight gradCalc of descriminator only (realData -> Descriminator -> Loss) # Fake data z, c, c_next = self._noise_sample(z, bs) # z,c have normal distribution; c_next has a gaussian distribution with mean = c and some default variance value fake_o, fake_o_next = self.G(z, c, c_next) probs_fake = self.D(fake_o.detach(), fake_o_next.detach()) label.data.fill_(0) # label of fake data is 0 loss_fake = criterionD(probs_fake, label) loss_fake.backward() # weight gradCalc of descriminator only (because of detach (randNoise -> generator -> detach -> fakeData -> Descriminator -> Loss) D_loss = loss_real + loss_fake optimD.step() # weight update of D ############################################ # G loss (Update G) optimG.zero_grad() probs_fake_2 = self.D(fake_o, fake_o_next) label.data.fill_(1) # the generator should make the discriminator output an 1 (i.e real) G_loss = criterionD(probs_fake_2, label) # Q loss (Update G, T, Q) ent_loss = -self.P.log_prob(c).mean(0) # always equals log(2), only size(c) is used crossent_loss = -self.Q.log_prob(fake_o, c).mean(0) # fake_o is forward through an NN Q that outputs (mu,var). creates a probability function into which c is placed. # then we have the probability of each c given it's o_fake. we take a mean. crossent_loss_next = -self.Q.log_prob(fake_o_next, c_next).mean(0) # trans_prob = self.T.get_prob(Variable(torch.eye(self.dis_c_dim).cuda())) ent_loss_next = -self.T.log_prob(c, None, c_next).mean(0) mi_loss = crossent_loss - ent_loss mi_loss_next = crossent_loss_next - ent_loss_next Q_loss = mi_loss + mi_loss_next # T loss (Update T) Q_c_given_x, Q_c_given_x_var = (i.detach() for i in self.Q.forward(real_o)) t_mu, t_variance = self.T.get_mu_and_var(c) t_diff = t_mu - c # Keep the variance small. 
# TODO: add loss on t_diff T_loss = (t_variance ** 2).sum(1).mean(0) (G_loss + self.infow * Q_loss + self.transw * T_loss).backward() optimG.step() ############################################# # Logging (iteration) if num_iters % 100 == 0: os.system('nvidia-settings -q gpucoretemp') #print('going to sleep') #time.sleep(20) os.system('nvidia-settings -q gpucoretemp') #print('waking up') self.log_dict['Dloss'] = D_loss.data[0] self.log_dict['Gloss'] = G_loss.data[0] self.log_dict['Qloss'] = Q_loss.data[0] self.log_dict['Tloss'] = T_loss.data[0] self.log_dict['mi_loss'] = mi_loss.data[0] self.log_dict['mi_loss_next'] = mi_loss_next.data[0] self.log_dict['ent_loss'] = ent_loss.data[0] self.log_dict['ent_loss_next'] = ent_loss_next.data[0] self.log_dict['crossent_loss'] = crossent_loss.data[0] self.log_dict['crossent_loss_next'] = crossent_loss_next.data[0] self.log_dict['D(real)'] = probs_real.data.mean() self.log_dict['D(fake)_before'] = probs_fake.data.mean() self.log_dict['D(fake)_after'] = probs_fake_2.data.mean() write_stats_from_var(self.log_dict, Q_c_given_x, 'Q_c_given_real_x_mu') write_stats_from_var(self.log_dict, Q_c_given_x, 'Q_c_given_real_x_mu', idx=0) write_stats_from_var(self.log_dict, Q_c_given_x_var, 'Q_c_given_real_x_variance') write_stats_from_var(self.log_dict, Q_c_given_x_var, 'Q_c_given_real_x_variance', idx=0) write_stats_from_var(self.log_dict, t_mu, 't_mu') write_stats_from_var(self.log_dict, t_mu, 't_mu', idx=0) write_stats_from_var(self.log_dict, t_diff, 't_diff') write_stats_from_var(self.log_dict, t_diff, 't_diff', idx=0) write_stats_from_var(self.log_dict, t_variance, 't_variance') write_stats_from_var(self.log_dict, t_variance, 't_variance', idx=0) print('\n#######################' '\nEpoch/Iter:%d/%d; ' '\nDloss: %.3f; ' '\nGloss: %.3f; ' '\nQloss: %.3f, %.3f; ' '\nT_loss: %.3f; ' '\nEnt: %.3f, %.3f; ' '\nCross Ent: %.3f, %.3f; ' '\nD(x): %.3f; ' '\nD(G(z)): b %.3f, a %.3f;' '\n0_Q_c_given_rand_x_mean: %.3f' '\n0_Q_c_given_rand_x_std: %.3f' '\n0_Q_c_given_fixed_x_std: %.3f' '\nt_diff_abs_mean: %.3f' '\nt_std_mean: %.3f' % (epoch, num_iters, D_loss.data[0], G_loss.data[0], mi_loss.data[0], mi_loss_next.data[0], T_loss.data[0], ent_loss.data[0], ent_loss_next.data[0], crossent_loss.data[0], crossent_loss_next.data[0], probs_real.data.mean(), probs_fake.data.mean(), probs_fake_2.data.mean(), Q_c_given_x[:, 0].data.mean(), Q_c_given_x[:, 0].data.std(), np.sqrt(Q_c_given_x_var[:, 0].data.mean()), t_diff.data.abs().mean(), t_variance.data.sqrt().mean(), )) ############################################# # Start evaluation from here. self.G.eval() self.D.eval() self.Q.eval() self.T.eval() ############################################# # Save images # Plot fake data x_save, x_next_save = self.G(*self.eval_input, self.get_c_next(epoch)) save_image(x_save.data, os.path.join(self.out_dir, 'gen', 'curr_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(x_next_save.data, os.path.join(self.out_dir, 'gen', 'next_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image((x_save - x_next_save).data, os.path.join(self.out_dir, 'gen', 'diff_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) # Plot real data. 
if epoch % 10 == 0: save_image(real_o.data, os.path.join(self.out_dir, 'real', 'real_samples_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(real_o_next.data, os.path.join(self.out_dir, 'real', 'real_samples_next_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) ############################################# # Save parameters if epoch % 5 == 0: if not os.path.exists('%s/var' % self.out_dir): os.makedirs('%s/var' % self.out_dir) for i in [self.G, self.D, self.Q, self.T]: torch.save(i.state_dict(), os.path.join(self.out_dir, 'var', '%s_%d' % (i.__class__.__name__, epoch, ))) ############################################# # Logging (epoch) for k, v in self.log_dict.items(): log_value(k, v, epoch) if epoch > 0: # tf logger # log_value('avg|x_next - x|', (x_next_save.data - x_save.data).abs().mean(dim=0).sum(), epoch + 1) # self.logger.histo_summary("Q_c_given_x", Q_c_given_x.data.cpu().numpy().reshape(-1), step=epoch) # self.logger.histo_summary("Q_c0_given_x", Q_c_given_x[:, 0].data.cpu().numpy(), step=epoch) # self.logger.histo_summary("Q_c_given_x_var", Q_c_given_x_var.cpu().numpy().reshape(-1), step=epoch) # self.logger.histo_summary("Q_c0_given_x_var", Q_c_given_x_var[:, 0].data.cpu().numpy(), step=epoch) # csv log with open(os.path.join(self.out_dir, 'progress.csv'), 'a') as csv_file: writer = csv.writer(csv_file) if epoch == 1: writer.writerow(["epoch"] + list(self.log_dict.keys())) writer.writerow(["%.3f" % _tmp for _tmp in [epoch] + list(self.log_dict.values())]) ############################################# # Do planning? if self.plan_length <= 0 or epoch not in self.planning_epoch: continue print("\n#######################" "\nPlanning") ############################################# # Showing plans on real images using best code. # Min l2 distance from start and goal real images. self.plan_hack(data_start_loader, data_goal_loader, epoch, 'L2') # Min classifier distance from start and goal real images. self.plan_hack(data_start_loader, data_goal_loader, epoch, 'classifier')
dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
distanceWeights = Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)

for i, key in enumerate(itos_deps):
    dhLogits[key] = 0.0
    if key == ("VERB", "obj", "NOUN"):
        dhLogits[key] = (10.0 if random() > 0.5 else -10.0)
    dhWeights.data[i] = dhLogits[key]

    originalDistanceWeights[key] = 0.0  # random()
    distanceWeights.data[i] = originalDistanceWeights[key]

assert float(dhWeights.abs().max()) > 5

words = list(vocab.iteritems())
words = sorted(words, key=lambda x: x[1], reverse=True)
itos = map(lambda x: x[0], words)
stoi = dict(zip(itos, range(len(itos))))

if len(itos) > 6:
    assert stoi[itos[5]] == 5

vocab_size = 50000

word_embeddings = torch.nn.Embedding(num_embeddings=vocab_size + 3, embedding_dim=1)  # .cuda()
pos_u_embeddings = torch.nn.Embedding(num_embeddings=len(posUni) + 3, embedding_dim=1)  # .cuda()
def forward(numericAndLineNumbers, surprisalTable=None, doDropout=True, batchSizeHere=1): global counter global crossEntropy global printHere global devLosses global hidden global beginning global beginning global beginning_chars if hidden is not None: hidden = Variable(hidden.data).detach() forRestart = bernoulli.sample() #print(forRestart) hiddenNew = startHidden(zeroHidden).unsqueeze(0) # print(hiddenNew.size(), hidden.size()) hidden = torch.where( forRestart.unsqueeze(0).unsqueeze(2) == 1, hiddenNew, hidden) beginning = torch.where( forRestart.unsqueeze(0) == 1, zeroBeginning, beginning) # beginning = forRestart.unsqueeze(0).unsqueeze(2) * zeroBeginning + (1-forRestart).unsqueeze(0).unsqueeze(2) * beginning beginning_chars = torch.where( forRestart.unsqueeze(0).unsqueeze(2) == 1, zeroBeginning_chars, beginning_chars) else: hidden = startHidden(zeroHidden).unsqueeze(0) # print(hidden.size()) beginning = zeroBeginning beginning_chars = zeroBeginning_chars numeric, numeric_chars, lineNumbers = numericAndLineNumbers numeric = torch.cat([beginning, numeric], dim=0) numeric_chars = torch.cat([beginning_chars, numeric_chars], dim=0) beginning = numeric[numeric.size()[0] - 1].view(1, args.batchSize) beginning_chars = numeric_chars[numeric_chars.size()[0] - 1].view( 1, args.batchSize, 16) loss = 0 wordNum = 0 lossWords = 0 policyGradientLoss = 0 baselineLoss = 0 optimizer.zero_grad() for c in components: c.zero_grad() # for q in parameters_made: # for p in q: # if p.grad is not None: # p.grad.fill_(0) totalQuality = 0.0 if True: inputTensor = numeric # so it will be horizon x args.batchSizeHere # print inputTensor # quit() inputTensorIn = inputTensor[:-1] inputTensorOut = inputTensor[1:] input_tensor_chars = Variable(numeric_chars[:-1], requires_grad=False) embedded_chars = input_tensor_chars.transpose(0, 2).transpose(2, 1) embedded_chars = embedded_chars.contiguous().view(16, -1) _, embedded_chars = char_composition( character_embeddings(embedded_chars), None) embedded_chars = embedded_chars[0].view(2, args.horizon, args.batchSize, args.char_enc_hidden_dim) embedded_chars = char_composition_output( torch.cat([embedded_chars[0], embedded_chars[1]], dim=2)) # embedded = word_embeddings(input_tensor) inputEmbeddings = word_pos_morph_embeddings( inputTensorIn.view(args.horizon, batchSizeHere)) #print(embedded.size()) # print("=========") # print(numeric[:,5]) # print(embedded[:,5,:].mean(dim=1)[numeric[:-1,5] == 3]) # print(embedded_chars[:,5,:].mean(dim=1)[numeric[:-1,5] == 3]) inputEmbeddings = torch.cat([inputEmbeddings, embedded_chars], dim=2) if doDropout: if args.input_dropoutRate > 0: inputEmbeddings = inputDropout(inputEmbeddings) if args.dropout_rate > 0: inputEmbeddings = dropout(inputEmbeddings) lossesWordTotal = [] sampled_vectors = [] logProbConditionals = [] output_vectors = [] scales = [] means = [] encodedEpsilonForAllSteps = standardNormal.sample().view( args.horizon, args.batchSize, -1) for i in range(inputEmbeddings.size()[0]): #print(i, hidden.abs().max()) output1, hidden = rnn_both(inputEmbeddings[i].unsqueeze(0), hidden) assert args.rnn_layers == 1 hidden = torch.clamp(hidden, min=-5, max=5) output = hidden if doDropout: if args.dropout_rate > 0: output = dropout(output) output_vectors.append(output) output = torch.cat(output_vectors, dim=0) # print(output.size()) word_logits = decoder(output) # print(word_logits.size()) # word_logits = word_logits.view(args.horizon, batchSizeHere, outVocabSize) word_softmax = logsoftmax(word_logits) # print(word_softmax) # 
print(word_softmax.size()) # print(torch.exp(word_softmax).sum(dim=2)) # print(word_softmax.size()) # print(word_logits.abs().max(), word_softmax.abs().max()) # print(word_softmax.size(), inputTensorOut.size()) lossesWord = lossModuleTest(word_softmax.view(-1, 50003), inputTensorOut.view(-1)) # print(inputTensorOut) # print(lossesWord.mean()) # lossesWordTotal.append(lossesWord) # lossesWord = torch.stack(lossesWordTotal, dim=0) lossWords = lossesWord.sum() loss = lossWords klLoss = 0 # print(sampledTotal.size(), logProbConditionalsTotal.size()) #for sampled, logProbConditional in zip(sampled_vectors, logProbConditionals): # n=1 #print(loss, logProbConditionalsTotal.mean(), logProbMarginal.mean()) klLoss = 0 # print(logProbConditional, logProbMarginal) # print(logStandardDeviationHidden) # klLoss = 0.5 * (-1 - 2 * (logStandardDeviationHidden) + torch.pow(meanHidden, 2) + torch.exp(2*logStandardDeviationHidden)) # klLoss = klLoss.sum(1) klLossSum = 0 if counter % 10 == 0: klLossMean = 0 print(BETA, args.flow_length, klLossMean, lossesWord.mean(), BETA * klLossMean + lossesWord.mean()) if float(klLossMean) != float(klLossMean): print(hidden.abs().max()) assert False, "got NA, abort" loss = loss + BETA * klLossSum # print lossesWord if surprisalTable is not None or True: lossesCPU = lossesWord.data.cpu().view((args.horizon), batchSizeHere).numpy() if True: for i in range(0, args.horizon ): #range(1,maxLength+1): # don't include i==0 j = 0 lineNum = int(lineNumbers[i][j]) print(i, itos_total[numeric[i + 1][j]], lossesCPU[i][j], lineNum) while lineNum >= len(completeData): completeData.append([[], 0]) completeData[lineNum][0].append(itos_total[numeric[i + 1][j]]) completeData[lineNum][1] += lossesCPU[i][j] if surprisalTable is not None: if printHere: print surprisalTable for j in range(batchSizeHere): for r in range(args.horizon): surprisalTable[r] += lossesCPU[ r, j] #.data.cpu().numpy()[0] wordNum = (args.horizon - 1) * batchSizeHere if wordNum == 0: print input_words print batchOrdered return 0, 0, 0, 0, 0 if printHere: print loss / wordNum print lossWords / wordNum print["CROSS ENTROPY", crossEntropy, exp(crossEntropy)] print("beta", BETA) crossEntropy = 0.99 * crossEntropy + 0.01 * (lossWords / wordNum).data.cpu().numpy() totalQuality = loss.data.cpu().numpy( ) # consists of lossesWord + lossesPOS numberOfWords = wordNum # probabilities = torch.sigmoid(dhWeights) # neg_entropy = torch.sum( probabilities * torch.log(probabilities) + (1-probabilities) * torch.log(1-probabilities)) # policy_related_loss = lr_policy * (entropy_weight * neg_entropy + policyGradientLoss) # lives on CPU loss = loss / batchSizeHere return loss, None, None, totalQuality, numberOfWords, 0
def forward(numeric, surprisalTable=None, doDropout=True, batchSizeHere=1): global counter global crossEntropy global printHere global devLosses global hidden global beginning global beginning global beginning_chars if hidden is not None: hidden = Variable(hidden.data).detach() forRestart = bernoulli.sample() #print(forRestart) sampled = startHidden(zeroHidden) hiddenNew = sampleToHidden(sampled).unsqueeze(0) # hidden = forRestart.unsqueeze(0).unsqueeze(2) * hiddenNew + (1-forRestart).unsqueeze(0).unsqueeze(2) * hidden # print(torch.where) hidden = torch.where( forRestart.unsqueeze(0).unsqueeze(2) == 1, hiddenNew, hidden) beginning = torch.where( forRestart.unsqueeze(0) == 1, zeroBeginning, beginning) # beginning = forRestart.unsqueeze(0).unsqueeze(2) * zeroBeginning + (1-forRestart).unsqueeze(0).unsqueeze(2) * beginning beginning_chars = torch.where( forRestart.unsqueeze(0).unsqueeze(2) == 1, zeroBeginning_chars, beginning_chars) else: sampled = startHidden(zeroHidden) hiddenNew = sampleToHidden(sampled).unsqueeze(0) hidden = hiddenNew beginning = zeroBeginning beginning_chars = zeroBeginning_chars numeric, numeric_chars = numeric numeric = torch.cat([beginning, numeric], dim=0) numeric_chars = torch.cat([beginning_chars, numeric_chars], dim=0) beginning = numeric[numeric.size()[0] - 1].view(1, args.batchSize) beginning_chars = numeric_chars[numeric_chars.size()[0] - 1].view( 1, args.batchSize, 16) loss = 0 wordNum = 0 lossWords = 0 policyGradientLoss = 0 baselineLoss = 0 optimizer.zero_grad() for c in components: c.zero_grad() # for q in parameters_made: # for p in q: # if p.grad is not None: # p.grad.fill_(0) totalQuality = 0.0 if True: inputTensor = numeric # so it will be horizon x args.batchSizeHere # print inputTensor # quit() inputTensorIn = inputTensor[:-1] inputTensorOut = inputTensor[1:] input_tensor_chars = Variable(numeric_chars[:-1], requires_grad=False) embedded_chars = input_tensor_chars.transpose(0, 2).transpose(2, 1) embedded_chars = embedded_chars.contiguous().view(16, -1) _, embedded_chars = char_composition( character_embeddings(embedded_chars), None) embedded_chars = embedded_chars[0].view(2, args.horizon, args.batchSize, args.char_enc_hidden_dim) embedded_chars = char_composition_output( torch.cat([embedded_chars[0], embedded_chars[1]], dim=2)) # embedded = word_embeddings(input_tensor) inputEmbeddings = word_pos_morph_embeddings( inputTensorIn.view(args.horizon, batchSizeHere)) #print(embedded.size()) # print("=========") # print(numeric[:,5]) # print(embedded[:,5,:].mean(dim=1)[numeric[:-1,5] == 3]) # print(embedded_chars[:,5,:].mean(dim=1)[numeric[:-1,5] == 3]) inputEmbeddings = torch.cat([inputEmbeddings, embedded_chars], dim=2) if doDropout: if args.input_dropoutRate > 0: inputEmbeddings = inputDropout(inputEmbeddings) if args.dropout_rate > 0: inputEmbeddings = dropout(inputEmbeddings) lossesWordTotal = [] sampled_vectors = [] logProbConditionals = [] output_vectors = [] scales = [] means = [] encodedEpsilonForAllSteps = standardNormal.sample().view( args.horizon, args.batchSize, -1) for i in range(inputEmbeddings.size()[0]): #print(i, hidden.abs().max()) output1, hidden = rnn_both(inputEmbeddings[i].unsqueeze(0), hidden) assert args.rnn_layers == 1 meanHidden = cellToMean(hidden[0]) klLoss = [None for _ in inputEmbeddings] logStandardDeviationHidden = hiddenToLogSDHidden(hidden[0]) # print(torch.exp(logStandardDeviationHidden)) scaleForDist = 1e-8 + torch.log( 1 + torch.exp(logStandardDeviationHidden)) scales.append(scaleForDist) means.append(meanHidden) # 
sampled = memoryDistribution.rsample() encodedEpsilon = encodedEpsilonForAllSteps[ i] #standardNormalPerStep.sample() # encodedEpsilon = torch.clamp(encodedEpsilon, min=-10, max=10) sampled = meanHidden + scaleForDist * encodedEpsilon sampled_vectors.append(sampled) # print(encodedEpsilon.abs().max()) hiddenNew = sampleToHidden(sampled).unsqueeze(0) # this also serves as the output for prediction hidden = hiddenNew hidden = torch.clamp(hidden, min=-5, max=5) # print(hidden.abs().max()) # output, _ = rnn_both(torch.cat([word_pos_morph_embeddings(torch.cuda.LongTensor([[2 for _ in range(args.batchSizeHere)]])), inputEmbeddings[halfSeqLen+1:]], dim=0), (hiddenNew, cellNew)) # output = torch.cat([output1[:halfSeqLen], output], dim=0) output = hiddenNew if doDropout: if args.dropout_rate > 0: output = dropout(output) output_vectors.append(output) meanHidden = torch.stack(means, dim=0) scaleForDist = torch.stack(scales, dim=0) sampled = torch.stack(sampled_vectors, dim=0) # memoryDistribution = torch.distributions.Normal(loc=meanHidden, scale=scaleForDist) # logProbConditional = memoryDistribution.log_prob(sampled).sum(dim=2) # # print("============") # print(logProbConditional) # print(sampled.size(), meanHidden.size(), scaleForDist.size()) logProbConditional = -(( (sampled - meanHidden)**2) / (2 * (scaleForDist**2))) - math.log( math.sqrt(2 * math.pi)) - torch.log(scaleForDist) logProbConditional = logProbConditional.sum(dim=2) # print(logProbConditional) batchSizeInflatedHere = args.batchSize * len(sampled_vectors) sampledTotal = sampled.view(batchSizeInflatedHere, -1) #print(logProbConditional.size()) # print(output_vectors[0].size()) output = torch.cat(output_vectors, dim=0) # print(output.size()) word_logits = decoder(output) # print(word_logits.size()) # word_logits = word_logits.view(args.horizon, batchSizeHere, outVocabSize) word_softmax = logsoftmax(word_logits) # print(word_softmax) # print(word_softmax.size()) # print(torch.exp(word_softmax).sum(dim=2)) # print(word_softmax.size()) # print(word_logits.abs().max(), word_softmax.abs().max()) # print(word_softmax.size(), inputTensorOut.size()) lossesWord = lossModuleTest(word_softmax.view(-1, 50003), inputTensorOut.view(-1)) # print(inputTensorOut) # print(lossesWord.mean()) # lossesWordTotal.append(lossesWord) # lossesWord = torch.stack(lossesWordTotal, dim=0) lossWords = lossesWord.sum() loss = lossWords klLoss = 0 logProbConditionalsTotal = logProbConditional.view( batchSizeInflatedHere) # print(sampledTotal.size(), logProbConditionalsTotal.size()) #for sampled, logProbConditional in zip(sampled_vectors, logProbConditionals): adjustment = [] epsilon = sampledTotal # n=1 plainPriorLogProb = standardNormal.log_prob(epsilon).sum( dim=1) #- (0.5 * torch.sum(sampled * sampled, dim=1)) logProbMarginal = plainPriorLogProb #print(loss, logProbConditionalsTotal.mean(), logProbMarginal.mean()) klLoss = (logProbConditionalsTotal - logProbMarginal) # print(logProbConditional, logProbMarginal) # print(logStandardDeviationHidden) # klLoss = 0.5 * (-1 - 2 * (logStandardDeviationHidden) + torch.pow(meanHidden, 2) + torch.exp(2*logStandardDeviationHidden)) # klLoss = klLoss.sum(1) klLossSum = klLoss.sum() if counter % 10 == 0: klLossMean = klLoss.mean() print(BETA, args.flow_length, klLossMean, lossesWord.mean(), BETA * klLossMean + lossesWord.mean()) if float(klLossMean) != float(klLossMean): print(hidden.abs().max()) assert False, "got NA, abort" loss = loss + BETA * klLossSum # print lossesWord if surprisalTable is not None or printHere: lossesCPU 
= lossesWord.data.cpu().view((args.horizon), batchSizeHere).numpy() if printHere: for i in range(0, args.horizon ): #range(1,maxLength+1): # don't include i==0 j = 0 print(i, itos_total[numeric[i + 1][j]], lossesCPU[i][j]) if surprisalTable is not None: if printHere: print surprisalTable for j in range(batchSizeHere): for r in range(args.horizon): surprisalTable[r] += lossesCPU[ r, j] #.data.cpu().numpy()[0] wordNum = (args.horizon - 1) * batchSizeHere if wordNum == 0: print input_words print batchOrdered return 0, 0, 0, 0, 0 if printHere: print loss / wordNum print lossWords / wordNum print["CROSS ENTROPY", crossEntropy, exp(crossEntropy)] print("beta", BETA) crossEntropy = 0.99 * crossEntropy + 0.01 * (lossWords / wordNum).data.cpu().numpy() totalQuality = loss.data.cpu().numpy( ) # consists of lossesWord + lossesPOS numberOfWords = wordNum # probabilities = torch.sigmoid(dhWeights) # neg_entropy = torch.sum( probabilities * torch.log(probabilities) + (1-probabilities) * torch.log(1-probabilities)) # policy_related_loss = lr_policy * (entropy_weight * neg_entropy + policyGradientLoss) # lives on CPU loss = loss / batchSizeHere return loss, None, None, totalQuality, numberOfWords, klLoss.mean( ) if not doDropout else None
def train(self): # Set up training. criterionD = nn.BCELoss().cuda() optimD = optim.Adam([{'params': self.D.parameters()}], lr=self.lr_d, betas=(0.5, 0.999)) optimG = optim.Adam([{'params': self.G.parameters()}, {'params': self.Q.parameters()}, {'params': self.T.parameters()}], lr=self.lr_g, betas=(0.5, 0.999)) ############################################ # Load rope dataset and apply transformations rope_path = os.path.realpath(self.data_dir) trans = [ transforms.Resize(64), transforms.CenterCrop(64), transforms.ToTensor(), # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ] if not self.fcn: # If fcn it will do the transformation to gray # and normalize in the loop. trans.append(transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))) if self.gray: # Apply grayscale transformation. trans.append(lambda x: x.mean(dim=0)[None, :, :]) trans_comp = transforms.Compose(trans) # Image 1 and image 2 are k steps apart. dataset = self.dataset(phase='train', mode='train') self.plan_length = dataset.spec.max_seq_len - 3 dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, drop_last=True) ############################################ # Load eval plan dataset planning_data_dir = self.planning_data_dir dataset_plan = self.dataset(phase='val', mode='plan') data_plan_loader = torch.utils.data.DataLoader(dataset_plan, batch_size=1, shuffle=False, num_workers=4, drop_last=True) # dataset_start = self.dataset(phase='val', mode='start') # dataset_goal = self.dataset(phase='val', mode='goal') # data_start_loader = torch.utils.data.DataLoader(dataset_start, # batch_size=1, # shuffle=False, # num_workers=1, # drop_last=True) # data_goal_loader = torch.utils.data.DataLoader(dataset_goal, # batch_size=1, # shuffle=False, # num_workers=1, # drop_last=True) ############################################ real_o = Variable(torch.FloatTensor(self.batch_size, 3, dataset.img_sz, dataset.img_sz).cuda(), requires_grad=False) real_o_next = Variable(torch.FloatTensor(self.batch_size, 3, dataset.img_sz, dataset.img_sz).cuda(), requires_grad=False) label = Variable(torch.FloatTensor(self.batch_size).cuda(), requires_grad=False) z = Variable(torch.FloatTensor(self.batch_size, self.rand_z_dim).cuda(), requires_grad=False) for epoch in range(self.n_epochs + 1): self.G.train() self.D.train() self.Q.train() self.T.train() for num_iters, batch_data in tqdm(enumerate(dataloader, 0)): # break # Real data o, _ = batch_data[0] o_next, _ = batch_data[1] bs = o.size(0) real_o.data.resize_(o.size()) real_o_next.data.resize_(o_next.size()) label.data.resize_(bs) real_o.data.copy_(o) real_o_next.data.copy_(o_next) if self.fcn: real_o = self.apply_fcn_mse(o) real_o_next = self.apply_fcn_mse(o_next) if real_o.abs().max() > 1: import ipdb; ipdb.set_trace() assert real_o.abs().max() <= 1 if epoch == 0: break ############################################ # D Loss (Update D) optimD.zero_grad() # Real data probs_real = self.D(real_o, real_o_next) label.data.fill_(1) loss_real = criterionD(probs_real, label) loss_real.backward() # Fake data z, c, c_next = self._noise_sample(z, bs) fake_o, fake_o_next = self.G(z, c, c_next) probs_fake = self.D(fake_o.detach(), fake_o_next.detach()) label.data.fill_(0) loss_fake = criterionD(probs_fake, label) loss_fake.backward() D_loss = loss_real + loss_fake optimD.step() ############################################ # G loss (Update G) optimG.zero_grad() probs_fake_2 = self.D(fake_o, fake_o_next) label.data.fill_(1) G_loss = criterionD(probs_fake_2, label) # Q 
loss (Update G, T, Q) ent_loss = -self.P.log_prob(c).mean(0) crossent_loss = -self.Q.log_prob(fake_o, c).mean(0) crossent_loss_next = -self.Q.log_prob(fake_o_next, c_next).mean(0) # trans_prob = self.T.get_prob(Variable(torch.eye(self.dis_c_dim).cuda())) ent_loss_next = -self.T.log_prob(c, None, c_next).mean(0) mi_loss = crossent_loss - ent_loss mi_loss_next = crossent_loss_next - ent_loss_next Q_loss = mi_loss + mi_loss_next # T loss (Update T) Q_c_given_x, Q_c_given_x_var = (i.detach() for i in self.Q.forward(real_o)) t_mu, t_variance = self.T.get_mu_and_var(c) t_diff = t_mu - c # Keep the variance small. # TODO: add loss on t_diff T_loss = (t_variance ** 2).sum(1).mean(0) (G_loss + self.infow * Q_loss + self.transw * T_loss).backward() optimG.step() ############################################# # Logging (iteration) if num_iters % 100 == 0: self.log_dict['Dloss'] = D_loss.item() self.log_dict['Gloss'] = G_loss.item() self.log_dict['Qloss'] = Q_loss.item() self.log_dict['Tloss'] = T_loss.item() self.log_dict['mi_loss'] = mi_loss.item() self.log_dict['mi_loss_next'] = mi_loss_next.item() self.log_dict['ent_loss'] = ent_loss.item() self.log_dict['ent_loss_next'] = ent_loss_next.item() self.log_dict['crossent_loss'] = crossent_loss.item() self.log_dict['crossent_loss_next'] = crossent_loss_next.item() self.log_dict['D(real)'] = probs_real.data.mean() self.log_dict['D(fake)_before'] = probs_fake.data.mean() self.log_dict['D(fake)_after'] = probs_fake_2.data.mean() write_stats_from_var(self.log_dict, Q_c_given_x, 'Q_c_given_real_x_mu') write_stats_from_var(self.log_dict, Q_c_given_x, 'Q_c_given_real_x_mu', idx=0) write_stats_from_var(self.log_dict, Q_c_given_x_var, 'Q_c_given_real_x_variance') write_stats_from_var(self.log_dict, Q_c_given_x_var, 'Q_c_given_real_x_variance', idx=0) write_stats_from_var(self.log_dict, t_mu, 't_mu') write_stats_from_var(self.log_dict, t_mu, 't_mu', idx=0) write_stats_from_var(self.log_dict, t_diff, 't_diff') write_stats_from_var(self.log_dict, t_diff, 't_diff', idx=0) write_stats_from_var(self.log_dict, t_variance, 't_variance') write_stats_from_var(self.log_dict, t_variance, 't_variance', idx=0) print('\n#######################' '\nEpoch/Iter:%d/%d; ' '\nDloss: %.3f; ' '\nGloss: %.3f; ' '\nQloss: %.3f, %.3f; ' '\nT_loss: %.3f; ' '\nEnt: %.3f, %.3f; ' '\nCross Ent: %.3f, %.3f; ' '\nD(x): %.3f; ' '\nD(G(z)): b %.3f, a %.3f;' '\n0_Q_c_given_rand_x_mean: %.3f' '\n0_Q_c_given_rand_x_std: %.3f' '\n0_Q_c_given_fixed_x_std: %.3f' '\nt_diff_abs_mean: %.3f' '\nt_std_mean: %.3f' % (epoch, num_iters, D_loss.item(), G_loss.item(), mi_loss.item(), mi_loss_next.item(), T_loss.item(), ent_loss.item(), ent_loss_next.item(), crossent_loss.item(), crossent_loss_next.item(), probs_real.data.mean(), probs_fake.data.mean(), probs_fake_2.data.mean(), Q_c_given_x[:, 0].cpu().numpy().mean(), Q_c_given_x[:, 0].cpu().numpy().std(), np.sqrt(Q_c_given_x_var[:, 0].cpu().numpy().mean()), t_diff.data.abs().mean(), t_variance.data.sqrt().mean(), )) ############################################# # Start evaluation from here. 
self.G.eval() self.D.eval() self.Q.eval() self.T.eval() ############################################# # Save images # Plot fake data x_save, x_next_save = self.G(*self.eval_input, self.get_c_next(epoch)) save_image(x_save.data, os.path.join(self.out_dir, 'gen', 'curr_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(x_next_save.data, os.path.join(self.out_dir, 'gen', 'next_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image((x_save - x_next_save).data, os.path.join(self.out_dir, 'gen', 'diff_samples_%03d.png' % epoch), nrow=self.test_num_codes, normalize=True) # Plot real data. if epoch % 10 == 0: save_image(real_o.data, os.path.join(self.out_dir, 'real', 'real_samples_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) save_image(real_o_next.data, os.path.join(self.out_dir, 'real', 'real_samples_next_%d.png' % epoch), nrow=self.test_num_codes, normalize=True) ############################################# # Save parameters if epoch % 5 == 0: if not os.path.exists('%s/var' % self.out_dir): os.makedirs('%s/var' % self.out_dir) for i in [self.G, self.D, self.Q, self.T]: torch.save(i.state_dict(), os.path.join(self.out_dir, 'var', '%s_%d' % (i.__class__.__name__, epoch, ))) ############################################# # Logging (epoch) for k, v in self.log_dict.items(): log_value(k, v, epoch) if epoch > 0: # tf logger # log_value('avg|x_next - x|', (x_next_save.data - x_save.data).abs().mean(dim=0).sum(), epoch + 1) # self.logger.histo_summary("Q_c_given_x", Q_c_given_x.data.cpu().numpy().reshape(-1), step=epoch) # self.logger.histo_summary("Q_c0_given_x", Q_c_given_x[:, 0].data.cpu().numpy(), step=epoch) # self.logger.histo_summary("Q_c_given_x_var", Q_c_given_x_var.cpu().numpy().reshape(-1), step=epoch) # self.logger.histo_summary("Q_c0_given_x_var", Q_c_given_x_var[:, 0].data.cpu().numpy(), step=epoch) # csv log with open(os.path.join(self.out_dir, 'progress.csv'), 'a') as csv_file: writer = csv.writer(csv_file) if epoch == 1: writer.writerow(["epoch"] + list(self.log_dict.keys())) writer.writerow(["%.3f" % _tmp for _tmp in [epoch] + list(self.log_dict.values())]) ############################################# # Do planning? if self.plan_length <= 0 or epoch not in self.planning_epoch: continue print("\n#######################" "\nPlanning") ############################################# # Showing plans on real images using best code. # Min l2 distance from start and goal real images. evaluator = EvalPSNR(2) plans = [] datas = [] for i, data in enumerate(data_plan_loader): plan = self.plan_hack(i, data[:, 0], data[:, -1], epoch, 'L2', data.shape[1] - 3, save=False) evaluator(plan[None].cpu().numpy(), data.cpu().numpy()) print(evaluator.PSNR(), evaluator.SSIM()) # if i < 4: # self.make_gif(torch.cat([data[0], plan.cpu()], dim=2), i, epoch) plans.append(plan.cpu()) datas.append(data[0]) if i == 3: for i in range(4): datas[i] = np.concatenate( [datas[i], np.zeros([100 - datas[i].shape[0]] + list(datas[i].shape[1:]))], 0) for i in range(4): plans[i] = np.concatenate([plans[i], torch.zeros([100 - plans[i].shape[0]] + list(plans[i].shape[1:]))], 0) data = np.concatenate(datas, 3) plan = np.concatenate(plans, 3) self.make_gif(torch.from_numpy(np.concatenate([data, plan], 2)), i, epoch, fps=4) import pdb; pdb.set_trace() print(('Test: [{0}/{1}]\t' 'PSNR {PSNR:.3f}' 'SSIM {SSIM:.3f}'.format( i, len(data_plan_loader), PSNR=evaluator.PSNR(), SSIM=evaluator.SSIM())))