def forward(self, frame, policy):
    # frame: [B,6,210,160]
    self.B = frame.size()[0]

    # Predict mask
    mask = self.predict_mask(frame)  # [B,2,210,160]
    mask = mask.repeat(1, 3, 1, 1)   # -> [B,6,210,160], matching the frame

    masked_frame = frame * mask

    log_dist_mask = policy.action_logdist(masked_frame)
    log_dist_true = policy.action_logdist(frame)

    action_dist_kl = torch.sum((log_dist_true - log_dist_mask) * torch.exp(log_dist_true), dim=1)  # [B]
    action_dist_kl = torch.mean(action_dist_kl)

    mask = mask.view(self.B, -1)
    mask_sum = torch.mean(torch.sum(mask, dim=1)) * .000001

    loss = action_dist_kl + mask_sum
    return loss, action_dist_kl, mask_sum
def forward(self, frame, DQNs):
    self.B = frame.size()[0]

    blurred_frame = self.blur_frame(frame)

    # Predict per-pixel precision (blur weighting)
    blur_weighting = self.predict_precision(frame)  # [B,1,480,640]
    blur_weighting = blur_weighting.repeat(1, 3, 1, 1)

    # Blend the sharp and blurred frames according to the predicted weighting
    mixed_frame = frame * blur_weighting + (1. - blur_weighting) * blurred_frame

    difs = []
    for i in range(len(DQNs)):
        q_mask = DQNs[i](mixed_frame)
        q_real = DQNs[i](frame)
        dif = torch.mean((q_mask - q_real) ** 2)  # [B,A] -> scalar
        difs.append(dif)
    difs = torch.stack(difs)
    dif = torch.mean(difs)

    blur_weighting = blur_weighting.view(self.B, -1)
    mask_sum = torch.mean(torch.sum(blur_weighting, dim=1)) * .0000001

    loss = dif + mask_sum
    return loss, dif, mask_sum
def calculate_loss(self, x, beta=1., average=False):
    '''
    :param x: input image(s)
    :param beta: a hyperparam for warmup
    :param average: whether to average the loss over the batch or not
    :return: value of the loss function
    '''
    # pass through VAE
    x_mean, x_logvar, z_q, z_q_mean, z_q_logvar = self.forward(x)

    # RE (reconstruction error)
    if self.args.input_type == 'binary':
        RE = log_Bernoulli(x, x_mean, dim=1)
    elif self.args.input_type == 'gray' or self.args.input_type == 'continuous':
        RE = -log_Logistic_256(x, x_mean, x_logvar, dim=1)
    else:
        raise Exception('Wrong input type!')

    # KL
    log_p_z = self.log_p_z(z_q)
    log_q_z = log_Normal_diag(z_q, z_q_mean, z_q_logvar, dim=1)
    KL = -(log_p_z - log_q_z)

    loss = -RE + beta * KL

    if average:
        loss = torch.mean(loss)
        RE = torch.mean(RE)
        KL = torch.mean(KL)

    return loss, RE, KL
def prepare_model():
    since = time.time()
    num_epochs = 1
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train']:
            mean = torch.zeros(3)
            std = torch.zeros(3)
            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data
                now_batch_size, c, h, w = inputs.shape
                mean += torch.sum(torch.mean(torch.mean(inputs, dim=3), dim=2), dim=0)
                std += torch.sum(torch.std(inputs.view(now_batch_size, c, h * w), dim=2), dim=0)
            print(mean / dataset_sizes['train'])
            print(std / dataset_sizes['train'])

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return
def evaluate(model):
    model.eval()
    running_loss = [0., 0., 0.]
    epoch_loss = 0.
    display_step = 100

    for batch_idx, (x, c) in enumerate(test_loader):
        x, c = x.to(device), c.to(device)

        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)
        loss = -(log_p + logdet)

        running_loss[0] += loss.item() / display_step
        running_loss[1] += log_p.item() / display_step
        running_loss[2] += logdet.item() / display_step
        epoch_loss += loss.item()

        if (batch_idx + 1) % display_step == 0:
            print('Global Step : {}, [{}, {}] [Log pdf, Log p(z), Log Det] : {}'
                  .format(global_step, epoch, batch_idx + 1, np.array(running_loss)))
            running_loss = [0., 0., 0.]

        del x, c, log_p, logdet, loss
    del running_loss

    epoch_loss /= len(test_loader)
    print('Evaluation Loss : {:.4f}'.format(epoch_loss))
    return epoch_loss
def forward(self, frame, DQNs):
    self.B = frame.size()[0]

    # Predict mask
    mask = self.predict_mask(frame)  # [B,2,210,160]
    mask = mask.repeat(1, 3, 1, 1)   # -> [B,6,210,160], matching the frame

    masked_frame = frame * mask

    difs = []
    for i in range(len(DQNs)):
        q_mask = DQNs[i](masked_frame)
        q_real = DQNs[i](frame)
        dif = torch.mean((q_mask - q_real) ** 2)  # [B,A] -> scalar
        difs.append(dif)
    difs = torch.stack(difs)
    dif = torch.mean(difs)

    mask = mask.view(self.B, -1)
    mask_sum = torch.mean(torch.sum(mask, dim=1)) * .0000001

    loss = dif + mask_sum
    return loss, dif, mask_sum
def predictive_elbo(self, x, k, s):
    # No pW or qW
    self.B = x.size()[0]  # batch size
    # k: number of z samples (particles P); s: number of W samples

    elbo1s = []
    for i in range(s):
        Ws, logpW, logqW = self.sample_W()  # _, [1], [1]
        mu, logvar = self.encode(x)  # [B,Z]
        z, logpz, logqz = self.sample_z(mu, logvar, k=k)  # [P,B,Z], [P,B]
        x_hat = self.decode(Ws, z)  # [P,B,X]
        logpx = log_bernoulli(x_hat, x)  # [P,B]

        elbo = logpx + logpz - logqz  # [P,B]
        if k > 1:
            # numerically stable log-mean-exp over the k particles
            max_ = torch.max(elbo, 0)[0]  # [B]
            elbo = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_  # [B]
        elbo1s.append(elbo)

    elbo1s = torch.stack(elbo1s)  # [S,B]
    if s > 1:
        # log-mean-exp over the s weight samples (was computed but unused in the original)
        max_ = torch.max(elbo1s, 0)[0]  # [B]
        elbo1s = torch.log(torch.mean(torch.exp(elbo1s - max_), 0)) + max_  # [B]

    elbo = torch.mean(elbo1s)  # [1]
    return elbo
def forward(self, z_seq, a_seq, term_seq):
    h = torch.zeros(1, self.h_size).cuda()

    z_losses = []
    term_losses = []
    for t in range(len(term_seq) - 1):
        inter = self.encode_az(a_seq[t], z_seq[t])
        h = self.update_h(h, inter)
        z_pred, term_pred = self.predict_output(h, inter)

        z_loss = torch.mean((z_seq[t + 1] - z_pred) ** 2)
        term_loss = F.binary_cross_entropy_with_logits(input=term_pred, target=term_seq[t + 1])
        z_losses.append(z_loss)
        term_losses.append(term_loss)

    z_loss = torch.mean(torch.stack(z_losses))
    term_loss = torch.mean(torch.stack(term_losses))
    loss = z_loss + term_loss
    return loss, z_loss, term_loss
def forward(self, x, k=1):
    self.B = x.size()[0]
    mu, logvar = self.encode(x)
    z, logpz, logqz = self.sample(mu, logvar, k=k)  # [P,B,Z]
    x_hat = self.decode(z)  # [P*B,X]
    x_hat = x_hat.view(k, self.B, -1)
    logpx = log_bernoulli(x_hat, x)  # [P,B]

    elbo = logpx + logpz - logqz  # [P,B]
    if k > 1:
        max_ = torch.max(elbo, 0)[0]  # [B]
        elbo = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_  # [B]
    elbo = torch.mean(elbo)  # [1]

    # for printing
    logpx = torch.mean(logpx)
    logpz = torch.mean(logpz)
    logqz = torch.mean(logqz)
    self.x_hat_sigmoid = F.sigmoid(x_hat)

    return elbo, logpx, logpz, logqz
def get_paf_and_heatmap(model, img_raw, scale_search, param_stride=8, box_size=368):
    multiplier = [scale * box_size / img_raw.shape[0] for scale in scale_search]

    heatmap_avg = torch.zeros((len(multiplier), 19, img_raw.shape[0], img_raw.shape[1])).cuda()
    paf_avg = torch.zeros((len(multiplier), 38, img_raw.shape[0], img_raw.shape[1])).cuda()

    for i, scale in enumerate(multiplier):
        img_test = cv2.resize(img_raw, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        img_test_pad, pad = pad_right_down_corner(img_test, param_stride, param_stride)
        img_test_pad = np.transpose(np.float32(img_test_pad[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5

        feed = Variable(torch.from_numpy(img_test_pad)).cuda()
        output1, output2 = model(feed)

        # upsample both network outputs back to the original image resolution
        heatmap = nn.UpsamplingBilinear2d((img_raw.shape[0], img_raw.shape[1])).cuda()(output2)
        paf = nn.UpsamplingBilinear2d((img_raw.shape[0], img_raw.shape[1])).cuda()(output1)

        heatmap_avg[i] = heatmap[0].data
        paf_avg[i] = paf[0].data

    heatmap_avg = torch.transpose(torch.transpose(torch.squeeze(torch.mean(heatmap_avg, 0)), 0, 1), 1, 2).cuda()
    heatmap_avg = heatmap_avg.cpu().numpy()
    paf_avg = torch.transpose(torch.transpose(torch.squeeze(torch.mean(paf_avg, 0)), 0, 1), 1, 2).cuda()
    paf_avg = paf_avg.cpu().numpy()

    return paf_avg, heatmap_avg
def forward(self, x, k):
    self.B = x.size()[0]  # batch size

    # Encode
    mu, logvar = self.encode(x)  # [B,Z]
    z, logpz, logqz = self.sample(mu, logvar, k=k)  # [P,B,Z], [P,B]

    # Decode
    x_hat = self.decode(z)  # [P,B,X]
    logpx = log_bernoulli(x_hat, x)  # [P,B]

    # Compute elbo
    elbo = logpx + logpz - logqz  # [P,B]
    if k > 1:
        max_ = torch.max(elbo, 0)[0]  # [B]
        elbo = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_  # [B]

    elbo = torch.mean(elbo)  # [1]

    logpx = torch.mean(logpx)
    logpz = torch.mean(logpz)
    logqz = torch.mean(logqz)

    return elbo, logpx, logpz, logqz
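# Hedged aside (not from the original repo): the k > 1 branch above is the
# numerically stable log-mean-exp used by importance-weighted bounds (IWAE).
# A minimal standalone check of that identity against torch.logsumexp:
import torch

def log_mean_exp(x, dim=0):
    max_ = torch.max(x, dim)[0]
    return torch.log(torch.mean(torch.exp(x - max_), dim)) + max_

x = torch.randn(5, 3)
assert torch.allclose(log_mean_exp(x, 0),
                      torch.logsumexp(x, 0) - torch.log(torch.tensor(5.)))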
def train_step(self, state_batch, mcts_probs, winner_batch, lr):
    """perform a training step"""
    # wrap in Variable
    if self.use_gpu:
        state_batch = Variable(torch.FloatTensor(state_batch).cuda())
        mcts_probs = Variable(torch.FloatTensor(mcts_probs).cuda())
        winner_batch = Variable(torch.FloatTensor(winner_batch).cuda())
    else:
        state_batch = Variable(torch.FloatTensor(state_batch))
        mcts_probs = Variable(torch.FloatTensor(mcts_probs))
        winner_batch = Variable(torch.FloatTensor(winner_batch))

    # zero the parameter gradients
    self.optimizer.zero_grad()
    # set learning rate
    set_learning_rate(self.optimizer, lr)

    # forward
    log_act_probs, value = self.policy_value_net(state_batch)

    # define the loss = (z - v)^2 - pi^T * log(p) + c||theta||^2
    # Note: the L2 penalty is incorporated in the optimizer
    value_loss = F.mse_loss(value.view(-1), winner_batch)
    policy_loss = -torch.mean(torch.sum(mcts_probs * log_act_probs, 1))
    loss = value_loss + policy_loss

    # backward and optimize
    loss.backward()
    self.optimizer.step()

    # calc policy entropy, for monitoring only
    entropy = -torch.mean(
        torch.sum(torch.exp(log_act_probs) * log_act_probs, 1))
    return loss.data[0], entropy.data[0]
def forward(self, x, k=1):
    self.k = k
    self.B = x.size()[0]

    mu, logvar = self.encode(x)
    z, logpz, logqz = self.sample(mu, logvar, k=k)
    x_hat, logpW, logqW = self.decode(z)
    logpx = log_bernoulli(x_hat, x)  # [P,B]

    elbo = logpx + logpz - logqz + (logpW - logqW) * .00000001  # [P,B]

    if k > 1:
        max_ = torch.max(elbo, 0)[0]  # [B]
        elbo = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_  # [B]

    elbo = torch.mean(elbo)  # [1]

    # for printing
    logpx = torch.mean(logpx)
    logpz = torch.mean(logpz)
    logqz = torch.mean(logqz)
    self.x_hat_sigmoid = F.sigmoid(x_hat)

    return elbo, logpx, logpz, logqz, logpW, logqW
def forward(self, x, k=1):
    self.B = x.size()[0]

    mu, logvar = self.encode(x)
    z, logpz, logqz = self.sample(mu, logvar, k=k)
    x_mean, x_logvar = self.decode(z)  # [P,B,1]
    logpx = lognormal_decoder(x, x_mean, x_logvar)  # [P,B]

    elbo = logpx + logpz - logqz  # [P,B]

    if k > 1:
        max_ = torch.max(elbo, 0)[0]  # [B]
        elbo = torch.log(torch.mean(torch.exp(elbo - max_), 0)) + max_  # [B]

    elbo = torch.mean(elbo)  # [1]

    # for printing
    logpx = torch.mean(logpx)
    logpz = torch.mean(logpz)
    logqz = torch.mean(logqz)

    return elbo, logpx, logpz, logqz
def singleTagLoss(pred_tag, keypoints):
    """
    associative embedding loss for one image
    """
    eps = 1e-6
    tags = []
    pull = 0
    for i in keypoints:
        tmp = []
        for j in i:
            if j[1] > 0:
                tmp.append(pred_tag[j[0]])
        if len(tmp) == 0:
            continue
        tmp = torch.stack(tmp)
        tags.append(torch.mean(tmp, dim=0))
        pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp)) ** 2)

    if len(tags) == 0:
        return make_input(torch.zeros([1]).float()), make_input(torch.zeros([1]).float())

    tags = torch.stack(tags)[:, 0]

    num = tags.size()[0]
    size = (num, num, tags.size()[1])
    A = tags.unsqueeze(dim=1).expand(*size)
    B = A.permute(1, 0, 2)

    diff = A - B
    diff = torch.pow(diff, 2).sum(dim=2)[:, :, 0]
    push = torch.exp(-diff)
    push = (torch.sum(push) - num)

    return push / ((num - 1) * num + eps) * 0.5, pull / (num + eps)
def global_pooling(x):
    # input x: [n, c, h, w]
    # output s: [n, c]
    s = torch.mean(x, dim=-1)  # mean over w
    s = torch.mean(s, dim=-1)  # mean over h
    return s
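# A quick sanity check of global_pooling above (nothing beyond torch assumed):
# averaging over both spatial dims matches adaptive average pooling to 1x1.
import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 4, 5)
s = global_pooling(x)
assert s.shape == (2, 8)
assert torch.allclose(s, F.adaptive_avg_pool2d(x, 1).view(2, 8))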
def angle_length_loss(y_pred, y_true, weights):
    y_true = y_true.permute(0, 2, 3, 1)
    y_pred = y_pred.permute(0, 2, 3, 1)
    weights = weights.permute(0, 2, 3, 1)

    nr_of_classes = int(y_true.shape[-1] / 3.)
    scores = torch.zeros(nr_of_classes)

    for idx in range(nr_of_classes):
        y_pred_bund = y_pred[:, :, :, (idx * 3):(idx * 3) + 3].contiguous()
        y_true_bund = y_true[:, :, :, (idx * 3):(idx * 3) + 3].contiguous()  # [x,y,z,3]
        weights_bund = weights[:, :, :, (idx * 3)].contiguous()  # [x,y,z]

        angles = PytorchUtils.angle_last_dim(y_pred_bund, y_true_bund)
        angles_weighted = angles / weights_bund

        # norm lengths to 0-1 to be more comparable to angles? -> peaks are already around 1 -> ok
        lengths = (torch.norm(y_pred_bund, 2, -1) - torch.norm(y_true_bund, 2, -1)) ** 2
        lengths_weighted = lengths * weights_bund

        # Divide by weights.max, otherwise lengths would be way bigger.
        # Flip the angle term to make it a minimization problem.
        combined = -angles_weighted + lengths_weighted / weights_bund.max()
        scores[idx] = torch.mean(combined)

    return torch.mean(scores)
def forward(self, frame, policies):
    self.B = frame.size()[0]

    # Predict mask
    pre_mask = self.predict_mask_nosigmoid(frame)
    mask = F.sigmoid(pre_mask)
    masked_frame = frame * mask

    kls = []
    for i in range(len(policies)):
        policy = policies[i]
        log_dist_mask = policy.action_logdist(masked_frame)
        log_dist_true = policy.action_logdist(frame)

        action_dist_kl = torch.sum((log_dist_true - log_dist_mask) * torch.exp(log_dist_true), dim=1)  # [B]
        action_dist_kl = torch.mean(action_dist_kl)  # over batch
        kls.append(action_dist_kl)

    kls = torch.stack(kls)  # [policies]
    # average over policies (the original averaged only the last policy's KL)
    action_dist_kl = torch.mean(kls)  # [1]

    pre_mask = pre_mask.view(self.B, -1)
    mask_cost = torch.abs(pre_mask + 20)
    mask_cost = torch.mean(mask_cost) * .01

    loss = action_dist_kl + mask_cost
    return loss, action_dist_kl, mask_cost
def encode_and_logprob(self, x):
    for i in range(len(self.first_half_weights) - 1):
        x = self.act_func(self.first_half_weights[i](x))

    mean = self.first_half_weights[-1](x)
    logvar = self.q_logvar(x)

    # Sample with the reparameterization trick
    eps = Variable(torch.randn(1, self.z_size))
    x = (torch.exp(.5 * logvar) * eps) + mean

    logq = -torch.mean(logvar.sum(1) + ((x - mean).pow(2) / torch.exp(logvar)).sum(1))
    logp = torch.mean(x.pow(2).sum(1))

    return x, logq + logp
def calculate_loss(self, x, beta=1., average=False):
    # pass through VAE
    x_mean, x_logvar, z1_q, z1_q_mean, z1_q_logvar, z2_q, z2_q_mean, z2_q_logvar, \
        z1_p_mean, z1_p_logvar = self.forward(x)

    # RE
    if self.args.input_type == 'binary':
        RE = log_Bernoulli(x, x_mean, dim=1)
    elif self.args.input_type == 'gray' or self.args.input_type == 'continuous':
        RE = -log_Logistic_256(x, x_mean, x_logvar, dim=1)
    else:
        raise Exception('Wrong input type!')

    # KL
    log_p_z1 = log_Normal_diag(z1_q, z1_p_mean, z1_p_logvar, dim=1)
    log_q_z1 = log_Normal_diag(z1_q, z1_q_mean, z1_q_logvar, dim=1)
    log_p_z2 = self.log_p_z2(z2_q)
    log_q_z2 = log_Normal_diag(z2_q, z2_q_mean, z2_q_logvar, dim=1)
    KL = -(log_p_z1 + log_p_z2 - log_q_z1 - log_q_z2)

    # full loss
    loss = -RE + beta * KL

    if average:
        loss = torch.mean(loss)
        RE = torch.mean(RE)
        KL = torch.mean(KL)

    return loss, RE, KL
def setUp(self, length=3, factor=10, count=1000000, seed=None, dtype=torch.float64, device=None):
    '''Set up the test values.

    Args:
        length: Size of the vector.
        factor: To multiply the mean and standard deviation.
        count: Number of samples for Monte-Carlo estimation.
        seed: Seed for the random number generator.
        dtype: The data type.
        device: In which device.
    '''
    if seed is not None:
        torch.manual_seed(seed)

    # input mean and covariance
    self.mu = torch.randn(length, dtype=dtype, device=device) * factor
    self.cov = rand.definite(length, dtype=dtype, device=device,
                             positive=True, semi=False, norm=factor ** 2)
    self.var = self.cov.diag()

    # Monte-Carlo estimation of the output mean and variance
    normal = torch.distributions.MultivariateNormal(self.mu, self.cov)
    out_samples = normal.sample((count,)).clamp_(min=0.0)
    self.mc_mu = torch.mean(out_samples, dim=0)
    self.mc_var = torch.var(out_samples, dim=0)

    normal = torch.distributions.MultivariateNormal(self.mu * 0, self.cov)
    out_samples = normal.sample((count,)).clamp_(min=0.0)
    mean = torch.mean(out_samples, dim=0)
    self.mc_zm_cov = cov(out_samples)
    self.mc_zm_corr = self.mc_zm_cov + outer(mean)
def cos_sim(in0, in1):
    in0_norm = normalize_tensor(in0)
    in1_norm = normalize_tensor(in1)
    N = in0.size()[0]
    X = in0.size()[2]
    Y = in0.size()[3]

    return torch.mean(
        torch.mean(
            torch.sum(in0_norm * in1_norm, dim=1).view(N, 1, X, Y),
            dim=2).view(N, 1, 1, Y),
        dim=3).view(N)
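# Hedged note: the nested means above reduce to the spatial mean of the
# channel-wise cosine-similarity map. A simpler equivalent sketch, assuming
# normalize_tensor L2-normalizes along the channel dim (dim=1):
import torch

def cos_sim_simple(in0, in1, eps=1e-10):
    in0_norm = in0 / (in0.norm(dim=1, keepdim=True) + eps)
    in1_norm = in1 / (in1.norm(dim=1, keepdim=True) + eps)
    return torch.sum(in0_norm * in1_norm, dim=1).mean(dim=(1, 2))  # [N]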
def torch_pearsonr(x, y):
    # https://github.com/pytorch/pytorch/issues/1254
    mean_x = torch.mean(x)
    mean_y = torch.mean(y)
    xm = x.sub(mean_x)
    ym = y.sub(mean_y)
    r_num = xm.dot(ym)
    r_den = torch.norm(xm, 2) * torch.norm(ym, 2)
    r_val = r_num / r_den
    return r_val
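# Minimal usage sketch (assumption: x and y are 1-D tensors), sanity-checked
# against numpy's corrcoef:
import torch
import numpy as np

x = torch.randn(100)
y = 2 * x + torch.randn(100) * 0.1
r = torch_pearsonr(x, y)
assert abs(r.item() - np.corrcoef(x.numpy(), y.numpy())[0, 1]) < 1e-5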
def forward(self, x=None, warmup=1., inf_net=None):
    # x: [B,3,112,112]
    outputs = {}

    if inf_net is None:
        mu, logvar = self.inference_net(x)
    else:
        mu, logvar = inf_net.inference_net(x)

    z, logpz, logqz = self.sample(mu, logvar)
    z_dec = self.z_to_dec(z)
    B = z_dec.shape[0]

    # Decode Image
    x_hat = self.image_decoder(z_dec)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)

    # add uniform noise so the Beta likelihood is evaluated away from {0,1}
    x_noise = torch.clamp(x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
                          min=1e-5, max=1 - 1e-5)
    logpx = beta.log_prob(x_noise)  # [B,3,112,112]
    logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

    log_ws = logpx + logpz - logqz

    outputs['logpx'] = torch.mean(logpx)
    outputs['x_recon'] = alpha
    outputs['welbo'] = torch.mean(logpx + warmup * (logpz - logqz))
    outputs['elbo'] = torch.mean(log_ws)
    outputs['logws'] = log_ws
    outputs['z'] = z
    outputs['logpz'] = torch.mean(logpz)
    outputs['logqz'] = torch.mean(logqz)
    outputs['logvar'] = logvar

    return outputs
def ganLossG(self, d_fake):
    if config.GAN_SETTING == "WGAN":
        loss = -torch.mean(d_fake)
    elif config.GAN_SETTING == "LSGAN":
        loss = 0.5 * torch.mean((d_fake - 1) ** 2)
    else:
        # vanilla GAN: the generator wants the discriminator to output "real"
        real_labels = Variable(torch.ones(d_fake.size(0)).cuda())
        loss = self.bce_loss(d_fake, real_labels)
    return loss
def MVNError(output, gt):
    # mean-variance normalize both tensors, then compute the RMSE between them
    outMean = torch.mean(output)
    outStd = torch.std(output)
    output = (output - outMean) / outStd

    gtMean = torch.mean(gt)
    gtStd = torch.std(gt)
    gt = (gt - gtMean) / gtStd

    d = output - gt
    diff = torch.sqrt(torch.mean(d * d))
    return diff
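# Usage sketch: because both inputs are mean-variance normalized first,
# MVNError is invariant to a positive affine transform of the prediction
# (a quick property check, assuming float tensors):
import torch

gt = torch.randn(1000)
pred = torch.randn(1000)
e1 = MVNError(pred, gt)
e2 = MVNError(3.0 * pred + 5.0, gt)
assert torch.allclose(e1, e2, atol=1e-5)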
def forward(self, x=None, warmup=1., inf_net=None):
    outputs = {}
    B = x.shape[0]

    if inf_net is None:
        z, logits = self.q.sample(x)
    else:
        z, logqz = inf_net.sample(x)

    # Analytic KL between Bernoulli(q) and the uniform Bernoulli(.5) prior
    probs_q = torch.sigmoid(logits)
    probs_q = torch.clamp(probs_q, min=.00000001, max=.9999999)
    probs_p = torch.ones(B, self.z_size).cuda() * .5
    KL = probs_q * torch.log(probs_q / probs_p) + (1 - probs_q) * torch.log((1 - probs_q) / (1 - probs_p))
    KL = torch.sum(KL, dim=1)

    # Decode Image
    x_hat = self.generator.forward(z)
    alpha = torch.sigmoid(x_hat)
    beta = Beta(alpha * self.beta_scale, (1. - alpha) * self.beta_scale)

    # add uniform noise so the Beta likelihood is evaluated away from {0,1}
    x_noise = torch.clamp(x + torch.FloatTensor(x.shape).uniform_(0., 1. / 256.).cuda(),
                          min=1e-5, max=1 - 1e-5)
    logpx = beta.log_prob(x_noise)  # [B,3,H,W]
    logpx = torch.sum(logpx.view(B, -1), 1)  # [B]

    log_ws = logpx - KL

    outputs['logpx'] = torch.mean(logpx)
    outputs['x_recon'] = alpha
    # KL enters with a minus sign so welbo matches elbo at warmup=1
    # (the original added it)
    outputs['welbo'] = torch.mean(logpx - warmup * KL)
    outputs['elbo'] = torch.mean(log_ws)
    outputs['logws'] = log_ws
    outputs['z'] = z
    outputs['logpz'] = torch.zeros(1)
    outputs['logqz'] = torch.mean(KL)

    return outputs
def compute_accuracy(self, x, y, dataset):
    if dataset == 'CelebA':
        # multi-label attributes: threshold each sigmoid output independently
        x = F.sigmoid(x)
        predicted = self.threshold(x)
        correct = (predicted == y).float()
        accuracy = torch.mean(correct, dim=0) * 100.0
    else:
        # single-label classification: take the argmax
        _, predicted = torch.max(x, dim=1)
        correct = (predicted == y).float()
        accuracy = torch.mean(correct) * 100.0
    return accuracy
def n_mpjpe(predicted, target):
    """
    Normalized MPJPE (scale only), adapted from:
    https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py
    """
    assert predicted.shape == target.shape

    norm_predicted = torch.mean(torch.sum(predicted ** 2, dim=3, keepdim=True), dim=2, keepdim=True)
    norm_target = torch.mean(torch.sum(target * predicted, dim=3, keepdim=True), dim=2, keepdim=True)
    scale = norm_target / norm_predicted
    return mpjpe(scale * predicted, target)
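# For context, a minimal mpjpe helper as commonly defined alongside the
# function above (an assumption; the original definition is not shown here):
# the mean Euclidean distance per joint after the optimal scaling.
import torch

def mpjpe(predicted, target):
    return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1))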
def get_image_mean(img_tensor):
    """
    :param img_tensor: torch.FloatTensor with shape [3, 480, 640]
    :return: torch.FloatTensor with shape [3]
    """
    img_mean = torch.mean(img_tensor, 1)  # mean over height
    img_mean = torch.mean(img_mean, 1)    # mean over width
    return img_mean
def mse(self, pred, label, weight):
    loss = weight * (pred - label) ** 2
    return torch.mean(loss)
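# Worked example of the weighted MSE above (restated standalone, since the
# method belongs to a class not shown here):
import torch

pred = torch.tensor([1.0, 2.0, 3.0])
label = torch.tensor([1.0, 0.0, 3.0])
weight = torch.tensor([1.0, 0.5, 0.0])
loss = torch.mean(weight * (pred - label) ** 2)  # mean([0, 2, 0]) = 2/3
assert torch.allclose(loss, torch.tensor(2.0 / 3.0))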
def train_NAVAR(data, maxlags=5, hidden_nodes=256, dropout=0, epochs=200, learning_rate=1e-4,
                batch_size=300, lambda1=0, val_proportion=0.0, weight_decay=0, check_every=1000,
                hidden_layers=1, normalize=True, split_timeseries=False, lstm=False):
    """
    Trains a Neural Additive Vector Autoregression (NAVAR) model on time series data
    and scores the potential causal links between variables.

    Args:
        data: ndarray
            T (time points) x N (variables) input data
        maxlags: int
            Maximum number of time lags
        hidden_nodes: int
            Number of hidden nodes in each layer
        dropout: float
            Dropout probability in the hidden layers
        epochs: int
            Number of training epochs
        learning_rate: float
            Learning rate for Adam optimizer
        batch_size: int
            The size of the training batches
        lambda1: float
            Parameter for penalty to the contributions
        val_proportion: float
            Proportion of the dataset used for validation
        weight_decay: float
            Weight decay used in neural networks
        check_every: int
            Every 'check_every'th epoch we print training progress
        hidden_layers: int
            Number of hidden layers in the neural networks
        normalize: bool
            Indicates whether we should normalize every variable
        split_timeseries: int
            If the original time series consists of multiple shorter time series,
            this argument should indicate the original time series length.
            Otherwise should be zero.
        lstm: bool
            Indicates whether we should use the LSTM model (instead of MLP)

    Returns:
        causal_matrix: ndarray
            N (variables) x N (variables) array containing the scores for every
            causal link. causal_matrix[i, j] indicates the score for potential link i -> j
        contributions: ndarray
            N^2 x training_examples array containing the contributions from and to
            every variable for every sample in the training set
        loss_val: float
            Validation loss of the model after training
    """
    # T is the number of time steps, N the number of variables
    T, N = data.shape

    # initialize the NAVAR model
    if lstm:
        model = NAVARLSTM(N, hidden_nodes, maxlags, dropout=dropout, hidden_layers=hidden_layers)
    else:
        model = NAVAR(N, hidden_nodes, maxlags, dropout=dropout, hidden_layers=hidden_layers)

    # use Mean Squared Error and the Adam optimizer
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # obtain the training and validation data
    dataset = DataLoader(data, maxlags, normalize=normalize, val_proportion=val_proportion,
                         split_timeseries=split_timeseries, lstm=lstm)
    X_train, Y_train = dataset.train_Xs, dataset.train_Ys
    X_val, Y_val = dataset.val_Xs, dataset.val_Ys

    # push model and data to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()
        X_train = X_train.cuda()
        Y_train = Y_train.cuda()
        if X_val is not None:
            X_val = X_val.cuda()
            Y_val = Y_val.cuda()

    num_training_samples = X_train.shape[0]
    total_loss = 0
    loss_val = 0

    # start of training loop
    batch_counter = 0
    for t in range(1, epochs + 1):
        # obtain batches
        batch_indeces_list = []
        if batch_size < num_training_samples:
            batch_perm = np.random.choice(num_training_samples, size=num_training_samples, replace=False)
            for i in range(int(num_training_samples / batch_size) + 1):
                start = i * batch_size
                batch_i = batch_perm[start:start + batch_size]
                if len(batch_i) > 0:
                    batch_indeces_list.append(batch_perm[start:start + batch_size])
        else:
            batch_indeces_list = [np.arange(num_training_samples)]

        for batch_indeces in batch_indeces_list:
            batch_counter += 1
            X_batch = X_train[batch_indeces]
            Y_batch = Y_train[batch_indeces]

            # forward pass to calculate predictions and contributions
            predictions, contributions = model(X_batch)

            # calculate the loss
            loss_pred = criterion(predictions, Y_batch)
            loss_l1 = (lambda1 / N) * torch.mean(torch.sum(torch.abs(contributions), dim=1))
            loss = loss_pred + loss_l1
            total_loss += loss

            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # every 'check_every' epochs we calculate and print the validation loss
        if t % check_every == 0:
            model.eval()
            if val_proportion > 0.0:
                val_pred, val_contributions = model(X_val)
                loss_val = criterion(val_pred, Y_val)
            model.train()
            print(f'iteration {t}. Loss: {total_loss/batch_counter} Val loss: {loss_val}')
            total_loss = 0
            batch_counter = 0

    # use the trained model to calculate the causal scores
    model.eval()
    # X_train can be too big to fit in the GPU; then this call raises
    # "RuntimeError: CUDA out of memory."
    if split_timeseries:
        y_pred, contributions = model(X_train[:batch_size])
    else:
        y_pred, contributions = model(X_train)
    causal_matrix = torch.std(contributions, dim=0).view(N, N).detach().cpu().numpy()

    return causal_matrix, contributions, loss_val
                            generator=generator1, discriminator=discriminator1,
                            EP=EP, arguments=arguments, criterion=criterion,
                            conditional_gen=False, source_num=1)
    elif tr_method == 'ML':
        if loss == 'Euclidean':
            criterion = nn.MSELoss()
        elif loss == 'Poisson':
            eps = 1e-20
            criterion = lambda lam, tar: torch.mean(-tar * torch.log(lam + eps) + lam)

        generative_trainer(loader_mix=loader_mix, train_loader=loader1,
                           generator=generator1, EP=EP,
                           arguments=arguments, criterion=criterion,
                           conditional_gen=False)

    # save models
    savepath = os.path.join(os.getcwd(), 'model_parameters')
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    ut.save_models([generator1], [discriminator1], exp_info, savepath, arguments)
def top2gating(logits: Tensor, capacity_factor: float,
               min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Implements Top2Gating on logits."""
    # everything is in fp32 in this function
    gates = F.softmax(logits, dim=1)

    capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity))

    # Create a mask for 1st's expert per token
    indices1_s = torch.argmax(gates, dim=1)
    num_experts = int(gates.shape[1])
    mask1 = F.one_hot(indices1_s, num_classes=num_experts)

    # Create a mask for 2nd's expert per token using Gumbel-max trick
    # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/
    logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device)
    # Replace top-expert with min value
    logits_except1 = logits_w_noise.masked_fill(mask1.bool(), float("-inf"))
    indices2_s = torch.argmax(logits_except1, dim=1)
    mask2 = F.one_hot(indices2_s, num_classes=num_experts)

    # Compute locations in capacity buffer
    locations1 = torch.cumsum(mask1, dim=0) - 1
    locations2 = torch.cumsum(mask2, dim=0) - 1
    # Update 2nd's location by accounting for locations of 1st
    locations2 += torch.sum(mask1, dim=0, keepdim=True)

    # gating decisions
    exp_counts = torch.sum(mask1, dim=0).detach().to('cpu')

    # Compute l_aux
    me = torch.mean(gates, dim=0)
    ce = torch.mean(mask1.float(), dim=0)
    l_aux = torch.mean(me * ce) * num_experts * num_experts

    # Remove locations outside capacity from mask
    mask1 *= torch.lt(locations1, capacity)
    mask2 *= torch.lt(locations2, capacity)

    # Store the capacity location for each token
    locations1_s = torch.sum(locations1 * mask1, dim=1)
    locations2_s = torch.sum(locations2 * mask2, dim=1)

    # Normalize gate probabilities
    mask1_float = mask1.float()
    mask2_float = mask2.float()
    gates1_s = einsum("se,se->s", gates, mask1_float)
    gates2_s = einsum("se,se->s", gates, mask2_float)
    denom_s = gates1_s + gates2_s
    # Avoid divide-by-zero
    denom_s = torch.clamp(denom_s, min=torch.finfo(denom_s.dtype).eps)
    gates1_s /= denom_s
    gates2_s /= denom_s

    # Calculate combine_weights and dispatch_mask
    gates1 = einsum("s,se->se", gates1_s, mask1_float)
    gates2 = einsum("s,se->se", gates2_s, mask2_float)
    locations1_sc = _one_hot_to_float(locations1_s, capacity)
    locations2_sc = _one_hot_to_float(locations2_s, capacity)
    combine1_sec = einsum("se,sc->sec", gates1, locations1_sc)
    combine2_sec = einsum("se,sc->sec", gates2, locations2_sc)
    combine_weights = combine1_sec + combine2_sec
    dispatch_mask = combine_weights.bool()

    return l_aux, combine_weights, dispatch_mask, exp_counts
def _run(data_file_path, dataset, data_generator, num_batches, vocabulary_size,
         context_size, num_noise_words, vec_dim, num_epochs, batch_size, lr,
         model_ver, vec_combine_method, save_all, generate_plot,
         model_ver_is_dbow, gpu):
    if torch.cuda.is_available():
        torch.cuda.set_device(gpu)
        print(torch.cuda.current_device())

    if model_ver_is_dbow:
        model = DBOW(vec_dim, num_docs=len(dataset), num_words=vocabulary_size)
    else:
        model = DM(vec_dim, num_docs=len(dataset), num_words=vocabulary_size)

    cost_func = NegativeSampling()
    optimizer = Adam(params=model.parameters(), lr=lr)

    if torch.cuda.is_available():
        model.cuda()

    print("Dataset comprised of {:d} documents.".format(len(dataset)))
    print("Vocabulary size is {:d}.\n".format(vocabulary_size))
    print("Training started.")

    best_loss = float("inf")
    prev_model_file_path = None

    for epoch_i in range(num_epochs):
        epoch_start_time = time.time()
        loss = []

        for batch_i in range(num_batches):
            batch = next(data_generator)
            if torch.cuda.is_available():
                batch.cuda_()

            if model_ver_is_dbow:
                x = model.forward(batch.doc_ids, batch.target_noise_ids)
            else:
                x = model.forward(batch.context_ids, batch.doc_ids, batch.target_noise_ids)

            x = cost_func.forward(x)
            loss.append(x.item())

            model.zero_grad()
            x.backward()
            optimizer.step()
            _print_progress(epoch_i, batch_i, num_batches)

        # end of epoch
        loss = torch.mean(torch.FloatTensor(loss))
        is_best_loss = loss < best_loss
        best_loss = min(loss, best_loss)

        state = {
            'epoch': epoch_i + 1,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer_state_dict': optimizer.state_dict()
        }

        prev_model_file_path = save_training_state(
            data_file_path, model_ver, vec_combine_method, context_size,
            num_noise_words, vec_dim, batch_size, lr, epoch_i, loss, state,
            save_all, generate_plot, is_best_loss, prev_model_file_path,
            model_ver_is_dbow)

        epoch_total_time = round(time.time() - epoch_start_time)
        print(" ({:d}s) - loss: {:.4f}".format(epoch_total_time, loss))
def get_loss(y_, y):
    # torch.mean() computes the mean over all elements
    # (the original referenced the global y_train instead of the argument y)
    return torch.mean((y_ - y) ** 2)
def log_test_results(self, n_epoch):
    '''
    Calculates errors on the test dataset and logs them using TBVisualiser
    '''
    test_start_time = time.time()

    # Set maximum number of batches to test
    max_test_iters = min(self.cfg['General'][0]['max_test_iters'], len(self.dataloader_test))
    if max_test_iters == 0:
        max_test_iters = 1

    # Set model to evaluation mode
    self.model.is_train = False
    self.model.eval()

    # Evaluate max_test_iters batches
    test_errors = OrderedDict()
    test_scalars = OrderedDict()
    for i_test_batch, test_batch in enumerate(self.dataloader_test):
        if i_test_batch == max_test_iters:
            break

        reconstr_image, _ = self.model.forward(test_batch)

        errors = self.model.get_current_errors()
        # Save errors from current batch
        for label, error in errors.items():
            if label in test_errors:
                test_errors[label] += error
            else:
                test_errors[label] = error

        scalars = self.model.get_current_scalars()
        for label, scalar in scalars.items():
            if label in test_scalars:
                if not label in ['au_predictions', 'au_ground_truths']:
                    test_scalars[label] += scalar
                else:
                    test_scalars[label] = torch.cat((test_scalars[label], scalar), dim=0)
            else:
                test_scalars[label] = scalar

        # Log first image and reconstructed image
        if i_test_batch == 0:
            original_image = visualisation_utils.tensor2im(test_batch[0][0])
            reconstr_image = visualisation_utils.tensor2im(reconstr_image[0])
            self.tb_visualiser.log_images(
                {'Original Image': original_image,
                 'Reconstructed Image': reconstr_image},
                self.total_steps, is_train=False)

    # Normalise errors
    for label in test_errors.keys():
        test_errors[label] /= max_test_iters

    # Log errors to tensorboard
    test_duration = time.time() - test_start_time
    self.tb_visualiser.plot_scalars(test_errors, self.total_steps, is_train=False)
    visualisation_utils.print_current_test_errors(n_epoch, test_duration, test_errors)

    # Normalise scalars
    for label in test_scalars.keys():
        if not label in ['au_predictions', 'au_ground_truths']:
            test_scalars[label] /= max_test_iters

    # Calculate F1 scores
    f1_denominator = 2 * test_scalars['true_pos'] + test_scalars['false_neg'] \
        + test_scalars['false_pos']
    f1_denominator += torch.finfo(torch.float).tiny
    individual_f1_scores = 2 * test_scalars['true_pos'] / f1_denominator

    all_tp = torch.sum(test_scalars['true_pos'])
    all_fn = torch.sum(test_scalars['false_neg'])
    all_fp = torch.sum(test_scalars['false_pos']) + torch.finfo(torch.float).tiny
    average_f1_score = 2 * all_tp / (2 * all_tp + all_fn + all_fp)

    f1_scores = {
        'F1 Score for AU1': individual_f1_scores[0].item(),
        'F1 Score for AU2': individual_f1_scores[1].item(),
        'F1 Score for AU4': individual_f1_scores[2].item(),
        'F1 Score for AU5': individual_f1_scores[3].item(),
        'F1 Score for AU6': individual_f1_scores[4].item(),
        'F1 Score for AU9': individual_f1_scores[5].item(),
        'F1 Score for AU12': individual_f1_scores[6].item(),
        'F1 Score for AU15': individual_f1_scores[7].item(),
        'F1 Score for AU17': individual_f1_scores[8].item(),
        'F1 Score for AU20': individual_f1_scores[9].item(),
        'F1 Score for AU25': individual_f1_scores[10].item(),
        'F1 Score for AU26': individual_f1_scores[11].item(),
        'Average of F1 Scores': torch.mean(individual_f1_scores).item(),
        'Overall F1 Score': average_f1_score.item()
    }

    # Calculate Accuracies
    num_predictions = test_scalars['true_pos'] + test_scalars['true_neg'] \
        + test_scalars['false_pos'] + test_scalars['false_neg']
    accuracy_tensor = (test_scalars['true_pos'] + test_scalars['true_neg']) / num_predictions
    accuracies = {
        'Accuracy for AU1': accuracy_tensor[0].item(),
        'Accuracy for AU2': accuracy_tensor[1].item(),
        'Accuracy for AU4': accuracy_tensor[2].item(),
        'Accuracy for AU5': accuracy_tensor[3].item(),
        'Accuracy for AU6': accuracy_tensor[4].item(),
        'Accuracy for AU9': accuracy_tensor[5].item(),
        'Accuracy for AU12': accuracy_tensor[6].item(),
        'Accuracy for AU15': accuracy_tensor[7].item(),
        'Accuracy for AU17': accuracy_tensor[8].item(),
        'Accuracy for AU20': accuracy_tensor[9].item(),
        'Accuracy for AU25': accuracy_tensor[10].item(),
        'Accuracy for AU26': accuracy_tensor[11].item(),
        'Average Accuracy': torch.mean(accuracy_tensor).item()
    }

    # Calculate 2AFC Scores
    component_2afc_tensor, average_2afc = metric_utils.compute_2AFC(
        test_scalars['au_ground_truths'], test_scalars['au_predictions'])
    values_2afc = {
        '2AFC Score for AU1': component_2afc_tensor[0].item(),
        '2AFC Score for AU2': component_2afc_tensor[1].item(),
        '2AFC Score for AU4': component_2afc_tensor[2].item(),
        '2AFC Score for AU5': component_2afc_tensor[3].item(),
        '2AFC Score for AU6': component_2afc_tensor[4].item(),
        '2AFC Score for AU9': component_2afc_tensor[5].item(),
        '2AFC Score for AU12': component_2afc_tensor[6].item(),
        '2AFC Score for AU15': component_2afc_tensor[7].item(),
        '2AFC Score for AU17': component_2afc_tensor[8].item(),
        '2AFC Score for AU20': component_2afc_tensor[9].item(),
        '2AFC Score for AU25': component_2afc_tensor[10].item(),
        '2AFC Score for AU26': component_2afc_tensor[11].item(),
        'Overall 2AFC Score': average_2afc.item()
    }

    del test_scalars['true_pos']
    del test_scalars['true_neg']
    del test_scalars['false_pos']
    del test_scalars['false_neg']
    del test_scalars['au_ground_truths']
    del test_scalars['au_predictions']

    test_scalars['F1 Scores'] = f1_scores
    test_scalars['Accuracies'] = accuracies
    test_scalars['2AFC Scores'] = values_2afc

    # Log Metrics
    self.tb_visualiser.plot_scalars(test_scalars, self.total_steps, is_train=False)

    # Set model back to training mode
    self.model.is_train = True
    self.model.train()
def train(fv, model_name, criterion, balance=False, batchsize=64, size=0):
    if fv == "matlab":
        dloader = matloader
    else:
        dloader = fvloader

    train_data = dloader.load_train_data(size=size, balance=balance, fv=fv)
    val_data = dloader.load_val_data(size=size, fv=fv)
    test_data = dloader.load_test_data(size=size, fv=fv)

    model_dir = os.path.join("./modeldir/%s" % model_name)
    model_pth = os.path.join(model_dir, "model.pth")
    writer = tensorboardX.SummaryWriter(model_dir)

    if os.path.exists(model_pth):
        print("------load model--------")
        model = torch.load(model_pth)
    else:
        model = Transformer(fv).cuda()
        model = nn.DataParallel(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001)

    epochs = 2000
    step = 1
    val_step = 1
    max_f1 = 0.0
    for e in range(epochs):
        model.train()
        print("------epoch--------", e)
        st = time.time()

        train_shuffle = fvloader.shuffle(train_data)
        for item in fvloader.batch_fv(train_shuffle, batch=batchsize):
            model.zero_grad()
            genes, nimgs, labels, timesteps = item
            inputs = torch.from_numpy(nimgs).type(torch.cuda.FloatTensor)
            gt = torch.from_numpy(labels).type(torch.cuda.FloatTensor)

            pd = model(inputs)
            all_loss = criterion(pd, gt)
            label_loss = torch.mean(all_loss, dim=0)  # per-label loss
            loss = torch.mean(label_loss)

            train_pd = torch_util.threshold_tensor_batch(pd)
            np_pd = train_pd.data.cpu().numpy()
            torch_util.torch_metrics(labels, np_pd, writer, step, mode="train")

            writer.add_scalar("train loss", loss, step)
            loss.backward()
            optimizer.step()
            step += 1

        et = time.time()
        writer.add_scalar("train time", et - st, e)
        for param_group in optimizer.param_groups:
            writer.add_scalar("lr", param_group['lr'], e)

        if e % 1 == 0:
            val_loss, val_f1 = run_val(model, dloader, val_data, writer, val_step, criterion)
            val_step += 1
            if e == 0:
                start_loss = val_loss
                min_loss = start_loss

            # save the best model whenever validation loss or F1 improves
            if min_loss > val_loss or max_f1 < val_f1:
                if min_loss > val_loss:
                    print("---------save best----------", "loss", val_loss)
                    min_loss = val_loss
                if max_f1 < val_f1:
                    print("---------save best----------", "f1", val_f1)
                    max_f1 = val_f1
                torch.save(model, model_pth)
                result = os.path.join(model_dir, "result_epoch%d.txt" % e)
                run_test(model, dloader, test_data, result)
def __init__(self, tensor):
    """tensor is taken as a sample to calculate the mean and std"""
    self.mean = torch.mean(tensor)
    self.std = torch.std(tensor)
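# Usage sketch (hypothetical: only __init__ is shown, so the class name
# "Normalizer" here is an illustrative stand-in): the stored statistics can
# standardize new data drawn from the same distribution.
import torch

sample = torch.randn(10000) * 3.0 + 1.0
norm = Normalizer(sample)  # hypothetical name for the class above
z = (sample - norm.mean) / norm.std
assert abs(z.mean().item()) < 0.1 and abs(z.std().item() - 1.0) < 0.1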
def __call__(self, input, target):
    # Average across channels in order to get the final score
    class_idx = torch.arange(input.shape[1]).to(input.device)
    input = torch.argmax(input, axis=1) == class_idx[:, None, None, None, None]
    input = input.transpose(1, 0)
    return torch.mean(compute_per_channel_dice(input, target, epsilon=self.epsilon))
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, experiment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    total_steps = (len(datasets["train"]) // args.batch_size) * args.epochs
    print("Train dataset size", total_steps)

    def kl_anneal_function(anneal_function, step):
        if anneal_function == 'identity':
            return 1
        if anneal_function == 'linear':
            if args.warmup is None:
                return 1 - (total_steps - step) / total_steps
            else:
                warmup_steps = (total_steps / args.epochs) * args.warmup
                return 1 - (warmup_steps - step) / warmup_steps if step < warmup_steps else 1.0

    ReconLoss = torch.nn.NLLLoss(size_average=False, ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step):
        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).data[0]].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        recon_loss = ReconLoss(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step)

        return recon_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor

    step = 0
    for epoch in range(args.epochs):
        for split in splits:
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                recon_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'], batch['length'],
                                                         mean, logv, args.anneal_function, step)

                if split == 'train':
                    loss = (recon_loss + KL_weight * KL_loss) / batch_size
                else:
                    # report complete elbo when validation
                    loss = (recon_loss + KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['negELBO'] = torch.cat((tracker['negELBO'], loss.data.unsqueeze(0)))

                if args.tensorboard_logging:
                    neg_elbo = (recon_loss + KL_loss) / batch_size
                    writer.add_scalar("%s/Negative_ELBO" % split.upper(), neg_elbo.data[0],
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/Recon_Loss" % split.upper(), recon_loss.data[0] / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL_Loss" % split.upper(), KL_loss.data[0] / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL_Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    logger.info("%s Batch %04d/%i, Loss %9.4f, Recon-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                                % (split.upper(), iteration, len(data_loader) - 1, loss.data[0],
                                   recon_loss.data[0] / batch_size, KL_loss.data[0] / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            logger.info("%s Epoch %02d/%i, Mean Negative ELBO %9.4f"
                        % (split.upper(), epoch, args.epochs, torch.mean(tracker['negELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/NegELBO" % split.upper(),
                                  torch.mean(tracker['negELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % (epoch))
                torch.save(model.state_dict(), checkpoint_path)
                logger.info("Model saved at %s" % checkpoint_path)

    if args.num_samples:
        torch.cuda.empty_cache()
        model.eval()
        with torch.no_grad():
            print(f"Generating {args.num_samples} samples")
            generations, _ = model.inference(n=args.num_samples)
            vocab = datasets["train"].i2w
            print("Sampled latent codes from z ~ N(0, I), generated sentences:")
            for i, generation in enumerate(generations, start=1):
                sentence = [vocab[str(word.item())] for word in generation]
                print(f"{i}:", " ".join(sentence))
def message(self, x_j, pos_i, pos_j):
    if x_j is None:
        x_j = pos_j

    # Get offsets from points
    offsets = self.offset_mlp(x_j)

    # Reshape offsets to shape [SUM n_neighbors(n_points), n_kpoints, kp_dim]
    offsets = offsets.view((-1, self.num_points, self.kernel_dim))

    # Rescale offset for this layer
    offsets *= self.kp_extent

    # Center every neighborhood [SUM n_neighbors(n_points), dim]
    neighbors = pos_j - pos_i

    # Number of points
    n_points = neighbors.shape[0]

    # Get kernel points and add offsets
    K_points = self.kernel
    K_points = K_points.float().view((-1, 3)).unsqueeze(0)
    K_points_deformed = K_points + offsets
    self.internal_losses["permissive_loss"] = permissive_loss(K_points_deformed, self.radius)

    # Get all difference matrices [SUM n_neighbors(n_points), n_kpoints, dim]
    neighbors = neighbors.unsqueeze(1)
    differences = neighbors - K_points_deformed
    sq_distances = (differences ** 2).sum(-1)

    # Get kernel point influences [n_points, n_kpoints, n_neighbors]
    if self.KP_influence == "constant":
        # Every point gets an influence of 1.
        all_weights = torch.ones_like(sq_distances)
    elif self.KP_influence == "linear":
        # Influence decreases linearly with the distance, reaching zero when d = kp_extent.
        all_weights = 1.0 - (torch.sqrt(sq_distances) / (self.kp_extent))
        all_weights[all_weights < 0] = 0.0
    elif self.KP_influence == "square":
        # Influence decreases with the squared distance, reaching zero when d = kp_extent.
        all_weights = 1.0 - (sq_distances / (self.kp_extent ** 2))
        all_weights[all_weights < 0] = 0.0
    else:
        raise ValueError("Unknown influence function type (config.KP_influence)")

    neighbors_1nn = torch.argmin(sq_distances, dim=-1)

    # Fitting Loss
    sq_distances_min = sq_distances.gather(1, neighbors_1nn.unsqueeze(-1))
    sq_distances_min /= self.radius ** 2  # To be independent of the layer
    self.internal_losses["fitting_loss"] = torch.mean(sq_distances_min)

    weights = all_weights.gather(1, neighbors_1nn.unsqueeze(-1))

    K_weights = self.kernel_weight
    K_weights = torch.index_select(K_weights, 0, neighbors_1nn.view(-1)).view(
        (n_points, self.in_features, self.out_features))

    # Get the features of each neighborhood [n_points, n_neighbors, in_fdim]
    features = x_j

    # Apply distance weights [n_points, n_kpoints, in_fdim]
    weighted_features = torch.einsum("nb, nc -> nc", weights, features)

    # Apply network weights [n_kpoints, n_points, out_fdim]
    out_features = torch.einsum("na, nac -> nc", weighted_features, K_weights)
    return out_features
def permissive_loss(deformed_kpoints, radius):
    """Penalizes deformed kernel points that move outside of the radius
    defined for the convolution.
    """
    norm_deformed_normalized = F.normalize(deformed_kpoints) / float(radius)
    return torch.mean(norm_deformed_normalized[norm_deformed_normalized > 1.0])
trainLoss = 0.
train_estimation_relative_error = 0
for batch, [trainX, trainY] in enumerate(tqdm(trainLoader, ncols=10)):
    nbatch += 1
    trainX = trainX.to(device)
    trainY = trainY.to(device)
    trainX = torch.sin(trainX)
    trainY = OneHotLabel(trainY, n_output)
    batch_train_repeatX, batch_train_repeatY = KMeansRepeatX(trainX, repeat_n), KMeansRepeatY(trainY, repeat_n)

    pre = net.forward(batch_train_repeatX, train=True, BP=BP_train)
    loss = CELoss(pre, batch_train_repeatY)
    trainLoss += torch.mean(loss).detach().cpu().numpy()

    if BP_train:
        net.backward(batch_train_repeatY, BP_train)
        net.update_params(learning_rate, BP_train)
    else:
        net.backward(loss, BP_train)
        net.update_params(learning_rate, BP_train, method)

trainLoss /= nbatch
train_loss.append(trainLoss)
print('train epoch:{} loss:{}'.format(epoch, trainLoss))

if ((epoch + 1) % 100 == 0):
    learning_rate *= 0.8
    print('Learning rate decayed to {}'.format(learning_rate))

loss = 0.
N = 0.
n = 0.
def __call__(self, rnn_output):
    if self.beta == .0:
        return .0
    l2 = torch.sqrt(torch.sum(torch.pow(rnn_output, 2), dim=-1))
    l2 = self.beta * torch.mean(torch.pow(l2[:, 1:] - l2[:, :-1], 2))
    return l2
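# Quick property check of the penalty above (restated standalone, same math):
# a sequence whose per-step L2 norm is constant over time incurs zero cost,
# since consecutive norms cancel.
import torch

rnn_output = torch.ones(2, 5, 4)  # [batch, time, hidden]; constant norm over time
l2 = torch.sqrt(torch.sum(rnn_output ** 2, dim=-1))
penalty = torch.mean((l2[:, 1:] - l2[:, :-1]) ** 2)
assert penalty.item() == 0.0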
# adjust learning rate
if i % DECAY_EPOCH == 0:
    LR_D = LR_D / 2
    LR_G = LR_G / 2
    adjust_learning_rate(optim_D, LR_D)
    adjust_learning_rate(optim_G, LR_G)

artist_paintings = artist_works()
G_ideas = torch.randn(BATCH_SIZE, N_IDEAS)
G_paintings = G(G_ideas)

prob_artist0 = D(artist_paintings)  # D tries to increase this
prob_artist1 = D(G_paintings)       # D tries to reduce this

D_loss = -torch.mean(torch.log(prob_artist0) + torch.log(1. - prob_artist1))
G_loss = -torch.mean(torch.log(prob_artist1))

if i % 3 == 0:
    optim_D.zero_grad()
    D_loss.backward(retain_graph=True)  # reuse computational graph
    optim_D.step()

optim_G.zero_grad()
G_loss.backward()
optim_G.step()

if i % 100 == 0:  # plotting
    plt.cla()
    plt.plot(PAINT_POINTS[0], G_paintings.data.numpy()[0],
             c='#4AD631', lw=3, label='Generated painting')
    plt.plot(PAINT_POINTS[0], 2 * np.power(PAINT_POINTS[0], 2) + 1,
             c='#74BCFF', lw=3, label='upper bound')
def train(self):
    svhn_iter = iter(self.svhn_loader)
    mnist_iter = iter(self.mnist_loader)
    iter_per_epoch = min(len(svhn_iter), len(mnist_iter))

    fixed_svhn = self.np_to_var(svhn_iter.next()[0])
    fixed_mnist = self.np_to_var(mnist_iter.next()[0])

    for step in range(self.train_iters + 1):
        # reset the data iterators at the end of each epoch
        if (step + 1) % iter_per_epoch == 0:
            mnist_iter = iter(self.mnist_loader)
            svhn_iter = iter(self.svhn_loader)

        svhn, _ = svhn_iter.next()
        svhn = self.np_to_var(svhn)
        mnist, _ = mnist_iter.next()
        mnist = self.np_to_var(mnist)

        #============ train D ============#
        # real images
        self.reset_grad()
        out = self.Dx(mnist)
        Dx_loss = torch.mean((out - 1) ** 2)
        out = self.Dy(svhn)
        Dy_loss = torch.mean((out - 1) ** 2)
        D_real_loss = Dx_loss + Dy_loss
        D_real_loss.backward()
        self.D_optim.step()

        # fake images
        self.reset_grad()
        out = self.Dy(self.Gxy(mnist))
        Dy_loss = torch.mean(out ** 2)
        out = self.Dx(self.Gyx(svhn))
        Dx_loss = torch.mean(out ** 2)
        D_fake_loss = Dx_loss + Dy_loss
        D_fake_loss.backward()
        self.D_optim.step()

        #============ train G ============#
        # mnist-svhn-mnist cycle
        self.reset_grad()
        mnist_to_svhn = self.Gxy(mnist)
        out = self.Dy(mnist_to_svhn)
        mnist_reconst = self.Gyx(mnist_to_svhn)
        # adversarial loss
        G_loss = torch.mean((out - 1) ** 2)
        # cycle-consistency loss
        G_loss += torch.mean((mnist - mnist_reconst) ** 2)
        G_loss.backward()
        self.G_optim.step()

        # svhn-mnist-svhn cycle
        self.reset_grad()
        svhn_to_mnist = self.Gyx(svhn)
        out = self.Dx(svhn_to_mnist)
        svhn_reconst = self.Gxy(svhn_to_mnist)
        # adversarial loss
        G_loss = torch.mean((out - 1) ** 2)
        # cycle-consistency loss
        G_loss += torch.mean((svhn - svhn_reconst) ** 2)
        G_loss.backward()
        self.G_optim.step()

        # print logs
        if (step + 1) % self.log_step == 0:
            print('Step [%d/%d], d_real_loss: %.4f, d_fake_loss: %.4f, g_loss: %.4f'
                  % (step + 1, self.train_iters,
                     D_real_loss.data[0], D_fake_loss.data[0], G_loss.data[0]))

        if (step + 1) % self.sample_step == 0:
            fake_mnist = self.Gyx(fixed_svhn)
            fake_svhn = self.Gxy(fixed_mnist)
            mnist, fake_mnist = self.var_to_np(fixed_mnist), self.var_to_np(fake_mnist)
            svhn, fake_svhn = self.var_to_np(fixed_svhn), self.var_to_np(fake_svhn)

            merged = merge_images(mnist, fake_svhn)
            path = os.path.join(self.sample_path, 'sample-%d-m-s.png' % (step + 1))
            scipy.misc.imsave(path, merged)
            print('Saved %s' % path)

            merged = merge_images(svhn, fake_mnist)
            path = os.path.join(self.sample_path, 'sample-%d-s-m.png' % (step + 1))
            scipy.misc.imsave(path, merged)
            print('Saved %s' % path)

        if (step + 1) % 5000 == 0:
            Gxy_path = os.path.join(self.model_path, 'Gxy-%d.pkl' % (step + 1))
            Gyx_path = os.path.join(self.model_path, 'Gyx-%d.pkl' % (step + 1))
            Dx_path = os.path.join(self.model_path, 'Dx-%d.pkl' % (step + 1))
            Dy_path = os.path.join(self.model_path, 'Dy-%d.pkl' % (step + 1))
            torch.save(self.Gxy.state_dict(), Gxy_path)
            torch.save(self.Gyx.state_dict(), Gyx_path)
            torch.save(self.Dx.state_dict(), Dx_path)
            torch.save(self.Dy.state_dict(), Dy_path)
def mean(input, dim):
    return th.mean(input, dim=dim)
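# Thin-wrapper usage (backend-shim style, where th is torch):
import torch as th

x = th.arange(6, dtype=th.float32).view(2, 3)
assert th.equal(mean(x, dim=0), th.tensor([1.5, 2.5, 3.5]))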
def train(self):
    # torch.autograd.set_detect_anomaly(True)
    """
    Main training loop
    Helpful URL: https://github.com/balakg/posewarp-cvpr2018/blob/master/code/posewarp_gan_train.py
    """
    for epoch in range(self.num_epochs):
        num_batches = len(self.train_dataset_loader)

        # Initialize running averages
        disc_losses = AverageMeter()
        train_disc_accuracies = AverageMeter()
        tot_losses = AverageMeter()
        train_accuracies = AverageMeter()

        for batch_id, batch_data in enumerate(self.train_dataset_loader):
            self.gan.train()  # Set the model to train mode
            self.vgg_loss_network.eval()
            current_step = epoch * num_batches + batch_id

            # Get data from dataset (`async` is a reserved word in Python 3.7+;
            # non_blocking is the current kwarg)
            src_img = batch_data['im'].cuda(non_blocking=True)
            target_img = batch_data['target_im'].cuda(non_blocking=True)
            src_iuv = batch_data['im_iuv'].cuda(non_blocking=True)
            target_iuv = batch_data['target_iuv'].cuda(non_blocking=True)

            # ============
            # Run predictive GAN on source image
            _, classification_src = self.gan(src_img, src_iuv, target_iuv, use_gt=False)
            # Run predictive GAN on target image
            _, classification_tgt = self.gan(target_img, src_iuv, target_iuv, use_gt=True)

            # Create discriminator ground truth: zeros for src, ones for tgt
            disc_gt_src = torch.zeros(classification_src.shape[0], 1, dtype=torch.float32).cuda()
            disc_gt_tgt = torch.ones(classification_src.shape[0], 1, dtype=torch.float32).cuda()
            disc_gt = torch.cat((disc_gt_src, disc_gt_tgt), dim=0)
            classification_all = torch.cat((classification_src, classification_tgt), dim=0)

            # Train discriminator network
            disc_loss = self._optimizeDiscriminator(classification_all, disc_gt)
            disc_losses.update(disc_loss.item(), disc_gt.shape[0])
            # sigmoid, not softmax: softmax over a single logit is constant 1
            disc_acc = 100.0 * torch.mean(
                (torch.round(torch.sigmoid(classification_all)) == disc_gt).float())
            train_disc_accuracies.update(disc_acc.item(), disc_gt.shape[0])
            print("Epoch: {}, Batch {}/{} has Discriminator loss {}, and acc {}".format(
                epoch, batch_id, num_batches, disc_losses.avg, train_disc_accuracies.avg))

            # Train the discriminator alone for the first several iterations
            if current_step < self.start_disc_iters:
                print("Discriminator training only: {}/{}\n".format(current_step, self.start_disc_iters))
                continue

            # ============
            # Optimize the GAN; note that we now use disc_gt_tgt, which is all 1's
            generated_img, classification_src = self.gan(src_img, src_iuv, target_iuv, use_gt=False)
            tot_loss = self._optimizeGAN(generated_img, target_img, classification_src, disc_gt_tgt)
            tot_losses.update(tot_loss.item(), disc_gt_tgt.shape[0])
            acc = 100.0 * torch.mean(
                (torch.round(torch.sigmoid(classification_src)) == disc_gt_tgt).float())
            train_accuracies.update(acc.item(), disc_gt_tgt.shape[0])

            # Not adjusting learning rate currently
            # if epoch % 100 == 99:
            #     self._adjust_learning_rate(epoch)
            # Not clipping weights
            # self._clip_weights()

            if current_step % self.log_freq == 0:
                print("Epoch: {}, Batch {}/{} has loss {}, and acc {}".format(
                    epoch, batch_id, num_batches, tot_losses.avg, train_accuracies.avg))
                self.txwriter.add_scalar('train/discriminator_loss', disc_losses.avg, current_step)
                self.txwriter.add_scalar('train/total_loss', tot_losses.avg, current_step)
                # train_accuracies tracks the generator's fooling rate, so it is
                # logged under its own tag rather than 'train/discriminator_acc'
                self.txwriter.add_scalar('train/generator_acc', train_accuracies.avg, current_step)

            # Visualize some images
            if current_step % self.display_freq == 0:
                name1 = '{0}_{1}_{2}'.format(epoch, current_step, "image1")
                name2 = '{0}_{1}_{2}'.format(epoch, current_step, "image2")
                name3 = '{0}_{1}_{2}'.format(epoch, current_step, "gan_image")
                im1 = denormalizeImage(src_img[0, :, :, :].cpu().numpy())
                im2 = denormalizeImage(target_img[0, :, :, :].cpu().numpy())
                im3 = denormalizeImage(generated_img[0, :, :, :].detach().cpu().numpy())
                self.txwriter.add_image("Image1/" + name1, im1)
                self.txwriter.add_image("Image2/" + name2, im2)
                self.txwriter.add_image("GAN/" + name3, im3)

            # TODO: test accuracies
            # if current_step % self.test_freq == 0:
            #     self._model.eval()
            #     val_accuracy = self.validate()
            #     print("Epoch: {} has val accuracy {}".format(epoch, val_accuracy))
            #     self.txwriter.add_scalar('test/acc', val_accuracy, current_step)

            # Save model periodically
            if (current_step % self.save_freq == 0) and current_step > 0:
                save_name = 'model_checkpoint.pth'
                torch.save(self.gan.state_dict(), save_name)
                print('Saved model to {}'.format(save_name))
def learn(self):
    self.learn_iter += 1

    batch_data = self.memory.sample(self.batch_size)
    s0, a0, r1, s1 = zip(*batch_data)
    s0 = torch.tensor(s0, dtype=torch.float)
    a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size, len(self.action_bounds))
    r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
    s1 = torch.tensor(s1, dtype=torch.float)

    # Input (s, a), output q
    q_s0_a0_1 = self.eval_critic_net1(s0, a0)
    q_s0_a0_2 = self.eval_critic_net2(s0, a0)

    # Select the next action a_ with the target policy and add clipped noise
    # (note: as in the original code, the noise std is 2x the clip value)
    noise = (torch.randn_like(a0) * self.policy_noise_clip * 2).clamp(
        -self.policy_noise_clip, self.policy_noise_clip)
    a1 = self.target_actor_net(s1).detach() + noise
    action_bound = self.action_bounds.expand_as(a1)
    a1 = torch.max(torch.min(a1, action_bound), -action_bound)

    # Clipped double-Q: take the min of the two target critics
    q_s1_a1_1 = self.target_critic_net1(s1, a1).detach()
    q_s1_a1_2 = self.target_critic_net2(s1, a1).detach()
    q_s1_a1 = torch.min(q_s1_a1_1, q_s1_a1_2)
    q_target = r1 + self.gamma * q_s1_a1

    # Critic learning step:
    # loss = (Q(st, at) - (rt + gamma * Q'(st+1, u'(st+1))))**2
    # pulls the eval critics' Q toward Q_target so the value estimate improves
    loss_critic = nn.MSELoss()(q_s0_a0_1, q_target) + nn.MSELoss()(q_s0_a0_2, q_target)
    self.optimizer_critic1.zero_grad()
    self.optimizer_critic2.zero_grad()
    loss_critic.backward()
    self.optimizer_critic1.step()
    self.optimizer_critic2.step()

    loss_actor = 0
    # Actor learning step, with delayed policy updates
    # https://zhuanlan.zhihu.com/p/84321382
    if self.learn_iter % self.policy_delay == 0:
        actor_a = self.eval_actor_net(s0)
        critic_q = self.eval_critic_net1(s0, actor_a)
        # loss = -Q = -ce(s, ae(s)): push the actor toward actions the critic rates highly
        loss_actor = -torch.mean(critic_q)
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        # Update the frozen target models (Polyak averaging)
        for param, target_param in zip(self.eval_critic_net1.parameters(),
                                       self.target_critic_net1.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.eval_critic_net2.parameters(),
                                       self.target_critic_net2.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.eval_actor_net.parameters(),
                                       self.target_actor_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    return loss_critic, loss_actor
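# The three Polyak-averaging loops at the end of learn() are identical up to
# the network pair; a hedged refactor into a single helper (illustrative only,
# not part of the original code):
def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for param, target_param in zip(source_net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)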
def kl_loss(code):
    # L2 penalty on the latent code, used here as a lightweight KL-style
    # regularizer that keeps the code close to zero
    return torch.mean(torch.pow(code, 2))
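# Illustrative usage of kl_loss (names here are hypothetical): the penalty is
# typically added to a reconstruction term with a small weight.
# code = encoder(x)                        # latent codes, shape [B, D]
# loss = recon_loss + 1e-3 * kl_loss(code)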
w_comp_p = int(w_p / 16) + 1
h_mask_p.append(h_comp_p)
w_mask_p.append(w_comp_p)

x = x.cuda()
y = y.cuda()
xp = xp.cuda()

# output_highfeature holds the CNN feature maps
output_highfeature = encoder(x)
output_highfeature_p = encoder(xp)

# Scale each sample's initial decoder hidden state by the mean activation of
# its feature map, then squash with tanh
x_mean = []
xp_mean = []
for i, j in zip(output_highfeature, output_highfeature_p):
    x_mean.append(float(torch.mean(i)))
    xp_mean.append(float(torch.mean(j)))

for i in range(batch_size):
    decoder_hidden_init[i] = torch.tanh(decoder_hidden_init[i] * x_mean[i])
    decoder_hidden_init_p[i] = torch.tanh(decoder_hidden_init_p[i] * xp_mean[i])

# dense_input is the feature-map height; output_area is the width
output_area1 = output_highfeature.size()
output_area1_p = output_highfeature_p.size()
output_area = output_area1[3]
def top1gating(logits: Tensor,
               capacity_factor: float,
               min_capacity: int,
               used_token: Tensor = None,
               noisy_gate_policy: Optional[str] = None,
               drop_tokens: bool = True,
               use_rts: bool = True,
               use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Implements Top1Gating on logits."""
    if noisy_gate_policy == 'RSample':
        logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device)
    # everything is in fp32 in this function
    gates = F.softmax(logits, dim=1)

    capacity = _capacity(gates, torch.tensor(capacity_factor), torch.tensor(min_capacity))

    # Create a mask for the 1st expert per token (noisy gating)
    indices1_s = torch.argmax(logits_w_noise if noisy_gate_policy == 'RSample' else gates, dim=1)
    num_experts = int(gates.shape[1])
    mask1 = F.one_hot(indices1_s, num_classes=num_experts)

    # mask only used tokens
    if used_token is not None:
        mask1 = einsum("s,se->se", used_token, mask1)

    # gating decisions
    exp_counts = torch.sum(mask1, dim=0).detach().to('cpu')

    # if we don't want to drop any tokens
    if not drop_tokens:
        new_capacity = torch.max(exp_counts).to(logits.device)
        dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.group.WORLD)
        capacity = new_capacity

    # Compute l_aux (load-balancing loss)
    me = torch.mean(gates, dim=0)
    ce = torch.mean(mask1.float(), dim=0)
    l_aux = torch.sum(me * ce) * num_experts

    # Random Token Selection
    if use_rts:
        uniform = exp_selection_uniform_map.get(logits.device)
        if uniform is None:
            uniform = torch.distributions.uniform.Uniform(
                low=torch.tensor(0.0, device=logits.device),
                high=torch.tensor(1.0, device=logits.device)).rsample
            exp_selection_uniform_map[logits.device] = uniform
        mask1_rand = mask1 * uniform(mask1.shape)
    else:
        mask1_rand = mask1

    assert logits.shape[0] >= min_capacity, (
        "No. of tokens (batch-size) should be greater than min_capacity. "
        "Either set min_capacity to 0 or increase your batch size.")

    top_idx = _top_idx(mask1_rand, capacity)

    new_mask1 = mask1 * torch.zeros_like(mask1).scatter_(0, top_idx, 1)
    mask1 = new_mask1

    if use_tutel:
        # Tutel doesn't support index values masked with zero,
        # so we need to replace masked indices with -1
        indices_mask = mask1.sum(dim=1) * num_experts - 1
        indices1_s = torch.min(indices1_s, indices_mask)

    # Compute locations in capacity buffer
    if use_tutel:
        locations1 = tutel_moe.fast_cumsum_sub_one(mask1)
    else:
        locations1 = torch.cumsum(mask1, dim=0) - 1

    if use_tutel:
        gates1_s = (gates * mask1).sum(dim=1)
        locations1_s = torch.sum(locations1 * mask1, dim=1)
        return l_aux, capacity, num_experts, [indices1_s], [locations1_s], [gates1_s], exp_counts

    # Store the capacity location for each token
    locations1_s = torch.sum(locations1 * mask1, dim=1)

    # Normalize gate probabilities
    mask1_float = mask1.float()
    gates = gates * mask1_float

    locations1_sc = _one_hot_to_float(locations1_s, capacity)
    combine_weights = einsum("se,sc->sec", gates, locations1_sc)
    dispatch_mask = combine_weights.bool()

    return l_aux, combine_weights, dispatch_mask, exp_counts
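# A small sanity check of the l_aux term above (assumes torch and
# torch.nn.functional as F are already imported). With uniform logits over 4
# experts, me = [0.25]*4 and argmax routes every token to expert 0, so
# l_aux = num_experts * sum(me * ce) = 4 * 0.25 * 1.0 = 1.0:
logits = torch.zeros(8, 4)                  # 8 tokens, 4 experts
gates = F.softmax(logits, dim=1)
mask1 = F.one_hot(torch.argmax(gates, dim=1), num_classes=4)
me = torch.mean(gates, dim=0)               # mean gate prob per expert
ce = torch.mean(mask1.float(), dim=0)       # fraction of tokens per expert
l_aux = torch.sum(me * ce) * 4              # tensor(1.0)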
def __train_pass__(self, epoch, is_training=True):
    st_epoch = time.time()
    if is_training:
        self.G.train()
        status = 'TRAIN'
    else:
        self.G.eval()
        status = 'EVAL'

    g_time = 0.0
    for i, batch in enumerate(self.dataloader):
        if i >= len(self.dataloader) - 2:
            break
        st_batch = time.time()

        if self.opt_parser.comb_fan_awing:
            image_in, image_out, fan_pred_landmarks = batch
            fan_pred_landmarks = fan_pred_landmarks.reshape(-1, 68, 3).detach().cpu().numpy()
        elif self.opt_parser.add_audio_in:
            image_in, image_out, audio_in = batch
            audio_in = audio_in.reshape(-1, 1, 256, 256).to(device)
        else:
            image_in, image_out = batch

        with torch.no_grad():
            # online landmark (AwingNet)
            image_in = image_in.reshape(-1, 3, 256, 256).to(device)
            image_out = image_out.reshape(-1, 3, 256, 256).to(device)
            inputs = image_out
            outputs, boundary_channels = self.fa_model(inputs)
            pred_heatmap = outputs[-1][:, :-1, :, :].detach().cpu()
            pred_landmarks, _ = get_preds_fromhm(pred_heatmap)
            pred_landmarks = pred_landmarks.numpy() * 4

            # online landmark (FAN) -> replace jaw + eyebrow from AwingNet
            if self.opt_parser.comb_fan_awing:
                fl_jaw_eyebrow = fan_pred_landmarks[:, 0:27, 0:2]
                fl_rest = pred_landmarks[:, 51:, :]
                # np.int is removed in recent NumPy; plain int is equivalent
                pred_landmarks = np.concatenate([fl_jaw_eyebrow, fl_rest], axis=1).astype(int)

            # draw landmarks on a white background
            img_fls = []
            for pred_fl in pred_landmarks:
                img_fl = np.ones(shape=(256, 256, 3)) * 255.0
                if self.opt_parser.comb_fan_awing:
                    img_fl = vis_landmark_on_img74(img_fl, pred_fl)  # 74x2
                else:
                    img_fl = vis_landmark_on_img98(img_fl, pred_fl)  # 98x2
                img_fls.append(img_fl.transpose((2, 0, 1)))
            img_fls = np.stack(img_fls, axis=0).astype(np.float32) / 255.0
            image_fls_in = torch.tensor(img_fls, requires_grad=False).to(device)

            if self.opt_parser.add_audio_in:
                image_in = torch.cat([image_fls_in, image_in, audio_in], dim=1)
            else:
                image_in = torch.cat([image_fls_in, image_in], dim=1)

        # image2image net forward pass
        g_out = self.G(image_in)
        g_out = torch.tanh(g_out)
        loss_l1 = self.criterionL1(g_out, image_out)
        loss_vgg, loss_style = self.criterionVGG(g_out, image_out, style=True)
        loss_vgg, loss_style = torch.mean(loss_vgg), torch.mean(loss_style)
        loss = loss_l1 + loss_vgg + loss_style

        if is_training:
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # log
        if self.opt_parser.write:
            self.writer.add_scalar('loss', loss.cpu().detach().numpy(), self.count)
            self.writer.add_scalar('loss_l1', loss_l1.cpu().detach().numpy(), self.count)
            self.writer.add_scalar('loss_vgg', loss_vgg.cpu().detach().numpy(), self.count)
            self.count += 1

        # save image to track training progress
        if i % self.opt_parser.jpg_freq == 0:
            vis_in = np.concatenate([
                image_in[0, 3:6].cpu().detach().numpy().transpose((1, 2, 0)),
                image_in[0, 0:3].cpu().detach().numpy().transpose((1, 2, 0))
            ], axis=1)
            vis_out = np.concatenate([
                image_out[0].cpu().detach().numpy().transpose((1, 2, 0)),
                g_out[0].cpu().detach().numpy().transpose((1, 2, 0))
            ], axis=1)
            vis = np.concatenate([vis_in, vis_out], axis=0)
            os.makedirs(os.path.join(self.opt_parser.jpg_dir, self.opt_parser.name), exist_ok=True)
            cv2.imwrite(
                os.path.join(self.opt_parser.jpg_dir, self.opt_parser.name,
                             'e{:03d}_b{:04d}.jpg'.format(epoch, i)),
                vis * 255.0)

        # save ckpt
        if i % self.opt_parser.ckpt_last_freq == 0:
            self.__save_model__('last', epoch)

        print("Epoch {}, Batch {}/{}, loss {:.4f}, l1 {:.4f}, vggloss {:.4f}, styleloss {:.4f}, time {:.4f}"
              .format(epoch, i, len(self.dataset) // self.opt_parser.batch_size,
                      loss.cpu().detach().numpy(), loss_l1.cpu().detach().numpy(),
                      loss_vgg.cpu().detach().numpy(), loss_style.cpu().detach().numpy(),
                      time.time() - st_batch))

        g_time += time.time() - st_batch
        if self.opt_parser.test_speed and i >= 100:
            break

    print('Epoch time usage:', time.time() - st_epoch,
          'I/O time usage:', time.time() - st_epoch - g_time,
          '\n=========================')
    if self.opt_parser.test_speed:
        exit(0)
    if epoch % self.opt_parser.ckpt_epoch_freq == 0:
        self.__save_model__('{:02d}'.format(epoch), epoch)
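# Channel layout fed to self.G above, derived from the torch.cat calls:
#   default:         image_in = cat([img_fls (3ch), image (3ch)])              -> 6 channels
#   add_audio_in:    image_in = cat([img_fls (3ch), image (3ch), audio (1ch)]) -> 7 channels
# so the first conv layer of self.G must accept 6 or 7 input channels accordingly.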
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (torch.bool replaces the deprecated torch.uint8 mask dtype)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action).view(BATCH_SIZE, 1)
    reward_batch = torch.cat(batch.reward)

    # Create state-action (s, a) tensor for input into the critic network
    state_action = torch.cat([state_batch, action_batch], -1)

    # Compute Q(s, a) using the critic network
    state_action_values = critic_nn(state_action)

    # Compute deterministic next-state action using the target actor network
    next_action = target_actor_nn(non_final_next_states).detach()

    # Next-timestep state-action tensor for non-final next states
    next_state_action = torch.zeros(BATCH_SIZE, 4, device=device)
    next_state_action[non_final_mask, :] = torch.cat([non_final_next_states, next_action], -1)

    # Next state values at t+1 from the target critic network
    next_state_values = target_critic_nn(next_state_action).detach()

    # Expected state-action values: y[i] = r[i] + GAMMA * Q'(s[i+1], a[i+1])
    expected_state_action_values = reward_batch.view(BATCH_SIZE, 1) + GAMMA * next_state_values

    # Critic loss (mean squared error)
    loss_critic = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the critic network, clamping gradients for stability
    optimizer_critic.zero_grad()
    loss_critic.backward()
    for param in critic_nn.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer_critic.step()

    # Optimize the actor: maximize Q(s, actor(s)) by minimizing its negation
    state_actor_action = actor_nn(state_batch)
    state_actor_action_values = torch.cat([state_batch, state_actor_action], -1)
    loss_actor = -1 * torch.mean(critic_nn(state_actor_action_values))
    optimizer_actor.zero_grad()
    loss_actor.backward()
    for param in actor_nn.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer_actor.step()

    # Soft parameter update of the target networks
    update_targets(target_critic_nn, critic_nn)
    update_targets(target_actor_nn, actor_nn)
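# optimize_model() assumes a Transition record and a replay memory defined
# elsewhere; a standard sketch consistent with the field names used above:
from collections import namedtuple
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))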
def forward(self, true, pred):
    # Log-cosh loss; cosh >= 1, so the log is safe, and the small epsilon only
    # guards against numerical edge cases
    loss = true - pred
    return torch.mean(torch.log(torch.cosh(loss + 1e-12)))
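# Log-cosh behaves like 0.5 * x**2 for small errors and like |x| - log(2) for
# large ones. An illustrative check (LogCosh is a hypothetical module wrapping
# the forward() above):
# crit = LogCosh()
# crit(torch.tensor([0.0]), torch.tensor([0.1]))   # ~0.005
# crit(torch.tensor([0.0]), torch.tensor([10.0]))  # ~10 - 0.693 = 9.307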
def forward(self, x):
    # Layer normalization over the channel dimension with learned scale/shift
    mean = torch.mean(x, 1, keepdim=True)
    variance = torch.mean((x - mean)**2, 1, keepdim=True)
    x = (x - mean) * torch.rsqrt(variance + self.eps)
    x = x * self.gamma + self.beta
    return x
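# forward() above assumes self.gamma, self.beta, and self.eps set in __init__;
# an illustrative constructor consistent with that usage (the real parameter
# shapes depend on the input layout and may differ):
# def __init__(self, num_features, eps=1e-6):
#     super().__init__()
#     self.gamma = nn.Parameter(torch.ones(1, num_features, 1, 1))
#     self.beta = nn.Parameter(torch.zeros(1, num_features, 1, 1))
#     self.eps = eps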
def forward(self, true, pred):
    # Mean absolute error (L1) loss
    return torch.mean(torch.abs(true - pred))
def forward(self, map_pred, map_gtd):
    # map_pred: predicted saliency map; map_gtd: ground-truth fixation/density map
    map_pred = map_pred.float()
    map_gtd = map_gtd.float()
    map_pred = map_pred.view(1, -1)  # flatten into a 1 x N tensor
    map_gtd = map_gtd.view(1, -1)    # flatten into a 1 x N tensor

    # min-max normalization to keep the loss away from NaN
    min1 = torch.min(map_pred)
    max1 = torch.max(map_pred)
    map_pred = (map_pred - min1) / (max1 - min1 + self.epsilon)
    min2 = torch.min(map_gtd)
    max2 = torch.max(map_gtd)
    map_gtd = (map_gtd - min2) / (max2 - min2 + self.epsilon)

    # binarize the ground truth at 0.5 (values exactly equal to 0.5 go to 0)
    map_gtd_id_1 = torch.gt(map_gtd, 0.5)
    map_gtd_id_0 = torch.lt(map_gtd, 0.5)
    map_gtd_id_00 = torch.eq(map_gtd, 0.5)
    map_gtd[map_gtd_id_1] = 1.0
    map_gtd[map_gtd_id_0] = 0.0
    map_gtd[map_gtd_id_00] = 0.0

    # standardize the prediction (zero mean, unit std)
    map_pred_mean = torch.mean(map_pred).item()
    map_pred_std = torch.std(map_pred).item()
    map_pred = (map_pred - map_pred_mean) / (map_pred_std + self.epsilon)

    NSS = map_pred * map_gtd
    # keep only clearly positive responses; the float format leaves many
    # near-zero 0.00xxx values that would otherwise dilute the mean
    id = torch.gt(NSS, 0.1)
    bignss = NSS[id]
    if len(bignss) == 0:
        # NSS[id] is empty: relax the threshold (must stay a tensor comparison,
        # not an integer literal)
        id = torch.gt(NSS, -0.00000001)
        bignss = NSS[id]
    NSS = torch.mean(bignss)
    NSS = -NSS  # negate: a larger NSS is better, so we minimize -NSS
    return NSS
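# Illustrative usage (the module name is hypothetical): both maps are expected
# to have the same spatial size, and a more negative return value means a
# higher (better) NSS score.
# nss_loss = NSSLoss()
# loss = nss_loss(pred_saliency_map, gt_fixation_map)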
def forward(self, input):
    # Pixelwise feature normalization: divide each spatial location by its
    # RMS across channels
    return input / torch.sqrt(torch.mean(input**2, dim=1, keepdim=True) + 1e-8)
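# The normalization above is the PixelNorm used in progressive-GAN-style
# generators. A quick check that the per-pixel RMS across channels comes out
# ~1 (the module name PixelNorm is an assumption):
x = torch.randn(2, 512, 4, 4)
y = PixelNorm()(x)
rms = torch.sqrt(torch.mean(y**2, dim=1))   # ~1.0 everywhere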