def _gaussian_kl_divergence(self, p, q):
    """Return the summed per-dimension Gaussian KL term between ``p`` and ``q``.

    Only row 0 of each argument is read. Its first ``Z_DIM`` entries are used
    as the mean and the remaining entries as the log-scale parameter. The
    spread of each distribution is computed as ``sqrt(exp(log_scale))`` and
    inserted into
    ``0.5 * (log(sq / sp) + (sp + (mp - mq)**2) / sq - 1)``.

    NOTE(review): the spread is derived with a square root but then used in
    the positions where a variance normally appears -- confirm this matches
    the parameterisation the network actually emits.
    """
    p_row, q_row = p[0], q[0]
    mean_p, mean_q = p_row[:Z_DIM], q_row[:Z_DIM]
    spread_p = T.sqrt(T.exp(p_row[Z_DIM:]))
    spread_q = T.sqrt(T.exp(q_row[Z_DIM:]))
    delta = mean_p - mean_q
    per_dim = (T.log(spread_q / spread_p)
               + (spread_p + delta * delta) / spread_q
               - 1) * 0.5
    return T.sum(per_dim)
def rmsprop(opfunc, x, config, state=None):
    """Perform one RMSprop update of ``x`` in place.

    Args:
        opfunc: callable taking the current point ``x`` and returning the
            pair ``(f(x), df/dx)``.
        x: tensor holding the current point; it is modified in place.
        config: dict of hyper-parameters (may be ``None`` when ``state`` is
            given, in which case the defaults below are used):
            ``'learningRate'`` (default 1e-2), ``'alpha'`` smoothing constant
            (default 0.99), ``'epsilon'`` added to the root-mean-square
            (default 1e-8), ``'weightDecay'`` (default 0).
        state: dict that persists between calls. ``state['m']`` holds the
            leaky mean of squared gradients and ``state['tmp']`` a scratch
            buffer for its square root. When ``None``, ``config`` doubles as
            the state dict.

    Returns:
        ``(x, fx)`` -- the updated ``x`` and the function value evaluated
        *before* the update.

    Raises:
        ValueError: if both ``config`` and ``state`` are ``None``.
    """
    # (0) get/update state
    if config is None and state is None:
        raise ValueError("rmsprop requires a dictionary to retain state between iterations")
    state = state if state is not None else config
    # A caller may pass only `state`; fall back to the default hyper-parameters
    # instead of failing on `None.get`.
    config = config if config is not None else {}
    lr = config.get('learningRate', 1e-2)
    alpha = config.get('alpha', 0.99)
    epsilon = config.get('epsilon', 1e-8)
    wd = config.get('weightDecay', 0)

    # (1) evaluate f(x) and df/dx
    fx, dfdx = opfunc(x)

    # (2) weight decay (keyword form; the positional-scalar overload is gone
    # from current PyTorch)
    if wd != 0:
        dfdx.add_(x, alpha=wd)

    # (3) initialize mean square values and square gradient storage
    if 'm' not in state:
        state['m'] = torch.zeros_like(dfdx)
        state['tmp'] = torch.empty_like(dfdx)

    # (4) calculate new (leaky) mean squared values
    state['m'].mul_(alpha)
    state['m'].addcmul_(dfdx, dfdx, value=1.0 - alpha)

    # (5) perform update
    torch.sqrt(state['m'], out=state['tmp']).add_(epsilon)
    x.addcdiv_(dfdx, state['tmp'], value=-lr)

    # return x*, f(x) before optimization
    return x, fx
def triplet_loss(self, z_p, z_n, z_d, margin=0.1, l2=0):
    """Compute a margin-based triplet loss over rows of three embeddings.

    For every row, ``l_n`` is the Euclidean distance between ``z_p`` and
    ``z_n`` and ``l_d`` is the *negated* Euclidean distance between ``z_p``
    and ``z_d``. The loss is ``mean(relu(l_n + l_d + margin))``; when ``l2``
    is non-zero, ``l2`` times the sum of the three embedding norms is added.

    Returns:
        ``(loss, l_n, l_d, l_nd)`` where ``l_n`` and ``l_d`` are batch means
        and ``l_nd`` is the mean of their sum.
    """
    dist_to_n = torch.sqrt(((z_p - z_n) ** 2).sum(dim=1))
    neg_dist_to_d = -torch.sqrt(((z_p - z_d) ** 2).sum(dim=1))

    loss = torch.mean(F.relu(dist_to_n + neg_dist_to_d + margin))

    l_n = torch.mean(dist_to_n)
    l_d = torch.mean(neg_dist_to_d)
    # Both operands are already scalars here; the outer mean is a no-op that
    # keeps the reported value identical to the historical one.
    l_nd = torch.mean(l_n + l_d)

    if l2 != 0:
        loss += l2 * (torch.norm(z_p) + torch.norm(z_n) + torch.norm(z_d))
    return loss, l_n, l_d, l_nd
def ldmk_loss(input, target, weight=None, size_average=True):
    """Landmark error normalised by the distance between landmarks 36 and 45.

    ``input`` and ``target`` are ``(n, c)`` tensors of interleaved
    coordinate pairs, so landmark ``i`` occupies columns ``2*i`` and
    ``2*i + 1``. For each landmark the Euclidean error is divided by the
    per-sample distance between target landmarks 36 and 45 (plus ``1e-6`` to
    avoid division by zero).

    The result lives on the same device as ``input``; no GPU is required.

    Args:
        input: predicted coordinates, shape ``(n, c)`` with ``c >= 92``.
        target: ground-truth coordinates, same shape as ``input``.
        weight: unused; kept for interface compatibility.
        size_average: when true return the scalar mean, otherwise the
            ``(n, c // 2)`` matrix of normalised per-landmark errors.

    Returns:
        A float32 tensor: scalar when ``size_average`` else ``(n, c // 2)``.
    """
    n, c = input.size()
    num_points = c // 2
    sq_err = (input - target) ** 2
    # Normalising distance taken from the target landmarks 36 and 45.
    iod = torch.sqrt(torch.sum(
        (target[:, 36 * 2:37 * 2] - target[:, 45 * 2:46 * 2]) ** 2, 1))
    # Sum the two squared coordinate errors of every landmark in one shot
    # (a trailing unpaired column, if any, is ignored).
    per_point_sq = sq_err[:, :2 * num_points].reshape(n, num_points, 2).sum(dim=2)
    loss = (torch.sqrt(per_point_sq) / (iod + 1e-6).unsqueeze(1)).float()
    if size_average:
        loss = torch.mean(loss)
    return loss
def save_conv_shrink_bn(fp, conv_model, bn_model, eps=1e-5):
    """Fold a batch-norm layer into its convolution and write both to ``fp``.

    The folded bias ``bn.bias - running_mean * bn.weight / sqrt(running_var
    + eps)`` is written first, followed by the convolution weights scaled
    per output channel by ``bn.weight / sqrt(running_var + eps)``. Tensors
    are dumped as raw bytes with ``ndarray.tofile``; CUDA tensors go through
    ``convert2cpu`` first.
    """
    on_gpu = bn_model.bias.is_cuda

    def _dump(tensor):
        # Write one tensor to the output file, moving it off the GPU if needed.
        host = convert2cpu(tensor) if on_gpu else tensor
        host.numpy().tofile(fp)

    folded_bias = (bn_model.bias.data
                   - bn_model.running_mean * bn_model.weight.data
                   / torch.sqrt(bn_model.running_var + eps))
    _dump(folded_bias)

    s = conv_model.weight.data.size()
    channel_scale = (bn_model.weight.data / torch.sqrt(bn_model.running_var + eps))
    channel_scale = channel_scale.view(-1, 1, 1, 1).repeat(1, s[1], s[2], s[3])
    _dump(conv_model.weight.data * channel_scale)
def get_negative_expectation(q_samples, measure, average=True):
    """Evaluate the negative-sample term of the chosen divergence measure.

    Args:
        q_samples: tensor of scores for the negative samples.
        measure: one of ``'GAN'``, ``'JSD'``, ``'X2'``, ``'KL'``, ``'RKL'``,
            ``'DV'``, ``'H2'``, ``'W1'``. Any other value is passed to
            ``raise_measure_error``.
        average: return the mean when true, the element-wise tensor otherwise.

    Returns:
        The per-element expectation term, or its mean.
    """
    log_2 = math.log(2.)

    # One lazily evaluated formula per supported measure.
    formulas = {
        'GAN': lambda q: F.softplus(-q) + q,
        'JSD': lambda q: F.softplus(-q) + q - log_2,
        'X2': lambda q: -0.5 * ((torch.sqrt(q ** 2) + 1.) ** 2),
        'KL': lambda q: torch.exp(q),
        'RKL': lambda q: q - 1.,
        'DV': lambda q: log_sum_exp(q, 0) - math.log(q.size(0)),
        'H2': lambda q: torch.exp(q) - 1.,
        'W1': lambda q: q,
    }

    if measure in formulas:
        Eq = formulas[measure](q_samples)
    else:
        raise_measure_error(measure)

    return Eq.mean() if average else Eq
def forward(self, input1):
    """Transform the stored 3-D grid with ``input1`` and return angular coordinates.

    ``self.grid3d`` is replicated once per batch element, multiplied
    element-wise with three slices of the last axis of ``input1``
    (``0:4``, ``4:8`` and ``8:``) and summed over axis 3 to give ``x``, ``y``
    and ``z``. These are converted to two angles, both rescaled by pi, and
    concatenated along axis 3.

    NOTE(review): the three 4-wide slices look like the rows of a 3x4
    transform applied to homogeneous grid points -- confirm against caller.
    NOTE(review): ``torch.cat(..., 3)`` needs 4-D operands, so this appears
    to rely on ``torch.sum(..., 3)`` keeping the reduced axis (legacy
    PyTorch behaviour) -- verify on the PyTorch version in use.
    """
    # Replicate the base grid for every element of the batch.
    self.batchgrid3d = torch.zeros(torch.Size([input1.size(0)]) + self.grid3d.size())
    for i in range(input1.size(0)):
        self.batchgrid3d[i] = self.grid3d
    self.batchgrid3d = Variable(self.batchgrid3d)
    #print(self.batchgrid3d)
    # Weighted sums of the grid with each slice of the input's last axis.
    x = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,0:4]), 3)
    y = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,4:8]), 3)
    z = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,8:]), 3)
    #print(x)
    # Radius; the 1e-5 offset keeps the division below finite at the origin.
    r = torch.sqrt(x**2 + y**2 + z**2) + 1e-5
    #print(r)
    # acos(z/r) lies in [0, pi]; dividing by pi/2 and subtracting 1 maps it to [-1, 1].
    theta = torch.acos(z/r)/(np.pi/2) - 1
    #phi = torch.atan(y/x)
    # atan only covers (-pi/2, pi/2); when x < 0 add +pi (y >= 0) or -pi (y < 0)
    # to recover the full angle. The masks are cast to CPU float tensors.
    phi = torch.atan(y/(x + 1e-5)) + np.pi * x.lt(0).type(torch.FloatTensor) * (y.ge(0).type(torch.FloatTensor) - y.lt(0).type(torch.FloatTensor))
    # Rescale the azimuth by pi.
    phi = phi/np.pi
    output = torch.cat([theta,phi], 3)
    return output
def test_hmc_conjugate_gaussian(fixture, num_samples, warmup_steps, hmc_params, expected_means, expected_precs, mean_tol, std_tol):
    """Check HMC posterior means and standard deviations on a Gaussian chain.

    Runs HMC on ``fixture.model`` and, for every latent ``loc_<i>`` with
    ``i`` in ``1..fixture.chain_len``, compares the empirical marginal mean
    and standard deviation against the values implied by ``expected_means``
    and ``expected_precs`` (std = ``1 / sqrt(precision)``), using RMSE with
    tolerances ``mean_tol`` and ``std_tol``.
    """
    pyro.get_param_store().clear()
    hmc_kernel = HMC(fixture.model, **hmc_params)
    mcmc_run = MCMC(hmc_kernel, num_samples, warmup_steps).run(fixture.data)

    for idx in range(fixture.chain_len):
        param_name = 'loc_' + str(idx + 1)
        marginal = EmpiricalMarginal(mcmc_run, sites=param_name)
        latent_loc = marginal.mean
        latent_std = marginal.variance.sqrt()
        expected_mean = torch.ones(fixture.dim) * expected_means[idx]
        expected_std = 1 / torch.sqrt(torch.ones(fixture.dim) * expected_precs[idx])

        # (actual-label, expected-label, actual, expected, tolerance):
        # first the posterior means, then the posterior spreads.
        comparisons = (
            ('Posterior mean (actual) - {}', 'Posterior mean (expected) - {}',
             latent_loc, expected_mean, mean_tol),
            ('Posterior std (actual) - {}', 'Posterior std (expected) - {}',
             latent_std, expected_std, std_tol),
        )
        for actual_fmt, expected_fmt, actual, expected, tol in comparisons:
            logger.info(actual_fmt.format(param_name))
            logger.info(actual)
            logger.info(expected_fmt.format(param_name))
            logger.info(expected)
            assert_equal(rmse(actual, expected).item(), 0.0, prec=tol)
def pullaway_loss(embeddings): norm = torch.sqrt(torch.sum(embeddings ** 2.0, 1, keepdim=True)) normalized_embeddings = embeddings / norm similarity = torch.matmul(normalized_embeddings, normalized_embeddings.transpose(1, 0)) batch_size = embeddings.size()[0] pt_loss = (torch.sum(similarity) - batch_size) / (batch_size * (batch_size - 1)) return pt_loss
def skewness_score(x, dim=0):
    '''Test whether the sample skewness differs from that of a normal distribution.

    Tensor port of the transformation used by `scipy.stats.skewtest`: the
    sample skewness is mapped to an approximately standard-normal z-score.

    Args:
        x: Sample data; preprocessed by `_x_n_dim` before use.
        dim: Dimension along which to compute the test. Default is 0.

    Returns:
        statistic: The computed z-score for this test.
        p-value: `1 + erf(-|Z| / sqrt(2))`, i.e. the two-sided tail
            probability of `Z` under a standard normal.
    '''
    x, n, dim = _x_n_dim(x, dim)

    # Sample skewness from the (assumed already centred -- TODO confirm in
    # `_x_n_dim`) second and third moments.
    third_moment = (x**3).mean(dim)
    second_moment = (x**2).mean(dim)
    b2 = third_moment / second_moment**1.5

    y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
    beta2 = 3.0 * (n**2 + 27 * n - 70) * (n + 1) * (n + 3) /\
        ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))
    W2 = -1.0 + math.sqrt(2 * (beta2 - 1))
    delta = 1.0 / math.sqrt(0.5 * math.log(W2))
    alpha = math.sqrt(2.0 / (W2 - 1))

    # Exact zeros are replaced by 1 before the transform, as in the original.
    y[y == 0] = 1
    scaled = y / alpha
    Z = delta * torch.log(scaled + torch.sqrt(scaled**2 + 1))
    p_value = 1 + torch.erf(-math.sqrt(0.5) * torch.abs(Z))
    return Z, p_value
def forward(self, input, label):
    """Apply an additive angular margin to the target-class cosine logits.

    The cosine between the L2-normalised ``input`` rows and the
    L2-normalised class weights is computed (optionally sharded over the
    GPUs listed in ``self.device_id``). For the labelled class the logit
    ``cos(theta)`` is replaced by ``cos(theta + m)`` using the precomputed
    ``self.cos_m`` / ``self.sin_m``; all logits are then scaled by
    ``self.s``.

    Args:
        input: feature batch, one row per sample.
        label: integer class index per sample.

    Returns:
        Scaled logits with the margin applied to the labelled class.
    """
    # --------------------------- cos(theta) & phi(theta) ---------------------------
    if self.device_id is None:
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
    else:
        # Model-parallel path: each device scores its own slice of classes,
        # results are gathered on the first device.
        primary = self.device_id[0]
        sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
        parts = []
        for device, shard in zip(self.device_id, sub_weights):
            part = F.linear(F.normalize(input.cuda(device)),
                            F.normalize(shard.cuda(device)))
            parts.append(part.cuda(primary))
        cosine = torch.cat(parts, dim=1)

    # Rounding can push |cosine| marginally above 1; clamp so sqrt never
    # sees a negative number and yields NaN.
    sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
    phi = cosine * self.cos_m - sine * self.sin_m
    if self.easy_margin:
        phi = torch.where(cosine > 0, phi, cosine)
    else:
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)

    # --------------------------- convert label to one-hot ---------------------------
    # Allocate next to `cosine` so the CPU and every GPU configuration work.
    one_hot = torch.zeros(cosine.size(), device=cosine.device)
    one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)

    # Margin logit for the labelled class, plain cosine everywhere else.
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= self.s
    return output
def test():
    """Evaluate the module-level ``network`` on ``test_loader`` and print a summary.

    For every batch the summed loss is accumulated and a prediction is taken
    as the index of the output vector with the largest magnitude along axis
    1. Prints the average loss and the accuracy over the whole dataset.
    """
    network.eval()
    loss_sum = 0
    num_correct = 0

    for data, target in test_loader:
        target_indices = target
        target_one_hot = to_one_hot(target_indices, length=network.digits.num_units)

        data = Variable(data, volatile=True).cuda()
        target = Variable(target_one_hot).cuda()

        output = network(data)

        # sum up batch loss
        loss_sum += network.loss(data, output, target, size_average=False).data[0]

        # Length of every output vector; the longest one is the prediction.
        v_mag = torch.sqrt((output**2).sum(dim=2, keepdim=True))
        pred = v_mag.data.max(1, keepdim=True)[1].cpu()
        num_correct += pred.eq(target_indices.view_as(pred)).sum()

    dataset_size = len(test_loader.dataset)
    loss_sum /= dataset_size

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss_sum, num_correct, dataset_size,
        100. * num_correct / dataset_size))
def create_input(points, sigma2):
    """Build graph operators and inputs from a batch of point sets.

    Args:
        points: tensor of size ``(bs, N, 2)``.
        sigma2: bandwidth dividing the squared distances in the Gaussian
            kernel.

    Returns:
        ``((OP, x, Y), dists)`` where ``OP`` is ``(bs, N, N, 4)`` stacking the
        identity, the kernel matrix ``W = exp(-d^2 / sigma2)``, the diagonal
        matrix of ``W`` row sums over axis 2, and a constant ``1/N`` matrix;
        ``x`` holds the (optionally normalised) points; ``Y`` is a copy of
        ``W``; ``dists`` are the pairwise Euclidean distances.

    Relies on the module-level ``dtype``, ``dim`` and ``normalize``.
    """
    bs, N, _ = points.size()  # points has size bs,N,2

    # Pairwise differences, squared distances and the Gaussian kernel.
    diff = points.unsqueeze(1).expand(bs, N, N, dim) - points.unsqueeze(2).expand(bs, N, N, dim)
    dists2 = (diff * diff).sum(3)
    dists = torch.sqrt(dists2)
    W = torch.exp(-dists2 / sigma2)

    # The four operators: identity, kernel, degree and uniform averaging.
    E = torch.eye(N).type(dtype).unsqueeze(0).expand(bs, N, N)
    D = E * W.sum(2, True).expand(bs, N, N)
    U = (torch.ones(N, N).type(dtype) / N).unsqueeze(0).expand(bs, N, N)

    OP = torch.zeros(bs, N, N, 4).type(dtype)
    for channel, operator in enumerate((E, W, D, U)):
        OP[:, :, :, channel] = operator

    OP = Variable(OP)
    x = Variable(points)
    Y = Variable(W.clone())

    # Normalize inputs
    if normalize:
        mu_ext = (x.sum(1) / N).unsqueeze(1).expand_as(x)
        var_ext = (((x - mu_ext) * (x - mu_ext)).sum(1) / N).unsqueeze(1).expand_as(x)
        x = x - mu_ext
        x = x / (10 * var_ext)

    return (OP, x, Y), dists
def _get_norm(self, gaus): norm_tensor = torch.ones([1, 1, self.npixels[0], self.npixels[1]]) normalization_feats = torch.autograd.Variable(norm_tensor) if self.use_gpu: normalization_feats = normalization_feats.cuda() norm_out = self._compute_gaussian(normalization_feats, gaussian=gaus) return 1 / torch.sqrt(norm_out + 1e-20)
def forward(self, x, y, xidx=None, yidx=None):
    """Evaluate the regularised plan density between ``x`` and ``y``.

    With cost ``K = sqrt(l2_distance(x, y))`` and potentials ``u, v`` from
    ``self._get_uv``, the pairwise slack is ``u_i + v_j - K_ij``. Entropic
    regularisation returns ``exp(slack / alpha)``; any other setting returns
    ``max(slack, 0) / (2 * alpha)``.
    """
    cost = torch.sqrt(l2_distance(x, y))
    u, v = self._get_uv(x, y, xidx, yidx)
    slack = u[:, None] + v[None, :] - cost

    if self.regularization != 'entropy':
        return torch.clamp(slack, min=0) / (2 * self.alpha)
    return torch.exp(slack / self.alpha)
def forward(self, input):
    """Linear layer with freshly sampled factorised noise on weights and bias.

    Two Gaussian vectors (one per input feature, one per output feature) are
    drawn on every call, passed through ``sign(e) * sqrt(|e|)`` and combined
    by an outer product into the weight noise; the output-side vector also
    perturbs the bias.
    """
    # Hack: Force noise vectors to be function of input so they are put into
    # predict_net and not init_net when tracing with ONNX
    in_features = input.size()[1]
    epsilon_input = torch.randn(1, in_features, device=input.device)
    epsilon_output = torch.randn(
        self.out_dimension - in_features + in_features,
        1,
        device=input.device,
    )

    def _signed_sqrt(eps):
        # Keep the sign, take the square root of the magnitude.
        return torch.sign(eps) * torch.sqrt(torch.abs(eps))

    epsilon_in = _signed_sqrt(epsilon_input)
    epsilon_out = _signed_sqrt(epsilon_output)

    # Add noise to bias and weights
    noisy_bias = self.bias + self.sigma_bias * epsilon_out.t()
    noisy_weight = self.weight + self.sigma_weight * torch.mul(epsilon_in, epsilon_out)
    return input.matmul(noisy_weight.t()) + noisy_bias
def _PyramidRoI_Feat(self, feat_maps, rois, im_info):
    '''Pool RoI features from pyramid feature maps.

    Each RoI is assigned a pyramid level ``floor(log2(sqrt(h * w) / 224) + 4)``
    clipped to ``[2, 5]``, then pooled according to ``cfg.POOLING_MODE``
    (``'crop'``, ``'align'`` or ``'pool'``). For ``'align'`` and ``'pool'``
    the RoIs are processed level by level and the pooled features are
    reordered by RoI index before being returned.

    NOTE(review): RoI columns are read as 1/3 for width and 2/4 for height,
    which suggests a (batch_idx, x1, y1, x2, y2) layout -- confirm.
    NOTE(review): any other POOLING_MODE leaves ``roi_pool_feat`` unbound.
    '''
    # do roi pooling based on predicted rois
    img_area = im_info[0][0] * im_info[0][1]  # not used below
    h = rois.data[:, 4] - rois.data[:, 2] + 1
    w = rois.data[:, 3] - rois.data[:, 1] + 1
    # log2 of the RoI scale relative to 224, shifted by 4 and floored.
    roi_level = torch.log(torch.sqrt(h * w) / 224.0) / np.log(2)
    roi_level = torch.floor(roi_level + 4)
    # --------
    # roi_level = torch.log(torch.sqrt(h * w) / 224.0)
    # roi_level = torch.round(roi_level + 4)
    # ------
    # Clip to the available levels 2..5.
    roi_level[roi_level < 2] = 2
    roi_level[roi_level > 5] = 5
    # roi_level.fill_(5)
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        # NOTE: need to add pyramid
        # This branch treats feat_maps as a single tensor (it calls .size()),
        # unlike the other two branches which index it per level.
        grid_xy = _affine_grid_gen(rois, feat_maps.size()[2:], self.grid_size) ##
        # Swap the last-axis channels to obtain (y, x) order.
        grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous()
        roi_pool_feat = self.RCNN_roi_crop(feat_maps, Variable(grid_yx).detach()) ##
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            roi_pool_feat = F.max_pool2d(roi_pool_feat, 2, 2)

    elif cfg.POOLING_MODE == 'align':
        roi_pool_feats = []
        box_to_levels = []
        for i, l in enumerate(range(2, 6)):
            # Skip levels that received no RoI.
            if (roi_level == l).sum() == 0:
                continue
            idx_l = (roi_level == l).nonzero().squeeze()
            box_to_levels.append(idx_l)
            # Ratio of this feature map's size (axis 2) to im_info[0][0].
            scale = feat_maps[i].size(2) / im_info[0][0]
            feat = self.RCNN_roi_align(feat_maps[i], rois[idx_l], scale)
            roi_pool_feats.append(feat)
        roi_pool_feat = torch.cat(roi_pool_feats, 0)
        box_to_level = torch.cat(box_to_levels, 0)
        # Undo the per-level grouping so features line up with the RoI order.
        idx_sorted, order = torch.sort(box_to_level)
        roi_pool_feat = roi_pool_feat[order]

    elif cfg.POOLING_MODE == 'pool':
        roi_pool_feats = []
        box_to_levels = []
        for i, l in enumerate(range(2, 6)):
            # Skip levels that received no RoI.
            if (roi_level == l).sum() == 0:
                continue
            idx_l = (roi_level == l).nonzero().squeeze()
            box_to_levels.append(idx_l)
            # Ratio of this feature map's size (axis 2) to im_info[0][0].
            scale = feat_maps[i].size(2) / im_info[0][0]
            feat = self.RCNN_roi_pool(feat_maps[i], rois[idx_l], scale)
            roi_pool_feats.append(feat)
        roi_pool_feat = torch.cat(roi_pool_feats, 0)
        box_to_level = torch.cat(box_to_levels, 0)
        # Undo the per-level grouping so features line up with the RoI order.
        idx_sorted, order = torch.sort(box_to_level)
        roi_pool_feat = roi_pool_feat[order]

    return roi_pool_feat
def testModulus(self):
    """Check ``sl.Modulus`` against a direct norm over the last axis.

    For both the JIT and non-JIT variants, the first slice of the output
    along axis 3 must match ``sqrt(sum(x * x, 3))`` to 6 decimal places in
    the infinity norm. Requires a CUDA device.
    """
    shape = (100, 10, 4, 2)
    for jit in (True, False):
        modulus = sl.Modulus(jit=jit)
        x = torch.cuda.FloatTensor(*shape).copy_(torch.rand(*shape))
        y = modulus(x)
        reference = torch.squeeze(torch.sqrt(torch.sum(x * x, 3)))
        computed = y.narrow(3, 0, 1)
        self.assertAlmostEqual(linfnorm(reference, computed), 0, places=6)
def __call__(self, states, agent_states):
    """Sample clipped Gaussian actions from the network's mean and variance.

    The network is expected to return ``(mu, var, _)`` for the preprocessed
    states. Actions are drawn from ``Normal(mu, sqrt(var))`` with NumPy and
    clipped to ``[-1, 1]``.

    Returns:
        ``(actions, agent_states)`` -- the agent states are passed through
        unchanged.
    """
    states_v = ptan.agent.float32_preprocessor(states).to(self.device)
    mu_v, var_v, _ = self.net(states_v)

    mean = mu_v.data.cpu().numpy()
    std = torch.sqrt(var_v).data.cpu().numpy()

    sampled = np.clip(np.random.normal(mean, std), -1, 1)
    return sampled, agent_states
def forward(self, tensor: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
    """Normalise ``tensor`` using statistics of the unmasked positions only.

    A single mean and standard deviation are computed over every element
    whose mask entry is non-zero (``mask`` is broadcast over the last axis
    of size ``self.size``). The whole tensor is then shifted and scaled with
    those statistics and the learned ``gamma`` / ``beta``.
    """
    keep = mask.unsqueeze(-1).float()
    count = keep.sum() * self.size

    mean = (tensor * keep).sum() / count
    centered = (tensor - mean) * keep
    variance = (centered * centered).sum() / count
    std = torch.sqrt(variance + self.eps)

    scaled = self.gamma * (tensor - mean)
    return scaled / (std + self.eps) + self.beta
def loss(self, x, y, xidx=None, yidx=None):
    """Return the negated regularised dual objective for ``x`` and ``y``.

    With cost ``K = sqrt(l2_distance(x, y))`` and potentials ``u, v`` from
    ``self._get_uv``, the objective averages ``u_i + v_j + reg_ij`` where
    ``reg`` is ``-alpha * exp((u_i + v_j - K_ij) / alpha)`` for entropic
    regularisation and ``-max(u_i + v_j - K_ij, 0)**2 / (4 * alpha)``
    otherwise. The sign is flipped so the value can be minimised.

    The regulariser type and strength are read from ``self.regularization``
    and ``self.alpha``, matching ``forward``.
    """
    K = torch.sqrt(l2_distance(x, y))
    u, v = self._get_uv(x, y, xidx, yidx)
    alpha = self.alpha
    slack = u[:, None] + v[None, :] - K
    if self.regularization == 'entropy':
        reg = - alpha * torch.exp(slack / alpha)
    else:
        reg = - torch.clamp(slack, min=0) ** 2 / 4 / alpha
    return - torch.mean(u[:, None] + v[None, :] + reg)
def MVNError(output, gt):
    """RMSE between ``output`` and ``gt`` after standardising each one.

    Each tensor is shifted by its own global mean and divided by its own
    global standard deviation (``torch.std``), then the root of the mean
    squared difference is returned.
    """
    def _standardise(values):
        # Zero mean, unit standard deviation over all elements.
        return (values - torch.mean(values)) / torch.std(values)

    residual = _standardise(output) - _standardise(gt)
    return torch.sqrt(torch.mean(residual * residual))
def zero_mean_covariance(covariance, stability=0.0):
    '''Output covariance of ReLU for zero-mean Gaussian input.

    f(x) = max(x, 0).

    Args:
        covariance: Input covariance matrix (Size, Size).
        stability: Margin by which the correlation is kept away from -1 and
            1. Zero gives exact results; a value like 1e-4 trades a little
            accuracy for numerical stability.

    Returns:
        Output covariance of ReLU for zero-mean Gaussian input (Size, Size).
    '''
    std = torch.sqrt(torch.diagonal(covariance, 0, -2, -1))
    S = outer(std)

    # Correlation, clamped into [stability - 1, 1 - stability].
    V = (covariance / S).clamp_(stability - 1.0, 1.0 - stability)
    Q = torch.acos(-V) * V + torch.sqrt(1.0 - (V**2.0)) - 1.0
    cov = S * Q * (1.0 / (2.0 * math.pi))

    # Degenerate case: zero variance produces NaNs; replace them with zeros.
    cov[torch.isnan(cov)] = 0
    return cov
def reconstruction_loss(self, images, input, size_average=True):
    """Reconstruct images from the longest capsule and return the scaled squared error.

    Args:
        images: target images; compared against the reconstruction after
            flattening per sample.
        input: capsule outputs, indexed as (batch, capsule, component) by the
            code below.
        size_average: return the batch mean when true, otherwise one value
            per sample.

    Returns:
        ``0.0005 * sum((reconstruction - images)**2)`` per sample, optionally
        averaged over the batch.

    Side effects: writes ``reconstruction.png`` on every 10th call and
    increments ``self.reconstructed_image_count``. The masking buffers are
    created with ``.cuda()``, so a CUDA device is required.
    """
    # Get the lengths of capsule outputs.
    v_mag = torch.sqrt((input**2).sum(dim=2))

    # Get index of longest capsule output.
    _, v_max_index = v_mag.max(dim=1)
    v_max_index = v_max_index.data

    # Use just the winning capsule's representation (and zeros for other capsules) to reconstruct input image.
    batch_size = input.size(0)
    all_masked = [None] * batch_size
    for batch_idx in range(batch_size):
        # Get one sample from the batch.
        input_batch = input[batch_idx]

        # Copy only the maximum capsule index from this batch sample.
        # This masks out (leaves as zero) the other capsules in this sample.
        batch_masked = Variable(torch.zeros(input_batch.size())).cuda()
        batch_masked[v_max_index[batch_idx]] = input_batch[v_max_index[batch_idx]]
        all_masked[batch_idx] = batch_masked

    # Stack masked capsules over the batch dimension.
    masked = torch.stack(all_masked, dim=0)

    # Reconstruct input image: flatten, then three layers with a sigmoid output.
    masked = masked.view(input.size(0), -1)
    output = self.relu(self.reconstruct0(masked))
    output = self.relu(self.reconstruct1(output))
    output = self.sigmoid(self.reconstruct2(output))
    output = output.view(-1, self.image_channels, self.image_height, self.image_width)

    # Save reconstructed images occasionally (every 10th call).
    if self.reconstructed_image_count % 10 == 0:
        if output.size(1) == 2:
            # handle two-channel images: prepend an all-zero channel so the
            # image has three channels
            zeros = torch.zeros(output.size(0), 1, output.size(2), output.size(3))
            output_image = torch.cat([zeros, output.data.cpu()], dim=1)
        else:
            # assume RGB or grayscale
            output_image = output.data.cpu()
        vutils.save_image(output_image, "reconstruction.png")
    self.reconstructed_image_count += 1

    # The reconstruction loss is the sum squared difference between the input image and reconstructed image.
    # Multiplied by a small number so it doesn't dominate the margin (class) loss.
    error = (output - images).view(output.size(0), -1)
    error = error**2
    error = torch.sum(error, dim=1) * 0.0005

    # Average over batch
    if size_average:
        error = error.mean()

    return error
def __init__(self, concentration):
    """Set up a rejection sampler for a standard Gamma distribution.

    Args:
        concentration: tensor of shape parameters; every entry must be at
            least 1.

    Raises:
        NotImplementedError: if any concentration is below 1.
    """
    if concentration.data.min() < 1:
        raise NotImplementedError('concentration < 1 is not supported')
    self.concentration = concentration

    unit_rate = concentration.new_tensor([1.]).squeeze().expand_as(concentration)
    self._standard_gamma = Gamma(concentration, unit_rate)

    # The following are Marsaglia & Tsang's variable names.
    self._d = self.concentration - 1.0 / 3.0
    self._c = 1.0 / torch.sqrt(9.0 * self._d)

    # Compute log scale using Gamma.log_prob(); the evaluation point is
    # arbitrary, `_d` is simply a convenient valid value.
    probe = self._d.detach()
    log_scale = self.propose_log_prob(probe) + self.log_prob_accept(probe) - self.log_prob(probe)

    super(RejectionStandardGamma, self).__init__(self.propose, self.log_prob_accept, log_scale)
def __call__(self, boxlists):
    """Map every box to a feature-pyramid level index.

    Arguments:
        boxlists (list[BoxList])

    Returns:
        int64 tensor of level indices, offset so that ``self.k_min`` maps to 0.
    """
    # Compute level ids from the square root of each box area.
    areas = cat([boxlist.area() for boxlist in boxlists])
    scales = torch.sqrt(areas)

    # Eqn.(1) in FPN paper
    levels = torch.floor(self.lvl0 + torch.log2(scales / self.s0 + self.eps))
    levels = torch.clamp(levels, min=self.k_min, max=self.k_max)
    return levels.to(torch.int64) - self.k_min
def forward(self, x):
    """Instance-normalise a 4-D input and apply the per-channel affine.

    For every sample and channel, the mean and the *biased* variance over
    the two trailing axes are computed; ``x`` is standardised with them
    (``self.eps`` added under the square root) and then scaled by
    ``self.weight`` and shifted by ``self.bias``.

    Reduced axes are kept explicitly, so the result does not depend on the
    default keep-dim behaviour of the installed PyTorch version, and the
    biased variance is computed directly so a 1x1 spatial map yields 0
    instead of NaN.

    Args:
        x: tensor indexed as (batch, channel, dim2, dim3).

    Returns:
        Tensor of the same shape as ``x``.
    """
    batch, channels = x.size(0), x.size(1)
    n = x.size(2) * x.size(3)
    flat = x.reshape(batch, channels, n)

    mean = flat.mean(2, keepdim=True)
    # Biased variance (divide by n, not n - 1).
    var = ((flat - mean) ** 2).mean(2, keepdim=True)

    mean = mean.view(batch, channels, 1, 1)
    var = var.view(batch, channels, 1, 1)
    scale = self.weight.view(1, -1, 1, 1)
    shift = self.bias.view(1, -1, 1, 1)

    out = (x - mean) / torch.sqrt(var + self.eps)
    return out * scale + shift
def forward(self, input):
    """Apply the linear layer with resampled factorised Gaussian noise.

    Both noise buffers are refilled in place on every call, passed through
    ``sign(e) * sqrt(|e|)`` and combined: their product perturbs the weight
    matrix (scaled by ``self.sigma_weight``) and the output-side vector
    perturbs the bias (scaled by ``self.sigma_bias``) when a bias exists.

    Args:
        input: batch of input rows.

    Returns:
        ``F.linear`` of ``input`` with the noisy weight and bias.
    """
    # Draw fresh noise (the attribute is `epsilon_input`; the previous
    # misspelling raised AttributeError on every call).
    self.epsilon_input.normal_()
    self.epsilon_output.normal_()

    def _signed_sqrt(eps):
        # Keep the sign, take the square root of the magnitude.
        return torch.sign(eps) * torch.sqrt(torch.abs(eps))

    eps_in = _signed_sqrt(self.epsilon_input.data)
    eps_out = _signed_sqrt(self.epsilon_output.data)

    bias = self.bias
    if bias is not None:
        bias = bias + self.sigma_bias * eps_out.t()
    noise_v = torch.mul(eps_in, eps_out)
    return F.linear(input, self.weight + self.sigma_weight * noise_v, bias)
def clip_gradient(model, clip_norm):
    """Rescale all gradients in place so their global L2 norm is at most ``clip_norm``.

    The global norm is the square root of the summed squared norms of every
    parameter gradient. Gradients are multiplied by
    ``clip_norm / max(global_norm, clip_norm)``, i.e. left untouched when the
    norm is already within the limit.

    Args:
        model: module whose parameters' ``.grad`` tensors are scaled.
        clip_norm: maximum allowed global gradient norm.

    Returns:
        None. Does nothing when no parameter has a gradient.
    """
    grads = [p.grad for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    if not grads:
        # Nothing to clip; the previous code crashed here on torch.sqrt(0).
        return

    total_sq = sum(g.data.norm() ** 2 for g in grads)
    totalnorm = torch.sqrt(total_sq).item()

    denom = max(totalnorm, clip_norm)
    if denom == 0:
        # All gradients are zero and the limit is zero: nothing to scale.
        return
    scale = clip_norm / denom
    for g in grads:
        g.mul_(scale)
def gradient_penalty(self, y, x):
    """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2.

    The gradient of ``y`` with respect to ``x`` is taken with an all-ones
    upstream gradient and with the graph retained, so the penalty itself can
    be back-propagated. The per-sample L2 norm is computed over all
    non-batch axes and the squared deviation from 1 is averaged.
    """
    ones = torch.ones(y.size()).to(self.device)
    grads = torch.autograd.grad(
        outputs=y,
        inputs=x,
        grad_outputs=ones,
        retain_graph=True,
        create_graph=True,
        only_inputs=True,
    )
    dydx = grads[0]
    flat = dydx.view(dydx.size(0), -1)
    per_sample_norm = torch.sqrt(torch.sum(flat**2, dim=1))
    return torch.mean((per_sample_norm - 1)**2)
def train(self,data,inform = None,use_cuda=True,TASK = 2,num_epochs = 200,batch_size = 50,k_d=1, k_g = 1,lr = 0.0001):
    """Alternate discriminator and generator updates over mini-batches.

    Args:
        data: training samples, iterated with ``iterate_minibatches``.
        inform: optional side information batched alongside ``data``; when
            present it is concatenated to the noise and added to the
            generated samples before they reach the discriminator.
        use_cuda: only consulted inside the TASK == 4 penalty computation.
        TASK: selects the loss variant forwarded to ``d_loss`` / ``g_loss``
            and the discriminator; 3 additionally applies ``self.clipper``
            after each discriminator step and 4 adds a gradient penalty.
        num_epochs: number of passes over ``data``.
        batch_size: mini-batch size.
        k_d: discriminator updates per mini-batch.
        k_g: generator updates per mini-batch.
        lr: Adam learning rate for both optimizers.

    Prints the mean generator and discriminator losses every 10 epochs.
    A KeyboardInterrupt stops training quietly.

    NOTE(review): noise and data tensors are moved with ``.cuda()``
    unconditionally, so ``use_cuda=False`` does not give a CPU-only run.
    """
    g_optimizer = optim.Adam(self.generator.parameters(),lr=lr)
    d_optimizer = optim.Adam(self.discriminator.parameters(), lr=lr)

    try:
        for epoch in range(num_epochs):
            # Per-epoch loss histories, used only for the progress print.
            ls_g=[]
            ls_d=[]
            for input_data,info in iterate_minibatches(data, batch_size,inform):
                # Optimize D
                for _ in range(k_d):
                    # Sample noise (with the side information appended, if any)
                    if not (info is None):
                        noise = Variable(torch.cat((torch.Tensor(sample_noise(len(input_data))),torch.Tensor(info)),1).cuda())
                    else:
                        noise = Variable(torch.Tensor(sample_noise(len(input_data))).cuda())

                    # Do an update
                    inp_data = Variable(torch.Tensor(input_data).cuda())
                    data_gen = self.generator(noise)

                    if(TASK==4):
                        # Code from here https://github.com/EmilienDupont/wgan-gp
                        # Random per-sample interpolation between real and generated data.
                        alpha = torch.rand(inp_data.size()[0], 1)
                        alpha = alpha.expand_as(inp_data)
                        if use_cuda:
                            alpha = alpha.cuda()
                        interpolated = alpha * inp_data.data + (1 - alpha) * data_gen.data
                        interpolated = Variable(interpolated, requires_grad=True)
                        if use_cuda:
                            interpolated = interpolated.cuda()
                        prob_interpolated = self.discriminator(interpolated,TASK=TASK)
                        # Gradient of the discriminator output w.r.t. the interpolated
                        # points; the graph is kept so the penalty is differentiable.
                        gradients = torch.autograd.grad(outputs=prob_interpolated, inputs=interpolated,
                                                        grad_outputs=torch.ones(prob_interpolated.size()).cuda() if use_cuda else torch.ones(
                                                            prob_interpolated.size()),
                                                        create_graph=True, retain_graph=True)[0]
                        gradients = gradients.view(inp_data.size()[0], -1)
                        # The 1e-12 keeps the square root differentiable at zero.
                        gradients_norm = torch.sqrt(torch.sum(gradients ** 2, dim=1) + 1e-12)
                        # Penalty weight is fixed at 10.
                        penalty = 10 * ((gradients_norm - 1) ** 2).mean()

                        if not (info is None):
                            loss = d_loss(self.discriminator(data_gen + Variable(torch.Tensor(info).cuda(), requires_grad=False),TASK = TASK), self.discriminator(inp_data,TASK = TASK),TASK,penalty)
                        else:
                            loss = d_loss(self.discriminator(data_gen,TASK = TASK), self.discriminator(inp_data,TASK = TASK),TASK,penalty)
                    else:
                        if not (info is None):
                            loss = d_loss(self.discriminator(data_gen + Variable(torch.Tensor(info).cuda(), requires_grad=False),TASK = TASK),self.discriminator(inp_data,TASK = TASK),TASK)
                        else:
                            loss = d_loss(self.discriminator(data_gen,TASK = TASK), self.discriminator(inp_data,TASK = TASK),TASK)

                    ls_d.append(loss.data.cpu().numpy())
                    d_optimizer.zero_grad()
                    loss.backward()
                    d_optimizer.step()
                    # TASK 3 applies the clipper to the discriminator after every step.
                    if TASK == 3:
                        self.discriminator.apply(self.clipper,TASK = TASK)

                # Optimize G
                for _ in range(k_g):
                    # Sample noise (with the side information appended, if any)
                    if not (info is None):
                        noise = Variable(torch.cat((torch.Tensor(sample_noise(len(input_data))),torch.Tensor(info)),1).cuda())
                    else:
                        noise = Variable(torch.Tensor(sample_noise(len(input_data))).cuda())

                    # Do an update
                    data_gen = self.generator(noise)
                    if not (info is None):
                        loss = g_loss(self.discriminator(data_gen + Variable(torch.Tensor(info).cuda(), requires_grad=False),TASK = TASK),TASK)
                    else:
                        loss = g_loss(self.discriminator(data_gen,TASK = TASK),TASK)

                    ls_g.append(loss.data.cpu().numpy())
                    g_optimizer.zero_grad()
                    loss.backward()
                    g_optimizer.step()

            # Progress report every 10 epochs.
            if(epoch%10==0):
                print('generator_loss:',np.mean(ls_g),'discriminator_loss',np.mean(ls_d))

    except KeyboardInterrupt:
        # Allow manual interruption without a traceback.
        pass
elbo_list.append( elbo_evaluate(images, labels, para, dim, scale, revise, num_St).item()) #算法起始位置 z_samples = sampleZ(para, dim, num_S) log_qs = ng_log_Qs(para, z_samples, dim) log_priors = ng_log_Priors(z_samples, dim) log_likelihoods = ng_log_Likelihoods(images, labels, z_samples, dim) for s in range(len(z_samples)): gradients[s] = grad_log_Q(para, z_samples[s], dim)[0] elbo_temp = log_likelihoods * revise + log_priors / scale - log_qs / scale grad_temp = torch.matmul(torch.diag(elbo_temp), gradients) grad_avg = torch.mean(grad_temp, 0) G += torch.matmul(grad_avg.view(dim * 2, -1), grad_avg.view(-1, dim * 2)) rho = eta / torch.sqrt(torch.diag(G)) para.data += rho * grad_avg #print information if 1: print('Epoch[{}/{}], step[{}/{}]'.format(\ epoch+1, num_epochs, i+1,len(train_loader))) print('ELBO: {:.3f}\n'.format(\ elbo_list[len(elbo_list)-1])) if not os.path.exists('./result_elbo'): os.makedirs('./result_elbo') result = np.array(elbo_list) np.save('./result_elbo/bbvi_basic.npy', result)
def arccosh(x):
    """Inverse hyperbolic cosine, computed as ``log(x) + log1p(sqrt(x*x - 1) / x)``.

    This is ``log(x + sqrt(x*x - 1))`` with ``x`` factored out of the
    logarithm. Defined for ``x >= 1``.
    """
    log_x = torch.log(x)
    correction = torch.log1p(torch.sqrt(x * x - 1) / x)
    return log_x + correction
def arcsinh(x):
    """Inverse hyperbolic sine, computed as ``log(x + sqrt(x*x + 1))``."""
    root = torch.sqrt(x * x + 1)
    return torch.log(x + root)
def arcosh(x):
    """Inverse hyperbolic cosine, computed as ``log(x + sqrt(x*x - 1))``.

    Defined for ``x >= 1``.
    """
    root = torch.sqrt(x * x - 1)
    return torch.log(x + root)
def sqrt(val):
    """Return the square root of ``val`` as a float32 tensor.

    ``val`` is first wrapped with ``torch.tensor`` and cast to float, so
    plain Python numbers and nested lists are accepted.
    """
    as_float = torch.tensor(val).float()
    return torch.sqrt(as_float)
def squash(self, x, axis=-1):
    """Scale ``x`` to (approximately) unit length along ``axis``.

    The squared norm along ``axis`` is offset by the module-level
    ``T_epsilon`` before the square root, which keeps the division finite
    for all-zero vectors.
    """
    squared_norm = (x**2).sum(axis, keepdim=True)
    length = t.sqrt(squared_norm + T_epsilon)
    return x / length
def sqrt(input):
    """Return the element-wise square root of ``input`` via ``th.sqrt``."""
    result = th.sqrt(input)
    return result
def step(self,
         # futures=(train_episodes_futures, valid_episodes_futures)
         train_futures,
         valid_futures,
         max_kl=1e-3,
         cg_iters=10,
         cg_damping=1e-2,
         ls_max_steps=10,
         ls_backtrack_ratio=0.5):
    """Perform one KL-constrained policy update with a backtracking line search.

    The surrogate loss and KL are gathered for every task, the step
    direction is obtained with conjugate gradient on the damped
    Hessian-vector product of the mean KL, scaled by the Lagrange
    multiplier derived from ``max_kl``, and then shrunk by
    ``ls_backtrack_ratio`` until the loss improves and the KL stays below
    ``max_kl``. If no step size is accepted within ``ls_max_steps`` the
    original parameters are restored.

    Args:
        train_futures: training-episode futures; ``len(train_futures[0])``
            is taken as the number of tasks.
        valid_futures: validation-episode futures, one per task.
        max_kl: upper bound on the mean KL divergence of the update.
        cg_iters: conjugate-gradient iterations.
        cg_damping: damping passed to the Hessian-vector product.
        ls_max_steps: maximum number of line-search iterations.
        ls_backtrack_ratio: factor applied to the step size after a
            rejected line-search iteration.

    Returns:
        dict with ``'loss_before'`` and ``'kl_before'``, plus
        ``'loss_after'`` and ``'kl_after'`` when a step was accepted.
    """
    num_tasks = len(train_futures[0])
    logs = {}

    # Compute the surrogate loss
    #
    old_losses, old_kls, old_pis = self._async_gather([
        self.surrogate_loss(train, valid, old_pi=None)
        for (train, valid) in zip(zip(*train_futures), valid_futures)])

    logs['loss_before'] = to_numpy(old_losses)
    logs['kl_before'] = to_numpy(old_kls)

    # Average the loss over tasks; the result is a scalar.
    old_loss = sum(old_losses) / num_tasks
    grads = torch.autograd.grad(old_loss,
                                self.policy.parameters(),
                                retain_graph=True)
    grads = parameters_to_vector(grads)

    # Compute the step direction with Conjugate Gradient
    # Average the KL over tasks; the result is a scalar.
    old_kl = sum(old_kls) / num_tasks
    hessian_vector_product = self.hessian_vector_product(old_kl,
                                                         damping=cg_damping)
    stepdir = conjugate_gradient(hessian_vector_product,
                                 grads,
                                 cg_iters=cg_iters)

    # Compute the Lagrange multiplier
    shs = 0.5 * torch.dot(stepdir,
                          hessian_vector_product(stepdir, retain_graph=False))
    lagrange_multiplier = torch.sqrt(shs / max_kl)

    step = stepdir / lagrange_multiplier

    # Save the old parameters
    old_params = parameters_to_vector(self.policy.parameters())

    # The string statement below is a no-op note left by the author; it says
    # that vector_to_parameters(*, self.policy.parameters()) is what actually
    # updates the network parameters.
    """ vector_to_parameter( * , self.policy.parameters()) 就是对网络参数的更新 """
    # Line search
    step_size = 1.0
    for _ in range(ls_max_steps):
        # Try the candidate parameters for the current step size.
        vector_to_parameters(old_params - step_size * step,
                             self.policy.parameters())

        losses, kls, _ = self._async_gather([
            self.surrogate_loss(train, valid, old_pi=old_pi)
            for (train, valid, old_pi)
            in zip(zip(*train_futures), valid_futures, old_pis)])

        improve = (sum(losses) / num_tasks) - old_loss
        kl = sum(kls) / num_tasks
        # Accept only if the loss decreased and the KL constraint holds.
        if (improve.item() < 0.0) and (kl.item() < max_kl):
            logs['loss_after'] = to_numpy(losses)
            logs['kl_after'] = to_numpy(kls)
            break
        step_size *= ls_backtrack_ratio
    else:
        # No acceptable step was found: restore the original parameters.
        vector_to_parameters(old_params, self.policy.parameters())

    # Inspect the final network parameters (debugging aid; value is unused).
    params_final = self.policy.parameters()

    # logs keys: 'loss_before', 'kl_before', 'loss_after', 'kl_after'
    return logs