def pdist(self, fX):
    """Condensed pairwise distances, in the spirit of scipy's ``pdist``.

    The distance is selected by ``self.metric``:

    * ``'euclidean'``: plain L2 distance between rows.
    * ``'cosine'``: ``0.5 * ||u - v||^2`` between L2-normalised rows, which
      equals ``1 - cos(u, v)`` for unit vectors.
    * ``'angular'``: ``acos`` of the cosine similarity recovered from the
      cosine distance above (clamped before ``acos``).

    Parameters
    ----------
    fX : (n, d) torch.Tensor
        Embeddings.

    Returns
    -------
    distances : (n * (n-1) / 2,) torch.Tensor
        Condensed pairwise distance matrix. ``None`` is returned (implicitly
        in spirit, explicitly here) when ``self.metric`` is none of the three
        supported values.
    """
    metric = self.metric

    if metric == 'euclidean':
        return F.pdist(fX)

    if metric not in ('cosine', 'angular'):
        # Unsupported metric: keep the historical "no result" behaviour.
        return None

    unit_rows = F.normalize(fX)
    cosine_distance = 0.5 * torch.pow(F.pdist(unit_rows), 2)
    if metric == 'cosine':
        return cosine_distance

    # 1 - cosine distance is the cosine similarity; clamp before acos.
    similarity = torch.clamp(1. - cosine_distance, -1 + 1e-12, 1 - 1e-12)
    return torch.acos(similarity)
def tla_loss(feats, labels, margin):
    """Triplet-style hinge loss between adversarial and original embeddings.

    ``feats`` is L2-normalised row-wise and split in half along dim 0. The
    first half holds the original embeddings and the second half their
    adversarial counterparts, so row ``i`` of each half forms a pair.

    For every adversarial row ``i`` the loss compares

    * the squared distance to its own original (``distmat[i, i]``), and
    * the smallest squared distance to an original whose label differs,

    and penalises ``max(own - nearest_other + margin, 0)``. Both distances
    are clamped to ``[1e-12, 1e12]`` first. The per-row hinge values are
    averaged.

    Args:
        feats: ``(2 * B, d)`` tensor, originals first, adversarials second.
        labels: 1-D tensor of length ``B`` with the class of each pair.
        margin: hinge margin added before clamping at zero.

    Returns:
        Scalar loss tensor. When the module-level ``args.homo_constrain`` /
        ``args.inhomo_constrain`` flags are set, the mean pairwise L2
        distance among the originals / adversarials is added to it.

    Notes:
        A row with no differently-labelled original in the batch contributes
        zero (its "nearest other" distance is treated as infinitely far)
        instead of raising on an empty ``min``.
    """
    batch_size = feats.size(0) // 2
    x = F.normalize(feats, p=2, dim=1)
    ori, adv = x.chunk(2, dim=0)

    # distmat[i, j] = ||adv_i||^2 + ||ori_j||^2 - 2 <adv_i, ori_j>
    adv_sq = torch.pow(adv, 2).sum(dim=1, keepdim=True).expand(batch_size, batch_size)
    ori_sq = torch.pow(ori, 2).sum(dim=1, keepdim=True).expand(batch_size, batch_size).t()
    # Keyword form: the positional (beta, alpha, mat1, mat2) overload is deprecated.
    distmat = torch.addmm(adv_sq + ori_sq, adv, ori.t(), beta=1, alpha=-2)

    labels = labels.unsqueeze(1).expand(batch_size, batch_size)
    same_label = labels.eq(labels.t())

    congener_dist = distmat.diagonal()
    # Hide same-label columns so the row-wise min only sees other classes.
    nearest_inhomogen_dist = distmat.masked_fill(same_label, float('inf')).min(dim=1).values

    congener_dist = congener_dist.clamp(min=1e-12, max=1e+12)
    nearest_inhomogen_dist = nearest_inhomogen_dist.clamp(min=1e-12, max=1e+12)

    # Device-agnostic hinge; no CUDA-only zero tensor required.
    dist = (congener_dist - nearest_inhomogen_dist + margin).clamp(min=0)
    loss = dist.mean()

    if args.homo_constrain:
        loss = loss + F.pdist(ori, p=2).mean()
    if args.inhomo_constrain:
        loss = loss + F.pdist(adv, p=2).mean()
    return loss
def compute_specialization_metric(encoder_norms, device):
    """Ratio of within-head to overall attention-pattern spread, summed over the batch.

    Each ``seq_len x seq_len`` attention-norm map is flattened and L1-normalised.
    For every batch element the function computes

    * the mean pairwise L1 distance among the patterns of a single head
      (i.e. across layers), averaged over heads, and
    * the mean pairwise L1 distance among all head/layer patterns,

    and takes the ratio of the two.

    Args:
        encoder_norms: attention norms of size
            ``(batch_size, num_layers, num_heads, seq_len, seq_len)``.
        device: device on which the two returned tensors are created.

    Returns:
        ``(ratio_sum, batch_count)``: the sum of the per-sample ratios and the
        number of samples, both as tensors without gradient.
    """
    _, _, _, seq_len, seq_len2 = encoder_norms.size()
    assert seq_len == seq_len2

    # (batch, layers, heads, S, S) -> (batch, heads, layers, S * S)
    patterns = encoder_norms.permute(0, 2, 1, 3, 4).flatten(3).contiguous()
    # Scale every flattened pattern to unit L1 norm.
    patterns = F.normalize(patterns, p=1.0, dim=-1)

    ratios = []
    for sample in patterns:  # sample: (heads, layers, S * S)
        per_head = [torch.mean(F.pdist(head_patterns, p=1)) for head_patterns in sample]
        within_head = torch.stack(per_head).mean()

        every_pattern = sample.flatten(start_dim=0, end_dim=1).contiguous()
        overall = torch.mean(F.pdist(every_pattern, p=1))

        ratios.append(within_head.item() / overall.item())

    ratio_sum = torch.tensor(sum(ratios), device=device, requires_grad=False)
    batch_count = torch.tensor(len(patterns), device=device, requires_grad=False)
    return ratio_sum, batch_count
def reinforce(input, policy_estimator, model, num_episodes=10000, batch_size=10):
    """Train ``policy_estimator`` with a REINFORCE (score-function) update.

    Every episode:

    1. the policy predicts means and standard deviations for ``input``;
    2. a parameter vector is sampled from ``Normal(means[0], sigs[0])``;
    3. the absolute value of the sample is passed to ``get_abm_exp_outputs``
       together with ``batch_size`` and ``model``;
    4. the reward is the negative L2 distance between the first produced
       output and the first row of ``get_abm_actual_outputs()``;
    5. the loss ``-log_prob(sample).sum() * reward`` is minimised with Adam
       (lr 0.0005) over ``policy_estimator.network``'s parameters.

    Args:
        input: tensor fed to ``policy_estimator.predict`` each episode.
        policy_estimator: object exposing ``network`` and ``predict``.
        model: forwarded untouched to ``get_abm_exp_outputs``.
        num_episodes: number of update steps.
        batch_size: forwarded untouched to ``get_abm_exp_outputs``.

    Notes:
        The log-probability is evaluated at the sampled action. Scoring the
        simulator output instead (as an earlier revision did) is not the
        score-function estimator, since the output is not a draw from the
        policy. NOTE(review): confirm no caller relied on the old objective.
    """
    opt = optim.Adam(policy_estimator.network.parameters(), lr=0.0005)

    for i in range(num_episodes):
        # The deprecated ``Variable`` wrapper was a no-op on tensors and is dropped.
        x = input
        y = get_abm_actual_outputs()

        means, sigs = policy_estimator.predict(x)
        dists = torch.distributions.Normal(means[0], sigs[0])
        samples = dists.sample()

        # Get outputs from model here
        output = get_abm_exp_outputs(samples.abs().numpy(), batch_size, model)

        if i % 10 == 0:
            print('episode ', i)
            print(' ', 'output', output[0])
            print(' ', 'means', means[0])
            print(' ', 'sigs', sigs[0])
            print(' ', 'samples', samples)

        reward = -F.pdist(torch.stack([output[0], y[0]]), p=2)
        # Score-function estimator: log-probability of the action that was taken.
        loss = -dists.log_prob(samples).sum() * reward

        opt.zero_grad()
        loss.backward()
        opt.step()
def compute_diversity(self, pred_embeddings):
    """Return the average pairwise distance of the recommended items.

    The sum of the condensed distances returned by the module-level ``pdist``
    is multiplied by two and divided by ``n * (n - 1)``, where ``n`` is
    ``len(pred_embeddings)``.

    Note: ``pdist`` is resolved from the enclosing module; it is presumably
    torch's ``pdist`` since its result is fed to ``torch.sum`` (to confirm).
    """
    n_items = len(pred_embeddings)
    pair_total = torch.sum(pdist(pred_embeddings))
    # n * (n - 1) counts ordered pairs, hence the factor 2 on the unordered sum.
    ordered_pairs = n_items * (n_items - 1)
    return 2 * pair_total / ordered_pairs
def forward(self, targets: torch.Tensor, embedding_vector: torch.Tensor, *args, **kwargs):
    """Pull/push embedding loss over labelled lane pixels.

    For every batch element and every lane id in ``1..self.max_lane_count``
    that owns more than one pixel:

    * pull term: each pixel embedding farther than ``self.delta_v`` from the
      lane's mean embedding is penalised by the squared excess distance;
    * the lane mean is remembered for the push term.

    When a batch element has more than one such lane, the push term is the
    mean of ``relu(self.delta_d - d) ** 2`` over all pairwise distances ``d``
    between its lane means.

    Args:
        targets: per-pixel lane ids, indexed as ``targets[batch]`` -> 2-D map.
            Values outside ``1..self.max_lane_count`` are ignored.
        embedding_vector: embeddings indexed as ``[batch, channel, row, col]``.

    Returns:
        ``dist_loss + var_loss`` where ``dist_loss`` is the summed pull term
        divided by the number of contributing pixels and ``var_loss`` is the
        mean push term over batch elements; either falls back to
        ``self.zero`` when it has no contributions.
    """
    pull_terms = []
    push_terms = []
    n_points = 0

    for bid in range(targets.shape[0]):
        lane_means = []
        for lane_id in range(1, self.max_lane_count + 1):
            vs, us = torch.nonzero(targets[bid] == lane_id, as_tuple=True)
            if vs.numel() <= 1:
                continue

            # (channels, n_pixels) embeddings belonging to this lane
            lane_embeds = embedding_vector[bid, :, vs, us]
            center = lane_embeds.mean(dim=1)
            offsets = lane_embeds - center.reshape((-1, 1))

            excess = F.relu(torch.linalg.norm(offsets, dim=0) - self.delta_v)
            pull_terms.append(torch.sum(excess ** 2))
            n_points = n_points + vs.numel()
            lane_means.append(center)

        if len(lane_means) > 1:
            gaps = F.pdist(torch.stack(lane_means))
            push_terms.append(torch.mean(F.relu(self.delta_d - gaps) ** 2))

    if n_points > 0:
        dist_loss = torch.sum(torch.stack(pull_terms)) / n_points
    else:
        dist_loss = self.zero

    if len(push_terms) > 0:
        var_loss = torch.mean(torch.stack(push_terms))
    else:
        var_loss = self.zero

    return dist_loss + var_loss
def forward(self):
    """Run three distance ops on random ``(8, 4)`` inputs and return their results.

    Returns:
        A 3-tuple ``(pairwise_distance(a, b), cosine_similarity(a, b),
        pdist(a))``.

    The results were previously wrapped in ``len(...)`` with three positional
    arguments, which always raised ``TypeError``; the tuple itself is
    returned instead.
    """
    a = torch.randn(8, 4)
    b = torch.randn(8, 4)
    return (
        F.pairwise_distance(a, b),
        F.cosine_similarity(a, b),
        F.pdist(a),
    )
def _pairwise_dist(cx, cy, p=2, _pow_flag=False): """Compute pairwise distances between two Tensors of size m x `shape` and n x `shape`. This should be done as efficiently as possible. Discussions: https://github.com/pytorch/pytorch/issues/9406 """ def _are_equal(cx, cy): if cx is cy: return True return torch.equal(cx, cy) res = None m = cx.size(0) n = cy.size(0) imsize = cx.view(m, -1).size(-1) cx_eq_cy = _are_equal(cx, cy) if cx_eq_cy: # logger.debug("Calc pairwise distance with pytorch.pdist.") # Calculate only triangular, fast, cheaper, stable. Looks like this: # torch.cat([torch.full((n - i - 1,), i, dtype=torch.int64) for i in range(n)]) res = F.pdist(cx.view(m, -1), p=p) elif p == 2 and m * n * imsize * (torch.finfo(cx.dtype).bits // 8) > 4 * 1024**2: # logger.debug("Calc pairwise distance with quadratic expansion.") # If more than 4MB needed to repr a full matrix # Faster and cheaper, but less stable (quadratic expansion) # Still slower than the first choice cx_ = cx.view(m, -1) cy_ = cy.view(n, -1) cx_norm = cx_.pow(2).sum(dim=-1, keepdim=True) cy_norm = cy_.pow(2).sum(dim=-1, keepdim=True).transpose(-2, -1) res = cx_norm + cy_norm - 2 * cx_.matmul(cy_.transpose(-2, -1)) if cx_eq_cy: # Ensure zero diagonal diag_inds = torch.arange(m) res[diag_inds, diag_inds] = 0 # Zero out negative values res.clamp_min_(0) if _pow_flag: _pow_flag[0] = True else: res = res.sqrt() else: # logger.debug("Calc pairwise distance with naive broadcasting.") # More expensive - Θ(n^2 d), but numerically more stable cx_ = cx.view(m, 1, -1) cy_ = cy.view(1, n, -1) # XXX does not support broadcasting yet #15918 and #15901 # res = F.pairwise_distance(cx_, cy_, p=p, eps=1e-8) res = torch.norm(cx_ - cy_, p=p, dim=-1) return res
def pairwise_distance(ps, qs=None, p=2):
    """Pairwise ``p``-norm distances, reduced over the last dimension.

    Args:
        ps: points; rows are compared with each other when ``qs`` is omitted.
        qs: optional second set of points.
        p: norm order.

    Returns:
        ``F.pdist(ps, p=p)`` (condensed upper triangle) when ``qs`` is
        ``None``, otherwise ``torch.cdist(ps, qs, p=p)`` (full matrix).
    """
    # A manual broadcasting fallback used to follow the cdist return; it
    # could never execute and has been removed.
    if qs is None:
        return F.pdist(ps, p=p)
    return torch.cdist(ps, qs, p=p)
def forward(self, p):
    """KL-style divergence between ``p`` and the embedding's pairwise affinities.

    The low-dimensional affinities ``q`` are built from the condensed
    pairwise distances of ``self.embedding.weight``:
    ``(1 + d^2) ** (-self.degrees_of_freedom)``, divided by twice their sum
    and floored at the module-level ``eps``.

    Args:
        p: 1-D tensor of target affinities aligned with the condensed
            pairwise ordering (it is combined with ``q`` through ``dot``).

    Returns:
        ``sum_i p_i * (log(max(p_i, eps)) - log(q_i))`` as a scalar tensor.
    """
    squared = F.pdist(self.embedding.weight).pow(2)
    kernel = (1. + squared).pow(-1.0 * self.degrees_of_freedom)
    q = torch.clamp(kernel / (2 * kernel.sum()), min=eps)

    log_ratio = torch.log(torch.clamp(p, min=eps)) - torch.log(q)
    return p.dot(log_ratio).sum()
def dist_mat(embeddings):
    """Expand ``F.pdist``'s condensed vector into a square matrix.

    Only the strict upper triangle is filled; the diagonal and the lower
    triangle stay zero.

    Side effects / caveats (kept as-is):
      * ``embeddings`` is squeezed *in place*, so the caller's tensor loses
        its size-1 dimensions.
      * The result is a float32 matrix placed on CUDA whenever CUDA is
        available, regardless of where ``embeddings`` lives.

    Returns:
        ``(n, n)`` tensor with ``mat[i, j]`` = distance between rows ``i < j``.
    """
    embeddings.squeeze_()
    n_samples = embeddings.shape[0]

    mat = torch.zeros(n_samples, n_samples)
    if torch.cuda.is_available():
        mat = mat.cuda()

    condensed = F.pdist(embeddings)

    # Row r owns the next (n - 1 - r) entries of the condensed vector.
    start = 0
    for row in range(n_samples - 1):
        stop = start + (n_samples - 1 - row)
        mat[row, row + 1:] = condensed[start:stop]
        start = stop
    return mat
def MMD_torch(Z, A, bandwidth=1.):
    """Calculates the MMD between the two groups (as defined by A) in Z.

    MMD stands for Maximum Mean Discrepancy. As calculated in
    https://arxiv.org/pdf/1511.00830.pdf

    The kernel evaluated here is ``exp(-bandwidth * ||z_i - z_j||_2)``, i.e.
    it uses the (unsquared) Euclidean distance returned by ``F.pdist``.

    Args:
        Z (Torch tensor): an (n x d)-shaped tensor, representing some data
            about each individual.
        A (Torch tensor): an (n x 1) or (n,)-shaped tensor, representing a
            binary sensitive attribute for each individual.
        bandwidth (float, optional): bandwidth parameter for the kernel.

    Returns:
        mmd: an estimate of the MMD between the distributions
            P(Z | A = 0) and P(Z | A = 1).

    Notes:
        Group membership is selected with boolean masks; indexing with a
        uint8 tensor is deprecated in PyTorch. If either group is empty the
        normalisation divides by zero.
    """
    in_group1 = A.flatten().bool()
    in_group0 = ~in_group1

    # Separate the A = 0 and A = 1 groups in Z, then stack them group 0 first.
    Z0 = Z[in_group0, :]
    Z1 = Z[in_group1, :]
    Z01 = torch.cat([Z0, Z1], dim=0)
    n0 = torch.sum(in_group0)
    n1 = torch.sum(in_group1)

    # Calculate the pairwise distances between rows of Z.
    dists = F.pdist(Z01)

    # Determine which elements of dists represent distances between rows from
    # which groups.
    v0, v1, v01 = get_mmd_inds(n0, n1)
    Z0_dist = dists[v0]
    Z1_dist = dists[v1]
    Z_dist = dists[v01]

    # Within-group sums count each unordered pair twice and add the n
    # self-pairs (kernel value 1 at distance 0).
    kernel_sum_0 = 2 * torch.sum(torch.exp(-bandwidth * Z0_dist)) + n0
    kernel_sum_1 = 2 * torch.sum(torch.exp(-bandwidth * Z1_dist)) + n1
    kernel_sum_01 = torch.sum(torch.exp(-bandwidth * Z_dist))

    mmd = (kernel_sum_0 / (n0**2)
           + kernel_sum_1 / (n1**2)
           - 2 * kernel_sum_01 / (n0 * n1))
    return mmd
def hsic_single(x1, x2, sigma):
    """HSIC-style dependence score between two sets of row vectors.

    For each input a ``(c, c)`` kernel matrix
    ``K = exp(-dist ** 2 / (sigma * n))`` is built from the pairwise L2
    distances of its rows. The score combines the two kernels as

        trace(K1 K2) / (c - 1)^2
        + mean(K1) * mean(K2) * c^2 / (c - 1)^2
        - 2 * mean(K1 K2) * c / (c - 1)^2

    param::x1, x2: of size (c, n)
    param::sigma: kernel width; it is multiplied by ``n`` in the exponent.

    Returns a scalar tensor. ``c == 1`` leads to a division by zero.

    The square distance matrix is allocated with the dtype and device of the
    distances, so inputs other than CPU float32 (e.g. float64) work too.
    """
    c, n = x1.size()

    Ks = []
    for x in (x1, x2):
        d = F.pdist(x)
        # Row-major upper-triangle indices match pdist's condensed ordering.
        idx = torch.triu_indices(c, c, 1, device=d.device)
        dist = d.new_zeros((c, c))
        dist[idx[0], idx[1]] = d
        dist = dist + dist.t()  # mirror into the lower triangle
        Ks.append(torch.exp(-dist ** 2 / (sigma * n)))

    K12 = torch.matmul(Ks[0], Ks[1])
    scale = (c - 1) ** 2
    hsic = torch.trace(K12) / scale + \
        torch.mean(Ks[0]) * torch.mean(Ks[1]) * c ** 2 / scale - \
        2 * torch.mean(K12) * c / scale
    return hsic
def calculate_affinity_gpu(data):
    """Rank-based pairwise affinity computed on the GPU.

    ``data`` is converted to a float64 CUDA tensor of shape ``(n, d)``. The
    symmetric distance matrix is rebuilt from the module-level ``pdist``
    result, each row is turned into ranks (``argsort`` of ``argsort``), and
    the affinity of ``(i, j)`` is ``rank[i, j] * rank[j, i]``.

    Returns:
        ``(n, n)`` NumPy array of affinities (copied back to the CPU).
    """
    data = torch.tensor(data).double().cuda()  # type: torch.Tensor
    n, _ = data.shape

    condensed = pdist(data)

    # Fill the strict upper triangle row by row, then mirror it.
    square = torch.zeros((n, n)).double().cuda()
    offset = 0
    for row in range(n):
        width = n - row - 1
        square[row, row + 1:] = condensed[offset:offset + width]
        offset += width
    square = square + square.T

    # rank[i, j] = position of j when row i's distances are sorted ascending.
    rank = torch.zeros((n, n)).double().cuda()
    for row, row_dists in enumerate(square):
        rank[row] = torch.argsort(torch.argsort(row_dists))

    return (rank * rank.T).cpu().numpy()
v = x.unsqueeze(1).expand((args.batch_size, args.sample_size, 3, 32, 32)) v = v.reshape((-1, 3, 32, 32)) noised = noise.sample(v) if args.model == "ResNet": rep_noisy = get_final_layer(model, noised) elif args.model == "MLP": rep_noisy = get_final_layer_mlp(model, noised) else: raise ValueError rep_noisy = rep_noisy.reshape(args.batch_size, -1, rep_noisy.shape[-1]) top_cats = model(noised).reshape(args.batch_size, -1, 10).argmax(dim=2).mode(dim=1) top_cats = top_cats.values l2 = torch.stack([F.pdist(rep_i, p=2) for rep_i in rep_noisy]).mean(dim=1).data l1 = torch.stack([F.pdist(rep_i, p=1) for rep_i in rep_noisy]).mean(dim=1).data linf = torch.stack([F.pdist(rep_i, p=float("inf")) for rep_i in rep_noisy]).mean(dim=1).data results["acc"] += (y == top_cats).float().cpu().numpy().tolist() results["l1"] += l1.cpu().numpy().tolist() results["l2"] += l2.cpu().numpy().tolist() results["linf"] += linf.cpu().numpy().tolist() results["noise"] += args.batch_size * [noise_str] results["epoch"] += args.batch_size * [epoch] if args.load: results = pd.read_csv(f"{args.dir}/snapshots.csv") else: results = pd.DataFrame(results) results.to_csv(f"{args.dir}/snapshots.csv")
def train(self, data, n_epochs):
    """Train the autoencoder with reconstruction, homology and compactness terms.

    Each batch is encoded, optionally normalised for the homology term
    (``self.normalize_for_homology`` in ``{'std', '01'}``), and the total loss

        target_penalty * reconstruction
        + homology_penalty * homology
        + compactness_penalty * compactness

    is minimised with ``self.optimizer``. Running per-epoch averages of the
    three terms are shown in a tqdm progress bar every ``self.throttle``
    batches.

    Args:
        data: dataset handed to ``torch.utils.data.DataLoader`` (shuffled,
            ``batch_size=self.batch_size``).
        n_epochs: number of passes over ``data``.

    Side effects:
        Overwrites ``self.pdist``, ``self.persistence_births`` and
        ``self.persistence_deaths`` on every batch for which the homology
        term is active.
    """
    loader = torch.utils.data.DataLoader(data,
                                         batch_size=self.batch_size,
                                         shuffle=True)

    tdqm_dict_keys = ['homology', 'compactness', 'reconstruction']
    tdqm_dict = dict.fromkeys(tdqm_dict_keys, 0.0)

    for epoch in range(n_epochs):
        # initialize cumulative losses to zero at the start of epoch
        total_homology_loss = 0.0
        total_reconstruction_loss = 0.0
        total_compactness_loss = 0.0

        with tqdm(total=len(loader),
                  unit_scale=True,
                  postfix={
                      'homology': 0.0,
                      'compactness': 0.0,
                      'reconstruction': 0.0
                  },
                  desc="Epoch : %i/%i" % (epoch + 1, n_epochs),
                  ncols=100) as pbar:
            for batch_idx, batch in enumerate(loader):
                batch = batch.type(torch.float32).to(self.device)
                latent = self.autoencoder.encoder(batch)

                if self.normalize_for_homology == 'std':
                    latent_hom = (latent - torch.mean(
                        latent, axis=0)) / torch.std(latent, axis=0)
                elif self.normalize_for_homology == '01':
                    latent_hom = (latent - torch.min(latent, axis=0).values
                                  ) / (torch.max(latent, axis=0).values -
                                       torch.min(latent, axis=0).values)
                else:
                    latent_hom = latent

                # in pure reconstruction mode,
                # skip the Gudhi part to speed training up
                if self.homology_penalty == 0.0:
                    homology_loss = torch.FloatTensor([0.0]).to(self.device)
                else:
                    # pdist is a flat tensor representing the upper triangle
                    # of the pairwise distance matrix.
                    self.pdist = F.pdist(latent_hom)

                    # compute the persistence interval lengths
                    self.persistence_births, self.persistence_deaths = persistence_pairs(
                        latent_hom,
                        dim=self.homology_dim,
                        device=self.device)

                    # The final batch of an epoch can hold fewer than
                    # self.batch_size points, so the condensed index must be
                    # decoded with the number of points actually present.
                    # (Assumes the first argument of
                    # triangular_from_linear_index is the point count, as the
                    # previous self.batch_size argument implied.)
                    n_points = latent_hom.shape[0]

                    # Indicators of the point pairs whose Vietoris-Rips ball
                    # intersection is a death / birth event for the homology
                    # of interest.
                    indicators_death = torch.FloatTensor([
                        self.indicator_death(
                            triangular_from_linear_index(n_points, k))
                        for k in range(self.pdist.shape[0])
                    ]).to(self.device)
                    indicators_birth = torch.FloatTensor([
                        self.indicator_birth(
                            triangular_from_linear_index(n_points, k))
                        for k in range(self.pdist.shape[0])
                    ]).to(self.device)

                    death_pdist = self.pdist[torch.where(
                        indicators_death == 1)[0]].to(self.device)
                    if self.homology_dim == 0:
                        birth_pdist = torch.zeros(death_pdist.shape).to(
                            self.device)
                    else:
                        birth_pdist = self.pdist[torch.where(
                            indicators_birth == 1)[0]].to(self.device)

                    # Due to rounding, it may sometimes happen that more pairs
                    # are located in the pdist matrix than there are: this
                    # hack bypasses that.
                    n_pairs = min([
                        len(self.persistence_births),
                        len(self.persistence_deaths)
                    ])

                    # Compute homology loss
                    homology_loss = torch.norm(death_pdist[:n_pairs] -
                                               birth_pdist[:n_pairs] +
                                               self.homology_eps,
                                               p=self.norm)

                # MSE loss between true input and decoder output
                reconstruction_loss = F.mse_loss(
                    batch, self.autoencoder.decoder(latent))

                # A trivial solution to the homology optimization is to
                # increase or decrease the scale of the latent point cloud.
                # To avoid that, add a penalization on the radius of
                # the point cloud for a given norm.
                compactness_loss = torch.norm(latent -
                                              torch.mean(latent, axis=0),
                                              p=self.norm)

                loss = self.target_penalty * reconstruction_loss \
                    + self.homology_penalty * homology_loss \
                    + self.compactness_penalty * compactness_loss

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_homology_loss += homology_loss.item()
                total_reconstruction_loss += reconstruction_loss.item()
                total_compactness_loss += compactness_loss.item()

                # Logging: running means over the batches seen this epoch
                tdqm_dict['homology'] = total_homology_loss / (batch_idx + 1)
                tdqm_dict['reconstruction'] = total_reconstruction_loss / (
                    batch_idx + 1)
                tdqm_dict['compactness'] = total_compactness_loss / (
                    batch_idx + 1)

                if batch_idx % self.throttle == 0:
                    pbar.set_postfix(tdqm_dict)
                    pbar.update(self.throttle)
def test_pdist(self):
    """Smoke-test ``F.pdist`` (p=2) on a 128 x 128 CUDA tensor."""
    # float32 on purpose: pdist is not implemented for fp16
    sample = torch.randn((128, 128), dtype=torch.float32, device='cuda')
    F.pdist(sample, p=2)
def pdist(self, x):
    """Return the condensed pairwise L2 distances between the rows of ``x``."""
    condensed = F.pdist(x, p=2)
    return condensed