def test_emd2_multi():
    from ot.datasets import get_1D_gauss as gauss

    n = 1000  # nb bins

    # bin positions
    x = np.arange(n, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=20, s=5)  # m= mean, s= std

    ls = np.arange(20, 1000, 20)
    nb = len(ls)
    b = np.zeros((n, nb))
    for i in range(nb):
        b[:, i] = gauss(n, m=ls[i], s=10)

    # loss matrix
    M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))
    # M/=M.max()

    print('Computing {} EMD '.format(nb))

    # emd loss, 1 process
    ot.tic()
    emd1 = ot.emd2(a, b, M, 1)
    ot.toc('1 proc : {} s')

    # emd loss, multiprocessing
    ot.tic()
    emdn = ot.emd2(a, b, M)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(emd1, emdn)
def test_emd_emd2_devices_tf():
    if not tf:
        return
    nx = ot.backend.TensorflowBackend()

    n_samples = 100
    n_features = 2
    rng = np.random.RandomState(0)
    x = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples, n_features)
    a = ot.utils.unif(n_samples)
    M = ot.dist(x, y)

    # Check that everything stays on the CPU
    with tf.device("/CPU:0"):
        ab, Mb = nx.from_numpy(a, M)
        Gb = ot.emd(ab, ab, Mb)
        w = ot.emd2(ab, ab, Mb)
        nx.assert_same_dtype_device(Mb, Gb)
        nx.assert_same_dtype_device(Mb, w)

    if len(tf.config.list_physical_devices('GPU')) > 0:
        # Check that everything happens on the GPU
        ab, Mb = nx.from_numpy(a, M)
        Gb = ot.emd(ab, ab, Mb)
        w = ot.emd2(ab, ab, Mb)
        nx.assert_same_dtype_device(Mb, Gb)
        nx.assert_same_dtype_device(Mb, w)
        assert nx.dtype_device(Gb)[1].startswith("GPU")
def double_wasserstein1(X_train_smote):
    n, m, r, _ = X_train_smote.shape

    # uniform measures on the collection of m point clouds
    a2 = np.ones(m) / m
    b2 = np.ones(m) / m

    # uniform measures on the r points of each cloud
    a1 = np.ones(r) / r
    b1 = np.ones(r) / r

    # 1st-level distance matrix of size m x m
    M1 = np.zeros((m, m))

    # M1 loop
    for i in range(m):
        for j in range(i + 1, m):
            # pairwise squared Euclidean distances as the ground metric
            M0_ij = ot.dist(X_train_smote[0, i], X_train_smote[1, j],
                            metric="sqeuclidean")
            # 2-Wasserstein distance between point clouds, take square root
            M1[i, j] = ot.emd2(a1, b1, M0_ij) ** 0.5

    # 1st-level symmetrize
    M1 = M1 + M1.T
    np.fill_diagonal(M1, 1e9)

    # 1-Wasserstein distance between collections of point clouds
    W1 = ot.emd2(a2, b2, M1)

    return W1
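# Added usage sketch (not from the original source): a minimal call of
# double_wasserstein1 above. The array shape (2 collections, m=4 point clouds,
# r=30 points, 3 dims) and the helper name _demo_double_wasserstein1 are
# illustrative assumptions; the function itself still relies on the
# module-level np/ot imports of its original script.
def _demo_double_wasserstein1():
    import numpy as np

    rng = np.random.RandomState(0)
    X_demo = rng.rand(2, 4, 30, 3)  # (n, m, r, dim) as unpacked by the function
    return double_wasserstein1(X_demo)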
def wmd(p, q, C, truncate=None):
    """Word mover's distance between distributions p and q with cost matrix C."""
    if truncate is None:
        return ot.emd2(p, q, C)
    # keep only the `truncate` heaviest entries of each distribution
    id_p = np.argsort(p)[-truncate:]
    id_q = np.argsort(q)[-truncate:]
    C_reduced = C[id_p][:, id_q]
    return ot.emd2(p[id_p], q[id_q], C_reduced)
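# Added usage sketch (not from the original source): exercises the
# non-truncated path of wmd above with random distributions and a random
# cost matrix. Sizes and the helper name _demo_wmd are assumptions.
def _demo_wmd():
    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    p = rng.rand(50)
    p /= p.sum()
    q = rng.rand(50)
    q /= q.sum()
    C = ot.dist(rng.randn(50, 8), rng.randn(50, 8))  # squared Euclidean costs
    return wmd(p, q, C)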
def w2(k):
    # NOTE: Y, batch_var, dist_norm, M and w2p are expected to be defined at
    # module level in the original script; only w2z is allocated locally here.
    print(k)
    w2z = np.zeros((2, 10, 10, 500))
    #w2b = np.zeros((10,10,500))
    for j in range(1, 11):
        print(' j: ' + str(j))
        for i in range(1):
            try:
                #Y = np.load('/home/archithpc/data/batch-variable-genes-3/sim-cluster-' + str(j) + '-' + str(i) + '.npy')[k,j-1,i-1]
                #Y = np.loadtxt('/home/archithpc/data/batch-effect-variable-genes-3/' + str(k) + '/sim-cluster-' + str(j) + '-' + str(i) + '.csv')
                dist_obs = squareform(pdist(Y))
                dist_obs_norm = normalize(dist_obs, norm='l1')

                #path = './'+ folder + '/matern/' + init + '/' + str(k) + '/' + str(j) + '/' + str(i) + '/model-output-' + iters + '.hdf5'
                #fit = h5py.File(path, 'r')
                #zfit = fit['x_mean']
                #dist_z = squareform(pdist(zfit))
                #dist_z_norm = normalize(dist_z, norm = 'l1')

                # regress out the batch covariates, then embed the residuals with PCA
                ols = LinearRegression(fit_intercept=False)
                ols.fit(batch_var, Y)
                Yres = Y - ols.predict(batch_var)

                pca = PCA(n_components=3)
                zpca = pca.fit_transform(Yres)
                dist_pca = squareform(pdist(zpca))
                dist_pca_norm = normalize(dist_pca, norm='l1')

                #seurat = pd.read_csv('./'+folder+'/results/seurat-results/sim-' + str(i) + '.csv')
                #dist_seurat = squareform(pdist(seurat.values))
                #dist_s_norm = normalize(dist_seurat, norm = 'l1')

                #mnn = pd.read_csv('./'+folder+'/mnn-results/' + str(k) + '/sim-' + str(j) + '-' + str(i) + '.csv')
                #dist_m = squareform(pdist(mnn.values))
                #dist_m_norm = normalize(dist_m, norm = 'l1')

                for n in range(0, 500):
                    #w2z[0,j-1,i-1,n] = ot.emd2(dist_z_norm[j,:],dist_norm[j,:],M)
                    #w2z[1,j-1,i-1,n] = ot.emd2(dist_z_norm[j,:],dist_obs_norm[j,:],M)
                    w2p[0, j - 1, i - 1, n] = ot.emd2(dist_pca_norm[j, :], dist_norm[j, :], M)
                    w2p[1, j - 1, i - 1, n] = ot.emd2(dist_pca_norm[j, :], dist_obs_norm[j, :], M)
                    #w2s[0,k,j-1,i-1,n] = ot.emd2(dist_s_norm[j,:],dist_norm[j,:],M)
                    #w2s[1,k,j-1,i-1,n] = ot.emd2(dist_s_norm[j,:],dist_obs_norm[j,:],M)
                    #w2m[0,j-1,i-1,n] = ot.emd2(dist_m_norm[j,:],dist_norm[j,:],M)
                    #w2m[1,j-1,i-1,n] = ot.emd2(dist_m_norm[j,:],dist_obs_norm[j,:],M)
                    #w2b[j-1,i-1,n] = ot.emd2(dist_norm[j,:],dist_obs_norm[j,:],M)
            except Exception:
                # mark failed simulations with -1
                w2z[0, j - 1, i - 1, :] = -1 * np.ones(500)
                w2z[1, j - 1, i - 1, :] = -1 * np.ones(500)
    return w2p
def test_periodic_phi(gdim, M):
    events = np.random.rand(nev, M, 1 + gdim)
    for phi_col in range(1, gdim + 1):
        emds1 = emd.emds_pot(events, R=1.0, gdim=gdim, n_jobs=1, verbose=0)

        events_c = np.copy(events)
        events_c[:, :, phi_col] += 2 * np.pi * np.random.randint(-10, 10, size=(nev, M))
        emds2 = emd.emds_pot(events_c, R=1.0, gdim=gdim, periodic_phi=True,
                             phi_col=phi_col, n_jobs=1, verbose=0)
        assert epsilon_diff(emds1, emds2, 10**-12)

        ev1 = np.random.rand(10, 1 + gdim) * 4 * np.pi
        ev2 = np.random.rand(20, 1 + gdim) * 4 * np.pi

        thetaw = np.zeros((len(ev1), len(ev2)))
        thetar = np.zeros((len(ev1), len(ev2)))
        for i, p1 in enumerate(ev1):
            for j, p2 in enumerate(ev2):
                dw, dr = 0., 0.
                for m, (k1, k2) in enumerate(zip(p1, p2)):
                    if m == 0:
                        continue
                    elif m == phi_col:
                        dw += (k1 - k2)**2
                        dr += np.min([abs(k1 - (k2 + 2 * np.pi * n))
                                      for n in range(-3, 3)])**2
                    else:
                        dw += (k1 - k2)**2
                        dr += (k1 - k2)**2
                thetaw[i, j] = np.sqrt(dw)
                thetar[i, j] = np.sqrt(dr)

        zs1 = np.ascontiguousarray(ev1[:, 0] / np.sum(ev1[:, 0]))
        zs2 = np.ascontiguousarray(ev2[:, 0] / np.sum(ev2[:, 0]))

        ot_w, ot_r = ot.emd2(zs1, zs2, thetaw), ot.emd2(zs1, zs2, thetar)
        ef_w = emd.emd_pot(ev1, ev2, norm=True, gdim=gdim,
                           periodic_phi=False, phi_col=phi_col)
        ef_r = emd.emd_pot(ev1, ev2, norm=True, gdim=gdim,
                           periodic_phi=True, phi_col=phi_col)

        assert epsilon_diff(ot_w, ef_w, 10**-14)
        assert epsilon_diff(ot_r, ef_r, 10**-14)
def test_emd2_multi():
    n = 500  # nb bins

    # bin positions
    x = np.arange(n, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=20, s=5)  # m= mean, s= std

    ls = np.arange(20, 500, 20)
    nb = len(ls)
    b = np.zeros((n, nb))
    for i in range(nb):
        b[:, i] = gauss(n, m=ls[i], s=10)

    # loss matrix
    M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))
    # M/=M.max()

    print('Computing {} EMD '.format(nb))

    # emd loss, 1 process
    ot.tic()
    emd1 = ot.emd2(a, b, M, 1)
    ot.toc('1 proc : {} s')

    # emd loss, multiprocessing
    ot.tic()
    emdn = ot.emd2(a, b, M)
    ot.toc('multi proc : {} s')

    ot.tic()
    emdn2 = ot.emd2(a, b, M, dense=False)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(emd1, emdn)
    np.testing.assert_allclose(emd1, emdn2, rtol=1e-6)

    # emd loss, multiprocessing with log
    ot.tic()
    emdn = ot.emd2(a, b, M, log=True, return_matrix=True)
    ot.toc('multi proc : {} s')

    for i in range(len(emdn)):
        emd = emdn[i]
        log = emd[1]
        cost = emd[0]
        check_duality_gap(a, b[:, i], M, log['G'], log['u'], log['v'], cost)
        emdn[i] = cost

    emdn = np.array(emdn)
    np.testing.assert_allclose(emd1, emdn)
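# Added sketch (not from the original tests): a tiny illustration of the
# column broadcasting exercised above -- with a 2-D second argument, ot.emd2
# returns one EMD value per column of b. Sizes and the helper name are
# arbitrary assumptions.
def _demo_emd2_columns():
    import numpy as np
    import ot

    x = np.arange(10, dtype=np.float64).reshape((-1, 1))
    M = ot.dist(x, x)
    a = ot.utils.unif(10)
    # two target histograms stacked as columns: uniform and a one-hot
    b = np.stack([ot.utils.unif(10), np.eye(10)[0]], axis=1)
    return ot.emd2(a, b, M)  # array with one cost per column of b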
def test_emd2_gradients():
    n_samples = 100
    n_features = 2
    rng = np.random.RandomState(0)

    x = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples, n_features)
    a = ot.utils.unif(n_samples)

    M = ot.dist(x, y)

    if torch:
        a1 = torch.tensor(a, requires_grad=True)
        b1 = torch.tensor(a, requires_grad=True)
        M1 = torch.tensor(M, requires_grad=True)

        val, log = ot.emd2(a1, b1, M1, log=True)

        val.backward()

        assert a1.shape == a1.grad.shape
        assert b1.shape == b1.grad.shape
        assert M1.shape == M1.grad.shape

        assert np.allclose(
            a1.grad.cpu().detach().numpy(),
            log['u'].cpu().detach().numpy() - log['u'].cpu().detach().numpy().mean())

        assert np.allclose(
            b1.grad.cpu().detach().numpy(),
            log['v'].cpu().detach().numpy() - log['v'].cpu().detach().numpy().mean())

        # Testing for bug #309, checking for scaling of gradient
        a2 = torch.tensor(a, requires_grad=True)
        b2 = torch.tensor(a, requires_grad=True)
        M2 = torch.tensor(M, requires_grad=True)

        val = 10.0 * ot.emd2(a2, b2, M2)

        val.backward()

        assert np.allclose(10.0 * a1.grad.cpu().detach().numpy(),
                           a2.grad.cpu().detach().numpy())
        assert np.allclose(10.0 * b1.grad.cpu().detach().numpy(),
                           b2.grad.cpu().detach().numpy())
        assert np.allclose(10.0 * M1.grad.cpu().detach().numpy(),
                           M2.grad.cpu().detach().numpy())
def wasserstein_l1(base_dist, new_dist, normalise=False):
    """Calculate the Wasserstein distance between two distributions via samples,
    using the L1 distance between samples as the ground cost."""
    N = base_dist.shape[0]
    base_dist = np.copy(base_dist)
    new_dist = np.copy(new_dist)

    # Sanitise the distributions (np.ndarray.clip returns a copy, so reassign)
    base_dist = base_dist.clip(0, 1)
    new_dist = new_dist.clip(0, 1)

    base_vector = np.ones(N) / N
    new_vector = np.ones(N) / N

    cost_matrix = np.zeros(shape=(N, N))
    for i in range(N):
        row_cost = np.abs(base_dist[i] - new_dist).sum(axis=1)
        cost_matrix[i] = row_cost

    if normalise:
        mean_phi = np.mean(new_dist, axis=0)
        cost_to_base = np.abs(mean_phi - base_dist).sum(axis=1) + 0.0000001
        cost_matrix = cost_matrix / cost_to_base[:, np.newaxis]

    w_results = emd2(base_vector, new_vector, cost_matrix)
    return w_results
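# Added usage sketch (not from the original source): two random sample sets of
# equal size, with and without the normalise option. Shapes and the helper
# name are assumptions; `np` and `emd2` are assumed imported at module level,
# as wasserstein_l1 above requires.
def _demo_wasserstein_l1():
    import numpy as np

    rng = np.random.RandomState(0)
    base = rng.rand(64, 8)
    new = rng.rand(64, 8)
    return wasserstein_l1(base, new), wasserstein_l1(base, new, normalise=True)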
def pot_wasserstein_mapper(net1, net2, metric_space, p=None, q=None):
    """Computes vanilla EMD (over Hausdorff dist) for mapper graphs

    Parameters
    ----------
    net1 : lightweight_mapper.Network
        Mapper graph
    net2 : lightweight_mapper.Network
        Mapper graph
    metric_space : np.array
        Pairwise distance matrix
    p : np.array - nx1
        Distribution over nodes corresponding to net1
    q : np.array - nx1
        Distribution over nodes corresponding to net2

    Returns
    -------
    EMD (Cost = Hausdorff dist)
    """
    C3 = network_merge_distance(net1, net2, metric_space)
    if p is None or q is None:
        p = np.diag(net1.adjacency_matrix.toarray())
        p = p / p.sum()
        q = np.diag(net2.adjacency_matrix.toarray())
        q = q / q.sum()
    gw_dist = ot.emd2(p, q, C3)
    params = ot.emd(p, q, C3)
    return gw_dist, params
def main(args):
    # Input sentences whose similarity we want to compute.
    # s1 (rough translation): "Naomi Osaka comes from behind to win her second
    #     US Open title in two years, her third Grand Slam victory."
    # s2 (rough translation): "Naomi Osaka takes her second US Open title in
    #     two years, her third major win."
    s1 = '大坂なおみ 逆転で2年ぶり2度目の全米OP優勝。3度目のグランドスラム制覇'
    s2 = '大坂なおみが2年ぶり2回目のV 4大大会3勝目 全米テニス'
    # s2 = '大相撲秋場所 八角理事長「横綱不在 申し訳ない」'

    mt = MeCab.Tagger(
        '-d {} -Owakati'.format(args.mecab_dict_path)
    ) if args.mecab_dict_path is not None else MeCab.Tagger('-Owakati')
    wv = KeyedVectors.load_word2vec_format(
        os.path.dirname(os.path.abspath(__file__)) +
        '/vecs/jawiki.word_vectors.200d.txt')

    w1 = get_w(s1, mt, wv)
    w2 = get_w(s2, mt, wv)
    z1 = get_z(w1)
    z2 = get_z(w2)
    m1 = [np.linalg.norm(w1_i) / z1 for w1_i in w1]
    m2 = [np.linalg.norm(w2_i) / z2 for w2_i in w2]

    # Compute cost matrix C from cosine dissimilarities
    c = []
    for w1_i in w1:
        c.append([1 - cos_sim(np.array(w1_i), np.array(w2_j)) for w2_j in w2])

    # Show the result
    print(s1)
    print(s2)
    print("{:.2f}".format(ot.emd2(m1, m2, c)))
def calculate_path_length(device, args, model, data, end_time, n_pts=10000):
    """Calculates the total length of the path from time 0 to end_time."""
    # z_samples = torch.tensor(data.get_data()).type(torch.float32).to(device)
    z_samples = data.base_sample()(n_pts, *data.get_shape()).to(device)
    model.eval()
    n = 1001
    with torch.no_grad():
        integration_times = (torch.tensor(np.linspace(0, end_time, n))
                             .type(torch.float32).to(device))
        # z, _ = model(z_samples, torch.zeros_like(z_samples), integration_times=integration_times, reverse=False)
        z, _ = model(
            z_samples,
            torch.zeros_like(z_samples),
            integration_times=integration_times,
            reverse=True,
        )
        z = z.cpu().numpy()
        z_diff = np.diff(z, axis=0)
        z_lengths = np.sum(np.linalg.norm(z_diff, axis=-1), axis=0)
        total_length = np.mean(z_lengths)

        import ot as pot
        from scipy.spatial.distance import cdist

        emd = pot.emd2(
            np.ones(n_pts) / n_pts,
            np.ones(n_pts) / n_pts,
            cdist(z[-1, :, :], data.get_data()),
        )
        print(total_length, emd)
        plt.scatter(z[-1, :, 0], z[-1, :, 1])
        plt.savefig("test.png")
        plt.close()
def basket_dist_EMD(self, baskets):
    basket1 = baskets[0]
    basket2 = baskets[1]
    dictionary = np.unique(list(basket1) + list(basket2))
    vocab_len_ = len(dictionary)
    product2ind = dict(zip(dictionary, np.arange(vocab_len_)))

    # Compute distance matrix.
    dictionary_vecs = self.model.wv.vectors[[x for x in dictionary]]
    distance_matrix = squareform(pdist(dictionary_vecs))

    if np.sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        return float('inf')

    def nbow(document):
        # note: np.float was removed from NumPy; the builtin float is equivalent
        bow = np.zeros(vocab_len_, dtype=float)
        for d in document:
            bow[product2ind[d]] += 1.
        return bow / len(document)

    # Compute nBOW representation of documents.
    d1 = nbow(basket1)
    d2 = nbow(basket2)

    # Compute WMD.
    return ot.emd2(d1, d2, distance_matrix)
def __call__(
    self,
    x: dict,
    x_0: dict,
    t: int = None,
    par: dict = None,
) -> float:
    # compute summary statistics, shape (n, dim), (n0, dim)
    s, s0 = self.sumstat(x), self.sumstat(x_0)
    n, n0 = s.shape[0], s0.shape[0]

    # pairwise cost matrix, shape (n, n0)
    m = self.dist(XA=s, XB=s0)

    # weights (could also be passed/learned?)
    w, w0 = np.ones((n,)) / n, np.ones((n0,)) / n0

    # optimal transport ("earth mover's") cost value
    cost = ot.emd2(a=w, b=w0, M=m, **self.emd_args, log=False)

    # take root to match Wasserstein distance definition
    if self.p < np.inf:
        cost = cost**(1 / self.p)

    return cost
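# Added sketch (not from the original source): the core of the distance above
# without the class machinery -- uniform weights over two samples and a
# pairwise cost matrix. Sample sizes, the use of ot.dist, and the p = 2 root
# are assumptions for illustration.
def _demo_emd_between_samples():
    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    s, s0 = rng.randn(40, 3), rng.randn(60, 3)   # two samples of summary statistics
    m = ot.dist(s, s0)                           # squared Euclidean costs by default
    w, w0 = np.ones(40) / 40, np.ones(60) / 60   # uniform weights
    cost = ot.emd2(a=w, b=w0, M=m)
    return cost ** 0.5                           # root for p = 2 (assumed)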
def evaluation(trainer):
    num_sample = 10000
    xp = gen.xp
    #xs = []
    """
    for i in range(0, num_sample, batchsize):
        z = Variable(xp.asarray(gen.make_hidden(batchsize)))
        with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
            x = gen(z)
        x = chainer.cuda.to_cpu(x.data)
        xs.append(x)
    xs = np.asarray(xs)
    """
    z = Variable(xp.asarray(gen.make_hidden(num_sample)))
    with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
        x = gen(z)
    xs = chainer.cuda.to_cpu(x.data)

    real_data = GmmDataset(num_sample, 123, num_cluster=8, std=0.02, scale=2)._data

    a, b = np.ones((num_sample,)) / num_sample, np.ones((num_sample,)) / num_sample
    #print(xs)
    #print(real_data)
    M = ot.dist(xs, real_data)
    M /= M.max()
    distance = ot.emd2(a, b, M)

    del xs
    gc.collect()
    del real_data
    gc.collect()

    #print(distance)
    chainer.reporter.report({
        'EMD': distance,
    })
def compute_batch_images_emd(b_img1, b_img2, eps=0):
    """Earth mover's distance between two batches of single-channel images.

    :param b_img1: array of shape (batch, height, width, 1)
    :param b_img2: array of shape (batch, height, width, 1)
    :param eps: small positive floor clipped onto pixel intensities so the EMD
        cannot degenerate to zero
    :return: array of shape (batch, 1), one EMD value per image pair
    """
    import ot
    assert b_img1.shape[-1] == 1 and b_img2.shape[-1] == 1
    b_img1 = b_img1.astype(np.float64)
    b_img2 = b_img2.astype(np.float64)

    # eps: avoid emd being zero
    b_img1 = np.clip(b_img1, a_min=eps, a_max=255)
    b_img1 /= b_img1.sum(axis=(1, 2, 3), keepdims=True)
    b_img2 = np.clip(b_img2, a_min=eps, a_max=255)
    b_img2 /= b_img2.sum(axis=(1, 2, 3), keepdims=True)

    b, h, w, c = b_img1.shape
    b_img1 = b_img1.reshape((b, -1))
    b_img2 = b_img2.reshape((b, -1))

    # ground cost: squared Euclidean distance between pixel coordinates
    xx, yy = np.meshgrid(np.arange(h), np.arange(w))
    xy = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))
    M = ot.dist(xy, xy)

    emd = np.zeros((b, 1))
    for idx in range(b):
        xapp1 = b_img1[idx]
        xapp2 = b_img2[idx]
        dist = ot.emd2(xapp1, xapp2, M)
        assert dist > 0
        emd[idx] = dist
    return emd
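# Added usage sketch (not from the original source): two small random image
# batches in NHWC layout with a single channel. The 8x8 size, batch of 2, and
# eps value are arbitrary assumptions.
def _demo_compute_batch_images_emd():
    import numpy as np

    rng = np.random.RandomState(0)
    b_img1 = rng.randint(0, 256, size=(2, 8, 8, 1))
    b_img2 = rng.randint(0, 256, size=(2, 8, 8, 1))
    return compute_batch_images_emd(b_img1, b_img2, eps=1e-3)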
def distFun_raw(self, x1, x2):
    if self.distType == "Euclidean":
        # normalize data (TODO: check if it works better)
        x1 = np.divide(x1, np.linalg.norm(x1, 1, 0))
        x2 = np.divide(x2, np.linalg.norm(x2, 1, 0))
        tmp = np.zeros((x1.shape[1], x2.shape[1]))
        for k in range(x1.shape[1]):
            tmp[k, :] = np.linalg.norm(x1[:, k] - x2, None, 0)
        if x1.shape[1] == 1 and x2.shape[1] == 1:
            tmp = tmp.item()
        return tmp
    elif self.distType == "Wasserstein":
        if x1.shape[1] == 1 and x2.shape[1] == 1:
            x1 = np.array(np.divide(x1, np.sum(x1))).flatten()
            x2 = np.array(np.divide(x2, np.sum(x2))).flatten()
            # print((x1.shape,x2.shape,self.TransportCostImg.shape))
            return ot.emd2(x1, x2, self.TransportCostImg)
        else:
            tmp = np.zeros((x1.shape[1], x2.shape[1]))
            for k in range(x1.shape[1]):
                for kk in range(x2.shape[1]):
                    if kk >= k:
                        # slice with k:k+1 to keep the columns 2-D for the recursive call
                        tmp[k, kk] = self.distFun_raw(x1[:, k:k + 1], x2[:, kk:kk + 1])
                    else:
                        tmp[k, kk] = tmp[kk, k]
            return tmp
def wasserstein_distance_1d(histograms, centers, normalize=True, **kwargs):
    # handle args
    if not isinstance(centers[0], np.ndarray):
        centers = (centers, centers)
    metric = kwargs.pop("metric", "euclidean")

    # checks
    assert len(histograms) == 2
    assert histograms[0].ndim == 1
    assert histograms[1].ndim == 1
    assert len(centers) == 2
    assert centers[0].ndim == 1
    assert centers[1].ndim == 1

    # point distance
    centers0 = centers[0][..., np.newaxis]
    centers1 = centers[1][..., np.newaxis]
    M = cdist(centers0, centers1, metric=metric)

    # normalize histograms
    if normalize:
        hist0 = histograms[0] / histograms[0].sum()
        hist1 = histograms[1] / histograms[1].sum()
    else:
        hist0 = histograms[0]
        hist1 = histograms[1]

    # wasserstein distance
    distance = ot.emd2(hist0, hist1, M)
    return distance
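# Added usage sketch (not from the original source): two Gaussian-shaped
# histograms on a shared 1-D grid of bin centers. The grid, bandwidths, and
# helper name are assumptions; as in the function above, `np`, `cdist`
# (scipy.spatial.distance) and `ot` are assumed imported at module level.
def _demo_wasserstein_distance_1d():
    import numpy as np

    centers = np.linspace(0.0, 1.0, 20)
    hist_a = np.exp(-((centers - 0.3) ** 2) / 0.01)
    hist_b = np.exp(-((centers - 0.7) ** 2) / 0.01)
    # expected to be roughly the distance between the two means (~0.4)
    return wasserstein_distance_1d((hist_a, hist_b), centers)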
def loader(data_path, embeddings_path, p=1, K_lda=70, glove_embeddings=True, stemming=True):
    """Load dataset and embeddings from data path."""
    # Load dataset from data_path
    vocab, embed_vocab, bow_data, y = load_wmd_data(data_path)
    y = y - 1

    # Use GloVe word embeddings
    if glove_embeddings:
        vocab, embed_vocab, bow_data = change_embeddings(vocab, bow_data, embeddings_path)

    # Reduce vocabulary by removing short words, stop words, and stemming
    if stemming:
        vocab, embed_vocab, bow_data = reduce_vocab(bow_data, vocab, embed_vocab,
                                                    embed_aggregate='mean')

    # Embedded documents: list of length n_docs, each entry an np.array of
    # shape (embedding size, number of words in the document)
    embed_data = get_embedded_data(bow_data, embed_vocab, vocab)

    # Matrix of word embeddings, shape (vocabulary size, embedding features)
    embeddings = np.array([embed_vocab[w] for w in vocab])

    topics, lda_centers, topic_proportions = fit_topics(bow_data, embeddings, vocab, K_lda)

    # Pairwise distance between embeddings (vocabulary-level base cost)
    cost_embeddings = euclidean_distances(embeddings, embeddings)**p

    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
    for i in range(cost_topics.shape[0]):
        print("i :", i)
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = ot.emd2(topics[i] / (topics[i].sum()),
                                        topics[j] / (topics[j].sum()),
                                        cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    out = {'X': bow_data,
           'y': y,
           'embeddings': embeddings,
           'topics': topics,
           'proportions': topic_proportions,
           'cost_E': cost_embeddings,
           'cost_T': cost_topics}

    print("Processing save for " + data_path)
    save_preprocessing(out, data_path, K_lda)
    print("Save done for " + data_path)

    return out
def dist(self, G1, G2, topk='automatic', batch=100, tol=1e-5):
    """NBD between two graphs.

    Params
    ------
    G1, G2 (nx.Graph): The graphs to compare.

    topk (int or 'automatic'): The number of eigenvalues to compute. If
    'automatic' (default), use only the eigenvalues that are larger than the
    square root of the largest eigenvalue. Note this may yield different
    numbers of eigenvalues for each graph.

    batch (int): If topk is 'automatic', this is the number of eigenvalues to
    compute each time until the condition is met. Default 100.

    tol (float): Numerical tolerance when computing eigenvalues.
    """
    vals1 = nbvals(G1, topk, batch, tol)
    vals2 = nbvals(G2, topk, batch, tol)
    mass = lambda num: np.ones(num) / num
    vals_dist = distance_matrix(vals1, vals2)
    dist = emd2(mass(vals1.shape[0]), mass(vals2.shape[0]), vals_dist)
    self.results['vals'] = (vals1, vals2)
    return dist
def compute_divergence_from_cluster_labels(embeds1, embeds2, labels1, labels2, threshold):
    labels_all = list(np.concatenate((labels1, labels2)))
    counts1 = Counter(labels1)
    counts2 = Counter(labels2)
    n_senses = list(set(labels_all))
    #print("Clusters:", len(n_senses))

    t1 = []
    t2 = []
    label_list = []
    for i in n_senses:
        if counts1[i] + counts2[i] > threshold:
            t1.append(counts1[i])
            t2.append(counts2[i])
            label_list.append(i)
    t1 = np.array(t1)
    t2 = np.array(t2)

    emb1_means = np.array([np.mean(embeds1[labels1 == clust], 0) for clust in label_list])
    emb2_means = np.array([np.mean(embeds2[labels2 == clust], 0) for clust in label_list])

    M = np.nan_to_num(np.array([cdist(emb1_means, emb2_means, metric='cosine')])[0], nan=1)

    t1_dist = t1 / t1.sum()
    t2_dist = t2 / t2.sum()

    wass = ot.emd2(t1_dist, t2_dist, M)
    jsd = compute_jsd(t1_dist, t2_dist)
    return jsd, wass
def _compute_wasserstein_distance(label_sequences, sinkhorn=False,
                                  categorical=False, sinkhorn_lambda=1e-2):
    '''Generate the Wasserstein distance matrix for the graphs embedded
    in label_sequences.
    '''
    # Number of graphs
    n = len(label_sequences)
    M = np.zeros((n, n))

    # Iterate over pairs of graphs
    for graph_index_1, graph_1 in enumerate(label_sequences):
        # Only keep the embeddings for the first h iterations
        labels_1 = label_sequences[graph_index_1]
        for graph_index_2, graph_2 in enumerate(label_sequences[graph_index_1:]):
            labels_2 = label_sequences[graph_index_2 + graph_index_1]
            # Get cost matrix
            ground_distance = 'hamming' if categorical else 'euclidean'
            costs = ot.dist(labels_1, labels_2, metric=ground_distance)

            if sinkhorn:
                mat = ot.sinkhorn(np.ones(len(labels_1)) / len(labels_1),
                                  np.ones(len(labels_2)) / len(labels_2),
                                  costs, sinkhorn_lambda, numItermax=50)
                M[graph_index_1, graph_index_2 + graph_index_1] = np.sum(np.multiply(mat, costs))
            else:
                M[graph_index_1, graph_index_2 + graph_index_1] = \
                    ot.emd2([], [], costs)

    M = (M + M.T)
    return M
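# Added usage sketch (not from the original source): three "graphs" given as
# node-embedding matrices of different sizes; empty weight vectors in the
# function above mean uniform weights in POT. Shapes and the helper name are
# assumptions.
def _demo_pairwise_wasserstein_matrix():
    import numpy as np

    rng = np.random.RandomState(0)
    label_sequences = [rng.randn(10, 4), rng.randn(12, 4), rng.randn(8, 4)]
    return _compute_wasserstein_distance(label_sequences, sinkhorn=False)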
def Wdist(X, Y, reg=0., p=2.):
    '''
    param X, Y: (n x 2) and (m x 2) numpy arrays (points of persistence diagrams)
    param reg: regularization parameter for entropic smoothing. If 0., exact computation.
    param p: exponent for Wasserstein.
    return: float, estimation of the Wasserstein distance between the two diagrams
            (exact if reg = 0.).
    '''
    M = build_dist_matrix(X, Y, p=p)
    n = len(X)
    m = len(Y)

    # weight vector of the input diagram; uniform here
    a = 1.0 / (n + m) * np.ones(n)
    # append extra mass so that we have a probability measure, as required by POT
    hat_a = np.append(a, m / (n + m))
    b = 1.0 / (n + m) * np.ones(m)
    hat_b = np.append(b, n / (m + n))

    if reg > 0:
        ot_cost = (n + m) * ot.bregman.sinkhorn2(hat_a, hat_b, M, reg=reg)
    else:
        ot_cost = (n + m) * ot.emd2(hat_a, hat_b, M)
    return np.power(ot_cost, 1. / p)
def get_dwasserstein(model, market_baskets):
    # build the distance matrix between the purchase histories of two customers
    basket_X = market_baskets[0]
    basket_Y = market_baskets[1]

    list_basketX = list(basket_X)
    list_basketY = list(basket_Y)
    dictionary = np.unique(list_basketX + list_basketY)
    dictionary_len = len(dictionary)
    product2index = dict(zip(dictionary, np.arange(dictionary_len)))

    dictionary_vectors = model.wv.vectors[[word for word in dictionary]]
    distance_matrix = squareform(pdist(dictionary_vectors))

    if np.sum(distance_matrix) == 0.0:
        return float('inf')

    def bag_of_words(document):
        bow = np.zeros(dictionary_len, dtype=float)
        for d in document:
            bow[product2index[d]] += 1.
        return bow / len(document)

    bow_X = bag_of_words(basket_X)
    bow_Y = bag_of_words(basket_Y)

    # Finally we compute the Wasserstein distance using both baskets and the distance matrix.
    dw = ot.emd2(bow_X, bow_Y, distance_matrix)
    return dw
def _edge_curvature(
    edge,
    measures,
    geodesic_distances,
    measure_cutoff=1e-6,
    sinkhorn_regularisation=0,
    weighted_curvature=False,
):
    """Compute curvature for an edge."""
    node_x, node_y = edge
    m_x, m_y = measures[node_x], measures[node_y]

    Nx = np.where(m_x >= measure_cutoff * np.max(m_x))[0]
    Ny = np.where(m_y >= measure_cutoff * np.max(m_y))[0]
    m_x, m_y = m_x[Nx], m_y[Ny]
    m_x /= m_x.sum()
    m_y /= m_y.sum()

    distances_xy = geodesic_distances[np.ix_(Nx, Ny)]

    if sinkhorn_regularisation > 0:
        wasserstein_distance = ot.sinkhorn2(m_x, m_y, distances_xy, sinkhorn_regularisation)[0]
    else:
        wasserstein_distance = ot.emd2(m_x, m_y, distances_xy)

    if weighted_curvature:
        return geodesic_distances[node_x, node_y] - wasserstein_distance
    return 1.0 - wasserstein_distance / geodesic_distances[node_x, node_y]
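# Added usage sketch (not from the original source): curvature of one edge on
# a toy 6-node point set, with Euclidean distances standing in for graph
# geodesics and lazy-random-walk-style node measures (half the mass on the
# node, the rest spread uniformly). All of these choices are assumptions.
def _demo_edge_curvature():
    import numpy as np

    rng = np.random.RandomState(0)
    n_nodes = 6
    points = rng.rand(n_nodes, 2)
    geodesic_distances = np.linalg.norm(points[:, None] - points[None, :], axis=-1)

    measures = []
    for i in range(n_nodes):
        m = np.full(n_nodes, 0.5 / (n_nodes - 1))
        m[i] = 0.5
        measures.append(m)

    return _edge_curvature((0, 1), measures, geodesic_distances)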
def wasserstein(M, sqrt):
    """Calculate earth mover's distance from a precomputed cost matrix M
    (a torch tensor), with uniform weights on both marginals."""
    if sqrt:
        M = M.abs().sqrt()
    emd = ot.emd2([], [], M.numpy())
    return emd
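# Added usage sketch (not from the original source): a random pairwise cost
# matrix built with torch.cdist, fed to the helper above. Torch availability,
# the sizes, and the helper name are assumptions.
def _demo_wasserstein_cost_matrix():
    import torch

    x, y = torch.randn(30, 2), torch.randn(30, 2)
    M = torch.cdist(x, y)  # pairwise Euclidean costs as a torch tensor
    return wasserstein(M, sqrt=False)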
def __call__(self, y_true_proba, y_proba):
    scores = []
    mask = ~np.any(np.isnan(y_proba), axis=1)
    y_proba = y_proba[mask]
    y_true_proba = y_true_proba[mask]
    for this_y_true, this_y_proba in zip(y_true_proba, y_proba):
        this_y_true_max = this_y_true.max()
        this_y_proba_max = this_y_proba.max()
        # special treatment for the all-zero cases
        if (this_y_true_max * this_y_proba_max) == 0:
            if this_y_true_max or this_y_proba_max:
                scores.append(1.)  # as ground_metric max is 1
            else:
                scores.append(0.)
            continue
        this_y_true = this_y_true.astype(np.float64) / this_y_true.sum()
        this_y_proba = this_y_proba.astype(np.float64) / this_y_proba.sum()
        score = emd2(this_y_true, this_y_proba, self.ground_metric)
        scores.append(score)
    assert len(scores) == len(y_true_proba)
    assert len(y_proba) == len(y_true_proba)
    return np.mean(scores)
def earth_mover_distance(cloud1, cloud2, eigenvals):
    """Returns the earth mover's distance between two point clouds

    Parameters
    ----------
    cloud1 : 2-D array
        First point cloud
    cloud2 : 2-D array
        Second point cloud
    eigenvals : 2-D array or None
        Optional matrix applied (by right-multiplication) to both clouds
        before computing distances; None skips the projection

    Returns
    -------
    distance : float
        The distance between the two point clouds
    """
    cloud1 = cloud1.toarray() if scipy.sparse.isspmatrix(cloud1) else cloud1
    cloud2 = cloud2.toarray() if scipy.sparse.isspmatrix(cloud2) else cloud2
    if eigenvals is not None:
        cloud1 = cloud1.dot(eigenvals)
        cloud2 = cloud2.dot(eigenvals)
    p = np.ones(len(cloud1)) / len(cloud1)
    q = np.ones(len(cloud2)) / len(cloud2)
    pairwise_dist = sklearn.metrics.pairwise.pairwise_distances(
        cloud1, Y=cloud2, metric='sqeuclidean')
    return np.sqrt(pot.emd2(p, q, pairwise_dist, numItermax=1e7))
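# Added usage sketch (not from the original source): two random dense point
# clouds of different sizes with no eigenvalue projection. Sizes, the offset,
# and the helper name are assumptions; the function above still needs its
# original module-level imports (np, scipy, sklearn, pot).
def _demo_earth_mover_distance():
    import numpy as np

    rng = np.random.RandomState(0)
    cloud1 = rng.randn(50, 5)
    cloud2 = rng.randn(80, 5) + 1.0
    return earth_mover_distance(cloud1, cloud2, eigenvals=None)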
def basket_dist_Decomp(self, bskts):
    bskt1 = bskts[0]
    bskt2 = bskts[1]
    dct = nmpy.unique(list(bskt1) + list(bskt2))
    vocab_len_ = len(dct)
    product2ind = dict(zip(dct, nmpy.arange(vocab_len_)))

    # Compute the distance matrix between the word vectors of the combined vocabulary.
    dict_vectors = self.model.wv.vectors[[x for x in dct]]
    dist_matrix = squareform(pdist(dict_vectors))

    if nmpy.sum(dist_matrix) == 0.0:
        # The EMD solver gets stuck if the distance matrix contains only zeros.
        return float('inf')

    def no_bow(doc):
        bow = nmpy.zeros(vocab_len_, dtype=float)
        for e in doc:
            bow[product2ind[e]] += 1.
        return bow / len(doc)

    # 'no_bow' gives the normalized bag-of-words representation of a document.
    dist_1 = no_bow(bskt1)
    dist_2 = no_bow(bskt2)

    # Here we obtain the Wasserstein distance.
    return ot.emd2(dist_1, dist_2, dist_matrix)
def test_emd2_multi():
    n = 1000  # nb bins

    # bin positions
    x = np.arange(n, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=20, s=5)  # m= mean, s= std

    ls = np.arange(20, 1000, 20)
    nb = len(ls)
    b = np.zeros((n, nb))
    for i in range(nb):
        b[:, i] = gauss(n, m=ls[i], s=10)

    # loss matrix
    M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))
    # M/=M.max()

    print('Computing {} EMD '.format(nb))

    # emd loss, 1 process
    ot.tic()
    emd1 = ot.emd2(a, b, M, 1)
    ot.toc('1 proc : {} s')

    # emd loss, multiprocessing
    ot.tic()
    emdn = ot.emd2(a, b, M)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(emd1, emdn)

    # emd loss, multiprocessing with log
    ot.tic()
    emdn = ot.emd2(a, b, M, log=True, return_matrix=True)
    ot.toc('multi proc : {} s')

    for i in range(len(emdn)):
        emd = emdn[i]
        log = emd[1]
        cost = emd[0]
        check_duality_gap(a, b[:, i], M, log['G'], log['u'], log['v'], cost)
        emdn[i] = cost

    emdn = np.array(emdn)
    np.testing.assert_allclose(emd1, emdn)
def create_prototypes(dataset_name):
    all_files = glob.glob(dataset_name + "/json_format/*")
    all_files.sort(key=lambda x: int((x.strip().split('/')[-1]).split('.')[0]))

    all_clases = {}
    for file in all_files:
        name = (file.strip().split('/')[-1]).split('.')[0]
        with open(file, "r") as f1:
            graph = json.load(f1)
        if graph["target"] not in all_clases.keys():
            all_clases[graph["target"]] = {}
        if len(graph["labels"]) == 1:
            raise ValueError("Only one node")
        G = nx.Graph()
        G.add_edges_from(graph["edges"])
        N = normalized_laplacian_matrix(G).todense()
        eigvals = scipy.linalg.eigvals(N)
        eigvals = eigvals.real.round(decimals=5)
        if type(eigvals) == int:
            raise TypeError("Type is int rather than list")
        all_clases[graph["target"]][int(name)] = eigvals

    class_prototype_dict = {}
    for class_, class_graphs in all_clases.items():
        current_class_eigvals = []
        for num in sorted(class_graphs):
            # print(num)
            current_class_eigvals.append(class_graphs[num])

        all_dist = []
        for i in range(len(current_class_eigvals)):
            current_dist = []
            for j in range(len(current_class_eigvals)):
                a = current_class_eigvals[i]
                b = current_class_eigvals[j]
                cost = ot.utils.dist(np.reshape(a, (a.shape[0], 1)),
                                     np.reshape(b, (b.shape[0], 1)))
                # A uniform distribution over the spectra is assumed for a faster
                # implementation. One could instead use density estimation to
                # approximate the distribution, which can give better results.
                loss = ot.emd2([], [], cost)
                current_dist.append(loss)
            all_dist.append(current_dist)

        all_dist = np.array(all_dist)
        current_prot_index = np.argmin(np.sum(all_dist, axis=1))
        # print(list(class_graphs.keys()))
        sorted_keys = list(class_graphs.keys())
        sorted_keys.sort()
        class_prototype_dict[str(class_)] = sorted_keys[current_prot_index]

    print(class_prototype_dict)
    with open(dataset_name + "/class_prototype_numbers.json", 'w') as f:
        json.dump(class_prototype_dict, f)
def test_emd_empty():
    # test emd and emd2 for simple identity
    n = 100
    rng = np.random.RandomState(0)

    x = rng.randn(n, 2)
    u = ot.utils.unif(n)

    M = ot.dist(x, x)

    G = ot.emd([], [], M)

    # check G is identity
    np.testing.assert_allclose(G, np.eye(n) / n)
    # check constraints
    np.testing.assert_allclose(u, G.sum(1))  # cf convergence sinkhorn
    np.testing.assert_allclose(u, G.sum(0))  # cf convergence sinkhorn

    w = ot.emd2([], [], M)
    # check loss=0
    np.testing.assert_allclose(w, 0)
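# Added sketch (not from the original tests): the empty weight vectors used
# above are POT's shorthand for uniform marginals, so they give the same cost
# as explicit uniform weights. Sizes and the helper name are assumptions.
def _demo_emd2_empty_weights():
    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    x = rng.randn(20, 2)
    M = ot.dist(x, x + 1.0)
    u = ot.utils.unif(20)
    assert np.allclose(ot.emd2([], [], M), ot.emd2(u, u, M))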
def wasserstein(M, sqrt):
    if sqrt:
        M = M.abs().sqrt()
    emd = ot.emd2([], [], M.numpy())
    return emd