def incremental_bary_map_emd(xs, xt, a, b, m1, m2, k):
    '''
    Compute the incomplete minibatch barycentric mapping between source and
    target distributions (faster for small batch sizes).

    Parameters
    ----------
    - xs : ndarray(ns, d), source data
    - xt : ndarray(nt, d), target data
    - a : ndarray(ns), source distribution weights
    - b : ndarray(nt), target distribution weights
    - m1 : int, source batch size
    - m2 : int, target batch size
    - k : int, number of batch couples

    Returns
    -------
    - new_xs : ndarray(ns, d), transported source measure
    - new_xt : ndarray(nt, d), transported target measure
    '''
    new_xs = np.zeros(xs.shape)
    new_xt = np.zeros(xt.shape)
    Ns = np.shape(xs)[0]
    Nt = np.shape(xt)[0]

    if m1 < 101:
        for i in range(k):
            # Draw a minibatch couple and solve exact OT on it
            sub_xs, sub_weights_a, id_a = small_mini_batch(xs, a, m1, Ns)
            sub_xt, sub_weights_b, id_b = small_mini_batch(xt, b, m2, Nt)
            sub_M = ot.dist(sub_xs, sub_xt, "sqeuclidean").copy()
            G0 = ot.emd(sub_weights_a, sub_weights_b, sub_M)
            new_xs[id_a] += G0.dot(xt[id_b])
            new_xt[id_b] += G0.T.dot(xs[id_a])
    else:
        for i in range(k):
            # Same loop with the generic minibatch sampler
            sub_xs, sub_weights_a, id_a = mini_batch(xs, a, m1, Ns)
            sub_xt, sub_weights_b, id_b = mini_batch(xt, b, m2, Nt)
            sub_M = ot.dist(sub_xs, sub_xt, "sqeuclidean").copy()
            G0 = ot.emd(sub_weights_a, sub_weights_b, sub_M)
            new_xs[id_a] += G0.dot(xt[id_b])
            new_xt[id_b] += G0.T.dot(xs[id_a])

    return 1. / k * Ns * new_xs, 1. / k * Nt * new_xt
def _compute_copula_ot_dependence(empirical: np.array, target: np.array, forget: np.array, n_obs: int) -> float:
    """
    Calculates the optimal copula transport dependence measure.

    :param empirical: (np.array) Empirical copula.
    :param target: (np.array) Target copula.
    :param forget: (np.array) Forget copula.
    :param n_obs: (int) Number of observations.
    :return: (float) Optimal copula transport dependence.
    """
    # Uniform distributions on samples
    t_measure, f_measure, e_measure = (np.ones((n_obs,)) / n_obs,
                                       np.ones((n_obs,)) / n_obs,
                                       np.ones((n_obs,)) / n_obs)

    # Compute the ground distance matrices between locations
    gdist_e2t = ot.dist(empirical, target)
    gdist_e2f = ot.dist(empirical, forget)

    # Compute the optimal transport matrices
    e2t_ot = ot.emd(t_measure, e_measure, gdist_e2t)
    e2f_ot = ot.emd(f_measure, e_measure, gdist_e2f)

    # Compute the optimal transport distances:
    # <optimal transport matrix, ground distance matrix>_F
    e2t_dist = np.trace(np.dot(np.transpose(e2t_ot), gdist_e2t))
    e2f_dist = np.trace(np.dot(np.transpose(e2f_ot), gdist_e2f))

    # Compute the copula OT dependence measure
    ot_measure = 1 - e2t_dist / (e2f_dist + e2t_dist)

    return ot_measure
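# Illustrative usage sketch for the helper above (hypothetical construction;
# assumes `import numpy as np` and `import ot` are in scope). The target copula
# is sampled on the comonotonic (perfect positive dependence) diagonal and the
# forget copula is the independence copula, both with n_obs points.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    n_obs = 200
    z = rng.randn(n_obs)
    w = 0.8 * z + 0.2 * rng.randn(n_obs)
    # Empirical copula: normalized ranks of the two series
    u = np.argsort(np.argsort(z)) / float(n_obs)
    v = np.argsort(np.argsort(w)) / float(n_obs)
    empirical = np.column_stack([u, v])
    grid = np.linspace(0, 1, n_obs)
    target = np.column_stack([grid, grid])                  # comonotonic copula
    forget = np.column_stack([rng.uniform(size=n_obs),
                              rng.uniform(size=n_obs)])     # independence copula
    print(_compute_copula_ot_dependence(empirical, target, forget, n_obs))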
def test_emd_emd2_devices_tf():
    if not tf:
        return
    nx = ot.backend.TensorflowBackend()

    n_samples = 100
    n_features = 2
    rng = np.random.RandomState(0)
    x = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples, n_features)
    a = ot.utils.unif(n_samples)
    M = ot.dist(x, y)

    # Check that everything stays on the CPU
    with tf.device("/CPU:0"):
        ab, Mb = nx.from_numpy(a, M)
        Gb = ot.emd(ab, ab, Mb)
        w = ot.emd2(ab, ab, Mb)
        nx.assert_same_dtype_device(Mb, Gb)
        nx.assert_same_dtype_device(Mb, w)

    if len(tf.config.list_physical_devices('GPU')) > 0:
        # Check that everything happens on the GPU
        ab, Mb = nx.from_numpy(a, M)
        Gb = ot.emd(ab, ab, Mb)
        w = ot.emd2(ab, ab, Mb)
        nx.assert_same_dtype_device(Mb, Gb)
        nx.assert_same_dtype_device(Mb, w)
        assert nx.dtype_device(Gb)[1].startswith("GPU")
def get_lang_mapping(self, lang1, lang2, metric, entreg):
    path = self._get_shortest_path_from_lang1_to_lang2(lang1, lang2)
    print(path)
    mapping = None
    for i in range(len(path)):
        a = self.lang_dict[path[i][0]]
        a = self.project_into_lang_space(a, self.args.lang_space)
        b = self.lang_dict[path[i][0]].child[path[i][1]]
        b = self.project_into_lang_space(b, self.args.lang_space)
        if path[i][2]:
            # path[i][2] flags that this edge should be transported from b to a
            plan = ot.emd(b.freq, a.freq, build_MXY(b.projected_matrix, a.projected_matrix))
        else:
            plan = ot.emd(a.freq, b.freq, build_MXY(a.projected_matrix, b.projected_matrix))
        if mapping is None:
            mapping = plan
        else:
            # Chain the transport plans along the path
            mapping = np.matmul(mapping, plan)
        a.projected_matrix = None
        b.projected_matrix = None
    return mapping
def forward_seq(self, x_train, x_test):
    N = self.Nmasses
    (Pl_train, P_train) = particleApproximation_v0(x_train, N)
    (Pl_test, P_test) = particleApproximation_v0(x_test, N)

    Pl_tem = 0
    for a in range(2):  # x_train.shape[0]
        t = Pl_train[a]
        Pl_tem = Pl_tem + t
    Pl_tem = Pl_tem / 2  # x_train.shape[0]
    P_tem = np.ones((N,)) / float(N)

    # Pl_tem_vec = np.reshape(Pl_tem, (Pl_tem.shape[0] * Pl_tem.shape[1],), order='F')

    V = list()
    M = x_train.shape[0]
    for ind in range(M):
        Ni = Pl_train[ind].shape[0]
        C = ot.dist(Pl_train[ind], Pl_tem)
        b = P_tem  # b = np.ones((N,)) / float(N)
        a = P_train[ind]  # a = np.ones((Ni,)) / float(Ni)
        p = ot.emd(a, b, C)  # exact linear program
        # V.append(np.matmul((N * p).T, Pl_train[ind]) - Pl_tem)
        V.append(np.matmul((N * p).T, Pl_train[ind]) + Pl_tem)  # already giving transport displacement?
    V = np.asarray(V)
    x_train_hat = np.zeros((len(V), V[0].shape[0] * V[0].shape[1]))
    for a in range(len(V)):
        x_train_hat[a, :] = np.reshape(V[a], (V[0].shape[0] * V[0].shape[1],), order='F')

    V = list()
    M = x_test.shape[0]
    for ind in range(M):
        Ni = Pl_test[ind].shape[0]
        C = ot.dist(Pl_test[ind], Pl_tem)
        b = P_tem  # b = np.ones((N,)) / float(N)
        a = P_test[ind]  # a = np.ones((Ni,)) / float(Ni)
        p = ot.emd(a, b, C)  # exact linear program
        # V.append(np.matmul((N * p).T, Pl_test[ind]) - Pl_tem)
        V.append(np.matmul((N * p).T, Pl_test[ind]) + Pl_tem)
    V = np.asarray(V)
    x_test_hat = np.zeros((len(V), V[0].shape[0] * V[0].shape[1]))
    for a in range(len(V)):
        x_test_hat[a, :] = np.reshape(V[a], (V[0].shape[0] * V[0].shape[1],), order='F')

    return x_train_hat, x_test_hat, Pl_tem, P_tem
def computeTransportLaplacianSymmetric_fw(distances, Ss, St, xs, xt, reg=1e-9, regls=0, reglt=0,
                                          solver=None, nbitermax=400, thr_stop=1e-8, step='opt', **kwargs):
    distribS = np.ones((xs.shape[0],)) / xs.shape[0]
    distribT = np.ones((xt.shape[0],)) / xt.shape[0]

    Ls = get_laplacian(Ss)
    Lt = get_laplacian(St)

    loop = True

    transp = ot.emd(distribS, distribT, distances)

    niter = 0
    while loop:
        old_transp = transp.copy()

        G = np.asarray(regls * get_gradient1(Ls, xt, old_transp) + reglt * get_gradient2(Lt, xs, old_transp))

        transp0 = ot.emd(distribS, distribT, distances + G)

        E = transp0 - old_transp
        # Ge = get_gradient(E, K)

        if step == 'opt':
            # optimal step size
            tau = max(0, min(1, (-np.sum(E * distances) - np.sum(E * G)) /
                             (2 * regls * quadloss1(E, Ls, xt) + 2 * reglt * quadloss2(E, Lt, xs))))
        else:
            # other step size just in case
            tau = 2. / (niter + 2)
        # print("tau:", tau)

        transp = old_transp + tau * E
        # print("loss:", np.sum(transp * distances) + quadloss(transp, K) / 2)

        if niter >= nbitermax:
            loop = False

        err = np.sum(np.abs(transp - old_transp))
        if err < thr_stop:
            loop = False

        # print(niter)
        niter += 1
        if niter % 1000 == 0:
            print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
            print('{:5d}|{:8e}|'.format(niter, err))

    # print("loss:", np.sum(transp * distances) + quadloss(transp, K) / 2)
    return transp
def sinkhorn_mapping(set_1, set_2):
    """http://pot.readthedocs.io/en/stable/auto_examples/plot_OT_2D_samples.html"""
    a, b = np.ones((len(set_1),)) / len(set_1), np.ones((len(set_2),)) / len(set_2)
    arr_1 = _generate_arr(set_1)
    arr_2 = _generate_arr(set_2)
    switch_to_cartesian(arr_1, 80 * 400)
    switch_to_cartesian(arr_2, 80 * 400)

    M = ot.dist(arr_1, arr_2)
    G0 = ot.emd(a, b, M)

    counter = 0
    for i in G0:
        inner_counter = 0
        for j in i:
            if j > 0.003:
                print(arr_1[counter], arr_2[inner_counter], j, sep=", ")
            inner_counter += 1
        counter += 1

    pl.figure(4)
    for i in range(arr_1.shape[0]):
        for j in range(arr_2.shape[0]):
            if G0[i, j] > 0.003:
                pl.plot([arr_1[i, 0], arr_2[j, 0]], [arr_1[i, 1], arr_2[j, 1]])
    pl.plot(arr_1[:, 0], arr_1[:, 1], '+b', label='Source samples')
    pl.plot(arr_2[:, 0], arr_2[:, 1], 'xr', label='Target samples')
    pl.show()
def test(source_samples, target_samples, weight_function):
    """
    :param source_samples: array (n_source, feature)
    :param target_samples: array (n_target, feature)
    :param weight_function: function determining the distance between two samples
    :return: optimal transport matrix of shape (n_source, n_target)
    """
    assert source_samples.shape[1] == target_samples.shape[1]

    # Employ a uniform distribution over all data as the empirical distribution (not a histogram)
    source_dist = np.ones((len(source_samples),)) / len(source_samples)
    target_dist = np.ones((len(target_samples),)) / len(target_samples)
    # print('source:', source_dist.shape, np.sum(source_dist))
    # print('target:', target_dist.shape, np.sum(target_dist))

    # Build the cost matrix (n_source, n_target)
    cost_matrix = np.array([[float(weight_function(__i, __o)) for __i in target_samples]
                            for __o in source_samples])
    print('cost :\n', cost_matrix, cost_matrix.shape)

    # Derive the optimal transport plan via the network simplex algorithm
    # Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W. (2011, December).
    # Displacement interpolation using Lagrangian mass transport.
    # In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p. 158). ACM.
    optimal_transport = ot.emd(a=source_dist, b=target_dist, M=cost_matrix)
    return optimal_transport
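# Minimal usage sketch for the function above (hypothetical data; assumes
# `import numpy as np`, `import ot`, and the `test` function are in scope).
# The weight function here is a plain squared Euclidean distance.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    src = rng.randn(5, 3)   # 5 source samples with 3 features
    tgt = rng.randn(7, 3)   # 7 target samples with 3 features
    plan = test(src, tgt, lambda u, v: np.sum((u - v) ** 2))
    # Uniform marginals: each row sums to 1/5 and each column to 1/7
    print(plan.shape, plan.sum())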
def emd(xs, xt, metric='euclidean', numItermax=2**17, **kwargs):
    if len(xs.shape) == 1:
        xs = xs.reshape(-1, 1)
    if len(xt.shape) == 1:
        xt = xt.reshape(-1, 1)
    M = cdist(xs, xt, metric)
    if (M == 0).all():
        return 0.
    M2 = M**2
    a = np.ones(len(xs)) / len(xs)
    b = np.ones(len(xt)) / len(xt)
    for i in range(3):
        try:
            F = ot.emd(a, b, M2, numItermax, **kwargs)
            # np.isclose(np.sum(F), 1)
            return np.sum(M * F)
        except UserWarning:
            # Retry with a larger iteration budget
            numItermax = 2 * numItermax
    assert False, "No convergence reached. Try to increase numItermax!"
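# A small usage sketch for the wrapper above (assumes `import numpy as np`,
# `import ot`, `from scipy.spatial.distance import cdist`, and the `emd`
# wrapper defined above are available). 1D inputs are reshaped internally.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    xs = rng.randn(50)          # 1D source sample
    xt = rng.randn(60) + 1.0    # 1D target sample, shifted by one
    d = emd(xs, xt)             # transport cost under the wrapper's convention
    print(d)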
def compute_ot_loss_matrix(y: np.ndarray, y_hat: np.ndarray, D: np.ndarray, ot_niters=10**5):
    """
    Solve the optimal transport problem for the image pixels, and return the OT
    permutation matrix Pi.

    :param y: the ground-truth image.
    :param y_hat: the predicted image.
    :param D: the distance matrix; generate via make_distance_matrix(y.shape[0])
    :param ot_niters: maximum number of iterations for the OT solver.
    :return: Pi, the optimal transport matrix, of shape [d**2, d**2]. The (i,j) entry in Pi
        represents the cost of moving pixel i in y_hat to pixel j in y.
    """
    assert_array_finite(y)
    assert_array_finite(y_hat)
    assert_array_nonnegative(y)
    assert_array_nonnegative(y_hat)
    np.testing.assert_array_equal(y.shape[0], y.shape[1])  # check images are square
    np.testing.assert_array_equal(y.shape, y_hat.shape)  # check images are the same size
    y_hist = normalize_to_histogram(y)
    y_hat_hist = normalize_to_histogram(y_hat)
    PI = ot.emd(y_hat_hist, y_hist, D, numItermax=ot_niters)
    return PI
def diagonality(ir_dft, ir_dftb):
    # normalize spectra
    ir_dft = [i / np.sum(i) for i in ir_dft]
    ir_dftb = [i / np.sum(i) for i in ir_dftb]

    # diagonality of P:
    # https://math.stackexchange.com/questions/1392491/measure-of-how-much-diagonal-a-matrix-is
    Y, X = np.meshgrid(np.linspace(0, 1, ir_dft[0].size), np.linspace(0, 1, ir_dft[0].size))
    C = abs(Y - X)**2

    def dist(P):
        j = np.ones(P.shape[0])
        r = np.arange(P.shape[0])
        r2 = r**2
        n = j @ P @ j.T
        sum_x = r @ P @ j.T
        sum_y = j @ P @ r.T
        sum_x2 = r2 @ P @ j.T
        sum_y2 = j @ P @ r2.T
        sum_xy = r @ P @ r.T
        return (n * sum_xy - sum_x * sum_y) / (np.sqrt(n * sum_x2 - sum_x**2) * np.sqrt(n * sum_y2 - sum_y**2))

    # print('Case (Diagonality)')
    d = np.zeros((len(ir_dft), len(ir_dftb)))
    for i, a in enumerate(ir_dft):
        for j, b in enumerate(ir_dftb):
            # P = sink.sinkhorn(a, b, 0.003).P
            P = ot.emd(a, b, C)
            d[i, j] = dist(P)
    return d
def match_spots_using_spatial_heuristic(X, Y, use_ot: bool = True) -> np.ndarray:
    """
    Calculates and returns a mapping of spots using a spatial heuristic.

    Args:
        X (array-like, optional): Coordinates for spots X.
        Y (array-like, optional): Coordinates for spots Y.
        use_ot: If ``True``, use optimal transport ``ot.emd()`` to calculate mapping.
            Otherwise, use Scipy's ``min_weight_full_bipartite_matching()`` algorithm.

    Returns:
        Mapping of spots using a spatial heuristic.
    """
    n1, n2 = len(X), len(Y)
    X, Y = norm_and_center_coordinates(X), norm_and_center_coordinates(Y)
    dist = scipy.spatial.distance_matrix(X, Y)
    if use_ot:
        pi = ot.emd(np.ones(n1) / n1, np.ones(n2) / n2, dist)
    else:
        row_ind, col_ind = scipy.sparse.csgraph.min_weight_full_bipartite_matching(
            scipy.sparse.csr_matrix(dist))
        pi = np.zeros((n1, n2))
        pi[row_ind, col_ind] = 1 / max(n1, n2)
        if n1 < n2:
            pi[:, [(j not in col_ind) for j in range(n2)]] = 1 / (n1 * n2)
        elif n2 < n1:
            pi[[(i not in row_ind) for i in range(n1)], :] = 1 / (n1 * n2)
    return pi
def OT_emd(Xl, Xr):
    # loss matrix
    C = ot.dist(Xl, Xr)
    M = C / C.max()
    n = len(Xl)
    m = len(Xr)
    print(n, m)
    a, b = np.ones((n,)) / n, np.ones((m,)) / m

    G0 = ot.emd(a, b, M)

    plt.figure(3)
    plt.imshow(G0, interpolation='nearest')
    plt.title('OT matrix G0')

    plt.figure(4)
    ot.plot.plot2D_samples_mat(Xl[:, :2], Xr[:, :2], G0, c=[.5, .5, 1])
    plt.plot(Xl[:, 0], Xl[:, 1], '+b', label='Source samples')
    plt.plot(Xr[:, 0], Xr[:, 1], 'xr', label='Target samples')
    plt.legend(loc=0)
    plt.title('OT matrix with samples')
    return G0
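# Usage sketch for OT_emd above (hypothetical data; assumes `import numpy as np`,
# `import matplotlib.pyplot as plt`, `import ot`, and `import ot.plot`).
# Two small 2D point clouds; the returned plan has uniform marginals.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    Xl = rng.randn(30, 2)
    Xr = rng.randn(40, 2) + np.array([3.0, 0.0])
    G0 = OT_emd(Xl, Xr)
    print(G0.sum(axis=1)[:3])   # each row sums to 1/30
    plt.show()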
def OT_scores_emd(Xl, Xr, scoresl, scoresr, mu=0.5):
    # loss matrix: feature distance plus weighted score distance
    print(scoresl)
    C = ot.dist(Xl, Xr) + mu * ot.dist(np.expand_dims(scoresl, axis=1),
                                       np.expand_dims(scoresr, axis=1))
    M = C / C.max()
    n = len(Xl)
    m = len(Xr)
    print(n, m)
    a, b = scoresl, scoresr

    G0 = ot.emd(a, b, M)

    plt.figure(3)
    plt.imshow(G0, interpolation='nearest')
    plt.title('OT matrix G0')

    plt.figure(4)
    ot.plot.plot2D_samples_mat(Xl[:, :2], Xr[:, :2], G0, c=[.5, .5, 1])
    plt.plot(Xl[:, 0], Xl[:, 1], '+b', label='Source samples')
    plt.plot(Xr[:, 0], Xr[:, 1], 'xr', label='Target samples')
    plt.legend(loc=0)
    plt.title('OT matrix with samples')
    return G0
def get_node_cell_type_intersection(tree):
    # find the amount of intersection each node has with each cell type
    labels, suppl_image_coords = get_image_coords(supplementary_data)
    layers = list(get_layers(tree))
    cost_matrices, costs = [], []
    for layer_ind, layer in enumerate(layers):
        if len(layer) == 0:
            continue
        print('Layer', layer_ind, 'number of nodes', len(layer))
        node_prop = np.array([node.coords.shape[0] for node in layer])
        node_prop = node_prop / node_prop.sum()
        suppl_prop = np.array([image_coords.shape[0] for image_coords in suppl_image_coords])
        suppl_prop = suppl_prop / suppl_prop.sum()
        cost_matrix = get_intersection(layer, suppl_image_coords)
        transport_matrix, log = ot.emd(node_prop, suppl_prop, 1 - cost_matrix, log=True)
        costs.append(log['cost'])
        cost_matrices.append(cost_matrix)
    cost_min_ind = np.argmin(costs)
    cost_matrix = cost_matrices[cost_min_ind]
    layer = layers[cost_min_ind]
    print('Optimal layer is {} with {} clusters.'.format(cost_min_ind, len(layer)))
    annotate_optimal_layer(labels, cost_matrix, layer)
def weighted_barycenter_algorithm(ls, test_func, X_orig, Yi_orig, bi, lambdas, tol=1e-8,
                                  metric='euclidean', reg=1e-2, maxiter=20, bregmanmaxiter=30):
    """
    k : number of supports in X
    X_orig : init of barycenter (k * d)
    Yi_orig : list of distributions, size (k_i * d)
    bi : list of weights, size (k_i)
    tol : tolerance
    """
    assert len(Yi_orig) == len(bi)
    assert len(X_orig[0]) == len(Yi_orig[0][0])

    X = X_orig
    Yi = Yi_orig

    displacement = 1
    niter = 0

    while displacement > tol and niter < maxiter:
        X_prev = X
        a = compute_barycenter_weight(X, Yi, bi, lambdas, tol=tol, maxiter=bregmanmaxiter, reg=reg)

        Tsum = np.zeros(X.shape)
        for i in range(0, len(bi)):
            M = build_MXY(X, Yi[i], metric=metric)
            # T = ot.sinkhorn(a, bi[i], M, reg)
            T = ot.emd(a, bi[i], M)
            Tsum = Tsum + lambdas[i] * np.reshape(1. / a, (-1, 1)) * np.matmul(T, Yi[i])

        displacement = np.sum(np.square(Tsum - X))
        print("~~~~epoch " + str(niter) + "~~~~")

        # i = ls.index('en')
        # for j in range(len(ls)):
        #     if i != j and (not ls[i].isdigit()) and (not ls[j].isdigit()):
        #         mapping = ot.emd(bi[i], a, build_MXY(Yi[i], X))
        #         mapping2 = ot.emd(a, bi[j], build_MXY(X, Yi[j]))
        #         print("=" * 20 + "begin testing mapping for " + ls[i] + " and " + ls[j] + "=" * 21)
        #         test_func(ls[i], ls[j], np.dot(mapping, mapping2))
        #         mapping = None
        #         mapping2 = None

        # for i in range(len(ls)):
        #     for j in range(len(ls)):
        #         if i != j and (not ls[i].isdigit()) and (not ls[j].isdigit()):
        #             mapping = ot.emd(bi[i], a, build_MXY(Yi[i], X))
        #             mapping2 = ot.emd(a, bi[j], build_MXY(X, Yi[j]))
        #             print("=" * 20 + "begin testing mapping for " + ls[i] + " and " + ls[j] + "=" * 21)
        #             try:
        #                 test_func(ls[i], ls[j], np.dot(mapping, mapping2))
        #             except:
        #                 print("failed to eval on " + ls[i] + " and " + ls[j])
        #             mapping = None
        #             mapping2 = None

        X = Tsum
        niter += 1
    return X, a
def rbd_wasserstein_approx2(f1, f2):
    im1 = rbd_read(f1).flatten().reshape(-1, 1)
    im2 = rbd_read(f2).flatten().reshape(-1, 1)

    # Fit a two-component Gaussian mixture to each image
    gauss1 = mix.GaussianMixture(2).set_params(tol=1e-1).fit(im1)
    gauss2 = mix.GaussianMixture(2).set_params(tol=1e-1).fit(im2)

    m11, m12 = gauss1.means_.flatten()
    v11, v12 = gauss1.covariances_.flatten()
    p11, p12 = gauss1.weights_.flatten()

    m21, m22 = gauss2.means_.flatten()
    v21, v22 = gauss2.covariances_.flatten()
    p21, p22 = gauss2.weights_.flatten()

    d1 = np.array([p11, p12, 0, 0])
    d2 = np.array([0, 0, p21, p22])
    m = np.array([m11, m12, m21, m22])
    v = np.array([v11, v12, v21, v22])

    # Pairwise Wasserstein distances between the Gaussian components as ground cost
    weight_matrix = np.zeros((len(d1), len(d2)))
    for i in range(len(d1)):
        for j in range(len(d1)):
            weight_matrix[i, j] = gauss_wasserstein(m[i], v[i], m[j], v[j])

    return ot.emd(d1, d2, weight_matrix)
def solve_optimal_transport_problem(self):
    self.n = len(self.weights_model1)
    # uniform distribution on samples
    self.a, self.b = np.ones((self.n,)) / self.n, np.ones((self.n,)) / self.n

    # loss matrix
    self.M = ot.dist(self.weights_model1, self.weights_model2)
    self.M /= self.M.max()

    self.G0 = ot.emd(self.a, self.b, self.M)
def compute_gamma(self, pred):
    '''
    Function to compute the OT between the target and source samples.
    :return: Gamma, the OT matrix
    '''
    # Reshape the samples into vectors of dimension number of modalities * patch_dimension.
    # train_vecs are of shape (batch_size, d)
    train_vec_source = np.reshape(self.image_representation_source,
                                  (self.batch_size,
                                   self.image_representation_source.shape[1] *
                                   self.image_representation_source.shape[2] *
                                   self.image_representation_source.shape[3] *
                                   self.image_representation_source.shape[4]))
    train_vec_target = np.reshape(self.image_representation_target,
                                  (self.batch_size,
                                   self.image_representation_target.shape[1] *
                                   self.image_representation_target.shape[2] *
                                   self.image_representation_target.shape[3] *
                                   self.image_representation_target.shape[4]))

    # Same for the ground truth, but the GT is the same for both modalities
    truth_vec_source = np.reshape(self.train_batch[1][:self.batch_size],
                                  (self.batch_size,
                                   self.config.patch_shape[0] * self.config.patch_shape[1] * self.config.patch_shape[2]))
    pred_vec_source = np.reshape(pred[:self.batch_size],
                                 (self.batch_size,
                                  self.config.patch_shape[0] * self.config.patch_shape[1] * self.config.patch_shape[2]))
    # We don't have information on target labels
    pred_vec_target = np.reshape(pred[self.batch_size:],
                                 (self.batch_size,
                                  self.config.patch_shape[0] * self.config.patch_shape[1] * self.config.patch_shape[2]))

    # Compute the distance between samples and between the source ground truth and the target prediction
    C0 = cdist(train_vec_source, train_vec_target, metric="sqeuclidean")
    C1 = cdist(truth_vec_source, pred_vec_target, metric=self.config.jdot_distance)
    C = K.get_value(self.jdot_alpha) * C0 + K.get_value(self.jdot_beta) * C1

    # Compute gamma using the OT library
    gamma = ot.emd(ot.unif(self.batch_size), ot.unif(self.batch_size), C)
    return gamma
def gromov_wasserstein_distance_latent_space_rand_emd(data_path, num_labels, num_clusters, result_path, args):
    import scipy as sp
    import matplotlib.pylab as pl
    import ot

    # -1 means no discrimination between labels: the same VAE transform is used for all of them,
    # which is orthogonal to whether we cluster on this z space or split into clusters some other way
    # z = np.load(data_path + "/L-1/z.npy")
    z = np.load(data_path + "/L-1" + config.z_name, allow_pickle=True)
    np.random.shuffle(z)

    results = {}
    mat = np.zeros((num_clusters, num_clusters))

    from sklearn.model_selection import KFold
    kf = KFold(n_splits=num_clusters)
    i = 0
    cluster_idx = {}
    for train_eval_idx, test_idx in kf.split(z):
        cluster_idx[str(i)] = test_idx
        i = i + 1

    i = 0
    print(z.shape)
    for i in range(num_clusters):
        xs = z[cluster_idx[str(i)]]
        print(xs.shape)
        for j in range(num_clusters):
            xt = z[cluster_idx[str(j)]]
            print(xt.shape)

            # Compute distance kernels, normalize them and then display
            n_samples = min(xs.shape[0], xt.shape[0])
            if args.debug == True:
                n_samples = 100
            xs = xs[:n_samples]
            xt = xt[:n_samples]

            M = sp.spatial.distance.cdist(xt, xs)
            M /= M.max()
            ds, dt = np.ones((len(xs),)) / len(xs), np.ones((len(xt),)) / len(xt)

            g0, loss = ot.emd(ds, dt, M, log=True)
            print('Gromov-Wasserstein distances between {}_{} clusters: {}--{} '.format(i, j, str(loss), str(np.sum(g0))))
            # results[str(i) + str(j)] = {"GW": log0['gw_dist'], "EGW": log['gw_dist']}
            results[str(i) + str(j)] = loss["cost"]
            mat[i, j] = loss["cost"]

            # pl.figure(1, (10, 5))
            # pl.subplot(1, 2, 1)
            # pl.imshow(gw0, cmap='jet')
            # pl.title('Gromov Wasserstein')
            # pl.subplot(1, 2, 2)
            # pl.imshow(gw, cmap='jet')
            # pl.title('Entropic Gromov Wasserstein')
            # pl.savefig(result_path + "/WD_TSNE{}_{}.jpg".format(i, j))

    # print(results)
    print(mat)
    with open("wd_rand.txt", 'a') as lf:
        lf.write(str(results))
    return results
def test_emd_1d_emd2_1d_with_weights():
    # test emd1d gives similar results as emd
    n = 20
    m = 30
    rng = np.random.RandomState(0)
    u = rng.randn(n, 1)
    v = rng.randn(m, 1)

    w_u = rng.uniform(0., 1., n)
    w_u = w_u / w_u.sum()

    w_v = rng.uniform(0., 1., m)
    w_v = w_v / w_v.sum()

    M = ot.dist(u, v, metric='sqeuclidean')

    G, log = ot.emd(w_u, w_v, M, log=True)
    wass = log["cost"]
    G_1d, log = ot.emd_1d(u, v, w_u, w_v, metric='sqeuclidean', log=True)
    wass1d = log["cost"]
    wass1d_emd2 = ot.emd2_1d(u, v, w_u, w_v, metric='sqeuclidean', log=False)
    wass1d_euc = ot.emd2_1d(u, v, w_u, w_v, metric='euclidean', log=False)

    # check loss is similar
    np.testing.assert_allclose(wass, wass1d)
    np.testing.assert_allclose(wass, wass1d_emd2)

    # check loss is similar to scipy's implementation for Euclidean metric
    wass_sp = wasserstein_distance(u.reshape((-1,)), v.reshape((-1,)), w_u, w_v)
    np.testing.assert_allclose(wass_sp, wass1d_euc)

    # check constraints
    np.testing.assert_allclose(w_u, G.sum(1))
    np.testing.assert_allclose(w_v, G.sum(0))
def wass1dim(data1, data2, numBins=200):
    '''
    Compare two one-dimensional arrays by the Wasserstein metric
    (https://en.wikipedia.org/wiki/Wasserstein_metric).
    The input data should have outliers removed.

    Parameters
    ----------
    data1, data2: two one-dimensional arrays to compare.
    numBins: the number of bins.

    Outputs
    -------
    result: the computed Wasserstein metric.
    '''
    # Bin both samples on a common grid
    upper = np.max((data1.max(), data2.max()))
    lower = np.min((data1.min(), data2.min()))
    xbins = np.linspace(lower, upper, numBins + 1)

    density1, _ = np.histogram(data1, density=False, bins=xbins)
    density2, _ = np.histogram(data2, density=False, bins=xbins)
    density1 = density1 / np.sum(density1)
    density2 = density2 / np.sum(density2)

    # pairwise distance matrix between bins
    distMat = distance_matrix(xbins[1:].reshape(numBins, 1), xbins[1:].reshape(numBins, 1))
    M = distMat

    T = ot.emd(density1, density2, M)  # optimal transport matrix
    result = np.sum(T * M)  # the objective value

    return result
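# Quick usage sketch for wass1dim (hypothetical data; assumes `import numpy as np`,
# `import ot`, `from scipy.spatial import distance_matrix`, and the function above).
# Two Gaussian samples whose means differ by roughly 2, so the binned 1D
# Wasserstein distance should come out close to 2.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    sample_a = rng.normal(0.0, 1.0, size=5000)
    sample_b = rng.normal(2.0, 1.0, size=5000)
    print(wass1dim(sample_a, sample_b, numBins=200))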
def compute_deepjdot_loss(features_source, ys_pred, ys, features_target, yt_pred,
                          gamma_criterion, g_criterion):
    # Compute the squared Euclidean distance in the feature space
    # C0 = cdist(features_source.detach().cpu().numpy(),
    #            features_target.detach().cpu().numpy(), p=0.2)
    C0 = torch.square(torch.cdist(features_source, features_target, p=2.0))

    # Compute the label loss term
    # C1 = F.cross_entropy(yt_pred, ys)
    classes = torch.arange(yt_pred.shape[1]).reshape(1, yt_pred.shape[1])
    one_hot_ys = (ys.unsqueeze(1) == classes.to(device=c.device)).float()
    C1 = torch.square(torch.cdist(one_hot_ys, F.softmax(yt_pred, dim=1), p=2.0))

    C = c.alpha * C0 + c.tloss * C1

    # Compute the transport plan gamma
    # gamma = ot.emd(ot.unif(features_source.shape[0]),
    #                ot.unif(features_target.shape[0]), C)
    gamma = ot.emd(
        torch.from_numpy(ot.unif(features_source.shape[0])).to(device=c.device),
        torch.from_numpy(ot.unif(features_target.shape[0])).to(device=c.device),
        C)
    # ot.emd: solve the OT problem between the source and target feature distributions
    # ot.unif: return a uniform histogram of the given length

    # Alignment loss
    gamma_loss = gamma_criterion(features_source, features_target, gamma)
    # gamma_criterion takes the source features, the target features and gamma.
    # It first computes the L2 distance between the features and then returns
    # self.jdot_alpha * dnn.K.sum(self.gamma * (gdist))

    # Classifier loss
    clf_loss = g_criterion(ys, ys_pred, yt_pred, gamma)

    return clf_loss, gamma_loss, clf_loss + gamma_loss
def _match_shorter(self, shorter):
    """Compute the mapping when the new points are fewer than the known points."""
    M, known, _ = _dist_closest(self.points, shorter)
    # Empty weight lists default to uniform distributions in ot.emd
    G = ot.emd([], [], M[known])
    result = np.empty((len(shorter),), dtype=int)
    result[np.argmax(G, axis=1)] = known
    return result
def persistence_wasserstein_distance(x: np.ndarray, y: np.ndarray,
                                     ground_distance: np.ndarray) -> float:
    """Compute an approximation of the Persistence Wasserstein_1 distance between
    persistence diagrams with vector representations ``x`` and ``y`` using the
    ground distance provided.

    Parameters
    ----------
    x: array of shape (n_gaussians,)
        The vectorization of the first persistence diagram

    y: array of shape (n_gaussians,)
        The vectorization of the second persistence diagram

    ground_distance: array of shape (n_gaussians + 1, n_gaussians + 1)
        The amended ground distance as output by ``add_birth_death_line``

    Returns
    -------
    dist: float
        An approximation of the Persistence Wasserstein_1 distance between
        persistence diagrams with vector representations ``x`` and ``y``
    """
    # Append the mass matched to the birth-death line, then normalize both measures
    x_a = np.append(x, y.sum())
    x_a /= x_a.sum()
    y_a = np.append(y, x.sum())
    y_a /= y_a.sum()
    plan = ot.emd(x_a, y_a, ground_distance)
    return (x.sum() + y.sum()) * (plan * ground_distance).sum()
def graph_d(self, graph1, graph2):
    """ Compute the Wasserstein distance between two graphs. Uniform weights are used.

    Parameters
    ----------
    graph1 : a Graph object
    graph2 : a Graph object

    Returns
    -------
    The Wasserstein distance between the features of graph1 and graph2
    """
    nodes1 = graph1.nodes()
    nodes2 = graph2.nodes()
    t1masses = np.ones(len(nodes1)) / len(nodes1)
    t2masses = np.ones(len(nodes2)) / len(nodes2)
    x1 = self.reshaper(graph1.all_matrix_attr())
    x2 = self.reshaper(graph2.all_matrix_attr())

    if self.features_metric == 'dirac':
        f = lambda x, y: x != y
        M = ot.dist(x1, x2, metric=f)
    else:
        M = ot.dist(x1, x2, metric=self.features_metric)
    if np.max(M) != 0:
        M = M / np.max(M)
    self.M = M

    transp = ot.emd(t1masses, t2masses, M)
    self.transp = transp

    return np.sum(transp * M)
def pot_wasserstein_mapper(net1, net2, metric_space, p=None, q=None):
    """
    Computes vanilla EMD (over Hausdorff distance) for mapper graphs.

    Parameters
    ----------
    net1 : lightweight_mapper.Network
        Mapper graph
    net2 : lightweight_mapper.Network
        Mapper graph
    metric_space : np.array
        Pairwise distance matrix
    p : np.array - nx1
        Distribution over nodes corresponding to net1
    q : np.array - nx1
        Distribution over nodes corresponding to net2

    Returns
    -------
    EMD (cost = Hausdorff distance)
    """
    C3 = network_merge_distance(net1, net2, metric_space)
    if p is None or q is None:
        p = np.diag(net1.adjacency_matrix.toarray())
        p = p / p.sum()
        q = np.diag(net2.adjacency_matrix.toarray())
        q = q / q.sum()
    gw_dist = ot.emd2(p, q, C3)
    params = ot.emd(p, q, C3)
    return gw_dist, params
def makeTransportPlan(self):
    if self.source_data and self.target_data:
        if self.source_data_size == self.target_data_size:
            loss_matrix = ot.dist(self.source_data, self.target_data)
            loss_matrix = loss_matrix / loss_matrix.max()
            if not self.source_weight:
                self.source_weight = np.ones((self.source_data_size,)) / self.source_data_size
                print("The source weights are initialized to uniform. If custom weights are needed, please load them.")
            if not self.target_weight:
                self.target_weight = np.ones((self.target_data_size,)) / self.target_data_size
                print("The target weights are initialized to uniform. If custom weights are needed, please load them.")
            transport = ot.emd(self.source_weight, self.target_weight, loss_matrix, log=True)
            print("Transport Plan Complete")
            print("The cost is: {}".format(transport[1]['cost']))
            self.transport_plan = transport[0]
            return transport
        else:
            print("Optimal transport plan not computed due to a mismatch between source and target sizes.")
            return
    else:
        print("Optimal transport plan not computed. Please add source & target data and rerun.")
        return
def gnpr_distance(x: np.array, y: np.array, theta: float, n_bins: int = 50) -> float:
    """
    Calculates the empirical distance between two random variables under the Generic
    Non-Parametric Representation (GNPR) approach.

    Formula for the distance is taken from https://www.researchgate.net/publication/322714557 (p. 72).

    Parameter theta defines what type of information dependency is being tested:

    - for theta = 0 the distribution information is tested
    - for theta = 1 the dependence information is tested
    - for theta = 0.5 a mix of both information types is tested

    With theta in [0, 1] the distance lies in the range [0, 1] and is a metric.
    (See original work for proof, p. 71)

    This method is modified as it uses the 1D Optimal Transport distance to measure the
    distribution distance. This solves the issue of defining the support and choosing
    a number of bins. The number of bins can be given as an input to speed up calculations;
    large numbers of bins can take a long time to calculate.

    :param x: (np.array/pd.Series) X vector.
    :param y: (np.array/pd.Series) Y vector (same number of observations as X).
    :param theta: (float) Type of information being tested. Falls in range [0, 1].
    :param n_bins: (int) Number of bins to use to split the X and Y vector observations. (50 by default)
    :return: (float) Distance under GNPR approach.
    """
    # Number of observations
    num_obs = x.shape[0]

    # Calculating the d_1 distance
    dist_1 = 3 / (num_obs * (num_obs**2 - 1)) * (np.power(x - y, 2).sum())

    # Binning observations
    x_binned = pd.Series(np.histogram(x, bins=n_bins)[0]) / num_obs
    y_binned = pd.Series(np.histogram(y, bins=n_bins)[0]) / num_obs

    # Bin positions
    bins = np.linspace(0, 1, n_bins)

    # Loss matrix
    loss_matrix = ot.dist(bins.reshape((n_bins, 1)), bins.reshape((n_bins, 1)))
    loss_matrix /= loss_matrix.max()

    # Optimal transportation matrix
    ot_matrix = ot.emd(x_binned.sort_values(), y_binned.sort_values(), loss_matrix)

    # Optimal transport distance
    dist_0 = np.trace(np.dot(np.transpose(ot_matrix), loss_matrix))

    # Calculating the GNPR distance
    distance = theta * dist_1 + (1 - theta) * dist_0

    return distance**(1 / 2)
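# Usage sketch for gnpr_distance (hypothetical data; assumes `import numpy as np`,
# `import pandas as pd`, `import ot`, and the function above). The example feeds
# values in [0, 1], matching the bin grid used by the loss matrix above.
if __name__ == '__main__':
    rng = np.random.RandomState(1)
    n = 500
    x = rng.uniform(0, 1, n)
    y = np.clip(x + rng.normal(0, 0.05, n), 0, 1)   # strongly dependent on x
    print(gnpr_distance(x, y, theta=0.0))   # distribution information only
    print(gnpr_distance(x, y, theta=1.0))   # dependence information only
    print(gnpr_distance(x, y, theta=0.5))   # mix of both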
def test_dual_variables():
    n = 5000  # nb bins
    m = 6000  # nb bins

    mean1 = 1000
    mean2 = 1100

    # bin positions
    x = np.arange(n, dtype=np.float64)
    y = np.arange(m, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=mean1, s=5)  # m= mean, s= std
    b = gauss(m, m=mean2, s=10)

    # loss matrix
    M = ot.dist(x.reshape((-1, 1)), y.reshape((-1, 1))) ** (1. / 2)

    print('Computing {} EMD '.format(1))

    # emd loss 1 proc
    ot.tic()
    G, log = ot.emd(a, b, M, log=True)
    ot.toc('1 proc : {} s')

    ot.tic()
    G2 = ot.emd(b, a, np.ascontiguousarray(M.T))
    ot.toc('1 proc : {} s')

    cost1 = (G * M).sum()
    # Check symmetry
    np.testing.assert_array_almost_equal(cost1, (M * G2.T).sum())
    # Check with closed-form solution for gaussians
    np.testing.assert_almost_equal(cost1, np.abs(mean1 - mean2))

    # Check that both cost computations are equivalent
    np.testing.assert_almost_equal(cost1, log['cost'])

    check_duality_gap(a, b, M, G, log['u'], log['v'], log['cost'])
def solve(fake_feature, true_feature):
    # Get the optimal matching between fake and true features.
    # Assumes the number of fake samples is no larger than the number of true samples.
    M = distance(fake_feature, true_feature, True)
    # Empty weight lists default to uniform distributions in ot.emd
    emd = ot.emd([], [], M.numpy())
    mapping = np.zeros(fake_feature.size(0))
    for i in range(0, fake_feature.size(0)):
        for j in range(0, true_feature.size(0)):
            if emd[i][j] > 0:
                mapping[i] = j
    return mapping
def jdot_krr(X, y, Xtest, gamma_g=1, numIterBCD=10, alpha=1, lambd=1e1,
             method='emd', reg=1, ktype='linear'):
    # Initializations
    n = X.shape[0]
    ntest = Xtest.shape[0]
    wa = np.ones((n,)) / n
    wb = np.ones((ntest,)) / ntest

    # original loss
    C0 = cdist(X, Xtest, metric='sqeuclidean')
    # print(np.max(C0))
    C0 = C0 / np.median(C0)

    # classifier
    g = classif.KRRClassifier(lambd)

    # compute kernels
    if ktype == 'rbf':
        Kt = sklearn.metrics.pairwise.rbf_kernel(Xtest, Xtest, gamma=gamma_g)
    else:
        Kt = sklearn.metrics.pairwise.linear_kernel(Xtest, Xtest)

    C = alpha * C0  # + cdist(y, ypred, metric='sqeuclidean')
    k = 0
    while k < numIterBCD:  # and not changeLabels:
        k = k + 1
        if method == 'sinkhorn':
            G = ot.sinkhorn(wa, wb, C, reg)
        if method == 'emd':
            G = ot.emd(wa, wb, C)

        Yst = ntest * G.T.dot(y)

        g.fit(Kt, Yst)
        ypred = g.predict(Kt)

        # function cost
        fcost = cdist(y, ypred, metric='sqeuclidean')
        C = alpha * C0 + fcost

    return g, np.sum(G * fcost)
def test_emd_empty():
    # test emd and emd2 for simple identity
    n = 100
    rng = np.random.RandomState(0)

    x = rng.randn(n, 2)
    u = ot.utils.unif(n)

    M = ot.dist(x, x)

    G = ot.emd([], [], M)

    # check G is identity
    np.testing.assert_allclose(G, np.eye(n) / n)

    # check constraints
    np.testing.assert_allclose(u, G.sum(1))  # cf convergence sinkhorn
    np.testing.assert_allclose(u, G.sum(0))  # cf convergence sinkhorn

    w = ot.emd2([], [], M)
    # check loss=0
    np.testing.assert_allclose(w, 0)
def test_warnings():
    n = 100  # nb bins
    m = 100  # nb bins

    mean1 = 30
    mean2 = 50

    # bin positions
    x = np.arange(n, dtype=np.float64)
    y = np.arange(m, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=mean1, s=5)  # m= mean, s= std
    b = gauss(m, m=mean2, s=10)

    # loss matrix
    M = ot.dist(x.reshape((-1, 1)), y.reshape((-1, 1))) ** (1. / 2)

    print('Computing {} EMD '.format(1))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        print('Computing {} EMD '.format(1))
        ot.emd(a, b, M, numItermax=1)
        assert "numItermax" in str(w[-1].message)
        assert len(w) == 1

        a[0] = 100
        print('Computing {} EMD '.format(2))
        ot.emd(a, b, M)
        assert "infeasible" in str(w[-1].message)
        assert len(w) == 2

        a[0] = -1
        print('Computing {} EMD '.format(2))
        ot.emd(a, b, M)
        assert "infeasible" in str(w[-1].message)
        assert len(w) == 3
pl.xlabel('x')
pl.ylabel('y')
pl.legend()
pl.title('Toy regression example')

#%% TLOT

itermax = 5
alpha = 1
C0 = cdist(xs, xt, metric='sqeuclidean')
# print(np.max(C0))
C0 = C0 / np.median(C0)
fcost = cdist(ys, yt, metric='sqeuclidean')

C = alpha * C0 + fcost
G0 = ot.emd(ot.unif(n), ot.unif(n), C)

fit_params = {'epochs': 100}

model, loss = jdot.jdot_nn_l2(get_model, xs, ys, xt, ytest=yt, fit_params=fit_params,
                              numIterBCD=itermax, alpha=alpha)

ypred = model.predict(xvisu.reshape((-1, 1)))

pl.figure(2)
pl.clf()
pl.scatter(xs, ys, label='Source samples', edgecolors='k')
pl.scatter(xt, yt, label='Target samples', edgecolors='k')
pl.plot(xvisu, fs_s(xvisu), 'b', label='Source model')
pl.plot(xvisu, fs_t(xvisu), 'g', label='Target model')
pl.plot(xvisu, ypred, 'r', label='JDOT model')
def jdot_nn_l2(get_model, X, Y, Xtest, ytest=[], fit_params={}, reset_model=True,
               numIterBCD=10, alpha=1, method='emd', reg=1, nb_epoch=100, batch_size=10):
    # get_model should return a new model compiled with an l2 loss

    # Initializations
    n = X.shape[0]
    ntest = Xtest.shape[0]
    wa = np.ones((n,)) / n
    wb = np.ones((ntest,)) / ntest

    # original loss
    C0 = cdist(X, Xtest, metric='sqeuclidean')
    C0 = C0 / np.max(C0)

    # classifier
    g = get_model()

    TBR = []
    sav_fcost = []
    sav_totalcost = []

    results = {}

    # Init initial g(.)
    g.fit(X, Y, **fit_params)
    ypred = g.predict(Xtest)

    C = alpha * C0 + cdist(Y, ypred, metric='sqeuclidean')

    # do it only if the final labels were given
    if len(ytest):
        ydec = np.argmax(ypred, 1) + 1
        TBR1 = np.mean(ytest == ydec)
        TBR.append(TBR1)

    k = 0
    changeLabels = False
    while k < numIterBCD:  # and not changeLabels:
        k = k + 1
        if method == 'sinkhorn':
            G = ot.sinkhorn(wa, wb, C, reg)
        if method == 'emd':
            G = ot.emd(wa, wb, C)

        Yst = ntest * G.T.dot(Y)

        if reset_model:
            g = get_model()

        g.fit(Xtest, Yst, **fit_params)
        ypred = g.predict(Xtest)

        # function cost
        fcost = cdist(Y, ypred, metric='sqeuclidean')
        # pl.figure()
        # pl.imshow(fcost)
        # pl.show()

        C = alpha * C0 + fcost

        ydec_tmp = np.argmax(ypred, 1) + 1
        if k > 1:
            changeLabels = np.all(ydec_tmp == ydec)
            sav_fcost.append(np.sum(G * fcost))
            sav_totalcost.append(np.sum(G * (alpha * C0 + fcost)))

        ydec = ydec_tmp
        if len(ytest):
            TBR1 = np.mean((ytest - ypred)**2)
            TBR.append(TBR1)

    results['ypred0'] = ypred
    results['ypred'] = np.argmax(ypred, 1) + 1
    if len(ytest):
        results['mse'] = TBR
    results['clf'] = g
    results['fcost'] = sav_fcost
    results['totalcost'] = sav_totalcost
    return g, results
pl.subplot(1, 3, 2)
pl.imshow(M2, interpolation='nearest')
pl.title('Squared Euclidean cost')

pl.subplot(1, 3, 3)
pl.imshow(Mp, interpolation='nearest')
pl.title('Sqrt Euclidean cost')
pl.tight_layout()

##############################################################################
# Dataset 1 : Plot OT Matrices
# ----------------------------

#%% EMD
G1 = ot.emd(a, b, M1)
G2 = ot.emd(a, b, M2)
Gp = ot.emd(a, b, Mp)

# OT matrices
pl.figure(3, figsize=(7, 3))

pl.subplot(1, 3, 1)
ot.plot.plot2D_samples_mat(xs, xt, G1, c=[.5, .5, 1])
pl.plot(xs[:, 0], xs[:, 1], '+b', label='Source samples')
pl.plot(xt[:, 0], xt[:, 1], 'xr', label='Target samples')
pl.axis('equal')
# pl.legend(loc=0)
pl.title('OT Euclidean')

pl.subplot(1, 3, 2)
def jdot_svm(X, y, Xtest, ytest=[], gamma_g=1, numIterBCD=10, alpha=1,
             lambd=1e1, method='emd', reg_sink=1, ktype='linear'):
    # Initializations
    n = X.shape[0]
    ntest = Xtest.shape[0]
    wa = np.ones((n,)) / n
    wb = np.ones((ntest,)) / ntest

    # original loss
    C0 = cdist(X, Xtest, metric='sqeuclidean')

    # classifier
    g = classif.SVMClassifier(lambd)

    # compute kernels
    if ktype == 'rbf':
        Kt = sklearn.metrics.pairwise.rbf_kernel(Xtest, gamma=gamma_g)
        # Ks = sklearn.metrics.pairwise.rbf_kernel(X, gamma=gamma_g)
    else:
        Kt = sklearn.metrics.pairwise.linear_kernel(Xtest)
        # Ks = sklearn.metrics.pairwise.linear_kernel(X)

    TBR = []
    sav_fcost = []
    sav_totalcost = []

    results = {}

    ypred = np.zeros(y.shape)

    Chinge = np.zeros(C0.shape)
    C = alpha * C0 + Chinge

    # do it only if the final labels were given
    if len(ytest):
        TBR.append(np.mean(ytest == np.argmax(ypred, 1) + 1))

    k = 0
    while k < numIterBCD:
        k = k + 1
        if method == 'sinkhorn':
            G = ot.sinkhorn(wa, wb, C, reg_sink)
        if method == 'emd':
            G = ot.emd(wa, wb, C)

        if k > 1:
            sav_fcost.append(np.sum(G * Chinge))
            sav_totalcost.append(np.sum(G * (alpha * C0 + Chinge)))

        Yst = ntest * G.T.dot((y + 1) / 2.)
        # Yst = ntest * G.T.dot(y_f)
        g.fit(Kt, Yst)
        ypred = g.predict(Kt)

        Chinge = classif.loss_hinge(y, ypred)
        # Chinge = SVMclassifier.loss_hinge(y_f * 2 - 1, ypred * 2 - 1)

        C = alpha * C0 + Chinge

        if len(ytest):
            TBR1 = np.mean(ytest == np.argmax(ypred, 1) + 1)
            TBR.append(TBR1)

    results['ypred'] = np.argmax(ypred, 1) + 1
    if len(ytest):
        results['TBR'] = TBR

    results['clf'] = g
    results['G'] = G
    results['fcost'] = sav_fcost
    results['totalcost'] = sav_totalcost
    return g, results
pl.plot(xs[:, 0], xs[:, 1], '+b', label='Source samples')
pl.plot(xt[:, 0], xt[:, 1], 'xr', label='Target samples')
pl.legend(loc=0)
pl.title('Source and target distributions')

pl.figure(2)
pl.imshow(M, interpolation='nearest')
pl.title('Cost matrix M')

##############################################################################
# Compute EMD
# -----------

#%% EMD
G0 = ot.emd(a, b, M)

pl.figure(3)
pl.imshow(G0, interpolation='nearest')
pl.title('OT matrix G0')

pl.figure(4)
ot.plot.plot2D_samples_mat(xs, xt, G0, c=[.5, .5, 1])
pl.plot(xs[:, 0], xs[:, 1], '+b', label='Source samples')
pl.plot(xt[:, 0], xt[:, 1], 'xr', label='Target samples')
pl.legend(loc=0)
pl.title('OT matrix with samples')

##############################################################################
# Compute Sinkhorn