def test_kmp_precomputed_dictionary():
    """Check KMPClassifier on precomputed kernels built against a dictionary.

    A single shuffle split of the multiclass data is drawn, dictionary
    components are selected from the training part, and the classifier is
    fitted on the train-vs-components kernel.  Training accuracy must reach
    0.75 and held-out accuracy 0.63.
    """
    splitter = ShuffleSplit(mult_dense.shape[0], n_iterations=1,
                            test_fraction=0.2, random_state=0)
    train, test = list(splitter)[0]
    X_train, X_test = mult_dense[train], mult_dense[test]
    y_train, y_test = mult_target[train], mult_target[test]

    components = select_components(X_train, y_train,
                                   n_components=0.3, random_state=0)

    # Fit on the kernel between training samples and the dictionary.
    K_train = pairwise_kernels(X_train, components)
    kmp = KMPClassifier(metric="precomputed")
    kmp.fit(K_train, y_train)
    train_accuracy = np.mean(kmp.predict(K_train) == y_train)
    assert_true(train_accuracy >= 0.75)

    # Held-out samples are compared against the same dictionary.
    K_test = pairwise_kernels(X_test, components)
    test_accuracy = np.mean(kmp.predict(K_test) == y_test)
    assert_true(test_accuracy >= 0.63)
def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", **kwargs):
    """
    Compute the kernel matrix between two lists of persistence diagrams
    given as numpy arrays of shape (nx2).

    Parameters:
        X (list of n numpy arrays of shape (numx2)): first list of persistence diagrams.
        Y (list of m numpy arrays of shape (numx2)): second list of persistence diagrams (optional). If None, pairwise kernel values are computed from the first list only.
        kernel: kernel to use. It can be either a string ("sliced_wasserstein", "persistence_scale_space", "persistence_weighted_gaussian", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs. If it is a function, make sure that it is symmetric.
        **kwargs: optional keyword parameters. "sliced_wasserstein" reads kwargs["num_directions"] and kwargs["bandwidth"]; "persistence_fisher" reads kwargs["kernel_approx"], kwargs["bandwidth"] and kwargs["bandwidth_fisher"] (a missing key raises KeyError). For the other kernels all keyword parameters are forwarded to the kernel function.

    Returns:
        numpy array of shape (nxm): kernel matrix.
    """
    # scikit-learn's pairwise machinery needs 2-D numeric input, so it is fed
    # column vectors of indices; the wrapper maps them back to diagrams.
    XX = np.reshape(np.arange(len(X)), [-1, 1])
    YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1, 1])

    if kernel == "sliced_wasserstein":
        distances = pairwise_persistence_diagram_distances(
            X, Y, metric="sliced_wasserstein",
            num_directions=kwargs["num_directions"])
        return np.exp(-distances / kwargs["bandwidth"])
    elif kernel == "persistence_fisher":
        distances = pairwise_persistence_diagram_distances(
            X, Y, metric="persistence_fisher",
            kernel_approx=kwargs["kernel_approx"],
            bandwidth=kwargs["bandwidth"])
        return np.exp(-distances / kwargs["bandwidth_fisher"])
    elif kernel == "persistence_scale_space":
        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_scale_space_kernel, X, Y, **kwargs))
    elif kernel == "persistence_weighted_gaussian":
        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_weighted_gaussian_kernel, X, Y, **kwargs))
    else:
        # User-supplied kernel: wrap it the same way as the built-in ones.
        # (Previously this referenced the undefined name `metric` and did
        # not hand the diagram lists to the wrapper.)
        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(kernel, X, Y, **kwargs))
def eval(self, X):
    """Evaluate the kernel density estimation

    Parameters
    ----------
    X : array_like
        array of points at which to evaluate the KDE.  Shape is
        (n_points, n_dim), where n_dim matches the dimension of the
        training points.

    Returns
    -------
    dens : ndarray
        array of shape (n_points,) giving the density at each point,
        computed as the sum of the kernel contributions of all training
        points (it is not divided by the number of training points).
        The 'gaussian' and 'tophat' kernels are scaled by their
        normalisation constant; for a user-supplied metric the raw
        kernel values are summed.

    Raises
    ------
    ValueError
        If X is not two-dimensional or its second dimension differs from
        that of the training points.
    """
    X = np.atleast_2d(X)
    if X.ndim != 2:
        raise ValueError('X must be two-dimensional')

    if X.shape[1] != self.X_.shape[1]:
        raise ValueError('dimensions of X do not match training dimension')

    n_dim = X.shape[1]

    if self.metric == 'gaussian':
        # wrangle gaussian into scikit-learn's 'rbf' kernel
        gamma = 0.5 / self.h / self.h
        D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
        # Isotropic Gaussian normalisation is (2 pi h^2)^(d/2).  The former
        # sqrt(2 pi h^(2d)) only agreed with this for d == 1.
        D /= (2 * np.pi * self.h ** 2) ** (0.5 * n_dim)
        dens = D.sum(1)

    elif self.metric == 'tophat':
        # use Ball Tree to efficiently count neighbors
        bt = BallTree(self.X_)
        counts = bt.query_radius(X, self.h, count_only=True)
        dens = counts / n_volume(self.h, n_dim)

    elif self.metric == 'exponential':
        D = pairwise_distances(X, self.X_)
        dens = np.exp(-abs(D) / self.h)
        dens = dens.sum(1)
        dens /= n_volume(self.h, n_dim) * special.gamma(n_dim)

    elif self.metric == 'quadratic':
        D = pairwise_distances(X, self.X_)
        dens = (1 - (D / self.h) ** 2)
        # the quadratic kernel has compact support of radius h
        dens[D > self.h] = 0
        dens = dens.sum(1)
        dens /= 2. * n_volume(self.h, n_dim) / (n_dim + 2)

    else:
        # any other metric is handed to scikit-learn unchanged
        D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
        dens = D.sum(1)

    return dens
def kernel_two_sample_test(X, Y, kernel_function='rbf', iterations=10000,
                           verbose=False, random_state=None, **kwargs):
    """Run the kernel two-sample test on samples X and Y.

    The pooled kernel matrix is computed with pairwise_kernels(), using
    `kernel_function` as the metric and forwarding **kwargs as kernel
    parameters (e.g. gamma=0.1 for 'rbf').

    Returns the tuple (MMD^2_u, null distribution of MMD^2_u, p-value).
    The p-value is never reported below the resolution 1/iterations.
    """
    n_x, n_y = len(X), len(Y)
    pooled = np.vstack([X, Y])
    gram = pairwise_kernels(pooled, metric=kernel_function, **kwargs)
    mmd2u = MMD2u(gram, n_x, n_y)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    mmd2u_null = compute_null_distribution(gram, n_x, n_y, iterations,
                                           verbose=verbose,
                                           random_state=random_state)
    resolution = 1.0 / iterations
    exceed_fraction = (mmd2u_null > mmd2u).sum() / float(iterations)
    p_value = max(resolution, exceed_fraction)
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value, resolution))

    return mmd2u, mmd2u_null, p_value
def kernelTwoSampleTest(X, Y, kernel_function='rbf', iterations=10000,
                        verbose=False, **kwargs):
    """Compute the MMD^2_u statistic between two batches.

    X and Y must expose `.numpy()`; each is converted and flattened to
    (n_samples, -1) before the pooled kernel matrix is built with
    pairwise_kernels(metric=kernel_function, **kwargs).

    Only the statistic is returned; `iterations` is accepted but not used
    by this function.
    """
    n_x = len(X)
    n_y = len(Y)

    x_flat = X.numpy()
    x_flat = x_flat.reshape(x_flat.shape[0], -1)
    y_flat = Y.numpy()
    y_flat = y_flat.reshape(y_flat.shape[0], -1)

    # kernel matrix over the elements of both domains
    pooled = np.vstack([x_flat, y_flat])
    gram = pairwise_kernels(pooled, metric=kernel_function, **kwargs)

    mmd2u = MMD2u(gram, n_x, n_y)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")
    return mmd2u
def _get_kernel(self, view, X, Y=None):
    """Return the kernel matrix for one view.

    The per-view `gamma` is offered to pairwise_kernels(); with
    filter_params=True it is passed with parameter filtering enabled.
    """
    view_gamma = self.gamma[view]
    view_metric = self.kernel[view]
    return pairwise_kernels(X, Y, metric=view_metric, filter_params=True,
                            gamma=view_gamma)
def compute_rbf_kernel_matrix(X):
    """Return the RBF Gram matrix of X.

    The bandwidth sigma^2 is the squared median of the pairwise Euclidean
    distances of X, and the kernel is evaluated with gamma = 1 / sigma^2
    using all available jobs (n_jobs=-1).
    """
    median_distance = np.median(pairwise_distances(X, metric='euclidean'))
    sigma2 = median_distance ** 2
    return pairwise_kernels(X, X, metric='rbf', gamma=1.0 / sigma2, n_jobs=-1)
def extract(self, A, k, W=None, H=None):
    """ Run a NMF algorithm

    Parameters
    ----------
    A : numpy.array or scipy.sparse matrix, shape (m,n)
        m : Number of features
        n : Number of samples
    k : int - target lower rank
    W : optional initial factor matrix, shape (m,k)
    H : optional initial coefficient matrix, shape (n,k)

    When both W and H are missing they are drawn uniformly at random;
    when only one is missing it is obtained from the other by a
    non-negative least squares solve.

    Returns
    -------
    (W, H, H_hat)
    W : Obtained factor matrix, shape (m,k)
    H : Obtained coefficient matrix, shape (n,k)
    H_hat : auxiliary matrix updated by iter_solver, initialised with
        shape (n,k)
    """
    n_features, n_samples = A.shape[0], A.shape[1]

    if W is None and H is None:
        W = np.random.rand(n_features, k)
        H = np.random.rand(n_samples, k)
    elif W is None:
        solution, _info = nnls.nnlsm_blockpivot(H, A.T)
        W = solution.T
    elif H is None:
        solution, _info = nnls.nnlsm_blockpivot(W.T, A)
        H = solution.T

    H_hat = np.random.rand(n_samples, k)
    S = metrics.pairwise_kernels(A.T)
    norm_A = mu.norm_fro(A)

    for _ in range(self.max_iter):
        W, H, H_hat = self.iter_solver(A, S, W, H, H_hat,
                                       self.alpha, self.beta)
        # relative reconstruction error; computed but not returned
        rel_error = mu.norm_fro_err(A, W, H, norm_A) / norm_A

    return W, H, H_hat
def extract(self, A, k, max_iter=100, lambda_reg=0.1, alpha_reg=0.1):
    """ Run a NMF algorithm

    Parameters
    ----------
    A : numpy.array or scipy.sparse matrix, shape (m,n)
        m : Number of features
        n : Number of samples
    k : int - target lower rank
    max_iter : int - number of iter_solver updates to run
    lambda_reg : Regularization constant for GRNMF
    alpha_reg : L1 regularization constant for H matrix

    Returns
    -------
    (W, H)
    W : Obtained factor matrix, shape (m,k)
    H : Obtained coefficient matrix, shape (n,k)
    """
    W = np.random.rand(A.shape[0], k)
    H = np.random.rand(A.shape[1], k)
    S = metrics.pairwise_kernels(A.T)

    # Min-max normalize the similarity matrix to [0, 1].  The subtraction
    # must be parenthesised: `S - min / (max - min)` only shifted S by a
    # constant instead of rescaling it.
    s_min = np.min(S)
    s_range = np.max(S) - s_min
    if s_range > 0:
        S = (S - s_min) / s_range
    else:
        # constant matrix: avoid dividing by zero
        S = np.zeros_like(S)

    D = np.sum(S, axis=1)
    norm_A = mu.norm_fro(A)
    for i in range(1, max_iter + 1):
        (W, H) = self.iter_solver(A, S, D, W, H, lambda_reg, alpha_reg)
        # relative reconstruction error; computed but not returned
        rel_error = mu.norm_fro_err(A, W, H, norm_A) / norm_A
    return W, H
def compute_rbf_kernel_matrix(X):
    """Build the RBF kernel matrix of X with a median-distance bandwidth.

    sigma2 is the square of the median pairwise Euclidean distance and the
    kernel uses gamma = 1 / sigma2.
    """
    pairwise = pairwise_distances(X, metric='euclidean')
    sigma2 = np.median(pairwise) ** 2
    gamma = 1.0 / sigma2
    K = pairwise_kernels(X, X, metric='rbf', gamma=gamma, n_jobs=-1)
    return K
def __generate_kernel(self):
    """Build the alpha-scaled, symmetrised kernel and its row sums.

    Stores the result in `self._kernel_alpha` and `self._d_alpha`, and
    prints the elapsed time.
    """
    t0 = timer()

    if self._n_neighbors is None:
        k_init = pairwise_kernels(self._data, metric='rbf',
                                  gamma=1 / self._epsilon,
                                  n_jobs=self._n_jobs)
    else:
        # epsilon is (re)computed first when a neighbour count is given
        self.__calc_epsilon()
        k_init = np.exp(-self._dist**2 / self._epsilon)

    row_sums = np.sum(k_init, 1)
    scaled_sums = row_sums**(-self._alpha)
    scale_matrix = scaled_sums.reshape(-1, 1) * scaled_sums
    # NOTE(review): scaled_sums already carries the exponent -alpha, so the
    # division below multiplies the kernel by d^alpha d^alpha.  Confirm this
    # is the intended direction of the normalisation.
    self._kernel_alpha = k_init / scale_matrix

    # enforce symmetry only when the matrix is not already symmetric
    if not np.allclose(self._kernel_alpha, self._kernel_alpha.T):
        self._kernel_alpha = (self._kernel_alpha + self._kernel_alpha.T) / 2

    self._d_alpha = np.sum(self._kernel_alpha, 1)

    elapsed = round(timer() - t0, 3)
    print('time elapsed for the computation of the kernel: {}'.format(elapsed))
def compute_mmd2u(X, Y):
    """Return MMD^2_u between X and Y for an RBF kernel.

    The kernel bandwidth sigma^2 is the squared median of the Euclidean
    distances between rows of X and rows of Y.
    """
    n_x, n_y = len(X), len(Y)
    pooled = np.vstack([X, Y])
    cross_distances = pairwise_distances(X, Y, metric='euclidean')
    sigma2 = np.median(cross_distances) ** 2
    gram = pairwise_kernels(pooled, metric='rbf', gamma=1. / sigma2)
    return MMD2u(gram, n_x, n_y)
def compute_metric_mmd2(X, Y):
    """Compute MMD2u(K, m, n, False) for the pooled RBF kernel of X and Y.

    gamma is the inverse of the squared median Euclidean distance between
    rows of X and rows of Y.
    """
    m = len(X)
    n = len(Y)
    median_cross = np.median(pairwise_distances(X, Y, metric='euclidean'))
    gamma = 1.0 / median_cross ** 2
    K = pairwise_kernels(np.vstack([X, Y]), metric='rbf', gamma=gamma)
    return MMD2u(K, m, n, False)
def score(self,x):
    """Return the negated objective value of the fitted model on `x`.

    The RBF design matrix between `x` and the stored centres `self.xce`
    is built with gamma = 1 / (2 sigma^2); the objective is
    alpha' G alpha - alpha' h, where G and h are formed from the
    column sums of that matrix.
    """
    n = len(x)
    gamma = 1. / (2 * self.sigma**2)
    Phi = pairwise_kernels(x, self.xce, metric="rbf", gamma=gamma)

    col_sums = Phi.sum(0)
    stacked_sums = tile(col_sums, (n, 1))
    quad_term = stacked_sums.T.dot(Phi) / (n**2)
    lin_term = col_sums / (n)

    objective = self.alpha.dot(quad_term).dot(self.alpha) - self.alpha.dot(lin_term)
    return -objective
def test_degenerate(self):
    """Soft cosine with an identity similarity matrix equals plain cosine."""
    # vectors are always returned normalized, so the linear kernel is the
    # plain cosine similarity
    expected = pairwise_kernels(self.query, self.index, metric='linear')

    # identity term-similarity matrix: the degenerate soft cosine
    identity = np.identity(len(self.vocab))
    actual = soft_cosine_similarities(self.query, self.index, identity)

    self.assertTrue(np.allclose(expected, actual))
def predict(self, X):
    '''
    Label each sample: +1 when its score exceeds self.threshold
    (predicted novel), -1 otherwise.
    '''
    train_vs_test = metrics.pairwise_kernels(X=self.X_train, Y=X,
                                             metric=self.metric)
    novelty_scores = score(self.projection, self.target_points, train_vs_test)

    labels = []
    for value in novelty_scores:
        if value > self.threshold:
            labels.append(1)
        else:
            labels.append(-1)
    return np.array(labels)
def fit(self, X, y, sample_weight=None):
    """Learn the projection and target points from the training kernel.

    The kernel matrix of X (metric `self.metric`) and the labels are
    handed to `learn`; its two results and the training data are stored
    on the instance.  `sample_weight` is accepted but not used here.

    Returns self.
    """
    train_kernel = metrics.pairwise_kernels(X, metric=self.metric)
    self.projection, self.target_points = learn(train_kernel, y)
    self.X_train = X
    return self
def witness_function(X, Y, grid, kernel_function='rbf', **kwargs):
    """
    Compute the witness function on a grid. For its definition see page 729
    of "A Kernel Two-Sample Test" by Gretton et al. (2012).

    :param X: numpy-array
        Data, of size MxD [M is the number of data points, D is the features dimension]
    :param Y: numpy-array
        Data, of size NxD [N is the number of data points, D is the features dimension]
    :param grid: numpy-array
        Grid on which the witness function is evaluated, of size PxD
        [P is the number of grid points, D is the features dimension]
    :param kernel_function: string
        Kernel name passed as `metric` to `pairwise_kernels()`. Implemented kernels are listed at
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics
    :param kwargs:
        Extra kernel parameters forwarded to `pairwise_kernels()`, e.g. `gamma=0.1`
    :return: numpy-array
        Mean kernel value of X against each grid point minus the mean
        kernel value of Y against each grid point
    :raises ValueError: if X, Y and grid do not share the same feature dimension
    """
    n_features = X.shape[1]
    if n_features != Y.shape[1]:
        raise ValueError(
            "Incompatible dimension for X and Y matrices. X and Y should have the same feature dimension,"
            ": X.shape[1] == %i while Y.shape[1] == %i." % (n_features, Y.shape[1]))
    if n_features != grid.shape[1]:
        raise ValueError(
            "Incompatible dimension for data and grid matrices. data and grid should have the same feature dimension,"
            ": data.shape[1] == %i while grid.shape[1] == %i." % (n_features, grid.shape[1]))

    # sample sizes
    m, n = len(X), len(Y)

    # kernel evaluations of each sample against the grid
    K_xg = pairwise_kernels(X, grid, metric=kernel_function, **kwargs)
    K_yg = pairwise_kernels(Y, grid, metric=kernel_function, **kwargs)

    mean_embedding_x = np.sum(K_xg, axis=0) / m
    mean_embedding_y = np.sum(K_yg, axis=0) / n
    return mean_embedding_x - mean_embedding_y
def predict(self, X, prob=False):
    """Predict from the kernel between X and the stored training data.

    The kernel matrix (metric `self.kernel`, gamma `self.gamma`) is
    multiplied by `self.alpha`.  When `prob` is exactly True the raw
    outputs are returned; otherwise the index of the largest output per
    row is returned.
    """
    samples = copy.deepcopy(X)
    test_kernel = pairwise_kernels(samples, self.X, metric=self.kernel,
                                   gamma=self.gamma)
    raw_output = np.dot(test_kernel, self.alpha)
    if prob is not True:
        return raw_output.argmax(axis=1)
    return raw_output
def predict(self, X):
    '''
    Return an array holding +1 for every sample whose score is above
    self.threshold (predicted novel) and -1 for all others.
    '''
    ks = metrics.pairwise_kernels(X=self.X_train, Y=X, metric=self.metric)
    sample_scores = score(self.projection, self.target_points, ks)

    def to_label(value):
        return 1 if value > self.threshold else -1

    return np.array([to_label(value) for value in sample_scores])
def example():
    """Print small pairwise distance / kernel demonstrations.

    Shows the Manhattan distances between X and Y, the Manhattan
    distances within X, and the linear kernel between X and Y.
    """
    import numpy as np
    from sklearn.metrics import pairwise_distances
    from sklearn.metrics import pairwise_kernels

    X = np.array([[2, 3], [3, 5], [5, 8]])
    Y = np.array([[1, 0], [2, 1]])

    cross_manhattan = pairwise_distances(X, Y, metric='manhattan')
    print(cross_manhattan)

    self_manhattan = pairwise_distances(X, metric='manhattan')
    print(self_manhattan)

    linear_gram = pairwise_kernels(X, Y, metric='linear')
    print(linear_gram)
def transform(self, X):
    """Apply the feature map to X.

    X is validated, its kernel against `self.components_` is evaluated
    (metric `self.kernel`, gamma `self.gamma`) and the result is
    multiplied by the transpose of `self.normalization_`.
    """
    validated = check_array(X)
    kernel_to_components = pairwise_kernels(
        validated, self.components_, metric=self.kernel, gamma=self.gamma)
    return np.dot(kernel_to_components, self.normalization_.T)
def get_cosine_sim(self, s, p, o, body):
    """Return the best cosine similarity between a claim and body sentences.

    The claim is the concatenation s + p + o.  The body is split into
    sentences, a word-level CountVectorizer is fitted on them, and the
    largest cosine similarity between the claim and any sentence is
    returned.
    """
    claim = s + p + o
    sentences = tokenize.sent_tokenize(body)

    vectorizer = CountVectorizer(analyzer='word')
    vectorizer.fit(sentences)

    claim_vector = vectorizer.transform([claim])
    sentence_vectors = vectorizer.transform(sentences)
    similarities = pairwise_kernels(claim_vector, sentence_vectors,
                                    metric='cosine')
    return max(similarities[0].tolist())
def test_statistics(X, Y, model='MMD', kernel_function='rbf', **kwargs):
    """
    Compute a two-sample test statistic between X and Y. Supported models
    are the Kolmogorov-Smirnov test (one-dimensional data only), the
    Kullback-Leibler divergence and MMD.

    :param X: numpy-array
        Data, of size MxD [M is the number of data points, D is the features dimension]
    :param Y: numpy-array
        Data, of size NxD [N is the number of data points, D is the features dimension]
    :param model: string
        Test to perform, one of ['KS', 'KL', 'MMD']
    :param kernel_function: string
        Kernel name, only used for the MMD. Implemented kernels are listed at
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics
    :param kwargs:
        Extra parameters, forwarded to `pairwise_kernels()` for 'MMD' (e.g. `gamma=0.1`)
        or to `KL_divergence_estimator()` for 'KL'
    :return: float
        The test value
    :raises ValueError: for an unknown model, mismatching feature dimensions,
        or 'KS' on data with more than one feature
    """
    if model not in ['KS', 'KL', 'MMD']:
        raise ValueError(
            "The Model '%s' is not implemented, try 'KS', 'KL', or 'MMD'." % model)

    dim_x, dim_y = X.shape[1], Y.shape[1]
    if dim_x != dim_y:
        raise ValueError(
            "Incompatible dimension for X and Y matrices. X and Y should have the same feature dimension,"
            ": X.shape[1] == %i while Y.shape[1] == %i." % (dim_x, dim_y))

    if model == 'KS' and dim_x > 1:
        raise ValueError("The KS test can handle only one dimensional data,"
                         ": X.shape[1] == %i and Y.shape[1] == %i." % (dim_x, dim_y))

    # dispatch on the requested model
    if model == 'KS':
        statistic, _ = stats.ks_2samp(X.T[0], Y.T[0])
        return statistic

    if model == 'KL':
        return KL_divergence_estimator(X, Y, **kwargs)

    # model == 'MMD'
    m, n = len(X), len(Y)
    pooled = np.vstack([X, Y])
    gram = pairwise_kernels(pooled, metric=kernel_function, **kwargs)
    return MMD2u_estimator(gram, m, n)
def predict(self, X):
    """Return the kernel-weighted average of the training targets at X.

    Kernel weights between X and the training points are divided by
    `self.dy**2`; the prediction for each row is the weighted sum of
    `self.y` divided by the sum of weights.

    Raises ValueError when X is not 2-D or its feature dimension differs
    from the training data.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError('X must be two-dimensional')
    if X.shape[1] != self.X.shape[1]:
        raise ValueError('dimensions of X do not match training dimension')

    if self.kernel != 'gaussian':
        weights = pairwise_kernels(X, self.X, metric=self.kernel,
                                   **self.kwargs)
    else:
        # a gaussian of width h is scikit-learn's 'rbf' with
        # gamma = 1 / (2 h^2)
        bandwidth = np.asarray(self.h)
        weights = pairwise_kernels(X, self.X, metric='rbf',
                                   gamma=0.5 / bandwidth / bandwidth)

    weights /= self.dy**2
    weighted_targets = (weights * self.y).sum(1)
    return weighted_targets / weights.sum(1)
def predict(self, X):
    """Evaluate the kernel regression at the rows of X.

    Each prediction is sum(K * y) / sum(K) over the training points,
    where K is the kernel between X and the training data divided by
    `self.dy ** 2`.

    Raises ValueError if X is not two-dimensional or does not match the
    training dimension.
    """
    X = np.asarray(X)
    n_train_features = self.X.shape[1] if X.ndim == 2 else None
    if X.ndim != 2:
        raise ValueError('X must be two-dimensional')
    if X.shape[1] != n_train_features:
        raise ValueError('dimensions of X do not match training dimension')

    if self.kernel == 'gaussian':
        # wrangle gaussian into scikit-learn's 'rbf' kernel
        h = np.asarray(self.h)
        metric, params = 'rbf', {'gamma': 0.5 / h / h}
    else:
        metric, params = self.kernel, self.kwargs
    K = pairwise_kernels(X, self.X, metric=metric, **params)

    K /= self.dy ** 2
    numerator = (K * self.y).sum(1)
    denominator = K.sum(1)
    return numerator / denominator
def transform(self, X):
    """Project the points in X onto the fisher directions.

    Parameters
    ----------
    X : {array-like} of shape (n_samples, n_features)
        Points to be projected onto the fisher directions.

    Returns
    -------
    The kernel between X and the training data `self.X_`, multiplied by
    the fitted `self.weights_`.
    """
    check_is_fitted(self)
    kernel_to_training = pairwise_kernels(X, self.X_, metric=self.kernel,
                                          **self.kwds)
    return kernel_to_training @ self.weights_
def _get_kernel(self, view, X, Y=None):
    """Return the kernel matrix for one view.

    A callable kernel receives the view's `kernel_params` (or nothing
    when those are falsy); a named kernel is offered the view's gamma,
    degree and coef0, with filter_params=True passed to
    pairwise_kernels().
    """
    view_kernel = self.kernel[view]
    if not callable(view_kernel):
        kernel_args = {
            "gamma": self.gamma[view],
            "degree": self.degree[view],
            "coef0": self.coef0[view],
        }
    else:
        kernel_args = self.kernel_params[view] or {}
    return pairwise_kernels(X, Y, metric=view_kernel, filter_params=True,
                            **kernel_args)
def fit(self, X):
    """Cluster X through a kernelised, regularised self-expression.

    Builds the adjacency matrix and the Gram matrix of the configured
    kernel, solves the regularised system for the coefficient matrix C,
    thresholds it and post-processes it into cluster labels.  When
    `self.save_affinity` is set the affinities are written to
    './gcsc-kernel-affinity.npz'.

    Returns the predicted labels.  Raises Exception('Invalid kernel') for
    an unsupported kernel name.
    """
    A = self.__adjacent_mat(X, self.n_neighbors)

    # kernel-specific arguments; only 'rbf' reads a value from the instance
    if self.kernel == 'linear':
        kernel_args = dict(metric='linear')
    elif self.kernel == 'polynomial':
        kernel_args = dict(metric='polynomial', gamma=0.05, degree=3)
    elif self.kernel == 'sigmoid':
        kernel_args = dict(metric='sigmoid', gamma=0.5)
    elif self.kernel == 'rbf':
        kernel_args = dict(metric='rbf', gamma=self.gamma)
    else:
        raise Exception('Invalid kernel')
    K = pairwise_kernels(X, **kernel_args)

    identity = np.eye(X.shape[0])
    T = np.dot(np.transpose(A), K)
    regularised = np.dot(T, K) + self.regu_coef * identity
    C = np.dot(np.linalg.inv(regularised), T)

    Coef = self.thrC(C, self.ro)
    y_pre, C_final = self.post_proC(Coef, self.n_clusters, 8, 18)

    if self.save_affinity:
        symmetric_affinity = 0.5 * (np.abs(C) + np.abs(C.T))
        np.savez('./gcsc-kernel-affinity.npz', C=C_final, C1=symmetric_affinity)
    return y_pre
def fit(self, X, y):
    """Fit the kernel model by solving a regularised linear system.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Training data; stored as `self.X`.
    y : array
        Either class labels (any shape that is not 2-D), which are
        converted with `self.one2array`, or an already encoded 2-D target
        matrix with one column per class.

    Returns
    -------
    self
        The fitted estimator, so that calls can be chained
        (`fit` previously returned None).
    """
    self.X = X
    self.y = y
    if len(y.shape) != 2:
        # label vector: derive the classes and encode the targets
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self.y = self.one2array(y, self.n_classes_)
    else:
        # already encoded: one column per class
        self.classes_ = np.arange(y.shape[1])
        self.n_classes_ = len(self.classes_)

    # NOTE(review): no kernel parameters (e.g. gamma) are passed here —
    # confirm the kernel used at prediction time is built the same way.
    K_tr_matrix = pairwise_kernels(X, X, metric=self.kernel)
    self.alpha = np.dot(
        np.linalg.inv(np.eye(X.shape[0]) / self.C + K_tr_matrix), self.y)
    return self
def __init__(self, X, y, theta_0, theta_1):
    """Set up the RBF kernel and solve the kernel system for `self.z`.

    The kernel length scale is 1 / (sqrt(2) * c) with
    c = sqrt(theta_1**2 + 1e-2); the Gram matrix of X is scaled by
    theta_0**2 and `self.z` solves K z = y.
    """
    self.X = X
    self.size, self.dim = self.X.shape

    self.theta_0 = theta_0
    self.theta_1 = theta_1

    eps = 1e-2
    self.c = np.sqrt(theta_1**2 + eps)
    self.kernel = RBF(length_scale=1.0 / (np.sqrt(2) * self.c))

    gram = theta_0**2 * pairwise_kernels(X, metric=self.kernel)
    self.z = np.linalg.solve(gram, y)
def predict(self, X):
    '''
    Return the raw scores of the samples in X.

    A +1 / -1 labelling (score above self.threshold means novel) is also
    computed, but it is the scores that are returned.
    '''
    train_vs_test = metrics.pairwise_kernels(X=self.X_train, Y=X,
                                             metric=self.metric)
    scores = score(self.projection, self.target_points, train_vs_test)
    labels = []
    for value in scores:
        labels.append(1 if value > self.threshold else -1)
    prediction = np.array(labels)
    return scores
def predict_kernel(self, X):
    """Classify each sample by its closest per-class local mean.

    For every class, the `self.n_neighbor` training points of that class
    closest to a sample (RBF-induced distance, gamma=10) are averaged.
    The sample is then assigned the class whose local mean is closest
    (RBF-induced distance, gamma=1).
    """
    n_test, n_features = X.shape[0], X.shape[1]
    kernel_distance = 2 - 2 * pairwise_kernels(
        X, self.X, metric='rbf', gamma=10)

    # local_means[i, c] is the mean of sample i's nearest neighbours in class c
    local_means = np.zeros((n_test, self.n_classes_, n_features))
    for c in range(self.n_classes_):
        members = np.nonzero(self.y == self.classes_[c])
        class_points = self.X[members]
        order = kernel_distance[:, members[0]].argsort()
        nearest = class_points[order[:, :self.n_neighbor]]
        local_means[:, c, :] = nearest.mean(axis=1)

    results = np.zeros(n_test)
    for i in range(n_test):
        sample = X[i].reshape(1, n_features)
        similarity = pairwise_kernels(sample, local_means[i, :, :],
                                      metric='rbf', gamma=1).flatten()
        results[i] = self.classes_[np.argmin(2 - 2 * similarity)]
    return results
def predict(self, X):
    '''
    Compute and return the scores of the samples in X.

    The +1 (novel) / -1 labels obtained by comparing each score with
    self.threshold are built as well, but they are not returned.
    '''
    ks = metrics.pairwise_kernels(X=self.X_train, Y=X, metric=self.metric)
    scores = score(self.projection, self.target_points, ks)

    def as_label(value):
        return 1 if value > self.threshold else -1

    prediction = np.array([as_label(value) for value in scores])
    return scores
def fit(self, X, y):
    """
    Fit the kernel Fisher directions and a NearestCentroid classifier on
    the projected class means.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array, shape = [n_samples]
        Target values (integers)

    Returns
    -------
    self
        With ``classes_``, ``X_``, ``y_``, ``weights_`` and ``clf_`` set.
    """
    X, y = check_X_y(X, y)
    self.classes_ = unique_labels(y)
    # Warn (without failing) when more components are requested than
    # classes_.size - 1.
    if self.n_components > self.classes_.size - 1:
        warnings.warn(
            "n_components > classes_.size - 1."
            "Only the first classes_.size - 1 components will be valid."
        )
    self.X_ = X
    self.y_ = y

    y_onehot = OneHotEncoder().fit_transform(
        self.y_[:, np.newaxis])

    # Gram matrix of the training data under the configured kernel.
    K = pairwise_kernels(
        X, X, metric=self.kernel, **self.kwds)

    # Per-class sums of kernel rows divided by the class counts.
    m_classes = y_onehot.T @ K / y_onehot.T.sum(1)
    # Class index of every sample, used to pick its class row below.
    indices = (y_onehot @ np.arange(self.classes_.size)).astype('i')
    N = K @ (K - m_classes[indices])
    # Add value to diagonal for rank robustness
    N += eye(self.y_.size) * self.robustness_offset

    m_classes_centered = m_classes - K.mean(1)
    M = m_classes_centered.T @ m_classes_centered

    # Find weights
    # (the eigenvalues `w` are not used afterwards)
    w, self.weights_ = eigsh(M, self.n_components, N, which='LM')

    # Compute centers
    centroids_ = m_classes @ self.weights_

    # Train nearest centroid classifier
    self.clf_ = NearestCentroid().fit(centroids_, self.classes_)

    return self
def fit(self, X, y):
    """Validate and normalise the data, then precompute the RBF kernel.

    Stores the sample count `n_`, feature count `d_`, the kernel
    parameter `gamma_` derived from the configured kernel radius, and the
    training kernel matrix `K_`.

    Returns self.
    """
    # Validate input.
    X, y = check_X_y(X, y, accept_sparse=None, dtype='numeric')

    # Normalize input.
    n_samples, n_features = X.shape
    self.n_ = n_samples
    self.d_ = n_features
    X, y = self._normalize_X_y(X, y)

    self.gamma_ = kernel_radius_to_gamma(
        self.kernel_radius, self.n_, self.d_, self.kernel_value_at_radius)

    # Train model.
    self.K_ = pairwise_kernels(X, metric='rbf', gamma=self.gamma_, n_jobs=-1)
    return self
def SVM_single_modality(data_b6, data_btbr, modality='Structural'):
    """Cross-validate a precomputed-kernel SVC separating the two groups.

    The two data sets are stacked (label 0 for `data_b6`, 1 for
    `data_btbr`), an RBF kernel with a median-distance bandwidth is
    computed, and stratified cross-validation scores are printed.
    Nothing is returned.
    """
    print('Analyzing %s data' % (modality))

    vectors = np.vstack((data_b6, data_btbr))
    labels_b6 = np.zeros(len(data_b6))
    labels_btbr = np.ones(len(data_btbr))
    y = np.hstack((labels_b6, labels_btbr))

    # bandwidth: squared median pairwise Euclidean distance
    median_distance = np.median(pairwise_distances(vectors, metric='euclidean'))
    sigma2 = median_distance**2
    k_matrix = pairwise_kernels(vectors, metric='rbf', gamma=1.0/sigma2)

    clf = SVC(kernel='precomputed')
    folds = StratifiedKFold(y, n_folds=len(y) // 2)
    cv_scores = cross_val_score(clf, k_matrix, y, cv=folds)

    print('Mean accuracy: %s, std: %s' % (np.mean(cv_scores), np.std(cv_scores)))
    print('All folds scores: %s' % (cv_scores))
    print('')
def fit(self,x):
    """Fit the kernel coefficients `alpha` on `x`.

    When no centres are set, up to 100 rows of `x` are drawn at random as
    centres.  `alpha` is obtained from a regularised pseudo-inverse solve
    and the thresholded training predictions are stored in `self.label`.

    Returns self.
    """
    n = len(x)
    if self.xce is not None:
        self.b = len(self.xce)
    else:
        self.b = min(100, n)
        self.xce = x[permutation(n)[:self.b]]

    gamma = 1. / (2 * self.sigma**2)
    Phi = pairwise_kernels(x, self.xce, metric="rbf", gamma=gamma)

    col_sums = Phi.sum(0)
    stacked_sums = tile(col_sums, (n, 1))
    quad_term = stacked_sums.T.dot(Phi) / (n**2)
    lin_term = col_sums / (n)

    self.alpha = pinv(quad_term + self.lam * identity(self.b)).dot(lin_term)

    # clip the fitted values at zero and threshold at 0.5
    fitted = maximum(Phi.dot(self.alpha), 0.)
    self.label = fitted >= 0.5
    return self
def main():
    r"""Plot figure: Different outcomes of a Gaussian kernel approximation.

    For T feature counts D (log-spaced between 1 and 10^4) two random
    feature approximations of the Gaussian Gram matrix are drawn and the
    resulting two-output predictions are plotted, one realization per row
    of the figure.  The prediction obtained with the exact Gram matrix is
    overlaid in black.  The figure is saved to 'not_Mercer.pgf'.
    """
    T = 25  # Number of curves
    cm_subsection = np.linspace(0, 1, T + 1)
    colors = [matplotlib.cm.rainbow(x) for x in cm_subsection]

    d = 1  # Dimension of the input
    N = 250  # Number of points per curves

    # Generate N data in (-1, 1) and exact Gram matrix
    np.random.seed(0)
    X = np.linspace(-1, 1, N).reshape((N, d))
    K = pairwise_kernels(X, metric='rbf', gamma=1. / (2. * .1 ** 2))

    # A Matrix for the decomposable kernel. Link the outputs to some mean value
    c = np.random.randn(N, 2)
    A = .5 * np.eye(2) + .5 * np.ones((2, 2))

    plt.close()
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')
    f, axes = plt.subplots(2, 2, figsize=(12, 8), sharex=True, sharey=True)

    # For each curve with different D
    for k, D in enumerate(np.logspace(0, 4, T)):
        D = int(D)
        np.random.seed(0)

        # Generate outputs with a first realization of the random Gram matrix
        w = np.random.randn(d, D) / .1
        phiX = phi(X, w, D)
        Kt = np.dot(phiX, phiX.T)
        pred = np.dot(np.dot(Kt, c), A)
        axes[0, 0].plot(X, pred[:, 0], c=colors[k], lw=.5, linestyle='-')
        axes[0, 0].set_ylabel(r'$y_1$')
        axes[0, 1].plot(X, pred[:, 1], c=colors[k], lw=.5, linestyle='-')
        axes[0, 1].set_ylabel(r'$y_2$')

        # Generate outputs with a second realization of the random Gram matrix
        w = np.random.randn(d, D) / .1
        phiX = phi(X, w, D)
        Kt = np.dot(phiX, phiX.T)
        pred = np.dot(np.dot(Kt, c), A)
        axes[1, 0].plot(X, pred[:, 0], c=colors[k], lw=.5, linestyle='-')
        axes[1, 0].set_xlabel(r'$x$')
        axes[1, 0].set_ylabel(r'$y_1$')
        axes[1, 1].plot(X, pred[:, 1], c=colors[k], lw=.5, linestyle='-')
        axes[1, 1].set_xlabel(r'$x$')
        axes[1, 1].set_ylabel(r'$y_2$')

    # Prediction with the exact Gram matrix: computed once, drawn on every
    # panel (column index selects the output).
    exact_pred = np.dot(np.dot(K, c), A)
    for row in range(2):
        for col in range(2):
            axes[row, col].plot(X, exact_pred[:, col], c='k', lw=.5,
                                label='K')

    # Both titles are plain single-line literals; the first one previously
    # had a stray line break embedded in the string.
    axes[0, 0].set_title(r'$\widetilde{K}u \approx Ku$, realization 1', x=1.1)
    axes[1, 0].set_title(r'$\widetilde{K}u \approx Ku$, realization 2', x=1.1)
    for xx in axes.ravel():
        xx.legend(loc=4)

    # D still holds the last (largest) feature count of the loop above
    createColorbar(1, D, f, axes)
    plt.savefig('not_Mercer.pgf', bbox_inches='tight')
# Report the shapes of the train / test splits.
print "X_train", X_train.shape
print "X_test", X_test.shape

# PCA view
# Project the data on 300 randomized-PCA components, then select dictionary
# components in the projected space.
print "Computing PCA..."
pca = RandomizedPCA(n_components=300)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
components_pca = select_components(X_train_pca, y_train, n_components=opts.n_components, class_distrib="balanced")

# RBF kernels (gamma=0.1) between the projected data and the selected components.
print "Computing kernels (PCA view)..."
K_pca_train = pairwise_kernels(X_train_pca, components_pca, metric="rbf", gamma=0.1)
K_pca_test = pairwise_kernels(X_test_pca, components_pca, metric="rbf", gamma=0.1)

# Regular view
# Same construction on the original features.
components = select_components(X_train, y_train, n_components=opts.n_components, class_distrib="balanced")
print "Computing kernels (regular view)..."
K_train = pairwise_kernels(X_train, components, metric="rbf", gamma=0.1)
K_test = pairwise_kernels(X_test, components, metric="rbf", gamma=0.1)

# Combined views
n_components = components.shape[0]
# Half the number of selected components (integer division under Python 2,
# which the print statements above imply).
n = n_components / 2
def predict(self,x):
    """Return boolean predictions for the rows of `x`.

    The RBF design matrix against the stored centres is multiplied by
    `alpha`, clipped at zero, and thresholded at 0.5.
    """
    gamma = 1. / (2 * self.sigma**2)
    Phi = pairwise_kernels(x, self.xce, metric="rbf", gamma=gamma)
    clipped = maximum(Phi.dot(self.alpha), 0.)
    return clipped >= 0.5
def _summarize(data, vocabulary, labels_column, num_cluster):
    """Print and plot summary statistics of a clustering.

    Reports the cluster sizes, the top tags of each cluster (by frequency
    and by p(c|t)), the term entropy of each cluster, shows the matrix of
    pairwise cluster similarities annotated with the entropies, and
    finally prints the Pearson correlation between entropy and mean
    similarity.
    """
    # Basic stats
    print("Number of songs per cluster")
    cluster_sizes = Counter(labels_column)
    print(cluster_sizes)
    print()

    prob_Ct, prob_Tc, prob_T = compute_probs(data, num_cluster,
                                             labels_column, cluster_sizes)
    all_tags = range(len(prob_T))

    print("Top tags per cluster")
    for clust in xrange(num_cluster):
        print(clust, "tags with max_freq_in_cluster")
        members = np.where(labels_column == clust)[0]
        for tag in top_10_frequency(data[members]):
            print("\t", vocabulary[tag])
        print()

        print(clust, "tags with max_prob_p(c|t)")

        def cluster_given_tag(tag_id):
            return prob_Ct[tag_id][clust]

        ranked_tags = sorted(all_tags, key=cluster_given_tag, reverse=True)
        for tag in ranked_tags[:10]:
            print("\t", vocabulary[tag])
        print()
        print()

    print("Term entropies for each cluster")
    term_entropies = []
    for clust in xrange(num_cluster):
        cluster_entropy = entropy.entropy(prob_Tc[clust])
        term_entropies.append(cluster_entropy)
        print(clust, cluster_entropy)
    print()

    # Number of shared tags between clusters
    tag_profiles = np.zeros((num_cluster, len(all_tags)))
    for clust in xrange(num_cluster):
        for tag in all_tags:
            tag_profiles[clust][tag] = prob_Tc[clust][tag]

    distances = pairwise_kernels(tag_profiles)
    # the self-similarity of a cluster is not of interest
    np.fill_diagonal(distances, 0)

    plt.imshow(distances, cmap="bone_r", interpolation="nearest")
    ax = plt.gca()
    ticks = np.arange(0, num_cluster)
    plt.xticks(ticks)
    plt.yticks(np.arange(0, num_cluster))
    plt.colorbar()
    plt.title("Confusion Matrix for Cluster Similarities")
    plt.ylabel("ClusterID")
    plt.xlabel("ClusterID")

    # write each cluster's entropy on the diagonal
    for i in xrange(num_cluster):
        ax.annotate("%.3f" % term_entropies[i], xy=(i, i),
                    horizontalalignment="center",
                    verticalalignment="center")
    plt.show()

    print("Mean difference")
    entropy_series = []
    mean_similarity_series = []
    for clust in xrange(num_cluster):
        mean_similarity = np.mean(distances[clust])
        entropy_series.append(term_entropies[clust])
        mean_similarity_series.append(mean_similarity)
        print(clust, term_entropies[clust], mean_similarity)

    from scipy.stats import pearsonr
    print("R2 ", pearsonr(entropy_series, mean_similarity_series))
proportion_train=0.75, random_state=random_state) except KeyError: raise ValueError("Wrong dataset name!") print "X_train", X_train.shape print "X_test", X_test.shape class_distrib = "random" if opts.regression else "balanced" components = select_components(X_train, y_train, n_components=opts.n_components, class_distrib=class_distrib) print "Computing linear kernels..." linear_train = pairwise_kernels(X_train, components, metric="linear") linear_test = pairwise_kernels(X_test, components, metric="linear") print "Computing rbf kernels..." rbf_train = pairwise_kernels(X_train, components, metric="rbf", gamma=opts.gamma) rbf_test = pairwise_kernels(X_test, components, metric="rbf", gamma=opts.gamma) print "Computing polynomial kernels..." poly_train = pairwise_kernels(X_train, components, metric="poly", degree=opts.degree) poly_test = pairwise_kernels(X_test, components, metric="poly", degree=opts.degree) n_components = components.shape[0]
def setData(self, X):
    """Store the data and its RBF Gram matrix (gamma `self.gamma_`)."""
    self.X_ = X
    gram = metrics.pairwise_kernels(self.X_, metric='rbf', gamma=self.gamma_)
    self.gram_ = gram
def MMD_single_modality(data_b6, data_btbr, modality='Structural', iterations=100000, plot=True):
    """
    Process the data with the following approach:
    Embedding + RBF_kernel + KTST

    Parameters:
    -----------
    data_b6, data_btbr : the two samples to compare (stacked row-wise)
    modality : label used in the printed messages and the plot legend
    iterations : number of draws for the null distribution
    plot : when true, show the similarity matrix and the null histogram

    Return:
    ----------
    The visible code prints MMD^2_u, the maximum of the null distribution,
    the p-value and the distance to the null mean in standard deviations;
    it contains no return statement.
    """
    print('Analyzing %s data' % (modality))

    # Concatenating the data
    vectors = np.vstack((data_b6, data_btbr))
    n_b6, n_btbr = len(data_b6), len(data_btbr)

    # RBF kernel with the squared median pairwise distance as bandwidth
    median_distance = np.median(pairwise_distances(vectors, metric='euclidean'))
    sigma2 = median_distance**2
    k_matrix = pairwise_kernels(vectors, metric='rbf', gamma=1.0/sigma2)

    if plot:
        plot_similarity_matrix(k_matrix)

    # Computing the MMD
    mmd2u = MMD2u(k_matrix, n_b6, n_btbr)
    print("MMD^2_u = %s" % mmd2u)

    # Computing the null-distribution
    mmd2u_null = compute_null_distribution(k_matrix, n_b6, n_btbr, iterations,
                                           seed=123, verbose=False)
    print(np.max(mmd2u_null))

    # Computing the p-value
    resolution = 1.0/iterations
    exceed_fraction = (mmd2u_null > mmd2u).sum() / float(iterations)
    p_value = max(resolution, exceed_fraction)
    print("p-value ~= %s \t (resolution : %s)" % (p_value, resolution))

    n_stds = (mmd2u - np.mean(mmd2u_null))/np.std(mmd2u_null)
    print('Number of stds from MMD^2_u to mean value of null distribution: %s' % (n_stds))

    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        prob, bins, patches = plt.hist(mmd2u_null, bins=50, normed=True)
        # mark the observed statistic on top of the null histogram
        ax.plot(mmd2u, prob.max()/30, 'w*', markersize=15,
                markeredgecolor='k', markeredgewidth=2,
                label="$%s MMD^2_u = %s$" % (modality, mmd2u))

        ax.annotate('p-value: %s' % (p_value),
                    xy=(float(mmd2u), prob.max()/9.), xycoords='data',
                    xytext=(-105, 30), textcoords='offset points',
                    bbox=dict(boxstyle="round", fc="1."),
                    arrowprops=dict(arrowstyle="->",
                                    connectionstyle="angle,angleA=0,angleB=90,rad=10"),
                    )
        plt.xlabel('$MMD^2_u$')
        plt.ylabel('$p(MMD^2_u)$')
        plt.legend(numpoints=1)

    print('')