Example #1
def test_kmp_precomputed_dictionary():
    n_samples = mult_dense.shape[0]
    cv = ShuffleSplit(n_samples,
                      n_iterations=1,
                      test_fraction=0.2,
                      random_state=0)
    train, test = list(cv)[0]
    X_train, y_train = mult_dense[train], mult_target[train]
    X_test, y_test = mult_dense[test], mult_target[test]

    components = select_components(X_train, y_train,
                                   n_components=0.3,
                                   random_state=0)
    K_train = pairwise_kernels(X_train, components)

    kmp = KMPClassifier(metric="precomputed")
    kmp.fit(K_train, y_train)
    y_pred = kmp.predict(K_train)
    acc = np.mean(y_pred == y_train)
    assert_true(acc >= 0.75)

    K_test = pairwise_kernels(X_test, components)
    y_pred = kmp.predict(K_test)

    acc = np.mean(y_pred == y_test)
    assert_true(acc >= 0.63)
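The precomputed-kernel pattern used in this test carries over to any scikit-learn estimator that accepts `kernel='precomputed'`. A minimal sketch with `SVC` on made-up data (the data and `gamma` here are illustrative, not from the test above):

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_train, y_train = rng.randn(40, 5), rng.randint(0, 2, 40)
X_test = rng.randn(10, 5)

# Gram matrix between training points, then test-vs-train.
K_train = pairwise_kernels(X_train, X_train, metric="rbf", gamma=0.1)
K_test = pairwise_kernels(X_test, X_train, metric="rbf", gamma=0.1)

clf = SVC(kernel="precomputed").fit(K_train, y_train)
y_pred = clf.predict(K_test)  # K_test must have shape (n_test, n_train)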
Example #2
def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", **kwargs):
    """
    This function computes the kernel matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).

    Parameters:    
        X (list of n numpy arrays of shape (numx2)): first list of persistence diagrams. 
        Y (list of m numpy arrays of shape (numx2)): second list of persistence diagrams (optional). If None, pairwise kernel values are computed from the first list only.
        kernel: kernel to use. It can be either a string ("sliced_wasserstein", "persistence_scale_space", "persistence_weighted_gaussian", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs. If it is a function, make sure that it is symmetric.
        **kwargs: optional keyword parameters. Any further parameters are passed directly to the kernel function. See the docs of the various kernel classes in this module.

    Returns: 
        numpy array of shape (nxm): kernel matrix.
    """    
    XX = np.reshape(np.arange(len(X)), [-1,1])
    YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1,1])
    if kernel == "sliced_wasserstein":
        return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"]) / kwargs["bandwidth"])
    elif kernel == "persistence_fisher":
        return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="persistence_fisher", kernel_approx=kwargs["kernel_approx"], bandwidth=kwargs["bandwidth"]) / kwargs["bandwidth_fisher"])
    elif kernel == "persistence_scale_space":
        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_scale_space_kernel, X, Y, **kwargs))
    elif kernel == "persistence_weighted_gaussian":
        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_weighted_gaussian_kernel, X, Y, **kwargs))
    else:
        # `kernel` is the callable passed in; the other branches show that
        # _sklearn_wrapper also needs X and Y.
        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(kernel, X, Y, **kwargs))
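A hedged usage sketch for the function above, assuming it and its module (e.g. gudhi.representations) are importable. As the code shows, the "sliced_wasserstein" branch reads both num_directions and bandwidth from **kwargs, so both must be passed:

import numpy as np

# Two toy persistence diagrams as (birth, death) arrays.
D1 = np.array([[0.0, 1.0], [0.2, 0.5]])
D2 = np.array([[0.1, 0.9]])

K = pairwise_persistence_diagram_kernels([D1, D2],
                                         kernel="sliced_wasserstein",
                                         num_directions=10,
                                         bandwidth=1.0)
print(K.shape)  # (2, 2)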
Example #3
    def eval(self, X):
        """Evaluate the kernel density estimation

        Parameters
        ----------
        X : array_like
            array of points at which to evaluate the KDE.  Shape is
            (n_points, n_dim), where n_dim matches the dimension of
            the training points.

        Returns
        -------
        dens : ndarray
            array of shape (n_points,) giving the density at each point.
            The density will be normalized for metric='gaussian' or
            metric='tophat', and will be unnormalized otherwise.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError('X must be two-dimensional')

        if X.shape[1] != self.X_.shape[1]:
            raise ValueError('dimensions of X do not match training dimension')

        if self.metric == 'gaussian':
            # wrangle gaussian into scikit-learn's 'rbf' kernel
            gamma = 0.5 / self.h / self.h
            D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
            # normalize: (2*pi*h^2)^(d/2) for a d-dimensional Gaussian
            D /= (2 * np.pi * self.h ** 2) ** (X.shape[1] / 2.0)
            dens = D.sum(1)

        elif self.metric == 'tophat':
            # use Ball Tree to efficiently count neighbors
            bt = BallTree(self.X_)
            counts = bt.query_radius(X, self.h,
                                     count_only=True)
            dens = counts / n_volume(self.h, X.shape[1])

        elif self.metric == 'exponential':
            D = pairwise_distances(X, self.X_)
            dens = np.exp(-abs(D) / self.h)
            dens = dens.sum(1)
            dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

        elif self.metric == 'quadratic':
            D = pairwise_distances(X, self.X_)
            dens = (1 - (D / self.h) ** 2)
            dens[D > self.h] = 0
            dens = dens.sum(1)
            dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

        else:
            D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
            dens = D.sum(1)

        return dens
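The 'gaussian' branch relies on scikit-learn's rbf kernel being exp(-gamma * ||x - y||^2), so gamma = 1/(2*h^2) reproduces a Gaussian of bandwidth h. A quick self-contained check of that mapping:

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

rng = np.random.RandomState(0)
x, y = rng.randn(1, 3), rng.randn(1, 3)
h = 0.7
gamma = 0.5 / h / h  # same formula as in eval() above
lhs = pairwise_kernels(x, y, metric='rbf', gamma=gamma)[0, 0]
rhs = np.exp(-np.sum((x - y) ** 2) / (2 * h ** 2))
assert np.allclose(lhs, rhs)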
Example #5
def kernel_two_sample_test(X, Y, kernel_function='rbf', iterations=10000,
                           verbose=False, random_state=None, **kwargs):
    """Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Note that extra parameters captured by **kwargs will be passed to
    pairwise_kernels() as kernel parameters. E.g. if
    kernel_two_sample_test(..., kernel_function='rbf', gamma=0.1),
    then this will result in getting the kernel through
    kernel_function(metric='rbf', gamma=0.1).
    """
    m = len(X)
    n = len(Y)
    XY = np.vstack([X, Y])
    K = pairwise_kernels(XY, metric=kernel_function, **kwargs)
    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    mmd2u_null = compute_null_distribution(K, m, n, iterations,
                                           verbose=verbose,
                                           random_state=random_state)
    p_value = max(1.0/iterations, (mmd2u_null > mmd2u).sum() /
                  float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value, 1.0/iterations))

    return mmd2u, mmd2u_null, p_value
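A hypothetical call, assuming MMD2u and compute_null_distribution from the same module are available; the data and gamma are made up:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 2)
Y = rng.randn(60, 2) + 0.5  # shifted sample

mmd2u, mmd2u_null, p_value = kernel_two_sample_test(
    X, Y, kernel_function='rbf', gamma=0.5,
    iterations=1000, random_state=0)
# A small p_value suggests X and Y were drawn from different distributions.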
Example #6
def kernelTwoSampleTest(X,
                        Y,
                        kernel_function='rbf',
                        iterations=10000,
                        verbose=False,
                        **kwargs):
    """Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Note that extra parameters captured by **kwargs will be passed to
    pairwise_kernels() as kernel parameters. E.g. if
    kernel_two_sample_test(..., kernel_function='rbf', gamma=0.1),
    then this will result in getting the kernel through
    kernel_function(metric='rbf', gamma=0.1).
    """

    m = len(X)
    n = len(Y)

    X = X.numpy()
    X = X.reshape(X.shape[0], -1)
    Y = Y.numpy()
    Y = Y.reshape(Y.shape[0], -1)

    XY = np.vstack([X, Y])

    # calculate the kernel matrix given elements of both domains
    K = pairwise_kernels(XY, metric=kernel_function, **kwargs)

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    return mmd2u
Example #7
 def _get_kernel(self, view, X, Y=None):
     params = {
         "gamma": self.gamma[view],
     }
     return pairwise_kernels(
         X, Y, metric=self.kernel[view], filter_params=True, **params
     )
Example #8
def compute_rbf_kernel_matrix(X):
    """Compute the RBF kernel matrix with sigma2 as the median pairwise
    distance.
    """
    sigma2 = np.median(pairwise_distances(X, metric='euclidean'))**2
    K = pairwise_kernels(X, X, metric='rbf', gamma=1.0 / sigma2, n_jobs=-1)
    return K
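With the median heuristic above, gamma = 1/sigma2, and the resulting RBF Gram matrix has unit diagonal and entries in (0, 1]. A small sanity check, assuming the function above and its sklearn imports are in scope:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
K = compute_rbf_kernel_matrix(X)
assert np.allclose(np.diag(K), 1.0)       # k(x, x) = exp(0) = 1
assert (K > 0).all() and (K <= 1.0).all()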
Example #9
 def extract(self, A, k, W=None, H=None):
     """ Run a NMF algorithm
             Parameters
             ----------
             A : numpy.array or scipy.sparse matrix, shape (m,n)
             m : Number of features
             n : Number of samples
             k : int - target lower rank
             Returns
             -------
             (W, H, H_hat)
             W : Obtained factor matrix, shape (m,k)
             H : Obtained coefficient matrix, shape (n,k)
     """
     if W is None and H is None:
         W = np.random.rand(A.shape[0], k)
         H = np.random.rand(A.shape[1], k)
     elif W is None:
         Sol, info = nnls.nnlsm_blockpivot(H, A.T)
         W = Sol.T
     elif H is None:
         Sol, info = nnls.nnlsm_blockpivot(W.T, A)
         H = Sol.T
     H_hat = np.random.rand(A.shape[1], k)
     S = metrics.pairwise_kernels(A.T)
     norm_A = mu.norm_fro(A)
     for i in range(1, self.max_iter + 1):
         (W, H, H_hat) = self.iter_solver(A, S, W, H, H_hat, self.alpha,
                                          self.beta)
         rel_error = mu.norm_fro_err(A, W, H, norm_A) / norm_A
     return W, H, H_hat
Example #10
 def extract(self, A, k, max_iter=100, lambda_reg=0.1, alpha_reg=0.1):
     """ Run a NMF algorithm
             Parameters
             ----------
             A : numpy.array or scipy.sparse matrix, shape (m,n)
             m : Number of features
             n : Number of samples
             k : int - target lower rank
             lambda_reg : Regularization constant for GRNMF
             alpha_reg : L1 regularization constant for H matrix
             Returns
             -------
             (W, H)
             W : Obtained factor matrix, shape (m,k)
             H : Obtained coefficient matrix, shape (n,k)
     """
     W = np.random.rand(A.shape[0], k)
     H = np.random.rand(A.shape[1], k)
     S = metrics.pairwise_kernels(A.T)
      # normalize the similarity matrix between 0 and 1
      S = (S - np.min(S)) / (np.max(S) - np.min(S))
     D = np.sum(S, axis=1)
     norm_A = mu.norm_fro(A)
     for i in range(1, max_iter + 1):
         (W, H) = self.iter_solver(A, S, D, W, H, lambda_reg, alpha_reg)
         rel_error = mu.norm_fro_err(A, W, H, norm_A) / norm_A
     return W, H
Example #12
    def __generate_kernel(self):

        t0 = timer()

        if self._n_neighbors is not None:
            self.__calc_epsilon()
            k_init = np.exp(-self._dist**2 / self._epsilon)
        else:
            k_init = pairwise_kernels(self._data,
                                      metric='rbf',
                                      gamma=1 / self._epsilon,
                                      n_jobs=self._n_jobs)
        # k_init = k_init - np.eye(k_init.shape[0])  # prohibits self-transitions

        d_init = np.sum(k_init, 1)
        d_init_alpha = d_init**(-self._alpha)
        d_init_alpha_mat = d_init_alpha.reshape(-1, 1) * d_init_alpha
        self._kernel_alpha = k_init / d_init_alpha_mat
        if not np.allclose(self._kernel_alpha, self._kernel_alpha.T):
            self._kernel_alpha = (self._kernel_alpha +
                                  self._kernel_alpha.T) / 2
        self._d_alpha = np.sum(self._kernel_alpha, 1)

        t1 = timer()
        t10 = round(t1 - t0, 3)
        print('time elapsed for the computation of the kernel: {}'.format(t10))
Example #13
def compute_mmd2u(X, Y):
    m = len(X)
    n = len(Y)
    XY = np.vstack([X, Y])
    sigma2 = np.median(pairwise_distances(X, Y, metric='euclidean'))**2
    K = pairwise_kernels(XY, metric='rbf', gamma=1./sigma2)
    
    return MMD2u(K, m, n)
Example #14
def compute_metric_mmd2(X, Y):
    m = len(X)
    n = len(Y)
    sigma2 = np.median(pairwise_distances(X, Y, metric='euclidean'))**2
    XY = np.vstack([X, Y])
    K = pairwise_kernels(XY, metric='rbf', gamma=1.0 / sigma2)
    mmd2u = MMD2u(K, m, n, False)
    return mmd2u
Example #15
 def score(self,x):
     n=len(x)
     Phi=pairwise_kernels(x,self.xce,metric="rbf",gamma=1./(2*self.sigma**2))
     Phi1=tile(Phi.sum(0),(n,1))
     tmp1=Phi1.T.dot(Phi)/(n**2)
     tmp2=Phi.sum(0)/(n)
     score=self.alpha.dot(tmp1).dot(self.alpha)-self.alpha.dot(tmp2)
     return -score
Example #16
    def test_degenerate(self):
        # simple cosine similarity (we always return normalized vectors)
        sims1 = pairwise_kernels(self.query, self.index, metric='linear')
        # degenerate soft cosine should be equal to cosine
        sims2 = soft_cosine_similarities(
            self.query, self.index, np.identity(len(self.vocab)))

        self.assertTrue(np.allclose(sims1, sims2))
Example #17
    def predict(self, X):
        '''
        Returns +1 if the sample is predicted to be novel, -1 otherwise.
        '''
        ks = metrics.pairwise_kernels(X=self.X_train, Y=X, metric=self.metric)
        scores = score(self.projection, self.target_points, ks)
        prediction = np.array([1 if sc > self.threshold else -1 for sc in scores])

        return prediction
Example #18
    def fit(self, X, y, sample_weight=None):
        kernel_mat = metrics.pairwise_kernels(X, metric=self.metric)
        proj, target_points = learn(kernel_mat, y)

        self.projection = proj
        self.target_points = target_points
        self.X_train = X

        return self
Example #20
def witness_function(X, Y, grid, kernel_function='rbf', **kwargs):
    """
     This function computes the witness function. For the definition of the witness function see page 729
     in the "A Kernel Two-Sample Test" by Gretton et al. (2012)

    :param X: numpy-array
        Data, of size MxD [M is the number of data points, D is the features dimension]
    :param Y: numpy-array
        Data, of size NxD [N is the number of data points, D is the features dimension]
    :param gird: numpy-array
        Defines a grid for which the witness function is computed. It has the size PxD
        where P is the number of grid points, D is the features dimension
    :param kernel_function: string
        defines the kernel function, only used for the MMD.
        For the list of implemented kernel please consult with https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics
    :param kwargs:
        extra parameters, these are passed to `pairwise_kernels()` as kernel parameters.
        E.g., if `kernel_two_sample_test(..., kernel_function='rbf', gamma=0.1)`

    :return: numpy-array
        witness function
    """

    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            "Incompatible dimension for X and Y matrices. X and Y should have the same feature dimension,"
            ": X.shape[1] == %i while Y.shape[1] == %i." %
            (X.shape[1], Y.shape[1]))

    if X.shape[1] != grid.shape[1]:
        raise ValueError(
            "Incompatible dimension for data and grid matrices. data and grid should have the same feature dimension,"
            ": data.shape[1] == %i while grid.shape[1] == %i." %
            (X.shape[1], grid.shape[1]))

    # data and grid size
    m = len(X)
    n = len(Y)

    # compute pairwise kernels
    K_xg = pairwise_kernels(X, grid, metric=kernel_function, **kwargs)
    K_yg = pairwise_kernels(Y, grid, metric=kernel_function, **kwargs)

    return (np.sum(K_xg, axis=0) / m) - (np.sum(K_yg, axis=0) / n)
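A sketch of evaluating the witness function of two 1-D samples on a grid (illustrative data; gamma is an assumed kernel parameter):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 1)        # sample from N(0, 1)
Y = rng.randn(200, 1) + 1.0  # sample from N(1, 1)
grid = np.linspace(-4.0, 5.0, 100).reshape(-1, 1)

w = witness_function(X, Y, grid, kernel_function='rbf', gamma=0.5)
# w > 0 where X puts more mass than Y, w < 0 where Y dominates.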
Example #21
 def predict(self, X, prob=False):
     X = copy.deepcopy(X)
     K_ts_matrix = pairwise_kernels(X,
                                    self.X,
                                    metric=self.kernel,
                                    gamma=self.gamma)
     output = np.dot(K_ts_matrix, self.alpha)
     if prob is True:
         return output
     return output.argmax(axis=1)
Example #23
def example():
    import numpy as np
    from sklearn.metrics import pairwise_distances
    from sklearn.metrics import pairwise_kernels
    X = np.array([[2, 3], [3, 5], [5, 8]])
    Y = np.array([[1, 0], [2, 1]])

    print(pairwise_distances(X, Y, metric='manhattan'))
    print(pairwise_distances(X, metric='manhattan'))
    print(pairwise_kernels(X, Y, metric='linear'))
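For reference, the three calls print the following (values computed by hand: Manhattan distances, then the linear kernel X @ Y.T):

# [[ 4.  2.]
#  [ 7.  5.]
#  [12. 10.]]
# [[0. 3. 8.]
#  [3. 0. 5.]
#  [8. 5. 0.]]
# [[ 2.  7.]
#  [ 3. 11.]
#  [ 5. 18.]]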
Example #24
    def transform(self, X):
        """Apply the feature map to X."""
        X = check_array(X)

        embedded = pairwise_kernels(X,
                                    self.components_,
                                    metric=self.kernel,
                                    gamma=self.gamma)

        return np.dot(embedded, self.normalization_.T)
Example #25
 def get_cosine_sim(self, s, p, o, body):
     claim = s + p + o
     lsen = tokenize.sent_tokenize(body)
     vec = CountVectorizer(analyzer='word')
     vec.fit(lsen)
     scores = pairwise_kernels(vec.transform([claim]),
                               vec.transform(lsen),
                               metric='cosine')
     scores = scores[0].tolist()
     return max(scores)
Example #26
def test_statistics(X, Y, model='MMD', kernel_function='rbf', **kwargs):
    """
     This function performs a test statistics and return a test value. This implementation can perform
     the Kolmogorov-Smirnov test (for one-dimensional data only), Kullback-Leibler divergence and MMD.

    :param X: numpy-array
        Data, of size MxD [M is the number of data points, D is the features dimension]
    :param Y: numpy-array
        Data, of size NxD [N is the number of data points, D is the features dimension]
    :param model: string
        defines the basis model to perform two sample test ['KS', 'KL', 'MMD']
    :param kernel_function: string
        defines the kernel function, only used for the MMD.
        For the list of implemented kernel please consult with https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.kernel_metrics.html#sklearn.metrics.pairwise.kernel_metrics
    :param kwargs:
        extra parameters, these are passed to `pairwise_kernels()` as kernel parameters or `KL_divergence_estimator()`
        as the number of k. E.g., if `kernel_two_sample_test(..., kernel_function='rbf', gamma=0.1)`

    :return: float
        the test value
    """

    if model not in ['KS', 'KL', 'MMD']:
        raise ValueError(
            "The Model '%s' is not implemented, try 'KS', 'KL', or 'MMD'." %
            model)

    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            "Incompatible dimension for X and Y matrices. X and Y should have the same feature dimension,"
            ": X.shape[1] == %i while Y.shape[1] == %i." %
            (X.shape[1], Y.shape[1]))

    if model == 'KS' and X.shape[1] > 1:
        raise ValueError("The KS test can handle only one dimensional data,"
                         ": X.shape[1] == %i and Y.shape[1] == %i." %
                         (X.shape[1], Y.shape[1]))

    m = len(X)
    n = len(Y)

    # compute the test statistics according to the input model
    if model == 'MMD':
        XY = np.vstack([X, Y])
        K = pairwise_kernels(XY, metric=kernel_function, **kwargs)
        test_value = MMD2u_estimator(K, m, n)

    elif model == 'KS':
        test_value, _ = stats.ks_2samp(X.T[0], Y.T[0])

    elif model == 'KL':
        test_value = KL_divergence_estimator(X, Y, **kwargs)

    return test_value
Example #27
    def predict(self, X):
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError('X must be two-dimensional')

        if X.shape[1] != self.X.shape[1]:
            raise ValueError('dimensions of X do not match training dimension')

        if self.kernel == 'gaussian':
            # wrangle gaussian into scikit-learn's 'rbf' kernel
            h = np.asarray(self.h)
            gamma = 0.5 / h / h
            K = pairwise_kernels(X, self.X, metric='rbf', gamma=gamma)

        else:
            K = pairwise_kernels(X, self.X, metric=self.kernel, **self.kwargs)

        K /= self.dy**2

        return (K * self.y).sum(1) / K.sum(1)
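The last line is the Nadaraya-Watson estimator, y_hat(x) = sum_i K(x, x_i) y_i / sum_i K(x, x_i). A tiny hand-built check of that read-out (hypothetical numbers):

import numpy as np

y = np.array([0.0, 10.0])
K = np.array([[0.9, 0.1]])        # one query point, much closer to x_0
print((K * y).sum(1) / K.sum(1))  # [1.] = (0.9*0 + 0.1*10) / (0.9 + 0.1)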
Example #29
    def transform(self, X):
        """Project the points in X onto the fisher directions.

        Parameters
        ----------
        X : {array-like} of shape (n_samples, n_features)
            Samples to be projected onto the Fisher directions.
        """
        check_is_fitted(self)
        return pairwise_kernels(
            X, self.X_, metric=self.kernel, **self.kwds
        ) @ self.weights_
Example #30
 def _get_kernel(self, view, X, Y=None):
     if callable(self.kernel[view]):
         params = self.kernel_params[view] or {}
     else:
         params = {
             "gamma": self.gamma[view],
             "degree": self.degree[view],
             "coef0": self.coef0[view],
         }
     return pairwise_kernels(
         X, Y, metric=self.kernel[view], filter_params=True, **params
     )
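filter_params=True lets a single parameter dict serve several kernels: pairwise_kernels drops entries that the chosen metric does not accept instead of raising. A minimal illustration:

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

X = np.random.RandomState(0).randn(4, 3)
params = {"gamma": 0.1, "degree": 3, "coef0": 1}
K_rbf = pairwise_kernels(X, metric="rbf", filter_params=True, **params)    # keeps gamma only
K_poly = pairwise_kernels(X, metric="poly", filter_params=True, **params)  # keeps all three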
Example #31
 def fit(self, X):
     A = self.__adjacent_mat(X, self.n_neighbors)
     if self.kernel == 'linear':
         K = pairwise_kernels(X, metric='linear')
     elif self.kernel == 'polynomial':
         K = pairwise_kernels(X, metric='polynomial', gamma=0.05, degree=3)
     elif self.kernel == 'sigmoid':
         K = pairwise_kernels(X, metric='sigmoid', gamma=0.5)
     elif self.kernel == 'rbf':
         K = pairwise_kernels(X, metric='rbf', gamma=self.gamma)
     else:
         raise Exception('Invalid kernel')
     I = np.eye(X.shape[0])
     T = np.dot(np.transpose(A), K)
     inv = np.linalg.inv(np.dot(T, K) + self.regu_coef * I)
     C = np.dot(inv, T)
     Coef = self.thrC(C,  self.ro)
     y_pre, C_final = self.post_proC(Coef, self.n_clusters, 8, 18)
     if self.save_affinity:
         np.savez('./gcsc-kernel-affinity.npz', C=C_final, C1=0.5 * (np.abs(C) + np.abs(C.T)))
     return y_pre
Example #32
 def fit(self, X, y):
     self.X = X
     self.y = y
      if len(y.shape) != 2:
          self.classes_ = np.unique(y)
          self.n_classes_ = len(self.classes_)
          self.y = self.one2array(y, self.n_classes_)
      else:
          self.classes_ = np.arange(y.shape[1])
          self.n_classes_ = len(self.classes_)
     K_tr_matrix = pairwise_kernels(X, X, metric=self.kernel)
     self.alpha = np.dot(
         np.linalg.inv(np.eye(X.shape[0]) / self.C + K_tr_matrix), self.y)
Example #33
    def __init__(self, X, y, theta_0, theta_1):
        self.X = X
        self.size, self.dim = self.X.shape
        y = y
        ε = 1e-2
        self.theta_0 = theta_0
        self.theta_1 = theta_1
        self.c = np.sqrt(theta_1**2 + ε)
        length_scale = 1.0 / (np.sqrt(2) * self.c)
        self.kernel = RBF(length_scale=length_scale)

        K = theta_0**2 * pairwise_kernels(X, metric=self.kernel)
        self.z = np.linalg.solve(K, y)
Example #34
    def predict(self, X):
        '''
        Returns +1 if the sample is predicted to be novel, -1 otherwise.

        The threshold is selected between 0 and the minimum distance between
        two target points.
        '''
        ks = metrics.pairwise_kernels(X=self.X_train, Y=X, metric=self.metric)
        scores = score(self.projection, self.target_points, ks)
        # min_dist = self._get_pairwise_min_dist()
        prediction = np.array([1 if sc > self.threshold else -1 for sc in scores])

        return prediction
Example #35
    def predict_kernel(self, X):
        n_test = X.shape[0]
        distance_matrix = 2 - 2 * pairwise_kernels(
            X, self.X, metric='rbf', gamma=10)
        mean_vector = np.zeros(
            (n_test, self.n_classes_, X.shape[1]))  # [n_X, n_class, n_feature]

        for c in range(self.n_classes_):
            c_index = np.nonzero(self.y == self.classes_[c])
            dis_c = distance_matrix[:, c_index[0]]
            X_c = self.X[c_index]
            sorted_index = dis_c.argsort()
            nearest_neighbor_c = X_c[sorted_index][:, :self.n_neighbor, :]
            mean_vector[:, c, :] = nearest_neighbor_c.mean(axis=1)
        results = np.zeros(n_test)
        for i in range(n_test):
            dis = 2 - 2 * pairwise_kernels(X[i].reshape(1, X.shape[1]),
                                           mean_vector[i, :, :],
                                           metric='rbf',
                                           gamma=1).flatten()
            results[i] = self.classes_[np.argmin(dis)]
        return results
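The expression 2 - 2 * k(x, y) above is the squared distance in the RBF feature space: since k(x, x) = 1 for an RBF kernel, ||phi(x) - phi(y)||^2 = k(x, x) + k(y, y) - 2 k(x, y) = 2 - 2 k(x, y). A quick check that it is a valid, bounded squared distance:

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

rng = np.random.RandomState(0)
A, B = rng.randn(3, 4), rng.randn(5, 4)
D2 = 2 - 2 * pairwise_kernels(A, B, metric='rbf', gamma=10)
assert (D2 >= 0).all() and (D2 <= 2).all()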
Example #37
    def fit(self, X, y):
        """
        Fit the NearestCentroid model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array, shape = [n_samples]
            Target values (integers)
        """
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        if self.n_components > self.classes_.size - 1:
            warnings.warn(
                "n_components > classes_.size - 1."
                "Only the first classes_.size - 1 components will be valid."
            )
        self.X_ = X
        self.y_ = y

        y_onehot = OneHotEncoder().fit_transform(
            self.y_[:, np.newaxis])

        K = pairwise_kernels(
            X, X, metric=self.kernel, **self.kwds)

        m_classes = y_onehot.T @ K / y_onehot.T.sum(1)
        indices = (y_onehot @ np.arange(self.classes_.size)).astype('i')
        N = K @ (K - m_classes[indices])

        # Add value to diagonal for rank robustness
        N += eye(self.y_.size) * self.robustness_offset

        m_classes_centered = m_classes - K.mean(1)
        M = m_classes_centered.T @ m_classes_centered

        # Find weights
        w, self.weights_ = eigsh(M, self.n_components, N, which='LM')

        # Compute centers
        centroids_ = m_classes @ self.weights_

        # Train nearest centroid classifier
        self.clf_ = NearestCentroid().fit(centroids_, self.classes_)

        return self
Example #38
 def fit(self, X, y):
     # Validate input.
     X, y = check_X_y(X, y, accept_sparse=None, dtype='numeric')
     # Normalize input.
     self.n_, self.d_ = X.shape
     X, y = self._normalize_X_y(X, y)
     self.gamma_ = kernel_radius_to_gamma(self.kernel_radius, self.n_,
                                          self.d_,
                                          self.kernel_value_at_radius)
     # Train model.
     self.K_ = pairwise_kernels(X,
                                metric='rbf',
                                gamma=self.gamma_,
                                n_jobs=-1)
     return self
Example #39
def SVM_single_modality(data_b6, data_btbr, modality='Structural'):
    """
    """
    print('Analyzing %s data' % modality)
    vectors = np.vstack((data_b6, data_btbr))
    y = np.hstack((np.zeros(len(data_b6)), np.ones(len(data_btbr))))
    sigma2 = np.median(pairwise_distances(vectors, metric='euclidean'))**2
    k_matrix = pairwise_kernels(vectors, metric='rbf', gamma=1.0/sigma2)

    clf = SVC(kernel='precomputed')
    cv_scores = cross_val_score(clf, k_matrix, y,
                                cv=StratifiedKFold(n_splits=len(y) // 2))

    print('Mean accuracy: %s, std: %s' % (np.mean(cv_scores), np.std(cv_scores)))
    print('All folds scores: %s' % cv_scores)
    print('')
Example #40
    def fit(self,x):
        n=len(x)
        if self.xce is None:
            self.b=min(100,n)
            self.xce=x[permutation(n)][:self.b]
        else:
            self.b=len(self.xce)
        Phi=pairwise_kernels(x,self.xce,metric="rbf",gamma=1./(2*self.sigma**2))

        Phi1=tile(Phi.sum(0),(n,1))
        tmp1=Phi1.T.dot(Phi)/(n**2)
        tmp2=Phi.sum(0)/(n)
        self.alpha=pinv(tmp1 + self.lam*identity(self.b)).dot(tmp2)

        ppred1=maximum(Phi.dot(self.alpha),0.)
        ypred=ppred1>=0.5

        self.label=ypred
        return self
Example #41
def main():
    r"""Plot figure: Different outcomes of a Gaussian kernel approximation."""
    T = 25  # Number of curves

    cm_subsection = np.linspace(0, 1, T + 1)
    colors = [matplotlib.cm.rainbow(x) for x in cm_subsection]

    d = 1  # Dimension of the input
    N = 250  # Number of points per curves

    # Generate N data points in (-1, 1) and the exact Gram matrix
    np.random.seed(0)
    X = np.linspace(-1, 1, N).reshape((N, d))
    K = pairwise_kernels(X, metric='rbf', gamma=1. / (2. * .1 ** 2))

    # A Matrix for the decomposable kernel. Link the outputs to some mean value
    c = np.random.randn(N, 2)
    A = .5 * np.eye(2) + .5 * np.ones((2, 2))

    plt.close()
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')
    f, axes = plt.subplots(2, 2, figsize=(12, 8), sharex=True, sharey=True)

    # For each curve with different D
    for k, D in enumerate(np.logspace(0, 4, T)):
        D = int(D)
        np.random.seed(0)

        w = np.random.randn(d, D) / .1
        phiX = phi(X, w, D)
        Kt = np.dot(phiX, phiX.T)

        # Generate outputs with the exact Gram matrix
        pred = np.dot(np.dot(Kt, c), A)
        axes[0, 0].plot(X, pred[:, 0], c=colors[k], lw=.5, linestyle='-')
        axes[0, 0].set_ylabel(r'$y_1$')
        axes[0, 1].plot(X, pred[:, 1], c=colors[k], lw=.5, linestyle='-')
        axes[0, 1].set_ylabel(r'$y_2$')

        # Generate outputs with a realization of the random Gram matrix
        w = np.random.randn(d, D) / .1
        phiX = phi(X, w, D)
        Kt = np.dot(phiX, phiX.T)

        pred = np.dot(np.dot(Kt, c), A)
        axes[1, 0].plot(X, pred[:, 0], c=colors[k], lw=.5, linestyle='-')
        axes[1, 0].set_xlabel(r'$x$')
        axes[1, 0].set_ylabel(r'$y_1$')
        axes[1, 1].plot(X, pred[:, 1], c=colors[k], lw=.5, linestyle='-')
        axes[1, 1].set_xlabel(r'$x$')
        axes[1, 1].set_ylabel(r'$y_2$')

    axes[0, 0].plot(X, np.dot(np.dot(K, c), A)[:, 0], c='k', lw=.5, label='K')
    axes[0, 1].plot(X, np.dot(np.dot(K, c), A)[:, 1], c='k', lw=.5, label='K')
    axes[1, 0].plot(X, np.dot(np.dot(K, c), A)[:, 0], c='k', lw=.5, label='K')
    axes[1, 1].plot(X, np.dot(np.dot(K, c), A)[:, 1], c='k', lw=.5, label='K')

    axes[0, 0].set_title(r'$\widetilde{K}u \approx Ku$, realization 1', x=1.1)
    axes[1, 0].set_title(r'$\widetilde{K}u \approx Ku$, realization 2', x=1.1)

    for xx in axes.ravel():
        xx.legend(loc=4)

    createColorbar(1, D, f, axes)
    plt.savefig('not_Mercer.pgf', bbox_inches='tight')
print "X_train", X_train.shape
print "X_test", X_test.shape

# PCA view
print "Computing PCA..."
pca = RandomizedPCA(n_components=300)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

components_pca = select_components(X_train_pca, y_train,
                                   n_components=opts.n_components,
                                   class_distrib="balanced")

print "Computing kernels (PCA view)..."
K_pca_train = pairwise_kernels(X_train_pca, components_pca, metric="rbf",
                               gamma=0.1)
K_pca_test = pairwise_kernels(X_test_pca, components_pca, metric="rbf",
                              gamma=0.1)

# Regular view
components = select_components(X_train, y_train,
                               n_components=opts.n_components,
                               class_distrib="balanced")

print "Computing kernels (regular view)..."
K_train = pairwise_kernels(X_train, components, metric="rbf", gamma=0.1)
K_test = pairwise_kernels(X_test, components, metric="rbf", gamma=0.1)

# Combined views
n_components = components.shape[0]
n = n_components / 2
Example #43
 def predict(self,x):
     Phi=pairwise_kernels(x,self.xce,metric="rbf",gamma=1./(2*self.sigma**2))
     ppred1=maximum(Phi.dot(self.alpha),0.)
     #ppred1=exp(-Phi.dot(self.alpha))
     ypred=ppred1>=0.5
     return ypred
Example #44
def _summarize(data, vocabulary, labels_column, num_cluster):
    # Basic stats
    print("Number of songs per cluster")
    counter = Counter(labels_column)
    print(counter)
    print()

    prob_Ct, prob_Tc, prob_T = compute_probs(data, num_cluster, labels_column, counter)
    all_tags = range(len(prob_T))

    print("Top tags per cluster")
    for clust in range(num_cluster):
        print(clust, "tags with max_freq_in_cluster")
        songs_in_cluster = np.where(labels_column == clust)[0]
        for tag in top_10_frequency(data[songs_in_cluster]):
            print("\t", vocabulary[tag])
        print()

        print(clust, "tags with max_prob_p(c|t)")
        sort_func = lambda to_sort: prob_Ct[to_sort][clust]
        for tag in sorted(all_tags, key=sort_func, reverse=True)[:10]:
            print("\t", vocabulary[tag])
        print()
    print()

    print("Term entropies for each cluster")
    term_entropies = []
    for clust in range(num_cluster):
        h = entropy.entropy(prob_Tc[clust])
        term_entropies.append(h)
        print(clust, h)
    print()

    # Number of shared tags between clusters
    X = np.zeros((num_cluster, len(all_tags)))
    for clust in range(num_cluster):
        for tag in all_tags:
            X[clust][tag] = prob_Tc[clust][tag]

    distances = pairwise_kernels(X)
    for i in range(num_cluster):
        distances[i, i] = 0

    plt.imshow(distances, cmap="bone_r", interpolation="nearest")
    ax = plt.gca()
    plt.xticks(np.arange(0, num_cluster))
    plt.yticks(np.arange(0, num_cluster))
    plt.colorbar()
    plt.title("Confusion Matrix for Cluster Similarities")
    plt.ylabel("ClusterID")
    plt.xlabel("ClusterID")
    for i in range(num_cluster):
        ax.annotate("%.3f" % term_entropies[i], xy=(i, i), horizontalalignment="center", verticalalignment="center")
    plt.show()

    print("Mean difference")
    to_corr_1 = []
    to_corr_2 = []
    for clust in range(num_cluster):
        to_corr_1.append(term_entropies[clust])
        to_corr_2.append(np.mean(distances[clust]))
        print(clust, term_entropies[clust], np.mean(distances[clust]))
    from scipy.stats import pearsonr

    print("R2 ", pearsonr(to_corr_1, to_corr_2))
Example #45
                                                    proportion_train=0.75,
                                                    random_state=random_state)
except KeyError:
    raise ValueError("Wrong dataset name!")

print "X_train", X_train.shape
print "X_test", X_test.shape

class_distrib = "random" if opts.regression else "balanced"

components = select_components(X_train, y_train,
                               n_components=opts.n_components,
                               class_distrib=class_distrib)

print "Computing linear kernels..."
linear_train = pairwise_kernels(X_train, components, metric="linear")
linear_test = pairwise_kernels(X_test, components, metric="linear")

print "Computing rbf kernels..."
rbf_train = pairwise_kernels(X_train, components, metric="rbf",
                             gamma=opts.gamma)
rbf_test = pairwise_kernels(X_test, components, metric="rbf",
                            gamma=opts.gamma)

print "Computing polynomial kernels..."
poly_train = pairwise_kernels(X_train, components, metric="poly",
                              degree=opts.degree)
poly_test = pairwise_kernels(X_test, components, metric="poly",
                              degree=opts.degree)

n_components = components.shape[0]
Example #46
 def setData(self, X):
     self.X_ = X
      self.gram_ = metrics.pairwise_kernels(self.X_, metric='rbf',
                                            gamma=self.gamma_)
Example #47
def MMD_single_modality(data_b6, data_btbr, modality='Structural',
                             iterations=100000, plot=True):
    """
    Process the data with the following approach: Embedding + 
    RBF_kernel + KTST
    Parameters:
    -----------
    
    Return:
    ----------
        MMD distance, null_distribution, p-value
    """
    print 'Analyzing %s data' %(modality)
    
    #Concatenating the data
    vectors = np.vstack((data_b6, data_btbr))
    n_b6 = len(data_b6)
    n_btbr = len(data_btbr)
   
    sigma2 = np.median(pairwise_distances(vectors, metric='euclidean'))**2    
    k_matrix = pairwise_kernels(vectors, metric='rbf', gamma=1.0/sigma2)    
    
    if plot:
        plot_similarity_matrix(k_matrix)
    
    #Computing the MMD
    mmd2u = MMD2u(k_matrix, n_b6, n_btbr)
    print("MMD^2_u = %s" % mmd2u)    
    #Computing the null-distribution
        
    #Null distribution only on B6 mice
#    sigma2_b6 = np.median(pairwise_distances(vectors_cl1, metric='euclidean'))**2    
#    k_matrix_b6 = pairwise_kernels(vectors_cl1, metric='rbf', gamma=1.0/sigma2_b6)
#    mmd2u_null = compute_null_distribution(k_matrix_b6, 5, 5, iterations, seed=123, verbose=False)
  
    mmd2u_null = compute_null_distribution(k_matrix, n_b6, n_btbr, iterations, 
                                           seed=123, verbose=False)
    
    print(np.max(mmd2u_null))
    #Computing the p-value
    p_value = max(1.0/iterations, (mmd2u_null > mmd2u).sum() / float(iterations))
    print("p-value ~= %s \t (resolution : %s)" % (p_value, 1.0/iterations))    
    print('Number of stds from MMD^2_u to mean value of null distribution: %s'
          % ((mmd2u - np.mean(mmd2u_null)) / np.std(mmd2u_null)))
    
    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        prob, bins, patches = plt.hist(mmd2u_null, bins=50, density=True)
        ax.plot(mmd2u, prob.max()/30, 'w*', markersize=15, 
                markeredgecolor='k', markeredgewidth=2, 
                label="$%s MMD^2_u = %s$" % (modality, mmd2u))
    #    func_p_value = max(1.0/iterations, (functional_mmd[1] > functional_mmd[0]).sum() / float(iterations))

        ax.annotate('p-value: %s' %(p_value), 
                    xy=(float(mmd2u), prob.max()/9.),  xycoords='data',
                    xytext=(-105, 30), textcoords='offset points',
                    bbox=dict(boxstyle="round", fc="1."),
                    arrowprops=dict(arrowstyle="->",
                                    connectionstyle="angle,angleA=0,angleB=90,rad=10"),
                    )
        plt.xlabel('$MMD^2_u$')
        plt.ylabel('$p(MMD^2_u)$')
        plt.legend(numpoints=1)
#        plt.title('%s_DATA: $p$-value=%s' %(modality, p_value))
        print('')