Example #1
def _pairwise_callable(X, Y, metric, **kwds):
    """Handle the callable case for pairwise_{distances,kernels}
    """
    try:
        X, Y = check_pairwise_arrays(X, Y)
    except ValueError:
        X, Y = check_pairwise_arrays(X, Y, dtype=object)  # try not to convert

    if X is Y:
        # Only calculate metric for upper triangle
        out = np.zeros((X.shape[0], Y.shape[0]), dtype='float')
        iterator = itertools.combinations(range(X.shape[0]), 2)
        for i, j in iterator:
            out[i, j] = metric(X[i], Y[j], **kwds)

        # Make symmetric
        # NB: out += out.T will produce incorrect results
        out = out + out.T

        # Calculate diagonal
        # NB: nonzero diagonals are allowed for both metrics and kernels
        for i in range(X.shape[0]):
            x = X[i]
            out[i, i] = metric(x, x, **kwds)

    else:
        # Calculate all cells
        out = np.empty((X.shape[0], Y.shape[0]), dtype='float')
        iterator = itertools.product(range(X.shape[0]), range(Y.shape[0]))
        for i, j in iterator:
            out[i, j] = metric(X[i], Y[j], **kwds)

    return out
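
As a usage sketch (assuming NumPy and scikit-learn are installed; the toy metric below is made up for illustration), a callable metric ends up routed through this helper via the public pairwise_distances API:

import numpy as np
from sklearn.metrics import pairwise_distances

def absolute_sum(u, v):
    # toy callable metric: sum of absolute componentwise differences
    return np.abs(u - v).sum()

X = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
D = pairwise_distances(X, metric=absolute_sum)  # dispatched to the callable path
print(D.shape)              # (3, 3)
print(np.allclose(D, D.T))  # True: symmetric, with a zero diagonal
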
Example #2
def histogram_intersection_kernel(X, Y = None, alpha = None, beta = None):
    """
    Source: https://github.com/kuantkid/scikit-learn/commit/16c82d8f2fe763df7bfee9bbcc40016fb84affcf
    Author: kuantkid
    Date: Nov 20, 2012

    Compute the histogram intersection kernel (min kernel)
    between X and Y::

        K(x, y) = \\sum_i^n min(|x_i|^\\alpha, |y_i|^\\beta)

    Parameters
    ----------
    X : array of shape (n_samples_1, n_features)
    Y : array of shape (n_samples_2, n_features)
    alpha : float, optional exponent applied to |X|
    beta : float, optional exponent applied to |Y|
    Returns
    -------
    Gram matrix : array of shape (n_samples_1, n_samples_2)
    """
    (X, Y,) = pairwise.check_pairwise_arrays(X, Y)
    if alpha is not None:
        X = np.abs(X) ** alpha
    if beta is not None:
        Y = np.abs(Y) ** beta
    (n_samples_1, n_features,) = X.shape
    (n_samples_2, _,) = Y.shape
    K = np.zeros(shape=(n_samples_1, n_samples_2), dtype=np.float64)
    for i in range(n_samples_1):
        K[i] = np.sum(np.minimum(X[i], Y), axis=1)

    return K
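
A minimal usage sketch, assuming the function above is in scope together with its numpy and sklearn.metrics.pairwise imports (the data below is made up):

import numpy as np

X = np.array([[1.0, 2.0, 0.5],
              [0.0, 1.0, 3.0]])
Y = np.array([[2.0, 1.0, 1.0]])

K = histogram_intersection_kernel(X, Y)                        # plain min kernel
K_ab = histogram_intersection_kernel(X, Y, alpha=2, beta=0.5)  # exponentiated variant
print(K.shape)  # (2, 1); K[0, 0] == min(1,2) + min(2,1) + min(0.5,1) == 2.5
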
def test_check_dense_matrices():
    # Ensure that pairwise array check works for dense matrices.
    # Check that if XB is None, XB is returned as reference to XA
    XA = np.resize(np.arange(40), (5, 8))
    XA_checked, XB_checked = check_pairwise_arrays(XA, None)
    assert XA_checked is XB_checked
    assert_array_equal(XA, XA_checked)
Example #4
def GeneralizedNormalKernel(X, Y=None, gamma = None, beta = 1):
    """Compute the generalized normal kernel between X and Y.
    The generalized normal kernel is defined as::
        K(x, y) = exp(-gamma ||x-y||_1^beta)
    for each pair of rows x in X and y in Y.
    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float
    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """

    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    if beta == 1:
        K = -gamma * manhattan_distances(X, Y)
    else:
        K = -gamma * manhattan_distances(X, Y) ** beta
    np.exp(K, K)    # exponentiate K in-place
    return K
def trimmedrbf_kernel(X, Y=None, gamma=None, robust_gamma = None):
    """
    Compute the rbf (gaussian) kernel between X and Y::

        K(x, y) = exp(-gamma ||x-y||**2)

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = euclidean_distances(X, Y, squared=True)
    print(K)
    print("Shape kernel" + str(np.where(np.sqrt(K) > robust_gamma)[0].shape))
    K[np.where(np.sqrt(K) > robust_gamma)] = robust_gamma**2
    
    K *= -gamma
    np.exp(K, K)    # exponentiate K in-place
    return K
Example #6
def MaternKernel(X, Y=None, gamma = None, p = 0):
    """Compute the generalized normal kernel between X and Y.
    The generalized normal kernel is defined as::
        K(x, y) = exp(-gamma ||x-y||_1^beta)
    for each pair of rows x in X and y in Y.
    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float
    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    assert(p == int(p))

    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    r = manhattan_distances(X, Y)
    if p == 0:
        K = -gamma * r
        np.exp(K, K)    # exponentiate K in-place
    if p == 1:
        K = -gamma * r * math.sqrt(3)
        np.exp(K, K)    # exponentiate K in-place
        K *= (1+gamma * r * math.sqrt(3))
    if p == 2:
        K = -gamma * r * math.sqrt(5)
        np.exp(K, K)    # exponentiate K in-place
        K *= (1+gamma * r * math.sqrt(5) + 5./3. * (r*gamma)**2)
    return K
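
A short sketch of how the three supported smoothness levels might be called, assuming the function above plus its numpy/math/scikit-learn imports are available (and the p == 2 fix above):

import numpy as np

X = np.random.RandomState(0).rand(4, 3)
K_12 = MaternKernel(X, p=0)  # exponential kernel (Matern nu = 1/2)
K_32 = MaternKernel(X, p=1)  # Matern nu = 3/2
K_52 = MaternKernel(X, p=2)  # Matern nu = 5/2
print(K_12.shape, K_32.shape, K_52.shape)  # (4, 4) each, with ones on the diagonal
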
Example #7
def poisson_kernel(X, Y=None, gamma=None, Sigma_inv = None):
    """
    Compute the poisson kernel between X and Y::
        K(x, y) = exp(-gamma ||x-mu||^2/mu)
        mu = centroid of X (=X if X.shape[0] == 1)
    for each pair of rows x in X and y in Y.
    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float
    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    if Sigma_inv is None:
        raise ValueError('Missing Sigma_inv')
    
    v = X - Y
    K = -0.5 * gamma * np.sqrt(v.dot(Sigma_inv).dot(v.T))
    np.exp(K, K)    # exponentiate K in-place
    return K
Example #8
def test_check_sparse_arrays():
    """ Ensures that checks return valid sparse matrices. """
    rng = np.random.RandomState(0)
    XA = rng.random_sample((5, 4))
    XA_sparse = csr_matrix(XA)
    XB = rng.random_sample((5, 4))
    XB_sparse = csr_matrix(XB)
    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse)

    # compare their difference because testing csr matrices for
    # equality with '==' does not work as expected.
    assert_true(abs(XA_sparse - XA_checked).nnz == 0)
    assert_true(abs(XB_sparse - XB_checked).nnz == 0)

    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XA_sparse)
    assert_true(abs(XA_sparse - XB_checked).nnz == 0)
    assert_true(abs(XA_sparse - XA_checked).nnz == 0)
def test_check_XB_returned():
    """ Ensure that if XA and XB are given correctly, they return as equal."""
    # Check that if XB is not None, it is returned equal.
    # Note that the second dimension of XB is the same as XA.
    XA = np.resize(np.arange(40), (5, 8))
    XB = np.resize(np.arange(32), (4, 8))
    XA_checked, XB_checked = check_pairwise_arrays(XA, XB)
    assert_array_equal(XA, XA_checked)
    assert_array_equal(XB, XB_checked)
def test_check_tuple_input():
    # Ensures that checks return valid tuples.
    rng = np.random.RandomState(0)
    XA = rng.random_sample((5, 4))
    XA_tuples = tuplify(XA)
    XB = rng.random_sample((5, 4))
    XB_tuples = tuplify(XB)
    XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples)
    assert_array_equal(XA_tuples, XA_checked)
    assert_array_equal(XB_tuples, XB_checked)
Example #11
def test_check_sparse_arrays():
    """ Ensures that checks return valid sparse matrices. """
    rng = np.random.RandomState(0)
    XA = rng.random_sample((5, 4))
    XA_sparse = csr_matrix(XA)
    XB = rng.random_sample((5, 4))
    XB_sparse = csr_matrix(XB)
    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse)
    assert_equal(XA_sparse, XA_checked)
    assert_equal(XB_sparse, XB_checked)
Example #12
def test_check_sparse_arrays():
    # Ensures that checks return valid sparse matrices.
    rng = np.random.RandomState(0)
    XA = rng.random_sample((5, 4))
    XA_sparse = csr_matrix(XA)
    XB = rng.random_sample((5, 4))
    XB_sparse = csr_matrix(XB)
    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse)
    # compare their difference because testing csr matrices for
    # equality with '==' does not work as expected.
    assert_true(issparse(XA_checked))
    assert_equal(abs(XA_sparse - XA_checked).sum(), 0)
    assert_true(issparse(XB_checked))
    assert_equal(abs(XB_sparse - XB_checked).sum(), 0)

    XA_checked, XA_2_checked = check_pairwise_arrays(XA_sparse, XA_sparse)
    assert_true(issparse(XA_checked))
    assert_equal(abs(XA_sparse - XA_checked).sum(), 0)
    assert_true(issparse(XA_2_checked))
    assert_equal(abs(XA_2_checked - XA_checked).sum(), 0)
def roll_invariant_euclidean_distances(X, Y=None, squared=False):
    """
    Considering the rows of X (and Y=X) as vectors, compute the
    distance matrix between each pair of vectors.
    The distance is the minimum of the euclidean distance over all rolls:

        dist(x, y) = min_\tau(||x(t) - y(t - \tau)||^2)

    Parameters
    ----------
    X : array, shape (n_samples_1, n_features)

    Y : array, shape (n_samples_2, n_features)

    squared : boolean
        Not used. Only for API compatibility.

    Returns
    -------
    distances : array, shape (n_samples_1, n_samples_2)

    """
    X = np.atleast_2d(X)
    if Y is not None:
        Y = np.atleast_2d(Y)
    X, Y = check_pairwise_arrays(X, Y)
    n_samples_1, n_features = X.shape
    n_samples_2, n_features = Y.shape

    X_norm = np.power(np.linalg.norm(X, axis=1), 2)
    Y_norm = np.power(np.linalg.norm(Y, axis=1), 2)

    # n_pads = 0
    # n_fft = next_fast_len(n_features + n_pads)
    n_fft = n_features  # not fast but otherwise the distance is wrong
    X_hat = rfft(X, n_fft, axis=1)
    Y_hat = rfft(Y, n_fft, axis=1).conj()

    # # broadcasting can have a huge memory cost
    # XY_hat = X_hat[:, None, :] * Y_hat[None, :, :]
    # XY = irfft(XY_hat, n_fft, axis=2).max(axis=2)
    # distances = X_norm[:, None] + Y_norm[None, :] - 2 * XY

    distances = np.zeros((n_samples_1, n_samples_2))
    if n_samples_2 > 1:
        print('RIED on %s samples, this might be slow' % (distances.shape, ))
    for ii in range(n_samples_1):
        for jj in range(n_samples_2):
            XY = irfft(X_hat[ii] * Y_hat[jj], n_fft).max()
            distances[ii, jj] = X_norm[ii] + Y_norm[jj] - 2 * XY

    distances += 1e-12

    return distances
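
A quick check of the roll invariance, assuming the function above is in scope along with check_pairwise_arrays and complex-output FFT helpers for rfft/irfft (e.g. numpy.fft.rfft/irfft):

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(1, 16)
x_rolled = np.roll(x, 5, axis=1)  # same signal, circularly shifted

d = roll_invariant_euclidean_distances(x, x_rolled)
print(d[0, 0])  # ~0 (up to the 1e-12 offset): the shift is absorbed by the minimum over rolls
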
Example #14
def test_check_tuple_input():
    """ Ensures that checks return valid tuples. """
    rng = np.random.RandomState(0)
    XA = rng.random_sample((5, 4))
    XA_tuples = tuplify(XA)
    XB = rng.random_sample((5, 4))
    XB_tuples = tuplify(XB)
    XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples)
    assert_array_equal(XA_tuples, XA_checked)
    assert_array_equal(XB_tuples, XB_checked)

    def my_rbf(X, Y=None):
        """ K(x, y) = exp(-gamma ||x-y||^2)
        Returns kernel_matrix : array of shape (n_samples_X, n_samples_Y) """
        X, Y = check_pairwise_arrays(X, Y)

        #if gamma is None:
        #    gamma = 1.0 / X.shape[1]

        K = euclidean_distances(X, Y, squared=True)
        K *= -gamma
        exp(K, K)  # exponentiate K in-place
        return K
def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1):
    """ K(X, Y) = (gamma <X, Y> + coef0)^degree   
    Returns Gram matrix : array of shape (n_samples_1, n_samples_2) """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = safe_sparse_dot(X, Y.T, dense_output=True)
    K *= gamma
    K += coef0
    K **= degree
    return K
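
This matches scikit-learn's stock polynomial_kernel; a quick sanity check of the formula (gamma defaults to 1 / n_features):

import numpy as np
from sklearn.metrics.pairwise import polynomial_kernel

X = np.arange(12.0).reshape(4, 3)
K = polynomial_kernel(X, degree=3, coef0=1)
print(np.allclose(K, (X @ X.T / X.shape[1] + 1) ** 3))  # True
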
Example #17
def anova_kernel(X, Y=None, gamma=None, p=1):
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1. / X.shape[1]

    diff = X[:, None, :] - Y[None, :, :]
    diff **= 2
    diff *= -gamma
    np.exp(diff, out=diff)
    K = diff.sum(axis=2)
    K **= p
    return K
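
A small usage sketch for the ANOVA kernel above, assuming it is in scope with numpy and check_pairwise_arrays (toy data):

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])
Y = np.array([[1.0, 0.0]])
K = anova_kernel(X, Y, gamma=0.5, p=2)
# K[0, 0] = (exp(-0.5 * 0) + exp(-0.5 * 4)) ** 2
print(np.allclose(K[0, 0], (1 + np.exp(-2.0)) ** 2))  # True
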
def test_check_preserve_type():
    # Ensures that type float32 is preserved.
    XA = np.resize(np.arange(40), (5, 8)).astype(np.float32)
    XB = np.resize(np.arange(40), (5, 8)).astype(np.float32)

    XA_checked, XB_checked = check_pairwise_arrays(XA, None)
    assert_equal(XA_checked.dtype, np.float32)

    # both float32
    XA_checked, XB_checked = check_pairwise_arrays(XA, XB)
    assert_equal(XA_checked.dtype, np.float32)
    assert_equal(XB_checked.dtype, np.float32)

    # mismatched A
    XA_checked, XB_checked = check_pairwise_arrays(XA.astype(np.float64), XB)
    assert_equal(XA_checked.dtype, np.float64)
    assert_equal(XB_checked.dtype, np.float64)

    # mismatched B
    XA_checked, XB_checked = check_pairwise_arrays(XA, XB.astype(np.float64))
    assert_equal(XA_checked.dtype, np.float64)
    assert_equal(XB_checked.dtype, np.float64)
def test_check_XB_returned():
    # Ensure that if XA and XB are given correctly, they return as equal.
    # Check that if XB is not None, it is returned equal.
    # Note that the second dimension of XB is the same as XA.
    XA = np.resize(np.arange(40), (5, 8))
    XB = np.resize(np.arange(32), (4, 8))
    XA_checked, XB_checked = check_pairwise_arrays(XA, XB)
    assert_array_equal(XA, XA_checked)
    assert_array_equal(XB, XB_checked)

    XB = np.resize(np.arange(40), (5, 8))
    XA_checked, XB_checked = check_paired_arrays(XA, XB)
    assert_array_equal(XA, XA_checked)
    assert_array_equal(XB, XB_checked)
Example #21
def cross_distances(X, y=None):
    """
    Computes the nonzero componentwise cross-distances between the vectors
    in X or between the vectors in X and the vectors in y.

    Parameters
    ----------

    X: np.ndarray [n_obs, dim]
            - The input variables.
    y: np.ndarray [n_y, dim]
            - The training data.
    Returns
    -------

    D: np.ndarray [n_obs * (n_obs - 1) / 2, dim]
            - The cross-distances between the vectors in X.

    ij: np.ndarray [n_obs * (n_obs - 1) / 2, 2]
            - The indices i and j of the vectors in X associated to the cross-
              distances in D.
    """
    n_samples, n_features = X.shape
    if y is None:
        n_nonzero_cross_dist = n_samples * (n_samples - 1) // 2
        ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int32)
        D = np.zeros((n_nonzero_cross_dist, n_features))
        ll_1 = 0

        for k in range(n_samples - 1):
            ll_0 = ll_1
            ll_1 = ll_0 + n_samples - k - 1
            ij[ll_0:ll_1, 0] = k
            ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples)
            D[ll_0:ll_1] = X[k] - X[(k + 1):n_samples]
    else:
        n_y, n_features = y.shape
        X, y = check_pairwise_arrays(X, y)
        n_nonzero_cross_dist = n_samples * n_y
        ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int32)
        D = np.zeros((n_nonzero_cross_dist, n_features))
        for k in range(n_nonzero_cross_dist):
            xk = k // n_y
            yk = k % n_y
            D[k] = X[xk] - y[yk]
            ij[k, 0] = xk
            ij[k, 1] = yk

    return D, ij.astype(np.int32)
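
A small example of the returned layout, assuming cross_distances above is in scope with numpy and check_pairwise_arrays:

import numpy as np

X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
D, ij = cross_distances(X)
print(D.shape, ij.shape)  # (3, 2) (3, 2): one row per upper-triangular pair
print(ij)                 # [[0 1] [0 2] [1 2]]
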
def cosine_similarity(X, Y=None, dense_output=True, shrink=0):
    """Compute cosine similarity between samples in X and Y.

    Cosine similarity, or the cosine kernel, computes similarity as the
    normalized dot product of X and Y:

        K(X, Y) = <X, Y> / (||X||*||Y||)

    On L2-normalized data, this function is equivalent to linear_kernel.

    Read more in the :ref:`User Guide <cosine_similarity>`.

    Parameters
    ----------
    X : ndarray or sparse array, shape: (n_samples_X, n_features)
        Input data.

    Y : ndarray or sparse array, shape: (n_samples_Y, n_features)
        Input data. If ``None``, the output will be the pairwise
        similarities between all samples in ``X``.

    dense_output : boolean (optional), default True
        Whether to return dense output even when the input is sparse. If
        ``False``, the output is sparse if both input arrays are sparse.

        .. versionadded:: 0.17
           parameter ``dense_output`` for dense output.

    Returns
    -------
    kernel matrix : array
        An array with shape (n_samples_X, n_samples_Y).
    """
    # to avoid recursive import

    X, Y = check_pairwise_arrays(X, Y)

    X_normalized = normalize(X, copy=True, shrink=shrink)
    if X is Y:
        Y_normalized = X_normalized
    else:
        Y_normalized = normalize(Y, copy=True, shrink=shrink)

    K = safe_sparse_dot(X_normalized,
                        Y_normalized.T,
                        dense_output=dense_output)

    return K
Example #23
    def _cosine_distances_prenorm(self, X, Y):
        """
        Return cosine distances based on prenormalized vectors.

        It allows for much faster computation of cosine distances.
        """
        if not self.prenorm:
            raise Exception('Vectors must be prenormalized!')
        if Y is None:
            Y = X
        X, Y = smp.check_pairwise_arrays(X, Y)
        sims = X.dot(Y.T)

        if scipy.sparse.issparse(sims):
            sims = sims.todense()

        return 1 - sims
def andrews_kernel(X, Y=None, c=None):
    """
    Compute the Andrews kernel between X and Y::

        K(x, y) = 1/2 * pi^2 * cos(pi * ||x-y|| / c)   if ||x-y|| / c <= 1
        K(x, y) = -1/2 * pi^2                          otherwise

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    c : float

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """

    X, Y = check_pairwise_arrays(X, Y)

    if c is None:
        c = X.mean() * 50
    print(c)
    K = euclidean_distances(X, Y, squared=False)

    print(K)
    print("Shape kernel" + str(np.where(K / c > 1)[0].shape))
    print(K / c)
    print("power")
    print(np.pi * (K / c))
    print("power 2")
    print(np.cos(np.pi * (K / c)))
    print("power 3")
    print(1.0 / 2.0 * np.power(np.pi, 2) * np.cos(np.pi * (K / c)))
    gramMatrix = np.zeros(K.shape)
    gramMatrix[np.where(K / c <= 1)] = (1.0 / 2.0 * np.power(np.pi, 2) *
                                        np.cos(np.pi *
                                               (K / c)))[np.where(K / c <= 1)]
    gramMatrix[np.where(K / c > 1)] = -1.0 / 2.0 * np.power(np.pi, 2)
    print "Shape kernel" + str(np.where(K / c > 1)[0].shape)
    return gramMatrix
    def K_linear(X, Y=None):
        print('**K_linear**')
        X, Y = check_pairwise_arrays(X, Y)
        K = zeros((X.shape[0], Y.shape[0]))

        if X is Y:  # fit -> the Gram matrix K is symmetric
            for i, x in enumerate(X):
                for j, z in enumerate(Y):
                    K[i][j] = K[j][i] = x @ z
                    if j > i:
                        break
        else:  # predict -> K is NOT symmetric; it is K<x, x_i>
            return X @ Y.T
            #for i,x in enumerate(X):
            #   for j,z in enumerate(Y):
            #      K[i][j] = x@z

        return K
Example #26
    def _cosine_distances_prenorm(self, X, Y):
        """
        Return cosine distances based on prenormalized vectors.

        It allows for much faster computation of cosine distances.
        """
        if not self.prenorm:
            raise Exception(
                'Vectors must be prenormalized!')
        if Y is None:
            Y = X
        X, Y = smp.check_pairwise_arrays(X, Y)
        sims = X.dot(Y.T)

        if scipy.sparse.issparse(sims):
            sims = sims.todense()

        return 1 - sims
Example #27
def andrews_kernel(X, Y=None, c=None):
    """
    Compute the Andrews kernel between X and Y::

        K(x, y) = 1/2 * pi^2 * cos(pi * ||x-y|| / c)   if ||x-y|| / c <= 1
        K(x, y) = -1/2 * pi^2                          otherwise

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    c : float

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """

    X, Y = check_pairwise_arrays(X, Y)

    if c is None:
        c = X.mean()*50
    print(c)
    K = euclidean_distances(X, Y, squared=False)

    print(K)
    print("Shape kernel" + str(np.where(K/c > 1)[0].shape))
    print(K/c)
    print("power")
    print(np.pi*(K/c))
    print("power 2")
    print(np.cos(np.pi*(K/c)))
    print("power 3")
    print(1.0/2.0*np.power(np.pi, 2)*np.cos(np.pi*(K/c)))
    gramMatrix = np.zeros(K.shape)
    gramMatrix[np.where(K/c <= 1)] = (1.0/2.0*np.power(np.pi, 2)*np.cos(np.pi*(K/c)))[np.where(K/c <= 1)]
    gramMatrix[np.where(K/c > 1)] = -1.0/2.0*np.power(np.pi, 2)
    print("Shape kernel" + str(np.where(K/c > 1)[0].shape))
    return gramMatrix
Example #28
def sqrtcos_similarity(X, Y=None, dense_output=True):
    """Compute cosine similarity between samples in X and Y.
        Cosine similarity, or the cosine kernel, computes similarity as the
        normalized dot product of X and Y:
            K(X, Y) = <X, Y> / (||X||*||Y||)
        On L2-normalized data, this function is equivalent to linear_kernel.
        Read more in the :ref:`User Guide <cosine_similarity>`.
        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
            Input data.
        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features), \
                default=None
            Input data. If ``None``, the output will be the pairwise
            similarities between all samples in ``X``.
        dense_output : bool, default=True
            Whether to return dense output even when the input is sparse. If
            ``False``, the output is sparse if both input arrays are sparse.
            .. versionadded:: 0.17
               parameter ``dense_output`` for dense output.
        Returns
        -------
        kernel matrix : ndarray of shape (n_samples_X, n_samples_Y)
        """
    # to avoid recursive import
    X = np.sqrt(X)
    #Y = np.sqrt(Y)
    X, Y = pairwise.check_pairwise_arrays(X, Y)

    X_normalized = pairwise.normalize(X, copy=True, norm='l2')
    if X is Y:
        Y_normalized = X_normalized
    else:
        Y_normalized = pairwise.normalize(Y, copy=True, norm='l2')

    K = pairwise.safe_sparse_dot(X_normalized,
                                 Y_normalized.T,
                                 dense_output=dense_output)

    return K
def sigmoid_kernel(X, Y=None, gamma=None, coef0=1):
    """ K(X, Y) = tanh(gamma <X, Y> + coef0)
    Returns Gram matrix : array of shape (n_samples_1, n_samples_2)"""
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = safe_sparse_dot(X, Y.T, dense_output=True)
    K *= gamma
    K += coef0
    tanh(K, K)  # compute tanh in-place
    return K
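
This is the same formula as scikit-learn's sigmoid_kernel; a quick check with the default gamma = 1 / n_features and coef0 = 1:

import numpy as np
from sklearn.metrics.pairwise import sigmoid_kernel

X = np.arange(6.0).reshape(2, 3)
K = sigmoid_kernel(X)
print(np.allclose(K, np.tanh(X @ X.T / 3 + 1)))  # True
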


#def test_K_sHerm():
#"""PLOTS THE S-HERMITE POLYNOMIALS TO VALIDATE THE SCALED H(x_i, n)"""
#plt.figure()
#t = np.arange(-1, 1.1, .1)  # test range
#for i in range(1, 6):
#plt.plot(t, H(t, i)*2**(-i), label='Degree ' + str(i))
#plt.legend()
#plt.title("Orthogonal s-Hermite Polynomials")
Example #30
def first_periodic_kernel(X, Y=None, gamma=None, period=None):
    # TODO: Add mathematical form of the kernel in the docstring
    """Compute the first periodic kernel between *X* and *Y*.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float, default None
        If None, default to 1.0 / n_samples_X

    period : float, default None
        If None, default to 2 * pi.

        This parameter should not be left at its default,
        as a wrong estimate leads to a poor learning score.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 0.8

    if period is None:
        period = 2. * pi

    a = -log(gamma) / period
    b = 2 * pi / period
    c = sqrt(pi / a) * (exp(- b ** 2 / (4 * a)) + 1)
    K = euclidean_distances(X, Y, squared=True)

    # TODO: Optimize to avoid temporary?
    return exp(-a * K) * (1 + cos(b * sqrt(K))) / c
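
A usage sketch, assuming the function above is in scope together with the numpy names it uses (pi, log, sqrt, exp, cos) and the sklearn helpers euclidean_distances and check_pairwise_arrays:

import numpy as np

X = np.linspace(0.0, 2 * np.pi, 5).reshape(-1, 1)
K = first_periodic_kernel(X, gamma=0.5, period=2 * np.pi)
print(K.shape)                           # (5, 5)
print(np.allclose(np.diag(K), K.max()))  # True: zero distance gives the maximum value 2 / c
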
Example #31
def first_periodic_kernel(X, Y=None, gamma=None, period=None):
    # TODO: Add mathematical form of the kernel in the docstring
    """Compute the first periodic kernel between *X* and *Y*.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float, default None
        If None, default to 1.0 / n_samples_X

    period : float, default None
        If None, default to 2 * pi.

        This parameter should not be left at its default,
        as a wrong estimate leads to a poor learning score.

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 0.8

    if period is None:
        period = 2. * pi

    a = -log(gamma) / period
    b = 2 * pi / period
    c = sqrt(pi / a) * (exp(-b**2 / (4 * a)) + 1)
    K = euclidean_distances(X, Y, squared=True)

    # TODO: Optimize to avoid temporary?
    return exp(-a * K) * (1 + cos(b * sqrt(K))) / c
Example #32
def weighting_euclidean_distances(X, Y, weights):
    """
    X : predict values
    Y : code_book
    """
    # get distance
    X, Y = check_pairwise_arrays(X, Y)
    distances = []
    for pre_vector in X:
        dists = []
        for codeword in Y:
            d1 = sum([
                abs(codeword[i]) * weights[i] * (pre_vector[i] - codeword[i]) *
                (pre_vector[i] - codeword[i]) for i in range(len(codeword))
            ])
            dists.append(d1)
        distances.append(dists)
    distances = np.array(distances)
    distances[distances < 0] = 0
    if X is Y:
        # Ensure that distances between vectors and themselves are set to 0.0.
        # This may not be the case due to floating point rounding errors.
        distances.flat[::distances.shape[0] + 1] = 0.0
    return np.sqrt(distances, out=distances)
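
A usage sketch with toy ECOC-style data, assuming the function above is in scope with numpy and check_pairwise_arrays:

import numpy as np

code_book = np.array([[1.0, -1.0, 1.0],
                      [-1.0, 1.0, 1.0]])
preds = np.array([[0.9, -0.8, 0.7]])
weights = np.array([1.0, 1.0, 1.0])

D = weighting_euclidean_distances(preds, code_book, weights)
print(D.shape)  # (1, 2): weighted distance from the prediction to each codeword
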
Example #33
def cosine_similarity(X, Y=None):
    """Compute cosine similarity between samples in X and Y.

    Cosine similarity, or the cosine kernel, computes similarity as the
    normalized dot product of X and Y:

        K(X, Y) = <X, Y> / (||X||*||Y||)

    On L2-normalized data, this function is equivalent to linear_kernel.

    Parameters
    ----------
    X : array_like, sparse matrix
        with shape (n_samples_X, n_features).

    Y : array_like, sparse matrix (optional)
        with shape (n_samples_Y, n_features).

    Returns
    -------
    kernel matrix : array_like
        An array with shape (n_samples_X, n_samples_Y).
    """
    # to avoid recursive import

    X, Y = check_pairwise_arrays(X, Y)

    X_normalized = normalize(X, copy=True)
    if X is Y:
        Y_normalized = X_normalized
    else:
        Y_normalized = normalize(Y, copy=True)

    K = linear_kernel(X_normalized, Y_normalized)

    return K
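
The stock scikit-learn implementation behaves the same way; a quick sanity check:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

X = np.array([[1.0, 0.0], [1.0, 1.0]])
print(np.round(cosine_similarity(X), 3))
# [[1.    0.707]
#  [0.707 1.   ]]
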
Example #34
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    """
    if (metric not in _VALID_METRICS
        and not callable(metric)
        and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        whom = ("`pairwise_distances`. Precomputed distance "
                                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    elif ((metric == 'cosine') and (Y is None)
          and (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_cosine_distance_dense(X)
    elif ((metric == 'correlation') and (Y is None) and
          (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_correlation_distance_dense(X)
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool
            and (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
Example #35

    XB = np.arange(32)
    assert_raises(ValueError, check_pairwise_arrays, XA, XB)
def test_check_sparse_arrays():
    """ Ensures that checks return valid sparse matrices. """
    rng = np.random.RandomState(0)
    XA = rng.random_sample((5, 4))
    XB = rng.random_sample((5, 4))
    XA_sparse = csr_matrix(XA)
    XB_sparse = csr_matrix(XB)
    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse)

    # compare their difference because testing csr matrices for
    # equality with '==' does not work as expected.
    assert_true(abs(XA_sparse - XA_checked).nnz == 0)
    assert_true(abs(XB_sparse - XB_checked).nnz == 0)

    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XA_sparse)
    assert_true(abs(XA_sparse - XA_checked).nnz == 0)
    assert_true(abs(XA_sparse - XB_checked).nnz == 0)
def tuplify(X):
    """ Turns a numpy matrix (any n-dimensional array) into tuples."""
    s = X.shape
    if len(s) > 1:
        # recursively tuplify each sub-array
        return tuple(tuplify(row) for row in X)
    # 1-d input: just return a tuple of its contents
    return tuple(r for r in X)
    def _get_kernel(self, X, Y=None, nystroem_kernel=False):
        X, Y = check_pairwise_arrays(X, Y)
        if nystroem_kernel:  ##Cannot use self.nystroem since the kernel must also be computable on the full data for prediction when Nystroem sampling is used
            if self.component_indices is None:
                rnd = check_random_state(self.random_state)
                n_samples = X.shape[0]
                # get basis vectors
                if self.n_components > n_samples:
                    # XXX should we just bail?
                    n_components = n_samples
                    warnings.warn(
                        "n_components > n_samples. This is not possible.\n"
                        "n_components was set to n_samples, which results"
                        " in inefficient evaluation of the full kernel.")
                else:
                    n_components = self.n_components
                n_components = min(n_samples, n_components)
                self.component_indices = rnd.permutation(
                    n_samples)[:n_components]
            X = X[self.component_indices].copy()
            d = euclidean_distances(X, X)
        else:
            d = euclidean_distances(X, Y)
        ##Get n_neighbors largest element to find range if not given
        if (self.theta is None):
            if (self.n_neighbors == "inf") | (
                    self.n_neighbors == np.inf
            ):  ##special case: choose theta such that it equals the average distance to the farthest neighbor
                self.n_neighbors = X.shape[0] - 1
                self.range_adjust = 1.
            if (not self.prctg_neighbors is None) & (self.n_neighbors is None):
                self.n_neighbors = int(X.shape[0] * self.prctg_neighbors)
            if not self.n_neighbors is None:
                if self.kernel == "GW":  ##Choose theta such that on average every point has n_neighbors non-zero entries
                    ds = d.flatten()
                    ds = ds[~(ds == 0)]  ##Remove diagonal
                    self.theta = bn.partition(
                        ds, d.shape[0] * self.n_neighbors -
                        1)[d.shape[0] * self.n_neighbors - 1]
                else:  ##Choose theta as average distance to n_neighbors'th nearest neighbor
                    kdt = scipy.spatial.cKDTree(X)
                    dists, neighs = kdt.query(
                        X, self.n_neighbors + 1
                    )  ##get distance to n_neighbors+1 nearest neighbors (incl. point itself)
                    self.theta = np.mean(
                        dists[:, self.n_neighbors]
                    )  ##calculate average distance to n_neighbors'th nearest neighbor (only true neighbors excl. point itself)
            if self.kernel == "rbf":
                self.theta = self.theta / (
                    self.range_adjust**0.5
                )  ##range_adjust=3 (4.6) correlation should drop to 5% (1%) at distance = theta
            if self.kernel == "laplace":
                self.theta = self.theta / self.range_adjust
            print("Chosen theta: " + str(round(self.theta, 4)))
        if self.kernel == "GW":
            d *= -1. / self.theta
            d2 = d.copy()
            d += 1.
            d[d < 0] = 0
            d *= d
            d2 *= -2
            d2 += 1
            d *= d2
            ##Above code does the same as below:

    #        tmp=1-d/self.theta
    #        tmp[tmp<0]=0
    #        d=tmp**2*(1+2*d/self.theta)
        if self.kernel == "rbf":
            ##np.exp(-(d/self.theta)**2)
            d *= (1. / self.theta)
            d *= -d
            np.exp(d, d)
        if self.kernel == "laplace":
            ##np.exp(-d/self.theta)
            d *= (-1. / self.theta)
            np.exp(d, d)
        if self.sparse:
            #            print("Sparsity ratio: " +str(round(float(100*np.sum(d>0))/X.shape[0]/X.shape[0],2))+"%")
            return csc_matrix(d)
        else:
            return d
Example #37
 def __call__(self, X, Y=None):
     X, Y = check_pairwise_arrays(X, Y)
     return safe_sparse_dot(X, Y.T, dense_output=True)
Example #38
 def __call__(self, X, Y=None):
     X, Y = check_pairwise_arrays(X, Y)
     gamma = (1. / (X.shape[1] * X.var()) if self.gamma == 'scale' else 1. /
              X.shape[1] if self.gamma == 'auto' else self.gamma)
     return (gamma * safe_sparse_dot(X, Y.T, dense_output=True) +
             self.coef0)**self.degree
Example #39
 def __call__(self, X, Y=None):
     X, Y = check_pairwise_arrays(X, Y)
     gamma = (1. / (X.shape[1] * X.var()) if self.gamma == 'scale' else 1. /
              X.shape[1] if self.gamma == 'auto' else self.gamma)
     return np.exp(-gamma * euclidean_distances(X, Y, squared=True))
Example #40
def _prep_X_Y_for_cython(X, Y):
    X, Y = check_pairwise_arrays(X, Y)
    X, Y = X.astype(np.double, order='C'), Y.astype(np.double, order='C').T  # transposing Y here!
    res = np.zeros((X.shape[0], Y.shape[1]), dtype=X.dtype)
    return X, Y, res
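
A small shape check, assuming the helper above is in scope with numpy and check_pairwise_arrays (note that Y comes back transposed):

import numpy as np

X = np.arange(6, dtype=float).reshape(2, 3)
Y = np.arange(12, dtype=float).reshape(4, 3)
Xc, Yt, res = _prep_X_Y_for_cython(X, Y)
print(Xc.shape, Yt.shape, res.shape)  # (2, 3) (3, 4) (2, 4)
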
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            force_all_finite=True, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix
      inputs.
      ['nan_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed":
        raise ValueError("Unknown metric %s. Valid metrics are %s, or 'precomputed', "
                         "or a callable" % (metric, _VALID_METRICS))

    X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                          force_all_finite=force_all_finite)

    _patching_status = PatchingConditionsChain(
        "sklearn.metrics.pairwise_distances")
    _dal_ready = _patching_status.and_conditions([
        (metric == 'cosine' or metric == 'correlation',
            f"'{metric}' metric is not supported. "
            "Only 'cosine' and 'correlation' metrics are supported."),
        (Y is None, "Second feature array is not supported."),
        (not issparse(X), "X is sparse. Sparse input is not supported."),
        (X.dtype == np.float64,
            f"{X.dtype} X data type is not supported. Only np.float64 is supported.")
    ])
    _patching_status.write_log()
    if _dal_ready:
        if metric == 'cosine':
            return _daal4py_cosine_distance_dense(X)
        if metric == 'correlation':
            return _daal4py_correlation_distance_dense(X)
        raise ValueError(f"'{metric}' distance is wrong for daal4py.")
    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distance "
                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric,
                       force_all_finite=force_all_finite, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if dtype == bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype,
                                     force_all_finite=force_all_finite)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
Example #42
def linear_kernel(X, Y=None, dense_output=True):
    X, Y = check_pairwise_arrays(X, Y)
    return safe_sparse_dot(X, Y.T, dense_output=dense_output)
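
This is just the Gram matrix X Y^T; scikit-learn's own linear_kernel can be used to confirm:

import numpy as np
from sklearn.metrics.pairwise import linear_kernel as sk_linear_kernel

X = np.arange(6.0).reshape(2, 3)
Y = np.arange(9.0).reshape(3, 3)
print(np.allclose(sk_linear_kernel(X, Y), X @ Y.T))  # True
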
Example #43
def differences(X, Y):
    "compute the componentwise difference between X and Y"
    X, Y = check_pairwise_arrays(X, Y)
    D = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
    return D.reshape((-1, X.shape[1]))
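
A shape sketch, assuming differences above is in scope with numpy and check_pairwise_arrays:

import numpy as np

X = np.zeros((2, 3))
Y = np.ones((4, 3))
D = differences(X, Y)
print(D.shape)  # (8, 3): one row per (x, y) pair, with Y varying fastest
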
def my_linear(X, Y=None, dense_output=True):
    print('**K_linear**')
    X, Y = check_pairwise_arrays(X, Y)
    #return X @ Y.T
    return safe_sparse_dot(X, Y.T, dense_output=dense_output)
Example #45
def differences(X, Y):
    X, Y = check_pairwise_arrays(X, Y)
    D = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
    return D.reshape((-1, X.shape[1]))
Example #46
def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    """
    if (metric not in _VALID_METRICS and
            not callable(metric) and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        return X
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        if n_jobs == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)