Esempio n. 1
0
    def test_pdist(self):
        """Yield pdist checks for every metric / parameter combination.

        For each entry of ``self.scipy_metrics`` the reference distances are
        produced by the module-level ``pdist``.  For each entry of
        ``self.reduced_metrics`` the reference is additionally passed
        through the reduction function stored with that entry.

        Every case is yielded for both ``self.X1`` and ``self.spX1``: first
        against the ``squareform`` of the reference (last argument ``True``)
        and then against the untouched reference (last argument ``False``).
        """
        for metric, param_grid in self.scipy_metrics.iteritems():
            names = param_grid.keys()
            for combo in itertools.product(*param_grid.values()):
                params = dict(zip(names, combo))
                expected = pdist(self.X1, metric, **params)
                expected_sq = squareform(expected)
                dist_metric = DistanceMetric(metric, **params)
                # square-form reference first, then the raw pdist output
                for reference, flag in ((expected_sq, True),
                                        (expected, False)):
                    for data in (self.X1, self.spX1):
                        yield (self.check_pdist, metric, data,
                               dist_metric, reference, flag)

        for rmetric, (metric, reduce_func) in \
                self.reduced_metrics.iteritems():
            # the reduced metric reuses the parameter grid of its base metric
            param_grid = self.scipy_metrics[metric]
            names = param_grid.keys()
            for combo in itertools.product(*param_grid.values()):
                params = dict(zip(names, combo))
                base = pdist(self.X1, metric, **params)
                expected = reduce_func(base, **params)
                expected_sq = squareform(expected)
                dist_metric = DistanceMetric(rmetric, **params)
                for reference, flag in ((expected_sq, True),
                                        (expected, False)):
                    for data in (self.X1, self.spX1):
                        yield (self.check_pdist, rmetric, data,
                               dist_metric, reference, flag)
Esempio n. 2
0
def test_user_metric(m1=2, m2=3):
    """Check that a user-supplied callable works as a distance metric.

    Two random arrays with ``m1`` and ``m2`` rows (``DTEST`` columns each)
    are compared using a custom function; the result of
    ``DistanceMetric(f).cdist`` must agree with the module-level ``cdist``
    called with the same function.
    """
    def reversed_dot(x, y):
        # dot product of the reversed first vector with the second vector
        return np.dot(x[::-1], y)

    first = np.random.random((m1, DTEST))
    second = np.random.random((m2, DTEST))

    actual = DistanceMetric(reversed_dot).cdist(first, second)
    expected = cdist(first, second, reversed_dot)

    assert np.allclose(actual, expected)
Esempio n. 3
0
def test_user_metric(m1=2, m2=3):
    """Verify ``DistanceMetric`` accepts an arbitrary Python callable.

    Builds two random arrays of shape ``(m1, DTEST)`` and ``(m2, DTEST)``
    and asserts that ``DistanceMetric(f).cdist`` matches the module-level
    ``cdist`` evaluated with the same callable ``f``.
    """
    def user_func(x, y):
        # reverse the first vector, then take the dot product
        return np.dot(x[::-1], y)

    left = np.random.random((m1, DTEST))
    right = np.random.random((m2, DTEST))

    metric_obj = DistanceMetric(user_func)
    from_metric = metric_obj.cdist(left, right)
    from_reference = cdist(left, right, user_func)

    assert np.allclose(from_metric, from_reference)
Esempio n. 4
0
def pearson_correlation(X, Y):
    """
    Considering the rows of X and Y as vectors, compute the correlation
    between each row of X and each row of Y.

    The value returned is ``1 - D``, where ``D`` holds the pairwise
    'correlation' distances computed by ``DistanceMetric``.

    This correlation implementation is equivalent to the cosine similarity
    since the data it receives is assumed to be centered -- mean is 0. The
    correlation may be interpreted as the cosine of the angle between the two
    vectors defined by the users' preference values.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples_1, n_features]

    Y : {array-like, sparse matrix}, shape = [n_samples_2, n_features]

    Returns
    -------
    correlations : {array, sparse matrix}, shape = [n_samples_1, n_samples_2]

    Raises
    ------
    ValueError
        If X and Y do not have the same number of features.

    Examples
    --------
    >>> from crab.metrics.pairwise import pearson_correlation
    >>> X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    >>> # correlation between rows of X
    >>> pearson_correlation(X, X)
    array([[ 1., 1.],
           [ 1., 1.]])
    >>> pearson_correlation(X, [[3.0, 3.5, 1.5, 5.0, 3.5,3.0]])
    array([[ 0.39605902],
           [ 0.39605902]])
    """
    # should not need X_norm_squared because if you could precompute that as
    # well as Y, then you should just pre-compute the output and not even
    # call this function.
    from distmetrics import DistanceMetric

    X, Y = check_pairwise_arrays(X, Y)
    n_samples_X, n_features_X = X.shape
    n_samples_Y, n_features_Y = Y.shape

    if n_features_X != n_features_Y:
        raise ValueError("X and Y should have the same number of features!")

    # remember whether both arguments are the same object before converting
    same_input = X is Y
    X = np.asanyarray(X)
    Y = X if same_input else np.asanyarray(Y)

    dm = DistanceMetric(metric='correlation')
    if same_input:
        # distances among the rows of a single array
        D = dm.pdist(X, squareform=True)
    else:
        # Y must take part in the computation: rows of X against rows of Y
        D = dm.cdist(X, Y)

    return 1 - D
Esempio n. 5
0
def bench_float(m1=200, m2=200, rseed=0):
    print 79 * '_'
    print " real valued distance metrics"
    print
    np.random.seed(rseed)
    X1 = np.random.random((m1, DTEST))
    X2 = np.random.random((m2, DTEST))
    for (metric, argdict) in METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            print metric, param_info(kwargs)

            t0 = time()
            try:
                dist_metric = DistanceMetric(metric, **kwargs)
                Yc1 = dist_metric.cdist(X1, X2)
            except Exception as inst:
                print " >>>>>>>>>> error in pyDistances cdist:"
                print "           ", inst
            t1 = time()
            try:
                Yc2 = cdist(X1, X2, metric, **kwargs)
            except Exception as inst:
                print " >>>>>>>>>> error in scipy cdist:"
                print "           ", inst
            t2 = time()
            try:
                dist_metric = DistanceMetric(metric, **kwargs)
                Yp1 = dist_metric.pdist(X1)
            except Exception as inst:
                print " >>>>>>>>>> error in pyDistances pdist:"
                print "           ", inst
            t3 = time()
            try:
                Yp2 = pdist(X1, metric, **kwargs)
            except Exception as inst:
                print " >>>>>>>>>> error in scipy pdist:"
                print "           ", inst
            t4 = time()

            if not np.allclose(Yc1, Yc2):
                print " >>>>>>>>>> FAIL: cdist results don't match"
            if not np.allclose(Yp1, Yp2):
                print " >>>>>>>>>> FAIL: pdist results don't match"
            print " - pyDistances:  c: %.4f sec     p: %.4f sec" % (t1 - t0,
                                                                    t3 - t2)
            print " - scipy:        c: %.4f sec     p: %.4f sec" % (t2 - t1,
                                                                    t4 - t3)

    print ''
Esempio n. 6
0
    def _check_metrics_bool(self, k, metric, kwargs):
        """Compare BallTree neighbor distances with exhaustive distances.

        A ``BallTree`` built on ``self.Xbool`` is queried with ``self.Ybool``
        for ``k`` neighbors; the returned distances must match the ``k``
        smallest entries of each row of the full ``cdist`` matrix.
        """
        tree = BallTree(self.Xbool, metric=metric, **kwargs)
        tree_dist, _ = tree.query(self.Ybool, k=k)

        full = DistanceMetric(metric=metric, **kwargs).cdist(self.Ybool,
                                                             self.Xbool)

        # column indices of the k smallest distances in every row
        nearest = np.argsort(full, 1)[:, :k]
        rows = np.arange(self.Ybool.shape[0])[:, None]
        brute_dist = full[rows, nearest]

        # Indices are deliberately not compared: ties between nearest
        # neighbors are very common and would make the test fail, while the
        # distances are correct in either case.
        assert_array_almost_equal(tree_dist, brute_dist)
Esempio n. 7
0
    def _check_metrics_float(self, k, metric, kwargs):
        """Compare BallTree neighbor distances with exhaustive distances.

        A ``BallTree`` built on ``self.X`` is queried with ``self.X`` itself
        for ``k`` neighbors; the returned distances must match the ``k``
        smallest entries of each row of the square ``pdist`` matrix.
        """
        tree = BallTree(self.X, metric=metric, **kwargs)
        tree_dist, _ = tree.query(self.X, k=k)

        full = DistanceMetric(metric=metric, **kwargs).pdist(self.X,
                                                             squareform=True)

        # column indices of the k smallest distances in every row
        nearest = np.argsort(full, 1)[:, :k]
        rows = np.arange(self.X.shape[0])[:, None]
        brute_dist = full[rows, nearest]

        # Indices are deliberately not compared: a tie for nearest neighbor
        # could make the test fail.  The distances show whether the search
        # was successful.
        assert_array_almost_equal(tree_dist, brute_dist)
Esempio n. 8
0
def test_ball_tree_query_radius_count(n_samples=100, n_features=10):
    """Check ``BallTree.query_radius(..., count_only=True)``.

    Random points in [-1, 1) are generated, the radius is set to the mean
    of all pairwise distances, and the per-point neighbor counts reported
    by the tree are compared with counts taken from the full distance
    matrix.
    """
    points = 2 * np.random.random(size=(n_samples, n_features)) - 1

    pairwise = DistanceMetric().pdist(points, squareform=True)
    radius = np.mean(pairwise)

    tree_counts = BallTree(points).query_radius(points, radius,
                                                count_only=True)

    # number of entries per row that fall within the radius
    brute_counts = (pairwise <= radius).sum(1)

    assert_array_almost_equal(tree_counts, brute_counts)
Esempio n. 9
0
def test_ball_tree_query_radius_count(n_samples=100, n_features=10):
    """Validate the neighbor counts returned by ``BallTree.query_radius``.

    Uses random points in [-1, 1) and a radius equal to the mean pairwise
    distance; the counts from the tree (``count_only=True``) must equal the
    number of entries ``<= r`` in each row of the full distance matrix.
    """
    samples = 2 * np.random.random(size=(n_samples, n_features)) - 1

    metric_obj = DistanceMetric()
    dist_matrix = metric_obj.pdist(samples, squareform=True)
    cutoff = np.mean(dist_matrix)

    tree = BallTree(samples)
    counted_by_tree = tree.query_radius(samples, cutoff, count_only=True)

    within = dist_matrix <= cutoff
    counted_by_matrix = within.sum(1)

    assert_array_almost_equal(counted_by_tree, counted_by_matrix)
Esempio n. 10
0
    def _check_metrics_float(self, k, metric, kwargs):
        """Check k-neighbor distances from ``BallTree`` on ``self.X``.

        The tree is built on and queried with ``self.X``; its distances are
        compared with the ``k`` smallest values in each row of the square
        distance matrix produced by ``DistanceMetric.pdist``.
        """
        ball_tree = BallTree(self.X, metric=metric, **kwargs)
        found_dist, _ = ball_tree.query(self.X, k=k)

        metric_obj = DistanceMetric(metric=metric, **kwargs)
        dist_matrix = metric_obj.pdist(self.X, squareform=True)

        # pick, for every row, the k columns holding the smallest distances
        order = np.argsort(dist_matrix, 1)[:, :k]
        row_sel = np.arange(self.X.shape[0])[:, None]
        expected_dist = dist_matrix[row_sel, order]

        # The neighbor indices are not checked: with a tie for nearest
        # neighbor the test could fail.  Matching distances show that the
        # search succeeded.
        assert_array_almost_equal(found_dist, expected_dist)
Esempio n. 11
0
    def _check_metrics_bool(self, k, metric, kwargs):
        """Check k-neighbor distances from ``BallTree`` on boolean data.

        The tree is built on ``self.Xbool`` and queried with ``self.Ybool``;
        its distances are compared with the ``k`` smallest values in each
        row of the matrix produced by ``DistanceMetric.cdist``.
        """
        ball_tree = BallTree(self.Xbool, metric=metric, **kwargs)
        found_dist, _ = ball_tree.query(self.Ybool, k=k)

        metric_obj = DistanceMetric(metric=metric, **kwargs)
        dist_matrix = metric_obj.cdist(self.Ybool, self.Xbool)

        # pick, for every row, the k columns holding the smallest distances
        order = np.argsort(dist_matrix, 1)[:, :k]
        row_sel = np.arange(self.Ybool.shape[0])[:, None]
        expected_dist = dist_matrix[row_sel, order]

        # The neighbor indices are not checked: ties for nearest neighbors
        # occur very often and make the test fail.  The distances are
        # correct in either case.
        assert_array_almost_equal(found_dist, expected_dist)
Esempio n. 12
0
    def test_query_radius_count(self):
        """Check the counts from ``BallTree.query_radius(count_only=True)``.

        The radius is the mean of all pairwise distances; the counts from
        the tree must equal the number of entries ``<= r`` in each row of
        the full distance matrix.
        """
        # rescale the data as 2 * X - 1 (the original comment calls this
        # centering; that holds if self.X lies in [0, 1) -- not shown here)
        points = 2 * self.X - 1

        pairwise = DistanceMetric().pdist(points, squareform=True)
        radius = np.mean(pairwise)

        tree_counts = BallTree(points).query_radius(points, radius,
                                                    count_only=True)
        brute_counts = (pairwise <= radius).sum(1)

        assert_array_almost_equal(tree_counts, brute_counts)
Esempio n. 13
0
    def test_query_radius_count(self):
        """Validate neighbor counts returned by ``BallTree.query_radius``.

        Counts obtained with ``count_only=True`` are compared with counts
        derived from the square distance matrix, using the mean pairwise
        distance as radius.
        """
        # rescale the data as 2 * X - 1 (centers it, assuming self.X lies
        # in [0, 1) -- to be confirmed against the fixture)
        samples = 2 * self.X - 1

        metric_obj = DistanceMetric()
        dist_matrix = metric_obj.pdist(samples, squareform=True)
        cutoff = np.mean(dist_matrix)

        tree = BallTree(samples)
        counted_by_tree = tree.query_radius(samples, cutoff, count_only=True)

        within = dist_matrix <= cutoff
        counted_by_matrix = within.sum(1)

        assert_array_almost_equal(counted_by_tree, counted_by_matrix)
Esempio n. 14
0
    def test_query_radius_indices(self, n_queries=20):
        """Check the neighbor indices returned by ``BallTree.query_radius``.

        The first ``n_queries`` rows of the rescaled data are used as query
        points and the radius is the mean of their distances to all points.
        The sorted indices returned by the tree, concatenated over the
        queries, must equal the column indices of the entries ``<= r`` in
        the distance matrix.
        """
        # rescale the data as 2 * X - 1 (centers it, assuming self.X lies
        # in [0, 1) -- to be confirmed against the fixture)
        points = 2 * self.X - 1
        queries = points[:n_queries]

        dists = DistanceMetric().cdist(queries, points)
        radius = np.mean(dists)

        tree_ind = BallTree(points).query_radius(queries, radius,
                                                 return_distance=False)

        # matrix whose every row is 0, 1, ..., n_points - 1 (as floats)
        col_index = np.zeros(dists.shape) + np.arange(dists.shape[1])

        # the tree returns one index array per query; sort each, then flatten
        tree_flat = np.concatenate([np.sort(row) for row in tree_ind])
        brute_flat = col_index[dists <= radius]

        assert_array_almost_equal(tree_flat, brute_flat)
Esempio n. 15
0
    def test_query_radius_indices(self, n_queries=20):
        """Validate the indices returned by ``BallTree.query_radius``.

        Queries are the first ``n_queries`` rows of the rescaled data; the
        radius is the mean query-to-point distance.  Per-query index arrays
        from the tree are sorted and concatenated, then compared with the
        column indices selected by ``D <= r``.
        """
        # rescale the data as 2 * X - 1 (the original comment calls this
        # centering; that holds if self.X lies in [0, 1) -- not shown here)
        samples = 2 * self.X - 1
        query_rows = samples[:n_queries]

        metric_obj = DistanceMetric()
        dist_matrix = metric_obj.cdist(query_rows, samples)
        cutoff = np.mean(dist_matrix)

        tree = BallTree(samples)
        found = tree.query_radius(query_rows, cutoff, return_distance=False)

        # every row holds the column numbers 0 .. n_points - 1 (as floats)
        columns = np.zeros(dist_matrix.shape) + np.arange(
            dist_matrix.shape[1])

        sorted_found = [np.sort(entry) for entry in found]
        found_flat = np.concatenate(sorted_found)
        expected_flat = columns[dist_matrix <= cutoff]

        assert_array_almost_equal(found_flat, expected_flat)
Esempio n. 16
0
def test_pdist(m=15, rseed=0):
    """Compare DistanceMetric.pdist to scipy.spatial.distance.pdist"""
    np.random.seed(rseed)
    X = np.random.random((m, DTEST))
    for (metric, argdict) in METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.pdist(X)
            Y2 = squareform(pdist(X, metric, **kwargs))

            if not np.allclose(Y1, Y2):
                print metric, keys, vals
                print Y1[:5, :5]
                print Y2[:5, :5]
                assert np.allclose(Y1, Y2)
Esempio n. 17
0
def test_pdist(m=15, rseed=0):
    """Compare DistanceMetric.pdist to scipy.spatial.distance.pdist"""
    np.random.seed(rseed)
    X = np.random.random((m, DTEST))
    for (metric, argdict) in METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.pdist(X)
            Y2 = pdist(X, metric, **kwargs)

            if not np.allclose(Y1, Y2):
                print metric, keys, vals
                print Y1[:5, :5]
                print Y2[:5, :5]
                assert np.allclose(Y1, Y2)
Esempio n. 18
0
    def test_query_radius_distance(self):
        """Yield radius-query distance checks for ten different radii.

        A single query point close to the origin is used.  The radii are
        evenly spaced between the smallest and the largest distance from
        that point to the data, and each one is handed to
        ``self._check_query_radius_distance``.
        """
        # rescale the data as 2 * X - 1 (centers it, assuming self.X lies
        # in [0, 1) -- to be confirmed against the fixture)
        points = 2 * self.X - 1

        # a query point near the origin: the first sample scaled by 0.01
        origin_pt = 0.01 * points[:1]

        tol = 1E-15  # roundoff error can cause test to fail
        tree = BallTree(points, leaf_size=5)

        # reference distances from the query point to every sample, ascending
        reference = DistanceMetric().cdist(origin_pt, points)[0]
        reference.sort()

        radii = np.linspace(reference[0], reference[-1], 10)
        for radius in radii:
            yield (self._check_query_radius_distance,
                   points, tree, origin_pt, reference, radius, tol)
Esempio n. 19
0
    def test_query_radius_distance(self):
        """Generate ``_check_query_radius_distance`` cases over ten radii.

        The query point is the first rescaled sample multiplied by 0.01.
        Radii span, in ten evenly spaced steps, the range between the
        smallest and the largest distance from that point to the data.
        """
        # rescale the data as 2 * X - 1 (the original comment calls this
        # centering; that holds if self.X lies in [0, 1) -- not shown here)
        samples = 2 * self.X - 1

        # choose a query point near the origin
        near_origin = 0.01 * samples[:1]

        roundoff = 1E-15  # roundoff error can cause test to fail
        ball_tree = BallTree(samples, leaf_size=5)

        # sorted reference distances from the query point to all samples
        metric_obj = DistanceMetric()
        sorted_dist = metric_obj.cdist(near_origin, samples)[0]
        sorted_dist.sort()

        smallest, largest = sorted_dist[0], sorted_dist[-1]
        for radius in np.linspace(smallest, largest, 10):
            yield (self._check_query_radius_distance, samples, ball_tree,
                   near_origin, sorted_dist, radius, roundoff)
Esempio n. 20
0
def test_ball_tree_query_radius_indices(n_samples=100, n_features=10):
    """Check the neighbor indices returned by ``BallTree.query_radius``.

    The first ten random points serve as queries and the radius is the mean
    of their distances to all points.  For each query, the sorted indices
    from the tree must equal the sorted indices where the distance row is
    ``<= r``.
    """
    points = 2 * np.random.random(size=(n_samples, n_features)) - 1
    queries = points[:10]

    dists = DistanceMetric().cdist(queries, points)
    radius = np.mean(dists)

    tree_ind = BallTree(points).query_radius(queries, radius,
                                             return_distance=False)

    for i in range(10):
        from_tree = np.sort(tree_ind[i])
        from_matrix = np.sort(np.where(dists[i] <= radius)[0])

        assert_array_almost_equal(from_tree, from_matrix)
Esempio n. 21
0
def test_ball_tree_query_radius_indices(n_samples=100, n_features=10):
    """Validate per-query indices from ``BallTree.query_radius``.

    Queries are the first ten of the random points; the radius is the mean
    query-to-point distance.  Each query's indices from the tree are
    compared, after sorting, with the positions where its distance row is
    within the radius.
    """
    samples = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_rows = samples[:10]

    metric_obj = DistanceMetric()
    dist_matrix = metric_obj.cdist(query_rows, samples)
    cutoff = np.mean(dist_matrix)

    tree = BallTree(samples)
    found = tree.query_radius(query_rows, cutoff, return_distance=False)

    for query_no in range(10):
        within = dist_matrix[query_no] <= cutoff
        expected = np.sort(np.where(within)[0])
        actual = np.sort(found[query_no])

        assert_array_almost_equal(actual, expected)
Esempio n. 22
0
def test_cdist_bool(m1=15, m2=20, rseed=0):
    """Compare DistanceMetric.cdist to scipy.spatial.distance.cdist"""
    np.random.seed(rseed)
    X1 = (np.random.random((m1, DTEST)) > 0.5).astype(float)
    X2 = (np.random.random((m2, DTEST)) > 0.5).astype(float)
    for (metric, argdict) in BOOL_METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.cdist(X1, X2)
            Y2 = cdist(X1, X2, metric, **kwargs)

            if not np.allclose(Y1, Y2):
                print metric, keys, vals
                print Y1[:5, :5]
                print Y2[:5, :5]
                assert np.allclose(Y1, Y2)
Esempio n. 23
0
def test_cdist_bool(m1=15, m2=20, rseed=0):
    """Compare DistanceMetric.cdist to scipy.spatial.distance.cdist"""
    np.random.seed(rseed)
    X1 = (np.random.random((m1, DTEST)) > 0.5).astype(float)
    X2 = (np.random.random((m2, DTEST)) > 0.5).astype(float)
    for (metric, argdict) in BOOL_METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.cdist(X1, X2)
            Y2 = cdist(X1, X2, metric, **kwargs)

            if not np.allclose(Y1, Y2):
                print metric, keys, vals
                print Y1[:5, :5]
                print Y2[:5, :5]
                assert np.allclose(Y1, Y2)
Esempio n. 24
0
def bench_cdist_bool(m1=100, m2=100, rseed=0):
    np.random.seed(rseed)
    X1 = (np.random.random((m1, DTEST)) > 0.5).astype(float)
    X2 = (np.random.random((m2, DTEST)) > 0.5).astype(float)
    for (metric, argdict) in BOOL_METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))

            t0 = time()
            dist_metric = DistanceMetric(metric, **kwargs)
            Y1 = dist_metric.cdist(X1, X2)
            t1 = time()
            Y2 = cdist(X1, X2, metric, **kwargs)
            t2 = time()

            print metric, print_params(kwargs)
            if not np.allclose(Y1, Y2):
                print " >>>>>>>>>>>>>>>>>>>> FAIL: results don't match"
            print " - pyDistances: %.2g sec" % (t1 - t0)
            print " - scipy:       %.2g sec" % (t2 - t1)
Esempio n. 25
0
def bench_cdist_bool(m1=100, m2=100, rseed=0):
    np.random.seed(rseed)
    X1 = (np.random.random((m1, DTEST)) > 0.5).astype(float)
    X2 = (np.random.random((m2, DTEST)) > 0.5).astype(float)
    for (metric, argdict) in BOOL_METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))

            t0 = time()
            dist_metric = DistanceMetric(metric, **kwargs)
            Y1 = dist_metric.cdist(X1, X2)
            t1 = time()
            Y2 = cdist(X1, X2, metric, **kwargs)
            t2 = time()

            print metric, print_params(kwargs)
            if not np.allclose(Y1, Y2):
                print " >>>>>>>>>>>>>>>>>>>> FAIL: results don't match"
            print " - pyDistances: %.2g sec" % (t1 - t0)
            print " - scipy:       %.2g sec" % (t2 - t1)
Esempio n. 26
0
    def __init__(self, X, leaf_size=20, metric='euclidean', **kwargs):
        """Store the data and build the tree.

        Parameters
        ----------
        X : array-like
            Data to index; converted with ``np.asarray`` and stored as
            ``self.X``.
        leaf_size : int
            Passed on to the root ``Node``.
        metric : str or callable
            Passed to ``DistanceMetric`` together with ``kwargs``.
        **kwargs
            Extra arguments for ``DistanceMetric``.
        """
        self.X = np.asarray(X)
        self.leaf_size = leaf_size
        self.metric = metric
        self.kwargs = kwargs

        # create the distance metric
        self.dm = DistanceMetric(metric, **kwargs)

        # build the tree
        # Use the converted array: the raw argument may be a list or other
        # array-like without a ``shape`` attribute.
        self.indices = np.arange(self.X.shape[0], dtype=int)
        self.head_node = Node(self.X, self.indices,
                              self.leaf_size, self.dm)
Esempio n. 27
0
    def test_cdist(self):
        """Yield cdist checks for every metric / parameter combination.

        For each entry of ``self.scipy_metrics`` the reference distances are
        produced by the module-level ``cdist``.  For each entry of
        ``self.reduced_metrics`` the reference is additionally passed
        through the reduction function stored with that entry.  Every case
        is yielded for all four pairings of (``self.X1``, ``self.spX1``)
        with (``self.X2``, ``self.spX2``).
        """
        for metric, param_grid in self.scipy_metrics.iteritems():
            names = param_grid.keys()
            for combo in itertools.product(*param_grid.values()):
                params = dict(zip(names, combo))
                expected = cdist(self.X1, self.X2, metric, **params)
                dist_metric = DistanceMetric(metric, **params)
                # product() walks the pairs in the same order as two
                # nested loops would
                pairs = itertools.product((self.X1, self.spX1),
                                          (self.X2, self.spX2))
                for left, right in pairs:
                    yield (self.check_cdist, metric, left, right,
                           dist_metric, expected)

        for rmetric, (metric, reduce_func) in \
                self.reduced_metrics.iteritems():
            # the reduced metric reuses the parameter grid of its base metric
            param_grid = self.scipy_metrics[metric]
            names = param_grid.keys()
            for combo in itertools.product(*param_grid.values()):
                params = dict(zip(names, combo))
                base = cdist(self.X1, self.X2, metric, **params)
                expected = reduce_func(base, **params)
                dist_metric = DistanceMetric(rmetric, **params)
                pairs = itertools.product((self.X1, self.spX1),
                                          (self.X2, self.spX2))
                for left, right in pairs:
                    yield (self.check_cdist, rmetric, left, right,
                           dist_metric, expected)
Esempio n. 28
0
def bench_float(m1=100, m2=100, rseed=0):
    print 79 * '_'
    print " real valued distance metrics"
    print
    np.random.seed(rseed)
    X1 = np.random.random((m1, DTEST))
    X2 = np.random.random((m2, DTEST))
    for (metric, argdict) in METRIC_DICT.iteritems():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            print metric, param_info(kwargs)

            t0 = time()
            try:
                dist_metric = DistanceMetric(metric, **kwargs)
                Yc1 = dist_metric.cdist(X1, X2)
            except Exception as inst:
                print " >>>>>>>>>> error in pyDistances cdist:"
                print "           ", inst
            t1 = time()
            try:
                Yc2 = cdist(X1, X2, metric, **kwargs)
            except Exception as inst:
                print " >>>>>>>>>> error in scipy cdist:"
                print "           ", inst
            t2 = time()
            try:
                dist_metric = DistanceMetric(metric, **kwargs)
                Yp1 = dist_metric.pdist(X1)
            except Exception as inst:
                print " >>>>>>>>>> error in pyDistances pdist:"
                print "           ", inst
            t3 = time()
            try:
                Yp2 = pdist(X1, metric, **kwargs)
            except Exception as inst:
                print " >>>>>>>>>> error in scipy pdist:"
                print "           ", inst
            t4 = time()

            if not np.allclose(Yc1, Yc2):
                print " >>>>>>>>>> FAIL: cdist results don't match"
            if not np.allclose(Yp1, Yp2):
                print " >>>>>>>>>> FAIL: pdist results don't match"
            print " - pyDistances:  c: %.2g sec     p: %.2g sec" % (t1 - t0,
                                                                    t3 - t2)
            print " - scipy:        c: %.2g sec     p: %.2g sec" % (t2 - t1,
                                                                    t4 - t3)

    print ''
Esempio n. 29
0
def test_pdist_squareform(m=10, d=3, rseed=0):
    """Check that the ``squareform`` flag of ``pdist`` is consistent.

    Applying the module-level ``squareform`` to
    ``pdist(X, squareform=False)`` must give the same matrix as
    ``pdist(X, squareform=True)``.

    Parameters
    ----------
    m, d : int
        Shape ``(m, d)`` of the random test data.
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    # seed the generator so that the ``rseed`` argument actually takes
    # effect and the test data is reproducible
    np.random.seed(rseed)
    X = np.random.random((m, d))
    dist_metric = DistanceMetric()
    Y1 = squareform(dist_metric.pdist(X, squareform=False))
    Y2 = dist_metric.pdist(X, squareform=True)
    assert np.allclose(Y1, Y2)
Esempio n. 30
0
def test_pdist_squareform(m=10, d=3, rseed=0):
    """Verify both output layouts of ``DistanceMetric.pdist`` agree.

    The result of ``pdist(X, squareform=False)``, expanded with the
    module-level ``squareform``, is compared with the result of
    ``pdist(X, squareform=True)``.

    Parameters
    ----------
    m, d : int
        Shape ``(m, d)`` of the random test data.
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    # ``rseed`` was previously accepted but never used; seeding here makes
    # the random input reproducible
    np.random.seed(rseed)
    X = np.random.random((m, d))
    dist_metric = DistanceMetric()
    Y1 = squareform(dist_metric.pdist(X, squareform=False))
    Y2 = dist_metric.pdist(X, squareform=True)
    assert np.allclose(Y1, Y2)