def test_pdist(self):
    """Yield pdist checks for every metric / parameter combination.

    For each entry of ``self.scipy_metrics`` the reference distances come
    straight from ``scipy.spatial.distance.pdist``.  For each entry of
    ``self.reduced_metrics`` the reference is obtained by passing scipy's
    result for the associated metric through the stored ``func``.

    Every combination yields four checks, in this order: ``self.X1`` and
    ``self.spX1`` against the square-form reference with the final flag
    ``True``, then the same two inputs against the condensed reference with
    the flag ``False``.
    """
    # NOTE(review): spX1 is presumably a sparse copy of X1 -- confirm in setup.
    inputs = (self.X1, self.spX1)

    # ``items()`` rather than ``iteritems()`` so this runs on Python 2 and 3.
    for metric, argdict in self.scipy_metrics.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = pdist(self.X1, metric, **kwargs)
            Dsq_true = squareform(D_true)
            dm = DistanceMetric(metric, **kwargs)
            for expected, flag in ((Dsq_true, True), (D_true, False)):
                for X in inputs:
                    yield self.check_pdist, metric, X, dm, expected, flag

    for rmetric, (metric, func) in self.reduced_metrics.items():
        # Parameter grid is the one registered for the underlying metric.
        argdict = self.scipy_metrics[metric]
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = func(pdist(self.X1, metric, **kwargs), **kwargs)
            Dsq_true = squareform(D_true)
            dm = DistanceMetric(rmetric, **kwargs)
            for expected, flag in ((Dsq_true, True), (D_true, False)):
                for X in inputs:
                    yield self.check_pdist, rmetric, X, dm, expected, flag
def test_user_metric(m1=2, m2=3):
    """Check that a user-supplied callable gives the same cdist as scipy.

    Two random matrices with ``m1`` and ``m2`` rows (``DTEST`` columns each)
    are compared using the same Python callable through both
    ``DistanceMetric.cdist`` and ``scipy.spatial.distance.cdist``.
    """
    left = np.random.random((m1, DTEST))
    right = np.random.random((m2, DTEST))

    # Arbitrary pairwise function: dot product of reversed x with y.
    user_func = lambda x, y: np.dot(x[::-1], y)

    ours = DistanceMetric(user_func).cdist(left, right)
    reference = cdist(left, right, user_func)
    assert np.allclose(ours, reference)
def test_user_metric(m1=2, m2=3):
    """Compare DistanceMetric.cdist with scipy's cdist for a callable metric.

    ``m1`` and ``m2`` are the row counts of the two random inputs; both
    have ``DTEST`` columns.
    """
    shape_a = (m1, DTEST)
    shape_b = (m2, DTEST)
    A = np.random.random(shape_a)
    B = np.random.random(shape_b)

    # The callable only needs to be applied identically by both libraries.
    metric_func = lambda x, y: np.dot(x[::-1], y)

    metric = DistanceMetric(metric_func)
    from_metric = metric.cdist(A, B)
    from_scipy = cdist(A, B, metric_func)

    assert np.allclose(from_metric, from_scipy)
def pearson_correlation(X, Y):
    """
    Considering the rows of X and Y as vectors, compute the correlation
    matrix between each pair of vectors (one taken from X, one from Y).

    The value returned is ``1 - D`` where ``D`` is the 'correlation'
    distance computed by ``DistanceMetric``.

    This correlation implementation is equivalent to the cosine similarity
    since the data it receives is assumed to be centered -- mean is 0. The
    correlation may be interpreted as the cosine of the angle between the
    two vectors defined by the users' preference values.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples_1, n_features]

    Y : {array-like, sparse matrix}, shape = [n_samples_2, n_features]

    Returns
    -------
    correlations : {array, sparse matrix}, shape = [n_samples_1, n_samples_2]

    Raises
    ------
    ValueError
        If X and Y do not have the same number of features.

    Examples
    --------
    >>> from crab.metrics.pairwise import pearson_correlation
    >>> X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0],[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    >>> # correlation between rows of X
    >>> pearson_correlation(X, X)
    array([[ 1.,  1.],
           [ 1.,  1.]])
    >>> pearson_correlation(X, [[3.0, 3.5, 1.5, 5.0, 3.5,3.0]])
    array([[ 0.39605902],
           [ 0.39605902]])
    """
    # should not need X_norm_squared because if you could precompute that as
    # well as Y, then you should just pre-compute the output and not even
    # call this function.
    from distmetrics import DistanceMetric

    X, Y = check_pairwise_arrays(X, Y)

    _, n_features_X = X.shape
    _, n_features_Y = Y.shape
    if n_features_X != n_features_Y:
        # ValueError is a subclass of Exception, so existing handlers that
        # catch Exception keep working.
        raise ValueError("X and Y should have the same number of features!")

    # Remember whether both names refer to the same object before converting,
    # so the identity survives np.asanyarray.
    same_input = X is Y
    X = np.asanyarray(X)
    Y = X if same_input else np.asanyarray(Y)

    dm = DistanceMetric(metric='correlation')
    if same_input:
        # Pairwise distances within a single set of rows.
        D = dm.pdist(X, squareform=True)
    else:
        # Previously Y was ignored here and pdist(X) was always returned,
        # giving an [n_samples_1, n_samples_1] result regardless of Y.
        D = dm.cdist(X, Y)

    return 1 - D
def bench_float(m1=200, m2=200, rseed=0):
    """Benchmark DistanceMetric against scipy for real-valued metrics.

    For every metric / parameter combination in ``METRIC_DICT`` this times
    ``cdist`` and ``pdist`` for both implementations on seeded random data,
    prints the timings, and reports when the results disagree.

    Parameters
    ----------
    m1, m2 : int
        Number of rows in the two random input matrices (``DTEST`` columns).
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    def timed(label, func):
        """Call func(); return (result, seconds), result is None on error."""
        start = time()
        try:
            result = func()
        except Exception as inst:
            # Best-effort benchmark: report the failure and carry on.
            print(" >>>>>>>>>> error in %s:" % label)
            print("  %s" % (inst,))
            result = None
        return result, time() - start

    print(79 * '_')
    print(" real valued distance metrics")
    print('')

    np.random.seed(rseed)
    X1 = np.random.random((m1, DTEST))
    X2 = np.random.random((m2, DTEST))

    # ``items()`` rather than ``iteritems()`` so this runs on Python 2 and 3.
    for metric, argdict in METRIC_DICT.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            print("%s %s" % (metric, param_info(kwargs)))

            # Construction of the DistanceMetric is part of the timed work.
            Yc1, t_c1 = timed(
                "pyDistances cdist",
                lambda: DistanceMetric(metric, **kwargs).cdist(X1, X2))
            Yc2, t_c2 = timed(
                "scipy cdist",
                lambda: cdist(X1, X2, metric, **kwargs))
            Yp1, t_p1 = timed(
                "pyDistances pdist",
                lambda: DistanceMetric(metric, **kwargs).pdist(X1))
            Yp2, t_p2 = timed(
                "scipy pdist",
                lambda: pdist(X1, metric, **kwargs))

            # Only compare when both sides produced a result; otherwise the
            # comparison would use an undefined or stale array.
            if Yc1 is not None and Yc2 is not None:
                if not np.allclose(Yc1, Yc2):
                    print(" >>>>>>>>>> FAIL: cdist results don't match")
            if Yp1 is not None and Yp2 is not None:
                if not np.allclose(Yp1, Yp2):
                    print(" >>>>>>>>>> FAIL: pdist results don't match")

            print(" - pyDistances: c: %.4f sec p: %.4f sec" % (t_c1, t_p1))
            print(" - scipy: c: %.4f sec p: %.4f sec" % (t_c2, t_p2))
    print('')
def _check_metrics_bool(self, k, metric, kwargs):
    """Compare BallTree k-neighbor distances with brute force (bool data).

    A ``BallTree`` built on ``self.Xbool`` is queried with ``self.Ybool``;
    the ``k`` distances it returns per query row are compared with the
    ``k`` smallest entries of the full ``DistanceMetric.cdist`` matrix.
    """
    tree = BallTree(self.Xbool, metric=metric, **kwargs)
    dist_bt, _ = tree.query(self.Ybool, k=k)

    brute = DistanceMetric(metric=metric, **kwargs)
    all_dists = brute.cdist(self.Ybool, self.Xbool)

    # Column indices of the k smallest distances in every row.
    nearest = np.argsort(all_dists, 1)[:, :k]
    row_ids = np.arange(self.Ybool.shape[0])[:, None]
    dist_dm = all_dists[row_ids, nearest]

    # Only distances are compared: ties between neighbors are very common
    # here and make the index order ambiguous, while the distances are the
    # same whichever tied neighbor is picked.
    assert_array_almost_equal(dist_bt, dist_dm)
def _check_metrics_float(self, k, metric, kwargs):
    """Compare BallTree k-neighbor distances with brute force (float data).

    A ``BallTree`` built on ``self.X`` is queried with ``self.X`` itself;
    the ``k`` distances it returns per row are compared with the ``k``
    smallest entries of the square ``DistanceMetric.pdist`` matrix.
    """
    tree = BallTree(self.X, metric=metric, **kwargs)
    dist_bt, _ = tree.query(self.X, k=k)

    brute = DistanceMetric(metric=metric, **kwargs)
    all_dists = brute.pdist(self.X, squareform=True)

    # Column indices of the k smallest distances in every row.
    nearest = np.argsort(all_dists, 1)[:, :k]
    row_ids = np.arange(self.X.shape[0])[:, None]
    dist_dm = all_dists[row_ids, nearest]

    # Indices are not compared: a tie for nearest neighbor could make them
    # differ.  The distances show whether the search succeeded.
    assert_array_almost_equal(dist_bt, dist_dm)
def test_ball_tree_query_radius_count(n_samples=100, n_features=10):
    """Check BallTree.query_radius(count_only=True) against brute force.

    Random points in [-1, 1) are generated, the search radius is set to the
    mean of all pairwise distances, and the per-point neighbor counts from
    the tree are compared with counts taken from the full distance matrix.
    """
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1

    pairwise = DistanceMetric().pdist(X, squareform=True)
    radius = np.mean(pairwise)

    tree_counts = BallTree(X).query_radius(X, radius, count_only=True)
    brute_counts = np.sum(pairwise <= radius, axis=1)

    assert_array_almost_equal(tree_counts, brute_counts)
def test_query_radius_count(self):
    """Check BallTree.query_radius(count_only=True) against brute force.

    The radius is the mean of all pairwise distances of the rescaled data;
    per-point counts from the tree must equal the counts read off the full
    distance matrix.
    """
    # center the data (affine map 2x - 1)
    centered = 2 * self.X - 1

    pairwise = DistanceMetric().pdist(centered, squareform=True)
    radius = np.mean(pairwise)

    tree = BallTree(centered)
    tree_counts = tree.query_radius(centered, radius, count_only=True)
    brute_counts = np.sum(pairwise <= radius, axis=1)

    assert_array_almost_equal(tree_counts, brute_counts)
def test_query_radius_indices(self, n_queries=20):
    """Check the indices returned by BallTree.query_radius.

    The first ``n_queries`` rows of the rescaled data are used as query
    points and the radius is the mean query-to-data distance.  The sorted
    neighbor indices returned by the tree, concatenated over all queries,
    must equal the column indices where the brute-force distance is within
    the radius.

    Parameters
    ----------
    n_queries : int
        Number of leading rows of the data used as query points.
    """
    # center the data (affine map 2x - 1)
    X = 2 * self.X - 1
    queries = X[:n_queries]

    dm = DistanceMetric()
    D = dm.cdist(queries, X)
    r = np.mean(D)

    bt = BallTree(X)
    ind = bt.query_radius(queries, r, return_distance=False)

    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # np.concatenate does not accept.
    ind = np.concatenate([np.sort(neighbors) for neighbors in ind])

    # Broadcasting gives a matrix whose every row is 0..n-1, so masking it
    # with ``D <= r`` lists the matching column indices row by row.
    ind2 = np.zeros(D.shape) + np.arange(D.shape[1])
    ind2 = ind2[D <= r]

    assert_array_almost_equal(ind, ind2)
def test_pdist(m=15, rseed=0):
    """Compare DistanceMetric.pdist to scipy.spatial.distance.pdist

    For each metric / parameter combination in ``METRIC_DICT`` the result of
    ``DistanceMetric.pdist`` on seeded random data is compared with the
    square form of scipy's ``pdist``.  On a mismatch the metric, its
    parameters and the top-left 5x5 corner of both matrices are printed
    before the assertion fails.

    Parameters
    ----------
    m : int
        Number of random rows (each with ``DTEST`` columns).
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    np.random.seed(rseed)
    X = np.random.random((m, DTEST))

    # ``items()`` and single-argument print() run on both Python 2 and 3.
    for metric, argdict in METRIC_DICT.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.pdist(X)
            Y2 = squareform(pdist(X, metric, **kwargs))

            matches = np.allclose(Y1, Y2)
            if not matches:
                print("%s %s %s" % (metric, keys, vals))
                print(Y1[:5, :5])
                print(Y2[:5, :5])
            assert matches
def test_pdist(m=15, rseed=0):
    """Compare DistanceMetric.pdist to scipy.spatial.distance.pdist

    For each metric / parameter combination in ``METRIC_DICT`` the result of
    ``DistanceMetric.pdist`` on seeded random data is compared directly with
    scipy's ``pdist`` output.  On a mismatch the metric, its parameters and
    the leading entries of both results are printed before the assertion
    fails.

    Parameters
    ----------
    m : int
        Number of random rows (each with ``DTEST`` columns).
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    def corner(Y):
        """Return the first five entries of Y along every axis."""
        Y = np.asarray(Y)
        return Y[(slice(None, 5),) * Y.ndim]

    np.random.seed(rseed)
    X = np.random.random((m, DTEST))

    # ``items()`` and single-argument print() run on both Python 2 and 3.
    for metric, argdict in METRIC_DICT.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.pdist(X)
            Y2 = pdist(X, metric, **kwargs)

            matches = np.allclose(Y1, Y2)
            if not matches:
                print("%s %s %s" % (metric, keys, vals))
                # scipy's pdist output is 1-D, so a fixed ``[:5, :5]`` slice
                # would raise IndexError here and hide the real mismatch.
                print(corner(Y1))
                print(corner(Y2))
            assert matches
def test_query_radius_distance(self):
    """Yield query_radius distance checks for ten radii.

    The radii are spaced evenly between the smallest and largest true
    distance from a single query point to the rescaled data.  Each radius
    yields one call to ``self._check_query_radius_distance``.
    """
    # center the data (affine map 2x - 1)
    X = 2 * self.X - 1

    # query point close to the origin: first row scaled down by 100
    query_pt = 0.01 * X[:1]

    # tolerance handed to the checker; roundoff can otherwise fail the test
    eps = 1E-15

    bt = BallTree(X, leaf_size=5)

    # reference distances from the query point, in ascending order
    reference = DistanceMetric().cdist(query_pt, X)[0]
    dist_true = np.sort(reference)

    radii = np.linspace(dist_true[0], dist_true[-1], 10)
    for r in radii:
        yield (self._check_query_radius_distance,
               X, bt, query_pt, dist_true, r, eps)
def test_ball_tree_query_radius_indices(n_samples=100, n_features=10):
    """Check the indices returned by BallTree.query_radius.

    The first ten random points are used as queries, with the radius set to
    the mean query-to-data distance.  For every query the sorted neighbor
    indices from the tree must equal the sorted indices where the
    brute-force distance is within the radius.
    """
    n_queries = 10

    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    queries = X[:n_queries]

    brute_dists = DistanceMetric().cdist(queries, X)
    radius = np.mean(brute_dists)

    tree = BallTree(X)
    neighbors = tree.query_radius(queries, radius, return_distance=False)

    for i in range(n_queries):
        from_tree = np.sort(neighbors[i])
        from_brute = np.sort(np.where(brute_dists[i] <= radius)[0])
        assert_array_almost_equal(from_tree, from_brute)
def test_cdist_bool(m1=15, m2=20, rseed=0):
    """Compare DistanceMetric.cdist to scipy.spatial.distance.cdist

    The inputs are seeded random 0/1 matrices stored as floats.  Every
    metric / parameter combination in ``BOOL_METRIC_DICT`` is checked; on a
    mismatch the metric, its parameters and the top-left 5x5 corner of both
    results are printed before the assertion fails.

    Parameters
    ----------
    m1, m2 : int
        Number of rows in the two input matrices (``DTEST`` columns each).
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    np.random.seed(rseed)
    # Threshold uniform noise at 0.5 to get 0.0 / 1.0 entries.
    X1 = (np.random.random((m1, DTEST)) > 0.5).astype(float)
    X2 = (np.random.random((m2, DTEST)) > 0.5).astype(float)

    # ``items()`` and single-argument print() run on both Python 2 and 3.
    for metric, argdict in BOOL_METRIC_DICT.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            dist_metric = DistanceMetric(metric, **kwargs)

            Y1 = dist_metric.cdist(X1, X2)
            Y2 = cdist(X1, X2, metric, **kwargs)

            matches = np.allclose(Y1, Y2)
            if not matches:
                print("%s %s %s" % (metric, keys, vals))
                print(Y1[:5, :5])
                print(Y2[:5, :5])
            assert matches
def bench_cdist_bool(m1=100, m2=100, rseed=0):
    """Benchmark DistanceMetric.cdist against scipy's cdist on 0/1 data.

    For every metric / parameter combination in ``BOOL_METRIC_DICT`` both
    implementations are timed on seeded random 0/1 matrices stored as
    floats.  The timings are printed, together with a FAIL line when the
    two results disagree.

    Parameters
    ----------
    m1, m2 : int
        Number of rows in the two input matrices (``DTEST`` columns each).
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    np.random.seed(rseed)
    # Threshold uniform noise at 0.5 to get 0.0 / 1.0 entries.
    X1 = (np.random.random((m1, DTEST)) > 0.5).astype(float)
    X2 = (np.random.random((m2, DTEST)) > 0.5).astype(float)

    # ``items()`` and single-argument print() run on both Python 2 and 3.
    for metric, argdict in BOOL_METRIC_DICT.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))

            # Construction of the DistanceMetric is part of the timed work.
            t0 = time()
            dist_metric = DistanceMetric(metric, **kwargs)
            Y1 = dist_metric.cdist(X1, X2)
            t1 = time()
            Y2 = cdist(X1, X2, metric, **kwargs)
            t2 = time()

            print("%s %s" % (metric, print_params(kwargs)))
            if not np.allclose(Y1, Y2):
                print(" >>>>>>>>>>>>>>>>>>>> FAIL: results don't match")
            print(" - pyDistances: %.2g sec" % (t1 - t0))
            print(" - scipy: %.2g sec" % (t2 - t1))
def __init__(self, X, leaf_size=20, metric='euclidean', **kwargs):
    """Store the data and settings, then build the tree.

    Parameters
    ----------
    X : array-like
        Input data; converted with ``np.asarray``.  The first axis is used
        as the number of points.
    leaf_size : int
        Passed through to ``Node``.
    metric : str or object accepted by ``DistanceMetric``
        Metric specification forwarded to ``DistanceMetric``.
    **kwargs
        Extra metric parameters forwarded to ``DistanceMetric``.
    """
    self.X = np.asarray(X)
    self.leaf_size = leaf_size
    self.metric = metric
    self.kwargs = kwargs

    # create the distance metric
    self.dm = DistanceMetric(metric, **kwargs)

    # build the tree
    # Use the converted array here: the raw argument may be a list or other
    # array-like without a ``shape`` attribute.
    self.indices = np.arange(self.X.shape[0], dtype=int)
    self.head_node = Node(self.X, self.indices, self.leaf_size, self.dm)
def test_cdist(self):
    """Yield cdist checks for every metric / parameter combination.

    For each entry of ``self.scipy_metrics`` the reference distances come
    straight from ``scipy.spatial.distance.cdist`` on ``self.X1`` and
    ``self.X2``.  For each entry of ``self.reduced_metrics`` the reference
    is obtained by passing scipy's result for the associated metric through
    the stored ``func``.

    Every combination yields one check per pairing of (``self.X1``,
    ``self.spX1``) with (``self.X2``, ``self.spX2``), the first input
    varying slowest.
    """
    # NOTE(review): spX1 / spX2 are presumably sparse copies of X1 / X2 --
    # confirm in the class setup.
    first_inputs = (self.X1, self.spX1)
    second_inputs = (self.X2, self.spX2)

    # ``items()`` rather than ``iteritems()`` so this runs on Python 2 and 3.
    for metric, argdict in self.scipy_metrics.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = cdist(self.X1, self.X2, metric, **kwargs)
            dm = DistanceMetric(metric, **kwargs)
            for X1, X2 in itertools.product(first_inputs, second_inputs):
                yield self.check_cdist, metric, X1, X2, dm, D_true

    for rmetric, (metric, func) in self.reduced_metrics.items():
        # Parameter grid is the one registered for the underlying metric.
        argdict = self.scipy_metrics[metric]
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = func(cdist(self.X1, self.X2, metric, **kwargs),
                          **kwargs)
            dm = DistanceMetric(rmetric, **kwargs)
            for X1, X2 in itertools.product(first_inputs, second_inputs):
                yield self.check_cdist, rmetric, X1, X2, dm, D_true
def bench_float(m1=100, m2=100, rseed=0):
    """Benchmark DistanceMetric against scipy for real-valued metrics.

    For every metric / parameter combination in ``METRIC_DICT`` this times
    ``cdist`` and ``pdist`` for both implementations on seeded random data,
    prints the timings, and reports when the results disagree.

    Parameters
    ----------
    m1, m2 : int
        Number of rows in the two random input matrices (``DTEST`` columns).
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    def timed(label, func):
        """Call func(); return (result, seconds), result is None on error."""
        start = time()
        try:
            result = func()
        except Exception as inst:
            # Best-effort benchmark: report the failure and carry on.
            print(" >>>>>>>>>> error in %s:" % label)
            print("  %s" % (inst,))
            result = None
        return result, time() - start

    print(79 * '_')
    print(" real valued distance metrics")
    print('')

    np.random.seed(rseed)
    X1 = np.random.random((m1, DTEST))
    X2 = np.random.random((m2, DTEST))

    # ``items()`` rather than ``iteritems()`` so this runs on Python 2 and 3.
    for metric, argdict in METRIC_DICT.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            print("%s %s" % (metric, param_info(kwargs)))

            # Construction of the DistanceMetric is part of the timed work.
            Yc1, t_c1 = timed(
                "pyDistances cdist",
                lambda: DistanceMetric(metric, **kwargs).cdist(X1, X2))
            Yc2, t_c2 = timed(
                "scipy cdist",
                lambda: cdist(X1, X2, metric, **kwargs))
            Yp1, t_p1 = timed(
                "pyDistances pdist",
                lambda: DistanceMetric(metric, **kwargs).pdist(X1))
            Yp2, t_p2 = timed(
                "scipy pdist",
                lambda: pdist(X1, metric, **kwargs))

            # Only compare when both sides produced a result; otherwise the
            # comparison would use an undefined or stale array.
            if Yc1 is not None and Yc2 is not None:
                if not np.allclose(Yc1, Yc2):
                    print(" >>>>>>>>>> FAIL: cdist results don't match")
            if Yp1 is not None and Yp2 is not None:
                if not np.allclose(Yp1, Yp2):
                    print(" >>>>>>>>>> FAIL: pdist results don't match")

            print(" - pyDistances: c: %.2g sec p: %.2g sec" % (t_c1, t_p1))
            print(" - scipy: c: %.2g sec p: %.2g sec" % (t_c2, t_p2))
    print('')
def test_pdist_squareform(m=10, d=3, rseed=0):
    """Check that pdist(squareform=True) matches squareform of the default.

    ``DistanceMetric.pdist(X, squareform=False)`` passed through scipy's
    ``squareform`` must equal ``DistanceMetric.pdist(X, squareform=True)``.

    Parameters
    ----------
    m : int
        Number of random points.
    d : int
        Number of features per point.
    rseed : int
        Seed passed to ``np.random.seed``.
    """
    # The seed argument was previously accepted but never applied, so the
    # input data differed from run to run.
    np.random.seed(rseed)
    X = np.random.random((m, d))

    dist_metric = DistanceMetric()
    Y1 = squareform(dist_metric.pdist(X, squareform=False))
    Y2 = dist_metric.pdist(X, squareform=True)

    assert np.allclose(Y1, Y2)