def test_wminkowski_minkowski_equivalence(p):
    """Check that "wminkowski" and "minkowski" agree once weights are rescaled.

    The "wminkowski" metric is built with ``w ** (1 / p)`` whereas
    "minkowski" receives ``w`` unchanged; the pairwise distances between
    ``X1`` and ``X2`` must then match.
    """
    w = rng.random_sample(d)
    # Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of
    # 'minkowski'
    rescaled_w = w ** (1 / p)
    weighted_metric = DistanceMetric.get_metric("wminkowski", p=p, w=rescaled_w)
    plain_metric = DistanceMetric.get_metric("minkowski", p=p, w=w)

    dist_weighted = weighted_metric.pairwise(X1, X2)
    dist_plain = plain_metric.pairwise(X1, X2)
    assert_array_almost_equal(dist_weighted, dist_plain)
def test_input_data_size():
    """Regression test for #6288.

    Previously, a metric requiring a particular input dimension would fail.
    The callable metric below asserts on the length of its first argument and
    its result is compared with the squared euclidean distances.
    """

    def custom_metric(x, y):
        # The callable only accepts vectors whose first dimension is 3.
        assert x.shape[0] == 3
        difference = x - y
        return np.sum(difference ** 2)

    X = check_random_state(0).rand(10, 3)

    callable_metric = DistanceMetric.get_metric("pyfunc", func=custom_metric)
    euclidean_metric = DistanceMetric.get_metric("euclidean")

    dist_callable = callable_metric.pairwise(X)
    dist_euclidean_squared = euclidean_metric.pairwise(X) ** 2
    assert_array_almost_equal(dist_callable, dist_euclidean_squared)
def _test_distrib_compute(device):
    """Compare ``CanberraMetric.compute`` with the "canberra" DistanceMetric.

    Each call of the inner helper updates the metric with locally drawn data,
    gathers that data with ``idist.all_gather`` and checks the metric result
    against the reference distance computed on the gathered arrays.
    """
    rank = idist.get_rank()
    reference = DistanceMetric.get_metric("canberra")

    def _test(metric_device):
        metric = CanberraMetric(device=torch.device(metric_device))
        # The seed is offset by the process rank.
        torch.manual_seed(10 + rank)

        local_pred = torch.randint(0, 10, size=(10,), device=device).float()
        local_true = torch.randint(0, 10, size=(10,), device=device).float()
        metric.update((local_pred, local_true))

        # gather y_pred, y
        gathered_pred = idist.all_gather(local_pred)
        gathered_true = idist.all_gather(local_true)

        np_pred = gathered_pred.cpu().numpy()
        np_true = gathered_true.cpu().numpy()

        result = metric.compute()
        expected = reference.pairwise([np_pred, np_true])[0][1]
        assert expected == pytest.approx(result)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find, for every row of ``Y``, its ``k`` closest rows of ``X``.

    The full distance matrix between ``Y`` and ``X`` is computed with the
    requested metric and each row is sorted.

    Parameters
    ----------
    X, Y : inputs validated with ``check_array``
    k : number of neighbors kept per row of ``Y``
    metric : metric identifier forwarded to ``DistanceMetric.get_metric``
    **kwargs : extra arguments forwarded to ``DistanceMetric.get_metric``

    Returns
    -------
    dist, ind : distances to the selected neighbors and their indices in
        ``X``, ordered by increasing distance along axis 1.
    """
    from sklearn.metrics import DistanceMetric

    X = check_array(X)
    Y = check_array(Y)

    distance_metric = DistanceMetric.get_metric(metric, **kwargs)
    D = distance_metric.pairwise(Y, X)

    # Keep the first k columns of the per-row ascending ordering.
    ind = np.argsort(D, axis=1)[:, :k]
    row_selector = np.arange(Y.shape[0])[:, None]
    dist = D[row_selector, ind]
    return dist, ind
def test_minkowski_metric_validate_weights_size():
    """Expect a ValueError from ``pairwise`` when ``w`` has ``d + 1`` entries."""
    w2 = rng.random_sample(d + 1)
    dm = DistanceMetric.get_metric("minkowski", p=3, w=w2)

    n_features = X1.shape[1]
    n_weights = w2.shape[0]
    # Parentheses are escaped because the message is used as a regex pattern.
    msg = (
        "MinkowskiDistance: the size of w must match "
        f"the number of features \\({n_features}\\). "
        f"Currently len\\(w\\)={n_weights}."
    )

    expected_error = pytest.raises(ValueError, match=msg)
    with expected_error:
        dm.pairwise(X1, X2)
def check_pdist_bool(metric, D_true):
    """Check pairwise distances on ``X1_bool`` against the expected ``D_true``.

    Note that ``D_true`` is modified in place (NaN entries set to 0) when the
    metric is "jaccard" and the SciPy version is older than 1.2.0.
    """
    D12 = DistanceMetric.get_metric(metric).pairwise(X1_bool)

    # Based on https://github.com/scipy/scipy/pull/7373
    # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
    # was changed to return 0, instead of nan.
    needs_nan_patch = metric == "jaccard" and sp_version < parse_version("1.2.0")
    if needs_nan_patch:
        nan_mask = np.isnan(D_true)
        D_true[nan_mask] = 0

    assert_array_almost_equal(D12, D_true)
def test_pyfunc_metric():
    """Check a "pyfunc" metric against "euclidean", before and after pickling."""

    def pickle_roundtrip(obj):
        # Serialize and immediately restore the object.
        return pickle.loads(pickle.dumps(obj))

    X = np.random.random((10, 3))

    euclidean = DistanceMetric.get_metric("euclidean")
    pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2)

    # Check if both callable metric and predefined metric initialized
    # DistanceMetric object is picklable
    euclidean_restored = pickle_roundtrip(euclidean)
    pyfunc_restored = pickle_roundtrip(pyfunc)

    dist_euclidean = euclidean.pairwise(X)
    dist_pyfunc = pyfunc.pairwise(X)
    dist_euclidean_restored = euclidean_restored.pairwise(X)
    dist_pyfunc_restored = pyfunc_restored.pairwise(X)

    assert_array_almost_equal(dist_euclidean, dist_pyfunc)
    assert_array_almost_equal(dist_euclidean_restored, dist_pyfunc_restored)
def test_pdist_bool_metrics(metric, X_bool):
    """Compare ``DistanceMetric.pairwise(X_bool)`` with ``cdist(X_bool, X_bool)``."""
    expected = cdist(X_bool, X_bool, metric)
    computed = DistanceMetric.get_metric(metric).pairwise(X_bool)

    # Based on https://github.com/scipy/scipy/pull/7373
    # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
    # was changed to return 0, instead of nan.
    is_old_scipy_jaccard = metric == "jaccard" and sp_version < parse_version(
        "1.2.0"
    )
    if is_old_scipy_jaccard:
        nan_positions = np.isnan(expected)
        expected[nan_positions] = 0

    assert_allclose(computed, expected)
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
    """Check ``two_point_correlation`` of a tree against brute-force counts.

    The expected count for each radius is the number of entries of the
    euclidean distance matrix between ``Y`` and ``X`` that are not larger
    than that radius.
    """
    rng = check_random_state(0)
    shape = (n_samples, n_features)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)
    r = np.linspace(0, 1, 10)
    tree = Cls(X, leaf_size=10)

    euclidean = DistanceMetric.get_metric("euclidean")
    D = euclidean.pairwise(Y, X)

    counts_true = []
    for radius in r:
        within_radius = D <= radius
        counts_true.append(within_radius.sum())

    counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
    assert_array_almost_equal(counts, counts_true)
def _test_distrib_integration(device):
    """Check ``ManhattanDistance`` attached to an ``Engine`` in a distributed run.

    The metric value stored by the engine under the name "md" is compared
    with the "manhattan" DistanceMetric evaluated on the complete
    prediction/target vectors.

    Parameters
    ----------
    device : object exposing a ``type`` attribute
        Device the input tensors are moved to. When ``device.type`` is not
        "xla", the metric is additionally run on ``idist.device()``.
    """
    rank = idist.get_rank()
    torch.manual_seed(12)

    manhattan = DistanceMetric.get_metric("manhattan")

    def _test(n_epochs, metric_device):
        metric_device = torch.device(metric_device)
        n_iters = 80
        s = 16
        # Number of samples handled by a single rank.
        offset = n_iters * s
        n_total = offset * idist.get_world_size()

        y_true = torch.rand(size=(n_total,)).to(device)
        y_preds = torch.rand(size=(n_total,)).to(device)

        def update(engine, i):
            # Batch i of this rank's section of the data.
            start = i * s + rank * offset
            stop = start + s
            return y_preds[start:stop], y_true[start:stop]

        engine = Engine(update)

        m = ManhattanDistance(device=metric_device)
        m.attach(engine, "md")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=n_epochs)

        assert "md" in engine.state.metrics

        res = engine.state.metrics["md"]
        if isinstance(res, torch.Tensor):
            res = res.cpu().numpy()

        np_y_true = y_true.cpu().numpy()
        np_y_preds = y_preds.cpu().numpy()

        expected = manhattan.pairwise([np_y_preds, np_y_true])[0][1]
        assert pytest.approx(res) == expected

    metric_devices = ["cpu"]
    if device.type != "xla":
        metric_devices.append(idist.device())

    for metric_device in metric_devices:
        for _ in range(2):
            _test(n_epochs=1, metric_device=metric_device)
            _test(n_epochs=2, metric_device=metric_device)
def check_cdist(metric, kwargs, X1, X2):
    """Compare ``DistanceMetric.pairwise`` with SciPy's ``cdist`` for a metric.

    Parameters
    ----------
    metric : str
        Metric name given to both ``cdist`` and ``DistanceMetric.get_metric``.
    kwargs : dict
        Extra keyword arguments forwarded to both of them.
    X1, X2 : inputs handed to ``cdist`` and to ``pairwise``.
    """
    # Local import so that this helper does not rely on a file-level import.
    from contextlib import nullcontext

    # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0, hence a
    # DeprecationWarning is only expected from 1.6.0 onwards.
    if metric == "wminkowski" and sp_version >= parse_version("1.6.0"):
        scipy_call_context = pytest.warns(DeprecationWarning)
    else:
        # ``pytest.warns(None)`` is deprecated since pytest 7, so a no-op
        # context manager is used when no warning has to be asserted.
        scipy_call_context = nullcontext()

    with scipy_call_context:
        D_scipy_cdist = cdist(X1, X2, metric, **kwargs)

    dm = DistanceMetric.get_metric(metric, **kwargs)
    D_sklearn = dm.pairwise(X1, X2)
    assert_array_almost_equal(D_sklearn, D_scipy_cdist)
def test_mst_linkage_core_memory_mapped(metric):
    """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    random_state = np.random.RandomState(seed=1)
    X = random_state.normal(size=(20, 4))
    Xmm = create_memmap_backed_data(X)

    param_grid = METRICS_DEFAULT_PARAMS[metric]
    param_names = param_grid.keys()
    # Try every combination of the default parameter values of the metric.
    for combination in itertools.product(*param_grid.values()):
        kwargs = {name: value for name, value in zip(param_names, combination)}
        distance_metric = DistanceMetric.get_metric(metric, **kwargs)

        mst_in_memory = mst_linkage_core(X, distance_metric)
        mst_memmapped = mst_linkage_core(Xmm, distance_metric)
        np.testing.assert_equal(mst_in_memory, mst_memmapped)
def test_distance_metrics_dtype_consistency(metric_param_grid):
    """DistanceMetric must return similar distances for 64bit and 32bit data.

    For each parameter combination, ``pairwise`` is compared between
    ``DistanceMetric`` and ``DistanceMetric32``, first with a single input
    and then with two inputs.
    """
    metric, param_grid = metric_param_grid
    param_names = param_grid.keys()

    for combination in itertools.product(*param_grid.values()):
        kwargs = {name: value for name, value in zip(param_names, combination)}
        dm64 = DistanceMetric.get_metric(metric, **kwargs)
        dm32 = DistanceMetric32.get_metric(metric, **kwargs)

        input_pairs = (
            ((X64,), (X32,)),
            ((X64, Y64), (X32, Y32)),
        )
        for args64, args32 in input_pairs:
            D64 = dm64.pairwise(*args64)
            D32 = dm32.pairwise(*args32)
            assert_allclose(D64, D32)
def test_haversine_metric():
    """Check the "haversine" metric against a pure NumPy implementation.

    Both the pairwise distances and ``dist_to_rdist`` are verified.
    """

    def haversine_slow(x1, x2):
        # Reference formula evaluated for a single pair of 2-element points.
        sin_sq_first = np.sin(0.5 * (x1[0] - x2[0])) ** 2
        sin_sq_second = np.sin(0.5 * (x1[1] - x2[1])) ** 2
        inner = sin_sq_first + np.cos(x1[0]) * np.cos(x2[0]) * sin_sq_second
        return 2 * np.arcsin(np.sqrt(inner))

    X = np.random.random((10, 2))

    haversine = DistanceMetric.get_metric("haversine")
    D1 = haversine.pairwise(X)

    # Fill the reference matrix one row at a time.
    D2 = np.zeros_like(D1)
    for row, point in enumerate(X):
        D2[row] = [haversine_slow(point, other) for other in X]

    assert_array_almost_equal(D1, D2)

    expected_rdist = np.sin(0.5 * D2) ** 2
    assert_array_almost_equal(haversine.dist_to_rdist(D1), expected_rdist)
def test_readonly_kwargs():
    """Metrics must accept read-only arrays as keyword arguments.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/21685
    """
    rng = check_random_state(0)

    weights = rng.rand(100)
    VI = rng.rand(10, 10)
    for buffer in (weights, VI):
        buffer.setflags(write=False)

    # Those distances metrics have to support readonly buffers.
    metric_specs = (
        ("seuclidean", {"V": weights}),
        ("wminkowski", {"p": 1, "w": weights}),
        ("mahalanobis", {"VI": VI}),
    )
    for metric_name, metric_kwargs in metric_specs:
        DistanceMetric.get_metric(metric_name, **metric_kwargs)
def _test(y_pred, y, batch_size):
    """Run ``CanberraMetric`` through an ``Engine`` batch by batch.

    The metric reported by the engine under "cm" after one epoch is compared
    with the "canberra" DistanceMetric computed on the flattened inputs.
    """
    np_y = y.numpy().ravel()
    np_y_pred = y_pred.numpy().ravel()

    def update_fn(engine, batch):
        # Engine iterations start at 1, hence the shift to a 0-based offset.
        start = (engine.state.iteration - 1) * batch_size
        stop = start + batch_size
        pred_batch = torch.from_numpy(np_y_pred[start:stop])
        true_batch = torch.from_numpy(np_y[start:stop])
        return pred_batch, true_batch

    engine = Engine(update_fn)

    m = CanberraMetric()
    m.attach(engine, "cm")

    canberra = DistanceMetric.get_metric("canberra")

    n_batches = y_pred.shape[0] // batch_size
    data = list(range(n_batches))
    state = engine.run(data, max_epochs=1)
    cm = state.metrics["cm"]

    expected = canberra.pairwise([np_y_pred, np_y])[0][1]
    assert expected == pytest.approx(cm)
def test_compute():
    """Check ``CanberraMetric`` after each of four successive updates.

    After every update the accumulated metric must match both a running
    NumPy sum and the "canberra" DistanceMetric evaluated on all the data
    seen so far.
    """
    predictions = [np.random.randn(4) for _ in range(4)]
    ground_truth = np.random.randn(4)

    m = CanberraMetric()
    canberra = DistanceMetric.get_metric("canberra")

    np_sum = 0.0
    seen_predictions = []
    seen_targets = []
    for prediction in predictions:
        m.update((torch.from_numpy(prediction), torch.from_numpy(ground_truth)))

        numerator = np.abs(ground_truth - prediction)
        denominator = np.abs(prediction) + np.abs(ground_truth)
        np_sum += (numerator / denominator).sum()
        assert m.compute() == pytest.approx(np_sum)

        # Concatenate everything fed to the metric so far.
        seen_predictions.append(prediction)
        seen_targets.append(ground_truth)
        v1 = np.hstack(seen_predictions)
        v2 = np.hstack(seen_targets)
        assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
def test_mahattan_distance():
    """Check ``ManhattanDistance`` after each of four successive updates.

    After every update the accumulated metric must match both a running
    NumPy sum of absolute differences and the "manhattan" DistanceMetric
    evaluated on all the data seen so far.
    """
    predictions = [np.random.randn(4) for _ in range(4)]
    ground_truth = np.random.randn(4)

    m = ManhattanDistance()
    manhattan = DistanceMetric.get_metric("manhattan")

    np_sum = 0.0
    seen_predictions = []
    seen_targets = []
    for prediction in predictions:
        m.update((torch.from_numpy(prediction), torch.from_numpy(ground_truth)))

        np_sum += np.abs(ground_truth - prediction).sum()
        assert m.compute() == pytest.approx(np_sum)

        # Concatenate everything fed to the metric so far.
        seen_predictions.append(prediction)
        seen_targets.append(ground_truth)
        v1 = np.hstack(seen_predictions)
        v2 = np.hstack(seen_targets)
        assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
def test_cdist_bool_metric(metric, X_bool, Y_bool):
    """Compare ``pairwise(X_bool, Y_bool)`` with SciPy's ``cdist`` result."""
    expected = cdist(X_bool, Y_bool, metric)
    distance_metric = DistanceMetric.get_metric(metric)
    computed = distance_metric.pairwise(X_bool, Y_bool)
    assert_allclose(computed, expected)
def test_pickle_bool_metrics(metric, X_bool):
    """A pickled-and-restored metric must give the same distances on ``X_bool``."""
    original = DistanceMetric.get_metric(metric)
    dist_original = original.pairwise(X_bool)

    serialized = pickle.dumps(original)
    restored = pickle.loads(serialized)
    dist_restored = restored.pairwise(X_bool)

    assert_allclose(dist_original, dist_restored)
def check_cdist(metric, kwargs, D_true):
    """Check distances between ``X1`` and ``X2`` against the expected ``D_true``.

    ``metric`` and ``kwargs`` are forwarded to ``DistanceMetric.get_metric``.
    """
    distance_metric = DistanceMetric.get_metric(metric, **kwargs)
    computed = distance_metric.pairwise(X1, X2)
    assert_array_almost_equal(computed, D_true)
def test_minkowski_metric_validate_weights_values(w, err_type, err_msg):
    """Building a "minkowski" metric with weights ``w`` must raise ``err_type``.

    The raised message has to match ``err_msg``.
    """
    expected_failure = pytest.raises(err_type, match=err_msg)
    with expected_failure:
        DistanceMetric.get_metric("minkowski", p=3, w=w)
def check_pickle(metric, kwargs):
    """A pickled-and-restored metric must give the same distances on ``X1``.

    ``metric`` and ``kwargs`` are forwarded to ``DistanceMetric.get_metric``.
    """
    original = DistanceMetric.get_metric(metric, **kwargs)
    dist_original = original.pairwise(X1)

    restored = pickle.loads(pickle.dumps(original))
    dist_restored = restored.pairwise(X1)

    assert_array_almost_equal(dist_original, dist_restored)
def test_pickle_bool_metrics(metric, X1_bool):
    """Distances on ``X1_bool`` must be unchanged by a pickle round trip."""
    metric_before = DistanceMetric.get_metric(metric)
    distances_before = metric_before.pairwise(X1_bool)

    payload = pickle.dumps(metric_before)
    metric_after = pickle.loads(payload)
    distances_after = metric_after.pairwise(X1_bool)

    assert_array_almost_equal(distances_before, distances_after)
def test_wminkowski_deprecated():
    """Requesting the "wminkowski" metric must emit a FutureWarning."""
    w = rng.random_sample(d)
    msg = "WMinkowskiDistance is deprecated in version 1.1"
    expected_warning = pytest.warns(FutureWarning, match=msg)
    with expected_warning:
        DistanceMetric.get_metric("wminkowski", p=3, w=w)
def check_cdist_bool(metric, D_true):
    """Check distances between ``X1_bool`` and ``X2_bool`` against ``D_true``."""
    distance_metric = DistanceMetric.get_metric(metric)
    computed = distance_metric.pairwise(X1_bool, X2_bool)
    assert_array_almost_equal(computed, D_true)
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find, for every row of ``Y``, its ``k`` closest rows of ``X``.

    Parameters
    ----------
    X, Y : inputs passed to ``DistanceMetric.pairwise`` as ``(Y, X)``
    k : number of neighbors kept per row of ``Y``
    metric : metric identifier forwarded to ``DistanceMetric.get_metric``
    **kwargs : extra arguments forwarded to ``DistanceMetric.get_metric``

    Returns
    -------
    dist, ind : distances to the selected neighbors and their indices in
        ``X``, ordered by increasing distance along axis 1.
    """
    distance_metric = DistanceMetric.get_metric(metric, **kwargs)
    D = distance_metric.pairwise(Y, X)

    # Per-row ascending ordering, truncated to the first k columns.
    full_order = np.argsort(D, axis=1)
    ind = full_order[:, :k]

    row_selector = np.arange(Y.shape[0])[:, None]
    dist = D[row_selector, ind]
    return dist, ind