Ejemplo n.º 1
0
def test_wminkowski_minkowski_equivalence(p):
    """Check that "wminkowski" and "minkowski" yield the same distances.

    "wminkowski" is given the weights raised to the power ``1 / p`` while
    "minkowski" is given the raw weights; the pairwise distances between
    ``X1`` and ``X2`` must then agree.
    """
    weights = rng.random_sample(d)
    # Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of 'minkowski'
    rescaled_weights = weights ** (1 / p)
    weighted_metric = DistanceMetric.get_metric(
        "wminkowski", p=p, w=rescaled_weights
    )
    plain_metric = DistanceMetric.get_metric("minkowski", p=p, w=weights)
    dist_weighted = weighted_metric.pairwise(X1, X2)
    dist_plain = plain_metric.pairwise(X1, X2)
    assert_array_almost_equal(dist_weighted, dist_plain)
Ejemplo n.º 2
0
def test_input_data_size():
    """Regression test for #6288.

    Previously, a metric requiring a particular input dimension would fail.
    The callable below asserts that it receives 3-element vectors, and its
    pairwise output must equal the squared euclidean distances.
    """

    def custom_metric(x, y):
        # The callable insists on 3-dimensional inputs.
        assert x.shape[0] == 3
        delta = x - y
        return np.sum(delta ** 2)

    random_state = check_random_state(0)
    points = random_state.rand(10, 3)

    callable_metric = DistanceMetric.get_metric("pyfunc", func=custom_metric)
    euclidean_metric = DistanceMetric.get_metric("euclidean")

    from_callable = callable_metric.pairwise(points)
    squared_euclidean = euclidean_metric.pairwise(points) ** 2
    assert_array_almost_equal(from_callable, squared_euclidean)
Ejemplo n.º 3
0
def _test_distrib_compute(device):
    """Compare ``CanberraMetric.compute()`` with scikit-learn's "canberra".

    For each metric device the metric is updated with rank-seeded random
    data; the data is then gathered with ``idist.all_gather`` and the
    reference distance is computed on the gathered arrays.
    """
    rank = idist.get_rank()

    reference_metric = DistanceMetric.get_metric("canberra")

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        metric = CanberraMetric(device=metric_device)
        # Seed depends on the rank, so each rank draws different data.
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        metric.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        gathered_pred = y_pred.cpu().numpy()
        gathered_true = y.cpu().numpy()
        computed = metric.compute()
        expected = reference_metric.pairwise([gathered_pred, gathered_true])[0][1]
        assert expected == pytest.approx(computed)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Ejemplo n.º 4
0
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find, for each row of ``Y``, its ``k`` closest rows of ``X``.

    The full ``pairwise(Y, X)`` distance matrix is computed and each row is
    sorted with ``np.argsort``.

    Returns
    -------
    dist, ind
        Distances and column indices of the first ``k`` entries of every
        sorted row.
    """
    from sklearn.metrics import DistanceMetric

    X = check_array(X)
    Y = check_array(Y)
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)
    all_dists = dist_metric.pairwise(Y, X)
    ind = np.argsort(all_dists, axis=1)[:, :k]
    # Pick, row by row, the distances that correspond to the kept indices.
    dist = np.take_along_axis(all_dists, ind, axis=1)
    return dist, ind
Ejemplo n.º 5
0
def test_minkowski_metric_validate_weights_size():
    """Check that ``pairwise`` rejects weights of the wrong length.

    The weight vector has ``d + 1`` entries; calling ``pairwise`` on ``X1``
    and ``X2`` must raise a ``ValueError`` matching the message below.
    """
    w2 = rng.random_sample(d + 1)
    metric_obj = DistanceMetric.get_metric("minkowski", p=3, w=w2)
    # The message is used as a regex, hence the escaped parentheses.
    expected_msg = (
        "MinkowskiDistance: the size of w must match "
        f"the number of features \\({X1.shape[1]}\\). "
        f"Currently len\\(w\\)={w2.shape[0]}."
    )
    with pytest.raises(ValueError, match=expected_msg):
        metric_obj.pairwise(X1, X2)
Ejemplo n.º 6
0
def check_pdist_bool(metric, D_true):
    """Compare ``pairwise(X1_bool)`` for ``metric`` against ``D_true``.

    Note that ``D_true`` may be modified in place (NaN entries zeroed) for
    the "jaccard" metric on old SciPy versions.
    """
    computed = DistanceMetric.get_metric(metric).pairwise(X1_bool)
    # Based on https://github.com/scipy/scipy/pull/7373
    # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
    # was changed to return 0, instead of nan.
    if metric == "jaccard":
        if sp_version < parse_version("1.2.0"):
            nan_entries = np.isnan(D_true)
            D_true[nan_entries] = 0
    assert_array_almost_equal(computed, D_true)
Ejemplo n.º 7
0
def test_pyfunc_metric():
    """Check "pyfunc" against "euclidean", before and after pickling.

    The callable-based metric wraps ``dist_func`` with ``p=2``. Both metric
    objects are round-tripped through pickle, and the pairwise distances of
    the two metrics must agree for the originals and for the restored
    copies.
    """
    X = np.random.random((10, 3))

    builtin_metric = DistanceMetric.get_metric("euclidean")
    callable_metric = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2)

    # Check if both callable metric and predefined metric initialized
    # DistanceMetric object is picklable
    builtin_restored = pickle.loads(pickle.dumps(builtin_metric))
    callable_restored = pickle.loads(pickle.dumps(callable_metric))

    dist_builtin = builtin_metric.pairwise(X)
    dist_callable = callable_metric.pairwise(X)

    dist_builtin_restored = builtin_restored.pairwise(X)
    dist_callable_restored = callable_restored.pairwise(X)

    assert_array_almost_equal(dist_builtin, dist_callable)
    assert_array_almost_equal(dist_builtin_restored, dist_callable_restored)
Ejemplo n.º 8
0
def test_pdist_bool_metrics(metric, X_bool):
    """Check ``pairwise(X_bool)`` against ``cdist(X_bool, X_bool, metric)``."""
    expected = cdist(X_bool, X_bool, metric)
    computed = DistanceMetric.get_metric(metric).pairwise(X_bool)
    # Based on https://github.com/scipy/scipy/pull/7373
    # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
    # was changed to return 0, instead of nan.
    if metric == "jaccard":
        if sp_version < parse_version("1.2.0"):
            nan_entries = np.isnan(expected)
            expected[nan_entries] = 0
    assert_allclose(computed, expected)
Ejemplo n.º 9
0
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
    """Check ``two_point_correlation`` of a tree against brute-force counts.

    For every radius in ``r`` the expected count is the number of entries
    of the euclidean ``pairwise(Y, X)`` matrix that are ``<=`` that radius.
    """
    random_state = check_random_state(0)
    X = random_state.random_sample((n_samples, n_features))
    Y = random_state.random_sample((n_samples, n_features))
    radii = np.linspace(0, 1, 10)
    tree = Cls(X, leaf_size=10)

    euclidean = DistanceMetric.get_metric("euclidean")
    all_dists = euclidean.pairwise(Y, X)
    expected_counts = [(all_dists <= radius).sum() for radius in radii]

    tree_counts = tree.two_point_correlation(Y, r=radii, dualtree=dualtree)
    assert_array_almost_equal(tree_counts, expected_counts)
Ejemplo n.º 10
0
def _test_distrib_integration(device):
    """Run ``ManhattanDistance`` through an ``Engine`` and check the result.

    The metric is attached to an engine under the name "md". After the run,
    its value is compared with scikit-learn's "manhattan" distance between
    the complete prediction and target tensors.

    Parameters
    ----------
    device : torch.device
        Device the random data is moved to. When its type is "xla", the
        metric is only exercised with the "cpu" metric device.
    """
    rank = idist.get_rank()
    torch.manual_seed(12)

    manhattan = DistanceMetric.get_metric("manhattan")

    def _test(n_epochs, metric_device):
        metric_device = torch.device(metric_device)
        n_iters = 80
        s = 16  # number of samples handed to the metric per iteration

        # Each rank owns a contiguous slice of `offset` samples.
        offset = n_iters * s
        n_total = offset * idist.get_world_size()
        y_true = torch.rand(size=(n_total,)).to(device)
        y_preds = torch.rand(size=(n_total,)).to(device)

        def update(engine, i):
            # Batch `i` of this rank's slice, as (predictions, targets).
            start = i * s + rank * offset
            stop = (i + 1) * s + rank * offset
            return y_preds[start:stop], y_true[start:stop]

        engine = Engine(update)

        m = ManhattanDistance(device=metric_device)
        m.attach(engine, "md")

        data = list(range(n_iters))
        engine.run(data=data, max_epochs=n_epochs)

        assert "md" in engine.state.metrics

        res = engine.state.metrics["md"]
        if isinstance(res, torch.Tensor):
            res = res.cpu().numpy()

        np_y_true = y_true.cpu().numpy()
        np_y_preds = y_preds.cpu().numpy()

        expected = manhattan.pairwise([np_y_preds, np_y_true])[0][1]
        assert pytest.approx(res) == expected

    metric_devices = ["cpu"]
    if device.type != "xla":
        metric_devices.append(idist.device())
    for metric_device in metric_devices:
        for _ in range(2):
            _test(n_epochs=1, metric_device=metric_device)
            _test(n_epochs=2, metric_device=metric_device)
Ejemplo n.º 11
0
def check_cdist(metric, kwargs, X1, X2):
    """Check that ``DistanceMetric.pairwise`` agrees with SciPy's ``cdist``.

    Parameters
    ----------
    metric : str
        Metric name, passed to both ``cdist`` and
        ``DistanceMetric.get_metric``.
    kwargs : dict
        Extra metric parameters, forwarded to both implementations.
    X1, X2 : array-like
        The two sets of points whose cross distances are compared.
    """
    # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0, so a
    # DeprecationWarning is only expected from 1.6.0 onwards.
    expects_deprecation = (
        metric == "wminkowski" and sp_version >= parse_version("1.6.0")
    )
    if expects_deprecation:
        with pytest.warns(DeprecationWarning):
            D_scipy_cdist = cdist(X1, X2, metric, **kwargs)
    else:
        # Deliberately not wrapped in `pytest.warns(None)`: that form is
        # deprecated in pytest 7 and raises in pytest 8.
        D_scipy_cdist = cdist(X1, X2, metric, **kwargs)

    dm = DistanceMetric.get_metric(metric, **kwargs)
    D_sklearn = dm.pairwise(X1, X2)
    assert_array_almost_equal(D_sklearn, D_scipy_cdist)
Ejemplo n.º 12
0
def test_mst_linkage_core_memory_mapped(metric):
    """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.

    Non-regression test for issue #19875.

    For every parameter combination listed for ``metric`` in
    ``METRICS_DEFAULT_PARAMS``, the result on the in-memory array must
    equal the result on its memmap-backed copy.
    """
    random_state = np.random.RandomState(seed=1)
    X = random_state.normal(size=(20, 4))
    X_memmap = create_memmap_backed_data(X)
    param_grid = METRICS_DEFAULT_PARAMS[metric]
    for combination in itertools.product(*param_grid.values()):
        metric_params = dict(zip(param_grid, combination))
        distance_metric = DistanceMetric.get_metric(metric, **metric_params)
        in_memory_result = mst_linkage_core(X, distance_metric)
        memmap_result = mst_linkage_core(X_memmap, distance_metric)
        np.testing.assert_equal(in_memory_result, memmap_result)
Ejemplo n.º 13
0
def test_distance_metrics_dtype_consistency(metric_param_grid):
    """DistanceMetric must return similar distances for 64bit and 32bit data.

    ``metric_param_grid`` is a ``(metric, param_grid)`` pair; every
    combination of the grid's values is tried, both for ``pairwise(X)`` and
    for ``pairwise(X, Y)``.
    """
    metric, param_grid = metric_param_grid
    for combination in itertools.product(*param_grid.values()):
        metric_params = dict(zip(param_grid, combination))
        metric_64 = DistanceMetric.get_metric(metric, **metric_params)
        metric_32 = DistanceMetric32.get_metric(metric, **metric_params)

        # Distances within a single data set.
        self_dist_64 = metric_64.pairwise(X64)
        self_dist_32 = metric_32.pairwise(X32)
        assert_allclose(self_dist_64, self_dist_32)

        # Distances between two data sets.
        cross_dist_64 = metric_64.pairwise(X64, Y64)
        cross_dist_32 = metric_32.pairwise(X32, Y32)
        assert_allclose(cross_dist_64, cross_dist_32)
Ejemplo n.º 14
0
def test_haversine_metric():
    """Check the "haversine" metric against a plain NumPy formula.

    Both the pairwise distances and ``dist_to_rdist`` are verified; the
    latter is expected to equal ``sin(0.5 * D) ** 2``.
    """

    def haversine_slow(x1, x2):
        # Reference formula evaluated for a single pair of 2-element points.
        sin_half_d0 = np.sin(0.5 * (x1[0] - x2[0]))
        sin_half_d1 = np.sin(0.5 * (x1[1] - x2[1]))
        inner = sin_half_d0**2 + np.cos(x1[0]) * np.cos(x2[0]) * sin_half_d1**2
        return 2 * np.arcsin(np.sqrt(inner))

    X = np.random.random((10, 2))

    haversine = DistanceMetric.get_metric("haversine")

    computed = haversine.pairwise(X)
    reference = np.zeros_like(computed)
    for row, first_point in enumerate(X):
        for col, second_point in enumerate(X):
            reference[row, col] = haversine_slow(first_point, second_point)

    assert_array_almost_equal(computed, reference)
    expected_rdist = np.sin(0.5 * reference) ** 2
    assert_array_almost_equal(haversine.dist_to_rdist(computed), expected_rdist)
Ejemplo n.º 15
0
def test_readonly_kwargs():
    """Check that metrics can be built from read-only parameter arrays.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/21685
    """
    random_state = check_random_state(0)

    weights = random_state.rand(100)
    VI = random_state.rand(10, 10)
    # Mark both buffers as non-writeable before handing them over.
    for readonly_array in (weights, VI):
        readonly_array.setflags(write=False)

    # Those distances metrics have to support readonly buffers.
    DistanceMetric.get_metric("seuclidean", V=weights)
    DistanceMetric.get_metric("wminkowski", p=1, w=weights)
    DistanceMetric.get_metric("mahalanobis", VI=VI)
Ejemplo n.º 16
0
    def _test(y_pred, y, batch_size):
        """Run ``CanberraMetric`` through an ``Engine`` in batches.

        The value stored under "cm" after one epoch is compared with
        scikit-learn's "canberra" distance between the flattened inputs.
        """
        flat_true = y.numpy().ravel()
        flat_pred = y_pred.numpy().ravel()

        def update_fn(engine, batch):
            # `engine.state.iteration` is used as a 1-based batch counter.
            start = (engine.state.iteration - 1) * batch_size
            stop = start + batch_size
            return (
                torch.from_numpy(flat_pred[start:stop]),
                torch.from_numpy(flat_true[start:stop]),
            )

        engine = Engine(update_fn)

        metric = CanberraMetric()
        metric.attach(engine, "cm")

        reference_metric = DistanceMetric.get_metric("canberra")

        n_batches = y_pred.shape[0] // batch_size
        state = engine.run(list(range(n_batches)), max_epochs=1)
        computed = state.metrics["cm"]

        expected = reference_metric.pairwise([flat_pred, flat_true])[0][1]
        assert expected == pytest.approx(computed)
Ejemplo n.º 17
0
def test_compute():
    """Check ``CanberraMetric`` after each of four successive updates.

    After every update the metric value must match both a running NumPy sum
    and scikit-learn's "canberra" distance between the concatenation of all
    predictions seen so far and the equally repeated ground truth.
    """
    # Four prediction vectors are drawn first, then the ground truth.
    predictions = [np.random.randn(4) for _ in range(4)]
    ground_truth = np.random.randn(4)

    m = CanberraMetric()

    canberra = DistanceMetric.get_metric("canberra")

    np_sum = 0.0
    seen = []
    for pred in predictions:
        m.update((torch.from_numpy(pred), torch.from_numpy(ground_truth)))
        np_sum += (
            np.abs(ground_truth - pred) / (np.abs(pred) + np.abs(ground_truth))
        ).sum()
        assert m.compute() == pytest.approx(np_sum)

        seen.append(pred)
        v1 = np.hstack(seen)
        v2 = np.hstack([ground_truth] * len(seen))
        assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
Ejemplo n.º 18
0
def test_mahattan_distance():
    """Check ``ManhattanDistance`` after each of four successive updates.

    After every update the metric value must match both a running NumPy sum
    of absolute differences and scikit-learn's "manhattan" distance between
    the concatenation of all predictions seen so far and the equally
    repeated ground truth.
    """
    # Four prediction vectors are drawn first, then the ground truth.
    predictions = [np.random.randn(4) for _ in range(4)]
    ground_truth = np.random.randn(4)

    m = ManhattanDistance()

    manhattan = DistanceMetric.get_metric("manhattan")

    np_sum = 0.0
    seen = []
    for pred in predictions:
        m.update((torch.from_numpy(pred), torch.from_numpy(ground_truth)))
        np_sum += np.abs(ground_truth - pred).sum()
        assert m.compute() == pytest.approx(np_sum)

        seen.append(pred)
        v1 = np.hstack(seen)
        v2 = np.hstack([ground_truth] * len(seen))
        assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
Ejemplo n.º 19
0
def test_cdist_bool_metric(metric, X_bool, Y_bool):
    """Check ``pairwise(X_bool, Y_bool)`` against SciPy's ``cdist``."""
    expected = cdist(X_bool, Y_bool, metric)
    metric_obj = DistanceMetric.get_metric(metric)
    computed = metric_obj.pairwise(X_bool, Y_bool)
    assert_allclose(computed, expected)
Ejemplo n.º 20
0
def test_pickle_bool_metrics(metric, X_bool):
    """Check that a pickle round trip leaves pairwise distances unchanged."""
    original = DistanceMetric.get_metric(metric)
    dist_before = original.pairwise(X_bool)
    restored = pickle.loads(pickle.dumps(original))
    dist_after = restored.pairwise(X_bool)
    assert_allclose(dist_before, dist_after)
Ejemplo n.º 21
0
def check_cdist(metric, kwargs, D_true):
    """Compare ``pairwise(X1, X2)`` for ``metric`` with the expected ``D_true``.

    ``kwargs`` holds the metric parameters forwarded to ``get_metric``.
    """
    metric_obj = DistanceMetric.get_metric(metric, **kwargs)
    computed = metric_obj.pairwise(X1, X2)
    assert_array_almost_equal(computed, D_true)
Ejemplo n.º 22
0
def test_minkowski_metric_validate_weights_values(w, err_type, err_msg):
    """Check that building "minkowski" with weights ``w`` raises ``err_type``.

    The raised error must match ``err_msg``.
    """
    expected_failure = pytest.raises(err_type, match=err_msg)
    with expected_failure:
        DistanceMetric.get_metric("minkowski", p=3, w=w)
Ejemplo n.º 23
0
def check_pickle(metric, kwargs):
    """Check that pickling a metric built with ``kwargs`` keeps its distances.

    The pairwise distances of ``X1`` are computed before and after a pickle
    round trip and must agree.
    """
    original = DistanceMetric.get_metric(metric, **kwargs)
    dist_before = original.pairwise(X1)
    restored = pickle.loads(pickle.dumps(original))
    dist_after = restored.pairwise(X1)
    assert_array_almost_equal(dist_before, dist_after)
Ejemplo n.º 24
0
def test_pickle_bool_metrics(metric, X1_bool):
    """Check that a pickle round trip leaves pairwise distances unchanged."""
    original = DistanceMetric.get_metric(metric)
    dist_before = original.pairwise(X1_bool)
    serialized = pickle.dumps(original)
    restored = pickle.loads(serialized)
    dist_after = restored.pairwise(X1_bool)
    assert_array_almost_equal(dist_before, dist_after)
Ejemplo n.º 25
0
def test_wminkowski_deprecated():
    """Check that requesting "wminkowski" emits the expected FutureWarning."""
    weights = rng.random_sample(d)
    expected_warning = "WMinkowskiDistance is deprecated in version 1.1"
    with pytest.warns(FutureWarning, match=expected_warning):
        DistanceMetric.get_metric("wminkowski", p=3, w=weights)
Ejemplo n.º 26
0
def check_cdist_bool(metric, D_true):
    """Compare ``pairwise(X1_bool, X2_bool)`` for ``metric`` with ``D_true``."""
    metric_obj = DistanceMetric.get_metric(metric)
    computed = metric_obj.pairwise(X1_bool, X2_bool)
    assert_array_almost_equal(computed, D_true)
Ejemplo n.º 27
0
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find, for each row of ``Y``, its ``k`` closest rows of ``X``.

    The full ``pairwise(Y, X)`` distance matrix is computed and each row is
    sorted with ``np.argsort``.

    Returns
    -------
    dist, ind
        Distances and column indices of the first ``k`` entries of every
        sorted row.
    """
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)
    all_dists = dist_metric.pairwise(Y, X)
    ind = np.argsort(all_dists, axis=1)[:, :k]
    # Column vector of row numbers, broadcast against the kept indices.
    row_ids = np.arange(Y.shape[0])[:, None]
    dist = all_dists[row_ids, ind]
    return dist, ind