Code example #1
0
def test_n_threads_agnosticism(
    PairwiseDistancesReduction,
    seed,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are identical regardless of the number of threads."""
    random_state = np.random.RandomState(seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
    else:
        # Scaling the radius slightly with the number of dimensions
        parameter = 10 ** np.log(n_features)

    # Reference run with the default thread configuration.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X, Y, parameter, return_distance=True
    )

    # Re-run the computation restricted to a single OpenMP thread.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = PairwiseDistancesReduction.compute(
            X, Y, parameter, return_distance=True
        )

    ASSERT_RESULT[PairwiseDistancesReduction](
        ref_dist, dist, ref_indices, indices
    )
Code example #2
0
def test_strategies_consistency(
    PairwiseDistancesReduction,
    metric,
    n_samples,
    seed,
    n_features=10,
    dtype=np.float64,
):
    """Check that 'parallel_on_X' and 'parallel_on_Y' give identical results."""
    random_state = np.random.RandomState(seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if metric == "haversine":
        # Haversine distance only accepts 2D data
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
    else:
        # Scaling the radius slightly with the number of dimensions
        parameter = 10 ** np.log(n_features)

    # Run the same reduction under both parallelization strategies.
    results = {}
    for strategy in ("parallel_on_X", "parallel_on_Y"):
        results[strategy] = PairwiseDistancesReduction.compute(
            X,
            Y,
            parameter,
            metric=metric,
            # Taking the first set of dummy parameters for this metric
            metric_kwargs=_get_dummy_metric_params_list(metric, n_features)[0],
            # A small chunk size makes sure parallelization is actually used
            chunk_size=n_samples // 4,
            strategy=strategy,
            return_distance=True,
        )

    dist_par_X, indices_par_X = results["parallel_on_X"]
    dist_par_Y, indices_par_Y = results["parallel_on_Y"]

    ASSERT_RESULT[PairwiseDistancesReduction](
        dist_par_X,
        dist_par_Y,
        indices_par_X,
        indices_par_Y,
    )
Code example #3
0
def test_memmap_backed_data(
    metric,
    PairwiseDistancesReduction,
    n_samples=512,
    n_features=100,
    dtype=np.float64,
):
    """Check that results do not depend on the datasets' writability."""
    random_state = np.random.RandomState(0)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    # Create read only datasets
    X_mm, Y_mm = create_memmap_backed_data([X, Y])

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scaling the radius slightly with the number of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    # Reference run on the ordinary writable arrays.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X, Y, parameter, metric=metric, return_distance=True
    )

    # Same computation on the read-only memmap-backed copies.
    dist_mm, indices_mm = PairwiseDistancesReduction.compute(
        X_mm, Y_mm, parameter, metric=metric, return_distance=True
    )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
        ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters
    )
Code example #4
0
def test_chunk_size_agnosticism(
    global_random_seed,
    PairwiseDistancesReduction,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are identical regardless of the chunk size."""
    random_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scaling the radius slightly with the number of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    # Reference run using the default chunk size.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X, Y, parameter, metric="manhattan", return_distance=True
    )

    # Same computation with an explicitly overridden chunk size.
    dist, indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        chunk_size=chunk_size,
        metric="manhattan",
        return_distance=True,
    )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
Code example #5
0
def test_n_threads_agnosticism(
    global_random_seed,
    PairwiseDistancesReduction,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are identical regardless of the number of threads."""
    random_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scaling the radius slightly with the number of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    # Reference run with the default thread configuration.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X, Y, parameter, return_distance=True
    )

    # Re-run the computation restricted to a single OpenMP thread.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = PairwiseDistancesReduction.compute(
            X, Y, parameter, return_distance=True
        )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
Code example #6
0
def test_chunk_size_agnosticism(
    global_random_seed,
    PairwiseDistancesReduction,
    n_samples,
    chunk_size,
    n_features=100,
    dtype=np.float64,
):
    """Check that results are identical regardless of the chunk size."""
    random_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = random_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = random_state.rand(n_samples, n_features).astype(dtype) * scale

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
    else:
        # Scaling the radius slightly with the number of dimensions
        parameter = 10 ** np.log(n_features)

    # Reference run using the default chunk size.
    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X, Y, parameter, return_distance=True
    )

    # Same computation with an explicitly overridden chunk size.
    dist, indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        chunk_size=chunk_size,
        return_distance=True,
    )

    ASSERT_RESULT[PairwiseDistancesReduction](
        ref_dist, dist, ref_indices, indices
    )
Code example #7
0
def test_pairwise_distances_reduction_is_usable_for():
    """Check the input-dispatch conditions of ``is_usable_for``."""
    random_state = np.random.RandomState(0)
    X = random_state.rand(100, 10)
    Y = random_state.rand(100, 10)
    metric = "euclidean"

    usable = PairwiseDistancesReduction.is_usable_for

    # Dense float64 datasets with a supported metric are accepted.
    assert usable(X, Y, metric)

    # Integer datasets are rejected.
    assert not usable(X.astype(np.int64), Y.astype(np.int64), metric)

    # Arbitrary Python callables ("pyfunc") are rejected.
    assert not usable(X, Y, metric="pyfunc")

    # TODO: remove once 32 bits datasets are supported
    assert not usable(X.astype(np.float32), Y, metric)
    assert not usable(X, Y.astype(np.int32), metric)

    # TODO: remove once sparse matrices are supported
    assert not usable(csr_matrix(X), Y, metric)
    assert not usable(X, csr_matrix(Y), metric)
Code example #8
0
        return_distance=True,
    )

    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = PairwiseDistancesReduction.compute(
            X, Y, parameter, return_distance=True)

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](ref_dist, dist,
                                                       ref_indices, indices,
                                                       **check_parameters)


# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_samples", [100, 1000])
@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics())
@pytest.mark.parametrize(
    "PairwiseDistancesReduction",
    [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood],
)
def test_strategies_consistency(
    global_random_seed,
    PairwiseDistancesReduction,
    metric,
    n_samples,
    n_features=10,
    dtype=np.float64,
):

    rng = np.random.RandomState(global_random_seed)
    spread = 100