def test_max_features():
    # Test max_features parameter using various values
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)
    max_features = X.shape[1]
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, threshold=-np.inf)
    transformer2 = SelectFromModel(estimator=est,
                                   max_features=max_features,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)
    X_new2 = transformer2.fit_transform(X, y)
    assert_allclose(X_new1, X_new2)

    # Test max_features against actual model.
    transformer1 = SelectFromModel(
        estimator=Lasso(alpha=0.025, random_state=42))
    X_new1 = transformer1.fit_transform(X, y)
    scores1 = np.abs(transformer1.estimator_.coef_)
    candidate_indices1 = np.argsort(-scores1, kind='mergesort')

    for n_features in range(1, X_new1.shape[1] + 1):
        transformer2 = SelectFromModel(estimator=Lasso(alpha=0.025,
                                                       random_state=42),
                                       max_features=n_features,
                                       threshold=-np.inf)
        X_new2 = transformer2.fit_transform(X, y)
        scores2 = np.abs(transformer2.estimator_.coef_)
        candidate_indices2 = np.argsort(-scores2, kind='mergesort')
        assert_allclose(X[:, candidate_indices1[:n_features]],
                        X[:, candidate_indices2[:n_features]])
    assert_allclose(transformer1.estimator_.coef_,
                    transformer2.estimator_.coef_)
Beispiel #2
0
def test_inertia(dtype):
    rng = np.random.RandomState(0)
    X_sparse = sp.random(100,
                         10,
                         density=0.5,
                         format="csr",
                         random_state=rng,
                         dtype=dtype)
    X_dense = X_sparse.toarray()
    sample_weight = rng.randn(100).astype(dtype, copy=False)
    centers = rng.randn(5, 10).astype(dtype, copy=False)
    labels = rng.randint(5, size=100, dtype=np.int32)

    distances = ((X_dense - centers[labels])**2).sum(axis=1)
    expected = np.sum(distances * sample_weight)

    inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels)
    inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels)

    assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6)
    assert_allclose(inertia_dense, expected, rtol=1e-6)
    assert_allclose(inertia_sparse, expected, rtol=1e-6)
def test_euclidean_distance(dtype, squared):
    # Check that the _euclidean_(dense/sparse)_dense helpers produce correct
    # results
    rng = np.random.RandomState(0)
    a_sparse = sp.random(
        1, 100, density=0.5, format="csr", random_state=rng, dtype=dtype
    )
    a_dense = a_sparse.toarray().reshape(-1)
    b = rng.randn(100).astype(dtype, copy=False)
    b_squared_norm = (b**2).sum()

    expected = ((a_dense - b) ** 2).sum()
    expected = expected if squared else np.sqrt(expected)

    distance_dense_dense = _euclidean_dense_dense_wrapper(a_dense, b, squared)
    distance_sparse_dense = _euclidean_sparse_dense_wrapper(
        a_sparse.data, a_sparse.indices, b, b_squared_norm, squared
    )

    assert_allclose(distance_dense_dense, distance_sparse_dense, rtol=1e-6)
    assert_allclose(distance_dense_dense, expected, rtol=1e-6)
    assert_allclose(distance_sparse_dense, expected, rtol=1e-6)
Beispiel #4
0
def test_incremental_mean_and_variance_ignore_nan():
    old_means = np.array([535., 535., 535., 535.])
    old_variances = np.array([4225., 4225., 4225., 4225.])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)

    X = np.array([[170, 170, 170, 170],
                  [430, 430, 430, 430],
                  [300, 300, 300, 300]])

    X_nan = np.array([[170, np.nan, 170, 170],
                      [np.nan, 170, 430, 430],
                      [430, 430, np.nan, 300],
                      [300, 300, 300, np.nan]])

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count)
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count)

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)
Beispiel #5
0
def test_iterative_imputer_clip_truncnorm():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(
        missing_values=0,
        max_iter=2,
        n_nearest_features=5,
        sample_posterior=True,
        min_value=0.1,
        max_value=0.2,
        verbose=1,
        imputation_order="random",
        random_state=rng,
    )
    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Beispiel #6
0
def test_knn_imputer_weight_uniform(na):

    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Test with "uniform" weight (or unweighted)
    X_imputed_uniform = np.array([
        [0, 0],
        [5, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    imputer = KNNImputer(weights="uniform", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_uniform)

    # Test with "callable" weight
    def no_weight(dist):
        return None

    imputer = KNNImputer(weights=no_weight, missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_uniform)

    # Test with "callable" uniform weight
    def uniform_weight(dist):
        return np.ones_like(dist)

    imputer = KNNImputer(weights=uniform_weight, missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_uniform)
Beispiel #7
0
def check_samplers_multiclass_ova(name, Sampler):
    # Check that multiclass target lead to the same results than OVA encoding
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    y_ova = label_binarize(y, np.unique(y))
    sampler = Sampler()
    set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
    assert_allclose(X_res, X_res_ova)
    if issubclass(Sampler, BaseEnsembleSampler):
        for batch_y, batch_y_ova in zip(y_res, y_res_ova):
            assert type_of_target(batch_y_ova) == type_of_target(y_ova)
            assert_allclose(batch_y, batch_y_ova.argmax(axis=1))
    else:
        assert type_of_target(y_res_ova) == type_of_target(y_ova)
        assert_allclose(y_res, y_res_ova.argmax(axis=1))
Beispiel #8
0
def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y_df = pd.DataFrame(y)
    y_s = pd.Series(y, name="class")
    sampler = Sampler()
    if isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]

    else:
        samplers = [Sampler()]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
        X_res_df, y_res_df = sampler.fit_resample(X_df, y_df)
        X_res, y_res = sampler.fit_resample(X, y)

        # check that we return the same type for dataframes or series types
        assert isinstance(X_res_df, pd.DataFrame)
        assert isinstance(y_res_df, pd.DataFrame)
        assert isinstance(y_res_s, pd.Series)

        assert X_df.columns.to_list() == X_res_df.columns.to_list()
        assert y_df.columns.to_list() == y_res_df.columns.to_list()
        assert y_s.name == y_res_s.name

        assert_allclose(X_res_df.to_numpy(), X_res)
        assert_allclose(y_res_df.to_numpy().ravel(), y_res)
        assert_allclose(y_res_s.to_numpy(), y_res)
def test_transform_target_regressor_2d_transformer_multioutput():
    # Check consistency with transformer accepting only 2D array and a 2D y
    # array.
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1]**2 + 1)).T
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
Beispiel #10
0
def test_partial_dependence_dataframe(estimator, preprocessor, features):
    # check that the partial dependence support dataframe and pipeline
    # including a column transformer
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(iris.data, columns=iris.feature_names)

    pipe = make_pipeline(preprocessor, estimator)
    pipe.fit(df, iris.target)
    pdp_pipe, values_pipe = partial_dependence(pipe,
                                               df,
                                               features=features,
                                               grid_resolution=10)

    # the column transformer will reorder the column when transforming
    # we mixed the index to be sure that we are computing the partial
    # dependence of the right columns
    if preprocessor is not None:
        X_proc = clone(preprocessor).fit_transform(df)
        features_clf = [0, 1]
    else:
        X_proc = df
        features_clf = [0, 2]

    clf = clone(estimator).fit(X_proc, iris.target)
    pdp_clf, values_clf = partial_dependence(clf,
                                             X_proc,
                                             features=features_clf,
                                             method='brute',
                                             grid_resolution=10)

    assert_allclose(pdp_pipe, pdp_clf)
    if preprocessor is not None:
        scaler = preprocessor.named_transformers_['standardscaler']
        assert_allclose(values_pipe[1],
                        values_clf[1] * scaler.scale_[1] + scaler.mean_[1])
    else:
        assert_allclose(values_pipe[1], values_clf[1])
Beispiel #11
0
def test_model_pipeline_same_dense_and_sparse(LinearModel, params):
    # Test that linear model preceeded by StandardScaler in the pipeline and
    # with normalize set to False gives the same y_pred and the same .coef_
    # given X sparse or dense

    model_dense = make_pipeline(
        StandardScaler(with_mean=False),
        LinearModel(normalize=False, **params)
    )

    model_sparse = make_pipeline(
        StandardScaler(with_mean=False),
        LinearModel(normalize=False, **params)
    )

    # prepare the data
    rng = np.random.RandomState(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.

    X_sparse = sparse.csr_matrix(X)
    y = rng.rand(n_samples)

    if is_classifier(model_dense):
        y = np.sign(y)

    model_dense.fit(X, y)
    model_sparse.fit(X_sparse, y)

    assert_allclose(model_sparse[1].coef_, model_dense[1].coef_)
    y_pred_dense = model_dense.predict(X)
    y_pred_sparse = model_sparse.predict(X_sparse)
    assert_allclose(y_pred_dense, y_pred_sparse)

    assert_allclose(model_dense[1].intercept_, model_sparse[1].intercept_)
def test_transform_target_regressor_1d_transformer(X, y):
    # All transformer in scikit-learn expect 2D data. FunctionTransformer with
    # validate=False lift this constraint without checking that the input is a
    # 2D vector. We check the consistency of the data shape using a 1D and 2D y
    # array.
    transformer = FunctionTransformer(func=lambda x: x + 1,
                                      inverse_func=lambda x: x - 1)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
Beispiel #13
0
def test_lars_numeric_consistency(LARS, has_coef_path, args):
    # The test ensures numerical consistency between trained coefficients
    # of float32 and float64.
    rtol = 1e-5
    atol = 1e-5

    rng = np.random.RandomState(0)
    X_64 = rng.rand(6, 6)
    y_64 = rng.rand(6)

    model_64 = LARS(**args).fit(X_64, y_64)
    model_32 = LARS(**args).fit(X_64.astype(np.float32),
                                y_64.astype(np.float32))

    assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol)
    if has_coef_path:
        assert_allclose(model_64.coef_path_,
                        model_32.coef_path_,
                        rtol=rtol,
                        atol=atol)
    assert_allclose(model_64.intercept_,
                    model_32.intercept_,
                    rtol=rtol,
                    atol=atol)
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
    old_means = np.array([535., 535., 535., 535.])
    old_variances = np.array([4225., 4225., 4225., 4225.])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)

    X = sparse_constructor(
        np.array([[170, 170, 170, 170], [430, 430, 430, 430],
                  [300, 300, 300, 300]]))

    X_nan = sparse_constructor(
        np.array([[170, np.nan, 170, 170], [np.nan, 170, 430, 430],
                  [430, 430, np.nan, 300], [300, 300, 300, np.nan]]))

    # we avoid creating specific data for axis 0 and 1: translating the data is
    # enough.
    if axis:
        X = X.T
        X_nan = X_nan.T

    # take a copy of the old statistics since they are modified in place.
    X_means, X_vars, X_sample_count = incr_mean_variance_axis(
        X,
        axis=axis,
        last_mean=old_means.copy(),
        last_var=old_variances.copy(),
        last_n=old_sample_count.copy())
    X_nan_means, X_nan_vars, X_nan_sample_count = incr_mean_variance_axis(
        X_nan,
        axis=axis,
        last_mean=old_means.copy(),
        last_var=old_variances.copy(),
        last_n=old_sample_count.copy())

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_vars, X_vars)
    assert_allclose(X_nan_sample_count, X_sample_count)
Beispiel #15
0
def test_knn_imputer_verify(na):
    # Test with an imputable matrix
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, na],
        [3, 2, 3, na],
        [na, 4, 5, 5],
        [6, na, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    X_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test when there is not enough neighbors
    X = np.array([
        [1, 0, 0, na],
        [2, 1, 2, na],
        [3, 2, 3, na],
        [4, 4, 5, na],
        [6, 7, 6, na],
        [8, 8, 8, na],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])

    # Not enough neighbors, use column mean from training
    X_impute_value = (20 + 22) / 2
    X_imputed = np.array([
        [1, 0, 0, X_impute_value],
        [2, 1, 2, X_impute_value],
        [3, 2, 3, X_impute_value],
        [4, 4, 5, X_impute_value],
        [6, 7, 6, X_impute_value],
        [8, 8, 8, X_impute_value],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test when data in fit() and transform() are different
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 16]])

    X1 = np.array([[1, 0], [3, 2], [4, na]])

    X_2_1 = (0 + 3 + 6 + 7 + 8) / 5
    X1_imputed = np.array([[1, 0], [3, 2], [4, X_2_1]])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit(X).transform(X1), X1_imputed)
Beispiel #16
0
def _assert_same_lars_path_result(output1, output2):
    assert len(output1) == len(output2)
    for o1, o2 in zip(output1, output2):
        assert_allclose(o1, o2)
Beispiel #17
0
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, 'solver %s' % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, 'solver %s' % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y,
                           'solver %s' % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1,
                        rtol=1e-6, atol=1e-6, err_msg='solver %s' % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert np.any(y_pred3 != y3), 'solver %s' % solver

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=np.array([1, 2]))
    with pytest.raises(TypeError,
                       match="shrinkage must be a float or a string"):
        clf.fit(X, y)
    clf = LinearDiscriminantAnalysis(solver="lsqr",
                                     shrinkage=0.1,
                                     covariance_estimator=ShrunkCovariance())
    with pytest.raises(ValueError,
                       match=("covariance_estimator and shrinkage "
                              "parameters are not None. "
                              "Only one of the two can be set.")):
        clf.fit(X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)

    # test bad solver with covariance_estimator
    clf = LinearDiscriminantAnalysis(solver="svd",
                                     covariance_estimator=LedoitWolf())
    with pytest.raises(ValueError,
                       match="covariance estimator is not supported with svd"):
        clf.fit(X, y)

    # test bad covariance estimator
    clf = LinearDiscriminantAnalysis(solver="lsqr",
                                     covariance_estimator=KMeans(n_clusters=2))
    with pytest.raises(ValueError,
                       match="KMeans does not have a covariance_ attribute"):
        clf.fit(X, y)
Beispiel #18
0
def test_lda_predict_proba(solver, n_classes):
    def generate_dataset(n_samples, centers, covariances, random_state=None):
        """Generate a multivariate normal data given some centers and
        covariances"""
        rng = check_random_state(random_state)
        X = np.vstack([rng.multivariate_normal(mean, cov,
                                               size=n_samples // len(centers))
                       for mean, cov in zip(centers, covariances)])
        y = np.hstack([[clazz] * (n_samples // len(centers))
                       for clazz in range(len(centers))])
        return X, y

    blob_centers = np.array([[0, 0], [-10, 40], [-30, 30]])[:n_classes]
    blob_stds = np.array([[[10, 10], [10, 100]]] * len(blob_centers))
    X, y = generate_dataset(
        n_samples=90000, centers=blob_centers, covariances=blob_stds,
        random_state=42
    )
    lda = LinearDiscriminantAnalysis(solver=solver, store_covariance=True,
                                     shrinkage=None).fit(X, y)
    # check that the empirical means and covariances are close enough to the
    # one used to generate the data
    assert_allclose(lda.means_, blob_centers, atol=1e-1)
    assert_allclose(lda.covariance_, blob_stds[0], atol=1)

    # implement the method to compute the probability given in The Elements
    # of Statistical Learning (cf. p.127, Sect. 4.4.5 "Logistic Regression
    # or LDA?")
    precision = linalg.inv(blob_stds[0])
    alpha_k = []
    alpha_k_0 = []
    for clazz in range(len(blob_centers) - 1):
        alpha_k.append(
            np.dot(precision,
                   (blob_centers[clazz] - blob_centers[-1])[:, np.newaxis]))
        alpha_k_0.append(
            np.dot(- 0.5 * (blob_centers[clazz] +
                            blob_centers[-1])[np.newaxis, :], alpha_k[-1]))

    sample = np.array([[-22, 22]])

    def discriminant_func(sample, coef, intercept, clazz):
        return np.exp(intercept[clazz] + np.dot(sample, coef[clazz]))

    prob = np.array([float(
        discriminant_func(sample, alpha_k, alpha_k_0, clazz) /
        (1 + sum([discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                  for clazz in range(n_classes - 1)]))) for clazz in range(
                      n_classes - 1)])

    prob_ref = 1 - np.sum(prob)

    # check the consistency of the computed probability
    # all probabilities should sum to one
    prob_ref_2 = float(
        1 / (1 + sum([discriminant_func(sample, alpha_k, alpha_k_0, clazz)
                      for clazz in range(n_classes - 1)]))
    )

    assert prob_ref == pytest.approx(prob_ref_2)
    # check that the probability of LDA are close to the theoretical
    # probabilties
    assert_allclose(lda.predict_proba(sample),
                    np.hstack([prob, prob_ref])[np.newaxis],
                    atol=1e-2)
Beispiel #19
0
def test_compare_to_ELKI():
    # Expected values, computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    #   -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    r1 = [np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
          0.7290174038973836, 0.7290174038973836, 0.6861627576116127,
          0.7587934993548423, 0.9280118450166668, 1.1748022534146194,
          3.3355455741292257, 0.49618389254482587, 0.2552805046961355,
          0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
          0.24944622248445714, 0.2552805046961355, 0.2552805046961355,
          0.3086779122185853, 4.163024452756142, 1.623152630340929,
          0.45315840475822655, 0.25468325192031926, 0.2254004358159971,
          0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
          0.18765711877083036, 0.18765711877083036, 0.2240202988740153,
          1.154337614548715, 1.342604473837069, 1.323308536402633,
          0.8607514948648837, 0.27219111215810565, 0.13260875220533205,
          0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
          0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
          0.17575920159442388, 0.27219111215810565, 0.6101447895405373,
          1.3189208094864302, 1.323308536402633, 2.2509184159764577,
          2.4517810628594527, 3.675977064404973, 3.8264795626020365,
          2.9130735341510614, 2.9130735341510614, 2.9130735341510614,
          2.9130735341510614, 2.8459300127258036, 2.8459300127258036,
          2.8459300127258036, 3.0321982337972537]
    o1 = [0, 3, 6, 4, 7, 8, 2, 9, 5, 1, 31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
          44, 21, 23, 24, 22, 25, 27, 29, 26, 28, 20, 40, 45, 46, 10, 15, 11,
          13, 17, 19, 18, 12, 16, 14, 47, 49, 43, 48, 42, 41, 53, 57, 51, 52,
          56, 59, 54, 55, 58, 50]
    p1 = [-1, 0, 3, 6, 6, 6, 8, 3, 7, 5, 1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
          36, 44, 21, 23, 24, 22, 25, 25, 22, 22, 22, 21, 40, 45, 46, 10, 15,
          15, 13, 13, 15, 11, 19, 15, 10, 47, 12, 45, 14, 43, 42, 53, 57, 57,
          57, 57, 59, 59, 59, 58]

    # Tests against known extraction array
    # Does NOT work with metric='euclidean', because sklearn euclidean has
    # worse numeric precision. 'minkowski' is slower but more accurate.
    clust1 = OPTICS(min_samples=5).fit(X)

    assert_array_equal(clust1.ordering_, np.array(o1))
    assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
    assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1))
    # ELKI currently does not print the core distances (which are not used much
    # in literature, but we can at least ensure to have this consistency:
    for i in clust1.ordering_[1:]:
        assert (clust1.reachability_[i] >=
                clust1.core_distances_[clust1.predecessor_[i]])

    # Expected values, computed with (future) ELKI 0.7.5 using
    r2 = [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
          np.inf, np.inf, np.inf, 0.27219111215810565, 0.13260875220533205,
          0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
          0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
          0.17575920159442388, 0.27219111215810565, 0.4928068613197889,
          np.inf, 0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
          0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
          0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
          np.inf, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
          0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
          0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
          np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
          np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
          np.inf, np.inf]
    o2 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
          47, 46, 20, 22, 25, 23, 27, 29, 24, 26, 28, 21, 30, 32, 34, 33, 38,
          39, 35, 37, 36, 31, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53,
          54, 55, 56, 57, 58, 59]
    p2 = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 15, 15, 13, 13, 15,
          11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
          30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
          -1, -1, -1, -1, -1, -1, -1, -1, -1]
    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)

    assert_array_equal(clust2.ordering_, np.array(o2))
    assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))
    assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2))

    index = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(clust1.core_distances_[index],
                    clust2.core_distances_[index])
Beispiel #20
0
def test_iforest_average_path_length():
    # It tests non-regression for #8549 which used the wrong formula
    # for average path length, strictly for the integer case
    # Updated to check average path length when input is <= 2 (issue #11839)
    result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
    assert_allclose(_average_path_length([0]), [0.0])
    assert_allclose(_average_path_length([1]), [0.0])
    assert_allclose(_average_path_length([2]), [1.0])
    assert_allclose(_average_path_length([5]), [result_one])
    assert_allclose(_average_path_length([999]), [result_two])
    assert_allclose(
        _average_path_length(np.array([1, 2, 5, 999])),
        [0.0, 1.0, result_one, result_two],
    )
    # _average_path_length is increasing
    avg_path_length = _average_path_length(np.arange(5))
    assert_array_equal(avg_path_length, np.sort(avg_path_length))
Beispiel #21
0
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)

    centers_old = centers + rng.normal(size=centers.shape)
    centers_old_csr = centers_old.copy()

    centers_new = np.zeros_like(centers_old)
    centers_new_csr = np.zeros_like(centers_old_csr)

    weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype)
    weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]
    sample_weight_mb = sample_weight[:10]

    # step 1: compute the dense minibatch update
    old_inertia = _mini_batch_step(
        X_mb,
        x_mb_squared_norms,
        sample_weight_mb,
        centers_old,
        centers_new,
        weight_sums,
        np.random.RandomState(0),
        random_reassign=False,
    )
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, centers_new
    )
    assert new_inertia > 0.0
    assert new_inertia < old_inertia

    # step 2: compute the sparse minibatch update
    old_inertia_csr = _mini_batch_step(
        X_mb_csr,
        x_mb_squared_norms_csr,
        sample_weight_mb,
        centers_old_csr,
        centers_new_csr,
        weight_sums_csr,
        np.random.RandomState(0),
        random_reassign=False,
    )
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr
    )
    assert new_inertia_csr > 0.0
    assert new_inertia_csr < old_inertia_csr

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_allclose(centers_new, centers_new_csr)
    assert_allclose(old_inertia, old_inertia_csr)
    assert_allclose(new_inertia, new_inertia_csr)
def _check_standard_scaled(y, y_pred):
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    assert_allclose((y - y_mean) / y_std, y_pred)
Beispiel #23
0
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    with pytest.warns(None) as records:
        Xt = est.fit(X_train).transform(X_test)
    # ensure no warnings are raised
    assert len(records) == 0
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the function leads to the same results as the class
    with pytest.warns(None) as records:
        Xt_class = est.transform(X_train)
    assert len(records) == 0
    kwargs = est.get_params()
    # remove the parameters which should be omitted because they
    # are not defined in the sister function of the preprocessing class
    for kwarg in omit_kwargs:
        _ = kwargs.pop(kwarg)
    Xt_func = func(X_train, **kwargs)
    assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
    assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        with pytest.warns(None) as records:
            Xt_col = est.transform(X_test[:, [i]])
        assert len(records) == 0
        assert_allclose(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        with pytest.warns(None) as records:
            Xt_dense = est_dense.fit(X_train).transform(X_test)
            Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        assert len(records) == 0
        for sparse_constructor in (
            sparse.csr_matrix,
            sparse.csc_matrix,
            sparse.bsr_matrix,
            sparse.coo_matrix,
            sparse.dia_matrix,
            sparse.dok_matrix,
            sparse.lil_matrix,
        ):
            # check that the dense and sparse inputs lead to the same results
            # precompute the matrix to avoid catching side warnings
            X_train_sp = sparse_constructor(X_train)
            X_test_sp = sparse_constructor(X_test)
            with pytest.warns(None) as records:
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
            assert len(records) == 0
            assert_allclose(Xt_sp.A, Xt_dense)
            with pytest.warns(None) as records:
                warnings.simplefilter("ignore", PendingDeprecationWarning)
                Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
            assert len(records) == 0
            assert_allclose(Xt_inv_sp.A, Xt_inv_dense)
Beispiel #24
0
def test_fastica_simple(add_noise, global_random_seed, global_dtype):
    # Test the FastICA algorithm on very simple data.
    rng = np.random.RandomState(global_random_seed)
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples, random_state=global_random_seed)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s = s.astype(global_dtype)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi),
                                                    -np.cos(phi)]])
    mixing = mixing.astype(global_dtype)
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, 1000)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x**3, (3 * x**2).mean(axis=-1)

    algos = ["parallel", "deflation"]
    nls = ["logcosh", "exp", "cube", g_test]
    whitening = ["arbitrary-variance", "unit-variance", False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(m.T,
                                      fun=nl,
                                      whiten=whiten,
                                      algorithm=algo,
                                      random_state=rng)
            with pytest.raises(ValueError):
                fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo)
        else:
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(X,
                                      fun=nl,
                                      algorithm=algo,
                                      whiten=False,
                                      random_state=rng)
            with pytest.raises(ValueError):
                fastica(X, fun=np.tanh, algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            # XXX: exact reconstruction to standard relative tolerance is not
            # possible. This is probably expected when add_noise is True but we
            # also need a non-trivial atol in float32 when add_noise is False.
            #
            # Note that the 2 sources are non-Gaussian in this test.
            atol = 1e-5 if global_dtype == np.float32 else 0
            assert_allclose(np.dot(np.dot(mixing_, k_), m), s_, atol=atol)

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-2)
            assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-2)
        else:
            assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-1)
            assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-1)

    # Test FastICA class
    _, _, sources_fun = fastica(m.T,
                                fun=nl,
                                algorithm=algo,
                                random_state=global_random_seed)
    ica = FastICA(fun=nl, algorithm=algo, random_state=global_random_seed)
    sources = ica.fit_transform(m.T)
    assert ica.components_.shape == (2, 2)
    assert sources.shape == (1000, 2)

    assert_allclose(sources_fun, sources)
    assert_allclose(sources, ica.transform(m.T))

    assert ica.mixing_.shape == (2, 2)

    for fn in [np.tanh, "exp(-.5(x^2))"]:
        ica = FastICA(fun=fn, algorithm=algo)
        with pytest.raises(ValueError):
            ica.fit(m.T)

    with pytest.raises(TypeError):
        FastICA(fun=range(10)).fit(m.T)
def _check_shifted_by_one(y, y_pred):
    assert_allclose(y + 1, y_pred)
Beispiel #26
0
def test_knn_imputer_weight_distance(na):
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Test with "distance" weight
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array([[0, 0], [manual_imputed_value, 2], [4, 3],
                                    [5, 6], [7, 7], [9, 8], [11, 10]])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([[0, 0], [knn_imputed_value, 2], [4, 3],
                                    [5, 6], [7, 7], [9, 8], [11, 10]])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [na, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0)**2 + (2 - 0)**2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0)**2 + (3 - 0)**2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array([
        [imputed_value, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, na, 1, na],
        [1, 1, 1, na],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    # Get weights of donor neighbors
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array([
        [0, 0, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7],
    ])

    dist = pairwise_distances(X,
                              metric="nan_euclidean",
                              squared=False,
                              missing_values=na)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7],
    ])

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
Beispiel #27
0
def test_kmeans_plusplus_norms(x_squared_norms):
    # Check that defining x_squared_norms returns the same as default=None.
    centers, indices = kmeans_plusplus(X, n_clusters, x_squared_norms=x_squared_norms)

    assert_allclose(X[indices], centers)
Beispiel #28
0
def assert_argkmin_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    rtol=1e-4,
):
    """Assert that argkmin results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (ref_dist.shape == dist.shape == ref_indices.shape ==
            indices.shape), "Arrays of results have various shapes."

    n_queries, n_neighbors = ref_dist.shape

    # Asserting equality results one row at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(
            dist_row), f"Distances aren't sorted on row {query_idx}"

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of decimals of significant digits derived from rtol.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(n_neighbors):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(
                ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(
                indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}")
        for rounded_distance in reference_neighbors_groups.keys():
            assert (reference_neighbors_groups[rounded_distance] ==
                    effective_neighbors_groups[rounded_distance]), msg
Beispiel #29
0
def test_fit_transform(Estimator):
    # Check equivalence between fit.transform and fit_transform
    X1 = Estimator(random_state=0, n_init=1).fit(X).transform(X)
    X2 = Estimator(random_state=0, n_init=1).fit_transform(X)
    assert_allclose(X1, X2)
Beispiel #30
0
def assert_radius_neighborhood_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    radius,
    rtol=1e-4,
):
    """Assert that radius neighborhood results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level
      - missing or extra last elements if their distance is
        close to the radius

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    Input arrays must be sorted w.r.t distances.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (len(ref_dist) == len(dist) == len(ref_indices) ==
            len(indices)), "Arrays of results have various lengths."

    n_queries = len(ref_dist)

    # Asserting equality of results one vector at a time
    for query_idx in range(n_queries):

        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(
            dist_row), f"Distances aren't sorted on row {query_idx}"

        # Vectors' lengths might be different due to small
        # numerical differences of distance w.r.t the `radius` threshold.
        largest_row = ref_dist_row if len(ref_dist_row) > len(
            dist_row) else dist_row

        # For the longest distances vector, we check that last extra elements
        # that aren't present in the other vector are all in: [radius ± rtol]
        min_length = min(len(ref_dist_row), len(dist_row))
        last_extra_elements = largest_row[min_length:]
        if last_extra_elements.size > 0:
            assert np.all(
                radius - rtol <= last_extra_elements <= radius + rtol
            ), (f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
                f" rtol]=[{radius} ± {rtol}]")

        # We truncate the neighbors results list on the smallest length to
        # be able to compare them, ignoring the elements checked above.
        ref_dist_row = ref_dist_row[:min_length]
        dist_row = dist_row[:min_length]

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of significant digits derived from rtol.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(min_length):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(
                ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(
                indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}")
        for rounded_distance in reference_neighbors_groups.keys():
            assert (reference_neighbors_groups[rounded_distance] ==
                    effective_neighbors_groups[rounded_distance]), msg