Example #1
0
def test_as_float_array():
    """Exercise ``as_float_array``: result dtype, copy behaviour and ordering."""
    # int32 input: the result is expected to be float32
    int_data = np.ones((3, 10), dtype=np.int32)
    int_data = int_data + np.arange(10, dtype=np.int32)
    converted = as_float_array(int_data, copy=False)
    np.testing.assert_equal(converted.dtype, np.float32)

    # int64 input: the result is expected to be float64
    int_data = int_data.astype(np.int64)
    converted = as_float_array(int_data, copy=True)
    # The converted array must be a different object from the input
    assert_true(as_float_array(int_data, False) is not int_data)
    np.testing.assert_equal(converted.dtype, np.float64)

    # Input that is already float32 must be handed back untouched
    float_data = np.ones((3, 2), dtype=np.float32)
    assert_true(as_float_array(float_data, copy=False) is float_data)

    # Fortran-ordered input must stay Fortran-ordered
    fortran_data = np.asfortranarray(float_data)
    assert_true(np.isfortran(as_float_array(fortran_data, copy=True)))

    # With copy=True, writing into the result must not leak into the input
    originals = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray(),
    ]
    for original in originals:
        duplicate = as_float_array(original, copy=True)
        duplicate[0, 0] = np.nan
        assert_false(np.isnan(original).any())
Example #2
0
def test_check_increasing_down():
    """Non-increasing y values must give increasing=False without any warning."""
    xs = [0, 1, 2, 3, 4, 5]
    ys = [0, -1.5, -2.77, -8.99, -8.99, -50]

    # The call is routed through assert_no_warnings; its result is then checked
    increasing = assert_no_warnings(check_increasing, xs, ys)
    assert_false(increasing)
Example #3
0
def test_pickle_version_warning():
    """Check the warnings emitted when unpickling across versions.

    Four situations are covered: same version (no warning), a different
    version string (warning), a pickle carrying no version information
    (warning mentioning "pre-0.18"), and an estimator whose ``__module__``
    is not sklearn (no warning).
    """
    # first, check no warning when in the same version:
    iris = datasets.load_iris()
    tree = DecisionTreeClassifier().fit(iris.data, iris.target)
    tree_pickle = pickle.dumps(tree)
    assert_true(b"version" in tree_pickle)
    assert_no_warnings(pickle.loads, tree_pickle)

    # check that warning is raised on different version
    tree_pickle_other = tree_pickle.replace(sklearn.__version__.encode(), b"something")
    message = (
        "Trying to unpickle estimator DecisionTreeClassifier from "
        "version {0} when using version {1}. This might lead to "
        "breaking code or invalid results. "
        "Use at your own risk.".format("something", sklearn.__version__)
    )
    assert_warns_message(UserWarning, message, pickle.loads, tree_pickle_other)

    # check that not including any version also works:
    # TreeNoVersion has no getstate, like pre-0.18
    tree = TreeNoVersion().fit(iris.data, iris.target)

    tree_pickle_noversion = pickle.dumps(tree)
    assert_false(b"version" in tree_pickle_noversion)
    message = message.replace("something", "pre-0.18")
    message = message.replace("DecisionTreeClassifier", "TreeNoVersion")
    # check we got the warning about using pre-0.18 pickle
    assert_warns_message(UserWarning, message, pickle.loads, tree_pickle_noversion)

    # check that no warning is raised for external estimators.
    # __module__ is a class attribute shared by every user of TreeNoVersion,
    # so it is restored afterwards to avoid leaking state into other tests.
    module_backup = TreeNoVersion.__module__
    try:
        TreeNoVersion.__module__ = "notsklearn"
        assert_no_warnings(pickle.loads, tree_pickle_noversion)
    finally:
        TreeNoVersion.__module__ = module_backup
Example #4
0
def test_group_shuffle_split():
    """Check GroupShuffleSplit on every grouping listed in ``test_groups``."""
    for groups_i in test_groups:
        X = y = np.ones(len(groups_i))
        n_splits = 6
        test_size = 1.0 / 3
        splitter = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0)

        # repr() must not raise
        repr(splitter)

        # The reported number of splits matches what was requested
        assert_equal(splitter.get_n_splits(X, y, groups=groups_i), n_splits)

        all_groups = np.unique(groups_i)
        group_array = np.asarray(groups_i)

        for train, test in splitter.split(X, y, groups=groups_i):
            train_groups = group_array[train]
            test_groups_ = group_array[test]
            unique_train = np.unique(train_groups)
            unique_test = np.unique(test_groups_)

            # 1) no group appears on both sides of the split
            assert_false(np.any(np.in1d(train_groups, unique_test)))
            assert_false(np.any(np.in1d(test_groups_, unique_train)))

            # 2) train and test together account for every sample
            assert_equal(train_groups.size + test_groups_.size, group_array.size)

            # 3) train and test indices do not overlap
            assert_array_equal(np.intersect1d(train, test), [])

            # 4) the number of distinct groups on each side matches the
            #    requested proportion, allowing +- 1 for rounding
            expected_test = round(test_size * len(all_groups))
            expected_train = round((1.0 - test_size) * len(all_groups))
            assert_true(abs(len(unique_test) - expected_test) <= 1)
            assert_true(abs(len(unique_train) - expected_train) <= 1)
Example #5
0
def test_unsorted_indices():
    """SVC must give the same result for CSR input with sorted or unsorted indices.

    A subset of digits is used because iris, blobs or make_classification
    did not show the problem.
    """

    def linear_svc():
        # Every classifier in this test is built with the same settings
        return svm.SVC(kernel="linear", probability=True, random_state=0)

    digits = load_digits()
    X, y = digits.data[:50], digits.target[:50]
    X_test = sparse.csr_matrix(digits.data[50:100])

    X_sparse = sparse.csr_matrix(X)
    coef_dense = linear_svc().fit(X, y).coef_
    sparse_svc = linear_svc().fit(X_sparse, y)
    coef_sorted = sparse_svc.coef_
    # dense and sparse fits must agree
    assert_array_almost_equal(coef_dense, coef_sorted.toarray())

    # NOTE(review): this relies on row fancy-indexing returning matrices whose
    # indices are flagged as unsorted; that presumably depends on the
    # installed scipy version -- confirm. The assertions below guard it.
    train_rows = np.arange(X.shape[0])
    test_rows = np.arange(X_test.shape[0])
    X_sparse_unsorted = X_sparse[train_rows]
    X_test_unsorted = X_test[test_rows]

    assert_false(X_sparse_unsorted.has_sorted_indices)
    assert_false(X_test_unsorted.has_sorted_indices)

    unsorted_svc = linear_svc().fit(X_sparse_unsorted, y)
    coef_unsorted = unsorted_svc.coef_
    # unsorted indices must not change the fitted coefficients ...
    assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray())
    # ... nor the predicted probabilities
    proba_unsorted = sparse_svc.predict_proba(X_test_unsorted)
    proba_sorted = sparse_svc.predict_proba(X_test)
    assert_array_almost_equal(proba_unsorted, proba_sorted)
Example #6
0
def test_predict_proba_disabled():
    """The fitted search must not expose predict_proba when SVC has probability=False."""
    features = np.arange(20).reshape(5, -1)
    labels = [0, 0, 1, 1, 1]
    base_estimator = SVC(probability=False)
    search = GridSearchCV(base_estimator, {}, cv=2)
    fitted = search.fit(features, labels)
    assert_false(hasattr(fitted, "predict_proba"))
def test_sgd_optimizer_trigger_stopping():
    """With the adaptive schedule, the first trigger divides the rate by 5; the second stops."""
    initial_lr = 2e-6
    zero_params = [np.zeros(shape) for shape in shapes]
    optimizer = SGDOptimizer(zero_params, initial_lr, lr_schedule="adaptive")

    # First call: no stop is requested, and the learning rate is now lr / 5
    first_result = optimizer.trigger_stopping("", False)
    assert_false(first_result)
    assert_equal(initial_lr / 5, optimizer.learning_rate)

    # Second call: a stop is requested
    second_result = optimizer.trigger_stopping("", False)
    assert_true(second_result)
Example #8
0
def test_check_increasing_down_extreme():
    """Strictly decreasing y values must give increasing=False without any warning."""
    xs = [0, 1, 2, 3, 4, 5]
    ys = [0, -1, -2, -3, -4, -5]

    # The call is routed through assert_no_warnings; its result is then checked
    increasing = assert_no_warnings(check_increasing, xs, ys)
    assert_false(increasing)
def test_randomized_svd_sign_flip_with_transpose():
    """Sign flipping in randomized_svd must be based on u, with or without transpose.

    See https://github.com/scikit-learn/scikit-learn/issues/5608
    for more details.
    """

    def largest_magnitude_is_positive(u, v):
        """Return two bools: whether, for every column of u and for every
        row of v, the entry of largest absolute value is positive."""
        u_ok = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_ok = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_ok, v_ok

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_plain, _, v_plain = randomized_svd(mat, 3, flip_sign=True)
    u_based, v_based = largest_magnitude_is_positive(u_plain, v_plain)
    assert_true(u_based)
    assert_false(v_based)

    # With transpose
    u_transposed, _, v_transposed = randomized_svd(mat, 3, flip_sign=True, transpose=True)
    u_based, v_based = largest_magnitude_is_positive(u_transposed, v_transposed)
    assert_true(u_based)
    assert_false(v_based)
def test_projgrad_nmf_fit_nn_output():
    """Neither the components nor the transformed data may contain negative values."""
    base = 5 * np.ones(5)
    steps = np.arange(1, 6)
    A = np.c_[base - steps, base + steps]
    for init in (None, "nndsvd", "nndsvda", "nndsvdar"):
        model = nmf.ProjectedGradientNMF(n_components=2, init=init, random_state=0)
        transformed = model.fit_transform(A)
        has_negative = (model.components_ < 0).any() or (transformed < 0).any()
        assert_false(has_negative)
Example #11
0
def test_scale_function_without_centering():
    """Check ``scale(..., with_mean=False)`` on dense, CSR and CSC input."""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # the first feature is constant zero
    X_csr = sparse.csr_matrix(X)

    # The zero column must not produce NaN, dense or sparse
    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # CSC input gives the same values as dense input
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # axis != 0 is rejected for sparse input
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    dense_mean = X_scaled.mean(axis=0)
    dense_std = X_scaled.std(axis=0)
    assert_array_almost_equal(dense_mean, [0.0, -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(dense_std, [0.0, 1.0, 1.0, 1.0, 1.0])
    # The result must be a new object, i.e. the input was not returned as-is
    assert_true(X_scaled is not X)

    # Statistics of the scaled CSR matrix agree with the dense ones
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
def test_check_is_partition():
    """``cval._check_is_partition`` accepts arange(n) and rejects damaged variants."""
    n_items = 100
    indices = np.arange(n_items)
    assert_true(cval._check_is_partition(indices, n_items))

    # One element missing
    shortened = np.delete(indices, 23)
    assert_false(cval._check_is_partition(shortened, n_items))

    # One element duplicated (and therefore another one missing)
    indices[0] = 23
    assert_false(cval._check_is_partition(indices, n_items))
Example #13
0
def test_cv_iterable_wrapper():
    """Check _CVIterableWrapper on an old-style CV object and on split iterators."""
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    # Import under catch_warnings so that warnings raised by the import are
    # recorded rather than shown
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    old_cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(old_cv)

    # split() of the wrapper yields the same folds as iterating the old object
    np.testing.assert_equal(list(old_cv), list(wrapped_old_skf.split()))

    # get_n_splits() of the wrapper matches len() of the old object
    assert_equal(len(old_cv), wrapped_old_skf.get_n_splits())

    plain_wrapped = check_cv(KFold(n_splits=5).split(X, y))
    # Since the wrapped iterable is enlisted and stored, split can be called
    # any number of times to produce consistent results.
    np.testing.assert_equal(list(plain_wrapped.split(X, y)), list(plain_wrapped.split(X, y)))

    # The same holds when the wrapped iterator came from a shuffled KFold:
    # repeated calls to split on the wrapper give identical folds.
    shuffled_wrapped = check_cv(KFold(n_splits=5, shuffle=True).split(X, y))
    np.testing.assert_equal(list(shuffled_wrapped.split(X, y)), list(shuffled_wrapped.split(X, y)))

    # The shuffled folds must, however, differ from the unshuffled ones.
    try:
        np.testing.assert_equal(list(plain_wrapped.split(X, y)), list(shuffled_wrapped.split(X, y)))
    except AssertionError:
        splits_are_equal = False
    else:
        splits_are_equal = True
    assert_false(
        splits_are_equal,
        "If the splits are randomized, successive calls to split should yield different results",
    )
def test_check_is_permutation():
    """``_check_is_permutation`` accepts arange(n) and rejects damaged variants."""
    n_items = 100
    indices = np.arange(n_items)
    assert_true(_check_is_permutation(indices, n_items))

    # One element missing
    shortened = np.delete(indices, 23)
    assert_false(_check_is_permutation(shortened, n_items))

    # One element duplicated (and therefore another one missing)
    indices[0] = 23
    assert_false(_check_is_permutation(indices, n_items))
Example #15
0
def test_initialize_nn_output():
    """No init scheme may return negative values in W or H."""
    rng = np.random.mtrand.RandomState(42)
    data = np.abs(rng.randn(10, 10))
    for scheme in ("random", "nndsvd", "nndsvda", "nndsvdar"):
        W, H = nmf._initialize_nmf(data, 10, init=scheme, random_state=0)
        has_negative = (W < 0).any() or (H < 0).any()
        assert_false(has_negative)
Example #16
0
def check_estimators_overwrite_params(name, Estimator, X, y):
    """Check that fitting ``Estimator`` on (X, y) leaves its parameters unchanged.

    ``name`` is used to special-case the random projections and to build
    the failure message.
    """
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if hasattr(estimator, "batch_size"):
        # FIXME
        # for MiniBatchDictLearning
        estimator.batch_size = 1

    if name in ["GaussianRandomProjection", "SparseRandomProjection"]:
        # Due to the jl lemma and very few samples, the number of components
        # of the random matrix projection will be greater than the number of
        # features. So we impose a smaller number (avoid "auto" mode).
        estimator = Estimator(n_components=1)

    set_random_state(estimator)

    params_before = estimator.get_params()
    estimator.fit(X, y)
    params_after = estimator.get_params()
    for key, old_value in params_before.items():
        new_value = params_after[key]
        failure_message = "Estimator %s changes its parameter %s from %s to %s during fit." % (
            name,
            key,
            old_value,
            new_value,
        )
        assert_false(np.any(new_value != old_value), failure_message)
Example #17
0
def check_cv_results_array_types(cv_results, param_keys, score_keys):
    """Check the container types and dtypes of the arrays in ``cv_results``."""
    # Parameter entries: masked arrays of dtype object
    params_are_masked = all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys)
    assert_true(params_are_masked)
    params_are_object = all(cv_results[key].dtype == object for key in param_keys)
    assert_true(params_are_object)

    # Score entries: not masked arrays; float64 unless the key starts with "rank"
    any_score_masked = any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys)
    assert_false(any_score_masked)
    scores_are_float = all(cv_results[key].dtype == np.float64 for key in score_keys if not key.startswith("rank"))
    assert_true(scores_are_float)

    # The rank entry is int32
    rank_dtype = cv_results["rank_test_score"].dtype
    assert_true(rank_dtype == np.int32)
Example #18
0
def check_results_array_types(results, param_keys, score_keys):
    """Check the container types and dtypes of the arrays in ``results``."""
    # Parameter entries: masked arrays of dtype object
    params_are_masked = all(isinstance(results[param], np.ma.MaskedArray) for param in param_keys)
    assert_true(params_are_masked)
    params_are_object = all(results[key].dtype == object for key in param_keys)
    assert_true(params_are_object)

    # Score entries: not masked arrays; float64 except for "test_rank_score"
    any_score_masked = any(isinstance(results[key], np.ma.MaskedArray) for key in score_keys)
    assert_false(any_score_masked)
    scores_are_float = all(results[key].dtype == np.float64 for key in score_keys if key != "test_rank_score")
    assert_true(scores_are_float)

    # The rank entry is int32
    rank_dtype = results["test_rank_score"].dtype
    assert_true(rank_dtype == np.int32)
def check_estimators_overwrite_params(name, Estimator):
    """Check that fitting ``Estimator`` on blob data leaves its parameters unchanged.

    ``name`` is passed to the y-conversion helper and used in the failure
    message.
    """
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if hasattr(estimator, "batch_size"):
        # FIXME
        # for MiniBatchDictLearning
        estimator.batch_size = 1

    set_fast_parameters(estimator)
    set_random_state(estimator)

    params_before = estimator.get_params()
    estimator.fit(X, y)
    params_after = estimator.get_params()
    for key, old_value in params_before.items():
        new_value = params_after[key]
        failure_message = "Estimator %s changes its parameter %s from %s to %s during fit." % (
            name,
            key,
            old_value,
            new_value,
        )
        assert_false(np.any(new_value != old_value), failure_message)
Example #20
0
def test_nmf_fit_nn_output():
    """For every solver/init pair, components and transformed data must be non-negative."""
    base = 5 * np.ones(5)
    steps = np.arange(1, 6)
    A = np.c_[base - steps, base + steps]
    for solver in ("cd", "mu"):
        for init in (None, "nndsvd", "nndsvda", "nndsvdar", "random"):
            model = NMF(n_components=2, solver=solver, init=init, random_state=0)
            transformed = model.fit_transform(A)
            has_negative = (model.components_ < 0).any() or (transformed < 0).any()
            assert_false(has_negative)
Example #21
0
def test_fit_predict_with_intermediate_fit_params():
    """Pipeline.fit_predict must route prefixed fit params to the matching step."""
    steps = [("transf", TransfFitParams()), ("clf", FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit_predict(X=None, y=None, transf__should_get_this=True, clf__should_succeed=True)

    # The transformer received its own parameter ...
    assert_true(pipe.named_steps["transf"].fit_params["should_get_this"])
    # ... the final step reports success ...
    assert_true(pipe.named_steps["clf"].successful)
    # ... and the final step's parameter did not reach the transformer
    assert_false("should_succeed" in pipe.named_steps["transf"].fit_params)
Example #22
0
def test_transform_nan():
    """SparsePCA must not return NaN when one feature is 0 in all samples."""
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    transformed = estimator.fit_transform(Y)
    assert_false(np.any(np.isnan(transformed)))
Example #23
0
def test_check_ci_warn():
    """Oscillating y values: a UserWarning mentioning "interval" and increasing=False."""
    xs = [0, 1, 2, 3, 4, 5]
    ys = [0, -1, 2, -3, 4, -5]

    # The call is routed through assert_warns_message; its result is then checked
    increasing = assert_warns_message(UserWarning, "interval", check_increasing, xs, ys)
    assert_false(increasing)
Example #24
0
def test_imputation_copy():
    """Check when Imputer.transform returns a copy and when it works in place.

    Each case imputes, overwrites one entry of the output, then compares
    output and input: a difference means a copy was made, equality means
    the input was modified in place.
    """

    def fit_then_transform(imputer, data):
        # Fit and transform on the very same object, as every case requires
        return imputer.fit(data).transform(data)

    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    Xt = fit_then_transform(Imputer(missing_values=0, strategy="mean", copy=True), X)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    Xt = fit_then_transform(Imputer(missing_values=X.data[0], strategy="mean", copy=True), X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    Xt = fit_then_transform(Imputer(missing_values=0, strategy="mean", copy=False), X)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False, sparse csr, axis=1 => no copy
    X = X_orig.copy()
    Xt = fit_then_transform(Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1), X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=0 => no copy
    X = X_orig.copy().tocsc()
    Xt = fit_then_transform(Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0), X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=0 => copy
    X = X_orig.copy()
    Xt = fit_then_transform(Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0), X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=1 => copy
    X = X_orig.copy().tocsc()
    Xt = fit_then_transform(Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1), X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    # (checked through the output no longer being sparse)
    X = X_orig.copy()
    Xt = fit_then_transform(Imputer(missing_values=0, strategy="mean", copy=False, axis=1), X)
    assert_false(sparse.issparse(Xt))
def test_multi_output_classification_partial_fit_parallelism():
    """With n_jobs=-1, each partial_fit call must yield a new first sub-estimator."""
    base_clf = SGDClassifier(loss="log", random_state=1)
    multi_clf = MultiOutputClassifier(base_clf, n_jobs=-1)

    multi_clf.partial_fit(X, y, classes)
    first_round = multi_clf.estimators_[0]

    multi_clf.partial_fit(X, y)
    second_round = multi_clf.estimators_[0]

    # parallelism requires this to be the case for a sane implementation
    assert_false(first_round is second_round)
Example #26
0
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError"""
    single_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(single_class)
    # Reading multilabel_ must warn about deprecation and give a falsy value
    multilabel_flag = assert_warns(DeprecationWarning, getattr, lb, "multilabel_")
    assert_false(multilabel_flag)

    # The fitted binarizer rejects multilabel input
    multi_label = [(2, 3), (0,), (0, 2)]
    assert_raises(ValueError, lb.transform, multi_label)

    # An unfitted binarizer rejects empty input in both directions
    lb = LabelBinarizer()
    assert_raises(ValueError, lb.transform, [])
    assert_raises(ValueError, lb.inverse_transform, [])

    # Invalid neg_label / pos_label combinations
    y = np.array([[0, 1, 0], [1, 1, 1]])
    classes = np.arange(3)
    assert_raises(ValueError, label_binarize, y, classes, multilabel=True, neg_label=2, pos_label=1)
    assert_raises(ValueError, label_binarize, y, classes, multilabel=True, neg_label=2, pos_label=2)

    assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=1)
    assert_raises(ValueError, LabelBinarizer, neg_label=2, pos_label=2)

    assert_raises(ValueError, LabelBinarizer, neg_label=1, pos_label=2, sparse_output=True)

    # Each call below gets a freshly built input; nothing is shared between
    # them in case the callee modifies its argument.

    # Fail on y_type
    unknown_type_kwargs = dict(
        y=csr_matrix([[1, 2], [2, 1]]),
        output_type="foo",
        classes=[1, 2],
        threshold=0,
    )
    assert_raises(ValueError, _inverse_binarize_thresholding, **unknown_type_kwargs)

    # Fail on the number of classes
    class_count_kwargs = dict(
        y=csr_matrix([[1, 2], [2, 1]]),
        output_type="foo",
        classes=[1, 2, 3],
        threshold=0,
    )
    assert_raises(ValueError, _inverse_binarize_thresholding, **class_count_kwargs)

    # Fail on the dimension of 'binary'
    binary_shape_kwargs = dict(
        y=np.array([[1, 2, 3], [2, 1, 3]]),
        output_type="binary",
        classes=[1, 2, 3],
        threshold=0,
    )
    assert_raises(ValueError, _inverse_binarize_thresholding, **binary_shape_kwargs)

    # Fail on multioutput data
    assert_raises(ValueError, LabelBinarizer().fit, np.array([[1, 3], [2, 1]]))
    assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]), [1, 2, 3])
Example #27
0
def test_regression_squared_loss_nn_l2():
    """SGDRegressor with the "nnl2" penalty: pinned error, pinned coef sum, no negative coef."""
    X, y, _ = make_nn_regression(n_samples=100, n_features=10, n_informative=8, random_state=0)

    reg = SGDRegressor(
        loss="squared",
        penalty="nnl2",
        learning_rate="constant",
        eta0=1e-1,
        alpha=1e-4,
        random_state=0,
    )
    reg.fit(X, y)

    predictions = reg.predict(X)
    mean_squared_error_ = np.mean((predictions - y) ** 2)
    assert_almost_equal(mean_squared_error_, 0.033, 3)
    assert_almost_equal(reg.coef_.sum(), 2.131, 3)
    # No coefficient may be negative
    assert_false((reg.coef_ < 0).any())
Example #28
0
def test_partial_fit_errors():
    """Test partial_fit error handling."""
    samples = [[3, 2], [1, 6]]
    targets = [1, 0]

    # classes=[2] does not contain the labels present in the targets
    sgd_partial_fit = MLPClassifier(algorithm="sgd").partial_fit
    assert_raises(ValueError, sgd_partial_fit, samples, targets, classes=[2])

    # l-bfgs doesn't support partial_fit
    lbfgs_clf = MLPClassifier(algorithm="l-bfgs")
    assert_false(hasattr(lbfgs_clf, "partial_fit"))
def test_pairwise_attribute():
    """The meta-classifiers' ``_pairwise`` must be truthy only for the precomputed-kernel SVC."""
    precomputed_svc = svm.SVC(kernel="precomputed")
    default_svc = svm.SVC()

    for meta_classifier in (OneVsRestClassifier, OneVsOneClassifier):
        wrapped_default = meta_classifier(default_svc)
        assert_false(wrapped_default._pairwise)

        wrapped_precomputed = meta_classifier(precomputed_svc)
        assert_true(wrapped_precomputed._pairwise)
Example #30
0
def test_no_nan():
    """The Silhouette Coefficient must not be nan when a cluster holds a single sample.

    This tests for the condition that caused issue 960: only one sample
    belongs to cluster 0, which used to make silhouette_score return nan.
    """
    labels = np.array([1, 0, 1, 1, 1])
    n_samples = len(labels)
    # The distance matrix doesn't actually matter.
    distances = np.random.RandomState(0).rand(n_samples, n_samples)
    score = silhouette_score(distances, labels, metric="precomputed")
    assert_false(np.isnan(score))