Beispiel #1
0
def test_grid_search_score_method():
    """Check that ``GridSearchCV.score`` is consistent across setups.

    Four searches are fitted on the same noisy binary problem: one with
    ``scoring=None``, one with ``'accuracy'`` and two with ``'roc_auc'``
    (one of them wrapping ``LinearSVCNoScore``). The ``None`` and accuracy
    scores must agree, and so must the two AUC scores.
    """
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    base_clf = LinearSVC(random_state=0)
    param_grid = {'C': [.1]}

    def fitted_search(estimator, scoring):
        # Build and fit one grid search over the shared data.
        return GridSearchCV(estimator, param_grid, scoring=scoring).fit(X, y)

    search_default = fitted_search(base_clf, None)
    search_acc = fitted_search(base_clf, 'accuracy')
    search_auc_no_score = fitted_search(LinearSVCNoScore(), 'roc_auc')
    search_auc = fitted_search(base_clf, 'roc_auc')

    # Score every fitted search on the training data.
    default_score = search_default.score(X, y)
    acc_score = search_acc.score(X, y)
    auc_no_score = search_auc_no_score.score(X, y)
    auc_score = search_auc.score(X, y)

    # Sanity: neither metric is saturated and the two metrics differ.
    assert_true(auc_score < 1.0)
    assert_true(acc_score < 1.0)
    assert_not_equal(auc_score, acc_score)

    assert_almost_equal(acc_score, default_score)
    assert_almost_equal(auc_score, auc_no_score)
Beispiel #2
0
def test_kernel_clone_after_set_params(kernel):
    """Verify that ``set_params`` does not break ``clone`` on a kernel.

    This used to fail because kernels such as the RBF had non-trivial
    logic in the constructor that modified the length scale. See
    https://github.com/scikit-learn/scikit-learn/issues/6961 for details.
    """
    bounds = (1e-5, 1e5)
    kernel_cloned = clone(kernel)
    params = kernel.get_params()
    # Kernels of these types are treated as isotropic and are not exercised.
    isotropic_kernels = (ExpSineSquared, RationalQuadratic)
    if 'length_scale' not in params or isinstance(kernel, isotropic_kernels):
        return

    current_scale = params['length_scale']
    if np.iterable(current_scale):
        # Iterable length scale -> collapse to its first entry.
        new_scale, new_bounds = current_scale[0], bounds
    else:
        # Scalar length scale -> expand to two entries.
        new_scale, new_bounds = [current_scale] * 2, bounds * 2
    params['length_scale'] = new_scale
    params['length_scale_bounds'] = new_bounds

    kernel_cloned.set_params(**params)
    second_clone = clone(kernel_cloned)
    assert_equal(second_clone.get_params(), kernel_cloned.get_params())
    assert_not_equal(id(second_clone), id(kernel_cloned))
    check_hyperparameters_equal(kernel_cloned, second_clone)
Beispiel #3
0
def test_base():
    """Check BaseEnsemble methods on a small bagging ensemble."""
    iris = load_iris()
    ensemble = BaggingClassifier(
        base_estimator=Perceptron(tol=1e-3, random_state=None),
        n_estimators=3,
    )
    ensemble.fit(iris.data, iris.target)

    # Drop the fitted estimators and rebuild the list by hand.
    ensemble.estimators_ = []
    ensemble._make_estimator()
    rng = np.random.RandomState(3)
    for _ in range(2):
        ensemble._make_estimator(random_state=rng)
    # append=False: this estimator must not end up in ``estimators_``.
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))

    first, second, third = ensemble[0], ensemble[1], ensemble[2]
    assert_true(isinstance(first, Perceptron))
    assert_equal(first.random_state, None)
    assert_true(isinstance(second.random_state, int))
    assert_true(isinstance(third.random_state, int))
    assert_not_equal(second.random_state, third.random_state)

    # Fitting with a numpy integer for n_estimators must also work.
    np_int_ensemble = BaggingClassifier(base_estimator=Perceptron(tol=1e-3),
                                        n_estimators=np.int32(3))
    np_int_ensemble.fit(iris.data, iris.target)
def test_kernel_clone_after_set_params():
    """Verify that ``set_params`` does not break ``clone`` on kernels.

    This used to fail because kernels such as the RBF had non-trivial
    logic in the constructor that modified the length scale. See
    https://github.com/scikit-learn/scikit-learn/issues/6961 for details.

    Every kernel in the module-level ``kernels`` collection that exposes a
    ``length_scale`` parameter (and is not one of the isotropic types) is
    cloned, reconfigured through ``set_params`` and cloned again.
    """
    bounds = (1e-5, 1e5)
    # Kernels of these types are treated as isotropic and are not exercised.
    isotropic_kernels = (ExpSineSquared, RationalQuadratic)
    for kernel in kernels:
        kernel_cloned = clone(kernel)
        params = kernel.get_params()
        if ('length_scale' not in params
                or isinstance(kernel, isotropic_kernels)):
            continue

        length_scale = params['length_scale']
        if np.iterable(length_scale):
            # Iterable length scale -> collapse to its first entry.
            params['length_scale'] = length_scale[0]
            params['length_scale_bounds'] = bounds
        else:
            # Scalar length scale -> expand to two entries.
            params['length_scale'] = [length_scale] * 2
            params['length_scale_bounds'] = bounds * 2
        kernel_cloned.set_params(**params)
        kernel_cloned_clone = clone(kernel_cloned)
        assert_equal(kernel_cloned_clone.get_params(),
                     kernel_cloned.get_params())
        assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned))
        # Call the check directly instead of yielding it: generator-style
        # tests are not executed by current pytest, so a yielded check
        # would silently never run.
        check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone)
def test_grid_search_score_method():
    """Check ``GridSearchCV.score`` consistency and absence of warnings.

    Scores are obtained through ``assert_no_warnings``, so each ``score``
    call must complete without emitting a warning.
    """
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    svc = LinearSVC(random_state=0)
    param_grid = {'C': [.1]}

    fitted_default = GridSearchCV(svc, param_grid, scoring=None)
    fitted_default = fitted_default.fit(X, y)
    fitted_acc = GridSearchCV(svc, param_grid, scoring='accuracy')
    fitted_acc = fitted_acc.fit(X, y)
    fitted_auc_no_score = GridSearchCV(LinearSVCNoScore(), param_grid,
                                       scoring='roc_auc')
    fitted_auc_no_score = fitted_auc_no_score.fit(X, y)
    fitted_auc = GridSearchCV(svc, param_grid, scoring='roc_auc')
    fitted_auc = fitted_auc.fit(X, y)

    # ChangedBehaviourWarning occurred previously (prior to #9005)
    default_score = assert_no_warnings(fitted_default.score, X, y)
    acc_score = assert_no_warnings(fitted_acc.score, X, y)
    auc_no_score = assert_no_warnings(fitted_auc_no_score.score, X, y)
    auc_score = assert_no_warnings(fitted_auc.score, X, y)

    # Sanity: neither metric is saturated and the two metrics differ.
    assert_true(auc_score < 1.0)
    assert_true(acc_score < 1.0)
    assert_not_equal(auc_score, acc_score)

    assert_almost_equal(acc_score, default_score)
    assert_almost_equal(auc_score, auc_no_score)
def test_kernel_pca():
    """Exercise KernelPCA fit/transform/inverse_transform on random data.

    Every combination of eigen solver and built-in kernel is checked for:
    agreement (up to sign) between ``fit_transform`` and ``fit().transform``,
    a non-empty projection, matching output width on new data, and an
    inverse transform that restores the input shape.
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            # Signs of components are arbitrary, so compare magnitudes.
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel.
            # Check ``.size`` rather than comparing the array with ``[]``:
            # an ndarray-vs-list comparison is not an emptiness test and
            # can raise when the shapes cannot be broadcast.
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
Beispiel #7
0
def test_grid_search_score_method():
    """Check ``GridSearchCV.score`` consistency and absence of warnings.

    The default-scoring search must score like the accuracy search, and
    the two ROC AUC searches must score alike; no ``score`` call may warn.
    """
    X, y = make_classification(
        n_samples=100, n_classes=2, flip_y=.2, random_state=0)
    clf = LinearSVC(random_state=0)
    grid = {'C': [.1]}

    # Fit the four searches in a fixed order on the same data.
    by_default = GridSearchCV(clf, grid, scoring=None).fit(X, y)
    by_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y)
    by_auc_no_score = GridSearchCV(
        LinearSVCNoScore(), grid, scoring='roc_auc').fit(X, y)
    by_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y)

    # ChangedBehaviourWarning occurred previously (prior to #9005)
    s_default = assert_no_warnings(by_default.score, X, y)
    s_accuracy = assert_no_warnings(by_accuracy.score, X, y)
    s_auc_no_score = assert_no_warnings(by_auc_no_score.score, X, y)
    s_auc = assert_no_warnings(by_auc.score, X, y)

    # ensure the test is sane
    assert_true(s_auc < 1.0)
    assert_true(s_accuracy < 1.0)
    assert_not_equal(s_auc, s_accuracy)

    assert_almost_equal(s_accuracy, s_default)
    assert_almost_equal(s_auc, s_auc_no_score)
def test_kernel_pca():
    """Exercise KernelPCA fit/transform/inverse_transform on random data.

    Every combination of eigen solver and built-in kernel is checked for:
    agreement (up to sign) between ``fit_transform`` and ``fit().transform``,
    a non-empty projection, matching output width on new data, and an
    inverse transform that restores the input shape.
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4,
                             kernel=kernel,
                             eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            # Signs of components are arbitrary, so compare magnitudes.
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel.
            # Check ``.size`` rather than comparing the array with ``[]``:
            # an ndarray-vs-list comparison is not an emptiness test and
            # can raise when the shapes cannot be broadcast.
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
def test_score_sample_weight():
    """Check that ``score`` reacts to ``sample_weight``.

    A shallow decision tree classifier (iris) and regressor (boston) are
    fitted, then scored with and without random integer sample weights;
    the two scores must differ.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import DecisionTreeRegressor
    from sklearn import datasets

    rng = np.random.RandomState(0)

    # One (estimator, dataset) pair per mixin: ClassifierMixin and
    # RegressorMixin.
    cases = zip(
        [DecisionTreeClassifier(max_depth=2),
         DecisionTreeRegressor(max_depth=2)],
        [datasets.load_iris(), datasets.load_boston()],
    )

    for model, dataset in cases:
        features, target = dataset.data, dataset.target
        model.fit(features, target)
        # Random integer weights in [1, 10), one per sample.
        weights = rng.randint(1, 10, size=len(target))
        plain_score = model.score(features, target)
        weighted_score = model.score(features, target,
                                     sample_weight=weights)
        assert_not_equal(plain_score,
                         weighted_score,
                         msg="Unweighted and weighted scores "
                             "are unexpectedly equal")
Beispiel #10
0
def test_kernel_clone():
    """Test that sklearn's clone works correctly on kernels."""
    for kernel in kernels:
        duplicate = clone(kernel)

        # The clone compares equal but is a distinct object.
        assert_equal(kernel, duplicate)
        assert_not_equal(id(kernel), id(duplicate))
        for attr in kernel.__dict__:
            original_value = getattr(kernel, attr)
            cloned_value = getattr(duplicate, attr)
            if attr.startswith("hyperparameter_"):
                # Hyperparameter descriptors are compared field by field.
                assert_equal(original_value.name, cloned_value.name)
                assert_equal(original_value.value_type,
                             cloned_value.value_type)
                assert_array_equal(original_value.bounds,
                                   cloned_value.bounds)
                assert_equal(original_value.n_elements,
                             cloned_value.n_elements)
            elif np.iterable(original_value):
                # Compare element-wise; nested iterables go through the
                # array comparison.
                for idx in range(len(original_value)):
                    item = original_value[idx]
                    cloned_item = cloned_value[idx]
                    if np.iterable(item):
                        assert_array_equal(item, cloned_item)
                    else:
                        assert_equal(item, cloned_item)
            else:
                assert_equal(original_value, cloned_value)
            if not isinstance(original_value, Hashable):
                # modifiable attributes must not be identical
                assert_not_equal(id(original_value), id(cloned_value))
Beispiel #11
0
def test_cross_val_generator_with_indices():
    """Check that CV generators in index mode yield non-boolean indices.

    Each generator is built with ``indices=True`` (which must raise a
    ``DeprecationWarning``), except ``Bootstrap`` which is constructed
    without that argument. Both the train and the test part of every split
    must be usable for indexing and must not be boolean masks.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=True)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=True)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=True)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=True)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=True)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=True)
    b = cval.Bootstrap(2)  # only in index mode
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask; check BOTH halves
            # (the second assertion previously re-checked ``train``).
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must succeed for data and targets alike.
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
def test_discretenb_predict_proba():
    """Test discrete NB classes' probability scores"""

    # The 100s below distinguish Bernoulli from multinomial.
    X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
    X_multinomial = [[0, 1], [1, 3], [4, 0]]
    # Each model class is paired with the data set it is exercised on.
    cases = [(BernoulliNB, X_bernoulli), (MultinomialNB, X_multinomial)]

    # Confirm that the 100s above distinguish Bernoulli from multinomial
    y = [0, 0, 1]
    bernoulli_pred = BernoulliNB().fit(X_bernoulli, y).predict(X_bernoulli)
    multinomial_pred = MultinomialNB().fit(X_bernoulli, y).predict(X_bernoulli)
    assert_not_equal(bernoulli_pred[-1], multinomial_pred[-1])

    # test binary case (1-d output)
    y = [0, 0, 2]   # 2 is regression test for binary case, 02e673
    for model_cls, data in cases:
        model = model_cls().fit(data, y)
        assert_equal(model.predict(data[-1]), 2)
        assert_equal(model.predict_proba(data[0]).shape, (1, 2))
        row_sums = model.predict_proba(data[:2]).sum(axis=1)
        assert_array_almost_equal(row_sums, np.array([1., 1.]), 6)

    # test multiclass case (2-d output, must sum to one)
    y = [0, 1, 2]
    for model_cls, data in cases:
        model = model_cls().fit(data, y)
        assert_equal(model.predict_proba(data[0]).shape, (1, 3))
        assert_equal(model.predict_proba(data[:2]).shape, (2, 3))
        assert_almost_equal(np.sum(model.predict_proba(data[1])), 1)
        assert_almost_equal(np.sum(model.predict_proba(data[-1])), 1)
        # Exponentiated log-priors / intercepts must also sum to one.
        assert_almost_equal(np.sum(np.exp(model.class_log_prior_)), 1)
        assert_almost_equal(np.sum(np.exp(model.intercept_)), 1)
def test_kernel_pca():
    """Exercise KernelPCA with built-in kernels and a callable kernel.

    For each eigen solver / kernel pair the test checks fit/transform
    agreement up to sign, a non-empty projection, matching output width on
    new data and, for non-callable kernels only, the inverse transform
    shape.
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    solvers = ("auto", "dense", "arpack")
    kernel_choices = ("linear", "rbf", "poly", histogram)

    for solver in solvers:
        for kern in kernel_choices:
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            can_invert = not callable(kern)

            # transform fit data
            kpca = KernelPCA(
                4,
                kernel=kern,
                eigen_solver=solver,
                fit_inverse_transform=can_invert,
            )
            projected = kpca.fit_transform(X_fit)
            projected_again = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(projected),
                                      np.abs(projected_again))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(projected.size, 0)

            # transform new data
            pred_projected = kpca.transform(X_pred)
            assert_equal(pred_projected.shape[1], projected.shape[1])

            # inverse transform
            if can_invert:
                restored = kpca.inverse_transform(pred_projected)
                assert_equal(restored.shape, X_pred.shape)
Beispiel #14
0
def test_similarity_tree():
    """Check how SkopeRules groups similar rules and deduplicates them."""
    # Each entry is (rule string, 3-tuple of numbers).
    rules = [
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a > 2 and b > 45", (0.5, 0.3, 0)),
        ("a > 2 and b > 40", (0.5, 0.2, 0)),
        ("a <= 2 and b <= 45", (1, 1, 0)),
        ("a > 2 and c <= 3", (1, 1, 0)),
        ("b > 45", (1, 1, 0)),
    ]

    sk = SkopeRules(max_depth_duplication=2)
    rulesets = sk._find_similar_rulesets(rules)

    # For every rule, record the indices of the bags that contain it.
    bags_per_rule = [
        [bag_idx for bag_idx, bag in enumerate(rulesets) if rule in bag]
        for rule in rules
    ]

    # Identical rules share their bags; rule 2 lands elsewhere.
    assert_equal(bags_per_rule[0], bags_per_rule[1])
    assert_not_equal(bags_per_rule[0], bags_per_rule[2])

    # Assert the best rules are kept
    kept_rules = sk.deduplicate(rules)
    assert_in(rules[0], kept_rules)
    assert_in(rules[2], kept_rules)
    assert_not_in(rules[3], kept_rules)
Beispiel #15
0
def test_grid_search_score_method():
    """Check which ``GridSearchCV.score`` calls emit ChangedBehaviorWarning.

    The two searches that combine ``LinearSVC`` with an explicit
    ``scoring`` string must warn on ``score``; the default-scoring search
    and the ``LinearSVCNoScore`` search must not. Scores themselves must
    still be consistent between the equivalent setups.
    """
    X, y = make_classification(
        n_samples=100, n_classes=2, flip_y=.2, random_state=0)
    clf = LinearSVC(random_state=0)
    grid = {'C': [.1]}

    plain_search = GridSearchCV(clf, grid, scoring=None).fit(X, y)
    accuracy_search = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y)
    auc_search_no_score = GridSearchCV(
        LinearSVCNoScore(), grid, scoring='roc_auc').fit(X, y)
    auc_search = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y)

    # Check warning only occurs in situation where behavior changed:
    # estimator requires score method to compete with scoring parameter
    plain_score = assert_no_warnings(plain_search.score, X, y)
    accuracy_score_ = assert_warns(
        ChangedBehaviorWarning, accuracy_search.score, X, y)
    auc_score_no_score = assert_no_warnings(
        auc_search_no_score.score, X, y)
    auc_score_ = assert_warns(
        ChangedBehaviorWarning, auc_search.score, X, y)

    # ensure the test is sane
    assert_true(auc_score_ < 1.0)
    assert_true(accuracy_score_ < 1.0)
    assert_not_equal(auc_score_, accuracy_score_)

    assert_almost_equal(accuracy_score_, plain_score)
    assert_almost_equal(auc_score_, auc_score_no_score)
Beispiel #16
0
def test_hash_functions():
    """Check randomness of the LSHForest hash functions.

    The variance and mean of each individual hash function (projection
    vector) should differ from those of all hash functions taken together.
    If the hash functions were not built randomly (seeded with the same
    value), variances and means of all functions would be equal.
    """
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    # Construction and fitting are wrapped so their warnings are ignored.
    make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
    lshf = make_forest(
        n_estimators=n_estimators,
        random_state=rng.randint(0, np.iinfo(np.int32).max))
    ignore_warnings(lshf.fit)(X)

    all_components = [lshf.hash_functions_[k].components_
                      for k in range(n_estimators)]

    # Statistics over the stacked components of every estimator.
    overall_var = np.var(all_components)
    overall_mean = np.mean(all_components)

    for k in range(n_estimators):
        assert_not_equal(overall_var,
                         np.var(lshf.hash_functions_[k].components_))

    for k in range(n_estimators):
        assert_not_equal(overall_mean,
                         np.mean(lshf.hash_functions_[k].components_))
def test_kernel_clone():
    """Test that sklearn's clone works correctly on kernels."""
    for kernel in kernels:
        kernel_copy = clone(kernel)

        # Equal by value, different by identity.
        assert_equal(kernel, kernel_copy)
        assert_not_equal(id(kernel), id(kernel_copy))
        for name in kernel.__dict__:
            value = getattr(kernel, name)
            value_copy = getattr(kernel_copy, name)
            if name.startswith("hyperparameter_"):
                # Compare the descriptor fields one at a time.
                assert_equal(value.name, value_copy.name)
                assert_equal(value.value_type, value_copy.value_type)
                assert_array_equal(value.bounds, value_copy.bounds)
                assert_equal(value.n_elements, value_copy.n_elements)
            elif np.iterable(value):
                for pos in range(len(value)):
                    # Nested iterables need the array-aware comparison.
                    if np.iterable(value[pos]):
                        compare = assert_array_equal
                    else:
                        compare = assert_equal
                    compare(value[pos], value_copy[pos])
            else:
                assert_equal(value, value_copy)
            if not isinstance(value, Hashable):
                # modifiable attributes must not be identical
                assert_not_equal(id(value), id(value_copy))
Beispiel #18
0
def test_hash_rule():
    """Check set membership behaviour of ``Rule`` objects.

    ``'a <= 2 and a <= 3'`` and ``'a <= 2'`` must collapse to a single set
    element, whereas ``'a <= 4 and a <= 3'`` and ``'a <= 2'`` must not.
    """
    collapsing = {
        Rule('a <= 2 and a <= 3'),
        Rule('a <= 2'),
    }
    assert_equal(len(collapsing), 1)

    distinct = {
        Rule('a <= 4 and a <= 3'),
        Rule('a <= 2'),
    }
    assert_not_equal(len(distinct), 1)
Beispiel #19
0
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             return_indicator=True,
                                             random_state=0)
    (X_train, X_test, y_train, y_test,
     y_ml_train, y_ml_test) = train_test_split(X, y, y_ml, random_state=0)

    # Zero weight on the first ten test samples, unit weight elsewhere.
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)

    # Map scorer name -> fitted estimator; later groups override earlier
    # ones on a name clash.
    estimator = {}
    for scorer_name in REGRESSION_SCORERS:
        estimator[scorer_name] = sensible_regr
    for scorer_name in CLF_SCORERS:
        estimator[scorer_name] = sensible_clf
    for scorer_name in MULTILABEL_ONLY_SCORERS:
        estimator[scorer_name] = sensible_ml_clf

    for name, scorer in SCORERS.items():
        target = y_ml_test if name in MULTILABEL_ONLY_SCORERS else y_test
        try:
            model = estimator[name]
            weighted = scorer(model, X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(model, X_test[10:], target[10:])
            unweighted = scorer(model, X_test, target)

            identical_msg = ("scorer {0} behaves identically when called "
                             "with sample weights: {1} vs {2}")
            assert_not_equal(
                weighted, unweighted,
                msg=identical_msg.format(name, weighted, unweighted))

            # Zero-weighting samples must equal leaving them out.
            differs_msg = ("scorer {0} behaves differently when ignoring "
                           "samples and setting sample_weight to 0: "
                           "{1} vs {2}")
            assert_almost_equal(
                weighted, ignored,
                err_msg=differs_msg.format(name, weighted, ignored))

        except TypeError as e:
            # A scorer may reject sample_weight, but must say so.
            unhelpful_msg = ("scorer {0} raises unhelpful exception when "
                             "called with sample weights: {1}")
            assert_true("sample_weight" in str(e),
                        unhelpful_msg.format(name, str(e)))
def test_additive_chi2_sampler():
    """Test that AdditiveChi2Sampler approximates the kernel on the data.

    Uses the module-level ``X`` and ``Y``. Also covers sparse input,
    rejection of negative values and of an invalid ``sample_steps``, and
    handling of ``sample_interval``.
    """
    # compute exact kernel
    # abbreviations for an easier formula
    X_expanded = X[:, np.newaxis, :]
    Y_expanded = Y[np.newaxis, :, :]
    per_feature = 2 * X_expanded * Y_expanded / (X_expanded + Y_expanded)

    # reduce to n_samples_x x n_samples_y by summing over features
    exact_kernel = per_feature.sum(axis=2)

    # approximate kernel mapping
    transform = AdditiveChi2Sampler(sample_steps=3)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)
    approx_kernel = np.dot(X_trans, Y_trans.T)

    assert_array_almost_equal(exact_kernel, approx_kernel, 1)

    # Sparse input must produce the same features as dense input.
    X_sp_trans = transform.fit_transform(csr_matrix(X))
    Y_sp_trans = transform.transform(csr_matrix(Y))
    assert_array_equal(X_trans, X_sp_trans.A)
    assert_array_equal(Y_trans, Y_sp_trans.A)

    # test error is raised on negative input
    Y_negative = Y.copy()
    Y_negative[0, 0] = -1
    assert_raises(ValueError, transform.transform, Y_negative)

    # test error on invalid sample_steps
    transform = AdditiveChi2Sampler(sample_steps=4)
    assert_raises(ValueError, transform.fit, X)

    # test that the sample interval is set correctly
    for steps in [1, 2, 3]:
        # test that the sample_interval is initialized correctly
        transform = AdditiveChi2Sampler(sample_steps=steps)
        assert_equal(transform.sample_interval, None)

        # test that the sample_interval is changed in the fit method
        transform.fit(X)
        assert_not_equal(transform.sample_interval_, None)

    # An explicit sample_interval is kept as given, before and after fit.
    sample_interval = 0.3
    transform = AdditiveChi2Sampler(sample_steps=4,
                                    sample_interval=sample_interval)
    assert_equal(transform.sample_interval, sample_interval)
    transform.fit(X)
    assert_equal(transform.sample_interval_, sample_interval)
Beispiel #21
0
def test_additive_chi2_sampler():
    """Test that AdditiveChi2Sampler approximates the kernel on the data.

    Works on the module-level ``X`` and ``Y``; additionally checks sparse
    input, error handling and the ``sample_interval`` attributes.
    """
    # compute exact kernel via broadcasting:
    # abbreviations for easier formula
    x_b = X[:, np.newaxis, :]
    y_b = Y[np.newaxis, :, :]

    # reduce to n_samples_x x n_samples_y by summing over features
    kernel = (2 * x_b * y_b / (x_b + y_b)).sum(axis=2)

    # approximate kernel mapping
    sampler = AdditiveChi2Sampler(sample_steps=3)
    X_mapped = sampler.fit_transform(X)
    Y_mapped = sampler.transform(Y)

    kernel_approx = np.dot(X_mapped, Y_mapped.T)
    assert_array_almost_equal(kernel, kernel_approx, 1)

    # Dense and sparse inputs must be mapped identically.
    X_sparse_mapped = sampler.fit_transform(csr_matrix(X))
    Y_sparse_mapped = sampler.transform(csr_matrix(Y))
    assert_array_equal(X_mapped, X_sparse_mapped.A)
    assert_array_equal(Y_mapped, Y_sparse_mapped.A)

    # test error is raised on negative input
    Y_neg = Y.copy()
    Y_neg[0, 0] = -1
    assert_raises(ValueError, sampler.transform, Y_neg)

    # test error on invalid sample_steps
    sampler = AdditiveChi2Sampler(sample_steps=4)
    assert_raises(ValueError, sampler.fit, X)

    # test that the sample interval is set correctly
    for n_steps in [1, 2, 3]:
        sampler = AdditiveChi2Sampler(sample_steps=n_steps)
        # before fit: the constructor argument is left at None
        assert_equal(sampler.sample_interval, None)

        # after fit: the fitted attribute has been filled in
        sampler.fit(X)
        assert_not_equal(sampler.sample_interval_, None)

    # test that an explicit sample_interval is set correctly
    sample_interval = 0.3
    sampler = AdditiveChi2Sampler(sample_steps=4,
                                  sample_interval=sample_interval)
    assert_equal(sampler.sample_interval, sample_interval)
    sampler.fit(X)
    assert_equal(sampler.sample_interval_, sample_interval)
Beispiel #22
0
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors."""
    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    # Zero weight on the first ten test samples, unit weight elsewhere.
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)

    # Scorer name -> fitted estimator, filled group by group so that a
    # later group overrides an earlier one on a name clash.
    estimator = {}
    scorer_groups = (
        (REGRESSION_SCORERS, sensible_regr),
        (CLF_SCORERS, sensible_clf),
        (MULTILABEL_ONLY_SCORERS, sensible_ml_clf),
    )
    for scorer_names, fitted in scorer_groups:
        for scorer_name in scorer_names:
            estimator[scorer_name] = fitted

    for name, scorer in SCORERS.items():
        target = y_ml_test if name in MULTILABEL_ONLY_SCORERS else y_test
        try:
            weighted = scorer(estimator[name], X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)

            same_msg = ("scorer {0} behaves identically when called with "
                        "sample weights: {1} vs {2}")
            assert_not_equal(weighted, unweighted,
                             msg=same_msg.format(name, weighted, unweighted))

            # Zero-weighting samples must equal leaving them out.
            diff_msg = ("scorer {0} behaves differently when ignoring "
                        "samples and setting sample_weight to 0: {1} vs {2}")
            assert_almost_equal(weighted, ignored,
                                err_msg=diff_msg.format(name, weighted,
                                                        ignored))

        except TypeError as e:
            # Rejecting sample_weight is fine as long as the error says so.
            bad_error_msg = ("scorer {0} raises unhelpful exception when "
                             "called with sample weights: {1}")
            assert_true("sample_weight" in str(e),
                        bad_error_msg.format(name, str(e)))
Beispiel #23
0
def test_shuffle_stratifiedkfold():
    # Shuffled splitters seeded differently must produce different test
    # folds, and a shuffled splitter must still cover every sample.
    samples = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    paired_splits = zip(kf0.split(samples, y), kf1.split(samples, y))
    for (_, test0), (_, test1) in paired_splits:
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, samples, y, labels=None, expected_n_iter=5)
def test_shuffle_stratifiedkfold():
    # Two shuffled splitters that differ only in their seed must disagree on
    # every test fold; coverage of all samples is verified separately.
    features = np.ones(40)
    target = [0] * 20 + [1] * 20
    kf0, kf1 = [StratifiedKFold(5, shuffle=True, random_state=seed)
                for seed in (0, 1)]
    folds0 = kf0.split(features, target)
    folds1 = kf1.split(features, target)
    for (_, test0), (_, test1) in zip(folds0, folds1):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, features, target, labels=None, expected_n_iter=5)
Beispiel #25
0
def test_average_precision_score_tied_values():
    # Read from left to right, y_true has its 0 before its 1s, so the
    # ranking looks perfect. The first two samples share the same score
    # (0.5), however, so they could equally be ranked the other way round,
    # which is an imperfect ordering. The final score has to reflect that
    # ambiguity and therefore must not be exactly one.
    labels = [0, 1, 1]
    scores = [.5, .5, .6]
    average_precision = average_precision_score(labels, scores)
    assert_not_equal(average_precision, 1.)
def test_average_precision_score_tied_values():
    # The labels appear correctly sorted (the 0 precedes both 1s), but the
    # negative sample and the first positive sample are tied at 0.5. A tie
    # means the two could be swapped, giving an imperfect ranking, so the
    # average precision is expected to differ from a perfect 1.0.
    tied_truth = [0, 1, 1]
    tied_scores = [.5, .5, .6]
    result = average_precision_score(tied_truth, tied_scores)
    assert_not_equal(result, 1.)
Beispiel #27
0
def test_beta_dist():
    # Random dictionary with unit-norm atoms of shape (n_features, 1)
    random_atoms = [randn(n_features, 1) for _ in range(n_kernels)]
    random_atoms = [atom / norm(atom) for atom in random_atoms]

    # Distance to itself is zero, distance to a random dictionary is not
    assert_equal(0., beta_dist(du, du))
    assert_not_equal(0., beta_dist(du, random_atoms))

    # Atoms with two extra rows must be rejected with a ValueError
    oversized_atoms = [randn(n_features+2, 1) for _ in range(n_kernels)]
    oversized_atoms = [atom / norm(atom) for atom in oversized_atoms]
    assert_raises(ValueError, beta_dist, du, oversized_atoms)
Beispiel #28
0
def test_beta_dist():
    # Build a second dictionary of normalised random atoms
    same_shape = [randn(n_features, 1) for _ in range(n_kernels)]
    same_shape = [column / norm(column) for column in same_shape]

    self_distance = beta_dist(du, du)
    assert_equal(0., self_distance)
    cross_distance = beta_dist(du, same_shape)
    assert_not_equal(0., cross_distance)

    # A dictionary whose atoms have n_features + 2 rows raises ValueError
    wrong_shape = [randn(n_features + 2, 1) for _ in range(n_kernels)]
    wrong_shape = [column / norm(column) for column in wrong_shape]
    assert_raises(ValueError, beta_dist, du, wrong_shape)
Beispiel #29
0
    def test_partial_fit_multiclass_delete_old_class(self):
        # Fit on the first two thirds of (X2, Y2), then delete the class of
        # the sample at index `third`. The label predicted for that sample
        # must be different once its class has been removed.
        third = X2.shape[0] // 3
        two_thirds = 2 * third
        model = self.factory(alpha=0.01)
        model.partial_fit(X2[:two_thirds], Y2[:two_thirds])

        label_before, _ = model.predict_and_score(X2[third])
        deleted = model.delete_class(Y2[third])
        assert_equal(deleted, True)
        label_after, _ = model.predict_and_score(X2[third])
        assert_not_equal(label_before, label_after)
Beispiel #30
0
def test_set_random_states():
    # Smoke test: LinearDiscriminantAnalysis has no random_state to set
    _set_random_states(LinearDiscriminantAnalysis(), random_state=17)

    clf1 = Perceptron(tol=1e-3, random_state=None)
    assert_equal(clf1.random_state, None)
    # even when called with None, an integer seed has to be assigned
    _set_random_states(clf1, None)
    assert_true(isinstance(clf1.random_state, int))

    # the same seed must give the same random_state on two estimators
    _set_random_states(clf1, 3)
    assert_true(isinstance(clf1.random_state, int))
    clf2 = Perceptron(tol=1e-3, random_state=None)
    _set_random_states(clf2, 3)
    assert_equal(clf1.random_state, clf2.random_state)

    # nested random_state parameters inside a pipeline

    def make_steps():
        selector = SelectFromModel(Perceptron(tol=1e-3, random_state=None))
        classifier = Perceptron(tol=1e-3, random_state=None)
        return [('sel', selector), ('clf', classifier)]

    est1 = Pipeline(make_steps())
    _set_random_states(est1, 3)
    selector_step = est1.steps[0][1]
    classifier_step = est1.steps[1][1]
    assert_true(isinstance(selector_step.estimator.random_state, int))
    assert_true(isinstance(classifier_step.random_state, int))
    sel_seed = est1.get_params()['sel__estimator__random_state']
    clf_seed = est1.get_params()['clf__random_state']
    assert_not_equal(sel_seed, clf_seed)

    # the seeds given to several random_state parameters must not depend
    # on the order in which get_params() enumerates them

    class AlphaParamPipeline(Pipeline):
        # get_params() in ascending key order
        def get_params(self, *args, **kwargs):
            items = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(items))

    class RevParamPipeline(Pipeline):
        # get_params() in descending key order
        def get_params(self, *args, **kwargs):
            items = Pipeline.get_params(self, *args, **kwargs).items()
            return OrderedDict(sorted(items, reverse=True))

    for pipeline_cls in (AlphaParamPipeline, RevParamPipeline):
        est2 = pipeline_cls(make_steps())
        _set_random_states(est2, 3)
        assert_equal(sel_seed,
                     est2.get_params()['sel__estimator__random_state'])
        assert_equal(clf_seed,
                     est2.get_params()['clf__random_state'])
def test_kernel_clone(kernel):
    # sklearn's clone must return an equal but distinct kernel object.
    duplicate = clone(kernel)

    # XXX: Should this be fixed?
    # Kernel equality is not the same as sklearn's estimator equality check.
    assert_equal(kernel, duplicate)
    assert_not_equal(id(kernel), id(duplicate))

    # Constructor parameters of original and clone must agree.
    original_params = kernel.get_params()
    assert_equal(original_params, duplicate.get_params())

    # Hyperparameters of original and clone must agree.
    check_hyperparameters_equal(kernel, duplicate)
def test_multi_target_sample_weight_partial_fit():
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]

    # regressor trained with unequal sample weights
    uneven_weights = [2., 1.]
    rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr_w.partial_fit(X, y, uneven_weights)

    # identical regressor trained with equal sample weights
    even_weights = [2., 2.]
    rgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr.partial_fit(X, y, even_weights)

    # different weights must lead to a different first prediction
    first_even = rgr.predict(X)[0][0]
    first_uneven = rgr_w.predict(X)[0][0]
    assert_not_equal(first_even, first_uneven)
def test_multi_target_sample_weight_partial_fit():
    # Two identically seeded regressors that differ only in the sample
    # weights passed to partial_fit must not predict the same first value.
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]

    weighted_model = MultiOutputRegressor(SGDRegressor(random_state=0))
    weighted_model.partial_fit(X, y, [2., 1.])

    uniform_model = MultiOutputRegressor(SGDRegressor(random_state=0))
    uniform_model.partial_fit(X, y, [2., 2.])

    assert_not_equal(uniform_model.predict(X)[0][0],
                     weighted_model.predict(X)[0][0])
Beispiel #34
0
def test_kernel_clone(kernel):
    # Cloning a kernel with sklearn's clone has to preserve its value while
    # producing a new object.
    copy_of_kernel = clone(kernel)

    # XXX: Should this be fixed?
    # Unlike sklearn estimators, a kernel compares equal to its clone.
    assert_equal(kernel, copy_of_kernel)
    assert_not_equal(id(kernel), id(copy_of_kernel))

    # All constructor parameters must match.
    assert_equal(kernel.get_params(), copy_of_kernel.get_params())

    # All hyperparameters must match.
    check_hyperparameters_equal(kernel, copy_of_kernel)
def test_verbose():
    """Test whether verbose works as intended.

    Fits a default and a batched ELMRegressor with ``verbose=True`` while
    stdout is captured, and checks that each fit printed something.
    """
    X = Xboston
    y = yboston

    elm_fit = ELMRegressor(verbose=True)
    elm_batch_fit = ELMRegressor(verbose=True, batch_size=50)
    for elm in [elm_fit, elm_batch_fit]:
        old_stdout = sys.stdout
        sys.stdout = output = StringIO()
        try:
            elm.fit(X, y)
        finally:
            # Restore stdout even if fit() raises, so a failure here cannot
            # swallow the output of every test that runs afterwards.
            sys.stdout = old_stdout

        assert_not_equal(output.getvalue(), '')
def test_constraints_rvs():
    # Nine dimensions: three real, three integer, three categorical
    dimensions = (
        [Real(1, 10) for _ in range(3)]
        + [Integer(0, 10) for _ in range(3)]
        + [Categorical(list('abcdefg')) for _ in range(3)]
    )
    space = Space(dimensions)

    real_constraints = [
        Single(0, 5.0, 'real'),
        Inclusive(1, (3.0, 5.0), 'real'),
        Exclusive(2, (3.0, 5.0), 'real'),
    ]
    integer_constraints = [
        Single(3, 5, 'integer'),
        Inclusive(4, (3, 5), 'integer'),
        Exclusive(5, (3, 5), 'integer'),
    ]
    categorical_constraints = [
        Single(6, 'b', 'categorical'),
        Inclusive(7, ('c', 'd', 'e'), 'categorical'),
        Exclusive(8, ('c', 'd', 'e'), 'categorical'),
    ]
    # Dimensions 4 and 5 each receive a second constraint
    repeated_constraints = [
        Inclusive(4, (7, 9), 'integer'),
        Exclusive(5, (7, 9), 'integer'),
    ]
    cons_list = (real_constraints + integer_constraints
                 + categorical_constraints + repeated_constraints)

    # Test length of samples
    constraints = Constraints(cons_list, space)
    samples = constraints.rvs(n_samples=100)
    assert_equal(len(samples), 100)
    assert_equal(len(samples[0]), space.n_dims)
    assert_equal(len(samples[-1]), space.n_dims)

    # Test random state: equal seeds agree, different seeds differ
    samples_a = constraints.rvs(n_samples=100, random_state=1)
    samples_b = constraints.rvs(n_samples=100, random_state=1)
    samples_c = constraints.rvs(n_samples=100, random_state=2)
    assert_equal(samples_a, samples_b)
    assert_not_equal(samples_a, samples_c)

    # Test invalid constraint combinations: sampling is expected to raise
    unit_space = Space([Real(0, 1)])
    conflicting = [Exclusive(0, (0.3, 0.7), 'real'),
                   Inclusive(0, (0.5, 0.6), 'real')]
    constraints = Constraints(conflicting, unit_space)
    with raises(RuntimeError):
        constraints.rvs(n_samples=10)
def test_classifier_chain_crossval_fit_and_predict():
    # Fit one chain with cv=3 and one without, then compare how well each
    # predicts the training data.
    X, Y = generate_multilabel_dataset_with_correlations()

    chain_with_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_with_cv.fit(X, Y)
    chain_without_cv = ClassifierChain(LogisticRegression())
    chain_without_cv.fit(X, Y)

    Y_pred_cv = chain_with_cv.predict(X)
    Y_pred = chain_without_cv.predict(X)

    assert_equal(Y_pred_cv.shape, Y.shape)
    score_cv = jaccard_similarity_score(Y, Y_pred_cv)
    assert_greater(score_cv, 0.4)

    # The two fitting strategies are expected to give different scores
    score_plain = jaccard_similarity_score(Y, Y_pred)
    assert_not_equal(score_cv, score_plain)
def test_cross_val_generator_with_default_indices():
    # With default settings every cross-validation generator must yield
    # index arrays, not boolean masks, for both the train and the test side
    # of each split, and those indices must be usable to index X and y.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask; check both sides
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # indexing must not raise
            X[train], X[test]
            y[train], y[test]
Beispiel #39
0
def test_cross_val_generator_with_default_indices():
    # With default settings every cross-validation generator must yield
    # index arrays, not boolean masks, for both the train and the test side
    # of each split, and those indices must be usable to index X and y.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask; check both sides
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # indexing must not raise; the results themselves are not needed
            X[train], X[test]
            y[train], y[test]
Beispiel #40
0
def test_cross_val_generator_with_default_indices():
    # Every default-configured cross-validation generator has to produce
    # integer-style index arrays rather than boolean masks, on the train
    # side as well as on the test side, and X and y must accept them.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # a dtype kind of 'b' would indicate a boolean mask
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # indexing must not raise
            X[train], X[test]
            y[train], y[test]
def test_base_chain_random_order():
    # Fit base chain with random order and check that it matches a chain
    # fitted with that same order given explicitly.
    X, Y = generate_multilabel_dataset_with_correlations()
    for chain in [ClassifierChain(LogisticRegression()),
                  RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order='random', random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)
        # Compare the fitted `order_`: the `order` parameter is still the
        # string 'random', so list(order) could never equal [0, 1, 2, 3] and
        # the check would pass regardless of the permutation drawn.
        assert_not_equal(list(chain_random.order_), list(range(4)))
        assert_equal(len(chain_random.order_), 4)
        assert_equal(len(set(chain_random.order_)), 4)
        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_,
                              chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)
def test_classifier_chain_crossval_fit_and_predict():
    # A classifier chain fitted with cv=3 must predict reasonably well and
    # must score differently from a chain fitted without cv.
    X, Y = generate_multilabel_dataset_with_correlations()

    cv_chain = ClassifierChain(LogisticRegression(), cv=3)
    cv_chain.fit(X, Y)

    plain_chain = ClassifierChain(LogisticRegression())
    plain_chain.fit(X, Y)

    cv_predictions = cv_chain.predict(X)
    plain_predictions = plain_chain.predict(X)

    assert_equal(cv_predictions.shape, Y.shape)
    cv_jaccard = jaccard_similarity_score(Y, cv_predictions)
    assert_greater(cv_jaccard, 0.4)

    plain_jaccard = jaccard_similarity_score(Y, plain_predictions)
    assert_not_equal(cv_jaccard, plain_jaccard)
def test_base_chain_random_order():
    # Fit base chain with random order, then refit with the drawn order
    # passed explicitly; both chains must end up identical.
    X, Y = generate_multilabel_dataset_with_correlations()
    for chain in [ClassifierChain(LogisticRegression()),
                  RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order='random', random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)
        # Use the fitted `order_` here. `order` holds the string 'random',
        # whose characters can never equal [0, 1, 2, 3], which would make
        # this assertion pass unconditionally.
        assert_not_equal(list(chain_random.order_), list(range(4)))
        assert_equal(len(chain_random.order_), 4)
        assert_equal(len(set(chain_random.order_)), 4)
        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_,
                              chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)
Beispiel #44
0
def test_correlation():
    # Random unit-norm dictionaries: 1-D atoms, then 2-D atoms
    du2 = [randn(n_features,) for _ in range(n_kernels)]
    du2 = [atom / norm(atom) for atom in du2]
    dm2 = [randn(n_features, n_dims) for _ in range(n_kernels)]
    dm2 = [atom / norm(atom) for atom in dm2]

    # A dictionary fully detects itself but not a random one
    assert_equal(100., detection_rate(du, du, 0.97))
    assert_not_equal(100., detection_rate(du, du2, 0.99))
    assert_equal(100., detection_rate(dm, dm, 0.97))
    assert_not_equal(100., detection_rate(dm, dm2, 0.99))

    assert_equal((100., 100.), precision_recall(du, du, 0.97))
    assert_equal((0., 0.), precision_recall(du, du2, 0.99))

    self_points = precision_recall_points(du, du)
    assert_true(allclose(self_points, (ones(len(du)), ones(len(du)))))
    cross_points = precision_recall_points(du, du2)
    assert_true(not allclose(cross_points, (ones(len(du)), ones(len(du2)))))
Beispiel #45
0
    def test_kmeansplusplus_noy(self):
        # Three groups of training points, repeated on every axis
        train_values = [0, 1, 2, 10, 11, 12, -10, -11, -12]
        X_train = pandas.DataFrame(data=dict(
            x=list(train_values),
            y=list(train_values),
            z=list(train_values)))

        # each consecutive pair of test points should fall into one of the
        # three clusters
        test_values = [-1, 3, 9, 13, -13, -20]
        X_test = pandas.DataFrame(data=dict(
            x=list(test_values),
            y=list(test_values),
            z=list(test_values)))

        # fitted on the features alone; no target is passed
        mymodel = KMeansPlusPlus(n_clusters=3).fit(X_train)
        scores = mymodel.predict(X_test)

        # KMeansPlusPlus currently returns its internal label enumeration
        # rather than label values, and that enumeration could be random, so
        # accuracy against known labels cannot be measured here.
        # Instead, verify the grouping: points 0-1, 2-3 and 4-5 share a
        # cluster, while neighbouring pairs do not.
        same_msg = "elements should be in the same cluster"
        different_msg = "elements should be in different clusters"
        for left in range(5):
            if left % 2 == 0:
                assert_equal(scores[left], scores[left + 1], same_msg)
            else:
                assert_not_equal(scores[left], scores[left + 1],
                                 different_msg)
def test_median_absolute_error_weights():
    y_tr = [3, -0.5, 2, 7]
    y_pr = [2.5, 0.0, 2, 8]
    sample_weight = [2, 3, 1, 4]

    # unit weights must score the same as passing no weights at all
    unweighted_score = median_absolute_error(y_tr, y_pr, sample_weight=None)
    unit_weights = np.ones(shape=len(y_tr))
    unit_score = median_absolute_error(y_tr, y_pr,
                                       sample_weight=unit_weights)
    assert_almost_equal(
        unweighted_score,
        unit_score,
        err_msg="For median_absolute_error sample_weight=None is not "
                "equivalent to sample_weight=ones")

    # non-uniform weights must change the score
    weighted_score = median_absolute_error(y_tr, y_pr,
                                           sample_weight=sample_weight)
    assert_not_equal(
        unweighted_score,
        weighted_score,
        msg="Unweighted and weighted scores are unexpectedly "
            "equal (%f) for median_absolute_error" % weighted_score)
Beispiel #47
0
def test_sensitivity_specificity_ignored_labels():
    """Check specificity_score when only labels 1 and 3 are requested"""
    y_true = [1, 1, 2, 3]
    y_pred = [1, 3, 3, 3]

    specificity_13 = partial(specificity_score, y_true, y_pred, labels=[1, 3])
    specificity_all = partial(specificity_score, y_true, y_pred, labels=None)

    # expected per-label values, to two decimals
    per_label = [1., 0.33]
    assert_array_almost_equal(per_label, specificity_13(average=None), 2)

    macro_expected = np.mean(per_label)
    assert_almost_equal(macro_expected, specificity_13(average='macro'), 2)

    weighted_expected = np.average(per_label, weights=[2., 1.])
    assert_almost_equal(weighted_expected,
                        specificity_13(average='weighted'), 2)

    micro_expected = 3. / (3. + 2.)
    assert_almost_equal(micro_expected, specificity_13(average='micro'), 2)

    # the checks above only mean something if restricting the labels
    # actually changes the result
    for average in ('macro', 'weighted', 'micro'):
        restricted = specificity_13(average=average)
        unrestricted = specificity_all(average=average)
        assert_not_equal(restricted, unrestricted)
Beispiel #48
0
def test_correlation():
    # Normalised random atoms: first with shape (n_features,), then with
    # shape (n_features, n_dims)
    flat_atoms = [randn(n_features, ) for _ in range(n_kernels)]
    flat_atoms = [vec / norm(vec) for vec in flat_atoms]
    matrix_atoms = [randn(n_features, n_dims) for _ in range(n_kernels)]
    matrix_atoms = [mat / norm(mat) for mat in matrix_atoms]

    assert_equal(100., detection_rate(du, du, 0.97))
    assert_not_equal(100., detection_rate(du, flat_atoms, 0.99))
    assert_equal(100., detection_rate(dm, dm, 0.97))
    assert_not_equal(100., detection_rate(dm, matrix_atoms, 0.99))

    assert_equal((100., 100.), precision_recall(du, du, 0.97))
    assert_equal((0., 0.), precision_recall(du, flat_atoms, 0.99))

    # comparing a dictionary with itself gives all-ones points
    all_ones = (ones(len(du)), ones(len(du)))
    assert_true(allclose(precision_recall_points(du, du), all_ones))

    # comparing it with random atoms must not
    random_points = precision_recall_points(du, flat_atoms)
    mixed_ones = (ones(len(du)), ones(len(flat_atoms)))
    assert_true(not allclose(random_points, mixed_ones))
def test_shuffle_kfold():
    # Check the indices are shuffled properly
    n_samples = 300
    kf = KFold(3)
    kf2 = KFold(3, shuffle=True, random_state=0)
    kf3 = KFold(3, shuffle=True, random_state=1)

    X = np.ones(n_samples)

    all_folds = np.zeros(n_samples)
    for splits in zip(kf.split(X), kf2.split(X), kf3.split(X)):
        train_sets = tuple(train for train, _ in splits)
        n_train = len(train_sets[0])
        for tr_a, tr_b in combinations(train_sets, 2):
            # No two train sets may overlap completely
            assert_not_equal(len(np.intersect1d(tr_a, tr_b)), n_train)

        # Mark the test indices produced by kf2 in this iteration
        test_of_kf2 = splits[1][1]
        all_folds[test_of_kf2] = 1

    # Over all iterations kf2 must have returned every index as a test index
    assert_equal(sum(all_folds), n_samples)
Beispiel #50
0
def test_score_sample_weight():

    rng = np.random.RandomState(0)

    # one case per mixin: ClassifierMixin and RegressorMixin
    cases = [
        (DecisionTreeClassifier(max_depth=2), datasets.load_iris()),
        (DecisionTreeRegressor(max_depth=2), datasets.load_boston()),
    ]

    for est, ds in cases:
        data, target = ds.data, ds.target
        est.fit(data, target)
        # random integer sample weights drawn from [1, 10)
        sample_weight = rng.randint(1, 10, size=len(target))
        # scoring with and without the weights must give different results
        plain_score = est.score(data, target)
        weighted_score = est.score(data, target,
                                   sample_weight=sample_weight)
        assert_not_equal(plain_score,
                         weighted_score,
                         msg="Unweighted and weighted scores "
                             "are unexpectedly equal")
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # Zero weight on the first ten test samples, unit weight elsewhere
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    # Seed the tree so the fitted model, and with it every score compared
    # below, is reproducible from run to run.
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    estimator = dict([(name, sensible_regr)
                      for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf)
                      for name in CLF_SCORERS])

    for name, scorer in SCORERS.items():
        try:
            weighted = scorer(estimator[name], X_test, y_test,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], y_test[10:])
            unweighted = scorer(estimator[name], X_test, y_test)
            assert_not_equal(weighted, unweighted,
                             "scorer {0} behaves identically when called with "
                             "sample weights: {1} vs {2}".format(name,
                                                                 weighted,
                                                                 unweighted))
            # The two scores are computed along different code paths, so
            # compare with a floating-point tolerance instead of exactly.
            assert_true(np.isclose(weighted, ignored),
                        "scorer {0} behaves differently when ignoring "
                        "samples and setting sample_weight to 0: "
                        "{1} vs {2}".format(name, weighted, ignored))

        except TypeError as e:
            # A scorer without sample_weight support must say so explicitly
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
def test_cross_val_generator_with_indices():
    # Building the generators with an explicit indices=True must warn, and
    # the resulting splits must be index arrays (not boolean masks) on both
    # the train and the test side.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut, 4, indices=True)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut, 4, 2, indices=True)
    kf = assert_warns(DeprecationWarning, cval.KFold, 4, 2, indices=True)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold, y, 2, indices=True)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut, labels, indices=True)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut, labels, 2, indices=True)
    ps = assert_warns(DeprecationWarning, cval.PredefinedSplit, [1, 1, 2, 2], indices=True)
    # Bootstrap as a cross-validation is deprecated
    b = assert_warns(DeprecationWarning, cval.Bootstrap, 2)
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit, 2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask; check both sides
            assert_not_equal(np.asarray(train).dtype.kind, "b")
            assert_not_equal(np.asarray(test).dtype.kind, "b")
            # indexing must not raise
            X[train], X[test]
            y[train], y[test]
Beispiel #53
0
def test_kernel_clone():
    """Test that sklearn's clone works correctly on kernels.

    Generator-style test: for every kernel in ``kernels`` it asserts that the
    clone is equal but distinct, and yields ``check_hyperparameters_equal``
    together with its arguments instead of calling it directly. For kernels
    with a non-isotropic ``length_scale`` it additionally re-parameterises
    the clone through ``set_params`` and clones it a second time.
    """
    # Replacement bounds applied when the length scale is re-parameterised
    bounds = (1e-5, 1e5)
    for kernel in kernels:
        kernel_cloned = clone(kernel)

        # XXX: Should this be fixed?
        # This differs from the sklearn's estimators equality check.
        assert_equal(kernel, kernel_cloned)
        assert_not_equal(id(kernel), id(kernel_cloned))

        # Check that all constructor parameters are equal.
        assert_equal(kernel.get_params(), kernel_cloned.get_params())

        # Check that all hyperparameters are equal.
        # NOTE(review): yielded as (callable, args), presumably for a
        # generator-aware test runner; kernel_cloned is mutated further
        # down, after this yield.
        yield check_hyperparameters_equal, kernel, kernel_cloned

        # This test is to verify that using set_params does not
        # break clone on kernels.
        # This used to break because in kernels such as the RBF, non-trivial
        # logic that modified the length scale used to be in the constructor
        # See https://github.com/scikit-learn/scikit-learn/issues/6961
        # for more details.
        params = kernel.get_params()
        # RationalQuadratic kernel is isotropic.
        isotropic_kernels = (ExpSineSquared, RationalQuadratic)
        if 'length_scale' in params and not isinstance(kernel, isotropic_kernels):
            length_scale = params['length_scale']
            if np.iterable(length_scale):
                # iterable length scale -> collapse to its first entry
                params['length_scale'] = length_scale[0]
                params['length_scale_bounds'] = bounds
            else:
                # scalar length scale -> expand to two entries; `bounds * 2`
                # is tuple repetition, giving a flat 4-tuple
                params['length_scale'] = [length_scale] * 2
                params['length_scale_bounds'] = bounds * 2
            kernel_cloned.set_params(**params)
            # Cloning must still work after set_params changed the clone
            kernel_cloned_clone = clone(kernel_cloned)
            assert_equal(kernel_cloned_clone.get_params(),
                         kernel_cloned.get_params())
            assert_not_equal(id(kernel_cloned_clone), id(kernel_cloned))
            yield check_hyperparameters_equal, kernel_cloned, kernel_cloned_clone
def test_classifier_chain_random_order():
    # Fit classifier chain with random order and compare it with a chain
    # that is given the drawn order explicitly.
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # Compare the fitted `order_`: the `order` parameter is still the string
    # 'random', so list(order) could never equal [0, 1, 2, 3] and the check
    # would pass whatever permutation was drawn.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)