Exemple #1
0
def check_warm_start(name, random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = ForestEstimator(n_estimators=n_estimators,
                                     random_state=random_state,
                                     warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert_equal(len(clf_ws), n_estimators)

    clf_no_ws = ForestEstimator(n_estimators=10,
                                random_state=random_state,
                                warm_start=False)
    clf_no_ws.fit(X, y)

    assert_equal(set([tree.random_state for tree in clf_ws]),
                 set([tree.random_state for tree in clf_no_ws]))

    assert_array_equal(clf_ws.apply(X),
                       clf_no_ws.apply(X),
                       err_msg="Failed with {0}".format(name))
Exemple #2
0
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(
        NotFittedError,
        lambda X: np.fromiter(clf.staged_predict_proba(X), dtype=np.float64),
        X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_equal(clf.predict_proba(X_test), staged_proba)
Exemple #3
0
def test_safe_indexing():
    X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    inds = np.array([1, 2])
    X_inds = safe_indexing(X, inds)
    X_arrays = safe_indexing(np.array(X), inds)
    assert_array_equal(np.array(X_inds), X_arrays)
    assert_array_equal(np.array(X_inds), np.array(X)[inds])
Exemple #4
0
def test_safe_indexing_mock_pandas():
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = MockDataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    X_indexed = safe_indexing(X_df, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
Exemple #5
0
def test_symbol_labels():
    # Test with non-integer class labels.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    symbol_y = tosequence(map(str, y))

    clf.fit(X, symbol_y)
    assert_array_equal(clf.predict(T), tosequence(map(str, true_result)))
    assert_equal(100, len(clf.estimators_))
Exemple #6
0
def test_vector_sign_flip():
    # Testing that sign flip is working & largest value has positive sign
    data = np.random.RandomState(36).randn(5, 5)
    max_abs_rows = np.argmax(np.abs(data), axis=1)
    data_flipped = _deterministic_vector_sign_flip(data)
    max_rows = np.argmax(data_flipped, axis=1)
    assert_array_equal(max_abs_rows, max_rows)
    signs = np.sign(data[range(data.shape[0]), max_abs_rows])
    assert_array_equal(data, data_flipped * signs[:, np.newaxis])
Exemple #7
0
def test_dtype_convert(n_classes=15):
    classifier = RandomForestClassifier(random_state=0, bootstrap=False)

    X = np.eye(n_classes)
    y = [ch for ch in 'ABCDEFGHIJKLMNOPQRSTU'[:n_classes]]

    result = classifier.fit(X, y).predict(X)
    assert_array_equal(classifier.classes_, y)
    assert_array_equal(result, y)
Exemple #8
0
def test_non_uniform_weights_toy_edge_case_clf():
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    for loss in ('deviance', 'exponential'):
        gb = GradientBoostingClassifier(n_estimators=5)
        gb.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(gb.predict([[1, 0]]), [1])
Exemple #9
0
def test_clone_empty_array():
    # Regression test for cloning estimators with empty arrays
    clf = MyEstimator(empty=np.array([]))
    clf2 = clone(clf)
    assert_array_equal(clf.empty, clf2.empty)

    clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
    clf2 = clone(clf)
    assert_array_equal(clf.empty.data, clf2.empty.data)
Exemple #10
0
def test_float_class_labels():
    # Test with float class labels.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    float_y = np.asarray(y, dtype=np.float32)

    clf.fit(X, float_y)
    assert_array_equal(clf.predict(T), np.asarray(true_result,
                                                  dtype=np.float32))
    assert_equal(100, len(clf.estimators_))
Exemple #11
0
def test_gen_even_slices():
    # check that gen_even_slices contains all samples
    some_range = range(10)
    joined_range = list(
        chain(*[some_range[slice] for slice in gen_even_slices(10, 3)]))
    assert_array_equal(some_range, joined_range)

    # check that passing negative n_chunks raises an error
    slices = gen_even_slices(10, -1)
    assert_raises_regex(ValueError, "gen_even_slices got n_packs=-1, must be"
                        " >=1", next, slices)
Exemple #12
0
def test_degenerate_targets():
    # Check if we can fit even though all targets are equal.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    # classifier should raise exception
    assert_raises(ValueError, clf.fit, X, np.ones(len(X)))

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, np.ones(len(X)))
    clf.predict([rng.rand(2)])
    assert_array_equal(np.ones((1, ), dtype=np.float64),
                       clf.predict([rng.rand(2)]))
Exemple #13
0
def test_warm_start_oob_switch():
    # Test if oob can be turned on during warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est.fit(X, y)
        est.set_params(n_estimators=110, subsample=0.5)
        est.fit(X, y)

        assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
        # the last 10 are not zeros
        assert_array_equal(est.oob_improvement_[-10:] == 0.0,
                           np.zeros(10, dtype=np.bool))
Exemple #14
0
def test_shape_y():
    # Test with float class labels.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = y_[:, np.newaxis]

    # This will raise a DataConversionWarning that we want to
    # "always" raise, elsewhere the warnings gets ignored in the
    # later tests, and the tests that check for this warning fail
    assert_warns(DataConversionWarning, clf.fit, X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))
Exemple #15
0
def test_mem_layout():
    # Test with different memory layouts of X and y
    X_ = np.asfortranarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    X_ = np.ascontiguousarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.ascontiguousarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.asfortranarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))
Exemple #16
0
def test_cartesian():
    # Check if cartesian product delivers the right results

    axes = (np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7]))

    true_out = np.array([[1, 4, 6], [1, 4, 7], [1, 5, 6], [1, 5, 7], [2, 4, 6],
                         [2, 4, 7], [2, 5, 6], [2, 5, 7], [3, 4, 6], [3, 4, 7],
                         [3, 5, 6], [3, 5, 7]])

    out = cartesian(axes)
    assert_array_equal(true_out, out)

    # check single axis
    x = np.arange(3)
    assert_array_equal(x[:, np.newaxis], cartesian((x, )))
Exemple #17
0
def test_random_weights():
    # set this up so that each row should have a weighted mode of 6,
    # with a score that is easily reproduced
    mode_result = 6

    rng = np.random.RandomState(0)
    x = rng.randint(mode_result, size=(100, 10))
    w = rng.random_sample(x.shape)

    x[:, :5] = mode_result
    w[:, :5] += 1

    mode, score = weighted_mode(x, w, axis=1)

    assert_array_equal(mode, mode_result)
    assert_array_almost_equal(score.ravel(), w[:, :5].sum(1))
Exemple #18
0
def test_random_trees_dense_equal():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning the same array for both argument values.

    # Create the RTEs
    hasher_dense = RandomTreesEmbedding(n_estimators=10,
                                        sparse_output=False,
                                        random_state=0)
    hasher_sparse = RandomTreesEmbedding(n_estimators=10,
                                         sparse_output=True,
                                         random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed_dense = hasher_dense.fit_transform(X)
    X_transformed_sparse = hasher_sparse.fit_transform(X)

    # Assert that dense and sparse hashers have same array.
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
Exemple #19
0
def check_classification_toy(name):
    """Check classification on a toy dataset."""
    ForestClassifier = FOREST_CLASSIFIERS[name]

    clf = ForestClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    clf = ForestClassifier(n_estimators=10, max_features=1, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    # also test apply
    leaf_indices = clf.apply(X)
    assert_equal(leaf_indices.shape, (len(X), clf.n_estimators))
Exemple #20
0
def test_probability_log():
    # Predict probabilities.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert_true(np.all(y_proba >= 0.0))
    assert_true(np.all(y_proba <= 1.0))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
Exemple #21
0
def test_serialization():
    # Check model serialization.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL)
    clf = None
    clf = pickle.loads(serialized_clf)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))
Exemple #22
0
def check_classification_toy(presort, loss):
    # Check classification on a toy dataset.
    clf = GradientBoostingClassifier(loss=loss,
                                     n_estimators=10,
                                     random_state=1,
                                     presort=presort)

    assert_raises(ValueError, clf.predict, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf.estimators_))

    deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:])
    assert_true(np.any(deviance_decrease >= 0.0))

    leaves = clf.apply(X)
    assert_equal(leaves.shape, (6, 10, 1))
Exemple #23
0
def test_safe_indexing_pandas():
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("Pandas not found")
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = pd.DataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    X_indexed = safe_indexing(X_df, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
    # fun with read-only data in dataframes
    # this happens in joblib memmapping
    X.setflags(write=False)
    X_df_readonly = pd.DataFrame(X)
    with warnings.catch_warnings(record=True):
        X_df_ro_indexed = safe_indexing(X_df_readonly, inds)

    assert_array_equal(np.array(X_df_ro_indexed), X_indexed)
Exemple #24
0
def test_staged_predict():
    # Test whether staged decision function eventually gives
    # the same prediction.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()
    # test raise ValueError if not fitted
    assert_raises(
        ValueError,
        lambda X: np.fromiter(clf.staged_predict(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert_equal(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)
Exemple #25
0
def test_probability_exponential():
    # Predict probabilities.
    clf = GradientBoostingClassifier(loss='exponential',
                                     n_estimators=100,
                                     random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert_true(np.all(y_proba >= 0.0))
    assert_true(np.all(y_proba <= 1.0))
    score = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score)))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
Exemple #26
0
def test_column_or_1d():
    EXAMPLES = [
        ("binary", ["spam", "egg", "spam"]),
        ("binary", [0, 1, 0, 1]),
        ("continuous", np.arange(10) / 20.),
        ("multiclass", [1, 2, 3]),
        ("multiclass", [0, 1, 2, 2, 0]),
        ("multiclass", [[1], [2], [3]]),
        ("multilabel-indicator", [[0, 1, 0], [0, 0, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("multiclass-multioutput", [[1, 1], [2, 2], [3, 1]]),
        ("multiclass-multioutput", [[5, 1], [4, 2], [3, 1]]),
        ("multiclass-multioutput", [[1, 2, 3]]),
        ("continuous-multioutput", np.arange(30).reshape((-1, 3))),
    ]

    for y_type, y in EXAMPLES:
        if y_type in ["binary", 'multiclass', "continuous"]:
            assert_array_equal(column_or_1d(y), np.ravel(y))
        else:
            assert_raises(ValueError, column_or_1d, y)
Exemple #27
0
def check_warm_start_equal_n_estimators(name):
    # Test if warm start with equal n_estimators does nothing and returns the
    # same forest and raises a warning.
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    clf = ForestEstimator(n_estimators=5,
                          max_depth=3,
                          warm_start=True,
                          random_state=1)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5,
                            max_depth=3,
                            warm_start=True,
                            random_state=1)
    clf_2.fit(X, y)
    # Now clf_2 equals clf.

    clf_2.set_params(random_state=2)
    assert_warns(UserWarning, clf_2.fit, X, y)
    # If we had fit the trees again we would have got a different forest as we
    # changed the random state.
    assert_array_equal(clf.apply(X), clf_2.apply(X))
Exemple #28
0
def check_decision_path(name):
    X, y = hastie_X, hastie_y
    n_samples = X.shape[0]
    ForestEstimator = FOREST_ESTIMATORS[name]
    est = ForestEstimator(n_estimators=5,
                          max_depth=1,
                          warm_start=False,
                          random_state=1)
    est.fit(X, y)
    indicator, n_nodes_ptr = est.decision_path(X)

    assert_equal(indicator.shape[1], n_nodes_ptr[-1])
    assert_equal(indicator.shape[0], n_samples)
    assert_array_equal(np.diff(n_nodes_ptr),
                       [e.tree_.node_count for e in est.estimators_])

    # Assert that leaves index are correct
    leaves = est.apply(X)
    for est_id in range(leaves.shape[1]):
        leave_indicator = [
            indicator[i, n_nodes_ptr[est_id] + j]
            for i, j in enumerate(leaves[:, est_id])
        ]
        assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))
Exemple #29
0
def check_classes_shape(name):
    # Test that n_classes_ and classes_ have proper shape.
    ForestClassifier = FOREST_CLASSIFIERS[name]

    # Classification, single output
    clf = ForestClassifier(random_state=0).fit(X, y)

    assert_equal(clf.n_classes_, 2)
    assert_array_equal(clf.classes_, [-1, 1])

    # Classification, multi-output
    _y = np.vstack((y, np.array(y) * 2)).T
    clf = ForestClassifier(random_state=0).fit(X, _y)

    assert_array_equal(clf.n_classes_, [2, 2])
    assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
Exemple #30
0
def test_shuffle_dont_convert_to_array():
    # Check that shuffle does not try to convert to numpy arrays with float
    # dtypes can let any indexable datastructure pass-through.
    a = ['a', 'b', 'c']
    b = np.array(['a', 'b', 'c'], dtype=object)
    c = [1, 2, 3]
    d = MockDataFrame(np.array([['a', 0], ['b', 1], ['c', 2]], dtype=object))
    e = sp.csc_matrix(np.arange(6).reshape(3, 2))
    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

    assert_equal(a_s, ['c', 'b', 'a'])
    assert_equal(type(a_s), list)

    assert_array_equal(b_s, ['c', 'b', 'a'])
    assert_equal(b_s.dtype, object)

    assert_equal(c_s, [3, 2, 1])
    assert_equal(type(c_s), list)

    assert_array_equal(d_s,
                       np.array([['c', 2], ['b', 1], ['a', 0]], dtype=object))
    assert_equal(type(d_s), MockDataFrame)

    assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))