Example #1
0
def test_linearsvc():
    """Exercise the basic LinearSVC routines on the module-level toy data.

    Several penalty / loss / dual combinations are fitted on ``X`` and ``Y``;
    for each, predictions on ``T`` must equal ``true_result``.  The decision
    function of the last model is thresholded at zero and compared with the
    same expected labels.
    """
    clf = svm.LinearSVC(random_state=0).fit(X, Y)

    # by default should have intercept
    assert_true(clf.fit_intercept)

    assert_array_equal(clf.predict(T), true_result)
    assert_array_almost_equal(clf.intercept_, [0], decimal=3)

    # the same with l1 penalty
    clf = svm.LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                        random_state=0).fit(X, Y)
    assert_array_equal(clf.predict(T), true_result)

    # l2 penalty with dual formulation
    clf = svm.LinearSVC(penalty='l2', dual=True, random_state=0).fit(X, Y)
    assert_array_equal(clf.predict(T), true_result)

    # l2 penalty, l1 loss
    clf = svm.LinearSVC(penalty='l2', loss='hinge', dual=True, random_state=0)
    clf.fit(X, Y)
    assert_array_equal(clf.predict(T), true_result)

    # test also decision function
    dec = clf.decision_function(T)
    # The builtin ``int`` is used instead of the ``np.int`` alias, which
    # recent NumPy releases no longer provide.  The boolean mask maps to
    # 0/1 and the +1 shifts it to the labels 1/2.
    res = (dec > 0).astype(int) + 1
    assert_array_equal(res, true_result)
Example #2
0
def test_RadiusNeighborsRegressor_multioutput(n_samples=40,
                                              n_features=5,
                                              n_test_pts=10,
                                              n_neighbors=3,
                                              random_state=0):
    """Test radius-neighbors multi-output regression with various weights.

    The two-column target is the normalised Euclidean norm of each sample,
    stacked twice.  For every algorithm / weighting combination, predictions
    at slightly perturbed training points must have the target's shape and
    lie within 0.3 of it element-wise.
    """
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = np.sqrt((X ** 2).sum(1))
    y /= y.max()
    y = np.vstack([y, y]).T

    y_target = y[:n_test_pts]
    # Named differently from the loop variable below so that the list being
    # iterated is never rebound while the loop is running.
    weight_options = ['uniform', 'distance', _weight_func]

    for algorithm, weights in product(ALGORITHMS, weight_options):
        rnn = neighbors.RadiusNeighborsRegressor(n_neighbors=n_neighbors,
                                                 weights=weights,
                                                 algorithm=algorithm)
        rnn.fit(X, y)
        # Tiny random offset so queries are close to, but not exactly on,
        # the training points.
        epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1)
        y_pred = rnn.predict(X[:n_test_pts] + epsilon)

        assert_equal(y_pred.shape, y_target.shape)
        assert_true(np.all(np.abs(y_pred - y_target) < 0.3))
def test_parameter_grid():
    """Test basic properties of ParameterGrid.

    Covers the iterable / sized protocol, the length of a two-parameter grid,
    repeated iteration over the same grid, and the empty-grid special cases.
    """
    params1 = {"foo": [1, 2, 3]}
    grid1 = ParameterGrid(params1)
    assert_true(isinstance(grid1, Iterable))
    assert_true(isinstance(grid1, Sized))
    assert_equal(len(grid1), 3)

    params2 = {"foo": [4, 2],
               "bar": ["ham", "spam", "eggs"]}
    grid2 = ParameterGrid(params2)
    assert_equal(len(grid2), 6)

    # loop to assert we can iterate over the grid multiple times
    # (``range`` rather than the Python-2-only ``xrange`` builtin)
    for i in range(2):
        # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
        points = set(tuple(chain(*(sorted(p.items())))) for p in grid2)
        assert_equal(points,
                     set(("bar", x, "foo", y)
                         for x, y in product(params2["bar"], params2["foo"])))

    # Special case: empty grid (useful to get default estimator settings)
    empty = ParameterGrid({})
    assert_equal(len(empty), 1)
    assert_equal(list(empty), [{}])

    has_empty = ParameterGrid([{'C': [1, 10]}, {}])
    assert_equal(len(has_empty), 3)
    assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}])
Example #4
0
def test_dense_liblinear_intercept_handling(classifier=svm.LinearSVC):
    """Check that dense liblinear honours the ``intercept_scaling`` parameter.

    The same model is refitted on a four-sample problem with increasing
    ``intercept_scaling`` values and the fitted intercept is inspected after
    each fit.
    """
    samples = [[2, 1], [3, 1], [1, 3], [2, 3]]
    targets = [0, 0, 1, 1]
    model = classifier(fit_intercept=True, penalty='l1',
                       loss='squared_hinge', dual=False, C=4, tol=1e-7,
                       random_state=0)
    assert_true(model.intercept_scaling == 1, model.intercept_scaling)
    assert_true(model.fit_intercept)

    def refit_with_scaling(scaling):
        # Update the scaling in place, refit, and hand back the intercept.
        model.intercept_scaling = scaling
        model.fit(samples, targets)
        return model.intercept_

    # With a low scaling the regularisation heavily penalises the intercept.
    assert_almost_equal(refit_with_scaling(1), 0, decimal=5)

    # With a sufficiently high scaling the intercept escapes regularisation.
    intercept_at_100 = refit_with_scaling(100)
    assert_less(intercept_at_100, -1)

    # Beyond that point the intercept no longer depends on the scaling value.
    intercept_at_1000 = refit_with_scaling(1000)
    assert_array_almost_equal(intercept_at_100, intercept_at_1000, decimal=2)
def test_grid_search_precomputed_kernel():
    """Grid search must accept features given as a precomputed kernel matrix.

    A linear kernel is computed by hand: among the first 180 samples for
    training, and between the remaining samples and those 180 for prediction.
    """
    n_train = 180
    features, targets = make_classification(n_samples=200, n_features=100,
                                            random_state=0)
    train_features = features[:n_train]

    # Gram matrix of the training samples under the linear kernel.
    gram_train = np.dot(train_features, train_features.T)
    targets_train = targets[:n_train]

    search = GridSearchCV(SVC(kernel='precomputed'), {'C': [0.1, 1.0]})
    search.fit(gram_train, targets_train)

    assert_true(search.best_score_ >= 0)

    # Kernel values between the held-out samples and the training samples.
    gram_test = np.dot(features[n_train:], train_features.T)
    targets_test = targets[n_train:]

    predictions = search.predict(gram_test)

    assert_true(np.mean(predictions == targets_test) >= 0)

    # A precomputed kernel that is neither array-like nor sparse (here a
    # nested list) must be rejected.
    assert_raises(ValueError, search.fit, gram_train.tolist(), targets_train)
def test_check_is_partition():
    """``cval._check_is_partition`` accepts 0..99 and rejects damaged copies."""
    indices = np.arange(100)
    assert_true(cval._check_is_partition(indices, 100))

    # Dropping one entry leaves an index uncovered.
    with_gap = np.delete(indices, 23)
    assert_false(cval._check_is_partition(with_gap, 100))

    # Overwriting an entry duplicates 23 and loses 0.
    indices[0] = 23
    assert_false(cval._check_is_partition(indices, 100))
def test_labels_assignment_and_inertia():
    """Compare ``_labels_inertia`` with a plain NumPy reference.

    The reference assigns each sample to the nearest noisy centre by squared
    Euclidean distance and sums the minimum distances.  Both the dense input
    and the sparse CSR input must reproduce its labels and inertia.
    """
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    # Builtin ``int`` and ``np.inf`` replace the ``np.int`` / ``np.infty``
    # aliases, which recent NumPy releases no longer provide.
    labels_gold = - np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        # Samples strictly closer to this centre than to any earlier one.
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
def test_check_is_permutation():
    """``_check_is_permutation`` accepts 0..99 and rejects damaged copies."""
    indices = np.arange(100)
    assert_true(_check_is_permutation(indices, 100))

    # One index missing.
    shortened = np.delete(indices, 23)
    assert_false(_check_is_permutation(shortened, 100))

    # 23 appears twice and 0 is gone.
    indices[0] = 23
    assert_false(_check_is_permutation(indices, 100))
def train_test_split_mock_pandas():
    """Both halves of a split ``MockDataFrame`` must stay ``MockDataFrame``.

    NOTE(review): the name lacks a ``test_`` prefix, so name-based test
    collection presumably skips this function - confirm whether that is
    intended.
    """
    # Wrap the module-level X in the mock dataframe.
    frame = MockDataFrame(X)
    train_part, test_part = train_test_split(frame)
    for part in (train_part, test_part):
        assert_true(isinstance(part, MockDataFrame))
    # Split a second time; the results are not inspected.
    train_again, test_again = train_test_split(frame)
Example #10
0
def test_ovo_gridsearch():
    """Grid search over ``estimator__C`` of a one-vs-one LinearSVC wrapper."""
    candidate_cs = [0.1, 0.5, 0.8]
    one_vs_one = OneVsOneClassifier(LinearSVC(random_state=0))
    search = GridSearchCV(one_vs_one, {'estimator__C': candidate_cs})
    search.fit(iris.data, iris.target)
    # The C of the first fitted sub-estimator must be one of the candidates.
    first_sub_estimator = search.best_estimator_.estimators_[0]
    assert_true(first_sub_estimator.C in candidate_cs)
Example #11
0
def test_ovr_multilabel():
    """One-vs-rest multilabel fitting from label lists and indicator matrix.

    Each base estimator is fitted twice on a toy dataset: once with lists of
    string labels (which must emit a ``DeprecationWarning``) and once with
    the equivalent binary indicator matrix.
    """
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"],
         ["ham", "eggs"], ["ham"]]
    # Indicator-matrix form of the targets.
    Y = np.array([[0, 1, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 0, 1],
                  [1, 0, 0]])

    classes = set("ham eggs spam".split())

    base_estimators = (MultinomialNB(), LinearSVC(random_state=0),
                       LinearRegression(), Ridge(),
                       ElasticNet(), Lasso(alpha=0.5))
    for base_clf in base_estimators:
        # Targets given as lists of labels.
        fit_from_lists = OneVsRestClassifier(base_clf).fit
        clf = assert_warns(DeprecationWarning, fit_from_lists, X, y)
        assert_equal(set(clf.classes_), classes)
        first_prediction = clf.predict([[0, 4, 4]])[0]
        assert_equal(set(first_prediction), {"spam", "eggs"})
        assert_true(clf.multilabel_)

        # Targets given as a label indicator matrix.
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        first_prediction = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(first_prediction, [0, 1, 1])
        assert_true(clf.multilabel_)
Example #12
0
def check_warm_start_oob(name):
    """Check that a warm-started forest computes the OOB score when asked.

    ``name`` selects the estimator class from ``FOREST_ESTIMATORS``.
    """
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Settings shared by every forest built below.
    shared = dict(max_depth=3, random_state=1, bootstrap=True)

    # Reference: 15 trees in one go (15 avoids the 'some inputs do not have
    # OOB scores' warning).
    reference = ForestEstimator(n_estimators=15, warm_start=False,
                                oob_score=True, **shared)
    reference.fit(X, y)

    # Start with 5 trees and no OOB score, then grow to 15 with OOB enabled.
    grown = ForestEstimator(n_estimators=5, warm_start=False,
                            oob_score=False, **shared)
    grown.fit(X, y)

    grown.set_params(warm_start=True, oob_score=True, n_estimators=15)
    grown.fit(X, y)

    assert_true(hasattr(grown, 'oob_score_'))
    assert_equal(reference.oob_score_, grown.oob_score_)

    # The OOB score must also be computed when no additional trees have to
    # be trained.
    complete = ForestEstimator(n_estimators=15, warm_start=True,
                               oob_score=False, **shared)
    complete.fit(X, y)
    assert_true(not hasattr(complete, 'oob_score_'))

    complete.set_params(oob_score=True)
    ignore_warnings(complete.fit)(X, y)

    assert_equal(reference.oob_score_, complete.oob_score_)
def check_get_params_invariance(name, estimator):
    """Check that shallow parameters are a subset of the deep parameters.

    Parameters
    ----------
    name : str
        Name of the estimator class under test.
    estimator : callable
        Estimator class.  It is called with a single ``('clf', T())`` step
        list for ``'FeatureUnion'`` / ``'Pipeline'`` and without arguments
        otherwise.

    Returns
    -------
    None
        The check returns immediately, without instantiating anything, for
        ``'GridSearchCV'`` and ``'RandomizedSearchCV'``.
    """
    # The comma matters: without it the two adjacent literals concatenate
    # into one string and ``in`` silently becomes a substring test.
    if name in ('GridSearchCV', 'RandomizedSearchCV'):
        return

    class T(BaseEstimator):
        """Mock classifier
        """

        def __init__(self):
            pass

        def fit(self, X, y):
            return self

    if name in ('FeatureUnion', 'Pipeline'):
        e = estimator([('clf', T())])
    else:
        e = estimator()

    shallow_params = e.get_params(deep=False)
    deep_params = e.get_params(deep=True)

    # Every (key, value) pair of the shallow view must appear in the deep one.
    assert_true(all(item in deep_params.items() for item in
                    shallow_params.items()))
def test_kfold_valueerrors():
    """KFold / StratifiedKFold argument validation and small-class warning."""
    # Asking for more folds than there are samples is an error.
    assert_raises(ValueError, cval.KFold, 3, 4)

    # A warning is expected when the least populated class has too few
    # members.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        y = [3, 3, -1, -1, 2]
        stratified = cval.StratifiedKFold(y, 3)
        # Exactly one warning was recorded ...
        assert_equal(len(caught), 1)
        only_warning = caught[0]
        # ... of the expected category ...
        assert_equal(only_warning.category, Warning)
        # ... with the expected message.  This may be a fragile check since
        # it pins a characteristic of the code rather than a behaviour.
        assert_true("The least populated class" in str(only_warning))

        # Despite the warning the folds are still computed, even though not
        # every class is necessarily present on each side of each split.
        check_cv_coverage(stratified, expected_n_iter=3, n_samples=len(y))

    # Error when number of folds is <= 1
    for too_few_folds in (0, 1):
        assert_raises(ValueError, cval.KFold, 2, too_few_folds)
    for too_few_folds in (0, 1):
        assert_raises(ValueError, cval.StratifiedKFold, y, too_few_folds)

    # Non-integer n
    assert_raises(ValueError, cval.KFold, 2.5, 2)

    # Non-integer n_folds
    assert_raises(ValueError, cval.KFold, 5, 1.5)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1.5)
Example #15
0
def test_symmetry():
    """Check argument-order symmetry of the score and loss functions."""
    rng = check_random_state(0)
    y_true = rng.randint(0, 2, size=(20, ))
    y_pred = rng.randint(0, 2, size=(20, ))

    # Every metric must be listed in one of the four groups.
    categorised = set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS,
                                               THRESHOLDED_METRICS,
                                               METRIC_UNDEFINED_MULTICLASS)
    assert_equal(categorised, set(ALL_METRICS))

    # No metric may be listed as both symmetric and not symmetric.
    listed_twice = set(SYMMETRIC_METRICS).intersection(
        set(NOT_SYMMETRIC_METRICS))
    assert_equal(listed_twice, set())

    # Symmetric metrics: swapping the arguments leaves the value unchanged.
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        forward = metric(y_true, y_pred)
        backward = metric(y_pred, y_true)
        assert_almost_equal(forward, backward,
                            err_msg="%s is not symmetric" % name)

    # Non-symmetric metrics: swapping the arguments changes the value.
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        forward = metric(y_true, y_pred)
        backward = metric(y_pred, y_true)
        assert_true(np.any(forward != backward),
                    msg="%s seems to be symmetric" % name)
Example #16
0
def test_fetch_rcv1():
    """Check sparsity, shapes, category order and subsets of RCV1.

    The dataset is never downloaded here.  When ``fetch_rcv1`` raises an
    ``IOError`` with errno ``ENOENT`` the test is skipped; any other
    ``IOError`` is propagated.
    """
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
        # Any other I/O failure is a genuine error: re-raise it instead of
        # falling through with ``data1`` unbound.
        raise

    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test ordering of categories
    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        # Locate the same sample in the unshuffled and the shuffled data.
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
Example #17
0
def test_neighbors_accuracy_with_n_estimators():
    """Accuracy of LSHForest queries must not decrease with ``n_estimators``.

    For each forest size the approximate neighbours of randomly chosen
    training points are compared with the exact cosine-distance neighbours,
    and the mean overlap ratio is recorded.
    """
    forest_sizes = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(forest_sizes.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for slot, size in enumerate(forest_sizes):
        build_forest = ignore_warnings(LSHForest,
                                       category=DeprecationWarning)
        forest = build_forest(n_candidates=500, n_estimators=size)
        ignore_warnings(forest.fit)(X)

        overlap_sum = 0.0
        for _ in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            approximate = forest.kneighbors(query, n_neighbors=n_points,
                                            return_distance=False)
            exact_distances = pairwise_distances(query, X, metric='cosine')
            exact = np.argsort(exact_distances)[0, :n_points]

            # Fraction of the exact neighbours that the forest also found.
            n_shared = np.intersect1d(exact, approximate).shape[0]
            overlap_sum = overlap_sum + n_shared / float(n_points)

        accuracies[slot] = overlap_sum / float(n_iter)

    # Accuracies must already be in non-decreasing order.
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.

    Also covers caching through ``memory``, early stopping of the tree
    building, fitting without connectivity, and the errors raised for a dense
    or wrongly shaped connectivity matrix.
    """
    # Local import: only needed to remove the temporary cache directory.
    import shutil

    rnd = np.random.RandomState(0)
    # Builtin ``bool`` replaces the ``np.bool`` alias, which recent NumPy
    # releases no longer provide.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    cache_dir = mkdtemp()
    try:
        clustering = Ward(n_clusters=10, connectivity=connectivity,
                          memory=cache_dir)
        clustering.fit(X)
        labels = clustering.labels_
    finally:
        # Do not leave the cache directory behind, even if fitting failed.
        shutil.rmtree(cache_dir, ignore_errors=True)
    assert_true(np.size(np.unique(labels)) == 10)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10,
                      connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    # A connectivity matrix cut down to 10 x 10 must raise a ValueError
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
def test_enet_positive_constraint():
    """ElasticNet with ``positive=True`` must not yield negative coefs."""
    # Three points on a straight line with negative slope.
    features = [[-1], [0], [1]]
    targets = [1, 0, -1]

    model = ElasticNet(alpha=0.1, max_iter=1000, positive=True)
    model.fit(features, targets)
    smallest_coef = min(model.coef_)
    assert_true(smallest_coef >= 0)
def test_grid_search_score_method():
    """``GridSearchCV.score`` must follow the configured ``scoring``.

    Four searches are fitted (no scoring, accuracy, ROC AUC, and ROC AUC on
    an estimator without its own ``score``); scoring them must not warn.
    """
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    base = LinearSVC(random_state=0)
    param_grid = {'C': [.1]}

    def fitted_search(estimator, scoring):
        # Build and fit one grid search on the shared data.
        return GridSearchCV(estimator, param_grid, scoring=scoring).fit(X, y)

    plain = fitted_search(base, None)
    by_accuracy = fitted_search(base, 'accuracy')
    scoreless_auc = fitted_search(LinearSVCNoScore(), 'roc_auc')
    by_auc = fitted_search(base, 'roc_auc')

    # ChangedBehaviourWarning occurred previously (prior to #9005)
    plain_score = assert_no_warnings(plain.score, X, y)
    accuracy_score_ = assert_no_warnings(by_accuracy.score, X, y)
    scoreless_auc_score = assert_no_warnings(scoreless_auc.score, X, y)
    auc_score = assert_no_warnings(by_auc.score, X, y)

    # Sanity: neither score is perfect and the two metrics differ.
    assert_true(auc_score < 1.0)
    assert_true(accuracy_score_ < 1.0)
    assert_not_equal(auc_score, accuracy_score_)

    assert_almost_equal(accuracy_score_, plain_score)
    assert_almost_equal(auc_score, scoreless_auc_score)
def test_parameters_sampler_replacement():
    """ParameterSampler over finite grids samples without replacement."""
    small_grid = {'first': [0, 1], 'second': ['a', 'b', 'c']}

    # Asking for more points than the grid holds raises on iteration.
    too_many = ParameterSampler(small_grid, n_iter=7)
    assert_raises(ValueError, list, too_many)

    # With n_iter equal to the grid size every grid point is produced.
    exhaustive = list(ParameterSampler(small_grid, n_iter=6))
    assert_equal(len(exhaustive), 6)
    for point in ParameterGrid(small_grid):
        assert_true(point in exhaustive)

    # Sampling without replacement in a large grid: all draws are distinct.
    large_grid = {'a': range(10), 'b': range(10), 'c': range(10)}
    drawn = list(ParameterSampler(large_grid, n_iter=99, random_state=42))
    assert_equal(len(drawn), 99)
    # Dicts are unhashable, so encode each draw as a string before dedup.
    encoded = ["a%db%dc%d" % (p['a'], p['b'], p['c'])
               for p in drawn]
    assert_equal(len(set(encoded)), 99)

    # With a distribution in the mix the sampler still terminates.
    with_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
    drawn = list(ParameterSampler(with_distribution, n_iter=7))
    assert_equal(len(drawn), 7)
def test_simple():
    """Lars keeps covariances tied and decreasing along the path.

    ``lars_path`` is run with ``verbose=10`` while stdout is redirected to an
    in-memory buffer, which also exercises the verbose output.
    """
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys

    saved_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        alphas_, active, coef_path_ = linear_model.lars_path(
            diabetes.data, diabetes.target, method="lar", verbose=10)
    finally:
        # Put the real stdout back whether or not lars_path succeeded.
        sys.stdout = saved_stdout

    tolerance = 1e-3
    for step, coefs in enumerate(coef_path_.T):
        residual = y - np.dot(X, coefs)
        covariances = np.dot(X.T, residual)
        magnitudes = abs(covariances)
        largest = np.max(magnitudes)
        # Number of features whose covariance is tied with the largest one.
        n_tied = len(covariances[largest - tolerance < magnitudes])
        n_features = X.shape[1]
        # No more than n_features variables can enter the active set.
        expected = step + 1 if step < n_features else n_features
        assert_true(n_tied == expected)
def test_enet_path_positive():
    """Path coefficients computed with ``positive=True`` are non-negative."""
    X, y, _, _ = build_dataset(n_samples=50, n_features=50)
    for compute_path in (enet_path, lasso_path):
        path_result = compute_path(X, y, positive=True)
        # The coefficients are the second item of the returned tuple.
        coefs = path_result[1]
        assert_true(np.all(coefs >= 0))
def test_check_increasing_up_extreme():
    """``check_increasing`` on y == x: True, and no warning is emitted."""
    abscissa = [0, 1, 2, 3, 4, 5]
    ordinate = [0, 1, 2, 3, 4, 5]

    # assert_no_warnings runs the call and hands back its result.
    verdict = assert_no_warnings(check_increasing, abscissa, ordinate)
    assert_true(verdict)
def test_lasso_cv():
    """LassoCV selects the expected alpha and agrees with LassoLarsCV."""
    X, y, X_test, y_test = build_dataset()
    shared = dict(n_alphas=10, eps=1e-3, max_iter=150)

    cd_model = LassoCV(**shared).fit(X, y)
    assert_almost_equal(cd_model.alpha_, 0.056, 2)

    cd_model = LassoCV(precompute=True, **shared)
    cd_model.fit(X, y)
    assert_almost_equal(cd_model.alpha_, 0.056, 2)

    # The lars and the coordinate descent implementations should select a
    # similar alpha ...
    lars_model = LassoLarsCV(normalize=False, max_iter=30).fit(X, y)
    # ... meaning their positions in the (reversed) alpha grid of the
    # coordinate descent model differ by at most one.
    reversed_alphas = cd_model.alphas_[::-1]
    lars_position = np.searchsorted(reversed_alphas, lars_model.alpha_)
    cd_position = np.searchsorted(reversed_alphas, cd_model.alpha_)
    assert_true(np.abs(lars_position - cd_position) <= 1)

    # They should also give a similar MSE.
    lars_mse_at = interpolate.interp1d(lars_model.cv_alphas_,
                                       lars_model.cv_mse_path_.T)
    lars_mse = lars_mse_at(cd_model.alphas_[5]).mean()
    cd_mse = cd_model.mse_path_[5].mean()
    np.testing.assert_approx_equal(lars_mse, cd_mse, significant=2)

    # Held-out score.
    assert_greater(cd_model.score(X_test, y_test), 0.99)
def test_fit_transform():
    """SparsePCA: serial vs ``n_jobs=2`` transforms, and 'lars' vs 'cd'."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    lars_model = SparsePCA(n_components=3, method='lars', alpha=alpha,
                           random_state=0)
    lars_model.fit(Y)
    serial_codes = lars_model.transform(Y)

    # Same fit with two jobs.
    if sys.platform == 'win32':
        # Fake parallelism on win32: hide multiprocessing from joblib while
        # fitting, then restore it.
        import sklearn.externals.joblib.parallel as joblib_par
        saved_multiprocessing = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            parallel_model = SparsePCA(n_components=3, n_jobs=2,
                                       random_state=0, alpha=alpha).fit(Y)
            parallel_codes = parallel_model.transform(Y)
        finally:
            joblib_par.multiprocessing = saved_multiprocessing
    else:
        # Real parallelism is usable here.
        parallel_model = SparsePCA(n_components=3, n_jobs=2, method='lars',
                                   alpha=alpha, random_state=0).fit(Y)
        parallel_codes = parallel_model.transform(Y)

    assert_true(not np.all(lars_model.components_ == 0))
    assert_array_almost_equal(serial_codes, parallel_codes)

    # Coordinate descent should give similar components.
    cd_model = SparsePCA(n_components=3, method='cd', random_state=0,
                         alpha=alpha)
    cd_model.fit(Y)
    assert_array_almost_equal(cd_model.components_, lars_model.components_)
def test_check_increasing_up():
    """``check_increasing`` on non-decreasing y: True, and no warning."""
    abscissa = [0, 1, 2, 3, 4, 5]
    ordinate = [0, 1.5, 2.77, 8.99, 8.99, 50]

    # assert_no_warnings runs the call and hands back its result.
    verdict = assert_no_warnings(check_increasing, abscissa, ordinate)
    assert_true(verdict)
Example #28
0
    def test_sgd_l1(self):
        """L1 regularisation: zeroed coefs, sparsify and pickle round trip."""
        n_rows = len(X4)
        order = np.arange(n_rows)
        np.random.RandomState(13).shuffle(order)

        X = X4[order, :]
        Y = Y4[order]

        def assert_predicts_training_labels(model):
            # The model must reproduce the training labels exactly.
            assert_array_equal(model.predict(X), Y)

        clf = self.factory(penalty="l1", alpha=0.2, fit_intercept=False,
                           n_iter=2000, shuffle=False)
        clf.fit(X, Y)
        # The inner coefficients of the first row must be exactly zero.
        assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,)))
        assert_predicts_training_labels(clf)

        # Sparsify a model that was fitted on dense inputs.
        clf.sparsify()
        assert_true(sp.issparse(clf.coef_))
        assert_predicts_training_labels(clf)

        # Pickle round trip while coef_ is sparse.
        clf = pickle.loads(pickle.dumps(clf))
        assert_true(sp.issparse(clf.coef_))
        assert_predicts_training_labels(clf)
Example #29
0
def test_scale_function_without_centering():
    """``scale(..., with_mean=False)`` on dense, CSR and CSC inputs."""
    rng = np.random.RandomState(42)
    dense = rng.randn(4, 5)
    dense[:, 0] = 0.0  # the first feature is zero for every sample
    csr = sparse.csr_matrix(dense)

    dense_scaled = scale(dense, with_mean=False)
    assert_false(np.any(np.isnan(dense_scaled)))

    csr_scaled = scale(csr, with_mean=False)
    assert_false(np.any(np.isnan(csr_scaled.data)))

    # CSC input must give the same result as the dense input.
    csc_scaled = scale(csr.tocsc(), with_mean=False)
    assert_array_almost_equal(dense_scaled, csc_scaled.toarray())

    # Sparse input with axis != 0 raises a ValueError.
    assert_raises(ValueError, scale, csr, with_mean=False, axis=1)

    expected_means = [0., -0.01, 2.24, -0.35, -0.78]
    assert_array_almost_equal(dense_scaled.mean(axis=0), expected_means, 2)
    assert_array_almost_equal(dense_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # The result must be a different object from the input array.
    assert_true(dense_scaled is not dense)

    csr_mean, csr_spread = mean_variance_axis0(csr_scaled)
    assert_array_almost_equal(csr_mean, dense_scaled.mean(axis=0))
    assert_array_almost_equal(csr_spread, dense_scaled.std(axis=0))
def test_lars_path_positive_constraint():
    """Main test for the ``positive`` parameter of ``lars_path``.

    The estimator classes just make use of this function.  On the diabetes
    dataset, the lasso path must contain negative coefficients with
    ``positive=False`` and none with ``positive=True``.
    """
    data, target = diabetes['data'], diabetes['target']

    # Once deprecation of LAR + positive option is done use these:
    # assert_raises(ValueError, linear_model.lars_path, diabetes['data'],
    #               diabetes['target'], method='lar', positive=True)

    with pytest.warns(DeprecationWarning, match="broken"):
        linear_model.lars_path(data, target, return_path=True,
                               method='lar', positive=True)

    def lasso_path_coefs(positive):
        # Run the lasso variant and keep only the coefficient path.
        alpha, active, coefs = linear_model.lars_path(
            data, target, return_path=True, method='lasso',
            positive=positive)
        return coefs

    unconstrained = lasso_path_coefs(False)
    assert_true(unconstrained.min() < 0)

    constrained = lasso_path_coefs(True)
    assert_true(constrained.min() >= 0)
Example #31
0
 def test_parameters(self):
     """Every expected fitted attribute exists on ``self.clf`` and is set."""
     expected_attributes = (
         'decision_scores_',
         'labels_',
         'threshold_',
         '_mu',
         '_sigma',
         'hist_',
         'bin_edges_',
     )
     for attribute in expected_attributes:
         # Present on the detector and not None.
         is_set = (hasattr(self.clf, attribute)
                   and getattr(self.clf, attribute) is not None)
         assert_true(is_set)
Example #32
0
def test_normalize():
    """``transform="normalize"`` on Real (both priors) and Integer dims."""

    def assert_samples_within(dim, low, high):
        # Draw with 50 different seeds and check each draw against the limits.
        for seed in range(50):
            check_limits(dim.rvs(random_state=seed), low, high)

    def assert_transform_in_unit_interval(dim, values):
        # Transformed values must lie in [0, 1].
        assert_true(np.all(dim.transform(values) <= np.ones_like(values)))
        assert_true(np.all(np.zeros_like(values) <= dim.transform(values)))

    # Uniform prior
    a = Real(2.0, 30.0, transform="normalize")
    assert_samples_within(a, 2, 30)

    rng = np.random.RandomState(0)
    X = rng.randn(100)
    # Rescale the draws linearly onto [2, 30].
    X = 28 * (X - X.min()) / (X.max() - X.min()) + 2

    assert_transform_in_unit_interval(a, X)

    # Round trip through transform and inverse_transform.
    assert_array_almost_equal(a.inverse_transform(a.transform(X)), X)

    # log-uniform prior
    a = Real(10**2.0, 10**4.0, prior="log-uniform", transform="normalize")
    assert_samples_within(a, 10**2, 10**4)

    rng = np.random.RandomState(0)
    X = np.clip(10**3 * rng.randn(100), 10**2.0, 10**4.0)

    assert_transform_in_unit_interval(a, X)

    # Round trip through transform and inverse_transform.
    assert_array_almost_equal(a.inverse_transform(a.transform(X)), X)

    # Integer dimension
    a = Integer(2, 30, transform="normalize")
    assert_samples_within(a, 2, 30)
    assert_array_equal(a.transformed_bounds, (0, 1))

    X = rng.randint(2, 31)
    assert_transform_in_unit_interval(a, X)

    # The round trip must give back the value with an int64 dtype.
    round_trip = a.inverse_transform(a.transform(X))
    assert_equal(round_trip.dtype, "int64")
    assert_array_equal(round_trip, X)
Example #33
0
def test_space_api():
    """Exercise the public ``Space`` API on a five-dimensional space.

    Covers dimension inference, sampling, transform / inverse_transform
    round trips, and the plain and transformed bounds.
    """
    space = Space([(0.0, 1.0), (-5, 5), ("a", "b", "c"),
                   (1.0, 5.0, "log-uniform"), ("e", "f")])

    # Tuples mixing a number and a string become categorical dimensions.
    cat_space = Space([(1, "r"), (1.0, "r")])
    for position in (0, 1):
        assert isinstance(cat_space.dimensions[position], Categorical)

    assert_equal(len(space.dimensions), 5)
    dimension_kinds = (Real, Integer, Categorical, Real, Categorical)
    for position, kind in enumerate(dimension_kinds):
        assert_true(isinstance(space.dimensions[position], kind))

    samples = space.rvs(n_samples=10, random_state=0)
    assert_equal(len(samples), 10)
    assert_equal(len(samples[0]), 5)

    assert_true(isinstance(samples, list))
    for n in range(4):
        assert_true(isinstance(samples[n], list))

    # Expected Python type of each entry of a sampled point.
    value_kinds = (numbers.Real, numbers.Integral, str, numbers.Real, str)
    for position, kind in enumerate(value_kinds):
        assert_true(isinstance(samples[0][position], kind))

    samples_transformed = space.transform(samples)
    assert_equal(samples_transformed.shape[0], len(samples))
    assert_equal(samples_transformed.shape[1], 1 + 1 + 3 + 1 + 1)
    assert_array_equal(samples, space.inverse_transform(samples_transformed))

    # Types must survive the inverse transform.
    samples = space.inverse_transform(samples_transformed)
    for position, kind in enumerate(value_kinds):
        assert_true(isinstance(samples[0][position], kind))

    expected_bounds = [(0.0, 1.0), (-5, 5),
                       np.asarray(["a", "b", "c"]), (1.0, 5.0),
                       np.asarray(["e", "f"])]
    for actual, expected in zip(space.bounds, expected_bounds):
        assert_array_equal(actual, expected)

    expected_transformed_bounds = [
        (0.0, 1.0), (-5, 5), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0),
        (np.log10(1.0), np.log10(5.0)), (0.0, 1.0)]
    for actual, expected in zip(space.transformed_bounds,
                                expected_transformed_bounds):
        assert_array_equal(actual, expected)
Example #34
0
def test_metaestimator_delegation():
    """Check that each metaestimator exposes a method iff its delegate does."""

    def hides(method):
        # Expose ``method`` through a property. Reading the attribute raises
        # AttributeError (so hasattr() is False) when the instance's
        # ``hidden_method`` names this method; otherwise it yields the
        # method bound to the instance.
        def getter(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)

        return property(getter)

    class SubEstimator(BaseEstimator):
        # Minimal delegate whose public methods can be hidden one at a time.

        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            # Setting coef_ is what marks the estimator as fitted.
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            if not hasattr(self, 'coef_'):
                raise RuntimeError('Estimator is not fit')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, *args, **kwargs):
            self._check_fit()
            return 1.0

    # Every public, non-fit attribute defined directly on SubEstimator.
    public_names = sorted(
        name for name in iterkeys(SubEstimator.__dict__)
        if not name.startswith(('_', 'fit')))

    for spec in DELEGATING_METAESTIMATORS:
        to_check = [name for name in public_names
                    if name not in spec.skip_methods]

        sub = SubEstimator()
        meta = spec.construct(sub)
        for name in to_check:
            assert_true(hasattr(sub, name))
            assert_true(
                hasattr(meta, name),
                msg="%s does not have method %r when its delegate does" %
                (spec.name, name))
            # delegation before fit raises an exception
            assert_raises(Exception, getattr(meta, name), spec.fit_args[0])

        meta.fit(*spec.fit_args)
        for name in to_check:
            # smoke test delegation
            getattr(meta, name)(spec.fit_args[0])

        # Hiding a method on the delegate must hide it on the metaestimator.
        for name in to_check:
            sub = SubEstimator(hidden_method=name)
            meta = spec.construct(sub)
            assert_false(hasattr(sub, name))
            assert_false(hasattr(meta, name),
                         msg="%s has method %r when its delegate does not" %
                         (spec.name, name))
Example #35
0
def test_has_fit_parameter():
    """Check has_fit_parameter for "sample_weight" on classes and an instance."""
    param = "sample_weight"
    assert_false(has_fit_parameter(KNeighborsClassifier, param))
    assert_true(has_fit_parameter(RandomForestRegressor, param))
    # Both the class and an instance are accepted.
    for estimator_factory in (lambda: SVR, SVR):
        assert_true(has_fit_parameter(estimator_factory(), param))
Example #36
0
def test_check_array_dtype_warning():
    """Check when check_array / check_X_y emit DataConversionWarning."""
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    dense_f64 = np.asarray(rows, dtype=np.float64)
    dense_f32 = np.asarray(rows, dtype=np.float32)
    dense_i64 = np.asarray(rows, dtype=np.int64)
    csr_f64 = sp.csr_matrix(dense_f64)
    csr_f32 = sp.csr_matrix(dense_f32)
    csc_f32 = sp.csc_matrix(dense_f32)
    csc_i32 = sp.csc_matrix(dense_i64, dtype=np.int32)
    y = [0, 0, 1]

    # Integer input: converted to float64, warning only when requested.
    for X in (dense_i64, csc_i32):
        converted = assert_no_warnings(
            check_array, X, dtype=np.float64, accept_sparse=True)
        assert_equal(converted.dtype, np.float64)

        converted = assert_warns(
            DataConversionWarning, check_array, X,
            dtype=np.float64, accept_sparse=True, warn_on_dtype=True)
        assert_equal(converted.dtype, np.float64)

        # Check that the warning message includes the name of the Estimator
        converted = assert_warns_message(
            DataConversionWarning, 'SomeEstimator', check_array, X,
            dtype=[np.float64, np.float32], accept_sparse=True,
            warn_on_dtype=True, estimator='SomeEstimator')
        assert_equal(converted.dtype, np.float64)

        converted, y_converted = assert_warns_message(
            DataConversionWarning, 'KNeighborsClassifier', check_X_y, X, y,
            dtype=np.float64, accept_sparse=True, warn_on_dtype=True,
            estimator=KNeighborsClassifier())
        assert_equal(converted.dtype, np.float64)

    # float64 input already has the requested dtype: never a warning.
    for X in (dense_f64, csr_f64):
        for warn_flag in (True, False):
            converted = assert_no_warnings(
                check_array, X, dtype=np.float64, accept_sparse=True,
                warn_on_dtype=warn_flag)
            assert_equal(converted.dtype, np.float64)

    # float32 input is in the list of accepted dtypes and is kept as is.
    for X in (dense_f32, csr_f32, csc_f32):
        converted = assert_no_warnings(
            check_array, X, dtype=[np.float64, np.float32],
            accept_sparse=True)
        assert_equal(converted.dtype, np.float32)
        assert_true(converted is X)

        converted = assert_no_warnings(
            check_array, X, dtype=[np.float64, np.float32],
            accept_sparse=['csr', 'dok'], copy=True)
        assert_equal(converted.dtype, np.float32)
        assert_false(converted is X)

    # A csc matrix is not in the accepted formats, so a new csr matrix is
    # returned even with copy=False.
    converted = assert_no_warnings(
        check_array, csc_f32, dtype=[np.float64, np.float32],
        accept_sparse=['csr', 'dok'], copy=False)
    assert_equal(converted.dtype, np.float32)
    assert_false(converted is csc_f32)
    assert_equal(converted.format, 'csr')
Example #37
0
def test_check_array():
    """Exercise check_array on dense, sparse and array-like inputs.

    Covers: rejection of sparse input by default, the ``ensure_2d``,
    ``allow_nd`` and ``force_all_finite`` switches, dtype / memory-order /
    copy handling for dense arrays, format conversion for sparse matrices,
    and conversion of lists and array-like objects.
    """
    # accept_sparse == None
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    assert_raises(TypeError, check_array, X_csr)
    # ensure_2d
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert_equal(X_array.ndim, 1)
    # don't allow ndim > 2 unless allow_nd is set
    X_ndim = np.arange(8).reshape(2, 2, 2)
    assert_raises(ValueError, check_array, X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise
    # force_all_finite
    # NOTE: builtin int / float / bool are used throughout instead of the
    # np.int / np.float / np.bool aliases, which were removed in NumPy 1.24.
    X_inf = np.arange(4).reshape(2, 2).astype(float)
    X_inf[0, 0] = np.inf
    assert_raises(ValueError, check_array, X_inf)
    check_array(X_inf, force_all_finite=False)  # no raise
    # nan check
    X_nan = np.arange(4).reshape(2, 2).astype(float)
    X_nan[0, 0] = np.nan
    assert_raises(ValueError, check_array, X_nan)
    # Check the NaN array here (the original re-checked X_inf by mistake).
    check_array(X_nan, force_all_finite=False)  # no raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert_equal(X_checked.dtype, dtype)
        else:
            assert_equal(X_checked.dtype, X.dtype)
        if order == 'C':
            assert_true(X_checked.flags['C_CONTIGUOUS'])
            assert_false(X_checked.flags['F_CONTIGUOUS'])
        elif order == 'F':
            assert_true(X_checked.flags['F_CONTIGUOUS'])
            assert_false(X_checked.flags['C_CONTIGUOUS'])
        if copy:
            assert_false(X is X_checked)
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and X_checked.flags['C_CONTIGUOUS']
                    == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS']
                    == X.flags['F_CONTIGUOUS']):
                assert_true(X is X_checked)

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X,
                                    dtype=dtype,
                                    accept_sparse=accept_sparse,
                                    copy=copy)
        # Only object dtype or dok input may produce a (known) warning.
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            message = str(w[0].message)
            messages = [
                "object dtype is not supported by sparse matrices",
                "Can't check dok sparse matrix for nan or inf."
            ]
            assert_true(message in messages)
        else:
            assert_equal(len(w), 0)
        if dtype is not None:
            assert_equal(X_checked.dtype, dtype)
        else:
            assert_equal(X_checked.dtype, X.dtype)
        if X.format in accept_sparse:
            # no change if allowed
            assert_equal(X.format, X_checked.format)
        else:
            # got converted
            assert_equal(X_checked.format, accept_sparse[0])
        if copy:
            assert_false(X is X_checked)
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and X.format == X_checked.format):
                assert_true(X is X_checked)

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert_true(isinstance(X_dense, np.ndarray))
    # raise on too deep lists
    assert_raises(ValueError, check_array, X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise
    # convert weird stuff to arrays
    X_no_array = NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert_true(isinstance(result, np.ndarray))
Example #38
0
def test_n_nonzero_coefs():
    """orthogonal_mp must return at most n_nonzero_coefs non-zero entries."""
    coef = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)
    assert_true(count_nonzero(coef) <= 5)

    # Same bound when precompute=True is requested.
    coef_precomputed = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5,
                                     precompute=True)
    assert_true(count_nonzero(coef_precomputed) <= 5)
Example #39
0
def test_classifiers_train():
    """Check that classifiers behave sensibly on their training set.

    Every classifier returned by ``all_estimators`` (minus the skipped ones)
    is fit once on a multiclass and once on a binary blob problem. The test
    checks input-shape validation, training accuracy, and that
    ``decision_function`` / ``predict_proba`` agree with ``predict``.
    """
    classifiers = all_estimators(type_filter='classifier')
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        for name, Classifier in classifiers:
            if name in dont_test:
                continue
            if name in ['MultinomialNB', 'BernoulliNB']:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                classifier = Classifier()
            # raises error on malformed input for fit
            assert_raises(ValueError, classifier.fit, X, y[:-1])

            # fit
            classifier.fit(X, y)
            assert_true(hasattr(classifier, "classes_"))
            y_pred = classifier.predict(X)
            assert_equal(y_pred.shape, (n_samples, ))
            # training set performance
            assert_greater(accuracy_score(y, y_pred), 0.85)

            # raises error on malformed input for predict
            assert_raises(ValueError, classifier.predict, X.T)
            if hasattr(classifier, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = classifier.decision_function(X)
                    # Compare by value: ``is`` on ints only works through
                    # CPython's small-int cache and warns on Python 3.8+.
                    if n_classes == 2:
                        assert_equal(decision.ravel().shape, (n_samples, ))
                        dec_pred = (decision.ravel() > 0).astype(int)
                        assert_array_equal(dec_pred, y_pred)
                    if (n_classes == 3
                            and not isinstance(classifier, BaseLibSVM)):
                        # 1on1 of LibSVM works differently
                        assert_equal(decision.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)

                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, classifier.decision_function,
                                  X.T)
                except NotImplementedError:
                    pass
            if hasattr(classifier, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = classifier.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # check that probas for all classes sum to one
                    assert_array_almost_equal(np.sum(y_prob, axis=1),
                                              np.ones(n_samples))
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, classifier.predict_proba, X.T)
                except NotImplementedError:
                    pass
Example #40
0
def test_tol():
    """Squared residual of the orthogonal_mp solutions must not exceed tol."""
    tol = 0.5
    gamma = orthogonal_mp(X, y[:, 0], tol=tol)
    gamma_gram = orthogonal_mp(X, y[:, 0], tol=tol, precompute=True)
    # Same residual check for the default and the precompute=True solution.
    for coef in (gamma, gamma_gram):
        residual = y[:, 0] - np.dot(X, coef)
        assert_true(np.sum(residual ** 2) <= tol)
Example #41
0
def test_transformers():
    """Smoke-test every transformer: fit, fit_transform and transform.

    Checks the number of output rows, that ``fit_transform`` and
    ``transform`` agree, and that ``transform`` rejects transposed input.
    Errors in the fit / fit_transform stage are printed and only make the
    test fail at the very end, so that all transformers get exercised.
    """
    needs_2d_y = ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD')

    all_transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in all_transformers:
        # skip excluded estimators and those that don't actually fit the data
        if (name in dont_test
                or name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        # Per-estimator parameter tweaks needed for this tiny data set.
        if name == 'SelectKBest':
            # SelectKBest has a default of k=10
            # which is more feature than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1
        elif name == "MiniBatchDictionaryLearning":
            transformer.set_params(n_iter=5)  # default = 1000
        elif name == "KernelPCA":
            transformer.remove_zero_eig = False

        # These estimators get a two-column target.
        if name in needs_2d_y:
            y_ = np.c_[y, y]
            y_[::2, 1] *= 2
        else:
            y_ = y

        # fit
        try:
            transformer.fit(X, y_)
            X_pred = transformer.fit_transform(X, y=y_)
            outputs = X_pred if isinstance(X_pred, tuple) else (X_pred,)
            for output in outputs:
                assert_equal(output.shape[0], n_samples)
        except Exception as e:
            print(transformer)
            print(e)
            print()
            succeeded = False
            continue

        if not hasattr(transformer, 'transform'):
            continue

        if name in needs_2d_y:
            X_pred2 = transformer.transform(X, y_)
        else:
            X_pred2 = transformer.transform(X)
        X_pred3 = transformer.fit_transform(X, y=y_)

        mismatch = "fit_transform not correct in %s" % Transformer
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            triples = zip(X_pred, X_pred2, X_pred3)
        else:
            triples = [(X_pred, X_pred2, X_pred3)]
        for first, second, third in triples:
            assert_array_almost_equal(first, second, 2, mismatch)
            assert_array_almost_equal(third, second, 2, mismatch)

        # raises error on malformed input for transform
        assert_raises(ValueError, transformer.transform, X.T)
    assert_true(succeeded)
Example #42
0
 def fit(self, X, Y):
     """Check that X and Y have equal length, set ``coef_`` to ones.

     ``coef_`` gets one float64 entry per column of X; returns ``self``.
     """
     same_length = len(X) == len(Y)
     assert_true(same_length)
     n_columns = X.shape[1]
     self.coef_ = np.ones(n_columns, dtype=np.float64)
     return self
Example #43
0
def test_fastica(add_noise=False):
    """Test the FastICA algorithm on very simple data.

    Two sources (a square wave and a heavy-tailed Student-t signal) are
    mixed by a rotation, then unmixed with every combination of algorithm,
    non-linearity and whitening mode.

    Parameters
    ----------
    add_noise : bool, default False
        When true, Gaussian noise is added to the mixed signals and the
        recovered sources are compared with a looser tolerance.
    """
    rng = np.random.RandomState(0)
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    # Draw the Student-t source (1 degree of freedom) from the seeded
    # generator. scipy.stats.t.rvs would use the unseeded global RNG and
    # make the input data differ from run to run.
    s2 = rng.standard_t(1, size=n_samples)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi),
                                                    -np.cos(phi)]])
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, n_samples)

    center_and_norm(m)

    # Looser tolerance when the mixtures are noisy.
    decimal = 1 if add_noise else 2

    algos = ['parallel', 'deflation']
    nls = ['logcosh', 'exp', 'cube']
    whitening = [True, False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo)
        else:
            # Whiten beforehand and tell fastica not to do it again.
            X = PCA(n_components=2, whiten=True).fit_transform(m.T)
            k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        # Fix the sign ambiguity of the estimated sources.
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=decimal)
        assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=decimal)

    # Test FastICA class, using the last configuration of the grid above
    # (the values the loop variables held when the loop finished).
    ica = FastICA(fun=nls[-1], algorithm=algos[-1], random_state=0)
    ica.fit(m.T)
    ica.get_mixing_matrix()
    assert_true(ica.components_.shape == (2, 2))
    assert_true(ica.sources_.shape == (n_samples, 2))
Example #44
0
def test_transformers_pickle():
    """Check that fitted transformers give the same output after pickling.

    Each transformer is fit, serialised with ``pickle`` and restored; the
    restored object's ``transform`` output must match the original's.
    """
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        if not hasattr(transformer, 'transform'):
            continue
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10
            # which is more feature than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1

        # fit
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            # These estimators get a two-column target.
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        transformer.fit(X, y_)
        X_pred = transformer.fit(X, y_).transform(X)
        # Round-trip through pickle; the payload is produced right here, so
        # loading it back is safe.
        pickled_transformer = pickle.dumps(transformer)
        unpickled_transformer = pickle.loads(pickled_transformer)
        pickled_X_pred = unpickled_transformer.transform(X)

        try:
            assert_array_almost_equal(pickled_X_pred, X_pred)
        except Exception:
            # Name the offending transformer, then let the failure propagate
            # with its original traceback.
            print("Transformer %s doesn't predict the same value "
                  "after pickling" % name)
            raise
Example #45
0
def test_check_array():
    """Exercise check_array on dense, sparse and array-like inputs.

    Covers: rejection of sparse input by default, ``ensure_2d`` error
    messages, ``allow_nd``, dtype / memory-order / copy handling for dense
    arrays, format conversion for sparse matrices, conversion of lists and
    array-like objects, and the FutureWarning emitted for string / bytes
    input with ``dtype="numeric"``.
    """
    # accept_sparse == None
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    assert_raises(TypeError, check_array, X_csr)
    # ensure_2d=False
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert_equal(X_array.ndim, 1)
    # ensure_2d=True with 1d array
    assert_raise_message(ValueError,
                         'Expected 2D array, got 1D array instead',
                         check_array, [0, 1, 2],
                         ensure_2d=True)
    # ensure_2d=True with scalar array
    assert_raise_message(ValueError,
                         'Expected 2D array, got scalar array instead',
                         check_array,
                         10,
                         ensure_2d=True)
    # don't allow ndim > 2 unless allow_nd is set
    X_ndim = np.arange(8).reshape(2, 2, 2)
    assert_raises(ValueError, check_array, X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    # NOTE: builtin int / float / bool are used instead of the
    # np.int / np.float / np.bool aliases, which were removed in NumPy 1.24.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert_equal(X_checked.dtype, dtype)
        else:
            assert_equal(X_checked.dtype, X.dtype)
        if order == 'C':
            assert_true(X_checked.flags['C_CONTIGUOUS'])
            assert_false(X_checked.flags['F_CONTIGUOUS'])
        elif order == 'F':
            assert_true(X_checked.flags['F_CONTIGUOUS'])
            assert_false(X_checked.flags['C_CONTIGUOUS'])
        if copy:
            assert_false(X is X_checked)
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and X_checked.flags['C_CONTIGUOUS']
                    == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS']
                    == X.flags['F_CONTIGUOUS']):
                assert_true(X is X_checked)

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X,
                                    dtype=dtype,
                                    accept_sparse=accept_sparse,
                                    copy=copy)
        # Only object dtype or dok input may produce a (known) warning.
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            message = str(w[0].message)
            messages = [
                "object dtype is not supported by sparse matrices",
                "Can't check dok sparse matrix for nan or inf."
            ]
            assert_true(message in messages)
        else:
            assert_equal(len(w), 0)
        if dtype is not None:
            assert_equal(X_checked.dtype, dtype)
        else:
            assert_equal(X_checked.dtype, X.dtype)
        if X.format in accept_sparse:
            # no change if allowed
            assert_equal(X.format, X_checked.format)
        else:
            # got converted
            assert_equal(X_checked.format, accept_sparse[0])
        if copy:
            assert_false(X is X_checked)
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and X.format == X_checked.format):
                assert_true(X is X_checked)

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert_true(isinstance(X_dense, np.ndarray))
    # raise on too deep lists
    assert_raises(ValueError, check_array, X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise
    # convert weird stuff to arrays
    X_no_array = NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert_true(isinstance(result, np.ndarray))

    # deprecation warning if string-like array with dtype="numeric"
    expected_warn_regex = r"converted to decimal numbers if dtype='numeric'"
    X_str = [['11', '12'], ['13', 'xx']]
    for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")

    # deprecation warning if byte-like array with dtype="numeric"
    X_bytes = [[b'a', b'b'], [b'c', b'd']]
    for X in [X_bytes, np.array(X_bytes, dtype='V1')]:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")
Example #46
0
def test_base_estimator():
    """Check ``base_estimator_`` for default and explicit base estimators."""
    rng = check_random_state(0)

    # Each case is (factory for the base estimator or None, expected type of
    # the fitted ``base_estimator_``). Passing None exercises the default.
    classification_cases = [
        (None, DecisionTreeClassifier),
        (DecisionTreeClassifier, DecisionTreeClassifier),
        (Perceptron, Perceptron),
    ]
    regression_cases = [
        (None, DecisionTreeRegressor),
        (DecisionTreeRegressor, DecisionTreeRegressor),
        (SVR, SVR),
    ]

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    for make_base, expected_type in classification_cases:
        base = None if make_base is None else make_base()
        ensemble = BaggingClassifier(base, n_jobs=3, random_state=0)
        ensemble = ensemble.fit(X_train, y_train)
        assert_true(isinstance(ensemble.base_estimator_, expected_type))

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)
    for make_base, expected_type in regression_cases:
        base = None if make_base is None else make_base()
        ensemble = BaggingRegressor(base, n_jobs=3, random_state=0)
        ensemble = ensemble.fit(X_train, y_train)
        assert_true(isinstance(ensemble.base_estimator_, expected_type))
Example #47
0
def test_hashing_vectorizer():
    # Check the shape, dtype, value bounds and row normalization of the
    # HashingVectorizer output, first with default parameters and then with
    # bigrams, non-negative values and l1 normalization.
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert_true(np.min(X.data) > -1)
    assert_true(np.min(X.data) < 0)
    assert_true(np.max(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized (every row, not only the first one)
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert_true(ngrams_nnz > token_nnz)
    assert_true(ngrams_nnz < 2 * token_nnz)

    # makes the feature values bounded
    assert_true(np.min(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized (every row, not only the first one)
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)
Example #48
0
    def test_sgd_proba(self):
        # Check SGD.predict_proba

        # Hinge loss does not allow for conditional prob estimate.
        # We cannot use the factory here, because it defines predict_proba
        # anyway.
        clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y)
        assert_false(hasattr(clf, "predict_proba"))
        assert_false(hasattr(clf, "predict_log_proba"))

        # log and modified_huber losses can output probability estimates
        # binary case
        for loss in ["log", "modified_huber"]:
            # Use the loop variable so that both losses are really exercised.
            clf = self.factory(loss=loss, alpha=0.01, n_iter=10)
            clf.fit(X, Y)
            p = clf.predict_proba([3, 2])
            assert_true(p[0, 1] > 0.5)
            p = clf.predict_proba([-1, -1])
            assert_true(p[0, 1] < 0.5)

            p = clf.predict_log_proba([3, 2])
            assert_true(p[0, 1] > p[0, 0])
            p = clf.predict_log_proba([-1, -1])
            assert_true(p[0, 1] < p[0, 0])

        # log loss multiclass probability estimates
        clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2)

        d = clf.decision_function([[.1, -.1], [.3, .2]])
        p = clf.predict_proba([[.1, -.1], [.3, .2]])
        assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
        assert_almost_equal(p[0].sum(), 1)
        assert_true(np.all(p[0] >= 0))

        p = clf.predict_proba([-1, -1])
        d = clf.decision_function([-1, -1])
        assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))

        l = clf.predict_log_proba([3, 2])
        p = clf.predict_proba([3, 2])
        assert_array_almost_equal(np.log(p), l)

        l = clf.predict_log_proba([-1, -1])
        p = clf.predict_proba([-1, -1])
        assert_array_almost_equal(np.log(p), l)

        # Modified Huber multiclass probability estimates; requires a separate
        # test because the hard zero/one probabilities may destroy the
        # ordering present in decision_function output.
        clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10)
        clf.fit(X2, Y2)
        d = clf.decision_function([3, 2])
        p = clf.predict_proba([3, 2])
        if not isinstance(self, SparseSGDClassifierTestCase):
            assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
        else:  # XXX the sparse test gets a different X2 (?)
            assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))

        # the following sample produces decision_function values < -1,
        # which would cause naive normalization to fail (see comment
        # in SGDClassifier.predict_proba)
        x = X.mean(axis=0)
        d = clf.decision_function(x)
        if np.all(d < -1):  # XXX not true in sparse test case (why?)
            p = clf.predict_proba(x)
            assert_array_almost_equal(p[0], [1 / 3.] * 3)
Example #49
0
 def test_parameters(self):
     # Each of these attributes must exist on self.clf and must not be None.
     required_attrs = (
         'decision_scores_',
         'labels_',
         'threshold_',
         '_mu',
         '_sigma',
         'support_',
         'support_vectors_',
         'dual_coef_',
         'intercept_',
     )
     for attr_name in required_attrs:
         assert_true(
             hasattr(self.clf, attr_name)
             and getattr(self.clf, attr_name) is not None)
def test_input_estimator_unchanged():
    # After fitting, the ``estimator`` attribute of SelectFromModel must
    # still be the very object that was passed in.
    original_estimator = RandomForestClassifier()
    selector = SelectFromModel(estimator=original_estimator)
    selector.fit(data, y)
    assert_true(selector.estimator is original_estimator)
def test_make_imbalance_5():
    # With ratio=0.01 and min_c_=0 the result is expected to hold 500 samples
    # of class 1 and 5 of class 0, every row being taken from X.
    X_res, y_res = make_imbalance(X, Y, ratio=0.01, min_c_=0)
    class_counts = Counter(y_res)
    assert_equal(class_counts[1], 500)
    assert_equal(class_counts[0], 5)
    assert_true(np.all([row in X for row in X_res]))
def test_bagging_with_pipeline():
    # Bag a pipeline and check that the last step of the first fitted member
    # ends up with an integer random_state.
    pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)
    final_step = estimator[0].steps[-1][1]
    assert_true(isinstance(final_step.random_state, int))
Example #53
0
def test_check_cv_return_types():
    # Check which cross-validation class _check_cv returns for each kind of
    # target.
    X = np.ones((9, 2))
    assert_true(isinstance(cval._check_cv(3, X, classifier=False),
                           cval.KFold))

    # Binary and multiclass targets with classifier=True: StratifiedKFold.
    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    for target in (y_binary, y_multiclass):
        cv = cval._check_cv(3, X, target, classifier=True)
        assert_true(isinstance(cv, cval.StratifiedKFold))

    # Multilabel and multioutput targets: plain KFold.
    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]

    with warnings.catch_warnings(record=True):
        # deprecated sequence of sequence format
        cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    for target in (y_indicator_matrix, y_multioutput):
        cv = cval._check_cv(3, X, target, classifier=True)
        assert_true(isinstance(cv, cval.KFold))
Example #54
0
def check_classifiers_train(name, Classifier):
    # Generic training checks for a classifier, run on a 3-class blobs
    # problem and on the binary problem derived from it.
    #
    # name : name of the estimator, used to special-case the naive Bayes
    #     variants listed below.
    # Classifier : estimator class, instantiated without arguments.
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name in ['BernoulliNB', 'MultinomialNB']:
            # shift the data so that its minimum is zero for these estimators
            X -= X.min()
        set_fast_parameters(classifier)
        set_random_state(classifier)
        # raises error on malformed input for fit
        assert_raises(ValueError, classifier.fit, X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert_true(hasattr(classifier, "classes_"))
        y_pred = classifier.predict(X)
        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        if name not in ['BernoulliNB', 'MultinomialNB']:
            assert_greater(accuracy_score(y, y_pred), 0.85)

        # raises error on malformed input for predict
        assert_raises(ValueError, classifier.predict, X.T)
        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                # Compare by value: identity of small ints is an
                # implementation detail.
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples, ))
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                if (n_classes == 3
                        and not isinstance(classifier, BaseLibSVM)):
                    # 1on1 of LibSVM works differently
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                assert_raises(ValueError, classifier.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            # raises error on malformed input for predict_proba
            assert_raises(ValueError, classifier.predict_proba, X.T)
Example #55
0
def test_space_api():
    # Exercise the Space API: dimension inference, sampling, the
    # transform / inverse_transform round trip, and the reported bounds.
    space = Space([(0.0, 1.0), (-5, 5), ("a", "b", "c"),
                   (1.0, 5.0, "log-uniform"), ("e", "f")])

    # Both of these dimensions are expected to be treated as categorical.
    cat_space = Space([(1, "r"), (1.0, "r")])
    for idx in (0, 1):
        assert isinstance(cat_space.dimensions[idx], Categorical)

    assert_equal(len(space.dimensions), 5)
    expected_dim_types = [Real, Integer, Categorical, Real, Categorical]
    for idx, dim_type in enumerate(expected_dim_types):
        assert_true(isinstance(space.dimensions[idx], dim_type))

    samples = space.rvs(n_samples=10, random_state=0)
    assert_equal(len(samples), 10)
    assert_equal(len(samples[0]), 5)

    assert_true(isinstance(samples, list))
    for n in range(4):
        assert_true(isinstance(samples[n], list))

    # Expected Python type of each coordinate of a sampled point.
    expected_value_types = [numbers.Real, numbers.Integral, str,
                            numbers.Real, str]
    for idx, value_type in enumerate(expected_value_types):
        assert_true(isinstance(samples[0][idx], value_type))

    samples_transformed = space.transform(samples)
    assert_equal(samples_transformed.shape[0], len(samples))
    assert_equal(samples_transformed.shape[1], 1 + 1 + 3 + 1 + 1)

    # our space contains mixed types, this means we can't use
    # `array_allclose` or similar to check points are close after a round-trip
    # of transformations
    round_tripped = space.inverse_transform(samples_transformed)
    for orig, round_trip in zip(samples, round_tripped):
        assert space.distance(orig, round_trip) < 1.e-8

    # The value types must also survive the round trip.
    samples = space.inverse_transform(samples_transformed)
    for idx, value_type in enumerate(expected_value_types):
        assert_true(isinstance(samples[0][idx], value_type))

    expected_bounds = [(0.0, 1.0), (-5, 5),
                       np.asarray(["a", "b", "c"]), (1.0, 5.0),
                       np.asarray(["e", "f"])]
    for b1, b2 in zip(space.bounds, expected_bounds):
        assert_array_equal(b1, b2)

    expected_transformed_bounds = [
        (0.0, 1.0), (-5, 5), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0),
        (np.log10(1.0), np.log10(5.0)), (0.0, 1.0)]
    for b1, b2 in zip(space.transformed_bounds, expected_transformed_bounds):
        assert_array_equal(b1, b2)
Example #56
0
def test_permutation_score():
    # Check permutation_test_score on iris: plain accuracy, with constant
    # labels, with sparse input, with a custom scorer and with random targets.
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    clf = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        clf, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    # Passing an all-ones labels array must give the same score and p-value.
    score_label, _, pvalue_label = cval.permutation_test_score(
        clf, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    clf_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        clf_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        # (number of matches - number of mismatches) / number of samples
        n_agree = (y_true == y_pred).sum()
        n_differ = (y_true != y_pred).sum()
        return (n_agree - n_differ) / y_true.shape[0]

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        clf, X, y, n_permutations=100, scoring=scorer, cv=cv,
        random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        clf, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
Example #57
0
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # the first feature is constant and equal to zero

    # Expected per-feature statistics after scaling along axis 0; the
    # constant first feature is expected to keep a zero standard deviation.
    zero_col_means = [0.0] * 5
    expected_col_stds = [0., 1., 1., 1., 1.]

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), zero_col_means)
    assert_array_almost_equal(X_scaled.std(axis=0), expected_col_stds)
    # copy=True: the result must be a different object from X
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    # Scaling along axis 1 with the scale() function, centering only.
    zero_row_means = [0.0] * 4
    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), zero_row_means)
    # Same, with unit-variance scaling as well.
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), zero_row_means)
    assert_array_almost_equal(X_scaled.std(axis=1), [1.0] * 4)
    # scale() must return a new object
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), zero_col_means)
    assert_array_almost_equal(X_scaled.std(axis=0), expected_col_stds)
    # copy=False: the result must be X itself
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # the first feature is constant but non-zero
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), zero_col_means)
    assert_array_almost_equal(X_scaled.std(axis=0), expected_col_stds)
    # copy=True: the result must be a different object from X
    assert_true(X_scaled is not X)
Example #58
0
 def test_parameters(self):
     # base_estimators must exist on the estimator and must not be None.
     attr_name = 'base_estimators'
     assert_true(
         hasattr(self.estimator, attr_name)
         and getattr(self.estimator, attr_name) is not None)
Example #59
0
def test_add_dummy_feature_csr():
    # add_dummy_feature on CSR input: the result must still be CSR and must
    # have a leading column of ones.
    original = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]])
    augmented = add_dummy_feature(original)
    assert_true(sparse.isspmatrix_csr(augmented), augmented)
    expected = [[1, 1, 0], [1, 0, 1], [1, 0, 1]]
    assert_array_equal(augmented.toarray(), expected)
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure

    Data is drawn from a linear latent-variable model with per-feature noise.
    The test then checks parameter validation, agreement between the two SVD
    methods, the emitted warnings, and that get_covariance and get_precision
    are inverses of each other.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    # An unknown svd_method must raise ValueError, both when passed to the
    # constructor and when set on the instance before calling fit.
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    # Keep one fitted model per SVD method so they can be compared below.
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        # The last recorded log-likelihood must equal the sum of the
        # per-sample scores, and score() must equal their mean.
        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        # NOTE(review): np.all(np.diff(...)) is True when every step is
        # non-zero, so this checks that the log-likelihood changed at each
        # iteration, not that it increased. Confirm whether
        # ``np.diff(fa.loglike_) > 0`` was intended.
        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood dif not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        # noise_variance_init has n_features entries, so fitting on only two
        # columns must raise ValueError. This rebinds ``fa``; the fitted
        # models remain available through ``fas``.
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    # Both SVD methods must agree on these attributes up to sign.
    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))
    with warnings.catch_warnings(record=True) as w:
        # With max_iter = 1 the fit is expected to emit a ConvergenceWarning.
        warnings.simplefilter('always', ConvergenceWarning)
        fa1.max_iter = 1
        fa1.verbose = True
        fa1.fit(X)
        assert_true(w[-1].category == ConvergenceWarning)

        # Passing ``verbose`` to the constructor is expected to emit a
        # DeprecationWarning.
        warnings.simplefilter('always', DeprecationWarning)
        FactorAnalysis(verbose=1)
        assert_true(w[-1].category == DeprecationWarning)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    # ``fa`` is the last estimator created in the loop above, the one built
    # with noise_variance_init.
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        # covariance times precision must be the identity (12 decimals)
        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]),
                                  12)