Example #1
def test_selective_pca():
    original = X
    cols = [original.columns[0]]  # Only perform PCA on the first column
    compare_cols = np.array(original[[
        'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'
    ]].values)  # these columns should come through the transform unchanged

    transformer = SelectivePCA(cols=cols, n_components=0.85).fit(original)
    transformed = transformer.transform(original)

    untouched_cols = np.array(transformed[[
        'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'
    ]].values)
    assert_array_almost_equal(compare_cols, untouched_cols)
    assert 'PC1' in transformed.columns
    assert transformed.shape[1] == 4
    assert isinstance(transformer.get_decomposition(), PCA)
    assert SelectivePCA().get_decomposition() is None

    # test the selective mixin
    assert isinstance(transformer.cols, list)

    # what if we want to weight it?
    pca_df = SelectivePCA(weight=True, n_components=0.99,
                          as_df=False).fit_transform(original)
    pca_arr = SelectivePCA(weight=True, n_components=0.99,
                           as_df=False).fit_transform(iris.data)
    assert_array_equal(pca_df, pca_arr)

    # assert the weighted and unweighted results differ
    pca_arr = SelectivePCA(weight=False, n_components=0.99,
                           as_df=False).fit_transform(iris.data)
    assert_fails(assert_array_equal, AssertionError, pca_df, pca_arr)
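
Note: assert_fails is used throughout these examples but never shown. A minimal sketch of such a helper, assuming it simply checks that the call raises the given exception (an illustration, not skutil's actual implementation):

def assert_fails(fun, expected=Exception, *args, **kwargs):
    # call fun(*args, **kwargs) and assert that it raises `expected`
    # (a single exception class or a tuple of classes)
    try:
        fun(*args, **kwargs)
    except expected:
        return  # raised as expected
    raise AssertionError('%r did not raise %r' % (fun, expected))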
Example #2
def test_smote_error():
    x = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 4]])

    df = pd.DataFrame.from_records(data=x, columns=['a', 'b', 'c'])

    # this fails because SMOTE can't be performed on a class with a single observation (class '4' here)
    assert_fails(SMOTEClassBalancer(y='c', ratio=1.0).balance, ValueError, df)
Example #3
def test_act_stats():
    pred = [0.0, 1.0, 1.5]
    loss = [0.5, 0.5, 1.0]
    expo = [1.0, 0.5, 1.0]

    a = GainsStatisticalReport().fit_fold(pred=pred, expo=expo, loss=loss)
    # now see if we can get one to fail...
    assert_fails(a.fit_fold, TypeError, **{'pred': pred, 'expo': expo, 'loss': loss, 'prem': 12})

    # this one will work:
    a.fit_fold(pred=pred, expo=expo, loss=loss, prem=[1.0, 1.0, 1.0])

    # initializing with a bad 'score_by' will fail
    assert_fails(GainsStatisticalReport, ValueError, **{'score_by': 'accuracy'})

    # purposefully set n_folds and not set n_iter
    assert_fails(GainsStatisticalReport, ValueError, **{'n_folds': 10})

    # purposefully set wrong error_behavior
    assert_fails(GainsStatisticalReport(error_behavior='').fit_fold,
                 ValueError,
                 **{'pred': pred, 'expo': expo, 'loss': loss})

    # purposefully set n_folds so that n_obs is not divisible by n_folds and n_iter
    assert_fails(GainsStatisticalReport(n_folds=121, n_iter=111).as_data_frame, ValueError)

    # set iid to false
    GainsStatisticalReport(iid=False).as_data_frame()

    # assert this is two in length...
    d = a.as_data_frame()
    assert d.shape[0] == 2
Example #4
def test_function_mapper():
    Y = np.array([['USA', 'RED', 'a'],
                  ['MEX', 'GRN', 'b'],
                  ['FRA', 'RED', 'b']])
    y = pd.DataFrame.from_records(data=Y, columns=['A', 'B', 'C'])
    # Tack on a pseudo-numeric col
    y['D'] = np.array(['$5,000', '$6,000', '$7'])
    y['E'] = np.array(['8%', '52%', '0.3%'])

    def fun(i):
        return i.replace(r'[\$,%]', '', regex=True).astype(float)

    transformer = FunctionMapper(cols=['D', 'E'], fun=fun).fit(y)
    transformed = transformer.transform(y)
    assert transformed['D'].dtype == float

    # test on all, assert all columns captured
    x = y[['D', 'E']]
    t = FunctionMapper(fun=fun).fit_transform(x)
    assert t['D'].dtype == float and t['E'].dtype == float

    # Try on just one column
    t = FunctionMapper(cols='D', fun=fun).fit_transform(x)
    assert t['D'].dtype == float and t['E'].dtype == object

    # Try with no function (pass-through)
    assert x.equals(FunctionMapper().fit_transform(x))

    # Test on non-function
    assert_fails(FunctionMapper(fun='woo-hoo').fit, ValueError, x)
Example #5
def test_qr():
    # test just the decomp first
    q = QRDecomposition(X)
    aux = q.qraux
    assert_array_almost_equal(
        aux, np.array([1.07056264, 1.0559255, 1.03857984, 1.04672249]))

    # test that we can get the rank
    assert q.get_rank() == 4

    # test that we can get the R matrix and that it's rank 4
    assert q.get_R_rank() == 4

    # next, let's test that we can get the coefficients:
    coef = q.get_coef(X)
    assert_array_almost_equal(
        coef,
        np.array([
            [1.00000000e+00, 1.96618714e-16, -0.00000000e+00, -2.00339858e-16],
            [3.00642915e-16, 1.00000000e+00, -0.00000000e+00, 1.75787325e-16],
            [-4.04768123e-16, 4.83060041e-17, 1.00000000e+00, 4.23545747e-16],
            [-1.19866575e-16, -1.74365433e-17, 1.10216442e-17, 1.00000000e+00]
        ]))

    # ensure dimension error
    assert_fails(q.get_coef, ValueError, X[:140, :])
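
QRDecomposition mirrors R-style QR routines; as an illustration (not skutil's implementation), get_coef amounts to solving the triangular system from a QR factorization, which is why regressing X on itself yields a coefficient matrix that is numerically the 4x4 identity above:

import numpy as np
from scipy.linalg import solve_triangular

def qr_coef(X, y):
    # least-squares coefficients via QR: X = QR  =>  b = solve(R, Q^T y)
    Q, R = np.linalg.qr(X)
    return solve_triangular(R, Q.T.dot(y))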
Example #6
def test_yeo_johnson():
    transformer = YeoJohnsonTransformer().fit(X)  # will fit on all cols

    # Assert transform works...
    transformed = transformer.transform(X)
    assert isinstance(transformed, pd.DataFrame)

    # assert as_df=False yields a numpy array
    assert isinstance(YeoJohnsonTransformer(as_df=False).fit_transform(X), np.ndarray)
    assert transformer.cols is None

    # Test on only one row...
    assert_fails(YeoJohnsonTransformer().fit, ValueError, X.iloc[0])

    # Test it on a random...
    m, n = 1000, 5
    x = np.random.rand(m, n)

    # randomly flip the sign of roughly half the entries
    mask = np.random.rand(m, n) < 0.5
    signs = np.ones((m, n))
    signs[~mask] = -1
    x *= signs

    YeoJohnsonTransformer().fit(x)
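
For reference, scipy (>= 1.2) ships its own Yeo-Johnson implementation, which, like the transformer above, accepts negative inputs (hence the random sign flips in the test). This is only a comparison point, not what YeoJohnsonTransformer uses internally:

import numpy as np
from scipy import stats

x = np.array([-2.0, -1.0, 0.0, 1.0, 2.0, 5.0])
xt, lmbda = stats.yeojohnson(x)  # transformed values and the fitted lambda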
Example #7
def test_bytes():
    # assert works for DF
    df_memory_estimate(X_no_targ)
    # assert fails for bad str
    assert_fails(df_memory_estimate, ValueError, **{
        'X': X_no_targ,
        'unit': 'pb'
    })
Example #8
def test_interactions():
    x_dict = {
        'a': [0, 0, 0, 1],
        'b': [1, 0, 0, 1],
        'c': [0, 1, 0, 1],
        'd': [1, 1, 1, 0]
    }

    X_pd = pd.DataFrame.from_dict(x_dict)[['a', 'b', 'c', 'd']]  # ordering

    # try with no cols arg
    trans = InteractionTermTransformer()
    X_trans = trans.fit_transform(X_pd)
    expected_names = ['a', 'b', 'c', 'd', 'a_b_I', 'a_c_I', 'a_d_I', 'b_c_I', 'b_d_I', 'c_d_I']
    assert all([i == j for i, j in zip(X_trans.columns.tolist(), expected_names)])  # assert col names equal
    assert_array_equal(X_trans.values, np.array([
        [0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 1, 0, 1, 0, 0]
    ]))

    # try with a custom function...
    def cust_add(a, b):
        return (a + b).values

    trans = InteractionTermTransformer(interaction_function=cust_add, as_df=False)
    X_trans = trans.fit_transform(X_pd)
    assert_array_equal(X_trans, np.array([
        [0, 1, 0, 1, 1, 0, 1, 1, 2, 1],
        [0, 0, 1, 1, 0, 1, 1, 1, 1, 2],
        [0, 0, 0, 1, 0, 0, 1, 0, 1, 1],
        [1, 1, 1, 0, 2, 2, 1, 2, 1, 1]
    ]))

    # assert fails with a non-function arg
    assert_fails(InteractionTermTransformer(interaction_function='a').fit, TypeError, X_pd)

    # test with just two cols
    trans = InteractionTermTransformer(cols=['a', 'b'])
    X_trans = trans.fit_transform(X_pd)
    expected_names = ['a', 'b', 'c', 'd', 'a_b_I']
    assert all([i == j for i, j in zip(X_trans.columns.tolist(), expected_names)])  # assert col names equal
    assert_array_equal(X_trans.values, np.array([
        [0, 1, 0, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0],
        [1, 1, 1, 0, 1]
    ]))

    # test on only_return_interactions...
    trans = InteractionTermTransformer(cols=['a', 'b'], only_return_interactions=True)
    X_trans = trans.fit_transform(X_pd)
    expected_names = sorted(['a', 'b', 'a_b_I'])
    actual_names = sorted(X_trans.columns.tolist())
    assert all([expected_names[i] == actual_names[i] for i in range(len(expected_names))])
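
Judging from the expected arrays, the default interaction function is an element-wise product of each column pair, with the new column named <a>_<b>_I. A rough pandas sketch of that behavior (an illustration, not the library's code):

from itertools import combinations
import pandas as pd

def pairwise_products(df, cols=None):
    # append an '<a>_<b>_I' product column for every pair of columns
    cols = list(df.columns) if cols is None else list(cols)
    out = df.copy()
    for a, b in combinations(cols, 2):
        out['%s_%s_I' % (a, b)] = df[a] * df[b]
    return out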
Example #9
def test_as_numpy():
    assert_fails(_as_numpy, TypeError, 'blah')
    assert _as_numpy(None) is None

    i = [1, 2, 3]
    x = np.array(i)
    assert_array_equal(x, _as_numpy(x))
    assert_array_equal(np.asarray(i), _as_numpy(i))
    assert_array_equal(_as_numpy(pd.DataFrame.from_records(X)), X)
Example #11
def test_validate_on_non_df():
    x = iris.data
    validate_is_pd(x, None)

    # it will try to create a DF out of a String
    assert_fails(validate_is_pd, TypeError, 'asdf', 'asdf')

    # try on list of list and no cols
    x = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    validate_is_pd(x, None)
Example #12
def test_not_implemented_failure():
    # define anon decomposer
    class AnonDecomposer(_BaseSelectiveDecomposer):
        def __init__(self, cols=None, n_components=None, as_df=True):
            super(AnonDecomposer, self).__init__(cols, n_components, as_df)

        def get_decomposition(self):
            return super(AnonDecomposer, self).get_decomposition()

    assert_fails(AnonDecomposer().get_decomposition, NotImplementedError)
Example #13
def test_superclass_not_implemented():
    # anon balancer
    class AnonBalancer(_BaseBalancer):
        def __init__(self, y=None, ratio=0.2, as_df=True):
            super(AnonBalancer, self).__init__(ratio, y, as_df)

        def balance(self, X):
            return super(AnonBalancer, self).balance(X)

    assert_fails(AnonBalancer().balance, NotImplementedError, X)
Example #15
def test_smote_error():
    x = np.array([
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 4]
    ])

    df = pd.DataFrame.from_records(data=x, columns=['a', 'b', 'c'])

    # this fails because SMOTE can't be performed on a class with a single observation (class '4' here)
    assert_fails(SMOTEClassBalancer(y='c', ratio=1.0).balance, ValueError, df)
Example #16
def test_grid_search_fix():
    df = load_iris_df(shuffle=True, tgt_name='targ')
    y = df.pop("targ")

    pipe = Pipeline([('rf', RandomForestClassifier())])
    pipe2 = Pipeline([('pca', SelectivePCA())])

    hyp = {'rf__n_estimators': [10, 15]}
    hyp2 = {'pca__n_components': [1, 2]}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for iid in [True, False]:
            grid1 = _SK17GridSearchCV(estimator=pipe,
                                      param_grid=hyp,
                                      cv=2,
                                      iid=iid)
            grid1.fit_predict(df, y)

            grid2 = _SK17RandomizedSearchCV(estimator=pipe,
                                            param_distributions=hyp,
                                            cv=2,
                                            n_iter=2,
                                            iid=iid)
            grid2.fit_predict(df, y)

            # coverage
            grid1._estimator_type
            grid1.score(df, y)

        # try with just a transformer
        grid3 = _SK17GridSearchCV(estimator=pipe2, param_grid=hyp2, cv=2)
        X_trans = grid3.fit_transform(df, None)

        # test inverse transform
        grid3.inverse_transform(X_trans)

        # __repr__ coverage
        grid3.grid_scores_[0]

        # test fail with mismatched dims
        assert_fails(grid3.fit, ValueError, X, np.array([1, 2, 3]))

        # test value error on missing scorer_
        sco = grid2.scorer_
        grid2.scorer_ = None
        assert_fails(grid2.score, ValueError, df, y)
        grid2.scorer_ = sco

        # test predict proba
        grid2.predict_proba(df)
        grid2.predict_log_proba(df)
Example #17
    def test_corr():
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            corr_plot(X=X_no_targ, plot_type='cor', corr='precomputed')
            corr_plot(X=X_no_targ, plot_type='cor', corr='not_precomputed')
            corr_plot(X=X_no_targ, plot_type='pair', corr='precomputed')
            corr_plot(X=X_no_targ, plot_type='kde', corr='precomputed')

            assert_fails(corr_plot, ValueError, **{
                'X': X_no_targ,
                'plot_type': 'bad_type'
            })
Example #18
def test_strange_input():
    # test numpy array input with numeric cols
    x = iris.data
    cols = [0, 2]

    SelectiveScaler(cols=cols).fit_transform(x)
    SelectiveScaler(cols=[]).fit_transform(x)

    SelectivePCA(cols=cols).fit_transform(x)
    SelectivePCA(cols=[]).fit_transform(x)

    # test bad input
    assert_fails(validate_is_pd, TypeError, "bad", None)
Example #19
    def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
Example #20
def test_linear_combos():
    lcf = LinearCombinationFilterer().fit(Z)
    assert_array_equal(lcf.drop_, ['C'])

    z = lcf.transform(Z)
    assert_array_equal(z.columns.values, ['A', 'B'])
    assert (z.B == 1).all()

    # test on no linear combos
    lcf = LinearCombinationFilterer(cols=['A', 'B']).fit(Z)
    assert not lcf.drop_
    assert Z.equals(lcf.transform(Z))

    # test too few features
    assert_fails(LinearCombinationFilterer(cols=['A']).fit, ValueError, Z)
Example #21
def test_safe_log_exp():
    assert log(0) == __min_log__
    assert exp(1000000) == __max_exp__

    l_res = log([1, 2, 3])
    e_res = exp([1, 2, 3])
    assert_array_almost_equal(l_res, np.array([0., 0.69314718, 1.09861229]))
    assert_array_almost_equal(e_res,
                              np.array([2.71828183, 7.3890561, 20.08553692]))

    assert isinstance(l_res, np.ndarray)
    assert isinstance(e_res, np.ndarray)

    # try something with no __iter__ attr
    assert_fails(log, ValueError, 'A')
    assert_fails(exp, ValueError, 'A')
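
A minimal sketch of clipped log/exp helpers consistent with these assertions; the two constants are placeholders, and skutil's actual values may differ:

import numpy as np

__min_log__ = -19.0   # assumed floor returned for log(0)
__max_exp__ = 1e19    # assumed ceiling returned for huge exponents

def log(x):
    # np.asarray(..., dtype=float) raises ValueError for non-numeric input;
    # clipping turns log(0) = -inf into the floor value
    return np.maximum(np.log(np.asarray(x, dtype=float)), __min_log__)

def exp(x):
    # clipping turns overflowing exponentials (inf) into the ceiling value
    return np.minimum(np.exp(np.asarray(x, dtype=float)), __max_exp__)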
Example #22
def test_boxcox():
    transformer = BoxCoxTransformer().fit(X)  # Will fit on all cols

    # Assert similar lambdas
    assert_array_almost_equal(sorted(dict_values(transformer.lambda_)),
                              np.array([
                                -0.14475082666963388, 
                                0.26165380763371671, 
                                0.64441777772515185,
                                0.93129521538860016
                              ]))

    # Assert exact shifts
    assert_array_equal(dict_values(transformer.shift_), np.array([0., 0., 0., 0.]))

    # Now subtract out some fixed amt from X, assert we get different values:
    x = X - 10
    transformer = BoxCoxTransformer().fit(x)

    # Assert similar lambdas
    assert_array_almost_equal(sorted(dict_values(transformer.lambda_)),
                              np.array([
                                0.42501980692063013, 
                                0.5928185584100969, 
                                0.59843688208993162, 
                                0.69983717204250795
                              ]))

    # Assert exact shifts
    assert_array_equal(sorted(dict_values(transformer.shift_)), np.array([5.700001, 8.000001, 9.000001, 9.900001]))

    # assert transform works
    transformed = transformer.transform(X)
    assert isinstance(transformed, pd.DataFrame)

    # assert as_df=False yields a numpy array
    assert isinstance(BoxCoxTransformer(as_df=False).fit_transform(X), np.ndarray)

    # test the selective mixin
    assert transformer.cols is None

    # Test on only one row...
    assert_fails(BoxCoxTransformer().fit, ValueError, X.iloc[0])
    assert_fails(BoxCoxTransformer().fit, ValueError, np.random.rand(1, 5))
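
The shift_ values above exist because Box-Cox is only defined for strictly positive data: after subtracting 10, each iris column is shifted just past zero before fitting. A scipy-based sketch of the idea (the exact epsilon is an assumption):

import numpy as np
from scipy import stats

def shifted_boxcox(col):
    # shift non-positive columns slightly above zero, then fit Box-Cox
    shift = max(0.0, -np.min(col) + 1e-6)
    transformed, lmbda = stats.boxcox(np.asarray(col, dtype=float) + shift)
    return transformed, lmbda, shift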
Example #23
def test_conf_matrix():
    a = [0, 1, 0, 1, 1]
    b = [0, 1, 1, 1, 0]

    df, ser = report_confusion_matrix(a, b)
    assert df.iloc[0, 0] == 1
    assert df.iloc[0, 1] == 1
    assert df.iloc[1, 0] == 1
    assert df.iloc[1, 1] == 2
    assert_almost_equal(ser['True Pos. Rate'], 0.666666666667)
    assert_almost_equal(ser['Diagnostic odds ratio'], 2.00000)

    # passing False yields None for the metrics series
    df, ser = report_confusion_matrix(a, b, False)
    assert ser is None

    # assert fails with > 2 classes
    a[0] = 2
    assert_fails(report_confusion_matrix, ValueError, a, b)
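
The expected cell values and rates can be checked by hand (treating the first argument as the actual labels):

import numpy as np

actual = np.array([0, 1, 0, 1, 1])
pred = np.array([0, 1, 1, 1, 0])

tn = np.sum((actual == 0) & (pred == 0))   # 1
fp = np.sum((actual == 0) & (pred == 1))   # 1
fn = np.sum((actual == 1) & (pred == 0))   # 1
tp = np.sum((actual == 1) & (pred == 1))   # 2

tpr = tp / float(tp + fn)                  # 2/3 -> 'True Pos. Rate'
dor = (tp / float(fn)) / (fp / float(tn))  # 2.0 -> 'Diagnostic odds ratio'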
Example #24
def test_sparsity():
    x = np.array([
        [1, 2, 3],
        [1, np.nan, np.nan],
        [1, 2, np.nan]
    ])

    df = pd.DataFrame.from_records(data=x, columns=['a', 'b', 'c'])

    # test at .33 level
    filt = SparseFeatureDropper(threshold=0.3).fit(df)
    assert len(filt.drop_) == 2
    assert all([i in filt.drop_ for i in ('b', 'c')]), 'expected "b" and "c" but got %s' % ', '.join(filt.drop_)
    assert isinstance(filt.drop_, list)

    # test at 2/3 level
    filt = SparseFeatureDropper(threshold=0.6).fit(df)
    assert len(filt.drop_) == 1
    assert 'c' in filt.drop_, 'expected "c" but got %s' % filt.drop_

    # test with a bad value
    assert_fails(SparseFeatureDropper(threshold=1.0).fit, ValueError, df)
    assert_fails(SparseFeatureDropper(threshold=-0.1).fit, ValueError, df)
    assert_fails(SparseFeatureDropper(threshold='a').fit, ValueError, df)

    # only try on the 'a' col
    filt = SparseFeatureDropper(cols=['a']).fit(df)
    assert not filt.drop_
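
Judging from the assertions, SparseFeatureDropper drops any column whose fraction of missing values exceeds the threshold (whether the comparison is strict is an assumption). The expected drops can be reproduced directly in pandas:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1., 1., 1.],
                   'b': [2., np.nan, 2.],
                   'c': [3., np.nan, np.nan]})

nan_ratio = df.isnull().mean()                     # a: 0.0, b: 1/3, c: 2/3
drop = nan_ratio[nan_ratio > 0.3].index.tolist()   # ['b', 'c'] at the 0.3 level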
Example #25
def test_nzv_filterer():
    transformer = NearZeroVarianceFilterer().fit(X)
    assert not transformer.drop_

    z = X.copy()
    z['zeros'] = np.zeros(150)

    transformer = NearZeroVarianceFilterer().fit(z)
    assert len(transformer.drop_) == 1
    assert transformer.drop_[0] == 'zeros'
    assert transformer.transform(z).shape[1] == 4

    # test the selective mixin
    assert transformer.cols is None, 'expected None but got %s' % str(transformer.cols)

    # see what happens if we have a nan or inf in the mix:
    a = pd.DataFrame.from_records(data=np.reshape(np.arange(25), (5, 5)))
    a.iloc[0, 0] = np.inf
    a.iloc[0, 1] = np.nan

    # expect a ValueError
    assert_fails(NearZeroVarianceFilterer().fit, ValueError, a)

    # test with the ratio strategy
    transformer = NearZeroVarianceFilterer(strategy='ratio', threshold=0.1)
    assert_fails(transformer.fit, ValueError, z)  # will fail because thresh must be greater than 1.0

    x = np.array([
        [1, 2, 3],
        [1, 5, 3],
        [1, 2, 4],
        [2, 5, 4]
    ])

    df = pd.DataFrame.from_records(data=x, columns=['a', 'b', 'c'])
    transformer = NearZeroVarianceFilterer(strategy='ratio', threshold=3.0).fit(df)
    assert len(transformer.drop_) == 1
    assert transformer.drop_[0] == 'a'
    assert len(transformer.var_) == 1
    assert transformer.var_['a'] == 3.0
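
The 'ratio' strategy looks like caret's nearZeroVar frequency ratio: the count of the most common value over the count of the second most common. For column 'a' above ([1, 1, 1, 2]) that is 3/1 = 3.0, which is why var_['a'] == 3.0 and 'a' is dropped at threshold 3.0 (whether the cutoff is inclusive is an assumption):

import pandas as pd

def freq_ratio(col):
    # most common count / second most common count; infinite for constants
    counts = pd.Series(col).value_counts()
    return counts.iloc[0] / float(counts.iloc[1]) if len(counts) > 1 else float('inf')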
Example #26
def test_multi_collinearity():
    transformer = MulticollinearityFilterer()

    # Test fit_transform
    x = transformer.fit_transform(X)
    assert x.shape[1] == 3

    col_nms = x.columns
    assert col_nms[0] == 'sepal length (cm)'
    assert col_nms[1] == 'sepal width (cm)'
    assert col_nms[2] == 'petal width (cm)'
    assert len(transformer.drop_) == 1
    assert len(transformer.mean_abs_correlations_) == 1
    print(transformer.correlations_)  # the correlations...

    # test the selective mixin
    assert transformer.cols is None, 'expected None but got %s' % str(transformer.cols)

    # Test fit, then transform
    transformer = MulticollinearityFilterer().fit(X)
    x = transformer.transform(X)
    assert x.shape[1] == 3

    col_nms = x.columns
    assert col_nms[0] == 'sepal length (cm)'
    assert col_nms[1] == 'sepal width (cm)'
    assert col_nms[2] == 'petal width (cm)'
    assert len(transformer.drop_) == 1

    # Check as_df false
    transformer.as_df = False
    assert isinstance(transformer.transform(X), np.ndarray)

    # check 1.0
    transformer = MulticollinearityFilterer(threshold=1.0).fit(X)
    assert not transformer.drop_

    # make sure non-square will fail
    assert_fails(filter_collinearity, ValueError, pd.DataFrame.from_records(np.ones((3, 2))), 0.6)
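
A caret-style findCorrelation pass is consistent with these assertions: for every pair correlated above the threshold, drop the member with the larger mean absolute correlation. On iris at the default threshold of 0.85 that removes 'petal length (cm)', leaving the three columns asserted above. This is an illustrative sketch, not skutil's exact algorithm:

import numpy as np

def drop_collinear(df, threshold=0.85):
    # absolute correlation matrix with a zeroed diagonal
    c = df.corr().abs()
    np.fill_diagonal(c.values, 0.0)
    drop = set()
    for i in c.columns:
        for j in c.columns:
            if i == j or i in drop or j in drop or c.loc[i, j] <= threshold:
                continue
            # drop whichever member is, on average, more correlated with the rest
            drop.add(i if c[i].mean() >= c[j].mean() else j)
    return [col for col in df.columns if col not in drop]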
Example #27
def test_qr():
    # test just the decomp first
    q = QRDecomposition(X)
    aux = q.qraux
    assert_array_almost_equal(aux, np.array([1.07056264,  1.0559255,   1.03857984,  1.04672249]))

    # test that we can get the rank
    assert q.get_rank() == 4

    # test that we can get the R matrix and that it's rank 4
    assert q.get_R_rank() == 4

    # next, let's test that we can get the coefficients:
    coef = q.get_coef(X)
    assert_array_almost_equal(coef, np.array(
        [[  1.00000000e+00,   1.96618714e-16,  -0.00000000e+00,  -2.00339858e-16],
         [  3.00642915e-16,   1.00000000e+00,  -0.00000000e+00,   1.75787325e-16],
         [ -4.04768123e-16,   4.83060041e-17,   1.00000000e+00,   4.23545747e-16],
         [ -1.19866575e-16,  -1.74365433e-17,   1.10216442e-17,   1.00000000e+00]]
    ))

    # ensure dimension error
    assert_fails(q.get_coef, ValueError, X[:140, :])
Example #28
def test_pd_stats():
    Y = load_iris_df()

    # add a float copy of species
    Y['species_float'] = Y.Species.astype('float')

    # add an object col
    Y['species_factor'] = [
        'a' if i == 0 else 'b' if i == 1 else 'c' for i in Y.Species
    ]

    # test with all
    stats = pd_stats(Y, col_type='all')
    assert all([nm in stats.columns for nm in Y.columns])
    assert stats['species_float']['dtype'].startswith(
        'int')  # we assert it's considered an int

    # test with numerics
    stats = pd_stats(Y, col_type='numeric')
    assert 'species_factor' not in stats.columns
    assert stats.shape[1] == (Y.shape[1] - 1)

    # test with object
    stats = pd_stats(Y, col_type='object')
    assert 'species_factor' in stats.columns
    assert stats.shape[1] == 1

    # add a feature with a single value; assert the min:max class ratio is the NA string ('--')
    Y['constant'] = np.zeros(Y.shape[0])
    stats = pd_stats(Y, col_type='all')
    assert all([nm in stats.columns for nm in Y.columns])
    assert stats['constant']['dtype'].startswith(
        'int')  # we assert it's considered an int
    assert stats.loc['min_max_class_ratio']['constant'] == '--'

    # test with bad col_type
    assert_fails(pd_stats, ValueError, Y, 'bad_type')
Example #29
def test_fixes():
    assert _validate_y(None) is None
    assert_fails(_validate_y, ValueError, X)  # dim 1 is greater than 1

    # try with one column
    X_copy = X.copy().pop(X.columns[0])  # copy and get first column
    assert isinstance(_validate_y(X_copy), np.ndarray)
    assert isinstance(_validate_y(np.array([1, 2, 3])),
                      np.ndarray)  # return the np.ndarray

    # Testing param grid
    param_grid = {'a': np.ones((3, 3))}

    # fails because value has more than 1 dim
    assert_fails(_check_param_grid, ValueError, param_grid)

    # test param grid with a dictionary as the value
    param_grid2 = {'a': {'a': 1}}

    # fails because v must be a tuple, list or np.ndarray
    assert_fails(_check_param_grid, ValueError, param_grid2)

    # fails because v is len 0
    assert_fails(_check_param_grid, ValueError, {'a': []})
Example #30
    def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0],
                              n_folds=3,
                              shuffle=True,
                              random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([('scaler', SelectiveScaler()),
                         ('pca', SelectivePCA(weight=True)),
                         ('rf', RandomForestClassifier(random_state=42))])

        # define hyper parameters
        hp = {
            'scaler__scaler':
            [StandardScaler(),
             RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe,
                                  hp,
                                  n_iter=2,
                                  scoring='accuracy',
                                  n_jobs=1,
                                  cv=custom_cv,
                                  random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train,
                     y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{
            'random_search': grid,
            'percentile': 0.0
        })
        assert_fails(report_grid_score_detail, ValueError, **{
            'random_search': grid,
            'percentile': 1.0
        })

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{
            'random_search': grid,
            'y_axis': 'bad_axis'
        })

        # assert passes otherwise
        report_grid_score_detail(
            grid, charts=True, percentile=0.95)  # just ensure percentile works
Example #31
def test_regular_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer',      FeatureRetainer()),  # will retain all
        ('dropper',       FeatureDropper()),  # won't drop any
        ('mapper',        FunctionMapper()),  # pass through
        ('encoder',       OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity',  MulticollinearityFilterer(threshold=0.85)),
        ('imputer',       SelectiveImputer()),  # pass through since no missing
        ('scaler',        SelectiveScaler()),
        ('boxcox',        BoxCoxTransformer()),
        ('nzv',           NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca',           SelectivePCA(n_components=0.9)),
        ('model',         RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search (exhaustively, so for the test, just do one of each)
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method':    ['spearman'],
        'scaler__scaler':          [RobustScaler()],
        'pca__n_components':       [0.95],
        'pca__whiten':             [True],
        'model__n_estimators':     [5],
        'model__max_depth':        [5],
        'model__min_samples_leaf': [8],
        'model__max_features':     [0.75],
        'model__max_leaf_nodes':   [20]
    }

    # define the gridsearch
    search = GridSearchCV(pipe, hp,
                          scoring='accuracy',
                          cv=custom_cv,
                          verbose=1)

    # fit the search
    search.fit(X_train, y_train)
    # search.score(X_train, y_train) # throws a warning...
    search.predict(X_train)
    search.predict_proba(X_train)
    search.predict_log_proba(X_train)

    # this poses an issue: the models are trained on X as an
    # array, and selecting the best estimator causes the retained
    # names to go away. We need to find a way to force the best_estimator_
    # to retain the names on which to start the training process.
    # search.best_estimator_.predict(X_train)

    # test the report
    report_grid_score_detail(search, charts=False)

    # test with invalid X and ys
    assert_fails(search.fit, Exception, X_train, None)

    # fit the search with a series
    search.fit(X_train, pd.Series(y_train))

    # fit the search with a DF as y
    search.fit(X_train, pd.DataFrame(pd.Series(y_train)))

    # test with invalid X and ys
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        assert_fails(search.fit, Exception, X_train, pd.DataFrame([pd.Series(y_train), pd.Series(y_train)]))

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
Example #32
def test_selective_imputer():
    a = pd.DataFrame.from_records(
        [[1, 2, 3], [np.nan, 2, 2], [2, np.nan, np.nan]],
        columns=['a', 'b', 'c'])

    # first, use an int
    imputer = SelectiveImputer(fill=-1)
    y = imputer.fit_transform(a)
    assert imputer.fills_ == -1
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([x == -1 for x in (y.iloc[1, 0], y.iloc[2, 1], y.iloc[2, 2])])

    # now try with a string...
    imputer = SelectiveImputer(fill='mode')
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all(
        [y.iloc[1, 0] in (1, 2), y.iloc[2, 1] == 2, y.iloc[2, 2] in (3, 2)])

    # now try with a string...
    imputer = SelectiveImputer(fill='mean')
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 2.0, y.iloc[2, 2] == 2.5])

    # now try with a string...
    imputer = SelectiveImputer(fill='median')
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 2, y.iloc[2, 2] == 2.5])

    # now test with an iterable
    imputer = SelectiveImputer(fill=[5, 6, 7])
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 5, y.iloc[2, 1] == 6, y.iloc[2, 2] == 7])

    # test with a mixed iterable
    imputer = SelectiveImputer(fill=[5, 'mode', 'mean'])
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 5, y.iloc[2, 1] == 2, y.iloc[2, 2] == 2.5])

    # test with a mixed iterable -- again
    imputer = SelectiveImputer(fill=['median', 3, 'mean'])
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 3, y.iloc[2, 2] == 2.5])

    # test with a dict
    imputer = SelectiveImputer(fill={'a': 'median', 'b': 3, 'c': 'mean'})
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 3, y.iloc[2, 2] == 2.5])

    # test failures now...
    assert_fails(SelectiveImputer(fill='blah').fit, TypeError, a)
    assert_fails(SelectiveImputer(fill=[1, 2]).fit, ValueError, a)
    assert_fails(SelectiveImputer(fill=['a', 'b', 'c']).fit, TypeError, a)
    assert_fails(SelectiveImputer(fill='a').fit, TypeError, a)
    assert_fails(SelectiveImputer(fill=[1, 2, 'a']).fit, TypeError, a)

    # generate anonymous class for test...
    class SomeObject(object):
        def __init__(self):
            pass

    assert_fails(SelectiveImputer(fill=SomeObject()).fit, TypeError, a)
Example #33
def test_regular_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder',
         OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since no missing
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search (exhaustively, so for the test, just do one of each)
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method': ['spearman'],
        'scaler__scaler': [RobustScaler()],
        'pca__n_components': [0.95],
        'pca__whiten': [True],
        'model__n_estimators': [5],
        'model__max_depth': [5],
        'model__min_samples_leaf': [8],
        'model__max_features': [0.75],
        'model__max_leaf_nodes': [20]
    }

    # define the gridsearch
    search = GridSearchCV(pipe,
                          hp,
                          scoring='accuracy',
                          cv=custom_cv,
                          verbose=1)

    # fit the search
    search.fit(X_train, y_train)
    # search.score(X_train, y_train) # throws a warning...
    search.predict(X_train)
    search.predict_proba(X_train)
    search.predict_log_proba(X_train)

    # this poses an issue: the models are trained on X as an
    # array, and selecting the best estimator causes the retained
    # names to go away. We need to find a way to force the best_estimator_
    # to retain the names on which to start the training process.
    # search.best_estimator_.predict(X_train)

    # test the report
    report_grid_score_detail(search, charts=False)

    # test with invalid X and ys
    assert_fails(search.fit, Exception, X_train, None)

    # fit the search with a series
    search.fit(X_train, pd.Series(y_train))

    # fit the search with a DF as y
    search.fit(X_train, pd.DataFrame(pd.Series(y_train)))

    # test with invalid X and ys
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        assert_fails(search.fit, Exception, X_train,
                     pd.DataFrame([pd.Series(y_train),
                                   pd.Series(y_train)]))

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
Example #34
def test_selective_imputer():
    a = pd.DataFrame.from_records([
        [1, 2, 3],
        [np.nan, 2, 2],
        [2, np.nan, np.nan]
    ], columns=['a', 'b', 'c'])

    # first, use an int
    imputer = SelectiveImputer(fill=-1)
    y = imputer.fit_transform(a)
    assert imputer.fills_ == -1
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([x == -1 for x in (y.iloc[1, 0], y.iloc[2, 1], y.iloc[2, 2])])

    # now try with a string...
    imputer = SelectiveImputer(fill='mode')
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] in (1, 2), y.iloc[2, 1] == 2, y.iloc[2, 2] in (3, 2)])

    # now try with a string...
    imputer = SelectiveImputer(fill='mean')
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 2.0, y.iloc[2, 2] == 2.5])

    # now try with a string...
    imputer = SelectiveImputer(fill='median')
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 2, y.iloc[2, 2] == 2.5])

    # now test with an iterable
    imputer = SelectiveImputer(fill=[5, 6, 7])
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 5, y.iloc[2, 1] == 6, y.iloc[2, 2] == 7])

    # test with a mixed iterable
    imputer = SelectiveImputer(fill=[5, 'mode', 'mean'])
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 5, y.iloc[2, 1] == 2, y.iloc[2, 2] == 2.5])

    # test with a mixed iterable -- again
    imputer = SelectiveImputer(fill=['median', 3, 'mean'])
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 3, y.iloc[2, 2] == 2.5])

    # test with a dict
    imputer = SelectiveImputer(fill={'a': 'median', 'b': 3, 'c': 'mean'})
    y = imputer.fit_transform(a)
    assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
    assert all([y.iloc[1, 0] == 1.5, y.iloc[2, 1] == 3, y.iloc[2, 2] == 2.5])

    # test failures now...
    assert_fails(SelectiveImputer(fill='blah').fit, TypeError, a)
    assert_fails(SelectiveImputer(fill=[1, 2]).fit, ValueError, a)
    assert_fails(SelectiveImputer(fill=['a', 'b', 'c']).fit, TypeError, a)
    assert_fails(SelectiveImputer(fill='a').fit, TypeError, a)
    assert_fails(SelectiveImputer(fill=[1, 2, 'a']).fit, TypeError, a)

    # generate anonymous class for test...
    class SomeObject(object):
        def __init__(self):
            pass

    assert_fails(SelectiveImputer(fill=SomeObject()).fit, TypeError, a)