def test_selective_pca():
    """Check that SelectivePCA only decomposes the requested columns.

    Fits the transformer on the first column only and verifies that the
    remaining three columns come through unchanged, that a ``PC1`` column is
    produced, and that the weighted and unweighted variants yield different
    output.
    """
    original = X
    cols = [original.columns[0]]  # only perform on the first column
    untouched_names = ['sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

    # These columns should be the same before and after the transform.
    # ``.values`` is used instead of ``DataFrame.as_matrix()``, which newer
    # pandas releases no longer provide; the result is the same ndarray.
    compare_cols = np.array(original[untouched_names].values)

    transformer = SelectivePCA(cols=cols, n_components=0.85).fit(original)
    transformed = transformer.transform(original)

    untouched_cols = np.array(transformed[untouched_names].values)
    assert_array_almost_equal(compare_cols, untouched_cols)
    assert 'PC1' in transformed.columns
    assert transformed.shape[1] == 4
    assert isinstance(transformer.get_decomposition(), PCA)
    assert SelectivePCA().get_decomposition() is None

    # test the selective mixin
    assert isinstance(transformer.cols, list)

    # what if we want to weight it? DataFrame and ndarray input must agree
    pca_df = SelectivePCA(weight=True, n_components=0.99, as_df=False).fit_transform(original)
    pca_arr = SelectivePCA(weight=True, n_components=0.99, as_df=False).fit_transform(iris.data)
    assert_array_equal(pca_df, pca_arr)

    # hack to assert the weighted and unweighted results are NOT equal
    pca_arr = SelectivePCA(weight=False, n_components=0.99, as_df=False).fit_transform(iris.data)
    assert_fails(assert_array_equal, AssertionError, pca_df, pca_arr)
def test_smote_error():
    """SMOTE balancing must raise a ValueError for a single-observation class."""
    records = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 4]])
    frame = pd.DataFrame.from_records(data=records, columns=['a', 'b', 'c'])
    balancer = SMOTEClassBalancer(y='c', ratio=1.0)

    # the value 4 appears only once in the target column 'c', and SMOTE
    # cannot be performed on a single observation
    assert_fails(balancer.balance, ValueError, frame)
def test_act_stats():
    """Exercise GainsStatisticalReport fitting and its argument validation."""
    pred = [0.0, 1.0, 1.5]
    loss = [0.5, 0.5, 1.0]
    expo = [1.0, 0.5, 1.0]

    a = GainsStatisticalReport().fit_fold(pred=pred, expo=expo, loss=loss)

    # a scalar 'prem' should be rejected...
    assert_fails(a.fit_fold, TypeError, pred=pred, expo=expo, loss=loss, prem=12)

    # ...whereas a list 'prem' is accepted
    a.fit_fold(pred=pred, expo=expo, loss=loss, prem=[1.0, 1.0, 1.0])

    # constructing with a bad 'score_by' fails
    assert_fails(GainsStatisticalReport, ValueError, score_by='accuracy')

    # setting n_folds without n_iter fails
    assert_fails(GainsStatisticalReport, ValueError, n_folds=10)

    # an invalid error_behavior only surfaces when fitting a fold
    bad_behavior = GainsStatisticalReport(error_behavior='')
    assert_fails(bad_behavior.fit_fold, ValueError, pred=pred, expo=expo, loss=loss)

    # n_folds/n_iter chosen so the number of observations is not divisible by them
    mismatched = GainsStatisticalReport(n_folds=121, n_iter=111)
    assert_fails(mismatched.as_data_frame, ValueError)

    # iid=False must still be able to produce a frame
    GainsStatisticalReport(iid=False).as_data_frame()

    # two folds were fit on ``a``, so expect two rows
    d = a.as_data_frame()
    assert d.shape[0] == 2
def test_function_mapper():
    """Check FunctionMapper on selected columns, all columns, and bad input."""
    Y = np.array([['USA', 'RED', 'a'], ['MEX', 'GRN', 'b'], ['FRA', 'RED', 'b']])
    y = pd.DataFrame.from_records(data=Y, columns=['A', 'B', 'C'])

    # Tack on pseudo-numeric columns
    y['D'] = np.array(['$5,000', '$6,000', '$7'])
    y['E'] = np.array(['8%', '52%', '0.3%'])

    def fun(i):
        """Strip '$', ',' and '%' characters, then cast to float."""
        # raw string: '\$' is not a valid escape sequence in a normal literal
        # and triggers a warning on modern Python; the pattern is unchanged
        return i.replace(r'[\$,%]', '', regex=True).astype(float)

    transformer = FunctionMapper(cols=['D', 'E'], fun=fun).fit(y)
    transformed = transformer.transform(y)
    assert transformed['D'].dtype == float

    # test on all, assert all columns captured
    x = y[['D', 'E']]
    t = FunctionMapper(fun=fun).fit_transform(x)
    assert t['D'].dtype == float and t['E'].dtype == float

    # Try on just one column
    t = FunctionMapper(cols='D', fun=fun).fit_transform(x)
    assert t['D'].dtype == float and t['E'].dtype == object

    # Try on no function: the frame should be unchanged
    assert x.equals(FunctionMapper().fit_transform(x))

    # Test on non-function
    assert_fails(FunctionMapper(fun='woo-hoo').fit, ValueError, x)
def test_qr():
    """Exercise QRDecomposition: qraux values, rank queries, coefficients, bad shapes."""
    expected_aux = np.array([1.07056264, 1.0559255, 1.03857984, 1.04672249])
    expected_coef = np.array([
        [1.00000000e+00, 1.96618714e-16, -0.00000000e+00, -2.00339858e-16],
        [3.00642915e-16, 1.00000000e+00, -0.00000000e+00, 1.75787325e-16],
        [-4.04768123e-16, 4.83060041e-17, 1.00000000e+00, 4.23545747e-16],
        [-1.19866575e-16, -1.74365433e-17, 1.10216442e-17, 1.00000000e+00],
    ])

    # the decomposition itself
    decomposition = QRDecomposition(X)
    assert_array_almost_equal(decomposition.qraux, expected_aux)

    # rank of the decomposition and of the R matrix
    assert decomposition.get_rank() == 4
    assert decomposition.get_R_rank() == 4

    # coefficients against the reference matrix
    coefficients = decomposition.get_coef(X)
    assert_array_almost_equal(coefficients, expected_coef)

    # fewer rows than the decomposition was built on -> dimension error
    assert_fails(decomposition.get_coef, ValueError, X[:140, :])
def test_yeo_johnson():
    """Check YeoJohnsonTransformer output types, column handling and random input."""
    fitted = YeoJohnsonTransformer().fit(X)  # no cols given

    # transform yields a DataFrame by default...
    assert isinstance(fitted.transform(X), pd.DataFrame)

    # ...and an ndarray when as_df is False
    as_array = YeoJohnsonTransformer(as_df=False).fit_transform(X)
    assert isinstance(as_array, np.ndarray)

    assert fitted.cols is None

    # a single row must be rejected
    assert_fails(YeoJohnsonTransformer().fit, ValueError, X.iloc[0])

    # random matrix whose entries get a random sign
    n_rows, n_cols = 1000, 5
    magnitudes = np.random.rand(n_rows, n_cols)
    # rand() draws from [0, 1), so the modulo leaves the values unchanged
    keep_positive = np.random.rand(n_rows, n_cols) % 2 < 0.5
    signs = np.where(keep_positive, 1.0, -1.0)
    YeoJohnsonTransformer().fit(magnitudes * signs)
def test_bytes():
    """df_memory_estimate accepts a DataFrame and rejects an unknown unit."""
    # works for a DataFrame
    df_memory_estimate(X_no_targ)

    # 'pb' is not an accepted unit string
    assert_fails(df_memory_estimate, ValueError, X=X_no_targ, unit='pb')
def test_interactions():
    """Check InteractionTermTransformer column names and generated values.

    Covers the default (multiplicative) interactions over all columns, a
    custom interaction function, a non-callable interaction function, a
    restricted column set, and ``only_return_interactions``.
    """
    x_dict = {
        'a': [0, 0, 0, 1],
        'b': [1, 0, 0, 1],
        'c': [0, 1, 0, 1],
        'd': [1, 1, 1, 0]
    }
    X_pd = pd.DataFrame.from_dict(x_dict)[['a', 'b', 'c', 'd']]  # fix the ordering

    # try with no cols arg
    trans = InteractionTermTransformer()
    X_trans = trans.fit_transform(X_pd)
    expected_names = ['a', 'b', 'c', 'd', 'a_b_I', 'a_c_I', 'a_d_I', 'b_c_I', 'b_d_I', 'c_d_I']
    # exact list comparison: a zip-based check would silently pass if one
    # side were shorter than the other
    assert X_trans.columns.tolist() == expected_names
    # ``.values`` replaces ``DataFrame.as_matrix()``, which newer pandas
    # releases no longer provide
    assert_array_equal(X_trans.values, np.array([
        [0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 1, 0, 1, 0, 0]
    ]))

    # try with a custom function...
    def cust_add(a, b):
        """Interaction term computed as the element-wise sum."""
        return (a + b).values

    trans = InteractionTermTransformer(interaction_function=cust_add, as_df=False)
    X_trans = trans.fit_transform(X_pd)
    assert_array_equal(X_trans, np.array([
        [0, 1, 0, 1, 1, 0, 1, 1, 2, 1],
        [0, 0, 1, 1, 0, 1, 1, 1, 1, 2],
        [0, 0, 0, 1, 0, 0, 1, 0, 1, 1],
        [1, 1, 1, 0, 2, 2, 1, 2, 1, 1]
    ]))

    # assert fails with a non-function arg
    assert_fails(InteractionTermTransformer(interaction_function='a').fit, TypeError, X_pd)

    # test with just two cols
    trans = InteractionTermTransformer(cols=['a', 'b'])
    X_trans = trans.fit_transform(X_pd)
    expected_names = ['a', 'b', 'c', 'd', 'a_b_I']
    assert X_trans.columns.tolist() == expected_names
    assert_array_equal(X_trans.values, np.array([
        [0, 1, 0, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 0, 1, 0],
        [1, 1, 1, 0, 1]
    ]))

    # test on only_return_interactions (column order is not pinned, so sort)
    trans = InteractionTermTransformer(cols=['a', 'b'], only_return_interactions=True)
    X_trans = trans.fit_transform(X_pd)
    expected_names = sorted(['a', 'b', 'a_b_I'])
    actual_names = sorted(X_trans.columns.tolist())
    assert actual_names == expected_names
def test_as_numpy():
    """_as_numpy handles None, arrays, lists and DataFrames; strings are rejected."""
    # a plain string is not convertible
    assert_fails(_as_numpy, TypeError, 'blah')

    # None passes straight through
    assert _as_numpy(None) is None

    values = [1, 2, 3]
    array = np.array(values)

    # ndarray in, same contents out
    assert_array_equal(array, _as_numpy(array))

    # list in, equivalent ndarray out
    assert_array_equal(np.asarray(values), _as_numpy(values))

    # DataFrame in, contents equal to the source data
    frame = pd.DataFrame.from_records(X)
    assert_array_equal(_as_numpy(frame), X)
def test_as_numpy():
    """Verify _as_numpy conversions for each supported input kind."""
    # unsupported input type
    assert_fails(_as_numpy, TypeError, 'blah')

    # None is returned as-is
    converted_none = _as_numpy(None)
    assert converted_none is None

    as_list = [1, 2, 3]
    as_array = np.array(as_list)

    # array and list inputs both give the same values back
    assert_array_equal(as_array, _as_numpy(as_array))
    assert_array_equal(np.asarray(as_list), _as_numpy(as_list))

    # a DataFrame built from X converts back to X's values
    assert_array_equal(_as_numpy(pd.DataFrame.from_records(X)), X)
def test_validate_on_non_df():
    """validate_is_pd accepts array-likes without column names and rejects strings."""
    # numpy array, no cols
    validate_is_pd(iris.data, None)

    # it will try to create a DF out of a string, which must fail
    assert_fails(validate_is_pd, TypeError, 'asdf', 'asdf')

    # list of lists, no cols
    nested = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    validate_is_pd(nested, None)
def test_not_implemented_failure():
    """The base decomposer's get_decomposition must raise NotImplementedError."""

    class AnonDecomposer(_BaseSelectiveDecomposer):
        """Throwaway subclass that defers everything to the base class."""

        def __init__(self, cols=None, n_components=None, as_df=True):
            super(AnonDecomposer, self).__init__(cols, n_components, as_df)

        def get_decomposition(self):
            # delegate straight to the base implementation under test
            return super(AnonDecomposer, self).get_decomposition()

    decomposer = AnonDecomposer()
    assert_fails(decomposer.get_decomposition, NotImplementedError)
def test_superclass_not_implemented():
    """The base balancer's balance method must raise NotImplementedError."""

    class AnonBalancer(_BaseBalancer):
        """Throwaway subclass that defers everything to the base class."""

        def __init__(self, y=None, ratio=0.2, as_df=True):
            # note: the base constructor is called with ratio before y
            super(AnonBalancer, self).__init__(ratio, y, as_df)

        def balance(self, X):
            # delegate straight to the base implementation under test
            return super(AnonBalancer, self).balance(X)

    balancer = AnonBalancer()
    assert_fails(balancer.balance, NotImplementedError, X)
def test_superclass_not_implemented():
    """Calling balance through to _BaseBalancer is expected to raise."""

    class AnonBalancer(_BaseBalancer):
        """Minimal subclass whose methods only forward to the parent."""

        def __init__(self, y=None, ratio=0.2, as_df=True):
            # positional order expected by the parent here: ratio, y, as_df
            super(AnonBalancer, self).__init__(ratio, y, as_df)

        def balance(self, X):
            # forward to the parent so its behavior is what gets tested
            return super(AnonBalancer, self).balance(X)

    anonymous = AnonBalancer()
    assert_fails(anonymous.balance, NotImplementedError, X)
def test_smote_error():
    """Balancing with SMOTE should fail when a class has a single observation."""
    rows = np.array([
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 4],
    ])
    df = pd.DataFrame.from_records(data=rows, columns=['a', 'b', 'c'])

    # target value 4 occurs exactly once in column 'c'; SMOTE cannot be
    # performed on a single observation, so a ValueError is expected
    smote = SMOTEClassBalancer(y='c', ratio=1.0)
    assert_fails(smote.balance, ValueError, df)
def test_grid_search_fix():
    """Exercise the _SK17 grid/randomized search wrappers for both ``iid`` settings.

    Covers fit_predict, scoring, a transformer-only pipeline (transform and
    inverse transform), mismatched input dimensions, a missing ``scorer_``
    and the probability prediction methods.
    """
    df = load_iris_df(shuffle=True, tgt_name='targ')
    y = df.pop("targ")

    classifier_pipe = Pipeline([('rf', RandomForestClassifier())])
    transformer_pipe = Pipeline([('pca', SelectivePCA())])

    classifier_grid = {'rf__n_estimators': [10, 15]}
    transformer_grid = {'pca__n_components': [1, 2]}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for iid in [True, False]:
            grid1 = _SK17GridSearchCV(estimator=classifier_pipe, param_grid=classifier_grid,
                                      cv=2, iid=iid)
            grid1.fit_predict(df, y)

            grid2 = _SK17RandomizedSearchCV(estimator=classifier_pipe,
                                            param_distributions=classifier_grid,
                                            cv=2, n_iter=2, iid=iid)
            grid2.fit_predict(df, y)

            # attribute access and scoring purely for coverage
            grid1._estimator_type
            grid1.score(df, y)

        # a search over a pipeline that only contains a transformer
        grid3 = _SK17GridSearchCV(estimator=transformer_pipe, param_grid=transformer_grid, cv=2)
        X_trans = grid3.fit_transform(df, None)

        # inverse transform of the transformed data
        grid3.inverse_transform(X_trans)

        # __repr__ coverage
        grid3.grid_scores_[0]

        # mismatched X / y dimensions must fail
        assert_fails(grid3.fit, ValueError, X, np.array([1, 2, 3]))

        # scoring without a scorer_ must fail; restore the scorer afterwards
        saved_scorer = grid2.scorer_
        grid2.scorer_ = None
        assert_fails(grid2.score, ValueError, df, y)
        grid2.scorer_ = saved_scorer

        # probability predictions
        grid2.predict_proba(df)
        grid2.predict_log_proba(df)
def test_corr():
    """corr_plot runs for each supported plot type and rejects an unknown one."""
    # (plot_type, corr) combinations that are expected to succeed
    valid_calls = (
        ('cor', 'precomputed'),
        ('cor', 'not_precomputed'),
        ('pair', 'precomputed'),
        ('kde', 'precomputed'),
    )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for plot_type, corr in valid_calls:
            corr_plot(X=X_no_targ, plot_type=plot_type, corr=corr)

        # an unrecognized plot type must raise
        assert_fails(corr_plot, ValueError, X=X_no_targ, plot_type='bad_type')
def test_strange_input():
    """Selective transformers accept ndarray input with integer or empty ``cols``."""
    x = iris.data
    cols = [0, 2]

    # numpy array input with numeric cols, then with an empty column list
    for transformer_cls in (SelectiveScaler, SelectivePCA):
        for selection in (cols, []):
            transformer_cls(cols=selection).fit_transform(x)

    # test bad input
    assert_fails(validate_is_pd, TypeError, "bad", None)
def test_large_grid():
    """Run a small randomized search over a scaler -> PCA -> random forest pipeline.

    Per the original author's note, the forest is purposely overfit so that
    the test error should far supersede the train error. The accuracies are
    computed for coverage only; nothing is asserted on them. The grid score
    report is then checked for its argument validation.
    """
    # the KFold signature depends on the installed scikit-learn version
    if SK18:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)

    # define the pipe
    steps = [
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
    pipe = Pipeline(steps)

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1,
                              cv=custom_cv, random_state=42)

    # scoring before fitting must fail
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # predictions on both splits; accuracies are computed but not checked
    tr_pred = grid.predict(X_train)
    te_pred = grid.predict(X_test)
    accuracy_score(y_train, tr_pred)
    accuracy_score(y_test, te_pred)

    # grid score reports: bad percentiles are rejected
    for bad_percentile in (0.0, 1.0):
        assert_fails(report_grid_score_detail, ValueError,
                     random_search=grid, percentile=bad_percentile)

    # a bad y_axis is rejected as well
    assert_fails(report_grid_score_detail, ValueError, random_search=grid, y_axis='bad_axis')

    # a valid percentile passes
    report_grid_score_detail(grid, charts=True, percentile=0.95)
def test_linear_combos():
    """LinearCombinationFilterer drops the linearly dependent column of Z."""
    # fitting on every column flags 'C'
    all_cols_filter = LinearCombinationFilterer().fit(Z)
    assert_array_equal(all_cols_filter.drop_, ['C'])

    reduced = all_cols_filter.transform(Z)
    assert_array_equal(reduced.columns.values, ['A', 'B'])
    assert (reduced.B == 1).all()

    # restricted to 'A' and 'B' there are no linear combos: nothing is dropped
    two_cols_filter = LinearCombinationFilterer(cols=['A', 'B']).fit(Z)
    assert not two_cols_filter.drop_
    assert Z.equals(two_cols_filter.transform(Z))

    # a single feature is too few
    single_col_filter = LinearCombinationFilterer(cols=['A'])
    assert_fails(single_col_filter.fit, ValueError, Z)
def test_safe_log_exp():
    """The safe log/exp helpers clamp extreme inputs and vectorize over lists."""
    # extreme inputs map to the module's sentinel bounds
    assert log(0) == __min_log__
    assert exp(1000000) == __max_exp__

    inputs = [1, 2, 3]
    expected_log = np.array([0., 0.69314718, 1.09861229])
    expected_exp = np.array([2.71828183, 7.3890561, 20.08553692])

    l_res = log(inputs)
    e_res = exp(inputs)

    assert_array_almost_equal(l_res, expected_log)
    assert_array_almost_equal(e_res, expected_exp)

    # list input comes back as ndarray
    for result in (l_res, e_res):
        assert isinstance(result, np.ndarray)

    # try something with no __iter__ attr
    for func in (log, exp):
        assert_fails(func, ValueError, 'A')
def test_boxcox():
    """Check BoxCoxTransformer lambdas, shifts, output types and input validation."""
    # --- fit on the raw data (all columns) ---
    raw_fit = BoxCoxTransformer().fit(X)

    raw_lambdas = np.array([
        -0.14475082666963388,
        0.26165380763371671,
        0.64441777772515185,
        0.93129521538860016
    ])
    assert_array_almost_equal(sorted(dict_values(raw_fit.lambda_)), raw_lambdas)

    # exact shifts: all zero for the raw data
    assert_array_equal(dict_values(raw_fit.shift_), np.array([0., 0., 0., 0.]))

    # --- subtract a fixed amount and refit: values should change ---
    shifted_input = X - 10
    transformer = BoxCoxTransformer().fit(shifted_input)

    shifted_lambdas = np.array([
        0.42501980692063013,
        0.5928185584100969,
        0.59843688208993162,
        0.69983717204250795
    ])
    assert_array_almost_equal(sorted(dict_values(transformer.lambda_)), shifted_lambdas)

    # exact shifts for the shifted data
    shifted_shifts = np.array([5.700001, 8.000001, 9.000001, 9.900001])
    assert_array_equal(sorted(dict_values(transformer.shift_)), shifted_shifts)

    # transform yields a DataFrame by default
    assert isinstance(transformer.transform(X), pd.DataFrame)

    # as_df=False yields an array
    as_array = BoxCoxTransformer(as_df=False).fit_transform(X)
    assert isinstance(as_array, np.ndarray)

    # test the selective mixin
    assert transformer.cols is None

    # a single row must be rejected, as a Series and as a 1-row array
    assert_fails(BoxCoxTransformer().fit, ValueError, X.iloc[0])
    assert_fails(BoxCoxTransformer().fit, ValueError, np.random.rand(1, 5))
def test_conf_matrix():
    """report_confusion_matrix returns the expected counts and summary statistics."""
    a = [0, 1, 0, 1, 1]
    b = [0, 1, 1, 1, 0]

    df, ser = report_confusion_matrix(a, b)

    # cell-by-cell check of the 2x2 matrix
    expected_cells = (
        ((0, 0), 1),
        ((0, 1), 1),
        ((1, 0), 1),
        ((1, 1), 2),
    )
    for (row, col), count in expected_cells:
        assert df.iloc[row, col] == count

    assert_almost_equal(ser['True Pos. Rate'], 0.666666666667)
    assert_almost_equal(ser['Diagnostic odds ratio'], 2.00000)

    # passing False yields None in place of the series
    df, ser = report_confusion_matrix(a, b, False)
    assert ser is None

    # more than two classes must fail
    a[0] = 2
    assert_fails(report_confusion_matrix, ValueError, a, b)
def test_sparsity():
    """SparseFeatureDropper drops columns by missing-value share and validates its threshold."""
    values = np.array([
        [1, 2, 3],
        [1, np.nan, np.nan],
        [1, 2, np.nan],
    ])
    df = pd.DataFrame.from_records(data=values, columns=['a', 'b', 'c'])

    # threshold 0.3: 'b' (1 of 3 missing) and 'c' (2 of 3 missing) both go
    low_filter = SparseFeatureDropper(threshold=0.3).fit(df)
    assert len(low_filter.drop_) == 2
    assert all([i in low_filter.drop_ for i in ('b', 'c')]), \
        'expected "b" and "c" but got %s' % ', '.join(low_filter.drop_)
    assert isinstance(low_filter.drop_, list)

    # threshold 0.6: only 'c' goes
    high_filter = SparseFeatureDropper(threshold=0.6).fit(df)
    assert len(high_filter.drop_) == 1
    assert 'c' in high_filter.drop_, 'expected "c" but got %s' % high_filter.drop_

    # bad threshold values are rejected at fit time
    for bad_threshold in (1.0, -0.1, 'a'):
        assert_fails(SparseFeatureDropper(threshold=bad_threshold).fit, ValueError, df)

    # restricted to the fully populated 'a' column, nothing is dropped
    a_only = SparseFeatureDropper(cols=['a']).fit(df)
    assert not a_only.drop_
def test_nzv_filterer():
    """Check NearZeroVarianceFilterer with its default and 'ratio' strategies."""
    # nothing is dropped from X itself
    nzv = NearZeroVarianceFilterer().fit(X)
    assert not nzv.drop_

    # add a constant column; it should be the only one dropped
    z = X.copy()
    z['zeros'] = np.zeros(150)
    transformer = NearZeroVarianceFilterer().fit(z)
    assert len(transformer.drop_) == 1
    assert transformer.drop_[0] == 'zeros'
    assert transformer.transform(z).shape[1] == 4

    # test the selective mixin
    assert transformer.cols is None, 'expected None but got %s' % str(transformer.cols)

    # a nan or inf in the mix is expected to raise a ValueError
    a = pd.DataFrame.from_records(data=np.arange(25).reshape((5, 5)))
    a.iloc[0, 0] = np.inf
    a.iloc[0, 1] = np.nan
    assert_fails(NearZeroVarianceFilterer().fit, ValueError, a)

    # ratio strategy: fails because the threshold must be greater than 1.0
    bad_ratio = NearZeroVarianceFilterer(strategy='ratio', threshold=0.1)
    assert_fails(bad_ratio.fit, ValueError, z)

    # ratio strategy with a valid threshold: only 'a' is dropped
    x = np.array([
        [1, 2, 3],
        [1, 5, 3],
        [1, 2, 4],
        [2, 5, 4],
    ])
    df = pd.DataFrame.from_records(data=x, columns=['a', 'b', 'c'])
    ratio_filter = NearZeroVarianceFilterer(strategy='ratio', threshold=3.0).fit(df)
    assert len(ratio_filter.drop_) == 1
    assert ratio_filter.drop_[0] == 'a'
    assert len(ratio_filter.var_) == 1
    assert ratio_filter.var_['a'] == 3.0
def test_multi_collinearity():
    """MulticollinearityFilterer drops one column of X and keeps the other three."""
    kept_names = ('sepal length (cm)', 'sepal width (cm)', 'petal width (cm)')

    # --- fit_transform in one step ---
    transformer = MulticollinearityFilterer()
    x = transformer.fit_transform(X)
    assert x.shape[1] == 3
    col_nms = x.columns
    for position, name in enumerate(kept_names):
        assert col_nms[position] == name
    assert len(transformer.drop_) == 1
    assert len(transformer.mean_abs_correlations_) == 1
    print(transformer.correlations_)  # the correlations...

    # test the selective mixin
    assert transformer.cols is None, 'expected None but got %s' % str(transformer.cols)

    # --- fit, then transform ---
    transformer = MulticollinearityFilterer().fit(X)
    x = transformer.transform(X)
    assert x.shape[1] == 3
    col_nms = x.columns
    for position, name in enumerate(kept_names):
        assert col_nms[position] == name
    assert len(transformer.drop_) == 1

    # as_df=False yields an ndarray
    transformer.as_df = False
    assert isinstance(transformer.transform(X), np.ndarray)

    # with a threshold of 1.0 nothing is dropped
    transformer = MulticollinearityFilterer(threshold=1.0).fit(X)
    assert not transformer.drop_

    # a non-square input to filter_collinearity must fail
    non_square = pd.DataFrame.from_records(np.ones((3, 2)))
    assert_fails(filter_collinearity, ValueError, non_square, 0.6)
def test_qr():
    """Check QRDecomposition's qraux, ranks, coefficients and dimension validation."""
    qr = QRDecomposition(X)

    # the decomposition's qraux against reference values
    reference_aux = np.array([1.07056264, 1.0559255, 1.03857984, 1.04672249])
    assert_array_almost_equal(qr.qraux, reference_aux)

    # both the decomposition rank and the R-matrix rank are 4
    rank, r_rank = qr.get_rank(), qr.get_R_rank()
    assert rank == 4
    assert r_rank == 4

    # the coefficients against the reference matrix
    reference_coef = np.array([
        [1.00000000e+00, 1.96618714e-16, -0.00000000e+00, -2.00339858e-16],
        [3.00642915e-16, 1.00000000e+00, -0.00000000e+00, 1.75787325e-16],
        [-4.04768123e-16, 4.83060041e-17, 1.00000000e+00, 4.23545747e-16],
        [-1.19866575e-16, -1.74365433e-17, 1.10216442e-17, 1.00000000e+00],
    ])
    assert_array_almost_equal(qr.get_coef(X), reference_coef)

    # ensure dimension error on a truncated input
    assert_fails(qr.get_coef, ValueError, X[:140, :])
def test_pd_stats():
    """pd_stats summarizes columns by type and rejects an unknown ``col_type``."""

    def species_label(code):
        """Map species code 0 -> 'a', 1 -> 'b', anything else -> 'c'."""
        if code == 0:
            return 'a'
        if code == 1:
            return 'b'
        return 'c'

    Y = load_iris_df()

    # a float copy of species and an object-typed copy
    Y['species_float'] = Y.Species.astype('float')
    Y['species_factor'] = [species_label(i) for i in Y.Species]

    # all column types: every column is reported
    stats = pd_stats(Y, col_type='all')
    assert all([nm in stats.columns for nm in Y.columns])
    # the float copy is expected to be reported as an int
    assert stats['species_float']['dtype'].startswith('int')

    # numerics only: the object column is left out
    stats = pd_stats(Y, col_type='numeric')
    assert 'species_factor' not in stats.columns
    assert stats.shape[1] == (Y.shape[1] - 1)

    # objects only: just the factor column
    stats = pd_stats(Y, col_type='object')
    assert 'species_factor' in stats.columns
    assert stats.shape[1] == 1

    # a single-valued feature: its min/max class ratio is the NA string
    Y['constant'] = np.zeros(Y.shape[0])
    stats = pd_stats(Y, col_type='all')
    assert all([nm in stats.columns for nm in Y.columns])
    assert stats['constant']['dtype'].startswith('int')
    assert stats.loc['min_max_class_ratio']['constant'] == '--'

    # test with bad col_type
    assert_fails(pd_stats, ValueError, Y, 'bad_type')
def test_fixes():
    """Check the _validate_y and _check_param_grid helpers."""
    # --- _validate_y ---
    assert _validate_y(None) is None

    # X's second dimension is greater than 1, so it is not a valid y
    assert_fails(_validate_y, ValueError, X)

    # a single column popped off a copy of X
    first_column = X.copy().pop(X.columns[0])
    assert isinstance(_validate_y(first_column), np.ndarray)

    # a plain ndarray is returned as an ndarray
    assert isinstance(_validate_y(np.array([1, 2, 3])), np.ndarray)

    # --- _check_param_grid ---
    invalid_grids = (
        {'a': np.ones((3, 3))},  # value has more than 1 dim
        {'a': {'a': 1}},         # value must be a tuple, list or np.ndarray
        {'a': []},               # value has length 0
    )
    for param_grid in invalid_grids:
        assert_fails(_check_param_grid, ValueError, param_grid)
def test_large_grid():
    """Randomized search over a scaler, weighted PCA and a random forest.

    The original note describes this as purposely overfitting a RandomForest
    so that the test error far supersedes the train error; the train and
    test accuracies are evaluated here but not asserted on. The remainder
    checks report_grid_score_detail's validation of its arguments.
    """
    kfold_options = dict(shuffle=True, random_state=42)
    if not SK18:
        # older API: number of samples plus n_folds
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, **kfold_options)
    else:
        custom_cv = KFold(n_splits=3, **kfold_options)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    scalers = [StandardScaler(), RobustScaler(), MinMaxScaler()]
    hp = {
        'scaler__scaler': scalers,
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(
        pipe, hp,
        n_iter=2, scoring='accuracy', n_jobs=1,
        cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions, then evaluate the scores (results are discarded)
    tr_pred = grid.predict(X_train)
    te_pred = grid.predict(X_test)
    accuracy_score(y_train, tr_pred)
    accuracy_score(y_test, te_pred)

    # grid score reports: each of these keyword sets must be rejected
    rejected_kwargs = (
        {'random_search': grid, 'percentile': 0.0},
        {'random_search': grid, 'percentile': 1.0},
        {'random_search': grid, 'y_axis': 'bad_axis'},
    )
    for kwargs in rejected_kwargs:
        assert_fails(report_grid_score_detail, ValueError, **kwargs)

    # assert passes otherwise -- just ensure percentile works
    report_grid_score_detail(grid, charts=True, percentile=0.95)
def test_regular_grid():
    """Run an exhaustive GridSearchCV over a full preprocessing + forest pipeline.

    Each hyper-parameter has a single candidate so the search stays cheap.
    After fitting, the prediction methods and the score report are exercised,
    followed by a set of valid and invalid ``y`` inputs.
    """
    # build a pipeline
    steps = [
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since no missing
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipe = Pipeline(steps)

    # hyper-parameters to search exhaustively; one value each for the test
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method': ['spearman'],
        'scaler__scaler': [RobustScaler()],
        'pca__n_components': [0.95],
        'pca__whiten': [True],
        'model__n_estimators': [5],
        'model__max_depth': [5],
        'model__min_samples_leaf': [8],
        'model__max_features': [0.75],
        'model__max_leaf_nodes': [20]
    }

    # define and fit the gridsearch
    search = GridSearchCV(pipe, hp, scoring='accuracy', cv=custom_cv, verbose=1)
    search.fit(X_train, y_train)

    # NOTE: search.score(X_train, y_train) is left out because it throws a warning
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(search, method_name)(X_train)

    # NOTE: this poses an issue.. the models are trained on X as an array,
    # and selecting the best estimator causes the retained names to go away.
    # We need to find a way to force the best_estimator_ to retain the names
    # on which to start the training process. Until then,
    # search.best_estimator_.predict(X_train) is not called here.

    # test the report
    report_grid_score_detail(search, charts=False)

    # a missing y is invalid
    assert_fails(search.fit, Exception, X_train, None)

    # y as a Series, then as a single-column DataFrame
    y_series = pd.Series(y_train)
    search.fit(X_train, y_series)
    search.fit(X_train, pd.DataFrame(pd.Series(y_train)))

    # a two-row DataFrame as y is invalid
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        doubled_y = pd.DataFrame([pd.Series(y_train), pd.Series(y_train)])
        assert_fails(search.fit, Exception, X_train, doubled_y)

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
def test_selective_imputer():
    """Check SelectiveImputer fills for scalar, named, iterable and dict strategies.

    The frame has one missing value in 'a' (row 1) and 'b' (row 2) and two
    in 'c' (row 2 is the one inspected); each strategy is checked against
    the values that end up in those three cells.
    """
    a = pd.DataFrame.from_records(
        [[1, 2, 3], [np.nan, 2, 2], [2, np.nan, np.nan]],
        columns=['a', 'b', 'c'])

    def run(fill):
        """Fit-transform ``a`` with the given fill; return (imputer, result)."""
        imputer = SelectiveImputer(fill=fill)
        return imputer, imputer.fit_transform(a)

    def imputed_cells(y):
        """Assert no nulls remain and return the three originally-missing cells."""
        assert y.isnull().sum().sum() == 0, ('expected no nulls but got:\n', y)
        return y.iloc[1, 0], y.iloc[2, 1], y.iloc[2, 2]

    # first, use an int
    imputer, y = run(-1)
    assert imputer.fills_ == -1
    assert all([x == -1 for x in imputed_cells(y)])

    # now try with a string: the mode may be ambiguous for 'a' and 'c'
    _, y = run('mode')
    cell_a, cell_b, cell_c = imputed_cells(y)
    assert all([cell_a in (1, 2), cell_b == 2, cell_c in (3, 2)])

    # remaining strategies, each with the exact cells expected
    exact_cases = (
        ('mean', (1.5, 2.0, 2.5)),
        ('median', (1.5, 2, 2.5)),
        ([5, 6, 7], (5, 6, 7)),  # an iterable
        ([5, 'mode', 'mean'], (5, 2, 2.5)),  # a mixed iterable
        (['median', 3, 'mean'], (1.5, 3, 2.5)),  # a mixed iterable -- again
        ({'a': 'median', 'b': 3, 'c': 'mean'}, (1.5, 3, 2.5)),  # a dict
    )
    for fill, expected in exact_cases:
        _, y = run(fill)
        cells = imputed_cells(y)
        assert all([got == want for got, want in zip(cells, expected)])

    # test failures now...
    assert_fails(SelectiveImputer(fill='blah').fit, TypeError, a)
    assert_fails(SelectiveImputer(fill=[1, 2]).fit, ValueError, a)
    assert_fails(SelectiveImputer(fill=['a', 'b', 'c']).fit, TypeError, a)
    assert_fails(SelectiveImputer(fill='a').fit, TypeError, a)
    assert_fails(SelectiveImputer(fill=[1, 2, 'a']).fit, TypeError, a)

    # generate anonymous class for test...
    class SomeObject(object):
        def __init__(self):
            pass

    assert_fails(SelectiveImputer(fill=SomeObject()).fit, TypeError, a)
def test_regular_grid():
    """Fit a GridSearchCV over the complete transformer pipeline and probe its API.

    Every hyper-parameter list holds exactly one value, so the exhaustive
    search evaluates a single configuration. Prediction methods, the grid
    score report and several ``y`` input forms are then exercised.
    """
    # the pipeline, one named step per transformer plus the final model
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through since no missing
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # the (single-valued) hyper-parameter grid
    hp = {
        'collinearity__threshold': [0.90],
        'collinearity__method': ['spearman'],
        'scaler__scaler': [RobustScaler()],
        'pca__n_components': [0.95],
        'pca__whiten': [True],
        'model__n_estimators': [5],
        'model__max_depth': [5],
        'model__min_samples_leaf': [8],
        'model__max_features': [0.75],
        'model__max_leaf_nodes': [20]
    }

    search = GridSearchCV(pipe, hp, scoring='accuracy', cv=custom_cv, verbose=1)

    # fit the search
    search.fit(X_train, y_train)

    # NOTE: search.score(X_train, y_train) throws a warning, so it is skipped
    search.predict(X_train)
    search.predict_proba(X_train)
    search.predict_log_proba(X_train)

    # NOTE: this poses an issue.. the models are trained on X as an array,
    # and selecting the best estimator causes the retained names to go away.
    # We need to find a way to force the best_estimator_ to retain the names
    # on which to start the training process; hence
    # search.best_estimator_.predict(X_train) stays disabled.

    # test the report
    report_grid_score_detail(search, charts=False)

    # test with invalid X and ys
    assert_fails(search.fit, Exception, X_train, None)

    # valid alternatives for y: a Series, then a DataFrame wrapping a Series
    valid_targets = (
        pd.Series(y_train),
        pd.DataFrame(pd.Series(y_train)),
    )
    for target in valid_targets:
        search.fit(X_train, target)

    # test with invalid X and ys (a frame built from two series)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        assert_fails(search.fit, Exception, X_train,
                     pd.DataFrame([pd.Series(y_train), pd.Series(y_train)]))

    # test with short y
    assert_fails(search.fit, ValueError, X_train, [0, 1, 2])
def test_selective_imputer():
    """Verify SelectiveImputer across fill strategies and its rejection of bad fills.

    Three cells of the fixture are inspected after each imputation: row 1 of
    'a', row 2 of 'b' and row 2 of 'c', all of which start out missing.
    """
    a = pd.DataFrame.from_records([
        [1, 2, 3],
        [np.nan, 2, 2],
        [2, np.nan, np.nan]
    ], columns=['a', 'b', 'c'])

    def filled_cells(frame):
        """Check that ``frame`` has no nulls; return the three inspected cells."""
        assert frame.isnull().sum().sum() == 0, ('expected no nulls but got:\n', frame)
        return [frame.iloc[1, 0], frame.iloc[2, 1], frame.iloc[2, 2]]

    # first, use an int
    int_imputer = SelectiveImputer(fill=-1)
    y = int_imputer.fit_transform(a)
    assert int_imputer.fills_ == -1
    assert all([x == -1 for x in filled_cells(y)])

    # now try with a string ('mode' may resolve to either candidate value)
    y = SelectiveImputer(fill='mode').fit_transform(a)
    first, second, third = filled_cells(y)
    assert all([first in (1, 2), second == 2, third in (3, 2)])

    # fills with one exact expected value per inspected cell
    expectations = [
        ('mean', [1.5, 2.0, 2.5]),
        ('median', [1.5, 2, 2.5]),
        # an iterable
        ([5, 6, 7], [5, 6, 7]),
        # a mixed iterable
        ([5, 'mode', 'mean'], [5, 2, 2.5]),
        # a mixed iterable -- again
        (['median', 3, 'mean'], [1.5, 3, 2.5]),
        # a dict
        ({'a': 'median', 'b': 3, 'c': 'mean'}, [1.5, 3, 2.5]),
    ]
    for fill, wanted in expectations:
        y = SelectiveImputer(fill=fill).fit_transform(a)
        observed = filled_cells(y)
        assert all([observed[k] == wanted[k] for k in range(3)])

    # test failures now...
    failing_fills = (
        ('blah', TypeError),
        ([1, 2], ValueError),
        (['a', 'b', 'c'], TypeError),
        ('a', TypeError),
        ([1, 2, 'a'], TypeError),
    )
    for fill, error in failing_fills:
        assert_fails(SelectiveImputer(fill=fill).fit, error, a)

    # generate anonymous class for test...
    class SomeObject(object):
        def __init__(self):
            pass

    assert_fails(SelectiveImputer(fill=SomeObject()).fit, TypeError, a)