def test_multiple_estimators(self):
    """Fit and score a pipeline that contains two decision-tree estimators.

    The fitted pipeline must expose all four stages: both imputers come
    back as the very same objects, while each estimator position holds a
    ``DecisionTreeModel``.  Scoring must produce exactly the index labels
    ``'DecisionTree'`` and ``'DecisionTree1'``.
    """
    tbl = self.table

    mean_imp = Imputer(Imputer.MEAN)
    mode_imp = Imputer(Imputer.MODE)
    dtree1 = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)
    dtree2 = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    pipe = Pipeline([mean_imp, mode_imp, dtree1, dtree2])
    model = pipe.fit(tbl)

    self.assertEqual(model.__class__.__name__, 'PipelineModel')
    self.assertEqual(len(model.stages), 4)
    # assertIs reports both operands on failure, unlike assertTrue(a is b).
    self.assertIs(model[0], mean_imp)
    self.assertIs(model[1], mode_imp)
    self.assertEqual(model[2].__class__.__name__, 'DecisionTreeModel')
    self.assertEqual(model[3].__class__.__name__, 'DecisionTreeModel')

    out = model.score(tbl)
    self.assertEqual(set(out.index), {'DecisionTree', 'DecisionTree1'})
def test_unload_model(self):
    """Check that a model tracked by ``ResourceManager`` loses its table.

    The fitted model's table reports ``exists == 1`` before the manager is
    used and ``exists == 0`` after the ``with`` block has exited.
    """
    estimator = DecisionTree(target='Cylinders',
                             inputs=['MSRP', 'Horsepower'])
    fitted = estimator.fit(self.table)
    self.assertEqual(fitted.data.table.tableexists().exists, 1)

    with ResourceManager() as manager:
        manager.track_model(fitted)

    # Checked only after the context manager has exited.
    self.assertEqual(fitted.data.table.tableexists().exists, 0)
def test_unload(self):
    """Check that ``unload`` removes the fitted model's table.

    The table reports ``exists == 1`` right after fitting and
    ``exists == 0`` once ``model.unload()`` has been called.
    """
    estimator = DecisionTree(
        target='Cylinders',
        nominals=['Make', 'Model'],
        inputs=['Make', 'Model', 'Horsepower'],
    )
    fitted = estimator.fit(self.table)
    self.assertEqual(fitted.data.table.tableexists().exists, 1)

    fitted.unload()
    self.assertEqual(fitted.data.table.tableexists().exists, 0)
def test_fit(self):
    """Fit a decision tree on target 'Origin' and inspect the model.

    Verifies the class names of the model, its data and its diagnostics,
    that the model's params equal the defaults overlaid with the
    constructor arguments, and that the diagnostics keys are exactly
    ``ModelInfo`` and ``OutputCasTables``.
    """
    table = self.table

    # Expected params: module defaults overridden by what we pass below.
    expected_params = dtree_defaults.copy()
    expected_params.update(
        {'target': 'Origin', 'nominals': nominals, 'inputs': inputs})

    estimator = DecisionTree(target='Origin',
                             nominals=nominals,
                             inputs=inputs)
    fitted = estimator.fit(table)

    self.assertEqual(fitted.__class__.__name__, 'DecisionTreeModel')
    self.assertEqual(fitted.data.__class__.__name__, 'CASTable')
    self.assertEqual(fitted.params, expected_params)
    self.assertEqual(fitted.diagnostics.__class__.__name__, 'CASResults')
    self.assertEqual(sorted(fitted.diagnostics.keys()),
                     ['ModelInfo', 'OutputCasTables'])
def test_basic(self):
    """Fit and score a three-stage pipeline; reject non-stage items.

    After fitting, the imputers must be the same objects that were passed
    in and the last stage must be a ``DecisionTreeModel``.  The score's
    index must match the expected set of statistic names exactly.
    Building a pipeline that contains a plain string must raise
    ``TypeError``.
    """
    tbl = self.table

    mean_imp = Imputer(Imputer.MEAN)
    mode_imp = Imputer(Imputer.MODE)
    dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    pipe = Pipeline([mean_imp, mode_imp, dtree])
    model = pipe.fit(tbl)

    self.assertEqual(model.__class__.__name__, 'PipelineModel')
    self.assertEqual(len(model.stages), 3)
    # assertIs reports both operands on failure, unlike assertTrue(a is b).
    self.assertIs(model[0], mean_imp)
    self.assertIs(model[1], mode_imp)
    self.assertEqual(model[2].__class__.__name__, 'DecisionTreeModel')

    out = model.score(tbl)
    self.assertEqual(
        set(out.index),
        {
            'Target', 'Level', 'Var', 'NBins', 'NObsUsed',
            'TargetCount', 'TargetMiss', 'PredCount', 'PredMiss',
            'Event', 'EventCount', 'NonEventCount', 'EventMiss',
            'AreaUnderROCCurve', 'CRCut', 'ClassificationCutOff',
            'KS', 'KSCutOff', 'MisClassificationRate',
        })

    # Bad item type
    with self.assertRaises(TypeError):
        Pipeline([mean_imp, mode_imp, 'foo', dtree])
def test_regression_score(self):
    """Score a fitted pipeline whose estimator targets 'MSRP'.

    The score must be a ``pandas.Series`` reporting target 'MSRP' at level
    'INTERVAL', 100 bins and 428 observations used, and each of the six
    error statistics must be a ``float``.
    """
    tbl = self.table

    mean_imp = Imputer(Imputer.MEAN)
    mode_imp = Imputer(Imputer.MODE)
    dtree = DecisionTree(target='MSRP', nominals=nominals, inputs=inputs)

    pipe = Pipeline([mean_imp, mode_imp, dtree])
    model = pipe.fit(tbl)
    score = model.score(tbl)

    self.assertIsInstance(score, pd.Series)
    self.assertEqual(score.loc['Target'], 'MSRP')
    self.assertEqual(score.loc['Level'], 'INTERVAL')
    self.assertEqual(score.loc['NBins'], 100)
    self.assertEqual(score.loc['NObsUsed'], 428)

    float_metrics = (
        'AverageSquaredError',
        'AverageAbsoluteError',
        'AverageSquaredLogarithmicError',
        'RootAverageSquaredError',
        'RootAverageAbsoluteError',
        'RootAverageSquaredLogarithmicError',
    )
    for metric in float_metrics:
        # msg names the statistic so a failure identifies which one broke.
        self.assertIsInstance(score.loc[metric], float, msg=metric)
def test_interval(self):
    """Grid-search a decision tree whose target is 'MSRP'.

    The ``Parameters`` column of the result must contain exactly the four
    ``max_depth`` x ``leaf_size`` combinations, each carrying ``alpha=0``,
    and ``out['FoldScores'][0]`` must have length 3 when no ``cv``
    argument is supplied.
    """
    tbl = self.table

    estimator = DecisionTree(target='MSRP', nominals=nominals, inputs=inputs)

    # dict of lists; alpha is given as a scalar and is expected unchanged
    # in every combination.
    param_grid = dict(
        max_depth=[6, 10],
        leaf_size=[3, 5],
        alpha=0,
    )

    hpt = HyperParameterTuning(estimator=estimator, param_grid=param_grid)
    out = hpt.gridsearch(tbl)

    def by_depth_and_leaf(combo):
        # Shared sort key so result order does not affect the comparison.
        return combo['max_depth'], combo['leaf_size']

    expected = sorted(
        [
            {'max_depth': 6, 'leaf_size': 3, 'alpha': 0},
            {'max_depth': 6, 'leaf_size': 5, 'alpha': 0},
            {'max_depth': 10, 'leaf_size': 5, 'alpha': 0},
            {'max_depth': 10, 'leaf_size': 3, 'alpha': 0},
        ],
        key=by_depth_and_leaf)

    # sorted() accepts any iterable, so no intermediate list() is needed.
    self.assertEqual(sorted(out['Parameters'], key=by_depth_and_leaf),
                     expected)
    self.assertEqual(len(out['FoldScores'][0]), 3)
def test_set_params(self):
    """Override the estimator's target through the pipeline.

    Two routes are exercised: ``Pipeline.set_params`` and the extra
    argument to ``Pipeline.fit``.  In both cases the score's 'Target'
    switches from 'Origin' to 'MSRP'.  After ``set_params`` the estimator's
    own ``target`` must still compare equal to 'Origin'.
    """
    table = self.table

    mean_imp = Imputer(Imputer.MEAN)
    mode_imp = Imputer(Imputer.MODE)
    tree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    pipeline = Pipeline([mean_imp, mode_imp, tree])

    # Baseline: no overrides, so the constructor's target is used.
    fitted = pipeline.fit(table)
    scored = fitted.score(table)
    self.assertEqual(scored.loc['Target'], 'Origin')

    # Set extra parameters on Pipeline (not on estimator)
    pipeline.set_params({tree.target: 'MSRP'})
    self.assertEqual(tree.target, 'Origin')

    fitted = pipeline.fit(table)
    scored = fitted.score(table)
    self.assertEqual(scored.loc['Target'], 'MSRP')

    # Set parameters during fit, starting from a fresh pipeline
    pipeline = Pipeline([mean_imp, mode_imp, tree])

    fitted = pipeline.fit(table)
    scored = fitted.score(table)
    self.assertEqual(scored.loc['Target'], 'Origin')

    fitted = pipeline.fit(table, {tree.target: 'MSRP'})
    scored = fitted.score(table)
    self.assertEqual(scored.loc['Target'], 'MSRP')
def test_cv_iter(self):
    """Grid-search using a caller-supplied generator for ``cv``.

    The generator yields four pairs of sampled tables.  Its output sizes
    and exhaustion are checked first, then a fresh generator is handed to
    ``HyperParameterTuning``.  The result must list the four parameter
    combinations and ``out['FoldScores'][0]`` must have length 4.
    """
    tbl = self.table

    estimator = DecisionTree(target='Origin',
                             nominals=nominals,
                             inputs=inputs)

    # dict of lists; alpha is given as a scalar and is expected unchanged
    # in every combination.
    param_grid = dict(
        max_depth=[6, 10],
        leaf_size=[3, 5],
        alpha=0,
    )

    def cv_gen(tbl):
        yield tbl.sample(frac=0.1), tbl.sample(frac=0.9)
        yield tbl.sample(frac=0.2), tbl.sample(frac=0.8)
        yield tbl.sample(frac=0.3), tbl.sample(frac=0.7)
        yield tbl.sample(frac=0.4), tbl.sample(frac=0.6)

    # Sanity-check the generator itself.  A generator is single-use, so
    # the tuner below receives a new one.
    test_cv = cv_gen(tbl)
    expected_sizes = [(43, 385), (86, 342), (128, 300), (171, 257)]
    for first_len, second_len in expected_sizes:
        a, b = next(test_cv)
        self.assertEqual(len(a), first_len)
        self.assertEqual(len(b), second_len)

    with self.assertRaises(StopIteration):
        next(test_cv)

    hpt = HyperParameterTuning(estimator=estimator,
                               param_grid=param_grid,
                               cv=cv_gen(tbl))
    out = hpt.gridsearch(tbl)

    def by_depth_and_leaf(combo):
        # Shared sort key so result order does not affect the comparison.
        return combo['max_depth'], combo['leaf_size']

    expected = sorted(
        [
            {'max_depth': 6, 'leaf_size': 3, 'alpha': 0},
            {'max_depth': 6, 'leaf_size': 5, 'alpha': 0},
            {'max_depth': 10, 'leaf_size': 5, 'alpha': 0},
            {'max_depth': 10, 'leaf_size': 3, 'alpha': 0},
        ],
        key=by_depth_and_leaf)

    # sorted() accepts any iterable, so no intermediate list() is needed.
    self.assertEqual(sorted(out['Parameters'], key=by_depth_and_leaf),
                     expected)
    self.assertEqual(len(out['FoldScores'][0]), 4)
def test_score(self):
    """Score a decision tree fitted on target 'Cylinders'.

    The score must be a ``pandas.Series`` whose 'MeanSquaredError' is
    approximately 0.4423817642 (``assertAlmostEqual`` default precision),
    with 426 observations used out of 428 read.
    """
    tbl = self.table

    dtree = DecisionTree(target='Cylinders',
                         nominals=['Make', 'Model'],
                         inputs=['Make', 'Model', 'Horsepower'])
    model = dtree.fit(tbl)
    score = model.score(tbl)

    self.assertIsInstance(score, pd.Series)
    self.assertAlmostEqual(score.loc['MeanSquaredError'], 0.4423817642)
    self.assertEqual(score.loc['NObsUsed'], 426)
    self.assertEqual(score.loc['NObsRead'], 428)
def test_classification_score(self):
    """Score a decision tree fitted on target 'Origin'.

    The score must be a ``pandas.Series`` reporting target 'Origin' at
    level 'CLASS' with event 'USA', 100 bins and 428 observations used,
    and each of the five listed statistics must be a ``float``.
    """
    tbl = self.table

    dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)
    model = dtree.fit(tbl)
    score = model.score(tbl)

    self.assertIsInstance(score, pd.Series)
    self.assertEqual(score.loc['Target'], 'Origin')
    self.assertEqual(score.loc['Level'], 'CLASS')
    self.assertEqual(score.loc['Event'], 'USA')
    self.assertEqual(score.loc['NBins'], 100)
    self.assertEqual(score.loc['NObsUsed'], 428)

    float_metrics = (
        'AreaUnderROCCurve',
        'CRCut',
        'KS',
        'KSCutOff',
        'MisClassificationRate',
    )
    for metric in float_metrics:
        # msg names the statistic so a failure identifies which one broke.
        self.assertIsInstance(score.loc[metric], float, msg=metric)
def test_regression_score(self):
    """Score a decision tree fitted on target 'MSRP'.

    The score must be a ``pandas.Series`` reporting target 'MSRP' at level
    'INTERVAL', 100 bins and 428 observations used, and each of the six
    error statistics must be a ``float``.
    """
    tbl = self.table

    dtree = DecisionTree(target='MSRP', nominals=nominals, inputs=inputs)
    model = dtree.fit(tbl)
    score = model.score(tbl)

    self.assertIsInstance(score, pd.Series)
    self.assertEqual(score.loc['Target'], 'MSRP')
    self.assertEqual(score.loc['Level'], 'INTERVAL')
    self.assertEqual(score.loc['NBins'], 100)
    self.assertEqual(score.loc['NObsUsed'], 428)

    float_metrics = (
        'AverageSquaredError',
        'AverageAbsoluteError',
        'AverageSquaredLogarithmicError',
        'RootAverageSquaredError',
        'RootAverageAbsoluteError',
        'RootAverageSquaredLogarithmicError',
    )
    for metric in float_metrics:
        # msg names the statistic so a failure identifies which one broke.
        self.assertIsInstance(score.loc[metric], float, msg=metric)
def test_unload(self):
    """Check that unloading a fitted pipeline removes its last stage's table.

    The table behind the final stage reports ``exists == 1`` after fitting
    and ``exists == 0`` after ``unload()`` is called on the pipeline model.
    """
    stages = [
        Imputer(Imputer.MEAN),
        Imputer(Imputer.MODE),
        DecisionTree(target='MSRP', nominals=nominals, inputs=inputs),
    ]
    fitted = Pipeline(stages).fit(self.table)
    self.assertEqual(fitted[-1].data.table.tableexists().exists, 1)

    fitted.unload()
    self.assertEqual(fitted[-1].data.table.tableexists().exists, 0)
def test_fit(self):
    """Fit a decision tree on target 'Cylinders' and inspect the model.

    Verifies the class names of the model, its data and its diagnostics,
    that the model's params equal the defaults overlaid with the
    constructor arguments, and that the diagnostics keys are exactly
    ``ModelInfo`` and ``OutputCasTables``.  A second fit with an empty
    ``nominals`` list must report ``[]`` for that parameter.
    """
    table = self.table
    nominal_cols = ['Make', 'Model']
    input_cols = ['Make', 'Model', 'Horsepower']

    # Expected params: module defaults overridden by what we pass below.
    expected_params = dtree_defaults.copy()
    expected_params.update({
        'target': 'Cylinders',
        'nominals': ['Make', 'Model'],
        'inputs': ['Make', 'Model', 'Horsepower'],
    })

    estimator = DecisionTree(target='Cylinders',
                             nominals=nominal_cols,
                             inputs=input_cols)
    fitted = estimator.fit(table)

    self.assertEqual(fitted.__class__.__name__, 'DecisionTreeModel')
    self.assertEqual(fitted.data.__class__.__name__, 'CASTable')
    self.assertEqual(fitted.params, expected_params)
    self.assertEqual(fitted.diagnostics.__class__.__name__, 'CASResults')
    self.assertEqual(sorted(fitted.diagnostics.keys()),
                     ['ModelInfo', 'OutputCasTables'])

    # Fit again with an empty nominals list.
    # NOTE(review): this step was previously described as "nominals set
    # automatically", yet the assertion expects [] -- confirm the intent.
    estimator = DecisionTree(target='Cylinders',
                             nominals=[],
                             inputs=['Make', 'Model', 'Horsepower'])
    fitted = estimator.fit(table)
    self.assertEqual(fitted.params['nominals'], [])
def test_pipeline(self):
    """Grid-search a pipeline that contains two decision-tree estimators.

    The result index must hold four 'DecisionTree' and four
    'DecisionTree1' labels, and the ``Parameters`` column must contain
    each of the four ``max_depth`` x ``leaf_size`` combinations twice,
    every one carrying ``alpha=0``.
    """
    tbl = self.table

    modeimp = Imputer(Imputer.MODE)
    dtree1 = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)
    dtree2 = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    pipe = Pipeline([modeimp, dtree1, dtree2])

    # dict of lists; alpha is given as a scalar and is expected unchanged
    # in every combination.
    param_grid = dict(
        max_depth=[6, 10],
        leaf_size=[3, 5],
        alpha=0,
    )

    hpt = HyperParameterTuning(estimator=pipe, param_grid=param_grid)
    out = hpt.gridsearch(tbl)

    # sorted() already returns a list; no extra list() wrapper is needed.
    self.assertEqual(sorted(out.index),
                     ['DecisionTree'] * 4 + ['DecisionTree1'] * 4)

    def by_depth_and_leaf(combo):
        # Shared sort key so result order does not affect the comparison.
        return combo['max_depth'], combo['leaf_size']

    # Every combination appears twice.  The list is built in ascending
    # (max_depth, leaf_size) order, so it is already in sort-key order.
    expected = [
        {'max_depth': depth, 'leaf_size': leaf, 'alpha': 0}
        for depth in (6, 10)
        for leaf in (3, 5)
        for _ in range(2)
    ]

    self.assertEqual(sorted(out['Parameters'], key=by_depth_and_leaf),
                     expected)
def test_repr(self):
    """Compare ``repr`` of a three-stage pipeline with its expected text.

    Any ``u'`` prefix in the repr is replaced by ``'`` before comparing.
    """
    stages = [
        Imputer(Imputer.MEAN),
        Imputer(Imputer.MODE),
        DecisionTree(target='Origin', nominals=nominals, inputs=inputs),
    ]
    pipeline = Pipeline(stages)

    # Adjacent string literals are joined by the parser into one string.
    expected = (
        "Pipeline([Imputer(MEAN), Imputer(MODE), "
        "DecisionTree(alpha=0.0, cf_level=0.25, criterion=None, "
        "inputs=['MPG_City', 'MPG_Highway', 'Length', 'Weight', "
        "'Type', 'Cylinders'], leaf_size=5, max_branches=2, "
        "max_depth=6, n_bins=20, nominals=['Type', 'Cylinders', "
        "'Origin'], prune=False, target='Origin', var_importance=False)])"
    )

    actual = repr(pipeline).replace("u'", "'")
    self.assertEqual(actual, expected)
def test_transform(self):
    """Transform a table through a pipeline and compare missing counts.

    The input table's maximum ``nmiss()`` is 2 both before and after the
    call, while the returned ``CASTable`` has a maximum of 0.
    """
    source = self.table

    imputer = Imputer(Imputer.MODE)
    tree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)
    pipeline = Pipeline([imputer, tree])

    self.assertEqual(source.nmiss().max(), 2)

    transformed = pipeline.transform(source)
    self.assertEqual(transformed.__class__.__name__, 'CASTable')

    # The source table keeps its missing values; only the output is filled.
    self.assertEqual(source.nmiss().max(), 2)
    self.assertEqual(transformed.nmiss().max(), 0)
def test_getitem(self):
    """Index an unfitted pipeline by position.

    Valid positions return the very objects that were passed in, an
    out-of-range position raises ``IndexError`` and a string key raises
    ``TypeError``.
    """
    mode_imp = Imputer(Imputer.MODE)
    dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    pipe = Pipeline([mode_imp, dtree])

    # assertIs reports both operands on failure, unlike assertTrue(a is b).
    self.assertIs(pipe[0], mode_imp)
    self.assertIs(pipe[1], dtree)

    with self.assertRaises(IndexError):
        pipe[2]

    with self.assertRaises(TypeError):
        pipe['foo']
def test_model_getitem(self):
    """Index a fitted pipeline model by position.

    Position 0 is the same imputer object that was passed in.  Position 1
    is not the original estimator but a ``DecisionTreeModel``.  An
    out-of-range position raises ``IndexError`` and a string key raises
    ``TypeError``.
    """
    tbl = self.table

    mode_imp = Imputer(Imputer.MODE)
    dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    model = Pipeline([mode_imp, dtree]).fit(tbl)

    # assertIs / assertIsNot report both operands on failure.
    self.assertIs(model[0], mode_imp)
    self.assertIsNot(model[1], dtree)
    self.assertEqual(model[1].__class__.__name__, 'DecisionTreeModel')

    with self.assertRaises(IndexError):
        model[2]

    with self.assertRaises(TypeError):
        model['foo']
def test_classification_score(self):
    """Score a fitted pipeline whose estimator targets 'Origin'.

    The score must be a ``pandas.Series`` reporting target 'Origin' at
    level 'CLASS' with event 'USA', 100 bins and 428 observations used,
    and each of the five listed statistics must be a ``float``.
    """
    tbl = self.table

    mean_imp = Imputer(Imputer.MEAN)
    mode_imp = Imputer(Imputer.MODE)
    dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

    pipe = Pipeline([mean_imp, mode_imp, dtree])
    model = pipe.fit(tbl)
    score = model.score(tbl)

    self.assertIsInstance(score, pd.Series)
    self.assertEqual(score.loc['Target'], 'Origin')
    self.assertEqual(score.loc['Level'], 'CLASS')
    self.assertEqual(score.loc['Event'], 'USA')
    self.assertEqual(score.loc['NBins'], 100)
    self.assertEqual(score.loc['NObsUsed'], 428)

    float_metrics = (
        'AreaUnderROCCurve',
        'CRCut',
        'KS',
        'KSCutOff',
        'MisClassificationRate',
    )
    for metric in float_metrics:
        # msg names the statistic so a failure identifies which one broke.
        self.assertIsInstance(score.loc[metric], float, msg=metric)
def test_params(self):
    """Exercise ``DecisionTree`` parameter handling.

    Covers: default params, params given to the constructor, invalid
    constructor arguments (``ValueError`` for a bad ``criterion`` value,
    ``TypeError`` for an unknown keyword), per-call overrides passed to
    ``fit``, and invalid overrides (``TypeError`` for a bad ``prune``
    value, ``KeyError`` for an unknown keyword).
    """
    table = self.table

    # Check defaults
    self.assertEqual(DecisionTree().params.to_dict(), dtree_defaults)

    # Check constructor parameters
    ctor_kwargs = dict(prune=True, target='Origin',
                       nominals=nominals, inputs=inputs)
    expected = dtree_defaults.copy()
    expected.update(ctor_kwargs)

    tree = DecisionTree(**ctor_kwargs)
    self.assertEqual(tree.params.to_dict(), expected)

    fitted = tree.fit(table)
    self.assertEqual(fitted.__class__.__name__, 'DecisionTreeModel')
    self.assertEqual(fitted.params, expected)

    # Check constructor parameter error
    with self.assertRaises(ValueError):
        DecisionTree(prune=True, criterion='foo', target='Origin',
                     nominals=nominals, inputs=inputs)

    with self.assertRaises(TypeError):
        DecisionTree(foo='bar')

    # Check fit parameter overrides.  `tree` was built with prune=True,
    # the call below overrides it with prune=False, and the expected
    # params carry whatever value dtree_defaults holds for prune.
    expected = dtree_defaults.copy()
    expected.update({
        'max_depth': 7,
        'leaf_size': 5,
        'target': 'Origin',
        'nominals': nominals,
        'inputs': inputs,
    })

    fitted = tree.fit(table, prune=False, max_depth=7)
    self.assertEqual(fitted.__class__.__name__, 'DecisionTreeModel')
    self.assertEqual(fitted.params, expected)

    # Check parameter overrides error
    with self.assertRaises(TypeError):
        tree.fit(table, prune='foo', max_depth=7)

    with self.assertRaises(KeyError):
        tree.fit(table, foo='bar')
def test_unload(self):
    """Check that ``unload`` removes the fitted model's table.

    The table reports ``exists == 1`` right after fitting and
    ``exists == 0`` once ``model.unload()`` has been called.
    """
    estimator = DecisionTree(target='Origin',
                             nominals=nominals,
                             inputs=inputs)
    fitted = estimator.fit(self.table)
    self.assertEqual(fitted.data.table.tableexists().exists, 1)

    fitted.unload()
    self.assertEqual(fitted.data.table.tableexists().exists, 0)
def test_params(self):
    """Exercise ``HyperParameterTuning`` constructor parameter handling.

    Checks the stored ``estimator`` and ``param_grid``, the defaults
    (``cv == 3``, ``score_type is None``), and which ``cv`` values are
    accepted (int, float, iterator, list) or rejected (``ValueError`` for
    a negative float, a float above 1 and the integer 1; ``TypeError``
    for a string).
    """
    estimator = DecisionTree()

    param_grid = dict(
        max_depth=[6, 10],
        leaf_size=[3, 5],
    )

    # Basic settings and defaults
    hpt = HyperParameterTuning(estimator=estimator, param_grid=param_grid)
    self.assertEqual(hpt.params['estimator'], estimator)
    self.assertEqual(hpt.params['param_grid'], param_grid)
    self.assertEqual(hpt.params['cv'], 3)
    self.assertIsNone(hpt.params['score_type'])

    # cv = int
    hpt = HyperParameterTuning(estimator=estimator,
                               param_grid=param_grid,
                               cv=3)
    self.assertEqual(hpt.params['cv'], 3)

    # cv = float
    hpt = HyperParameterTuning(estimator=estimator,
                               param_grid=param_grid,
                               cv=0.3)
    self.assertEqual(hpt.params['cv'], 0.3)

    # cv = -float
    with self.assertRaises(ValueError):
        HyperParameterTuning(estimator=estimator,
                             param_grid=param_grid,
                             cv=-0.1)

    # cv = float > 1
    with self.assertRaises(ValueError):
        HyperParameterTuning(estimator=estimator,
                             param_grid=param_grid,
                             cv=1.0001)

    # cv = generator (a plain iterator is used here)
    gen = iter([0])
    hpt = HyperParameterTuning(estimator=estimator,
                               param_grid=param_grid,
                               cv=gen)
    self.assertEqual(hpt.params['cv'], gen)

    # cv = list
    items = [0]
    hpt = HyperParameterTuning(estimator=estimator,
                               param_grid=param_grid,
                               cv=items)
    self.assertEqual(hpt.params['cv'], items)

    # cv = string
    with self.assertRaises(TypeError):
        HyperParameterTuning(estimator=estimator,
                             param_grid=param_grid,
                             cv='foo')

    # cv = 1 (lower than minimum)
    with self.assertRaises(ValueError):
        HyperParameterTuning(estimator=estimator,
                             param_grid=param_grid,
                             cv=1)