Example #1
0
    def test_multiple_estimators(self):
        """Fit and score a pipeline whose last two stages are decision trees.

        Checks that the fitted pipeline keeps the imputer objects, holds a
        ``DecisionTreeModel`` in each estimator slot, and that the score
        output is indexed by ``'DecisionTree'`` and ``'DecisionTree1'``.
        """
        tbl = self.table

        mean_imp = Imputer(Imputer.MEAN)
        mode_imp = Imputer(Imputer.MODE)
        dtree1 = DecisionTree(target='Origin',
                              nominals=nominals,
                              inputs=inputs)
        dtree2 = DecisionTree(target='Origin',
                              nominals=nominals,
                              inputs=inputs)

        pipe = Pipeline([mean_imp, mode_imp, dtree1, dtree2])

        model = pipe.fit(tbl)
        self.assertEqual(model.__class__.__name__, 'PipelineModel')
        self.assertEqual(len(model.stages), 4)
        # assertIs reports both operands on failure, whereas
        # assertTrue(a is b) only reports "False is not true".
        self.assertIs(model[0], mean_imp)
        self.assertIs(model[1], mode_imp)
        self.assertEqual(model[2].__class__.__name__, 'DecisionTreeModel')
        self.assertEqual(model[3].__class__.__name__, 'DecisionTreeModel')

        out = model.score(tbl)
        self.assertEqual(set(out.index), {'DecisionTree', 'DecisionTree1'})
Example #2
0
 def test_unload_model(self):
     """Check that a model tracked by ResourceManager loses its table.

     The fitted model's table must exist right after ``fit`` and must no
     longer exist once the ``with ResourceManager()`` block has finished.
     """
     fitted = DecisionTree(
         target='Cylinders',
         inputs=['MSRP', 'Horsepower'],
     ).fit(self.table)

     def table_exists():
         # Ask again on every call so the current state is observed.
         return fitted.data.table.tableexists().exists

     self.assertEqual(table_exists(), 1)
     with ResourceManager() as manager:
         manager.track_model(fitted)
     self.assertEqual(table_exists(), 0)
Example #3
0
 def test_unload(self):
     """Check that ``unload`` removes the fitted model's table.

     The table must exist after ``fit`` and must be gone after
     ``model.unload()``.
     """
     fitted = DecisionTree(
         target='Cylinders',
         nominals=['Make', 'Model'],
         inputs=['Make', 'Model', 'Horsepower'],
     ).fit(self.table)

     def table_exists():
         # Ask again on every call so the current state is observed.
         return fitted.data.table.tableexists().exists

     self.assertEqual(table_exists(), 1)
     fitted.unload()
     self.assertEqual(table_exists(), 0)
Example #4
0
    def test_fit(self):
        """Fit a decision tree on 'Origin' and inspect the returned model.

        Verifies the model class name, the class names of its ``data`` and
        ``diagnostics`` attributes, the diagnostics keys, and that the model
        parameters equal the defaults overlaid with the constructor
        arguments.
        """
        overrides = dict(target='Origin', nominals=nominals, inputs=inputs)

        expected_params = dtree_defaults.copy()
        expected_params.update(overrides)

        model = DecisionTree(**overrides).fit(self.table)
        diagnostics = model.diagnostics

        self.assertEqual(model.__class__.__name__, 'DecisionTreeModel')
        self.assertEqual(model.data.__class__.__name__, 'CASTable')
        self.assertEqual(model.params, expected_params)
        self.assertEqual(diagnostics.__class__.__name__, 'CASResults')
        self.assertEqual(sorted(diagnostics.keys()),
                         ['ModelInfo', 'OutputCasTables'])
Example #5
0
    def test_basic(self):
        """Fit and score a two-imputer, one-tree pipeline.

        Checks the fitted pipeline's stages, the set of labels in the score
        output's index, and that constructing a ``Pipeline`` with a
        non-stage item (a plain string) raises ``TypeError``.
        """
        tbl = self.table

        mean_imp = Imputer(Imputer.MEAN)
        mode_imp = Imputer(Imputer.MODE)
        dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

        pipe = Pipeline([mean_imp, mode_imp, dtree])

        model = pipe.fit(tbl)
        self.assertEqual(model.__class__.__name__, 'PipelineModel')
        self.assertEqual(len(model.stages), 3)
        # assertIs reports both operands on failure, whereas
        # assertTrue(a is b) only reports "False is not true".
        self.assertIs(model[0], mean_imp)
        self.assertIs(model[1], mode_imp)
        self.assertEqual(model[2].__class__.__name__, 'DecisionTreeModel')

        out = model.score(tbl)

        self.assertEqual(
            set(out.index),
            {
                'Target', 'Level', 'Var', 'NBins', 'NObsUsed', 'TargetCount',
                'TargetMiss', 'PredCount', 'PredMiss', 'Event', 'EventCount',
                'NonEventCount', 'EventMiss', 'AreaUnderROCCurve', 'CRCut',
                'ClassificationCutOff', 'KS', 'KSCutOff',
                'MisClassificationRate',
            })

        # Bad item type
        with self.assertRaises(TypeError):
            Pipeline([mean_imp, mode_imp, 'foo', dtree])
Example #6
0
    def test_regression_score(self):
        """Score a pipeline whose decision tree targets 'MSRP'.

        The score must be a ``pd.Series`` that reports the target, an
        'INTERVAL' level, the bin and observation counts, and a float for
        each of the error metrics.
        """
        tbl = self.table

        mean_imp = Imputer(Imputer.MEAN)
        mode_imp = Imputer(Imputer.MODE)
        dtree = DecisionTree(target='MSRP', nominals=nominals, inputs=inputs)

        pipe = Pipeline([mean_imp, mode_imp, dtree])

        model = pipe.fit(tbl)
        score = model.score(tbl)

        # assertIsInstance shows the offending object and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(score, pd.Series)
        self.assertEqual(score.loc['Target'], 'MSRP')
        self.assertEqual(score.loc['Level'], 'INTERVAL')
        self.assertEqual(score.loc['NBins'], 100)
        self.assertEqual(score.loc['NObsUsed'], 428)
        for metric in ('AverageSquaredError',
                       'AverageAbsoluteError',
                       'AverageSquaredLogarithmicError',
                       'RootAverageSquaredError',
                       'RootAverageAbsoluteError',
                       'RootAverageSquaredLogarithmicError'):
            self.assertIsInstance(score.loc[metric], float)
    def test_interval(self):
        """Grid-search a decision tree that targets 'MSRP'.

        Checks that the 'Parameters' column of the result holds every
        max_depth/leaf_size combination from the grid (with alpha fixed at
        0) and that the first 'FoldScores' entry has three items.
        """
        data = self.table

        tree = DecisionTree(target='MSRP', nominals=nominals,
                            inputs=inputs)

        # dict of lists
        grid = {'max_depth': [6, 10], 'leaf_size': [3, 5], 'alpha': 0}

        tuner = HyperParameterTuning(estimator=tree, param_grid=grid)
        results = tuner.gridsearch(data)

        def depth_then_leaf(combo):
            return (combo['max_depth'], combo['leaf_size'])

        # Already ordered by (max_depth, leaf_size).
        expected = [{'max_depth': depth, 'leaf_size': leaf, 'alpha': 0}
                    for depth in (6, 10)
                    for leaf in (3, 5)]

        observed = sorted(results['Parameters'], key=depth_then_leaf)
        self.assertEqual(observed, expected)

        self.assertEqual(len(results['FoldScores'][0]), 3)
Example #8
0
    def test_set_params(self):
        """Override the tree's target through the pipeline.

        Two routes are exercised: ``Pipeline.set_params`` and the extra
        argument to ``Pipeline.fit``.  In both cases the scored 'Target'
        must change from 'Origin' to 'MSRP'; after ``set_params`` the
        estimator's own ``target`` must still be 'Origin'.
        """
        data = self.table

        mean_stage = Imputer(Imputer.MEAN)
        mode_stage = Imputer(Imputer.MODE)
        tree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

        pipeline = Pipeline([mean_stage, mode_stage, tree])
        fitted = pipeline.fit(data)
        result = fitted.score(data)
        self.assertEqual(result.loc['Target'], 'Origin')

        # Set extra parameters on Pipeline (not on estimator)
        pipeline.set_params({tree.target: 'MSRP'})
        self.assertEqual(tree.target, 'Origin')

        fitted = pipeline.fit(data)
        result = fitted.score(data)
        self.assertEqual(result.loc['Target'], 'MSRP')

        # Set parameters during fit
        pipeline = Pipeline([mean_stage, mode_stage, tree])

        fitted = pipeline.fit(data)
        result = fitted.score(data)
        self.assertEqual(result.loc['Target'], 'Origin')

        fitted = pipeline.fit(data, {tree.target: 'MSRP'})
        result = fitted.score(data)
        self.assertEqual(result.loc['Target'], 'MSRP')
    def test_cv_iter(self):
        """Grid-search using a caller-supplied generator of table pairs.

        First checks the generator on its own (four pairs with the
        expected lengths, then ``StopIteration``), then passes a fresh
        generator as ``cv`` and checks the resulting parameter combinations
        and that the first 'FoldScores' entry has four items.
        """
        data = self.table

        tree = DecisionTree(target='Origin', nominals=nominals,
                            inputs=inputs)

        # dict of lists
        grid = {'max_depth': [6, 10], 'leaf_size': [3, 5], 'alpha': 0}

        def cv_gen(tbl):
            # Yield four pairs of samples taken with the listed fractions.
            for first_frac, second_frac in ((0.1, 0.9), (0.2, 0.8),
                                            (0.3, 0.7), (0.4, 0.6)):
                yield (tbl.sample(frac=first_frac),
                       tbl.sample(frac=second_frac))

        pairs = cv_gen(data)

        for first_len, second_len in ((43, 385), (86, 342),
                                      (128, 300), (171, 257)):
            first, second = next(pairs)
            self.assertEqual(len(first), first_len)
            self.assertEqual(len(second), second_len)

        # The generator is exhausted after four pairs.
        with self.assertRaises(StopIteration):
            next(pairs)

        tuner = HyperParameterTuning(estimator=tree,
                                     param_grid=grid,
                                     cv=cv_gen(data))
        results = tuner.gridsearch(data)

        def depth_then_leaf(combo):
            return (combo['max_depth'], combo['leaf_size'])

        # Already ordered by (max_depth, leaf_size).
        expected = [{'max_depth': depth, 'leaf_size': leaf, 'alpha': 0}
                    for depth in (6, 10)
                    for leaf in (3, 5)]

        observed = sorted(results['Parameters'], key=depth_then_leaf)
        self.assertEqual(observed, expected)

        self.assertEqual(len(results['FoldScores'][0]), 4)
Example #10
0
    def test_score(self):
        """Score a decision tree fitted on 'Cylinders'.

        The score must be a ``pd.Series`` carrying the expected mean squared
        error and the used/read observation counts.
        """
        tbl = self.table

        dtree = DecisionTree(target='Cylinders',
                             nominals=['Make', 'Model'],
                             inputs=['Make', 'Model', 'Horsepower'])
        model = dtree.fit(tbl)
        score = model.score(tbl)
        # assertIsInstance shows the offending object and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(score, pd.Series)
        self.assertAlmostEqual(score.loc['MeanSquaredError'], 0.4423817642)
        self.assertEqual(score.loc['NObsUsed'], 426)
        self.assertEqual(score.loc['NObsRead'], 428)
Example #11
0
    def test_classification_score(self):
        """Score a decision tree fitted on the 'Origin' target.

        The score must be a ``pd.Series`` that reports the target, a 'CLASS'
        level, the event, the bin and observation counts, and a float for
        each classification metric.
        """
        tbl = self.table

        dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)
        model = dtree.fit(tbl)
        score = model.score(tbl)
        # assertIsInstance shows the offending object and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(score, pd.Series)
        self.assertEqual(score.loc['Target'], 'Origin')
        self.assertEqual(score.loc['Level'], 'CLASS')
        self.assertEqual(score.loc['Event'], 'USA')
        self.assertEqual(score.loc['NBins'], 100)
        self.assertEqual(score.loc['NObsUsed'], 428)
        for metric in ('AreaUnderROCCurve', 'CRCut', 'KS', 'KSCutOff',
                       'MisClassificationRate'):
            self.assertIsInstance(score.loc[metric], float)
Example #12
0
    def test_regression_score(self):
        """Score a decision tree fitted on the 'MSRP' target.

        The score must be a ``pd.Series`` that reports the target, an
        'INTERVAL' level, the bin and observation counts, and a float for
        each of the error metrics.
        """
        tbl = self.table

        dtree = DecisionTree(target='MSRP', nominals=nominals, inputs=inputs)
        model = dtree.fit(tbl)
        score = model.score(tbl)
        # assertIsInstance shows the offending object and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(score, pd.Series)
        self.assertEqual(score.loc['Target'], 'MSRP')
        self.assertEqual(score.loc['Level'], 'INTERVAL')
        self.assertEqual(score.loc['NBins'], 100)
        self.assertEqual(score.loc['NObsUsed'], 428)
        for metric in ('AverageSquaredError',
                       'AverageAbsoluteError',
                       'AverageSquaredLogarithmicError',
                       'RootAverageSquaredError',
                       'RootAverageAbsoluteError',
                       'RootAverageSquaredLogarithmicError'):
            self.assertIsInstance(score.loc[metric], float)
Example #13
0
    def test_unload(self):
        """Check that unloading a fitted pipeline removes its last table.

        The last stage's table must exist after ``fit`` and must be gone
        after ``unload`` is called on the fitted pipeline.
        """
        stages = [
            Imputer(Imputer.MEAN),
            Imputer(Imputer.MODE),
            DecisionTree(target='MSRP', nominals=nominals, inputs=inputs),
        ]
        fitted = Pipeline(stages).fit(self.table)

        def last_table_exists():
            # Look the last stage up on every call so the current state is
            # observed.
            return fitted[-1].data.table.tableexists().exists

        self.assertEqual(last_table_exists(), 1)
        fitted.unload()
        self.assertEqual(last_table_exists(), 0)
Example #14
0
    def test_fit(self):
        """Fit a decision tree on 'Cylinders' and inspect the model.

        Verifies the model class name, the class names of its ``data`` and
        ``diagnostics`` attributes, the diagnostics keys, and that the model
        parameters equal the defaults overlaid with the constructor
        arguments.  A second fit with an empty ``nominals`` list checks that
        the model reports ``nominals`` as ``[]``.
        """
        data = self.table

        def tree_settings(nominal_names):
            # Build fresh lists on every call so no list object is shared
            # between the estimator and the expected parameters.
            return dict(target='Cylinders',
                        nominals=nominal_names,
                        inputs=['Make', 'Model', 'Horsepower'])

        expected_params = dtree_defaults.copy()
        expected_params.update(tree_settings(['Make', 'Model']))

        model = DecisionTree(**tree_settings(['Make', 'Model'])).fit(data)
        diagnostics = model.diagnostics

        self.assertEqual(model.__class__.__name__, 'DecisionTreeModel')
        self.assertEqual(model.data.__class__.__name__, 'CASTable')
        self.assertEqual(model.params, expected_params)
        self.assertEqual(diagnostics.__class__.__name__, 'CASResults')
        self.assertEqual(sorted(diagnostics.keys()),
                         ['ModelInfo', 'OutputCasTables'])

        # Empty nominals list: the fitted model must still report [].
        model = DecisionTree(**tree_settings([])).fit(data)
        self.assertEqual(model.params['nominals'], [])
    def test_pipeline(self):
        """Grid-search a pipeline that contains two decision trees.

        The result must hold four rows labelled 'DecisionTree' and four
        labelled 'DecisionTree1', and its 'Parameters' column must contain
        every max_depth/leaf_size combination twice.
        """
        data = self.table

        pipeline = Pipeline([
            Imputer(Imputer.MODE),
            DecisionTree(target='Origin', nominals=nominals, inputs=inputs),
            DecisionTree(target='Origin', nominals=nominals, inputs=inputs),
        ])

        # dict of lists
        grid = {'max_depth': [6, 10], 'leaf_size': [3, 5], 'alpha': 0}

        tuner = HyperParameterTuning(estimator=pipeline, param_grid=grid)
        results = tuner.gridsearch(data)

        self.assertEqual(sorted(results.index),
                         ['DecisionTree'] * 4 + ['DecisionTree1'] * 4)

        def depth_then_leaf(combo):
            return (combo['max_depth'], combo['leaf_size'])

        # Each combination appears twice; the list is already ordered by
        # (max_depth, leaf_size).
        expected = [{'max_depth': depth, 'leaf_size': leaf, 'alpha': 0}
                    for depth in (6, 10)
                    for leaf in (3, 5)
                    for _ in range(2)]

        observed = sorted(results['Parameters'], key=depth_then_leaf)
        self.assertEqual(observed, expected)
Example #16
0
    def test_repr(self):
        """Compare ``repr`` of a three-stage pipeline with its expected text.

        Any ``u'`` prefix in the repr is rewritten to ``'`` before the
        comparison.
        """
        pipeline = Pipeline([
            Imputer(Imputer.MEAN),
            Imputer(Imputer.MODE),
            DecisionTree(target='Origin', nominals=nominals, inputs=inputs),
        ])

        # Adjacent literals are joined by the parser into one string.
        expected = (
            "Pipeline([Imputer(MEAN), Imputer(MODE), "
            "DecisionTree(alpha=0.0, cf_level=0.25, criterion=None, "
            "inputs=['MPG_City', 'MPG_Highway', 'Length', 'Weight', "
            "'Type', 'Cylinders'], leaf_size=5, max_branches=2, "
            "max_depth=6, n_bins=20, nominals=['Type', 'Cylinders', "
            "'Origin'], prune=False, target='Origin', var_importance=False)])"
        )

        actual = repr(pipeline).replace("u'", "'")
        self.assertEqual(actual, expected)
Example #17
0
    def test_transform(self):
        """Transform a table through a mode-imputer / decision-tree pipeline.

        The input's maximum missing-value count must be 2 both before and
        after the call, while the returned ``CASTable`` must have a maximum
        of 0.
        """
        source = self.table

        pipeline = Pipeline([
            Imputer(Imputer.MODE),
            DecisionTree(target='Origin', nominals=nominals, inputs=inputs),
        ])

        self.assertEqual(source.nmiss().max(), 2)

        transformed = pipeline.transform(source)

        self.assertEqual(transformed.__class__.__name__, 'CASTable')
        # The source table must be unchanged by the transform.
        self.assertEqual(source.nmiss().max(), 2)
        self.assertEqual(transformed.nmiss().max(), 0)
Example #18
0
    def test_getitem(self):
        """Index an unfitted pipeline by position.

        Valid positions must return the very objects the pipeline was built
        from; an out-of-range position raises ``IndexError`` and a string
        key raises ``TypeError``.
        """
        mode_imp = Imputer(Imputer.MODE)
        dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

        pipe = Pipeline([mode_imp, dtree])

        # assertIs reports both operands on failure, whereas
        # assertTrue(a is b) only reports "False is not true".
        self.assertIs(pipe[0], mode_imp)
        self.assertIs(pipe[1], dtree)

        with self.assertRaises(IndexError):
            pipe[2]

        with self.assertRaises(TypeError):
            pipe['foo']
Example #19
0
    def test_model_getitem(self):
        """Index a fitted pipeline by position.

        The imputer stage must be the original object, while the estimator
        slot must hold a ``DecisionTreeModel`` that is not the original
        estimator.  An out-of-range position raises ``IndexError`` and a
        string key raises ``TypeError``.
        """
        tbl = self.table

        mode_imp = Imputer(Imputer.MODE)
        dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

        model = Pipeline([mode_imp, dtree]).fit(tbl)

        # assertIs / assertIsNot report both operands on failure, whereas
        # assertTrue(...) only reports "False is not true".
        self.assertIs(model[0], mode_imp)
        self.assertIsNot(model[1], dtree)
        self.assertEqual(model[1].__class__.__name__, 'DecisionTreeModel')

        with self.assertRaises(IndexError):
            model[2]

        with self.assertRaises(TypeError):
            model['foo']
Example #20
0
    def test_classification_score(self):
        """Score a pipeline whose decision tree targets 'Origin'.

        The score must be a ``pd.Series`` that reports the target, a 'CLASS'
        level, the event, the bin and observation counts, and a float for
        each classification metric.
        """
        tbl = self.table

        mean_imp = Imputer(Imputer.MEAN)
        mode_imp = Imputer(Imputer.MODE)
        dtree = DecisionTree(target='Origin', nominals=nominals, inputs=inputs)

        pipe = Pipeline([mean_imp, mode_imp, dtree])

        model = pipe.fit(tbl)
        score = model.score(tbl)

        # assertIsInstance shows the offending object and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(score, pd.Series)
        self.assertEqual(score.loc['Target'], 'Origin')
        self.assertEqual(score.loc['Level'], 'CLASS')
        self.assertEqual(score.loc['Event'], 'USA')
        self.assertEqual(score.loc['NBins'], 100)
        self.assertEqual(score.loc['NObsUsed'], 428)
        for metric in ('AreaUnderROCCurve', 'CRCut', 'KS', 'KSCutOff',
                       'MisClassificationRate'):
            self.assertIsInstance(score.loc[metric], float)
Example #21
0
    def test_params(self):
        """Exercise DecisionTree parameter handling.

        Covers default parameters, constructor arguments, invalid
        constructor arguments, per-fit overrides, and invalid per-fit
        overrides.
        """
        data = self.table
        common = dict(target='Origin', nominals=nominals, inputs=inputs)

        # Check defaults
        self.assertEqual(DecisionTree().params.to_dict(), dtree_defaults)

        # Check constructor parameters
        expected = dtree_defaults.copy()
        expected.update(dict(prune=True, **common))
        tree = DecisionTree(prune=True, **common)
        self.assertEqual(tree.params.to_dict(), expected)

        fitted = tree.fit(data)
        self.assertEqual(fitted.__class__.__name__, 'DecisionTreeModel')
        self.assertEqual(fitted.params, expected)

        # Check constructor parameter error
        with self.assertRaises(ValueError):
            DecisionTree(prune=True, criterion='foo', **common)

        with self.assertRaises(TypeError):
            DecisionTree(foo='bar')

        # Check fit parameter overrides
        expected = dtree_defaults.copy()
        expected.update(dict(max_depth=7, leaf_size=5, **common))

        fitted = tree.fit(data, prune=False, max_depth=7)
        self.assertEqual(fitted.__class__.__name__, 'DecisionTreeModel')
        self.assertEqual(fitted.params, expected)

        # Check parameter overrides error
        with self.assertRaises(TypeError):
            tree.fit(data, prune='foo', max_depth=7)

        with self.assertRaises(KeyError):
            tree.fit(data, foo='bar')
Example #22
0
 def test_unload(self):
     """Check that ``unload`` removes the fitted model's table.

     The table must exist after ``fit`` and must be gone after
     ``model.unload()``.
     """
     fitted = DecisionTree(
         target='Origin',
         nominals=nominals,
         inputs=inputs,
     ).fit(self.table)

     def table_exists():
         # Ask again on every call so the current state is observed.
         return fitted.data.table.tableexists().exists

     self.assertEqual(table_exists(), 1)
     fitted.unload()
     self.assertEqual(table_exists(), 0)
    def test_params(self):
        """Exercise HyperParameterTuning parameter storage and validation.

        Checks the stored estimator, grid and defaults, then the accepted
        forms of ``cv`` (int, float, iterator, list) and the rejected ones
        (negative float, float above 1, string, and the integer 1).
        """
        estimator = DecisionTree()
        param_grid = dict(
            max_depth=[6, 10],
            leaf_size=[3, 5],
        )

        # Basic settings and defaults
        hpt = HyperParameterTuning(estimator=estimator, param_grid=param_grid)
        self.assertEqual(hpt.params['estimator'], estimator)
        self.assertEqual(hpt.params['param_grid'], param_grid)
        self.assertEqual(hpt.params['cv'], 3)
        # assertIsNone reports the actual value on failure, unlike
        # assertTrue(x is None).
        self.assertIsNone(hpt.params['score_type'])

        # cv = int
        hpt = HyperParameterTuning(estimator=estimator,
                                   param_grid=param_grid,
                                   cv=3)
        self.assertEqual(hpt.params['cv'], 3)

        # cv = float
        hpt = HyperParameterTuning(estimator=estimator,
                                   param_grid=param_grid,
                                   cv=0.3)
        self.assertEqual(hpt.params['cv'], 0.3)

        # cv = -float
        with self.assertRaises(ValueError):
            HyperParameterTuning(estimator=estimator,
                                 param_grid=param_grid,
                                 cv=-0.1)

        # cv = float > 1
        with self.assertRaises(ValueError):
            HyperParameterTuning(estimator=estimator,
                                 param_grid=param_grid,
                                 cv=1.0001)

        # cv = iterator
        gen = iter([0])
        hpt = HyperParameterTuning(estimator=estimator,
                                   param_grid=param_grid,
                                   cv=gen)
        self.assertEqual(hpt.params['cv'], gen)

        # cv = list
        items = [0]
        hpt = HyperParameterTuning(estimator=estimator,
                                   param_grid=param_grid,
                                   cv=items)
        self.assertEqual(hpt.params['cv'], items)

        # cv = string
        with self.assertRaises(TypeError):
            HyperParameterTuning(estimator=estimator,
                                 param_grid=param_grid,
                                 cv='foo')

        # cv = 1 (lower than minimum)
        with self.assertRaises(ValueError):
            HyperParameterTuning(estimator=estimator,
                                 param_grid=param_grid,
                                 cv=1)