    def test_default_num_fold(self):
        """Test cross validate with default num_fold parameter"""
        result = self.context.models.cross_validate(
            self.classifier_frame,
            [(self.context.models.classification.svm, {
                "observation_columns":
                ["vec0", "vec1", "vec2", "vec3", "vec4"],
                "label_column": "res",
                "num_iterations": grid_values(5, 100),
                "step_size": 0.01
            }),
             (self.context.models.classification.logistic_regression, {
                 "observation_columns":
                 ["vec0", "vec1", "vec2", "vec3", "vec4"],
                 "label_column": "res",
                 "num_iterations": grid_values(2, 5, 15),
                 "step_size": 0.001
             })],
            verbose=False)

        # validate number of models:
        # 3 default folds x (2 svm + 3 logistic_regression grid points)
        (svm_count, log_count,
         num_models) = self._get_model_counts(result, "svm")
        expected_num_models = 3 * (2 + 3)
        self.assertEqual(num_models, expected_num_models)
        self.assertEqual(svm_count, 6)
        self.assertEqual(log_count, 9)
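    # The count checks in several tests here rely on a _get_model_counts
    # helper whose definition is not shown in this section. A minimal sketch,
    # reconstructed from how the tests consume its return value (assumption:
    # the real helper in the source class may differ in body or details):
    def _get_model_counts(self, result, keyword):
        """Return (models matching keyword, remaining models, total models)."""
        matching_count, other_count, num_models = 0, 0, 0
        for fold in result.all_results:
            grid_points = fold.grid_points
            num_models += len(grid_points)
            for grid_point in grid_points:
                if keyword in grid_point.descriptor.model_type.__name__:
                    matching_count += 1
                else:
                    other_count += 1
        return (matching_count, other_count, num_models)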
    def test_incorrect_hyper_parameter(self):
        """Test that an incorrect hyperparameter name throws an exception"""
        with self.assertRaisesRegexp(Exception, "unknown args named: BAD"):
            self.context.models.grid_search(
                self.classifier_frame, self.classifier_frame,
                [(self.context.models.classification.svm, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "BAD": "res",
                    "num_iterations": grid_values(5, 100),
                    "step_size": 0.01
                }),
                 (self.context.models.classification.logistic_regression, {
                     "observation_columns":
                     ["vec0", "vec1", "vec2", "vec3", "vec4"],
                     "BAD": "res",
                     "num_iterations": grid_values(2, 15),
                     "step_size": 0.001
                 })])
    def test_find_best_classifier_default(self):
        """Test find best in grid_search with default eval function"""
        grid_result = self.context.models.grid_search(
            self.classifier_frame, self.classifier_frame,
            [(self.context.models.classification.svm, {
                "observation_columns":
                ["vec0", "vec1", "vec2", "vec3", "vec4"],
                "label_column": "res",
                "num_iterations": grid_values(5, 10),
                "step_size": 0.01
            }),
             (self.context.models.classification.logistic_regression, {
                 "observation_columns":
                 ["vec0", "vec1", "vec2", "vec3", "vec4"],
                 "label_column": "res",
                 "num_iterations": grid_values(*xrange(2, 15)),
                 "step_size": 0.001
             })])

        best_model = grid_result.find_best()
        self.assertEqual(best_model.descriptor.model_type.__name__,
                         "sparktk.models.classification.logistic_regression")
        self.assertAlmostEqual(best_model.metrics.accuracy,
                               0.87688,
                               delta=0.01)
    def test_two_folds(self):
        """Test cross validate with num_folds = 2"""
        result = self.context.models.cross_validate(
            self.regressor_frame,
            [(self.context.models.regression.linear_regression, {
                "observation_columns": ["feat1", "feat2"],
                "label_column": "class",
                "max_iterations": grid_values(5, 100),
                "reg_param": 0.0001
            }),
             (self.context.models.regression.random_forest_regressor, {
                 "observation_columns": ["feat1", "feat2"],
                 "label_column": "class",
                 "num_trees": grid_values(2, 5, 8),
                 "max_depth": 5
             })],
            verbose=False,
            num_folds=2)

        # validate number of models
        (rf_count, linreg_count,
         num_models) = self._get_model_counts(result, "random")
        expected_num_models = 2 * (2 + 3)  # 2 folds x (2 linreg + 3 rf points)
        self.assertEqual(num_models, expected_num_models)
        self.assertEqual(rf_count, 6)
        self.assertEqual(linreg_count, 4)
    def test_invalid_num_fold(self):
        """Test cross validate with num_fold > number of data points"""
        with self.assertRaisesRegexp(Exception, "empty collection"):
            self.context.models.cross_validate(
                self.classifier_frame,
                [(self.context.models.classification.svm, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "label_column": "res",
                    "num_iterations": grid_values(5, 100),
                    "step_size": 0.01
                }),
                 (self.context.models.classification.logistic_regression, {
                     "observation_columns":
                     ["vec0", "vec1", "vec2", "vec3", "vec4"],
                     "label_column": "res",
                     "num_iterations": grid_values(2, 15),
                     "step_size": 0.001
                 })],
                num_folds=1000000,
                verbose=False)
    def test_all_results_regressors(self):
        """Test number of regressors created given 5 folds """
        result = self.context.models.cross_validate(
            self.regressor_frame,
            [(self.context.models.regression.linear_regression, {
                "observation_columns": ["feat1", "feat2"],
                "label_column": "class",
                "max_iterations": grid_values(*xrange(5, 10)),
                "elastic_net_parameter": 0.001
            }),
             (self.context.models.regression.random_forest_regressor, {
                 "observation_columns": ["feat1", "feat2"],
                 "label_column": "class",
                 "num_trees": grid_values(2, 5, 15),
                 "max_depth": 5
             })],
            num_folds=5,
            verbose=False)

        # validate number of models
        (rf_count, linreg_count,
         num_models) = self._get_model_counts(result, "random_forest")
        expected_num_models = 5 * (5 + 3)  # 5 folds x (5 linreg + 3 rf points)
        self.assertEqual(rf_count + linreg_count, expected_num_models)
        self.assertEqual(rf_count, 15)
        self.assertEqual(linreg_count, 25)
    def test_invalid_model(self):
        """Test cross validate with invalid model"""
        with self.assertRaisesRegexp(Exception, "no attribute 'BAD'"):
            self.context.models.cross_validate(
                self.classifier_frame,
                [(self.context.models.classification.BAD, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "label_column": "res",
                    "num_iterations": grid_values(5, 100),
                    "step_size": 0.01
                }),
                 (self.context.models.classification.logistic_regression, {
                     "observation_columns":
                     ["vec0", "vec1", "vec2", "vec3", "vec4"],
                     "label_column": "res",
                     "num_iterations": grid_values(2, 15),
                     "step_size": 0.001
                 })],
                num_folds=2,
                verbose=False)
    def test_averages_regressors(self):
        """Test ouptut of cross validatation averages for regressors"""
        result = self.context.models.cross_validate(
            self.regressor_frame,
            [(self.context.models.regression.linear_regression, {
                "observation_columns": ["feat1", "feat2"],
                "label_column": "class",
                "max_iterations": grid_values(*xrange(10, 20)),
                "reg_param": 0.001
            }),
             (self.context.models.regression.random_forest_regressor, {
                 "observation_columns": ["feat1", "feat2"],
                 "label_column": "class",
                 "num_trees": grid_values(*xrange(2, 5)),
                 "max_depth": 4
             })],
            num_folds=3,
            verbose=False)

        avg_models = result.averages

        # validate number of averaged models (10 linreg + 3 rf combinations)
        self.assertEqual(len(avg_models.grid_points), 13)

        # validate model with best r2
        best_model = avg_models.find_best()
        self.assertEqual(best_model.descriptor.model_type.__name__,
                         "sparktk.models.regression.random_forest_regressor")
        self.assertAlmostEqual(best_model.metrics.r2, 0.415, delta=0.01)
    def test_averages_classifiers(self):
        """Test ouptut of cross validatation averages for classifiers"""
        result = self.context.models.cross_validate(
            self.classifier_frame,
            [(self.context.models.classification.svm, {
                "observation_columns":
                ["vec0", "vec1", "vec2", "vec3", "vec4"],
                "label_column": "res",
                "num_iterations": grid_values(5, 100),
                "step_size": 0.01
            }),
             (self.context.models.classification.logistic_regression, {
                 "observation_columns":
                 ["vec0", "vec1", "vec2", "vec3", "vec4"],
                 "label_column": "res",
                 "num_iterations": grid_values(2, 15),
                 "step_size": 0.001
             })],
            num_folds=3,
            verbose=False)

        avg_models = result.averages

        # validate number of averaged models (2 svm + 2 logistic combinations)
        self.assertEqual(len(avg_models.grid_points), 4)

        # validate model with best accuracy
        best_model = avg_models.find_best()
        self.assertEqual(best_model.descriptor.model_type.__name__,
                         "sparktk.models.classification.logistic_regression")
        self.assertAlmostEqual(best_model.metrics.accuracy, .87, delta=0.01)
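    # Note on result.averages (used in the two tests above): it appears to
    # collapse the per-fold results into one grid point per hyperparameter
    # combination, with metrics averaged across the folds. This reading is
    # inferred from the expected counts in these tests, not from library
    # documentation.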
    def test_float_num_fold(self):
        """Test cross validate with float num_fold"""
        with self.assertRaisesRegexp(Exception,
                                     "integer argument expected, got float"):
            self.context.models.cross_validate(
                self.classifier_frame,
                [(self.context.models.classification.svm, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "label_column": "res",
                    "num_iterations": grid_values(5, 100),
                    "step_size": 0.01
                }),
                 (self.context.models.classification.logistic_regression, {
                     "observation_columns":
                     ["vec0", "vec1", "vec2", "vec3", "vec4"],
                     "label_column": "res",
                     "num_iterations": grid_values(2, 15),
                     "step_size": 0.001
                 })],
                num_folds=2.5,
                verbose=False)
    def test_grid_points_regressors(self):
        """Test output of grid search on regressors"""
        grid_result = self.context.models.grid_search(
            self.regressor_frame, self.regressor_frame,
            [(self.context.models.regression.linear_regression, {
                "observation_columns": ["feat1", "feat2"],
                "label_column": "class",
                "max_iterations": grid_values(5, 50),
                "elastic_net_parameter": 0.001
            }),
             (self.context.models.regression.random_forest_regressor, {
                 "observation_columns": ["feat1", "feat2"],
                 "label_column": "class",
                 "num_trees": grid_values(2, 4),
                 "max_depth": 5
             })])

        grid_points = grid_result.grid_points

        # validate number of items in grid
        self.assertEqual(len(grid_points), 4)

        # validate one of the models' name
        self.assertEqual(grid_points[0].descriptor.model_type.__name__,
                         "sparktk.models.regression.linear_regression")

        # validate grid values of the first model
        linreg_kwargs_0 = grid_points[0].descriptor.kwargs
        self.assertEqual(linreg_kwargs_0['max_iterations'], 5)
        self.assertEqual(linreg_kwargs_0['elastic_net_parameter'], 0.001)
        self.assertEqual(linreg_kwargs_0['label_column'], "class")
        self.assertItemsEqual(linreg_kwargs_0['observation_columns'],
                              ["feat1", "feat2"])

        # validate grid values of the third model
        rf_kwargs_1 = grid_points[2].descriptor.kwargs
        self.assertEqual(rf_kwargs_1['num_trees'], 2)
        self.assertEqual(rf_kwargs_1['max_depth'], 5)
        self.assertEqual(rf_kwargs_1['label_column'], "class")
        self.assertItemsEqual(rf_kwargs_1['observation_columns'],
                              ["feat1", "feat2"])

        # validate r2 metric of one of the models
        self.assertAlmostEqual(grid_points[1].metrics.r2,
                               1.59183568639e-05,
                               delta=1e-04)
    def test_single_fold(self):
        """Test cross validate with num_folds = 1; should throw exception"""
        with self.assertRaises(Exception):
            self.context.models.cross_validate(
                self.regressor_frame,
                [(self.context.models.regression.linear_regression, {
                    "observation_columns": ["feat1", "feat2"],
                    "label_column": "class",
                    "max_iterations": grid_values(5, 100),
                    "reg_param": 0.0001
                }),
                 (self.context.models.regression.random_forest_regressor, {
                     "observation_columns": ["feat1", "feat2"],
                     "label_column": "class",
                     "num_trees": grid_values(2, 5, 8),
                     "max_depth": 5
                 })],
                verbose=False,
                num_folds=1)
    def test_missing_test_frame(self):
        """Test grid search throws exception for missing test frame"""
        with self.assertRaisesRegexp(Exception, "takes at least 3 arguments"):
            self.context.models.grid_search(
                self.frame, [(self.context.models.classification.svm, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "label_column": "res",
                    "num_iterations": grid_values(1, 4),
                    "step_size": 0.001
                })])
    def test_bad_model_name(self):
        """Test grid search throws exception for invalid model name"""
        with self.assertRaisesRegexp(Exception, "no attribute 'BAD'"):
            self.context.models.grid_search(
                self.frame, [(self.context.models.classification.BAD, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "label_column": "res",
                    "num_iterations": grid_values(1, 4),
                    "step_size": 0.001
                })])
    def test_grid_values_with_xrange(self):
        """Test grid values with xrange"""
        grid_result = self.context.models.grid_search(
            self.classifier_frame, self.classifier_frame,
            [(self.context.models.classification.logistic_regression, {
                "observation_columns":
                ["vec0", "vec1", "vec2", "vec3", "vec4"],
                "label_column": "res",
                "num_iterations": grid_values(*xrange(5, 10)),
                "step_size": 0.001
            })])

        # validate number of models in the grid: xrange(5, 10) yields 5 values
        self.assertEqual(len(grid_result.grid_points), 5)
    def test_invalid_eval_name(self):
        """Test grid search throws exception for invalid model name"""
        grid_result = self.context.models.grid_search(
            self.classifier_frame, self.classifier_frame,
            [(self.context.models.classification.svm, {
                "observation_columns":
                ["vec0", "vec1", "vec2", "vec3", "vec4"],
                "label_column": "res",
                "num_iterations": grid_values(5, 10),
                "step_size": 0.01
            }),
             (self.context.models.classification.logistic_regression, {
                 "observation_columns":
                 ["vec0", "vec1", "vec2", "vec3", "vec4"],
                 "label_column": "res",
                 "num_iterations": grid_values(*xrange(2, 15)),
                 "step_size": 0.001
              })],
            lambda a, b:
                getattr(a, "root_mean_squared_error") <
                getattr(b, "root_mean_squared_error"))

        with self.assertRaisesRegexp(
                Exception, "no attribute 'root_mean_squared_error'"):
            grid_result.find_best()
    def test_find_best_regressor_with_eval(self):
        """Test find best in grid_search with custom eval function"""
        grid_result = self.context.models.grid_search(
            self.regressor_frame, self.regressor_frame,
            [(self.context.models.regression.linear_regression, {
                "observation_columns": ["feat1", "feat2"],
                "label_column": "class",
                "max_iterations": grid_values(*(5, 50)),
                "elastic_net_parameter": 0.001
            }),
             (self.context.models.regression.random_forest_regressor, {
                 "observation_columns": ["feat1", "feat2"],
                 "label_column": "class",
                 "max_depth": grid_values(*xrange(2, 10)),
                 "num_trees": 2
              })],
            lambda a, b:
                getattr(a, "root_mean_squared_error") <
                getattr(b, "root_mean_squared_error"))

        best_model = grid_result.find_best()
        self.assertEqual(best_model.descriptor.model_type.__name__,
                         "sparktk.models.regression.random_forest_regressor")
        self.assertAlmostEqual(best_model.metrics.root_mean_squared_error,
                               0.37,
                               delta=0.01)
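    # Note on the custom eval function above: find_best applies it as an
    # "a is better than b" comparator over the models' metrics, so this lambda
    # prefers the model with the lower root_mean_squared_error. An equivalent
    # form (an inference from these tests, not documented API) would be:
    #     lambda a, b: a.root_mean_squared_error < b.root_mean_squared_error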
    def test_bad_data_type_in_grid_values(self):
        """Test invalid parameter to grid_values throws exception"""
        with self.assertRaisesRegexp(Exception, "Method .* does not exist"):
            self.context.models.grid_search(
                self.classifier_frame, self.classifier_frame,
                [(self.context.models.classification.svm, {
                    "observation_columns":
                    ["vec0", "vec1", "vec2", "vec3", "vec4"],
                    "label_column": "res",
                    "num_iterations": grid_values("one"),
                    "step_size": 0.001
                })])
    def test_grid_points_classifiers(self):
        """Test output of grid search on svm and logistic regression"""
        grid_result = self.context.models.grid_search(
            self.classifier_frame, self.classifier_frame,
            [(self.context.models.classification.svm, {
                "observation_columns":
                ["vec0", "vec1", "vec2", "vec3", "vec4"],
                "label_column": "res",
                "num_iterations": grid_values(5, 100),
                "step_size": 0.01
            }),
             (self.context.models.classification.logistic_regression, {
                 "observation_columns":
                 ["vec0", "vec1", "vec2", "vec3", "vec4"],
                 "label_column": "res",
                 "num_iterations": grid_values(2, 15),
                 "step_size": 0.001
             })])

        grid_points = grid_result.grid_points

        # validate number of items in grid
        self.assertEqual(len(grid_points), 4)

        # validate one of the models' name
        self.assertEqual(grid_points[0].descriptor.model_type.__name__,
                         "sparktk.models.classification.svm")

        # validate grid values of the first model
        svm_kwargs_0 = grid_points[0].descriptor.kwargs
        self.assertEqual(svm_kwargs_0['num_iterations'], 5)
        self.assertEqual(svm_kwargs_0['step_size'], 0.01)
        self.assertEqual(svm_kwargs_0['label_column'], "res")
        self.assertItemsEqual(svm_kwargs_0['observation_columns'],
                              ["vec0", "vec1", "vec2", "vec3", "vec4"])

        # validate grid values of the second model
        svm_kwargs_1 = grid_points[1].descriptor.kwargs
        self.assertEqual(svm_kwargs_1['num_iterations'], 100)
        self.assertEqual(svm_kwargs_1['step_size'], 0.01)
        self.assertEqual(svm_kwargs_1['label_column'], "res")
        self.assertItemsEqual(svm_kwargs_1['observation_columns'],
                              ["vec0", "vec1", "vec2", "vec3", "vec4"])

        # validate grid values of the third model
        lr_kwargs_0 = grid_points[2].descriptor.kwargs
        self.assertEqual(lr_kwargs_0['num_iterations'], 2)
        self.assertEqual(lr_kwargs_0['step_size'], 0.001)
        self.assertEqual(lr_kwargs_0['label_column'], "res")
        self.assertItemsEqual(lr_kwargs_0['observation_columns'],
                              ["vec0", "vec1", "vec2", "vec3", "vec4"])

        # validate grid values of the fourth model
        lr_kwargs_1 = grid_points[3].descriptor.kwargs
        self.assertEqual(lr_kwargs_1['num_iterations'], 15)
        self.assertEqual(lr_kwargs_1['step_size'], 0.001)
        self.assertEqual(lr_kwargs_1['label_column'], "res")
        self.assertItemsEqual(lr_kwargs_1['observation_columns'],
                              ["vec0", "vec1", "vec2", "vec3", "vec4"])

        # validate accuracy metric of one of the models
        self.assertEqual(grid_points[2].metrics.accuracy, 0.8745)