def test_invalids(self):
    """Confirm split_optimize_classify raises ValueError on invalid input."""
    estimator, param_dist, tuned = _set_parameters_and_estimator(
        'RandomForestClassifier', self.table_chard_fp, self.md_chard_fp,
        'Region', n_estimators=10, n_jobs=1, cv=1, random_state=123,
        parameter_tuning=False, classification=True)
    regressor, param_dist, tuned = _set_parameters_and_estimator(
        'RandomForestRegressor', self.table_chard_fp, self.md_chard_fp,
        'Region', n_estimators=10, n_jobs=1, cv=1, random_state=123,
        parameter_tuning=False, classification=True)
    # keyword arguments identical across all three failure scenarios
    shared = dict(
        cv=1, random_state=123, n_jobs=1,
        optimize_feature_selection=False, parameter_tuning=False,
        param_dist=None, calc_feature_importance=False)
    # zero samples: mapping file and table share no sample IDs
    with self.assertRaisesRegex(ValueError, "metadata"):
        split_optimize_classify(
            self.table_ecam_fp, self.md_chard_fp, 'Region', estimator,
            self.temp_dir.name, test_size=0.5, **shared)
    # too few samples remain to stratify the split
    with self.assertRaisesRegex(ValueError, "metadata"):
        split_optimize_classify(
            self.table_chard_fp, self.md_chard_fp, 'Region', estimator,
            self.temp_dir.name, test_size=0.9, **shared)
    # a regressor passed where a classification problem is expected
    with self.assertRaisesRegex(ValueError, "convert"):
        split_optimize_classify(
            self.table_chard_fp, self.md_chard_fp, 'Region', regressor,
            self.temp_dir.name, test_size=0.5, **shared)
 def test_invalids(self):
     """Build the classifier and regressor used for invalid-input checks.

     NOTE(review): this snippet appears truncated — the estimators are
     constructed but never exercised; confirm against the full test file.
     """
     estimator, param_dist, tuned = _set_parameters_and_estimator(
         'RandomForestClassifier', self.table_chard_fp, self.md_chard_fp,
         'Region', n_estimators=10, n_jobs=1, cv=1, random_state=123,
         parameter_tuning=False, classification=True,
         missing_samples='ignore')
     regressor, param_dist, tuned = _set_parameters_and_estimator(
         'RandomForestRegressor', self.table_chard_fp, self.md_chard_fp,
         'Region', n_estimators=10, n_jobs=1, cv=1, random_state=123,
         parameter_tuning=False, classification=True,
         missing_samples='ignore')
# Beispiel #3 ("Example #3" — snippet-aggregator separator; stray "0" vote count removed)
 def test_feature_ordering(self):
     """Feature importances stay aligned with the feature-table columns.

     Replays the minimal split/optimize/fit pipeline by hand so the
     importance table can be compared against the source feature table.
     """
     estimator, param_dist, tuned = _set_parameters_and_estimator(
         'RandomForestRegressor', self.table_ecam_fp, self.md_ecam_fp,
         'month', n_estimators=10, n_jobs=1, cv=1, random_state=123,
         parameter_tuning=False, classification=False)
     X_train, X_test, y_train, y_test = _prepare_training_data(
         self.table_ecam_fp, self.md_ecam_fp, 'month', test_size=0.1,
         random_state=123, load_data=True, stratify=False)
     X_train, X_test, importance = _optimize_feature_selection(
         self.temp_dir.name, X_train, X_test, y_train, estimator,
         cv=3, step=0.2, n_jobs=1)
     estimator, accuracy, y_pred = _fit_and_predict(
         X_train, X_test, y_train, y_test, estimator,
         scoring=mean_squared_error)
     # pull the important features back out of the original table
     importances = _calculate_feature_importances(X_train, estimator)
     reordered = self.table_ecam_fp.loc[:, importances["feature"]]
     # column (feature) ordering of both frames must agree exactly
     self.assertEqual(list(X_train.columns.values),
                      list(reordered.columns.values))
# Beispiel #4 ("Example #4" — snippet-aggregator separator; stray "0" vote count removed)
 def test_regressors(self):
     """Each supported regressor reproduces its seeded accuracy score."""
     regressors = ('RandomForestRegressor', 'ExtraTreesRegressor',
                   'GradientBoostingRegressor', 'AdaBoostRegressor',
                   'Lasso', 'Ridge', 'ElasticNet', 'KNeighborsRegressor',
                   'LinearSVR', 'SVR')
     for name in regressors:
         # each regressor writes its artifacts into its own subdirectory
         outdir = join(self.temp_dir.name, name)
         mkdir(outdir)
         estimator, param_dist, tuned = _set_parameters_and_estimator(
             name, self.table_ecam_fp, self.md_ecam_fp, 'month',
             n_estimators=10, n_jobs=1, cv=1, random_state=123,
             parameter_tuning=False, classification=False)
         estimator, cm, accuracy, importances = split_optimize_classify(
             self.table_ecam_fp, self.md_ecam_fp, 'month', estimator,
             outdir, test_size=0.5, cv=1, random_state=123, n_jobs=1,
             optimize_feature_selection=False, parameter_tuning=False,
             param_dist=None, classification=False,
             calc_feature_importance=False, scoring=mean_squared_error)
         expected = seeded_results[name]
         self.assertAlmostEqual(
             accuracy, expected, places=4,
             msg='Accuracy of %s regressor was %f, but expected %f' %
             (name, accuracy, expected))
 def test_classifiers(self):
     """Each supported classifier reproduces its seeded accuracy score."""
     for classifier in ['RandomForestClassifier', 'ExtraTreesClassifier',
                        'GradientBoostingClassifier', 'AdaBoostClassifier',
                        'LinearSVC', 'SVC', 'KNeighborsClassifier']:
         # each classifier writes its artifacts into its own subdirectory
         tmpd = join(self.temp_dir.name, classifier)
         mkdir(tmpd)
         estimator, param_dist, tuned = _set_parameters_and_estimator(
             classifier, self.table_chard_fp, self.md_chard_fp, 'Region',
             n_estimators=10, n_jobs=1, cv=1,
             random_state=123, parameter_tuning=False, classification=True)
         estimator, cm, accuracy, importances = split_optimize_classify(
             self.table_chard_fp, self.md_chard_fp, 'Region', estimator,
             tmpd, test_size=0.5, cv=1, random_state=123,
             n_jobs=1, optimize_feature_selection=False,
             parameter_tuning=False, param_dist=None,
             calc_feature_importance=False)
         # Single assertion with an explicit tolerance. A previous duplicate
         # call without places= used assertAlmostEqual's stricter default of
         # 7 decimal places, defeating the intended places=4 tolerance and
         # failing without the diagnostic message; it has been removed
         # (matching the pattern used in test_regressors).
         self.assertAlmostEqual(
             accuracy, seeded_results[classifier], places=4,
             msg='Accuracy of %s classifier was %f, but expected %f' % (
                 classifier, accuracy, seeded_results[classifier]))