Example #1
0
 def test_predict_regressions(self):
     """Check that every listed regressor reproduces its seeded MSE.

     Each estimator name is fit with ``fit_regressor`` on the
     ``table_ecam_fp`` / ``mdc_ecam_fp`` fixtures, used to predict on that
     same table, and the mean squared error against the metadata values
     is compared with the matching entry of ``seeded_predict_results``.
     """
     regressor_names = (
         'RandomForestRegressor', 'ExtraTreesRegressor',
         'GradientBoostingRegressor', 'AdaBoostRegressor',
         'Lasso', 'Ridge', 'ElasticNet',
         'KNeighborsRegressor', 'SVR', 'LinearSVR',
     )
     for name in regressor_names:
         estimator, _ = fit_regressor(
             self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
             n_estimators=2, estimator=name, n_jobs=1,
             missing_samples='ignore')
         predicted = predict_regression(self.table_ecam_fp, estimator)
         expected = self.mdc_ecam_fp.to_series()
         # Not every predicted sample is present in the metadata (hence
         # missing_samples='ignore'), so compare only the shared sample IDs.
         shared_ids = predicted.index.intersection(expected.index)
         predicted = predicted.loc[shared_ids]
         expected = expected.loc[shared_ids]
         # The expected MSE values are mostly quite high, as we would
         # expect.
         observed_mse = mean_squared_error(expected, predicted)
         target_mse = seeded_predict_results[name]
         # TODO: Remove the Ridge special case when
         # https://github.com/qiime2/q2-sample-classifier/issues/193 is
         # closed
         tolerance = {'places': 4} if name == 'Ridge' else {}
         self.assertAlmostEqual(
             observed_mse, target_mse,
             msg='Accuracy of %s regressor was %f, but expected %f' % (
                 name, observed_mse, target_mse),
             **tolerance)
    def test_predict_feature_order_aint_no_thing(self):
        """Check that predictions do not depend on the table's feature order.

        A regressor is fit on ``table_ecam_fp``, the table's features are
        reordered at random, and the MSE of predictions made on the
        reordered table must still match the seeded
        ``'RandomForestRegressor'`` entry of ``seeded_predict_results``.
        """
        table = self.table_ecam_fp
        estimator, _ = fit_regressor(table,
                                     self.mdc_ecam_fp,
                                     random_state=123,
                                     n_estimators=2,
                                     n_jobs=1,
                                     missing_samples='ignore')

        # Randomly reorder features in the biom table using a shuffled COPY
        # of the feature IDs. Shuffling the array returned by ``table.ids``
        # in place would scramble the fixture's own ID labels if that array
        # is the table's internal storage.
        # NOTE(review): biom's Table.ids appears to return its internal
        # array without copying -- confirm.
        # look ma no seed! we should get the same result no matter the order.
        feature_ids = np.random.permutation(table.ids(axis='observation'))
        shuffled_table = table.sort_order(feature_ids, axis='observation')

        # now predict values on shuffled data
        pred = predict_regression(shuffled_table, estimator)
        exp = self.mdc_ecam_fp.to_series()
        # reindex both pred and exp because not all samples present in pred
        # are present in the metadata! (hence missing_samples='ignore')
        sample_ids = pred.index.intersection(exp.index)
        pred = pred.loc[sample_ids]
        exp = exp.loc[sample_ids]
        # test that expected MSE is achieved (these are mostly quite high
        # as we would expect)
        mse = mean_squared_error(exp, pred)
        self.assertAlmostEqual(mse,
                               seeded_predict_results['RandomForestRegressor'])
Example #3
0
 def test_fit_regressor(self):
     """Check fit_regressor's importances against the stored table.

     The regressor is fit on the ``table_ecam_fp`` / ``mdc_ecam_fp``
     fixtures with a fixed random state, and the returned importances
     frame must equal the contents of ``importance_cv.tsv``.
     """
     # Only the importances are checked; the fitted pipeline is unused.
     _pipeline, observed = fit_regressor(
         self.table_ecam_fp,
         self.mdc_ecam_fp,
         random_state=123,
         n_estimators=2,
         n_jobs=1,
         missing_samples='ignore',
     )
     expected_path = self.get_data_path('importance_cv.tsv')
     expected = pd.read_csv(expected_path, sep='\t', header=0, index_col=0)
     pdt.assert_frame_equal(observed, expected)