def test_predict_regressions(self):
    """Check that every supported regressor reproduces its seeded MSE.

    Each estimator name is fit with ``fit_regressor`` using fixed settings
    (``random_state=123``, ``n_estimators=2``), used to predict on the same
    table, and the mean squared error against the metadata values is
    compared with the entry stored in ``seeded_predict_results``.
    """
    regressors = [
        'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'AdaBoostRegressor', 'Lasso', 'Ridge',
        'ElasticNet', 'KNeighborsRegressor', 'SVR', 'LinearSVR',
    ]
    for regressor in regressors:
        # subTest reports each regressor separately, so one failing
        # estimator does not hide the results of the remaining ones.
        with self.subTest(regressor=regressor):
            estimator, _ = fit_regressor(
                self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
                n_estimators=2, estimator=regressor, n_jobs=1,
                missing_samples='ignore')
            pred = predict_regression(self.table_ecam_fp, estimator)
            exp = self.mdc_ecam_fp.to_series()
            # reindex both pred and exp because not all samples present in
            # pred are present in the metadata! (hence
            # missing_samples='ignore')
            sample_ids = pred.index.intersection(exp.index)
            pred = pred.loc[sample_ids]
            exp = exp.loc[sample_ids]
            # test that expected MSE is achieved (these are mostly quite
            # high as we would expect)
            mse = mean_squared_error(exp, pred)
            expected_mse = seeded_predict_results[regressor]
            # TODO: Remove this conditional when
            # https://github.com/qiime2/q2-sample-classifier/issues/193 is
            # closed
            # Ridge is compared at a looser precision; 7 places is the
            # assertAlmostEqual default used for every other regressor.
            places = 4 if regressor == 'Ridge' else 7
            self.assertAlmostEqual(
                mse, expected_mse, places=places,
                msg='MSE of %s regressor was %f, but expected %f' % (
                    regressor, mse, expected_mse))
def test_predict_feature_order_aint_no_thing(self):
    """Check that predictions do not depend on the table's feature order.

    A regressor is fit on the original table, the table's observation
    (feature) axis is randomly reordered, and the MSE of predictions made on
    the reordered table must match the seeded value stored for
    'RandomForestRegressor' in ``seeded_predict_results``.
    """
    table = self.table_ecam_fp
    estimator, _ = fit_regressor(
        table, self.mdc_ecam_fp, random_state=123, n_estimators=2, n_jobs=1,
        missing_samples='ignore')
    # randomly shuffle and reorder features in biom table.
    # look ma no seed! we should get the same result no matter the order.
    # np.random.permutation shuffles a copy. Shuffling the array returned by
    # table.ids() in place could scramble the ids of the shared fixture
    # table itself if ids() hands back its internal array (as biom's Table
    # appears to do -- confirm against the biom documentation).
    feature_ids = np.random.permutation(table.ids(axis='observation'))
    shuffled_table = table.sort_order(feature_ids, axis='observation')
    # now predict values on shuffled data
    pred = predict_regression(shuffled_table, estimator)
    exp = self.mdc_ecam_fp.to_series()
    # reindex both pred and exp because not all samples present in pred
    # are present in the metadata! (hence missing_samples='ignore')
    sample_ids = pred.index.intersection(exp.index)
    pred = pred.loc[sample_ids]
    exp = exp.loc[sample_ids]
    # test that expected MSE is achieved (these are mostly quite high
    # as we would expect)
    mse = mean_squared_error(exp, pred)
    self.assertAlmostEqual(
        mse, seeded_predict_results['RandomForestRegressor'])
def test_fit_regressor(self):
    """Check that fit_regressor yields the stored feature importances.

    The importances returned for the fixture table and metadata are compared
    frame-for-frame with the contents of 'importance_cv.tsv'.
    """
    fit_kwargs = {
        'random_state': 123,
        'n_estimators': 2,
        'n_jobs': 1,
        'missing_samples': 'ignore',
    }
    # Only the importances are checked; the fitted pipeline is discarded.
    _, observed = fit_regressor(
        self.table_ecam_fp, self.mdc_ecam_fp, **fit_kwargs)

    expected_path = self.get_data_path('importance_cv.tsv')
    expected = pd.read_csv(expected_path, sep='\t', header=0, index_col=0)

    pdt.assert_frame_equal(observed, expected)