Code example #1
0
    def setUp(self):
        """Fit a seeded classifier on the chardonnay test data and stash
        the resulting pipeline (and its pickled form) on the test case."""
        super().setUp()

        # Load a feature-table artifact from test data as a biom.Table.
        def _read_table(path):
            artifact = qiime2.Artifact.load(self.get_data_path(path))
            return artifact.view(biom.Table)

        # Read a TSV metadata file and wrap a single column as a
        # CategoricalMetadataColumn.
        def _read_column(path, name):
            frame = pd.read_csv(self.get_data_path(path), sep='\t',
                                header=0, index_col=0)
            return qiime2.CategoricalMetadataColumn(frame[name])

        chard_table = _read_table('chardonnay.table.qza')
        chard_metadata = _read_column('chardonnay.map.txt', 'Region')

        # Seeded fit so downstream assertions are deterministic; some
        # samples are absent from the metadata, hence 'ignore'.
        pipeline, importances = fit_classifier(
            chard_table, chard_metadata, random_state=123, n_estimators=2,
            n_jobs=1, optimize_feature_selection=True, parameter_tuning=True,
            missing_samples='ignore')
        transformer = self.get_transformer(Pipeline, SampleEstimatorDirFmt)
        self._sklp = transformer(pipeline)
        pickled = self._sklp.sklearn_pipeline.view(PickleFormat)
        self.sklearn_pipeline = str(pickled)
        self.pipeline = pipeline
Code example #2
0
 def test_predict_classifications(self):
     """Each supported classifier, fit with a fixed seed, must reproduce
     the expected number of correct predictions on the chardonnay data."""
     classifiers = ('RandomForestClassifier', 'ExtraTreesClassifier',
                    'GradientBoostingClassifier', 'AdaBoostClassifier',
                    'LinearSVC', 'SVC', 'KNeighborsClassifier')
     for name in classifiers:
         estimator, importances = fit_classifier(
             self.table_chard_fp, self.mdc_chard_fp, random_state=123,
             n_estimators=2, estimator=name, n_jobs=1,
             missing_samples='ignore')
         pred, prob = predict_classification(self.table_chard_fp, estimator)
         exp = self.mdc_chard_fp.to_series().reindex(pred.index).dropna()
         # Align on shared sample IDs: not every predicted sample appears
         # in the metadata (that is why missing_samples='ignore' above).
         shared_ids = pred.index.intersection(exp.index)
         pred = pred.loc[shared_ids]
         exp = exp.loc[shared_ids]
         # Expected hit counts are mostly quite high (total n=21).
         hits = np.sum(pred == exp)
         self.assertEqual(
             hits,
             seeded_predict_results[name],
             msg='Accuracy of %s classifier was %f, but expected %f' %
             (name, hits, seeded_predict_results[name]))
Code example #3
0
 def test_fit_classifier(self):
     """Smoke test: a seeded fit with parameter tuning and feature
     selection enabled must complete without raising."""
     pipeline, importances = fit_classifier(
         self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
         n_estimators=2, n_jobs=1, optimize_feature_selection=True,
         parameter_tuning=True, missing_samples='ignore')
Code example #4
0
 def test_predict_classifications(self):
     """Seeded predictions must match the expected accuracy, and the
     predicted class probabilities must all lie in [0, 1], for every
     supported classifier type."""
     for classifier in [
             'RandomForestClassifier', 'ExtraTreesClassifier',
             'GradientBoostingClassifier', 'AdaBoostClassifier',
             'LinearSVC', 'SVC', 'KNeighborsClassifier'
     ]:
         estimator, importances = fit_classifier(self.table_chard_fp,
                                                 self.mdc_chard_fp,
                                                 random_state=123,
                                                 n_estimators=2,
                                                 estimator=classifier,
                                                 n_jobs=1,
                                                 missing_samples='ignore')
         pred, prob = predict_classification(self.table_chard_fp, estimator)
         exp = self.mdc_chard_fp.to_series().reindex(pred.index).dropna()
         # reindex both pred and exp because not all samples present in pred
         # are present in the metadata! (hence missing_samples='ignore')
         sample_ids = pred.index.intersection(exp.index)
         pred = pred.loc[sample_ids]
         exp = exp.loc[sample_ids]
         # verify predictions:
         # test that expected number of correct results is achieved (these
         # are mostly quite high as we would expect (total n=21))
         correct_results = np.sum(pred == exp)
         self.assertEqual(
             correct_results,
             seeded_predict_results[classifier],
             msg='Accuracy of %s classifier was %f, but expected %f' %
             (classifier, correct_results,
              seeded_predict_results[classifier]))
         # verify probabilities:
         # test whether all are in the correct range (0 to 1).
         # NOTE: boolean `inclusive` was deprecated in pandas 1.3 and
         # removed in pandas 2.0; 'both' is the supported equivalent.
         ls_pred_classes = prob.columns.tolist()
         ls_correct_range = [
             col for col in ls_pred_classes
             if prob[col].between(0, 1, inclusive='both').all()
         ]
         self.assertEqual(len(ls_correct_range),
                          prob.shape[1],
                          msg='Predicted probabilities of class {} '
                          'are not in range [0,1]'.format([
                              col for col in ls_pred_classes
                              if col not in ls_correct_range
                          ]))