Code example #1
    def test_BagScorer_metric(self):
        """Define scoring functions, such as accuracy or recall,
        which will be used to score how well single-instance inference
        performs on the bag classification task

        The scoring functions have some requirements -
        a) They are passed to BagScorer on initialization
        b) Must have a method "_score_func" with a signature f(y_true, y_pred)
            (This is provided by default when using sklearn.metrics.make_scorer)

        Successful conditions:
            The BagScorer must report the same performance metrics as when
            the metrics are calculated manually
        This tests whether the BagScorer properly fits, trains, and evaluates
        the estimator passed to it
        """

        # Generate a scoring metric for the bag scorer
        accuracy_scorer = make_scorer(accuracy_score)
        self.assertTrue(hasattr(accuracy_scorer, '_score_func'),
                        msg='accuracy scorer must have a _score_func attribute')

        # Generate some data
        train_bags, train_labels = self.train_bags, self.train_labels
        test_bags, test_labels = self.test_bags, self.test_labels

        # Create a dummy estimator
        dumb = DummyClassifier(strategy='constant', constant=1)

        # Flatten bags into single-instance data (concatenate along the first axis)
        SI_train, SI_train_labels = bags_2_si(train_bags, train_labels)
        SI_test, SI_test_labels = bags_2_si(test_bags, test_labels)
        dumb.fit(SI_train, SI_train_labels)
        pred_test = dumb.predict(SI_test)
        pred_train = dumb.predict(SI_train)
        """Calculate the correct number of predictions based on dummy classifier
        The dummy classifier predicts 1 always (constant)
        The training set bas """
        pct_train = sum(train_labels) / len(train_labels)
        pct_test = sum(test_labels) / len(test_labels)
        dumb_accuracy_train = accuracy_score(SI_train_labels, pred_train)
        dumb_accuracy_test = accuracy_score(SI_test_labels, pred_test)

        # Test custom scorer, with the same dummy estimator
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)
        estimator = bagAccScorer.estimator_fit(dumb, train_bags, train_labels)
        test_score = bagAccScorer(estimator, test_bags, test_labels)
        train_score = bagAccScorer(estimator, train_bags, train_labels)
        """test_score should output the accuracy for predictions among bags
        The test_score for bagScorer should be equal to the dumb_accuracy_test
        because bag labels are reduced by the most frequest SI prediction

        If all SI labels are predicted + then all bags will be predicted +
        The accuracy of bag labels reduced by BagScorer will be equal to
        percent of bag labels that are positive"""

        self.assertEqual(test_score, pct_test)
        self.assertEqual(train_score, pct_train)
        self.assertEqual(pct_train, dumb_accuracy_train)
        self.assertEqual(pct_test, dumb_accuracy_test)
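
The equalities asserted above rest on one identity: a classifier that always predicts 1 turns every bag prediction positive, so bag accuracy collapses to the fraction of positive bag labels. A standalone check of that identity on toy labels (not the test fixture) might look like this:

import numpy as np
from sklearn.metrics import accuracy_score

bag_labels = np.array([1, 0, 1, 1, 0])            # toy binary bag labels
constant_predictions = np.ones_like(bag_labels)   # what the constant dummy classifier emits
# accuracy against an all-ones prediction is exactly the positive fraction
assert np.isclose(accuracy_score(bag_labels, constant_predictions), bag_labels.mean())
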
Code example #2
    def test_BagScorer(self):
        """Define scoring functions, such as accuracy or recall,
        which will be used to score how well single-instance inference
        performs on the bag classification task

        The scoring functions have some requirements -
        a) They are passed to BagScorer on initialization
        b) Must have a method "_score_func" with a signature f(y_true, y_pred)
            (This is provided by default when using sklearn.metrics.make_scorer)

        """

        # Create scoring metrics, and load scoring metric into BagScorer
        accuracy_scorer = make_scorer(accuracy_score, normalize=True)
        precision_scorer = make_scorer(precision_score, average='weighted')
        recall_scorer = make_scorer(recall_score, average='weighted')
        # make_scorer stores extra keyword arguments on the scorer in _kwargs
        self.assertDictContainsSubset({'normalize': True},
                                      accuracy_scorer._kwargs)
        self.assertIn('_score_func', accuracy_scorer.__dict__.keys())

        # Dummy data
        train_bags, train_labels = self.train_bags, self.train_labels
        test_bags, test_labels = self.test_bags, self.test_labels

        # Create a single-instance estimator
        compNB = ComplementNB(alpha=1.0,
                              fit_prior=True,
                              class_prior=None,
                              norm=False)

        # Test custom scorer
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)
        bagPrecisionScorer = BagScorer(precision_scorer, sparse=True)
        bagRecallScorer = BagScorer(recall_scorer, sparse=True)
        estimator = bagAccScorer.estimator_fit(compNB, train_bags,
                                               train_labels)

        # The same fitted estimator is reused for all scorers
        accuracy = bagAccScorer(estimator, test_bags, test_labels)
        precision = bagPrecisionScorer(estimator, test_bags, test_labels)
        recall = bagRecallScorer(estimator, test_bags, test_labels)

        self.assertIsInstance(accuracy, float)
        self.assertLess(accuracy, 1)
        self.assertGreater(accuracy, 0)

        self.assertIsInstance(precision, float)
        self.assertLess(precision, 1)
        self.assertGreater(precision, 0)

        self.assertIsInstance(recall, float)
        self.assertLess(recall, 1)
        self.assertGreater(recall, 0)

        return None
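
The assertions on `_kwargs` and `_score_func` above lean on what `sklearn.metrics.make_scorer` attaches to the scorer object. Those are private attributes of scikit-learn's scorer, so the sketch below is only an illustration of that convention, not a stable API:

from sklearn.metrics import accuracy_score, make_scorer

scorer = make_scorer(accuracy_score, normalize=True)
print(scorer._score_func)   # the wrapped metric function (accuracy_score)
print(scorer._kwargs)       # {'normalize': True} -- extra kwargs forwarded on each call
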
Code example #3
    def test_fit_and_score_return_dict(self):

        # Scoring
        accuracy_scorer = make_scorer(accuracy_score, normalize=True)

        # Test estimator
        dumb = DummyClassifier(strategy='constant', constant=1)

        # Test custom scorer
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)

        # Rename for easier parameters
        X = self.train_bags
        y = self.train_labels
        scoring = {'bag-scorer': bagAccScorer}
        estimator = dumb
        groups = None
        cv = 3
        n_jobs = 3
        verbose = 0
        pre_dispatch = 6
        fit_params = None
        return_estimator = True
        error_score = 'raise'
        return_train_score = True
        parameters = None

        # Test _fit_and_score method
        X, y, groups = indexable(X, y, groups)
        cv = check_cv(cv, y, classifier=is_classifier(estimator))
        scorers = _check_multimetric_scoring(estimator, scoring=scoring)

        # Use one cross-validation split
        generator = cv.split(X, y, groups)
        # Get training and test split of training data
        train, test = next(generator)
        # Generate scores using BagScorer
        scores = _fit_and_score(clone(estimator),
                                X,
                                y,
                                scorers,
                                train,
                                test,
                                verbose,
                                parameters,
                                fit_params,
                                return_train_score=return_train_score,
                                return_times=True,
                                return_estimator=return_estimator,
                                return_n_test_samples=False,
                                error_score=error_score)

        # Returned dictionary contains keys
        self.assertIn('train_scores', scores.keys())
        self.assertIn('test_scores', scores.keys())
        self.assertIn('fit_time', scores.keys())
        self.assertIn('score_time', scores.keys())
        self.assertIn('estimator', scores.keys())

        return None
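
Because `scoring` is a dict, the `train_scores` and `test_scores` entries of the returned dictionary are themselves dicts keyed by scorer name ('bag-scorer' here), so the bag metric is read out with a second lookup. A toy mock of that structure, independent of the fixture:

scores = {
    'train_scores': {'bag-scorer': 0.50},
    'test_scores': {'bag-scorer': 0.52},
    'fit_time': 0.0,
    'score_time': 0.0,
    'estimator': None,   # populated when return_estimator=True
}
bag_test_accuracy = scores['test_scores']['bag-scorer']
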
Code example #4
    def test_BagScorer_signature(self):

        # Test custom scorer
        accuracy_scorer = make_scorer(accuracy_score, normalize=True)
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)

        self.assertTrue(callable(bagAccScorer),
                        msg="BagScorer must be callable")

        return None
Code example #5
    def predict(
        self, data: Union[List[MutableMapping], MutableMapping, pd.DataFrame]
    ) -> np.ndarray:
        """Predict on an embedded bag
        inputs
        -------
        data: (list(RawInputData), RawInputData, pandas.DataFrame) Raw data 
        input which is transformed by this class
        outputs
        -------
        bag_prediction: (np.ndarray) results of aggregation with single-instance
        inference of a bags label from the instances within the bag"""
        # Transform raw data
        transformed_data = self._transform_data(data)
        # Predict on transformed data
        predictions = self.classifier.predict(
            self._determine_reshape(self.custom_transform(transformed_data)))
        # Add aggregation of prediction
        bag_prediction = BagScorer.reduce_bag_label(predictions, method='mode')

        return np.array([bag_prediction], dtype=np.unicode_)
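
`BagScorer.reduce_bag_label(..., method='mode')` is what collapses the per-instance predictions into a single bag label. The library call is used as-is above; the snippet below is only a minimal sketch of what a mode reduction amounts to, not the library's implementation:

from collections import Counter

instance_predictions = [1, 1, 0, 1]   # toy single-instance predictions for one bag
# the most common instance prediction becomes the bag's predicted label
bag_label = Counter(instance_predictions).most_common(1)[0][0]
print(bag_label)   # 1
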
Code example #6
train_index, test_index = next(rs.split(bags, labels))
train_bags, train_labels = bags[train_index], labels[train_index]
test_bags, test_labels = bags[test_index], labels[test_index]

# Create an estimator
dumb = DummyClassifier(strategy='constant', constant=1)
radiusNeighbor = RadiusNeighborsClassifier(
    weights='distance',
    algorithm='auto',
    p=1,  # Manhattan distance
)

# Create an evaluation metric
# Multiple evaluation metrics are allowed
accuracy_scorer = make_scorer(accuracy_score)
bagAccScorer = BagScorer(
    accuracy_scorer)  # Accuracy score, no factory function
precision_scorer = make_scorer(precision_score, average='binary')
bagPreScorer = BagScorer(precision_scorer)
jaccard_scorer = make_scorer(jaccard_score, average='binary')
bagJacScorer = BagScorer(jaccard_scorer)
scoring = {
    'bag_accuracy': bagAccScorer,
    'bag_precision': bagPreScorer,
    'bag_jaccard': bagJacScorer,
}

#%%

# Cross validate the dummy data and estimator
result_dumb = cross_validate_bag(
    estimator=dumb,
Code example #7
# Filter out bags with too few or too many instances
_filter = _filter_bags_by_size(train_bags_cat, 
                               min_instances=5,
                               max_instances=1000)

# Convert bags to dense for KNN estimator
_train_bags_dense = _densify_bags(train_bags[_filter])
_train_labels = train_labels[_filter]
# Keep bags sparse for Complement Naive Bayes and Multinomial Naive Bayes
_train_bags_cat = train_bags_cat[_filter]
_train_labels_cat = train_labels_cat[_filter]

# Define evaluation metrics
accuracy_scorer = make_scorer(accuracy_score)
bagAccScorer = BagScorer(accuracy_scorer, sparse_input=False)
precision_scorer = make_scorer(precision_score, average='weighted')
bagPreScorer = BagScorer(precision_scorer, sparse_input=False)
recall_scorer = make_scorer(recall_score, average='weighted')
bagRecScorer = BagScorer(recall_scorer, sparse_input=False)

scoring_dense = {'bag_accuracy':bagAccScorer,
                 'bag_precision':bagPreScorer,
                 'bag_recall':bagRecScorer,
                 }

# Cross validate bags
res_knn_cv = cross_validate_bag(
    estimator=knn, 
    X=_train_bags_dense, 
    y=_train_labels, 
Code example #8
    def test_fit_and_score(self):

        # Scoring
        accuracy_scorer = make_scorer(accuracy_score, normalize=True)

        # Test estimator
        dumb = DummyClassifier(strategy='constant', constant=1)

        # Test custom scorer
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)

        # _fit_and_score testing
        X = self.train_bags
        y = self.train_labels
        scoring = {
            'bag-accuracy-scorer': bagAccScorer,
        }
        estimator = dumb
        groups = None
        cv = 3
        n_jobs = 3
        verbose = 0
        pre_dispatch = 6
        fit_params = None
        return_estimator = False
        error_score = 'raise'
        return_train_score = False
        parameters = None

        # Test _fit_and_score method
        X, y, groups = indexable(X, y, groups)
        cv = check_cv(cv, y, classifier=is_classifier(estimator))
        scorers = _check_multimetric_scoring(estimator, scoring=scoring)

        # We clone the estimator to make sure that all the folds are
        # independent, and that it is pickle-able.
        parallel = Parallel(n_jobs=n_jobs,
                            verbose=verbose,
                            pre_dispatch=pre_dispatch)

        # Scores is a list of dictionaries
        """When scoring is a dictionary, the returned result looks like
        [{'test_scores': {'bag-accuracy-scorer': 0.5185185185185185},
          'fit_time': 0.0,
          'score_time': 0.0},
         {'test_scores': {'bag-accuracy-scorer': 0.5185185185185185},
          'fit_time': 0.0,
          'score_time': 0.0}, ... ]"""
        scores = parallel(
            delayed(_fit_and_score)(clone(estimator),
                                    X,
                                    y,
                                    scorers,
                                    train,
                                    test,
                                    verbose,
                                    parameters,
                                    fit_params,
                                    return_train_score=return_train_score,
                                    return_times=True,
                                    return_estimator=return_estimator,
                                    error_score=error_score)
            for train, test in cv.split(X, y, groups))

        for score in scores:
            bag_scoring_metric = score['test_scores']
            self.assertLessEqual(bag_scoring_metric['bag-accuracy-scorer'], 1)
            self.assertGreaterEqual(bag_scoring_metric['bag-accuracy-scorer'],
                                    0)

            fit_time = score['fit_time']
            self.assertIsInstance(fit_time, float)

            score_time = score['score_time']
            self.assertIsInstance(score_time, float)

        return None
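
Each element of `scores` has the nested shape shown in the docstring above, so summarizing the parallel results is just a matter of indexing into `test_scores` per fold. A toy aggregation, detached from the test fixture:

import numpy as np

fold_results = [
    {'test_scores': {'bag-accuracy-scorer': 0.52}, 'fit_time': 0.0, 'score_time': 0.0},
    {'test_scores': {'bag-accuracy-scorer': 0.48}, 'fit_time': 0.0, 'score_time': 0.0},
]
mean_bag_accuracy = np.mean(
    [fold['test_scores']['bag-accuracy-scorer'] for fold in fold_results])
print(mean_bag_accuracy)   # 0.5
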
Code example #9
    def test_cross_validate_bag(self):

        # Scoring
        accuracy_scorer = make_scorer(accuracy_score, normalize=True)

        # Dummy data
        train_bags, train_labels = self.train_bags, self.train_labels
        test_bags, test_labels = self.test_bags, self.test_labels

        # Define an estimator
        dumb = DummyClassifier(strategy='constant', constant=1)

        # Calculate metrics manually
        expected_accuracy = sum(train_labels) / len(train_labels)
        kf = KFold(n_splits=4)
        accuracies = []
        for train_index, test_index in kf.split(train_labels):
            _fold = train_labels[test_index]
            _acc = sum(_fold) / len(_fold)
            print(sum(_fold))
            accuracies.append(_acc)
        print('Global Accuracy : ', sum(train_labels) / len(train_labels))
        print('Averaged accuracies : ', np.mean(accuracies))

        # Custom scorer
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)
        scorer = {
            'bag-accuracy-scorer': bagAccScorer,
        }

        # Test cross_validate_bag
        # Res is a dictionary of lists {'fit_time':[1,2,3],
        # 'test_bag-accuracy-scorer':[0.1,0.2,0.3]}
        res = cross_validate_bag(dumb,
                                 train_bags,
                                 train_labels,
                                 cv=4,
                                 scoring=scorer,
                                 n_jobs=1,
                                 verbose=0,
                                 fit_params=None,
                                 pre_dispatch='2*n_jobs',
                                 return_train_score=False,
                                 return_estimator=False,
                                 error_score='raise')
        """The arithmetic mean of all accuracy predictions should equal the
        prediction accuracy of the training bags (At least if all splits are
        equal size -> Which is not true if the number of training instances
        is not divisible by the number of splits)
        This is only true because the dummy classifier always predicts 1
        If the splits are not equal size then they will be close to equal"""
        self.assertAlmostEqual(np.mean(res['test_bag-accuracy-scorer']),
                               expected_accuracy, 3)
        # Also check exact equality of the mean
        self.assertEqual(np.mean(res['test_bag-accuracy-scorer']),
                         expected_accuracy)
        # 4 cross-validation splits
        self.assertTrue(len(res['test_bag-accuracy-scorer']) == 4)
        # Assert result has dictionary values
        self.assertIn('fit_time', res.keys())
        self.assertIn('score_time', res.keys())

        return None
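
The docstring's reasoning can be checked in isolation: a constant-1 predictor scores each fold at that fold's positive fraction, and with equal-size folds the mean of those fractions equals the global positive fraction. A standalone check on toy labels:

import numpy as np
from sklearn.model_selection import KFold

labels = np.array([1, 0, 1, 1, 0, 1, 0, 0])   # 8 toy labels -> four equal folds of two
fold_accuracies = [labels[test_index].mean()  # accuracy of an all-ones prediction per fold
                   for _, test_index in KFold(n_splits=4).split(labels)]
assert np.isclose(np.mean(fold_accuracies), labels.mean())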