Example #1
    def test_evaluate(self):
        t = self.dtrain[self.target]
        c = self.model.predict(self.dtrain, "class")
        p = self.model.predict(self.dtrain, "probability_vector")
        ans_metrics = [
            "accuracy",
            "auc",
            "confusion_matrix",
            "f1_score",
            "log_loss",
            "precision",
            "recall",
            "roc_curve",
        ]

        self.sm_metrics = {
            "accuracy": evaluation.accuracy(t, c),
            "auc": evaluation.auc(t, p),
            "confusion_matrix": evaluation.confusion_matrix(t, c),
            "f1_score": evaluation.f1_score(t, c),
            "log_loss": evaluation.log_loss(t, p),
            "precision": evaluation.precision(t, c),
            "recall": evaluation.recall(t, c),
            "roc_curve": evaluation.roc_curve(t, p),
        }
        model = self.model

        def check_cf_matrix(ans):
            self.assertTrue(ans is not None)
            self.assertTrue("confusion_matrix" in ans)
            cf = ans["confusion_matrix"].sort(
                ["target_label", "predicted_label"])
            ans_cf = self.sm_metrics["confusion_matrix"].sort(
                ["target_label", "predicted_label"])
            self.assertEqual(list(cf["count"]), list(ans_cf["count"]))

        def check_roc_curve(ans):
            self.assertTrue(ans is not None)
            self.assertTrue("roc_curve" in ans)
            roc = ans["roc_curve"]
            self.assertEqual(type(roc), tc.SFrame)

        def check_metric(ans, metric):
            if metric == "confusion_matrix":
                check_cf_matrix(ans)
            elif metric == "roc_curve":
                check_roc_curve(ans)
            else:
                self.assertTrue(ans is not None)
                self.assertTrue(metric in ans)
                self.assertAlmostEqual(
                    ans[metric],
                    self.sm_metrics[metric],
                    places=4,
                    msg="%s = (%s,%s)" %
                    (metric, ans[metric], self.sm_metrics[metric]),
                )

        # Default
        ans = model.evaluate(self.dtrain)
        self.assertEqual(sorted(ans.keys()), sorted(ans_metrics))
        for m in ans_metrics:
            check_metric(ans, m)

        # Individual
        for m in ans_metrics:
            ans = model.evaluate(self.dtrain, metric=m)
            check_metric(ans, m)

        # Test evaluate with new class
        test_data = self.dtrain.copy().head()
        test_data[self.target] = test_data[self.target].apply(
            lambda x: str(x) + "-new")
        for m in ans_metrics:
            ans = model.evaluate(test_data, metric=m)
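
The assertions above reduce to comparing `model.evaluate` output against metrics computed directly with the evaluation functions. A minimal, self-contained sketch of that comparison on made-up labels (no trained model required; the values are illustrative only):

import turicreate as tc

# Hypothetical ground-truth labels and predicted class labels.
targets = tc.SArray([0, 0, 1, 1])
predicted = tc.SArray([0, 1, 1, 1])

# Scalar metrics compare directly.
print(tc.evaluation.accuracy(targets, predicted))        # 0.75

# The confusion matrix is an SFrame; sort both sides before comparing
# counts, exactly as check_cf_matrix does above.
cm = tc.evaluation.confusion_matrix(targets, predicted)
print(cm.sort(['target_label', 'predicted_label']))
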
Example #2
    def evaluate(self, dataset, metric='auto'):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the session_id, target and features used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tkutl._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        probs = self.predict(dataset, output_type='probability_vector')
        classes = self.predict(dataset, output_type='class')

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(dataset[self.target], classes)
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(dataset[self.target], probs)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(dataset[self.target], classes)
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(dataset[self.target], classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(dataset[self.target], classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = _evaluation.log_loss(dataset[self.target], probs)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(dataset[self.target], classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(dataset[self.target], probs)

        return ret
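
A short usage sketch for the method above, assuming a trained classifier `model` and an SFrame `test_data` that contains the same session_id, target, and feature columns used for training (both names are placeholders):

# Hypothetical: `model` is a trained classifier, `test_data` matches its schema.
results = model.evaluate(test_data)                 # metric='auto' -> all metrics
print(results['accuracy'], results['f1_score'])

# Requesting a single metric returns a dict with just that key.
cm = model.evaluate(test_data, metric='confusion_matrix')['confusion_matrix']
print(cm.sort(['target_label', 'predicted_label']))
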
Example #3
    def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation, must include a column with the same
            name as the features used for model training. Additional columns
            are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        verbose : bool, optional
            If True, prints progress updates and model details.

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError('\'dataset\' parameter must be an SFrame')

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tk_utils._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        if _is_deep_feature_sarray(dataset[self.feature]):
            deep_features = dataset[self.feature]
        else:
            deep_features = get_deep_features(dataset[self.feature], verbose=verbose)
        data = _tc.SFrame({'deep features': deep_features})
        data = data.add_row_number()
        missing_ids = data.filter_by([[]], 'deep features')['id']

        if len(missing_ids) > 0:
            data = data.filter_by([[]], 'deep features', exclude=True)
            # Remove the labels for entries without deep features
            _logging.warning("Dropping %d examples which are less than 975ms in length." % len(missing_ids))
            labels = dataset[[self.target]].add_row_number()
            labels = data.join(labels, how='left')[self.target]
        else:
            labels = dataset[self.target]
        assert(len(labels) == len(data))

        if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
            probs = self.predict(data['deep features'], output_type='probability_vector',
                                 verbose=verbose, batch_size=batch_size)
        if any([m in metrics for m in ('accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix')]):
            classes = self.predict(data['deep features'], output_type='class',
                                   verbose=verbose, batch_size=batch_size)

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = evaluation.accuracy(labels, classes)
        if 'auc' in metrics:
            ret['auc'] = evaluation.auc(labels, probs, index_map=self._class_label_to_id)
        if 'precision' in metrics:
            ret['precision'] = evaluation.precision(labels, classes)
        if 'recall' in metrics:
            ret['recall'] = evaluation.recall(labels, classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = evaluation.f1_score(labels, classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = evaluation.log_loss(labels, probs, index_map=self._class_label_to_id)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = evaluation.confusion_matrix(labels, classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = evaluation.roc_curve(labels, probs, index_map=self._class_label_to_id)

        return ret
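
The row-alignment step above (drop examples whose deep features came back empty, but keep the labels lined up with the surviving rows) uses only generic SFrame operations. A standalone sketch on made-up data, mirroring the `add_row_number`/`filter_by`/`join` pattern:

import turicreate as tc

# Toy stand-ins: the second "deep features" row is empty and should be dropped.
features = tc.SArray([[1.0, 2.0], [], [3.0, 4.0]])
targets = tc.SArray(['dog', 'cat', 'dog'])

data = tc.SFrame({'deep features': features}).add_row_number()   # adds an 'id' column
missing_ids = data.filter_by([[]], 'deep features')['id']        # ids of empty rows

if len(missing_ids) > 0:
    data = data.filter_by([[]], 'deep features', exclude=True)
    labels = tc.SFrame({'target': targets}).add_row_number()
    # A left join on 'id' keeps only the labels whose rows survived.
    labels = data.join(labels, how='left')['target']
else:
    labels = targets

assert len(labels) == len(data)
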
Example #4
    def evaluate(self, dataset, metric='auto', batch_size=256, verbose=True):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the feature and target columns used for model training.
            Additional columns are ignored.

        metric : str, optional
            Name of the evaluation metric. Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve
            performance.

        verbose : bool, optional
            If True, prints prediction progress.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        create, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        import os, json, math

        if self.target not in dataset.column_names():
            raise _ToolkitError("Must provide ground truth column, '"
                + self.target + "' in the evaluation dataset.")

        predicted = self._predict_with_probabilities(dataset, batch_size, verbose)

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'confusion_matrix', 'roc_curve', 'log_loss']

        _tkutl._check_categorical_option_type(
                        'metric', metric, avail_metrics + ['auto'])

        metrics = avail_metrics if metric == 'auto' else [metric]

        labels = self.classes

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(
                dataset[self.target], predicted[self.target])
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(
                dataset[self.target], predicted['probability'],
                index_map=self._class_to_index)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(
                dataset[self.target], predicted[self.target])
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(
                dataset[self.target], predicted[self.target])
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(
                dataset[self.target], predicted[self.target])
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], predicted[self.target])
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target], predicted['probability'],
                index_map=self._class_to_index)
        if 'log_loss' in metrics:
            ret['log_loss'] = _evaluation.log_loss(
                dataset[self.target], predicted['probability'],
                index_map=self._class_to_index)

        from .._evaluate_utils import  (
            entropy,
            confidence,
            relative_confidence,
            get_confusion_matrix,
            hclusterSort,
            l2Dist
        )
        evaluation_result = {k: ret[k] for k in metrics}
        evaluation_result['num_test_examples'] = len(dataset)
        for k in ['num_classes', 'num_examples', 'training_loss', 'training_time', 'max_iterations']:
            evaluation_result[k] = getattr(self, k)

        #evaluation_result['input_image_shape'] = getattr(self, 'input_image_shape')

        evaluation_result["model_name"] = "Drawing Classifier"
        extended_test = dataset.add_column(predicted["probability"], 'probs')
        extended_test['label'] = dataset[self.target]

        extended_test = extended_test.add_columns(
            [extended_test.apply(lambda d: labels[d['probs'].index(confidence(d['probs']))]),
             extended_test.apply(lambda d: entropy(d['probs'])),
             extended_test.apply(lambda d: confidence(d['probs'])),
             extended_test.apply(lambda d: relative_confidence(d['probs']))],
            ['predicted_label', 'entropy', 'confidence', 'relative_confidence'])

        extended_test = extended_test.add_column(extended_test.apply(lambda d: d['label'] == d['predicted_label']), 'correct')

        sf_conf_mat = get_confusion_matrix(extended_test, labels)
        confidence_threshold = 0.5
        hesitant_threshold = 0.2
        evaluation_result['confidence_threshold'] = confidence_threshold
        evaluation_result['hesitant_threshold'] = hesitant_threshold
        evaluation_result['confidence_metric_for_threshold'] = 'relative_confidence'

        evaluation_result['conf_mat'] = list(sf_conf_mat)

        vectors = map(lambda l: {'name': l, 'pos':list(sf_conf_mat[sf_conf_mat['target_label']==l].sort('predicted_label')['norm_prob'])},
                    labels)
        evaluation_result['sorted_labels'] = hclusterSort(vectors, l2Dist)[0]['name'].split("|")

        per_l = extended_test.groupby(['label'], {'count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') })
        per_l['recall'] = per_l.apply(lambda l: l['correct_count']*1.0 / l['count'])

        per_pl = extended_test.groupby(['predicted_label'], {'predicted_count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') })
        per_pl['precision'] = per_pl.apply(lambda l: l['correct_count']*1.0 / l['predicted_count'])
        per_pl = per_pl.rename({'predicted_label': 'label'})
        evaluation_result['label_metrics'] = list(per_l.join(per_pl, on='label', how='outer').select_columns(['label', 'count', 'correct_count', 'predicted_count', 'recall', 'precision']))
        evaluation_result['labels'] = labels

        extended_test = extended_test.add_row_number('__idx').rename({'label': 'target_label'})

        evaluation_result['test_data'] = extended_test
        evaluation_result['feature'] = self.feature

        return _Evaluation(evaluation_result)
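
The per-label recall/precision block near the end is a plain groupby over the prediction results; a minimal sketch on toy data (the column names mirror the code above, the rows are made up):

import turicreate as tc

# Hypothetical prediction results: true label and predicted label per example.
extended_test = tc.SFrame({
    'label':           ['cat', 'cat', 'dog', 'dog'],
    'predicted_label': ['cat', 'dog', 'dog', 'dog'],
})
extended_test['correct'] = extended_test.apply(
    lambda d: d['label'] == d['predicted_label'])

# Recall per true label: correct / total examples with that label.
per_l = extended_test.groupby(['label'],
    {'count': tc.aggregate.COUNT, 'correct_count': tc.aggregate.SUM('correct')})
per_l['recall'] = per_l.apply(lambda r: r['correct_count'] * 1.0 / r['count'])

# Precision per predicted label: correct / total predicted as that label.
per_pl = extended_test.groupby(['predicted_label'],
    {'predicted_count': tc.aggregate.COUNT, 'correct_count': tc.aggregate.SUM('correct')})
per_pl['precision'] = per_pl.apply(lambda r: r['correct_count'] * 1.0 / r['predicted_count'])

print(per_l.join(per_pl.rename({'predicted_label': 'label'}), on='label', how='outer'))
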
Example #5
    def evaluate(self, dataset, metric='auto', verbose=True):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.
        
        Parameters
        ----------
        dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the feature and target columns used for model training.
        Additional columns are ignored.
        
        metric : str optional
        Name of the evaluation metric. Possible values are:
        
        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'confusion_matrix' : An SFrame with counts of possible 
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve
        
        verbose : bool optional
        If True, prints prediction progress.

        Returns
        -------
        out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.
        
        See Also
        ----------
        create, predict
        
        Examples
        ----------
        .. sourcecode:: python
        
        >>> results = model.evaluate(data)
        >>> print(results['accuracy'])
        """

        if self.target not in dataset.column_names():
            raise _ToolkitError("Dataset provided to evaluate does not have " 
                + "ground truth in the " + self.target + " column.")

        predicted = self._predict_with_probabilities(dataset, verbose)

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'confusion_matrix', 'roc_curve']

        _tkutl._check_categorical_option_type(
                        'metric', metric, avail_metrics + ['auto'])

        metrics = avail_metrics if metric == 'auto' else [metric]
        
        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = _evaluation.accuracy(
                dataset[self.target], predicted[self.target])
        if 'auc' in metrics:
            ret['auc'] = _evaluation.auc(
                dataset[self.target], predicted['probability'], 
                index_map=self._class_to_index)
        if 'precision' in metrics:
            ret['precision'] = _evaluation.precision(
                dataset[self.target], predicted[self.target])
        if 'recall' in metrics:
            ret['recall'] = _evaluation.recall(
                dataset[self.target], predicted[self.target])
        if 'f1_score' in metrics:
            ret['f1_score'] = _evaluation.f1_score(
                dataset[self.target], predicted[self.target])
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = _evaluation.confusion_matrix(
                dataset[self.target], predicted[self.target])
        if 'roc_curve' in metrics:
            ret['roc_curve'] = _evaluation.roc_curve(
                dataset[self.target], predicted['probability'], 
                index_map=self._class_to_index)
        
        return ret
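
The `index_map` passed to the probability-based metrics above simply maps each class label to its position in the probability vector. A toy illustration with the public `turicreate.evaluation` functions (the labels and probabilities here are made up):

import turicreate as tc

targets = tc.SArray(['square', 'circle', 'square'])
# Probability vectors ordered as [circle, square].
probs = tc.SArray([[0.2, 0.8], [0.7, 0.3], [0.1, 0.9]])
index_map = {'circle': 0, 'square': 1}

print(tc.evaluation.auc(targets, probs, index_map=index_map))
roc = tc.evaluation.roc_curve(targets, probs, index_map=index_map)
print(roc.column_names())
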
Example #6
    def evaluate(self, dataset, metric='auto', batch_size=64):
        """
        Evaluate the model by making predictions of target values and comparing
        these to actual values.

        Parameters
        ----------
        dataset : SFrame
            Dataset to use for evaluation, must include a column with the same
            name as the features used for model training. Additional columns
            are ignored.

        metric : str, optional
            Name of the evaluation metric.  Possible values are:

            - 'auto'             : Returns all available metrics.
            - 'accuracy'         : Classification accuracy (micro average).
            - 'auc'              : Area under the ROC curve (macro average)
            - 'precision'        : Precision score (macro average)
            - 'recall'           : Recall score (macro average)
            - 'f1_score'         : F1 score (macro average)
            - 'log_loss'         : Log loss
            - 'confusion_matrix' : An SFrame with counts of possible
                                   prediction/true label combinations.
            - 'roc_curve'        : An SFrame containing information needed for an
                                   ROC curve

        batch_size : int, optional
            If you are getting memory errors, try decreasing this value. If you
            have a powerful computer, increasing this value may improve performance.

        Returns
        -------
        out : dict
            Dictionary of evaluation results where the key is the name of the
            evaluation metric (e.g. `accuracy`) and the value is the evaluation
            score.

        See Also
        ----------
        classify, predict

        Examples
        ----------
        .. sourcecode:: python

          >>> results = model.evaluate(data)
          >>> print(results['accuracy'])
        """
        from turicreate.toolkits import evaluation

        # parameter checking
        if not isinstance(dataset, _tc.SFrame):
            raise TypeError('\'dataset\' parameter must be an SFrame')
        if batch_size < 1:
            raise ValueError('\'batch_size\' must be greater than or equal to 1')

        avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
        _tk_utils._check_categorical_option_type(
            'metric', metric, avail_metrics + ['auto'])

        if metric == 'auto':
            metrics = avail_metrics
        else:
            metrics = [metric]

        if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
            probs = self.predict(dataset, output_type='probability_vector', batch_size=batch_size)
        if any([m in metrics for m in ('accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix')]):
            classes = self.predict(dataset, output_type='class', batch_size=batch_size)

        ret = {}
        if 'accuracy' in metrics:
            ret['accuracy'] = evaluation.accuracy(dataset[self.target], classes)
        if 'auc' in metrics:
            ret['auc'] = evaluation.auc(dataset[self.target], probs, index_map=self._class_label_to_id)
        if 'precision' in metrics:
            ret['precision'] = evaluation.precision(dataset[self.target], classes)
        if 'recall' in metrics:
            ret['recall'] = evaluation.recall(dataset[self.target], classes)
        if 'f1_score' in metrics:
            ret['f1_score'] = evaluation.f1_score(dataset[self.target], classes)
        if 'log_loss' in metrics:
            ret['log_loss'] = evaluation.log_loss(dataset[self.target], probs, index_map=self._class_label_to_id)
        if 'confusion_matrix' in metrics:
            ret['confusion_matrix'] = evaluation.confusion_matrix(dataset[self.target], classes)
        if 'roc_curve' in metrics:
            ret['roc_curve'] = evaluation.roc_curve(dataset[self.target], probs, index_map=self._class_label_to_id)

        return ret
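
Because the method above only runs the prediction pass that the requested metrics actually need, asking for a single metric skips the other pass entirely. A usage sketch, assuming a trained model `model` and a matching SFrame `test_data` (both hypothetical):

# Only the class-prediction pass runs for accuracy:
print(model.evaluate(test_data, metric='accuracy')['accuracy'])

# Only the probability-vector pass runs for log loss:
print(model.evaluate(test_data, metric='log_loss', batch_size=32)['log_loss'])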