def test_evaluate(self):
    t = self.dtrain[self.target]
    c = self.model.predict(self.dtrain, "class")
    p = self.model.predict(self.dtrain, "probability_vector")
    ans_metrics = [
        "accuracy",
        "auc",
        "confusion_matrix",
        "f1_score",
        "log_loss",
        "precision",
        "recall",
        "roc_curve",
    ]

    self.sm_metrics = {
        "accuracy": evaluation.accuracy(t, c),
        "auc": evaluation.auc(t, p),
        "confusion_matrix": evaluation.confusion_matrix(t, c),
        "f1_score": evaluation.f1_score(t, c),
        "log_loss": evaluation.log_loss(t, p),
        "precision": evaluation.precision(t, c),
        "recall": evaluation.recall(t, c),
        "roc_curve": evaluation.roc_curve(t, p),
    }
    model = self.model

    def check_cf_matrix(ans):
        self.assertTrue(ans is not None)
        self.assertTrue("confusion_matrix" in ans)
        cf = ans["confusion_matrix"].sort(
            ["target_label", "predicted_label"])
        ans_cf = self.sm_metrics["confusion_matrix"].sort(
            ["target_label", "predicted_label"])
        self.assertEqual(list(cf["count"]), list(ans_cf["count"]))

    def check_roc_curve(ans):
        self.assertTrue(ans is not None)
        self.assertTrue("roc_curve" in ans)
        roc = ans["roc_curve"]
        self.assertEqual(type(roc), tc.SFrame)

    def check_metric(ans, metric):
        if metric == "confusion_matrix":
            check_cf_matrix(ans)
        elif metric == "roc_curve":
            check_roc_curve(ans)
        else:
            self.assertTrue(ans is not None)
            self.assertTrue(metric in ans)
            self.assertAlmostEqual(
                ans[metric],
                self.sm_metrics[metric],
                places=4,
                msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]),
            )

    # Default
    ans = model.evaluate(self.dtrain)
    self.assertEqual(sorted(ans.keys()), sorted(ans_metrics))
    for m in ans_metrics:
        check_metric(ans, m)

    # Individual
    for m in ans_metrics:
        ans = model.evaluate(self.dtrain, metric=m)
        check_metric(ans, m)

    # Test evaluate with new class
    test_data = self.dtrain.copy().head()
    test_data[self.target] = test_data[self.target].apply(
        lambda x: str(x) + "-new")

    for m in ans_metrics:
        ans = model.evaluate(test_data, metric=m)
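
# A minimal, self-contained sketch of the same cross-check pattern used in
# test_evaluate above: compare model.evaluate() against metrics computed
# directly with turicreate.toolkits.evaluation. The toy SFrame, column names,
# and the choice of logistic_classifier are illustrative assumptions, not
# taken from the original test suite.
def _example_cross_check_sketch():
    import turicreate as tc
    from turicreate.toolkits import evaluation

    sf = tc.SFrame({'x1': [0.1, 0.9, 0.2, 0.8] * 25,
                    'x2': [1.0, 0.0, 1.0, 0.0] * 25,
                    'label': ['a', 'b', 'a', 'b'] * 25})
    model = tc.logistic_classifier.create(sf, target='label',
                                          validation_set=None)

    # Compute the metric directly and via evaluate(); the two should agree.
    classes = model.predict(sf, output_type='class')
    direct = evaluation.accuracy(sf['label'], classes)
    reported = model.evaluate(sf, metric='accuracy')['accuracy']
    assert abs(direct - reported) < 1e-4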
def evaluate(self, dataset, metric='auto'):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the session_id, target and features used for model
        training. Additional columns are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'log_loss'         : Log loss
        - 'confusion_matrix' : An SFrame with counts of possible
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve

    Returns
    -------
    out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.

    See Also
    --------
    create, predict

    Examples
    --------
    .. sourcecode:: python

        >>> results = model.evaluate(data)
        >>> print(results['accuracy'])
    """
    avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                     'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
    _tkutl._check_categorical_option_type(
        'metric', metric, avail_metrics + ['auto'])

    if metric == 'auto':
        metrics = avail_metrics
    else:
        metrics = [metric]

    probs = self.predict(dataset, output_type='probability_vector')
    classes = self.predict(dataset, output_type='class')

    ret = {}
    if 'accuracy' in metrics:
        ret['accuracy'] = _evaluation.accuracy(dataset[self.target], classes)
    if 'auc' in metrics:
        ret['auc'] = _evaluation.auc(dataset[self.target], probs)
    if 'precision' in metrics:
        ret['precision'] = _evaluation.precision(dataset[self.target], classes)
    if 'recall' in metrics:
        ret['recall'] = _evaluation.recall(dataset[self.target], classes)
    if 'f1_score' in metrics:
        ret['f1_score'] = _evaluation.f1_score(dataset[self.target], classes)
    if 'log_loss' in metrics:
        ret['log_loss'] = _evaluation.log_loss(dataset[self.target], probs)
    if 'confusion_matrix' in metrics:
        ret['confusion_matrix'] = _evaluation.confusion_matrix(
            dataset[self.target], classes)
    if 'roc_curve' in metrics:
        ret['roc_curve'] = _evaluation.roc_curve(dataset[self.target], probs)

    return ret
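
# A standalone sketch of the metric handling inside evaluate() above: validate
# the requested name against the available metrics and expand 'auto' to all of
# them. The helper name and constant below are illustrative, not toolkit API.
_SKETCH_AVAIL_METRICS = ['accuracy', 'auc', 'precision', 'recall',
                         'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']

def _resolve_metrics_sketch(metric):
    """Return the list of metrics to compute for a requested `metric` value."""
    if metric not in _SKETCH_AVAIL_METRICS + ['auto']:
        raise ValueError('Unsupported metric: %r' % metric)
    return _SKETCH_AVAIL_METRICS if metric == 'auto' else [metric]

# _resolve_metrics_sketch('auto')      -> every available metric
# _resolve_metrics_sketch('f1_score')  -> ['f1_score']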
def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset to use for evaluation, must include a column with the same
        name as the features used for model training. Additional columns
        are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'log_loss'         : Log loss
        - 'confusion_matrix' : An SFrame with counts of possible
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve

    verbose : bool, optional
        If True, prints progress updates and model details.

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    Returns
    -------
    out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.

    See Also
    --------
    classify, predict

    Examples
    --------
    .. sourcecode:: python

        >>> results = model.evaluate(data)
        >>> print(results['accuracy'])
    """
    from turicreate.toolkits import evaluation

    # parameter checking
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError("'dataset' parameter must be an SFrame")

    avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                     'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
    _tk_utils._check_categorical_option_type(
        'metric', metric, avail_metrics + ['auto'])

    if metric == 'auto':
        metrics = avail_metrics
    else:
        metrics = [metric]

    if _is_deep_feature_sarray(dataset[self.feature]):
        deep_features = dataset[self.feature]
    else:
        deep_features = get_deep_features(dataset[self.feature],
                                          verbose=verbose)
    data = _tc.SFrame({'deep features': deep_features})
    data = data.add_row_number()

    missing_ids = data.filter_by([[]], 'deep features')['id']
    if len(missing_ids) > 0:
        data = data.filter_by([[]], 'deep features', exclude=True)
        # Remove the labels for entries without deep features
        _logging.warning("Dropping %d examples which are less than 975ms in length."
                         % len(missing_ids))
        labels = dataset[[self.target]].add_row_number()
        labels = data.join(labels, how='left')[self.target]
    else:
        labels = dataset[self.target]
    assert len(labels) == len(data)

    if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
        probs = self.predict(data['deep features'],
                             output_type='probability_vector',
                             verbose=verbose, batch_size=batch_size)
    if any([m in metrics for m in ('accuracy', 'precision', 'recall',
                                   'f1_score', 'confusion_matrix')]):
        classes = self.predict(data['deep features'], output_type='class',
                               verbose=verbose, batch_size=batch_size)

    ret = {}
    if 'accuracy' in metrics:
        ret['accuracy'] = evaluation.accuracy(labels, classes)
    if 'auc' in metrics:
        ret['auc'] = evaluation.auc(labels, probs,
                                    index_map=self._class_label_to_id)
    if 'precision' in metrics:
        ret['precision'] = evaluation.precision(labels, classes)
    if 'recall' in metrics:
        ret['recall'] = evaluation.recall(labels, classes)
    if 'f1_score' in metrics:
        ret['f1_score'] = evaluation.f1_score(labels, classes)
    if 'log_loss' in metrics:
        ret['log_loss'] = evaluation.log_loss(labels, probs,
                                              index_map=self._class_label_to_id)
    if 'confusion_matrix' in metrics:
        ret['confusion_matrix'] = evaluation.confusion_matrix(labels, classes)
    if 'roc_curve' in metrics:
        ret['roc_curve'] = evaluation.roc_curve(labels, probs,
                                                index_map=self._class_label_to_id)
    return ret
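
# A standalone sketch of what the index_map argument above conveys to the
# probability-based metrics (auc, log_loss, roc_curve): it maps each class
# label to its position in the probability vector so scores line up with the
# right class. The labels and probabilities below are made-up toy values.
def _index_map_sketch():
    import turicreate as tc
    from turicreate.toolkits import evaluation

    targets = tc.SArray(['cat', 'dog', 'dog', 'cat'])
    probs = tc.SArray([[0.8, 0.2], [0.3, 0.7], [0.1, 0.9], [0.6, 0.4]])
    index_map = {'cat': 0, 'dog': 1}   # class label -> probability-vector index

    print(evaluation.log_loss(targets, probs, index_map=index_map))
    print(evaluation.auc(targets, probs, index_map=index_map))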
def evaluate(self, dataset, metric='auto', batch_size=256, verbose=True):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the feature and target columns used for model training.
        Additional columns are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'log_loss'         : Log loss
        - 'confusion_matrix' : An SFrame with counts of possible
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    verbose : bool, optional
        If True, prints prediction progress.

    Returns
    -------
    out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.

    See Also
    --------
    create, predict

    Examples
    --------
    .. sourcecode:: python

        >>> results = model.evaluate(data)
        >>> print(results['accuracy'])
    """
    import os, json, math

    if self.target not in dataset.column_names():
        raise _ToolkitError("Must provide ground truth column, '"
                            + self.target + "' in the evaluation dataset.")

    predicted = self._predict_with_probabilities(dataset, batch_size, verbose)

    avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                     'f1_score', 'confusion_matrix', 'roc_curve', 'log_loss']
    _tkutl._check_categorical_option_type(
        'metric', metric, avail_metrics + ['auto'])

    metrics = avail_metrics if metric == 'auto' else [metric]

    labels = self.classes

    ret = {}
    if 'accuracy' in metrics:
        ret['accuracy'] = _evaluation.accuracy(
            dataset[self.target], predicted[self.target])
    if 'auc' in metrics:
        ret['auc'] = _evaluation.auc(
            dataset[self.target], predicted['probability'],
            index_map=self._class_to_index)
    if 'precision' in metrics:
        ret['precision'] = _evaluation.precision(
            dataset[self.target], predicted[self.target])
    if 'recall' in metrics:
        ret['recall'] = _evaluation.recall(
            dataset[self.target], predicted[self.target])
    if 'f1_score' in metrics:
        ret['f1_score'] = _evaluation.f1_score(
            dataset[self.target], predicted[self.target])
    if 'confusion_matrix' in metrics:
        ret['confusion_matrix'] = _evaluation.confusion_matrix(
            dataset[self.target], predicted[self.target])
    if 'roc_curve' in metrics:
        ret['roc_curve'] = _evaluation.roc_curve(
            dataset[self.target], predicted['probability'],
            index_map=self._class_to_index)
    if 'log_loss' in metrics:
        ret['log_loss'] = _evaluation.log_loss(
            dataset[self.target], predicted['probability'],
            index_map=self._class_to_index)

    from .._evaluate_utils import (
        entropy,
        confidence,
        relative_confidence,
        get_confusion_matrix,
        hclusterSort,
        l2Dist,
    )
    evaluation_result = {k: ret[k] for k in metrics}
    evaluation_result['num_test_examples'] = len(dataset)
    for k in ['num_classes', 'num_examples', 'training_loss',
              'training_time', 'max_iterations']:
        evaluation_result[k] = getattr(self, k)

    #evaluation_result['input_image_shape'] = getattr(self, 'input_image_shape')

    evaluation_result["model_name"] = "Drawing Classifier"
    extended_test = dataset.add_column(predicted["probability"], 'probs')
    extended_test['label'] = dataset[self.target]

    extended_test = extended_test.add_columns(
        [extended_test.apply(
            lambda d: labels[d['probs'].index(confidence(d['probs']))]),
         extended_test.apply(lambda d: entropy(d['probs'])),
         extended_test.apply(lambda d: confidence(d['probs'])),
         extended_test.apply(lambda d: relative_confidence(d['probs']))],
        ['predicted_label', 'entropy', 'confidence', 'relative_confidence'])

    extended_test = extended_test.add_column(
        extended_test.apply(lambda d: d['label'] == d['predicted_label']),
        'correct')

    sf_conf_mat = get_confusion_matrix(extended_test, labels)
    confidence_threshold = 0.5
    hesitant_threshold = 0.2
    evaluation_result['confidence_threshold'] = confidence_threshold
    evaluation_result['hesitant_threshold'] = hesitant_threshold
    evaluation_result['confidence_metric_for_threshold'] = 'relative_confidence'

    evaluation_result['conf_mat'] = list(sf_conf_mat)

    vectors = map(
        lambda l: {
            'name': l,
            'pos': list(sf_conf_mat[sf_conf_mat['target_label'] == l]
                        .sort('predicted_label')['norm_prob'])
        },
        labels)
    evaluation_result['sorted_labels'] = hclusterSort(
        vectors, l2Dist)[0]['name'].split("|")

    per_l = extended_test.groupby(
        ['label'],
        {'count': _tc.aggregate.COUNT,
         'correct_count': _tc.aggregate.SUM('correct')})
    per_l['recall'] = per_l.apply(
        lambda l: l['correct_count'] * 1.0 / l['count'])

    per_pl = extended_test.groupby(
        ['predicted_label'],
        {'predicted_count': _tc.aggregate.COUNT,
         'correct_count': _tc.aggregate.SUM('correct')})
    per_pl['precision'] = per_pl.apply(
        lambda l: l['correct_count'] * 1.0 / l['predicted_count'])
    per_pl = per_pl.rename({'predicted_label': 'label'})

    evaluation_result['label_metrics'] = list(
        per_l.join(per_pl, on='label', how='outer').select_columns(
            ['label', 'count', 'correct_count', 'predicted_count',
             'recall', 'precision']))
    evaluation_result['labels'] = labels

    extended_test = extended_test.add_row_number('__idx').rename(
        {'label': 'target_label'})

    evaluation_result['test_data'] = extended_test
    evaluation_result['feature'] = self.feature

    return _Evaluation(evaluation_result)
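
# A standalone sketch of the per-label metric computation performed above:
# group rows of (label, predicted_label, correct) to get per-class recall and
# precision. The toy SFrame is illustrative; the aggregations mirror the
# groupby calls in the method.
def _per_label_metrics_sketch():
    import turicreate as _tc

    rows = _tc.SFrame({'label':           ['a', 'a', 'b', 'b', 'b'],
                       'predicted_label': ['a', 'b', 'b', 'b', 'a']})
    rows['correct'] = rows['label'] == rows['predicted_label']

    per_label = rows.groupby(
        ['label'],
        {'count': _tc.aggregate.COUNT,
         'correct_count': _tc.aggregate.SUM('correct')})
    per_label['recall'] = per_label.apply(
        lambda r: r['correct_count'] * 1.0 / r['count'])

    per_pred = rows.groupby(
        ['predicted_label'],
        {'predicted_count': _tc.aggregate.COUNT,
         'correct_count': _tc.aggregate.SUM('correct')})
    per_pred['precision'] = per_pred.apply(
        lambda r: r['correct_count'] * 1.0 / r['predicted_count'])

    return per_label, per_pred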
def evaluate(self, dataset, metric='auto', verbose=True):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset of new observations. Must include columns with the same
        names as the feature and target columns used for model training.
        Additional columns are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'confusion_matrix' : An SFrame with counts of possible
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve

    verbose : bool, optional
        If True, prints prediction progress.

    Returns
    -------
    out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.

    See Also
    --------
    create, predict

    Examples
    --------
    .. sourcecode:: python

        >>> results = model.evaluate(data)
        >>> print(results['accuracy'])
    """
    if self.target not in dataset.column_names():
        raise _ToolkitError("Dataset provided to evaluate does not have "
                            + "ground truth in the " + self.target + " column.")

    predicted = self._predict_with_probabilities(dataset, verbose)

    avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                     'f1_score', 'confusion_matrix', 'roc_curve']
    _tkutl._check_categorical_option_type(
        'metric', metric, avail_metrics + ['auto'])

    metrics = avail_metrics if metric == 'auto' else [metric]

    ret = {}
    if 'accuracy' in metrics:
        ret['accuracy'] = _evaluation.accuracy(
            dataset[self.target], predicted[self.target])
    if 'auc' in metrics:
        ret['auc'] = _evaluation.auc(
            dataset[self.target], predicted['probability'],
            index_map=self._class_to_index)
    if 'precision' in metrics:
        ret['precision'] = _evaluation.precision(
            dataset[self.target], predicted[self.target])
    if 'recall' in metrics:
        ret['recall'] = _evaluation.recall(
            dataset[self.target], predicted[self.target])
    if 'f1_score' in metrics:
        ret['f1_score'] = _evaluation.f1_score(
            dataset[self.target], predicted[self.target])
    if 'confusion_matrix' in metrics:
        ret['confusion_matrix'] = _evaluation.confusion_matrix(
            dataset[self.target], predicted[self.target])
    if 'roc_curve' in metrics:
        ret['roc_curve'] = _evaluation.roc_curve(
            dataset[self.target], predicted['probability'],
            index_map=self._class_to_index)

    return ret
def evaluate(self, dataset, metric='auto', batch_size=64):
    """
    Evaluate the model by making predictions of target values and comparing
    these to actual values.

    Parameters
    ----------
    dataset : SFrame
        Dataset to use for evaluation, must include a column with the same
        name as the features used for model training. Additional columns
        are ignored.

    metric : str, optional
        Name of the evaluation metric. Possible values are:

        - 'auto'             : Returns all available metrics.
        - 'accuracy'         : Classification accuracy (micro average).
        - 'auc'              : Area under the ROC curve (macro average)
        - 'precision'        : Precision score (macro average)
        - 'recall'           : Recall score (macro average)
        - 'f1_score'         : F1 score (macro average)
        - 'log_loss'         : Log loss
        - 'confusion_matrix' : An SFrame with counts of possible
                               prediction/true label combinations.
        - 'roc_curve'        : An SFrame containing information needed for an
                               ROC curve

    batch_size : int, optional
        If you are getting memory errors, try decreasing this value. If you
        have a powerful computer, increasing this value may improve
        performance.

    Returns
    -------
    out : dict
        Dictionary of evaluation results where the key is the name of the
        evaluation metric (e.g. `accuracy`) and the value is the evaluation
        score.

    See Also
    --------
    classify, predict

    Examples
    --------
    .. sourcecode:: python

        >>> results = model.evaluate(data)
        >>> print(results['accuracy'])
    """
    from turicreate.toolkits import evaluation

    # parameter checking
    if not isinstance(dataset, _tc.SFrame):
        raise TypeError("'dataset' parameter must be an SFrame")

    if batch_size < 1:
        raise ValueError("'batch_size' must be greater than or equal to 1")

    avail_metrics = ['accuracy', 'auc', 'precision', 'recall',
                     'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve']
    _tk_utils._check_categorical_option_type(
        'metric', metric, avail_metrics + ['auto'])

    if metric == 'auto':
        metrics = avail_metrics
    else:
        metrics = [metric]

    if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]):
        probs = self.predict(dataset, output_type='probability_vector',
                             batch_size=batch_size)
    if any([m in metrics for m in ('accuracy', 'precision', 'recall',
                                   'f1_score', 'confusion_matrix')]):
        classes = self.predict(dataset, output_type='class',
                               batch_size=batch_size)

    ret = {}
    if 'accuracy' in metrics:
        ret['accuracy'] = evaluation.accuracy(dataset[self.target], classes)
    if 'auc' in metrics:
        ret['auc'] = evaluation.auc(dataset[self.target], probs,
                                    index_map=self._class_label_to_id)
    if 'precision' in metrics:
        ret['precision'] = evaluation.precision(dataset[self.target], classes)
    if 'recall' in metrics:
        ret['recall'] = evaluation.recall(dataset[self.target], classes)
    if 'f1_score' in metrics:
        ret['f1_score'] = evaluation.f1_score(dataset[self.target], classes)
    if 'log_loss' in metrics:
        ret['log_loss'] = evaluation.log_loss(dataset[self.target], probs,
                                              index_map=self._class_label_to_id)
    if 'confusion_matrix' in metrics:
        ret['confusion_matrix'] = evaluation.confusion_matrix(
            dataset[self.target], classes)
    if 'roc_curve' in metrics:
        ret['roc_curve'] = evaluation.roc_curve(dataset[self.target], probs,
                                                index_map=self._class_label_to_id)
    return ret
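
# A standalone sketch of the "compute only what is needed" pattern above:
# probability vectors are requested only when a probability-based metric is
# asked for, and class predictions only for the label-based metrics. The
# helper below is illustrative, not part of the toolkit API.
_SKETCH_PROB_METRICS = ('roc_curve', 'log_loss', 'auc')
_SKETCH_CLASS_METRICS = ('accuracy', 'precision', 'recall',
                         'f1_score', 'confusion_matrix')

def _needed_outputs_sketch(metrics):
    """Return which predict() output types a list of metrics requires."""
    outputs = []
    if any(m in metrics for m in _SKETCH_PROB_METRICS):
        outputs.append('probability_vector')
    if any(m in metrics for m in _SKETCH_CLASS_METRICS):
        outputs.append('class')
    return outputs

# _needed_outputs_sketch(['log_loss'])        -> ['probability_vector']
# _needed_outputs_sketch(['accuracy', 'auc']) -> ['probability_vector', 'class']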