def test_regression_prediction_type(self):
    y_t = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_p = [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    # Using the `regression` prediction type should not crash
    _create_group_metric_set(
        y_t, predictions, sensitive_feature, 'regression')
def test_specific_metrics(self):
    y_t = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_p = [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    exp_acc = accuracy_score_group_summary(y_t, y_p, sensitive_features=s_f)
    exp_roc = roc_auc_score_group_summary(y_t, y_p, sensitive_features=s_f)

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    actual = _create_group_metric_set(
        y_t, predictions, sensitive_feature, 'binary_classification')

    # Do some sanity checks
    validate_dashboard_dictionary(actual)
    assert actual['trueY'] == y_t
    assert actual['predictedY'][0] == y_p
    assert actual['precomputedFeatureBins'][0]['binVector'] == s_f
    assert len(actual['precomputedMetrics'][0][0]) == 11

    # Cross check the two metrics we computed
    # Comparisons simplified because s_f was already {0,1}
    actual_acc = actual['precomputedMetrics'][0][0]['accuracy_score']
    assert actual_acc['global'] == exp_acc.overall
    assert actual_acc['bins'] == list(exp_acc.by_group.values())

    actual_roc = actual['precomputedMetrics'][0][0]['balanced_accuracy_score']
    assert actual_roc['global'] == exp_roc.overall
    assert actual_roc['bins'] == list(exp_roc.by_group.values())
def __get_dashboard_dict(self, A_test, Y_test, dominant_all_ids):
    sf = {'diabetic': A_test.diabetic,
          'asthmatic': A_test.asthmatic,
          'smoker': A_test.smoker}

    return _create_group_metric_set(y_true=Y_test,
                                    predictions=dominant_all_ids,
                                    sensitive_features=sf,
                                    prediction_type='binary_classification')
def test_roc_auc_single_class(self, recwarn):
    # Note that y_t and s_f are identical, so subgroup evaluation will fail
    # for roc_auc_score
    y_p = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_t = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    actual = _create_group_metric_set(
        y_t, predictions, sensitive_feature, "binary_classification")

    # Check that the error case was intercepted for roc_auc_score
    validate_dashboard_dictionary(actual)
    actual_roc = actual["precomputedMetrics"][0][0]["balanced_accuracy_score"]
    expected_all_roc = skm.roc_auc_score(y_t, y_p)
    assert actual_roc["global"] == expected_all_roc
    assert actual_roc["bins"] == [0, 0]  # We substituted zero

    # Check that the right warnings were issued
    assert len(recwarn) == 3
    msgs = sorted([str(x.message) for x in recwarn])
    # We get the message from roc_auc_score once for each subgroup
    assert msgs[0] == "Evaluation of roc_auc_score failed. Substituting 0"
    assert msgs[1] == "Evaluation of roc_auc_score failed. Substituting 0"
    assert msgs[2].startswith("Recall is ill-defined and being set to 0.0")
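# Hedged illustration (not from the original tests) of the failure mode the
# test above guards against: sklearn's roc_auc_score raises a ValueError when
# y_true contains only one class, which is why _create_group_metric_set is
# expected to intercept the per-subgroup failure and substitute 0.
import sklearn.metrics as skm

try:
    # Only one class present in y_true, so ROC AUC is undefined
    skm.roc_auc_score([1, 1, 1], [0.2, 0.7, 0.9])
except ValueError as exc:
    print(f"roc_auc_score failed: {exc}")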
def fairness_regression(model, model_id, sensitive_feat, X_test, y_test):
    # Create a dictionary of model(s) you want to assess for fairness
    ys_pred = {model_id: model.predict(X_test).reshape(-1, 1)}

    dash_dict = _create_group_metric_set(y_true=y_test,
                                         predictions=ys_pred,
                                         sensitive_features=sensitive_feat,
                                         prediction_type='regression')
    return dash_dict
def fairness_binary(model, model_id, sensitive_feat, X_test, y_test):
    # Create a dictionary of model(s) you want to assess for fairness
    ys_pred = {model_id: model.predict(X_test)}

    dash_dict = _create_group_metric_set(
        y_true=y_test,
        predictions=ys_pred,
        sensitive_features=validate_for_fairness(sensitive_feat, y_test),
        prediction_type='binary_classification')
    return dash_dict
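# Hedged usage sketch (not part of the original snippets): an end-to-end call
# of _create_group_metric_set on synthetic data, mirroring what fairness_binary
# above does minus its validate_for_fairness step. The import path of the
# private helper is an assumption and may differ between fairlearn versions;
# the data and model names are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from fairlearn.metrics._group_metric_set import _create_group_metric_set

X, y = make_classification(n_samples=200, random_state=0)
# Illustrative binary sensitive feature, independent of the model inputs
group = np.random.RandomState(0).randint(0, 2, size=200)

X_train, X_test, y_train, y_test, g_train, g_test = train_test_split(
    X, y, group, random_state=0)

model = LogisticRegression().fit(X_train, y_train)

dash_dict = _create_group_metric_set(
    y_true=y_test,
    predictions={"logistic_regression": model.predict(X_test)},
    sensitive_features={"group": g_test},
    prediction_type='binary_classification')

print(sorted(dash_dict.keys()))  # e.g. precomputedMetrics, predictedY, trueY, ...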
def test_round_trip_1p_1f(self, t_y_t, t_y_p, t_sf):
    expected = load_sample_dashboard(_BC_1P_1F)

    y_true = t_y_t(expected['trueY'])

    y_pred = {expected['modelNames'][0]: t_y_p(expected['predictedY'][0])}

    sf_file = expected['precomputedFeatureBins'][0]
    sf = [sf_file['binLabels'][x] for x in sf_file['binVector']]
    sensitive_feature = {sf_file['featureBinName']: t_sf(sf)}

    actual = _create_group_metric_set(
        y_true, y_pred, sensitive_feature, 'binary_classification')
    validate_dashboard_dictionary(actual)
    assert expected == actual
def test_regression_prediction_type(self):
    # For regression, both y_t and y_p can have floating point values
    y_t = [0, 1, 1, 0, 1, 1, 1.5, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_p = [1, 1, 1, 0, 1, 1, 1.5, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    # Using the `regression` prediction type should not crash
    result = _create_group_metric_set(
        y_t, predictions, sensitive_feature, 'regression')
    assert result['predictionType'] == 'regression'
    assert len(result['precomputedMetrics'][0][0]) == 6
def test_json_serializable(self):
    y_t = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_p = [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    actual = _create_group_metric_set(
        y_t, predictions, sensitive_feature, "binary_classification")

    # Check that we can turn the dictionary into JSON
    # Sometimes, you need to listen carefully to the quack: numpy integers,
    # for example, are not JSON serializable, so the dictionary must contain
    # only plain Python types
    result = json.dumps(actual)
    assert isinstance(result, str)
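# Hedged follow-on sketch (not from the original tests): since the dashboard
# dictionary round-trips through json.dumps, it can also be persisted to disk
# and reloaded later. The helper names and file name below are illustrative.
import json

def save_dashboard_dict(dashboard_dict, path="dashboard_dict.json"):
    # Write the output of _create_group_metric_set to a JSON file
    with open(path, "w") as fp:
        json.dump(dashboard_dict, fp)

def load_dashboard_dict(path="dashboard_dict.json"):
    # Read a previously saved dashboard dictionary back in
    with open(path) as fp:
        return json.load(fp)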
def test_probability_prediction_type(self):
    # For probability, y_p can have real values in [0, 1]
    y_t = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_p = [0.9, 1, 1, 0.1, 1, 1, 0.8, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    # Using the `probability` prediction type should not crash
    result = _create_group_metric_set(
        y_t, predictions, sensitive_feature, 'probability')
    assert result['predictionType'] == 'probability'
    assert len(result['precomputedMetrics'][0][0]) == 10
def test_round_trip_2p_3f(self, t_y_t, t_y_p, t_sf):
    expected = load_sample_dashboard(_BC_2P_3F)

    y_true = t_y_t(expected['trueY'])

    y_pred = {}
    y_p_ts = [t_y_p, lambda x: x]  # Only transform one y_p
    for i, name in enumerate(expected['modelNames']):
        y_pred[name] = y_p_ts[i](expected['predictedY'][i])

    sensitive_features = {}
    t_sfs = [lambda x: x, t_sf, lambda x: x]  # Only transform one sf
    for i, sf_file in enumerate(expected['precomputedFeatureBins']):
        sf = [sf_file['binLabels'][x] for x in sf_file['binVector']]
        sensitive_features[sf_file['featureBinName']] = t_sfs[i](sf)

    actual = _create_group_metric_set(
        y_true, y_pred, sensitive_features, 'binary_classification')
    validate_dashboard_dictionary(actual)
    assert expected == actual
def test_specific_metrics(self):
    y_t = [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]
    y_p = [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0]
    s_f = [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

    expected = MetricFrame(
        metrics={
            "accuracy_score": skm.accuracy_score,
            "roc_auc_score": skm.roc_auc_score,
        },
        y_true=y_t,
        y_pred=y_p,
        sensitive_features=s_f,
    )

    predictions = {"some model": y_p}
    sensitive_feature = {"my sf": s_f}

    actual = _create_group_metric_set(
        y_t, predictions, sensitive_feature, "binary_classification")

    # Do some sanity checks
    validate_dashboard_dictionary(actual)
    assert actual["trueY"] == y_t
    assert actual["predictedY"][0] == y_p
    assert actual["precomputedFeatureBins"][0]["binVector"] == s_f
    assert len(actual["precomputedMetrics"][0][0]) == 12

    # Cross check the two metrics we computed
    # Comparisons simplified because s_f was already {0,1}
    actual_acc = actual["precomputedMetrics"][0][0]["accuracy_score"]
    assert actual_acc["global"] == expected.overall["accuracy_score"]
    assert actual_acc["bins"] == list(expected.by_group["accuracy_score"])

    actual_roc = actual["precomputedMetrics"][0][0]["balanced_accuracy_score"]
    assert actual_roc["global"] == expected.overall["roc_auc_score"]
    assert actual_roc["bins"] == list(expected.by_group["roc_auc_score"])