def setUp(self):
    """Build a Titanic binary-classification fixture with two MLP pipelines.

    Creates self.X / self.y plus three dalex Explainers:
    exp ("model1") and exp3 ("model3") wrap the same small MLP pipeline,
    exp2 wraps a larger MLP and gets dalex's default label.
    """
    data = dx.datasets.load_titanic()
    # The target arrives as labels; encode it to 0/1 for the classifiers.
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    # Median-impute and standard-scale the numeric columns.
    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    # Constant-impute and one-hot encode the categorical columns;
    # handle_unknown='ignore' keeps transform safe on unseen categories.
    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Two differently-sized MLPs sharing the same preprocessing.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(20, 20),
                                                       max_iter=400,
                                                       random_state=0))])
    clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                        max_iter=400,
                                                        random_state=0))])
    clf.fit(self.X, self.y)
    clf2.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, label="model1", verbose=False)
    # NOTE(review): exp2 deliberately gets no explicit label — dalex assigns
    # a default one; confirm tests rely on that.
    self.exp2 = dx.Explainer(clf2, self.X, self.y, verbose=False)
    # exp3 wraps the same fitted model as exp, under a different label.
    self.exp3 = dx.Explainer(clf, self.X, self.y, label="model3", verbose=False)
def test(self):
    """Explainer construction with/without data and y, the ValueErrors raised
    by model-level methods when required data is missing, and the UserWarning
    emitted for a non-callable predict_function."""
    case1 = dx.Explainer(self.model, self.X, self.y, verbose=False)
    case2 = dx.Explainer(self.model, self.X, None, verbose=False)
    case3 = dx.Explainer(self.model, None, self.y, verbose=False)
    case4 = dx.Explainer(self.model, None, None, verbose=False)

    # Construction succeeds even with missing data/y ...
    self.assertIsInstance(case1, dx.Explainer)
    self.assertIsInstance(case2, dx.Explainer)
    self.assertIsInstance(case3, dx.Explainer)
    self.assertIsInstance(case4, dx.Explainer)

    # ... but model-level explanations require the missing pieces.
    with self.assertRaises(ValueError):
        case2.model_performance()
    with self.assertRaises(ValueError):
        case3.model_parts()
    with self.assertRaises(ValueError):
        case4.model_profile()

    # Instance-level explanations work without y.
    case5 = case2.predict_parts(self.X.iloc[[0]])
    case6 = case2.predict_profile(self.X.iloc[[0]])
    self.assertIsInstance(case5, dx.instance_level.BreakDown)
    self.assertIsInstance(case6, dx.instance_level.CeterisParibus)

    with warnings.catch_warnings(record=True) as w:
        # Cause all warnings to always be triggered.
        warnings.simplefilter("always")
        # Trigger a warning: a non-callable predict_function is rejected with
        # a UserWarning rather than an exception.
        case5 = dx.Explainer(self.model, self.X, self.y, predict_function=1,
                             verbose=False)
        assert issubclass(w[-1].category, UserWarning)
    self.assertIsInstance(case5, dx.Explainer)
def test(self):
    """Explainer construction with/without data and y, plus the error paths
    of model-level methods.

    NOTE(review): this is a near-duplicate of the earlier variant of this
    test; here the invalid predict_function Explainer is built without
    asserting the emitted warning — confirm the duplication is intentional.
    """
    case1 = dx.Explainer(self.model, self.X, self.y, verbose=False)
    case2 = dx.Explainer(self.model, self.X, None, verbose=False)
    case3 = dx.Explainer(self.model, None, self.y, verbose=False)
    case4 = dx.Explainer(self.model, None, None, verbose=False)

    # Construction succeeds even with missing data/y ...
    self.assertIsInstance(case1, dx.Explainer)
    self.assertIsInstance(case2, dx.Explainer)
    self.assertIsInstance(case3, dx.Explainer)
    self.assertIsInstance(case4, dx.Explainer)

    # ... but model-level explanations require the missing pieces.
    with self.assertRaises(ValueError):
        case2.model_performance()
    with self.assertRaises(ValueError):
        case3.model_parts()
    with self.assertRaises(ValueError):
        case4.model_profile()

    # Instance-level explanations work without y.
    case5 = case2.predict_parts(self.X.iloc[[0]])
    case6 = case2.predict_profile(self.X.iloc[[0]])
    self.assertIsInstance(case5, dx.instance_level.BreakDown)
    self.assertIsInstance(case6, dx.instance_level.CeterisParibus)

    # A non-callable predict_function still yields a usable Explainer.
    case5 = dx.Explainer(self.model, self.X, self.y, predict_function=1,
                         verbose=False)
    self.assertIsInstance(case5, dx.Explainer)
def pdp_profile(self, X, y, model_names=None, features=None, figsize=(8, 8)):
    """Plot partial-dependence (PDP) curves of the base model and of every
    model in self.models — one matplotlib figure per feature.

    Args:
        X, y: Data used to build the dalex Explainers.
        model_names: Defaults to all measure columns of self.pdp_measures.
            NOTE(review): computed but never used below — confirm intent.
        features: Feature names to plot; defaults to all rows of
            self.pdp_measures.colname.
        figsize: Size of each per-feature figure.
    """
    if model_names is None:
        model_names = self.pdp_measures.columns.tolist()[1:]
    if features is None:
        features = self.pdp_measures.colname.tolist()
    for fe in features:
        # PDP of the base model for this single feature.
        profile_base = dx.Explainer(self.base_model[1], X, y,
                                    label=self.base_model[0], verbose=False)
        profile_base = profile_base.model_profile(verbose=False, variables=[fe])
        df = pd.DataFrame({
            'x': profile_base.result._x_,
            self.base_model[0]: profile_base.result._yhat_
        })
        del profile_base  # release the explainer before building the next one
        plt.subplots(figsize=figsize)
        for model in self.models:
            profile = dx.Explainer(model[1], X, y, label=model[0], verbose=False)
            profile = profile.model_profile(verbose=False, variables=[fe])
            y_result = profile.result._yhat_
            # NOTE(review): assumes each profile shares the base model's x
            # grid so columns align — confirm dalex guarantees this.
            df[model[0]] = y_result
            del profile
        for col in df.columns[1:]:
            plt.plot(df['x'], df[col])
        plt.title("PDP curves for feature: " + fe)
        plt.legend([col for col in df.columns[1:]])
        plt.xlabel(fe)
        plt.show()
def test_errors(self):
    """Explainer should emit a UserWarning when a user predict_function
    returns a badly-shaped result: (n, 1) 2-D, (n, 1, 1) 3-D, or a 0-d
    single-element array."""
    from sklearn.ensemble import RandomForestRegressor
    data = dx.datasets.load_fifa()
    # Small slice keeps the fixture fast.
    X = data.drop(columns=['nationality', 'value_eur']).iloc[1:100, :]
    y = data['value_eur'][1:100]
    model = RandomForestRegressor()
    model.fit(X, y)

    def predict_function_return_2d(m, d):
        # Correct values, wrong shape: (n_rows, 1) instead of (n_rows,).
        n_rows = d.shape[0]
        prediction = m.predict(d)
        return prediction.reshape((n_rows, 1))

    def predict_function_return_3d(m, d):
        # Even worse: a 3-D result.
        n_rows = d.shape[0]
        prediction = m.predict(d)
        return prediction.reshape((n_rows, 1, 1))

    def predict_function_return_one_element_array(m, d):
        # Constant 0-d array, independent of the number of rows.
        return np.array(0.2)

    # NOTE(review): this filter is set before entering the catch_warnings
    # contexts, so it is not restored afterwards and leaks into the rest of
    # the test session — confirm that is acceptable.
    warnings.simplefilter("always")
    with warnings.catch_warnings(record=True) as w:
        # Trigger a warning.
        dx.Explainer(model, X, y, verbose=False, model_type='regression',
                     predict_function=predict_function_return_2d)
        assert issubclass(w[-1].category, UserWarning)
    with warnings.catch_warnings(record=True) as w:
        # Trigger a warning.
        dx.Explainer(model, X, y, verbose=False, model_type='regression',
                     predict_function=predict_function_return_3d)
        assert issubclass(w[-1].category, UserWarning)
    with warnings.catch_warnings(record=True) as w:
        # Trigger a warning.
        dx.Explainer(
            model, X, y, verbose=False, model_type='regression',
            predict_function=predict_function_return_one_element_array)
        assert issubclass(w[-1].category, UserWarning)
def setUp(self):
    """Titanic fixture with two labelled MLP-pipeline Explainers, plus the
    list of plot-container classes this suite expects to be supported."""
    data = dx.datasets.load_titanic()
    # Encode the target to 0/1 for the classifiers.
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    # Median-impute and scale numeric columns.
    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(
            strategy='median')), ('scaler', StandardScaler())])

    # Constant-impute and one-hot encode categorical columns.
    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                      ('cat', categorical_transformer, categorical_features)])

    # Two differently-sized MLPs sharing the same preprocessing.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(20, 20),
                                                       max_iter=400,
                                                       random_state=0))])
    clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                        max_iter=400,
                                                        random_state=0))])
    clf.fit(self.X, self.y)
    clf2.fit(self.X, self.y)
    self.exp = dx.Explainer(clf, self.X, self.y, label="model1", verbose=False)
    self.exp2 = dx.Explainer(clf2, self.X, self.y, label="model2", verbose=False)

    # These plot containers should be supported.
    self.reference_plots = [
        ROCContainer, ShapleyValuesContainer, BreakDownContainer,
        CeterisParibusContainer, FeatureImportanceContainer,
        PartialDependenceContainer, AccumulatedDependenceContainer,
        MetricsContainer
    ]
def generate_breakdown_explainer(self, i):
    """Generate a dalex break-down plot for test instance *i* and write it
    as an SVG under explainer_outputs/Break_Down/.

    Keras Sequential models named 'DNN' are re-wrapped in a KerasClassifier
    so dalex can treat them as scikit-learn classifiers; any other model is
    explained as-is.
    """
    model_name = type(self.model).__name__
    model = self.model
    if model_name == 'Sequential':
        # Keras path: X_test is a raw array here, so rebuild a DataFrame.
        X_test = pd.DataFrame.from_records(self.X_test)
        model_name = self.model.name
        if self.model.name == 'DNN':
            instc = self.X_test[i].reshape((1, -1))
            # Re-train a sklearn-compatible wrapper around the DNN builder.
            dnn_clf = tf.keras.wrappers.scikit_learn.KerasClassifier(
                dnn_model_create, input=self.X_test.shape[1], epochs=10,
                verbose=False)
            dnn_clf._estimator_type = "classifier"
            dnn_clf.fit(self.X_train, self.y_train)
            model = dnn_clf
        elif self.model.name == 'RNN':
            # NOTE(review): RNN branch is a stub — `instc` is never assigned
            # here, so predict_parts below would raise NameError for RNN
            # models. Confirm whether RNN support is still pending.
            print("")
    else:
        instc = self.X_test.iloc[i]
        X_test = self.X_test
    smart_grid_exp = dx.Explainer(
        model, X_test, self.y_test,
        label=("Smart Grid New England " + model_name +
               " Pipeline on instance: " + str(i)))
    # NOTE(review): columns are renamed after the Explainer was built —
    # verify dalex sees the intended feature names in the output.
    X_test.columns = self.feature_names
    instance = smart_grid_exp.predict_parts(instc, type='break_down')
    fig = instance.plot(max_vars=30, show=False)
    fig.write_image("explainer_outputs/Break_Down/" + model_name + "_" +
                    str(i) + "_" + self.grid + ".svg")
def setUp(self):
    """Fixture: an MLP pipeline on the Titanic CSV wrapped in an Explainer.

    Reads titanic.csv from the working directory, drops rows with NA,
    label-encodes the target, and stores a fitted dalex Explainer in
    self.exp (verbose left at its default).
    """
    data = pd.read_csv("titanic.csv", index_col=0).dropna()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_features = ['gender', 'class', 'embarked', 'country']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # NOTE(review): the step is named 'classifier' but holds an MLPRegressor
    # fitted on a 0/1 target — presumably to exercise regression-type
    # explainers on binary data; confirm this is intended.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPRegressor(hidden_layer_sizes=(150, 100, 50),
                                                      max_iter=500,
                                                      random_state=0))])
    clf.fit(self.X, self.y)
    self.exp = dx.Explainer(clf, self.X, self.y)
def setUp(self):
    """Fairness fixtures: hand-made subgroup arrays with per-group cutoffs,
    a German-credit classifier with two GroupFairness objects
    (self.mgf / self.mgf2), and a synthetic regression fairness object
    (self.mgf_reg)."""
    # Hand-crafted subgroups 'a'/'b'/'c' with per-group decision cutoffs.
    self.protected = np.array(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b',
                               'c', 'c', 'c', 'c'])
    self.y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0])
    self.y_pred = np.array([0.32, 0.43, 0.56, 0.67, 0.9, 0.67, 0.98, 0.1,
                            0.44, 1, 0.65, 0.55, 1])
    self.cutoff = {'a': 0.5, 'b': 0.4, 'c': 0.6}

    # classifier
    data = dx.datasets.load_german()
    X = data.drop(columns='risk')
    y = data.risk
    categorical_features = ['sex', 'job', 'housing', 'saving_accounts',
                            "checking_account", 'purpose']
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)])
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', DecisionTreeClassifier(max_depth=7,
                                                                random_state=123))])
    clf.fit(X, y)
    self.exp = dx.Explainer(clf, X, y, verbose=False)
    # Protected attribute combines sex with an age split at 25.
    self.german_protected = data.sex + '_' + np.where(data.age < 25,
                                                      'young', 'old')
    self.mgf = self.exp.model_fairness(protected=self.german_protected,
                                       privileged='male_old',
                                       verbose=False)
    self.mgf2 = deepcopy(self.mgf)
    self.mgf.label = 'first'
    self.mgf2.label = 'second'

    # regressor
    # NOTE(review): the draws below are not seeded, so this fixture is
    # nondeterministic across runs — confirm that is acceptable here.
    self.protected_reg = np.array([np.tile('A', 1000),
                                   np.tile('B', 1000)]).flatten()
    first = np.array([np.random.normal(100, 20, 1000),
                      np.random.normal(50, 10, 1000)]).flatten()
    second = np.array([np.random.normal(60, 20, 1000),
                       np.random.normal(60, 10, 1000)]).flatten()
    target = np.array([np.random.normal(10000, 2000, 1000),
                       np.random.normal(8000, 1000, 1000)]).flatten()
    data2 = pd.DataFrame({'first': first, 'second': second})
    reg = DecisionTreeRegressor()
    reg.fit(data2, target)
    self.exp_reg = dx.Explainer(reg, data2, target)
    self.mgf_reg = self.exp_reg.model_fairness(self.protected_reg, 'A')
def ale_plot_uplift(model, X_train, Y_train, treatment_col):
    """Plot accumulated-local-effects profiles of the model's uplift scores.

    The explainer's predict function is routed through calc_uplift so the
    profiles reflect uplift rather than the model's raw predictions.
    """
    def uplift_predict(fitted_model, rows):
        # Score uplift for the given rows, treating *treatment_col* as the
        # treatment indicator.
        return calc_uplift(fitted_model, rows, treatment_col)

    explainer = dalex.Explainer(model, X_train, Y_train,
                                predict_function=uplift_predict)
    ale_profile = explainer.model_profile(type='accumulated')
    ale_profile.plot()
def pdp_plot_uplift(model, X_train, Y_train, treatment_col):
    """Plot partial-dependence profiles of the model's uplift scores.

    Delegates prediction to calc_uplift so the profiles describe uplift
    instead of the model's raw output.
    """
    def uplift_scores(fitted_model, rows):
        # Uplift for the given rows with *treatment_col* as the treatment flag.
        return calc_uplift(fitted_model, rows, treatment_col)

    explainer = dalex.Explainer(model, X_train, Y_train,
                                predict_function=uplift_scores)
    pdp_profile = explainer.model_profile(type='partial')
    pdp_profile.plot()
def setUp(self):
    """Fixture: a 100-tree random forest on four numeric Titanic columns,
    wrapped in a dalex Explainer (self.exp)."""
    frame = dx.datasets.load_titanic()
    # Encode the label in place so self.y stays a pandas Series.
    frame.loc[:, 'survived'] = LabelEncoder().fit_transform(frame.survived)
    self.X = frame.loc[:, ["age", "fare", "sibsp", "parch"]]
    self.y = frame.survived
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(self.X, self.y)
    self.exp = dx.Explainer(forest, self.X, self.y, verbose=False)
def setUp(self):
    """Two fixtures for variable-importance tests:

    - self.exp: MLPClassifier on Titanic — a model WITHOUT a
      feature_importances_ attribute.
    - self.exp2: RandomForestRegressor on a FIFA slice — a model WITH
      feature_importances_.
    """
    data = dx.datasets.load_titanic()
    self.X = data.drop(columns=['survived', 'class', 'embarked'])
    self.y = data.survived
    # Gender is the only remaining non-numeric column; encode it.
    self.X.gender = LabelEncoder().fit_transform(self.X.gender)

    # this checks for no feature_importances_ attribute
    model = MLPClassifier(hidden_layer_sizes=(50, 50),
                          max_iter=400, random_state=0)
    model.fit(self.X, self.y)
    self.exp = dx.Explainer(model, self.X, self.y, verbose=False)

    data2 = dx.datasets.load_fifa()
    # Small slice (2000 rows, first 10 features) keeps the fixture fast.
    self.X2 = data2.drop(["nationality", "overall", "potential",
                          "value_eur", "wage_eur"], axis=1).iloc[0:2000, 0:10]
    self.y2 = data2['value_eur'].iloc[0:2000]

    # this checks for feature_importances_ attribute
    model2 = RandomForestRegressor(random_state=0)
    model2.fit(self.X2, self.y2)
    self.exp2 = dx.Explainer(model2, self.X2, self.y2, verbose=False)
def init_explainer(model, data_x: pd.DataFrame, exp_type: str = 'shap',
                   log_loss: bool = False, verbose: bool = False):
    """Build an explainer object for the SHAP or DALEX libraries.

    These objects are used later both for computations and for plots.

    Args:
        model: A tree-based model.
        data_x: Data matching the model's expected input.
        exp_type: Which explainer to build, {'shap', 'dalex'}.
        log_loss: Whether SHAP values explain the model output (False)
            or the classification error / log-loss (True).
        verbose: Passed through to the DALEX Explainer.

    Returns:
        An explainer object of the requested type.

    Raises:
        ValueError: If exp_type is neither 'shap' nor 'dalex'.
    """
    if exp_type == 'shap':
        if log_loss:
            try:
                return shap.TreeExplainer(model, data_x,
                                          model_output='log_loss')
            except Exception:
                # FIX: was a bare `except:` (also swallowed SystemExit /
                # KeyboardInterrupt). Fall back to building the explainer
                # without background data.
                return shap.TreeExplainer(model, model_output='log_loss')
        try:
            return shap.TreeExplainer(model, data_x)
        except Exception:
            # FIX: narrowed from bare `except:` — same fallback as above.
            return shap.TreeExplainer(model)
    elif exp_type == 'dalex':
        # DALEX gets the model's own predictions as y.
        return dx.Explainer(model, data_x, model.predict(data_x),
                            verbose=verbose)
    else:
        raise ValueError('exp_type="shap" or exp_type="dalex"')
# 전이학습을 이용한 사용자 분류 # ResNet-100 from keras.datasets import cifar100, cifar10 from keras.applications import VGG16, VGG19, Xception, ResNet101 from keras.models import Sequential, Model, Input from keras.layers import Dense, Conv2D, Flatten, BatchNormalization, Activation, MaxPooling2D, Dropout from keras.optimizers import Adam import dalex as dx expl = dx.Explainer(clf, X, y, label="Titanic MLP Pipeline") (x_train, y_train), (x_test, y_test) = cifar10.load_data() x_train = x_train.reshape(50000, 32, 32, 3).astype('float32') / 255.0 x_test = x_test.reshape(10000, 32, 32, 3).astype('float32') / 255.0 resnet101 = ResNet101(input_shape=(32, 32, 3), include_top=False) model = Sequential() model.add(resnet101) model.add(Flatten()) model.add(Dense(512)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dense(10, activation='softmax')) model.summary() model.compile(optimizer=Adam(2e-4),
class FairnessTest(unittest.TestCase):
    """Unit tests for dalex group-fairness utilities: confusion matrices,
    subgroup metrics, parity-loss calculations, and the fairness plots.

    NOTE(review): several tests use SubgroupConfusionMatrix,
    SubgroupConfusionMatrixMetrics, calculate_ratio, calculate_parity_loss,
    GroupFairnessClassification, ParameterCheckError and
    FairnessObjectsDifferenceError without a local import — these rely on
    module-level imports not visible in this chunk.
    """

    def test_ConfusionMatrix(self):
        """ConfusionMatrix counts and its input validation."""
        from dalex.fairness.group_fairness.utils import ConfusionMatrix
        y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        y_pred = np.array([0.32, 0.54, 0.56, 0.67, 0.34, 0.67, 0.98, 1])
        cutoff = 0.55

        cm = ConfusionMatrix(y_true, y_pred, cutoff)
        # proper calculations
        self.assertEqual(cm.cutoff, 0.55)
        self.assertEqual(cm.tp, 3)
        self.assertEqual(cm.tn, 2)
        self.assertEqual(cm.fp, 2)
        self.assertEqual(cm.fn, 1)

        # error assertions
        y_true = y_true[:-1]  # mismatched lengths must be rejected
        with self.assertRaises(AssertionError):
            cm_ = ConfusionMatrix(y_true, y_pred, cutoff)
        y_true = np.append(y_true, 1)
        cutoff = 1.5  # cutoff outside (0, 1) must be rejected
        with self.assertRaises(AssertionError):
            cm_ = ConfusionMatrix(y_true, y_pred, cutoff)

    # Class-level fixtures shared by the subgroup tests below:
    # three subgroups 'a'/'b'/'c' with per-group cutoffs.
    protected = np.array(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b',
                          'c', 'c', 'c', 'c'])
    y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0])
    y_pred = np.array([0.32, 0.43, 0.56, 0.67, 0.9, 0.67, 0.98, 0.1, 0.44,
                       1, 0.65, 0.55, 1])
    cutoff = {'a': 0.5, 'b': 0.4, 'c': 0.6}

    def test_SubConfusionMatrix(self):
        """Per-subgroup confusion-matrix counts and input validation."""
        from dalex.fairness.group_fairness.utils import SubgroupConfusionMatrix
        y_true = FairnessTest.y_true
        y_pred = FairnessTest.y_pred
        protected = FairnessTest.protected
        cutoff = FairnessTest.cutoff

        scf = SubgroupConfusionMatrix(y_true, y_pred, protected, cutoff)
        # proper calculations
        self.assertEqual(scf.sub_dict.get('a').tp, 2)
        self.assertEqual(scf.sub_dict.get('a').fp, 2)
        self.assertEqual(scf.sub_dict.get('a').fn, 0)
        self.assertEqual(scf.sub_dict.get('a').tn, 2)
        self.assertEqual(scf.sub_dict.get('b').tp, 2)
        self.assertEqual(scf.sub_dict.get('b').fp, 0)
        self.assertEqual(scf.sub_dict.get('b').fn, 1)
        self.assertEqual(scf.sub_dict.get('b').tn, 0)
        self.assertEqual(scf.sub_dict.get('c').tp, 2)
        self.assertEqual(scf.sub_dict.get('c').fp, 1)
        self.assertEqual(scf.sub_dict.get('c').fn, 0)
        self.assertEqual(scf.sub_dict.get('c').tn, 1)

        # error assertions
        y_true = y_true[:-1]  # mismatched lengths
        with self.assertRaises(AssertionError):
            cm_ = SubgroupConfusionMatrix(y_true, y_pred,
                                          protected, cutoff)
        y_true = np.append(y_true, 1)
        cutoff = [0.1, 0.2, 0.4]  # list instead of dict
        with self.assertRaises(AssertionError):
            cm_ = SubgroupConfusionMatrix(y_true, y_pred, protected, cutoff)

    def test_SubgroupConfusionMatrixMetrics(self):
        """Derived per-subgroup rates; undefined rates come out as NaN."""
        y_true = FairnessTest.y_true
        y_pred = FairnessTest.y_pred
        protected = FairnessTest.protected
        cutoff = FairnessTest.cutoff
        scf = SubgroupConfusionMatrix(y_true, y_pred, protected, cutoff)
        scf_metrics = SubgroupConfusionMatrixMetrics(scf)
        scmm = scf_metrics.subgroup_confusion_matrix_metrics
        self.assertEqual(scmm.get('a').get('TPR'), 1)
        self.assertEqual(scmm.get('b').get('TPR'), 0.667)
        # Group 'b' has no true negatives/false positives, so TNR is NaN.
        self.assertTrue(np.isnan(scmm.get('b').get('TNR')))

    def test_calculate_ratio(self):
        """Ratios vs. a reference group match a manual recomputation
        (with inf and 0 mapped to NaN)."""
        y_true = FairnessTest.y_true
        y_pred = FairnessTest.y_pred
        protected = FairnessTest.protected
        cutoff = FairnessTest.cutoff
        scf = SubgroupConfusionMatrix(y_true, y_pred, protected, cutoff)
        scf_metrics = SubgroupConfusionMatrixMetrics(scf)
        df_ratio = calculate_ratio(scf_metrics, 'a')

        b = list(scf_metrics.subgroup_confusion_matrix_metrics.get('b').values())
        a = list(scf_metrics.subgroup_confusion_matrix_metrics.get('a').values())
        ratio = np.array(b) / np.array(a)
        ratio[np.isinf(ratio)] = np.nan
        ratio[ratio == 0] = np.nan
        # Compare only the finite entries of row 'b' (iloc[1]).
        ratio_nonnan = ratio[np.isfinite(ratio)]
        df_ratio_nonnan = np.array(df_ratio.iloc[1, :][np.isfinite(df_ratio.iloc[1, :])])
        self.assertTrue(np.equal(ratio_nonnan, df_ratio_nonnan).all())

    def test_calculate_parity_loss(self):
        """Parity loss of TPR equals sum of |log(ratio)| across subgroups."""
        y_true = FairnessTest.y_true
        y_pred = FairnessTest.y_pred
        protected = FairnessTest.protected
        cutoff = FairnessTest.cutoff
        scf = SubgroupConfusionMatrix(y_true, y_pred, protected, cutoff)
        scf_metrics = SubgroupConfusionMatrixMetrics(scf)
        parity_loss = calculate_parity_loss(scf_metrics, "a")
        ratios = calculate_ratio(scf_metrics, "a")
        TPR_parity_loss = parity_loss.iloc[0]
        TPR_ratios = ratios.TPR / ratios.TPR[0]
        TPR_log = abs(np.log(TPR_ratios))
        self.assertEqual(TPR_log.sum(), TPR_parity_loss)

    # Class-level German-credit fixtures shared by the remaining tests.
    # NOTE(review): column names are capitalized ('Risk', 'Sex', 'Age')
    # here, unlike the lowercase names used elsewhere in this file —
    # presumably a different dalex dataset version; confirm.
    data = dx.datasets.load_german()
    X = data.drop(columns='Risk')
    y = data.Risk
    categorical_features = ['Sex', 'Job', 'Housing', 'Saving_accounts',
                            "Checking_account", 'Purpose']
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)])
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', DecisionTreeClassifier(max_depth=7,
                                                                random_state=123))])
    clf.fit(X, y)
    exp = dx.Explainer(clf, X, y)
    # Protected attribute combines sex with an age split at 25.
    german_protected = data.Sex + '_' + np.where(data.Age < 25, 'young', 'old')

    def test_GroupFairnessClassificationObject(self):
        """Direct construction of GroupFairnessClassification."""
        exp = FairnessTest.exp
        protected = FairnessTest.german_protected
        gfco = GroupFairnessClassification(y=exp.y, y_hat=exp.y_hat,
                                           protected=protected,
                                           privileged='male_old',
                                           verbose=False,
                                           label=exp.label)
        self.assertEqual(gfco.__class__.__name__, 'GroupFairnessClassification')

    def test_parameter_checks(self):
        """Invalid inputs raise ParameterCheckError; valid inputs get
        normalized (cutoff broadcast/filled, protected coerced to str array)."""
        exp = FairnessTest.exp
        protected = FairnessTest.german_protected

        # error handling
        wrong_protected = np.array([protected, protected])  # 2-D is invalid
        with self.assertRaises(ParameterCheckError):
            gfco = exp.model_fairness(protected=wrong_protected,
                                      privileged='male_old',
                                      verbose=False)
        with self.assertRaises(ParameterCheckError):
            gfco = exp.model_fairness(protected=protected,
                                      privileged='not_existing',
                                      verbose=False)
        with self.assertRaises(ParameterCheckError):
            gfco = GroupFairnessClassification(y=exp.y[:-1, ],
                                               y_hat=exp.y_hat,
                                               protected=protected,
                                               privileged='male_old',
                                               verbose=False,
                                               label=exp.label)
        with self.assertRaises(ParameterCheckError):
            gfco = GroupFairnessClassification(y=exp.y[:-1, ],
                                               y_hat=exp.y_hat[:-1, ],
                                               protected=protected,
                                               privileged='male_old',
                                               verbose=False,
                                               label=exp.label)
        with self.assertRaises(ParameterCheckError):
            gfco = exp.model_fairness(protected=protected,
                                      privileged='male_old',
                                      cutoff=1.2,
                                      verbose=False)
        with self.assertRaises(ParameterCheckError):
            gfco = exp.model_fairness(protected=protected,
                                      privileged='male_old',
                                      cutoff='not_int',
                                      verbose=False)

        # conversion check: a scalar cutoff is broadcast to every subgroup.
        gfco = exp.model_fairness(protected=protected,
                                  privileged='male_old',
                                  cutoff=0.6,
                                  verbose=False)
        self.assertEqual(list(gfco.cutoff.values()), [0.6, 0.6, 0.6, 0.6])
        # A partial dict is filled with the 0.5 default for missing groups.
        gfco = exp.model_fairness(protected=protected,
                                  privileged='male_old',
                                  cutoff={'male_old': 0.9},
                                  verbose=False)
        self.assertEqual(gfco.cutoff, {'male_old': 0.9,
                                       'female_old': 0.5,
                                       'male_young': 0.5,
                                       'female_young': 0.5})
        # Numeric protected values / privileged are coerced to strings.
        np.random.seed(1)
        new_protected = np.random.choice(np.array([0, 1]), 1000)
        gfco = exp.model_fairness(protected=new_protected,
                                  privileged=1,
                                  verbose=False)
        self.assertEqual(gfco.privileged, '1')
        self.assertEqual(list(gfco.protected), list(new_protected.astype('U')))
        # A plain list is converted to an ndarray.
        gfco = exp.model_fairness(protected=list(protected),
                                  privileged='male_old',
                                  verbose=False)
        self.assertEqual(type(gfco.protected), np.ndarray)

    def test_model_group_fairness(self):
        """Explainer.model_fairness yields a GroupFairnessClassification."""
        exp = FairnessTest.exp
        protected = FairnessTest.german_protected
        mgf = exp.model_fairness(protected=protected,
                                 privileged='male_old',
                                 verbose=False)
        self.assertEqual(mgf.__class__.__name__, 'GroupFairnessClassification')

    def test_plot_fairness_check(self):
        """Default fairness-check plot: title, type, per-object legend
        groups, and the errors raised for incompatible objects."""
        import plotly.graph_objects as go
        exp = FairnessTest.exp
        protected = FairnessTest.german_protected
        mgf = exp.model_fairness(protected=protected,
                                 privileged='male_old',
                                 verbose=False)
        fig = mgf.plot(show=False)
        self.assertEqual(fig.layout.title.text, "Fairness Check")
        self.assertEqual(fig.__class__, go.Figure)

        mgf2 = deepcopy(mgf)
        mgf.label = 'first'
        mgf2.label = 'second'
        fig = mgf.plot(objects=[mgf2], show=False)
        self.assertEqual(fig.__class__, go.Figure)
        self.assertEqual(fig['data'][0]['legendgroup'], "first")
        self.assertEqual(fig['data'][5]['legendgroup'], "second")

        # test errors in plots
        with self.assertRaises(FairnessObjectsDifferenceError):
            # Different privileged group cannot be plotted together.
            mgf_wrong = exp.model_fairness(protected=protected,
                                           privileged='male_young',
                                           verbose=False
                                           )
            mgf.plot([mgf_wrong])
        with self.assertRaises(FairnessObjectsDifferenceError):
            # Different underlying data length cannot be plotted together.
            exp_wrong = deepcopy(exp)
            exp_wrong.y = exp_wrong.y[:-1]
            exp_wrong.y_hat = exp_wrong.y_hat[:-1]
            mgf_wrong = exp_wrong.model_fairness(protected=protected[:-1],
                                                 privileged='male_old',
                                                 verbose=False)
            mgf.plot([mgf_wrong])

    def test_plot_metric_scores(self):
        """Same checks as the fairness-check plot, for type='metric_scores'."""
        import plotly.graph_objects as go
        exp = FairnessTest.exp
        protected = FairnessTest.german_protected
        mgf = exp.model_fairness(protected=protected,
                                 privileged='male_old',
                                 verbose=False)
        fig = mgf.plot(show=False, type='metric_scores')
        self.assertEqual(fig.layout.title.text, "Metric Scores")
        self.assertEqual(fig.__class__, go.Figure)

        mgf2 = deepcopy(mgf)
        mgf.label = 'first'
        mgf2.label = 'second'
        fig = mgf.plot(objects=[mgf2], show=False, type='metric_scores')
        self.assertEqual(fig.__class__, go.Figure)
        self.assertEqual(fig['data'][0]['legendgroup'], "first")
        self.assertEqual(fig['data'][5]['legendgroup'], "second")

        # test errors in plots
        with self.assertRaises(FairnessObjectsDifferenceError):
            mgf_wrong = exp.model_fairness(protected=protected,
                                           privileged='male_young',
                                           verbose=False
                                           )
            mgf.plot([mgf_wrong], type='metric_scores')
        with self.assertRaises(FairnessObjectsDifferenceError):
            exp_wrong = deepcopy(exp)
            exp_wrong.y = exp_wrong.y[:-1]
            exp_wrong.y_hat = exp_wrong.y_hat[:-1]
            mgf_wrong = exp_wrong.model_fairness(protected=protected[:-1],
                                                 privileged='male_old',
                                                 verbose=False)
            mgf.plot([mgf_wrong], type='metric_scores')
import dalex as dx

# NOTE(review): dx.Explainer() is called without the required model
# argument — as written this raises a TypeError at run time; presumably a
# placeholder or import-smoke-test snippet. Confirm intent.
dx.Explainer()
from _train import data, X, y, pipeline, pipeline_for_encoded_data, encoded_X
import pandas as pd
import sklearn
import numpy as np
from lime import lime_tabular
import dalex as dx

model = pipeline
# 80/20 split for training; note the explainer below is built on the FULL
# X/y, not just the training split.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.80)
model.fit(X_train, y_train)

# create an explainer for the model:
exp = dx.Explainer(model, X, y, label="Lung's Cancer MLP Pipeline")


# BreakDown and BreakDownInt methods
def BreakDown(number_of_observation):
    """Plot a break-down explanation for one test observation.

    The selected row comes back as a Series; wrapping it in a DataFrame and
    transposing (.T) restores a 1-row DataFrame for predict_parts.
    """
    bd = exp.predict_parts(pd.DataFrame(
        X_test.iloc[number_of_observation, :]).T, type='break_down')
    bd.plot()


def BreakDownI(number_of_observation):
    """Plot a break-down-with-interactions explanation for one observation."""
    bd_interactions = exp.predict_parts(pd.DataFrame(
        X_test.iloc[number_of_observation, :]).T, type='break_down_interactions')
    bd_interactions.plot()
# NOTE(review): the next line is the tail of a call whose beginning lies in
# a previous chunk not visible here — kept verbatim.
X_valid, Y_valid, treatment_col, just_get_model=True)


def calc_uplift_filled(model, x):
    """Uplift prediction: P(outcome | treated) - P(outcome | control).

    The treatment column index is hard-coded to 11 here, shadowing the
    outer treatment_col — presumably deliberate for this dataset; confirm.
    """
    treatment_col = 11
    #x = np.array(x)
    return predict_treatment(model, treatment_col, 1)(
        np.array(x)) - predict_treatment(model, treatment_col, 0)(np.array(x))


exp = dalex.Explainer(r_xgb_model, X_train, Y_train, label="xgboost",
                      predict_function=calc_uplift_filled)
# NOTE(review): the returned profile object is discarded — confirm only the
# side effects of computing it were wanted here.
exp.model_profile(type='partial')

#forty_ice_df = ice(data=train_X_imp_df[:5000], column='history',
#                   predict = lambda X: calc_uplift(r_xgb_model, X,treatment_col))
#ice_plot(forty_ice_df, c='dimgray', linewidth=0.3)
#plt.ylabel('Pred. AV %ile')
#plt.xlabel('history');
#ice_plot(forty_ice_df, linewidth=.5, plot_pdp=True,
#         pdp_kwargs={'c': 'k', 'linewidth': 3})
#plt.ylabel('Pred. AV %ile')
#plt.xlabel('history');
def pdp_comparator(self, X, y, metric='abs_sum', save_model_profiles=False,
                   variables=None, calculate_metric_for_base_model=False):
    """Compare the PDP profiles of self.models against self.base_model using
    the given metric.

    You can save (inside this object) the dalex model profiles by setting
    save_model_profiles=True. That requires more memory, but lets you
    recompute different metrics quickly with pdp_comparator_change_metric.
    You can choose a subset of features by passing their names as the
    *variables* list; None means all features.

    Returns:
        A DataFrame with one row per feature ('colname') and one distance
        column per model; also stored in self.pdp_measures.
    """
    distance = RashomonSetAnalyser.distance_function_generator(metric)

    # Base-model profile (all variables, or the requested subset).
    profile_base = dx.Explainer(self.base_model[1], X, y,
                                label=self.base_model[0], verbose=False)
    if variables is None:
        profile_base = profile_base.model_profile(verbose=False)
    else:
        profile_base = profile_base.model_profile(verbose=False,
                                                  variables=variables)
    df = pd.DataFrame({'colname': profile_base.result._vname_.unique()})
    if save_model_profiles:
        self.model_profiles = [profile_base]
    y_base = profile_base.result._yhat_
    x_base = profile_base.result._x_
    # Grid points per feature. NOTE(review): assumes every variable has an
    # equally long grid within the flattened result — confirm for dalex.
    sample_length = y_base.size / profile_base.result._vname_.nunique()

    for model in self.models:
        profile = dx.Explainer(model[1], X, y, label=model[0], verbose=False)
        if variables is None:
            profile = profile.model_profile(verbose=False)
        else:
            profile = profile.model_profile(verbose=False,
                                            variables=variables)
        y_result = profile.result._yhat_
        x_result = profile.result._x_
        tab_res = []
        for i in range(len(df.colname)):
            # Slice this feature's segment out of the flattened grid.
            lower = int(i * sample_length)
            higher = int((i + 1) * sample_length)
            tab_res.append(
                distance(x_base[lower:higher], y_base[lower:higher],
                         x_result[lower:higher], y_result[lower:higher]))
        df[model[0]] = tab_res
        if save_model_profiles:
            self.model_profiles.append(profile)
        else:
            del profile  # free memory when profiles are not kept

    if calculate_metric_for_base_model:
        # Distance of the base model to itself — a zero/baseline column.
        tab_res = []
        for i in range(len(df.colname)):
            lower = int(i * sample_length)
            higher = int((i + 1) * sample_length)
            tab_res.append(
                distance(x_base[lower:higher], y_base[lower:higher],
                         x_base[lower:higher], y_base[lower:higher]))
        df[self.base_model[0]] = tab_res

    self.pdp_measures = df
    return df