def explain_wrap(index, columns):
    """Plot ceteris-paribus profiles of the XGBoost and logistic models for
    one validation observation.

    Parameters:
        index: positional index into the module-level X_valid / Y_valid.
        columns: variable names to show in the plot.

    Side effects: opens the plot in a browser and increments the module-level
    ``count`` used to number generated plot files.
    """
    global count
    print("DOING {}".format(index))
    x, y = X_valid[index], Y_valid[index]
    explainer_xgb = explain(
        xg_predicted,
        data=X_train,
        y=Y_train,
        label="XGBoost model",
        predict_function=lambda X: xgmodel.predict_proba(X.to_numpy())[::, 1],
        variable_names=column_names)
    explainer_linear = explain(
        lin_predicted,
        data=X_train,
        y=Y_train,
        label="Logistic model",
        predict_function=lambda X: logmodel.predict_proba(X.to_numpy())[::, 1],
        variable_names=column_names)
    cp_xgb = individual_variable_profile(explainer_xgb, x, y)
    cp_lin = individual_variable_profile(explainer_linear, x, y)
    plot(cp_xgb, cp_lin,
         selected_variables=columns,
         destination="browser",
         show_observations=False)
    count += 1
def test_iris_classification_1(self):
    """A single-observation profile yields a CeterisParibus object."""
    profile = individual_variable_profile(self.explainer_rf,
                                          self.X[1],
                                          y=self.y[1])
    self.assertTrue(isinstance(profile, CeterisParibus))
    self.assertIsNotNone(profile.new_observation_true)
    # The stored predict function must still work on raw feature arrays.
    predictions = profile._predict_function(self.X[:3])
    self.assertEqual(len(predictions), 3)
def test_regression_2(self):
    """Sampled rows produce grid_points rows per variable per observation."""
    sample_size = 3
    rows = select_sample(self.X_train, n=sample_size)
    profile = individual_variable_profile(self.explainer_rf, rows,
                                          variables=['TAX', 'CRIM'])
    expected_rows = profile._grid_points * 2 * sample_size
    self.assertEqual(len(profile.profile), expected_rows)
def test_iris_classification_6(self):
    """A single observation passed as a one-row DataFrame is accepted."""
    observation = pd.DataFrame(self.X[5]).T
    profile = individual_variable_profile(self.explainer_rf, observation,
                                          self.y[5])
    expected_rows = len(self.iris['feature_names']) * profile._grid_points
    self.assertEqual(len(profile.profile), expected_rows)
def test_keras_3(self):
    """A single-row keras profile has grid_points rows per selected variable."""
    selected = ["CRIM", "ZN", "AGE", "INDUS", "B"]
    profile = individual_variable_profile(self.explainer_keras,
                                          self.x_train[5],
                                          y=self.y_train[5],
                                          variables=selected)
    self.assertEqual(len(profile.new_observation_true), 1)
    self.assertEqual(len(profile.profile),
                     profile._grid_points * len(selected))
def test_keras_2(self):
    """Ten observations yield grid_points * variables * 10 profile rows."""
    selected = ["CRIM", "ZN", "AGE", "INDUS", "B"]
    profile = individual_variable_profile(self.explainer_keras,
                                          pd.DataFrame(self.x_train[:10]),
                                          y=list(self.y_train[:10]),
                                          variables=selected)
    self.assertEqual(len(profile.new_observation_true), 10)
    self.assertEqual(len(profile.profile),
                     profile._grid_points * len(selected) * 10)
def test_regression_1(self):
    """Restricting to two variables exposes exactly those profile columns."""
    profile = individual_variable_profile(self.explainer_rf,
                                          self.X_train[0],
                                          y=self.y_train[0],
                                          variables=['TAX', 'CRIM'])
    self.assertIsNotNone(profile.new_observation_true)
    self.assertEqual(len(profile.selected_variables), 2)
    self.assertEqual(len(profile.profile), profile._grid_points * 2)
    self.assertIn("TAX", profile.profile.columns)
def test_iris_classification_2(self):
    """grid_points limits the number of profile rows per observation."""
    n_grid = 3
    feature = self.iris['feature_names'][0]
    profile = individual_variable_profile(self.explainer_rf,
                                          self.X[:10],
                                          variables=[feature],
                                          grid_points=n_grid)
    self.assertIn(feature, profile.profile.columns)
    self.assertEqual(len(profile.profile), 10 * n_grid)
    # No y supplied, so no true observations are recorded.
    self.assertIsNone(profile.new_observation_true)
    self.assertEqual(profile.selected_variables, [feature])
def test_iris_classification_5(self):
    """DataFrame inputs work and predicted probabilities stay within [0, 1]."""
    feature = self.iris['feature_names'][0]
    observations = pd.DataFrame(self.X[:20])
    targets = pd.DataFrame(self.y)
    profile = individual_variable_profile(self.explainer_rf, observations,
                                          targets, variables=[feature])
    self.assertEqual(len(profile.profile), 20 * profile._grid_points)
    predictions = profile.profile['_yhat_']
    self.assertLessEqual(max(predictions), 1)
    self.assertGreaterEqual(min(predictions), 0)
def create_cp(model, label, idx, path='../data/heloc_dataset_v1.csv'):
    """Return a ceteris-paribus profile for one HELOC observation.

    Parameters:
        model: fitted classifier exposing ``predict_proba``.
        label: display label for the explainer.
        idx: row label of the observation to profile.
        path: CSV file containing the HELOC dataset.
    """
    frame = pd.read_csv(path)
    feature_columns = prepare_data()
    explainer = explain(
        model,
        data=frame[feature_columns],
        y=frame.RiskPerformance,
        label=label,
        # Probability of the positive class.
        predict_function=lambda X: model.predict_proba(X)[::, 1])
    observation = frame[feature_columns].loc[idx]
    target = frame.loc[idx, 'RiskPerformance']
    return individual_variable_profile(explainer, observation, target)
def test_iris_classification_3(self):
    """Explicit variable_splits take precedence over grid_points."""
    feature = self.iris['feature_names'][0]
    ignored_grid = 5  # should be ignored in favour of variable_splits
    n_obs = 3
    profile = individual_variable_profile(
        self.explainer_rf,
        self.X[:n_obs],
        y=self.y[:n_obs],
        variables=[feature],
        grid_points=ignored_grid,
        variable_splits={feature: [10, 31]})
    # Two split values per observation.
    self.assertEqual(len(profile.profile), n_obs * 2)
def ceterisParibus_connector(self, feature, *arg):
    """Render a ceteris-paribus plot of *feature* for a user-supplied instance
    and return the path of the generated JPG.

    Parameters:
        feature: name of the feature to profile.
        *arg: strings of the form "name:value" describing the query instance.

    Side effects: writes an HTML plot (via ceteris_paribus) and converts it to
    'temp/plots<p>.jpg' with imgkit; sets ``self.certainty``.
    """
    from ceteris_paribus.plots.plots import plot
    # Parse "name:value" argument strings into a dict.
    query_instance = dict(s.split(':') for s in arg)
    #print(feature)
    #prepare data instance (nparray)
    categories = self.getCategoricalFeatures()
    np_instance = []
    for f in self.featureNames:
        if f in categories:
            # Categorical features are kept as raw strings.
            np_instance.append(query_instance[f])
        else:
            np_instance.append(float(query_instance[f]))
    #print(np_instance)
    prediction_proba = self.model.predict_proba(
        pd.DataFrame([query_instance]))[0]
    # Index of the highest-probability class.
    prediction = np.where(
        prediction_proba == np.amax(prediction_proba))[0][0]
    #print(prediction)
    explainer = explain(
        self.model,
        variable_names=self.featureNames,
        data=self.X_train,
        y=self.Y_train,
        label='Model',
        predict_function=lambda x: self.model.predict_proba(x)[::, 1])
    i = individual_variable_profile(explainer,
                                    np.array(np_instance),
                                    np.array([prediction]))
    # NOTE(review): `p` is concatenated to str below without str(), so plot()
    # presumably returns a string identifier — confirm against the library.
    p = plot(i, selected_variables=[feature], width=700, height=800, size=4)
    options = {'height': '500', 'width': '600'}
    imgkit.from_file('_plot_files/plots' + p + '.html',
                     'temp/plots' + p + '.jpg',
                     options=options)
    self.certainty = "I am 100 percent sure about the graph."
    return ("temp/plots" + str(p) + ".jpg")
def test_regression_3(self):
    """variable_splits give one profile row per split value per neighbour."""
    names = self.variable_names
    nearby = select_neighbours(self.X_train,
                               self.X_train[0],
                               variable_names=names,
                               selected_variables=names,
                               n=15)
    profile = individual_variable_profile(self.explainer_rf,
                                          nearby,
                                          variables=['LSTAT', 'RM'],
                                          variable_splits={
                                              'LSTAT': [10, 20, 30],
                                              'RM': [4, 5, 6, 7]
                                          })
    self.assertEqual(profile.selected_variables, ['LSTAT', 'RM'])
    # num of different values in splits: 3 + 4 = 7, for each of 15 neighbours
    self.assertEqual(len(profile.profile), 15 * 7)
# Preprocess numeric and categorical insurance features separately, fit a
# random-forest regression pipeline, and plot ceteris-paribus profiles.
numeric_features = ['age', 'bmi', 'children']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_features = ['sex', 'smoker', 'region']
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestRegressor())])
clf.fit(x, y)
from ceteris_paribus.explainer import explain
explainer_cat = explain(clf, var_names, x, y, label="categorical_model")
from ceteris_paribus.profiles import individual_variable_profile
# Profile the first ten observations of the mixed-type data.
cp_cat = individual_variable_profile(explainer_cat, x.iloc[:10], y.iloc[:10])
cp_cat.print_profile()
plot(cp_cat)
# Colour the profiles by the categorical 'smoker' feature.
plot(cp_cat, color="smoker")
# NOTE(review): the next two statements conclude a model-building function
# whose `def` line lies above this chunk — indentation reconstructed.
    svm_model.fit(x, y)
    return svm_model, x, y, var_names


if __name__ == "__main__":
    # Build the three regression models and their shared data.
    (linear_model, data, labels, variable_names) = linear_regression_model()
    (gb_model, _, _, _) = gradient_boosting_model()
    (svm_model, _, _, _) = supported_vector_machines_model()
    explainer_linear = explain(linear_model, variable_names, data, y)
    explainer_gb = explain(gb_model, variable_names, data, y)
    explainer_svm = explain(svm_model, variable_names, data, y)
    # single profile
    cp_1 = individual_variable_profile(explainer_gb, x[0], y[0])
    plot(cp_1,
         destination="notebook",
         selected_variables=["bmi"],
         print_observations=False)
    # local fit
    neighbours_x, neighbours_y = select_neighbours(x, x[10], y=y, n=10)
    cp_2 = individual_variable_profile(explainer_gb, neighbours_x,
                                       neighbours_y)
    plot(cp_2,
         show_residuals=True,
         selected_variables=["age"],
         print_observations=False,
         color_residuals='red',
         plot_title='')
# NOTE(review): the first line closes a model-constructor call begun above
# this chunk — indentation reconstructed.
        random_state=42)
    gb_model.fit(x, y)
    return gb_model, x, y, var_names


def supported_vector_machines_model():
    """Fit an SVR on the module-level x/y; return model, data, labels, names."""
    svm_model = svm.SVR(C=0.01, gamma='scale')
    svm_model.fit(x, y)
    return svm_model, x, y, var_names


if __name__ == "__main__":
    # Build the three regression models and compare their profiles.
    (linear_model, data, labels, variable_names) = linear_regression_model()
    (gb_model, _, _, _) = gradient_boosting_model()
    (svm_model, _, _, _) = supported_vector_machines_model()
    explainer_linear = explain(linear_model, variable_names, data, y)
    explainer_gb = explain(gb_model, variable_names, data, y)
    explainer_svm = explain(svm_model, variable_names, data, y)
    cp_profile = individual_variable_profile(explainer_linear, x[0], y[0])
    plot(cp_profile, show_residuals=True)
    sample_x, sample_y = select_sample(x, y, n=10)
    cp2 = individual_variable_profile(explainer_gb, sample_x, y=sample_y)
    cp3 = individual_variable_profile(explainer_gb, x[0], y[0])
    plot(cp3, show_residuals=True)
    # Overlay the linear and gradient-boosting profiles.
    plot(cp_profile, cp3, show_residuals=True)
# Split the iris data; `iris` is presumably a loaded sklearn bunch — confirm
# against the lines above this chunk.
X = iris['data']
y = iris['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)
print(iris['feature_names'])


def random_forest_classifier():
    """Fit a random forest on the training split.

    Returns (model, data, labels, variable_names) as expected by explain().
    """
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    return rf_model, X_train, y_train, iris['feature_names']


if __name__ == "__main__":
    (model, data, labels, variable_names) = random_forest_classifier()
    # Probability of the first class.
    predict_function = lambda X: model.predict_proba(X)[::, 0]
    explainer_rf = explain(model, variable_names, data, labels,
                           predict_function=predict_function)
    cp_profile = individual_variable_profile(explainer_rf, X[1], y=y[1])
    plot(cp_profile)
rf_model = ensemble.RandomForestRegressor(n_estimators=100, random_state=42) # Train the model using the training set rf_model.fit(X_train, y_train) # model, data, labels, variable_names return rf_model, X_train, y_train, list(boston['feature_names']) if __name__ == "__main__": (model, data, labels, variable_names) = random_forest_regression() explainer_rf = explain(model, variable_names, data, labels) cp_profile = individual_variable_profile(explainer_rf, X_train[0], y=y_train[0], variables=['TAX', 'CRIM']) plot(cp_profile) sample = select_sample(X_train, n=3) cp2 = individual_variable_profile(explainer_rf, sample, variables=['TAX', 'CRIM']) plot(cp2) neighbours = select_neighbours(X_train, X_train[0], variable_names=variable_names, selected_variables=variable_names, n=15) cp3 = individual_variable_profile(explainer_rf,
# Build an sRNARFTarget explainer and plot a ceteris-paribus profile for the
# feature named in sys.argv[3].
data = np.array(trainX)
yt = np.array(trainY)
labels = yt.ravel()
variable_names = data_for_prediction.columns
# Probability of class 1 from the fitted random forest.
predict_function = lambda X: RFModel.predict_proba(X)[::, 1]
explainer_rf = explain(RFModel,
                       variable_names,
                       data,
                       y=labels,
                       predict_function=predict_function,
                       label="sRNARFTarget")
#cp_profile = individual_variable_profile(explainer_rf, data_for_prediction, y = 1, grid_points = 100)
cp_profile = individual_variable_profile(explainer_rf,
                                         data_for_prediction,
                                         grid_points=200,
                                         variables=[sys.argv[3]])
# NOTE(review): this plot() call continues past the end of this chunk.
plot(cp_profile,
     show_profiles=True,
     show_residuals=True,
     show_rugs=True,
     height=700,
     width=750,
     yaxis_title='Prediction probablity for class 1',
     plot_title='Ceteris paribus profiles of feature ' + sys.argv[3] +
     ' for ' + sys.argv[1] + '-' + sys.argv[2] + ' pair interaction',
     color='blue',
     size=3,
     alpha=0.5,
     color_residuals='red',
     size_residuals=20,
def test_iris_classification_7(self):
    """A single observation in column orientation is rejected."""
    column_shaped = pd.DataFrame(self.X[5])
    with self.assertRaises(ValueError):
        individual_variable_profile(self.explainer_rf, column_shaped,
                                    self.y[5])
def test_categorical_1(self):
    """Categorical feature values survive into the profile unchanged."""
    profile = individual_variable_profile(self.explainer_cat,
                                          self.x.iloc[:10],
                                          self.y.iloc[:10])
    self.assertEqual(len(profile.new_observation_true), 10)
    self.assertIn('female', list(profile.profile['sex']))
    self.assertIn('sex', list(profile.profile['_vname_']))
# NOTE(review): the next two statements conclude network_architecture(),
# whose `def` line lies above this chunk — indentation reconstructed.
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


def keras_model():
    """Fit a scaled Keras MLP pipeline on the module-level training data.

    Returns (model, x_train, y_train, variable_names) for explain().
    """
    estimators = [('scaler', StandardScaler()),
                  ('mlp', KerasRegressor(build_fn=network_architecture,
                                         epochs=200))]
    model = Pipeline(estimators)
    model.fit(x_train, y_train)
    return model, x_train, y_train, boston.feature_names


if __name__ == "__main__":
    model, x_train, y_train, var_names = keras_model()
    explainer_keras = explain(model, var_names, x_train, y_train,
                              label='KerasMLP')
    # Profile the first ten training observations over five variables.
    cp = individual_variable_profile(
        explainer_keras,
        x_train[:10],
        y=y_train[:10],
        variables=["CRIM", "ZN", "AGE", "INDUS", "B"])
    plot(cp,
         show_residuals=True,
         selected_variables=["CRIM", "ZN", "AGE", "B"],
         show_observations=True,
         show_rugs=True)