def explain_wrap(index, columns):
    """Plot ceteris-paribus profiles for one validation observation.

    Builds explainers for the XGBoost and logistic models, computes a
    profile of each for the observation at ``index`` and plots the two
    side by side, restricted to ``columns``.  Uses the module-level
    ``count`` as a running plot counter.
    """
    global count
    print("DOING {}".format(index))
    observation, target = X_valid[index], Y_valid[index]
    # One explainer per model; both score the class-1 probability.
    xgb_explainer = explain(
        xg_predicted,
        data=X_train,
        y=Y_train,
        label="XGBoost model",
        predict_function=lambda X: xgmodel.predict_proba(X.to_numpy())[::, 1],
        variable_names=column_names)
    linear_explainer = explain(
        lin_predicted,
        data=X_train,
        y=Y_train,
        label="Logistic model",
        predict_function=lambda X: logmodel.predict_proba(X.to_numpy())[::, 1],
        variable_names=column_names)
    profiles = [individual_variable_profile(e, observation, target)
                for e in (xgb_explainer, linear_explainer)]
    plot(*profiles,
         selected_variables=columns,
         destination="browser",
         show_observations=False)
    # Inline notebook display of the generated HTML was disabled here
    # (previously rendered "_plot_files/plots{count}.html" via IFrame/HTML).
    count += 1
def setUp(self):
    """Fit a preprocessing + random-forest pipeline on the insurance
    data set and wrap it in a ceteris-paribus explainer."""
    frame = pd.read_csv(os.path.join(DATASETS_DIR, 'insurance.csv'))
    self.x = frame.drop(['charges'], inplace=False, axis=1)
    self.y = frame['charges']
    feature_names = list(self.x)
    # Scale the numeric columns; one-hot encode the categorical ones.
    preprocessor = ColumnTransformer(transformers=[
        ('num',
         Pipeline(steps=[('scaler', StandardScaler())]),
         ['age', 'bmi', 'children']),
        ('cat',
         Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
         ['sex', 'smoker', 'region']),
    ])
    # Full prediction pipeline: preprocessing followed by the regressor.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestRegressor())])
    clf.fit(self.x, self.y)
    self.explainer_cat = explain(clf, feature_names, self.x, self.y,
                                 label="categorical_model")
def setUp(self):
    """Train a scaled Keras MLP on the Boston housing data and build a
    ceteris-paribus explainer around the fitted pipeline."""
    boston = datasets.load_boston()
    features, target = boston.data, boston.target
    # Only the training split is used below.
    x_train, _, y_train, _ = train_test_split(features, target,
                                              test_size=0.33,
                                              random_state=42)

    def build_mlp():
        # Two tanh hidden layers, linear output, trained with MSE/Adam.
        net = Sequential()
        net.add(Dense(640, input_dim=features.shape[1]))
        net.add(Activation('tanh'))
        net.add(Dense(320))
        net.add(Activation('tanh'))
        net.add(Dense(1))
        net.compile(loss='mean_squared_error', optimizer='adam')
        return net

    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('mlp', KerasRegressor(build_fn=build_mlp,
                                                epochs=20))])
    pipeline.fit(x_train, y_train)
    self.x_train, self.y_train = x_train, y_train
    self.var_names = boston.feature_names
    self.explainer_keras = explain(pipeline, self.var_names,
                                   self.x_train, self.y_train,
                                   label='KerasMLP')
def test_explainer_16(self):
    """The default predict function should handle array-backed data."""
    subset_x, subset_y = self.X[:10], self.y[:10]
    explainer = explain(self.rf_model,
                        variable_names=self.var_names,
                        data=subset_x,
                        y=subset_y)
    predictions = explainer.predict_fun(pd.DataFrame(subset_x))
    self.assertEqual(len(predictions), 10)
def test_explainer_14(self):
    """A 1-D array observation is wrapped into a single-row data frame."""
    explainer = explain(self.rf_model,
                        variable_names=["a", "b"],
                        data=np.array(["cc", "dd"]))
    expected = pd.DataFrame.from_dict({"a": ["cc"], "b": ["dd"]})
    np.testing.assert_array_equal(explainer.data, expected)
def create_cp(model, label, idx, path='../data/heloc_dataset_v1.csv'):
    """Return a ceteris-paribus profile for one HELOC observation.

    Loads the data set from ``path``, explains ``model`` on the columns
    selected by ``prepare_data()``, and profiles the row at ``idx``
    against its ``RiskPerformance`` label.
    """
    frame = pd.read_csv(path)
    features = prepare_data()
    explainer = explain(
        model,
        data=frame[features],
        y=frame.RiskPerformance,
        label=label,
        # Score the probability of the second class.
        predict_function=lambda X: model.predict_proba(X)[::, 1])
    observation = frame[features].loc[idx]
    target = frame.loc[idx, 'RiskPerformance']
    return individual_variable_profile(explainer, observation, target)
def ceterisParibus_connector(self, feature, *arg):
    """Render a ceteris-paribus plot for ``feature`` around one query instance.

    ``arg`` is a sequence of ``"name:value"`` strings describing the
    instance.  Saves the plot as an HTML file, converts it to JPG via
    imgkit, sets ``self.certainty``, and returns the JPG path.
    """
    from ceteris_paribus.plots.plots import plot
    # Parse the "name:value" pairs into a dict keyed by feature name.
    query_instance = dict(s.split(':') for s in arg)
    #print(feature)
    #prepare data instance (nparray)
    categories = self.getCategoricalFeatures()
    np_instance = []
    # Build the raw instance vector in feature order: categorical values
    # stay strings, everything else is cast to float.
    for f in self.featureNames:
        if f in categories:
            np_instance.append(query_instance[f])
        else:
            np_instance.append(float(query_instance[f]))
    #print(np_instance)
    prediction_proba = self.model.predict_proba(
        pd.DataFrame([query_instance]))[0]
    # Index of the most probable class for this instance.
    prediction = np.where(
        prediction_proba == np.amax(prediction_proba))[0][0]
    #print(prediction)
    explainer = explain(
        self.model,
        variable_names=self.featureNames,
        data=self.X_train,
        y=self.Y_train,
        label='Model',
        predict_function=lambda x: self.model.predict_proba(x)[::, 1])
    i = individual_variable_profile(explainer, np.array(np_instance),
                                    np.array([prediction]))
    # NOTE(review): plot() apparently returns an identifier used in the
    # generated file names — it is string-concatenated below, so it is
    # presumably already a str; the later str(p) would then be redundant.
    # TODO confirm against ceteris_paribus.plots.plots.plot.
    p = plot(i, selected_variables=[feature], width=700, height=800, size=4)
    options = {'height': '500', 'width': '600'}
    # Convert the saved HTML plot into a JPG for the chat client.
    imgkit.from_file('_plot_files/plots' + p + '.html',
                     'temp/plots' + p + '.jpg',
                     options=options)
    self.certainty = "I am 100 percent sure about the graph."
    return ("temp/plots" + str(p) + ".jpg")
def setUp(self):
    """Fit a random-forest regressor on the Boston housing data and
    wrap it in a ceteris-paribus explainer."""
    boston = datasets.load_boston()
    features, target = boston['data'], boston['target']
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        features, target, test_size=0.33, random_state=42)
    model, data, labels, self.variable_names = random_forest_regression(
        self.X_train, self.y_train, list(boston['feature_names']))
    self.explainer_rf = explain(model, self.variable_names, data, labels,
                                label="rf_model")
def setUp(self):
    """Fit a random-forest classifier on iris and wrap it in an
    explainer that scores the probability of the first class."""
    self.iris = load_iris()
    self.X = self.iris['data']
    self.y = self.iris['target']
    # Only the training split is used below.
    X_train, _, y_train, _ = train_test_split(self.X, self.y,
                                              test_size=0.33,
                                              random_state=42)
    model, data, labels, variable_names = random_forest_classifier(
        X_train, y_train, list(self.iris['feature_names']))

    def predict_function(X):
        # Probability of class 0, as a 1-D array.
        return model.predict_proba(X)[::, 0]

    self.explainer_rf = explain(model, variable_names, data, labels,
                                predict_function=predict_function,
                                label="rf_model")
def test_explainer_5(self):
    """Without a label, explain() falls back to the model class name."""
    explainer = explain(self.rf_model, [])  # raises warning
    self.assertEqual(explainer.label, "RandomForestRegressor")
def test_explainer_2(self):
    """The model's own ``predict`` attribute becomes the predict function."""
    mock_model = MagicMock(predict=id)
    explainer = explain(mock_model, data=pd.DataFrame())
    self.assertEqual(explainer.predict_fun, id)
def random_forest_regression(): # Create linear regression object rf_model = ensemble.RandomForestRegressor(n_estimators=100, random_state=42) # Train the model using the training set rf_model.fit(X_train, y_train) # model, data, labels, variable_names return rf_model, X_train, y_train, list(boston['feature_names']) if __name__ == "__main__": (model, data, labels, variable_names) = random_forest_regression() explainer_rf = explain(model, variable_names, data, labels) cp_profile = individual_variable_profile(explainer_rf, X_train[0], y=y_train[0], variables=['TAX', 'CRIM']) plot(cp_profile) sample = select_sample(X_train, n=3) cp2 = individual_variable_profile(explainer_rf, sample, variables=['TAX', 'CRIM']) plot(cp2) neighbours = select_neighbours(X_train, X_train[0],
data_for_prediction = datarow.iloc[:, 2:] trainX = pickle.load( open('./PickledModelData/RFData/trainX_sRNARFTarget.pkl', 'rb')) trainY = pickle.load( open('./PickledModelData/RFData/trainY_sRNARFTarget.pkl', 'rb')) data = np.array(trainX) yt = np.array(trainY) labels = yt.ravel() variable_names = data_for_prediction.columns predict_function = lambda X: RFModel.predict_proba(X)[::, 1] explainer_rf = explain(RFModel, variable_names, data, y=labels, predict_function=predict_function, label="sRNARFTarget") #cp_profile = individual_variable_profile(explainer_rf, data_for_prediction, y = 1, grid_points = 100) cp_profile = individual_variable_profile(explainer_rf, data_for_prediction, grid_points=200, variables=[sys.argv[3]]) plot(cp_profile, show_profiles=True, show_residuals=True, show_rugs=True, height=700, width=750, yaxis_title='Prediction probablity for class 1',
random_state=42) gb_model.fit(x, y) return gb_model, x, y, var_names def supported_vector_machines_model(): svm_model = svm.SVR(C=0.01, gamma='scale') svm_model.fit(x, y) return svm_model, x, y, var_names if __name__ == "__main__": (linear_model, data, labels, variable_names) = linear_regression_model() (gb_model, _, _, _) = gradient_boosting_model() (svm_model, _, _, _) = supported_vector_machines_model() explainer_linear = explain(linear_model, variable_names, data, y) explainer_gb = explain(gb_model, variable_names, data, y) explainer_svm = explain(svm_model, variable_names, data, y) cp_profile = individual_variable_profile(explainer_linear, x[0], y[0]) plot(cp_profile, show_residuals=True) sample_x, sample_y = select_sample(x, y, n=10) cp2 = individual_variable_profile(explainer_gb, sample_x, y=sample_y) cp3 = individual_variable_profile(explainer_gb, x[0], y[0]) plot(cp3, show_residuals=True) plot(cp_profile, cp3, show_residuals=True)
X = iris['data'] y = iris['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) print(iris['feature_names']) def random_forest_classifier(): rf_model = RandomForestClassifier(n_estimators=100, random_state=42) rf_model.fit(X_train, y_train) return rf_model, X_train, y_train, iris['feature_names'] if __name__ == "__main__": (model, data, labels, variable_names) = random_forest_classifier() predict_function = lambda X: model.predict_proba(X)[::, 0] explainer_rf = explain(model, variable_names, data, labels, predict_function=predict_function) cp_profile = individual_variable_profile(explainer_rf, X[1], y=y[1]) plot(cp_profile)
def test_explainer_17(self):
    """The default predict function should handle data-frame input."""
    frame = pd.DataFrame(self.X[:10])
    explainer = explain(self.rf_model, data=frame)
    self.assertEqual(len(explainer.predict_fun(frame)), 10)
def test_explainer_7(self):
    """explain() called without labels or data must raise ValueError."""
    # The original bound the (never-created) result to an unused local and
    # captured an unused context variable; assertRaises needs only the call.
    with self.assertRaises(ValueError):
        explain(self.rf_model)
return gb_model, x, y, var_names def supported_vector_machines_model(): svm_model = svm.SVR(C=0.01, gamma='scale', kernel='poly') svm_model.fit(x, y) return svm_model, x, y, var_names if __name__ == "__main__": (linear_model, data, labels, variable_names) = linear_regression_model() (gb_model, _, _, _) = gradient_boosting_model() (svm_model, _, _, _) = supported_vector_machines_model() explainer_linear = explain(linear_model, variable_names, data, y) explainer_gb = explain(gb_model, variable_names, data, y) explainer_svm = explain(svm_model, variable_names, data, y) # single profile cp_1 = individual_variable_profile(explainer_gb, x[0], y[0]) plot(cp_1, destination="notebook", selected_variables=["bmi"], print_observations=False) # local fit neighbours_x, neighbours_y = select_neighbours(x, x[10], y=y, n=10) cp_2 = individual_variable_profile(explainer_gb, neighbours_x, neighbours_y) plot(cp_2,
def test_explainer_15(self):
    """Mismatched variable_names / data widths must raise ValueError."""
    # The original bound the (never-created) result to an unused local;
    # assertRaises only needs the raising call itself.
    with self.assertRaises(ValueError):
        explain(self.rf_model,
                variable_names=["a", "b", "c"],
                data=self.df.values)
def test_explainer_11(self):
    """A one-column DataFrame of labels is squeezed into a Series."""
    labels = pd.DataFrame(np.array([1, 4]))
    explainer = explain(self.rf_model, variable_names=["a", "b"], y=labels)
    np.testing.assert_array_equal(explainer.y, pd.Series([1, 4]))
def test_explainer_9(self):
    """A plain list of labels is converted to a pandas Series."""
    explainer = explain(self.rf_model,
                        variable_names=["a", "b", "c"],
                        y=[1, 2, 3])
    np.testing.assert_array_equal(explainer.y, pd.Series([1, 2, 3]))
def test_explainer_8(self):
    """Variable names default to the data frame's column labels."""
    explainer = explain(self.rf_model, data=self.df)
    self.assertEqual(explainer.var_names, ['a', 'b'])
def test_explainer_1(self):
    """A model without a ``predict`` attribute must be rejected."""
    mock_model = MagicMock()
    delattr(mock_model, 'predict')
    # Dropped the unused ``as c`` context variable from the original.
    with self.assertRaises(ValueError):
        explain(mock_model, self.var_names)
def test_explainer_4(self):
    """An explicit label is stored verbatim on the explainer."""
    custom_label = "xyz"
    explainer = explain(self.rf_model, [], label=custom_label)
    self.assertEqual(explainer.label, custom_label)
model.compile(loss='mean_squared_error', optimizer='adam') return model def keras_model(): estimators = [('scaler', StandardScaler()), ('mlp', KerasRegressor(build_fn=network_architecture, epochs=200))] model = Pipeline(estimators) model.fit(x_train, y_train) return model, x_train, y_train, boston.feature_names if __name__ == "__main__": model, x_train, y_train, var_names = keras_model() explainer_keras = explain(model, var_names, x_train, y_train, label='KerasMLP') cp = individual_variable_profile( explainer_keras, x_train[:10], y=y_train[:10], variables=["CRIM", "ZN", "AGE", "INDUS", "B"]) plot(cp, show_residuals=True, selected_variables=["CRIM", "ZN", "AGE", "B"], show_observations=True, show_rugs=True)
def test_explainer_6(self):
    """A model whose str() is unusable gets the generic fallback label."""
    mock_model = MagicMock()
    mock_model.__str__.return_value = 'xyz'
    explainer = explain(mock_model, [])  # raises warning
    self.assertEqual(explainer.label, "unlabeled_model")
def test_explainer_12(self):
    """DataFrame input is stored on the explainer unchanged."""
    explainer = explain(self.rf_model, data=self.df)
    np.testing.assert_array_equal(explainer.data, self.df)
def test_explainer_3(self):
    """A user-supplied predict_function overrides the model's own."""
    explainer = explain(self.rf_model, [], predict_function=sum)
    self.assertEqual(explainer.predict_fun, sum)
numeric_features = ['age', 'bmi', 'children'] numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())]) categorical_features = ['sex', 'smoker', 'region'] categorical_transformer = Pipeline( steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]) preprocessor = ColumnTransformer(transformers=[( 'num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)]) # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestRegressor())]) clf.fit(x, y) from ceteris_paribus.explainer import explain explainer_cat = explain(clf, var_names, x, y, label="categorical_model") from ceteris_paribus.profiles import individual_variable_profile cp_cat = individual_variable_profile(explainer_cat, x.iloc[:10], y.iloc[:10]) cp_cat.print_profile() plot(cp_cat) plot(cp_cat, color="smoker")
def test_explainer_13(self):
    """Numpy data plus explicit names round-trips to the source frame."""
    explainer = explain(self.rf_model,
                        variable_names=["a", "b"],
                        data=self.df.values)
    np.testing.assert_array_equal(explainer.data, self.df)