def test_partial_dependence_multiclass(self):
    # Iris data classes: ['setosa', 'versicolor', 'virginica']
    iris = datasets.load_iris()

    # 1. Using a GradientBoostingClassifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)
    classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
    interpreter = Interpretation()
    interpreter.load_data(iris.data, iris.feature_names)
    pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]],
                                                               classifier_predict_fn,
                                                               grid_resolution=25,
                                                               sample=True)
    expected_feature_name = PartialDependence.feature_column_name_formatter('sepal length (cm)')
    self.assertIn(expected_feature_name, pdp_df.columns.values,
                  "{0} not in columns {1}".format(expected_feature_name, pdp_df.columns.values))

    # 2. Using an SVC
    from sklearn import svm
    # With SVC, predict_proba is supported only if the probability flag is enabled;
    # by default it is False.
    clf = svm.SVC(probability=True)
    clf.fit(iris.data, iris.target)
    classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
    interpreter = Interpretation()
    interpreter.load_data(iris.data, iris.feature_names)
    pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]],
                                                               classifier_predict_fn,
                                                               grid_resolution=25,
                                                               sample=True)
    self.assertIn(expected_feature_name, pdp_df.columns.values,
                  "{0} not in columns {1}".format(expected_feature_name, pdp_df.columns.values))
def setUp(self):
    args = create_parser().parse_args()
    debug = args.debug
    self.seed = args.seed
    self.n = args.n
    self.dim = args.dim
    self.features = [str(i) for i in range(self.dim)]
    self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
    self.B = np.array([-10.1, 2.2, 6.1])
    self.y = np.dot(self.X, self.B)
    self.y_as_int = np.round(expit(self.y))
    self.y_as_string = np.array([str(i) for i in self.y_as_int])
    # Example dataset for y = B.X:
    # X = array([[ 1.62434536, -0.61175641, -0.52817175], ...,
    #            [-0.15065961, -1.40002289, -1.30106608]])   (1000 x 3)
    # B = array([-10.1, 2.2, 6.1])
    # y = array([-2.09736000e+01, -1.29850618e+00, -1.73511155e+01, ...])  (1000 x 1)
    # features = ['0', '1', '2']
    #
    # Other output types:
    # y_as_int    = array([0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., ...])
    # y_as_string = array(['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ...])

    # Another set of input: sample data
    self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
    self.sample_feature_name = [str(i) for i in range(self.sample_x.shape[1])]

    if debug:
        self.interpreter = Interpretation(log_level='DEBUG')
    else:
        self.interpreter = Interpretation()  # default level is 'WARNING'
    self.interpreter.load_data(self.X, feature_names=self.features)

    self.regressor = LinearRegression()
    self.regressor.fit(self.X, self.y)
    self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

    self.classifier = LogisticRegression()
    self.classifier.fit(self.X, self.y_as_int)
    self.classifier_predict_fn = InMemoryModel(self.classifier.predict,
                                               examples=self.X,
                                               unique_values=self.classifier.classes_)
    self.classifier_predict_proba_fn = InMemoryModel(self.classifier.predict_proba,
                                                     examples=self.X)

    self.string_classifier = LogisticRegression()
    self.string_classifier.fit(self.X, self.y_as_string)
    self.string_classifier_predict_fn = InMemoryModel(self.string_classifier.predict_proba,
                                                      examples=self.X)

    # Yet another set of input: categorical sample data
    self.sample_x_categorical = np.array([['B', -1], ['A', -1], ['A', -2],
                                          ['C', 1], ['C', 2], ['A', 1]])
    self.sample_y_categorical = np.array(['A', 'A', 'A', 'B', 'B', 'B'])
    self.categorical_feature_names = ['Letters', 'Numbers']
    self.categorical_transformer = MultiColumnLabelBinarizer()
    self.categorical_transformer.fit(self.sample_x_categorical)
    self.sample_x_categorical_transformed = self.categorical_transformer.transform(
        self.sample_x_categorical)
    self.categorical_classifier = LogisticRegression()
    self.categorical_classifier.fit(self.sample_x_categorical_transformed,
                                    self.sample_y_categorical)
    self.categorical_predict_fn = lambda x: self.categorical_classifier.predict_proba(
        self.categorical_transformer.transform(x))
    self.categorical_model = InMemoryModel(self.categorical_predict_fn,
                                           examples=self.sample_x_categorical)
def setUp(self):
    args = create_parser().parse_args()
    debug = args.debug
    self.seed = args.seed
    self.n = args.n
    self.dim = args.dim
    self.features = [str(i) for i in range(self.dim)]
    self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
    self.B = np.array([-10.1, 2.2, 6.1])
    self.y = np.dot(self.X, self.B)
    self.y_as_int = np.round(expit(self.y))
    self.y_as_string = np.array([str(i) for i in self.y_as_int])
    # Example dataset for y = B.X:
    # X = array([[ 1.62434536, -0.61175641, -0.52817175], ...,
    #            [-0.15065961, -1.40002289, -1.30106608]])   (1000 x 3)
    # B = array([-10.1, 2.2, 6.1])
    # y = array([-2.09736000e+01, -1.29850618e+00, -1.73511155e+01, ...])  (1000 x 1)
    # features = ['0', '1', '2']
    #
    # Other output types:
    # y_as_int    = array([0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., ...])
    # y_as_string = array(['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ...])

    # Another set of input: sample data
    self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
    self.sample_feature_name = [str(i) for i in range(self.sample_x.shape[1])]

    if debug:
        self.interpreter = Interpretation(training_data=self.X,
                                          feature_names=self.features,
                                          log_level='DEBUG')
    else:
        # default log level is 'WARNING'
        self.interpreter = Interpretation(training_data=self.X,
                                          feature_names=self.features)

    self.regressor = LinearRegression()
    self.regressor.fit(self.X, self.y)
    self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

    self.classifier = LogisticRegression()
    self.classifier.fit(self.X, self.y_as_int)
    self.classifier_predict_fn = InMemoryModel(self.classifier.predict,
                                               examples=self.X,
                                               unique_values=self.classifier.classes_,
                                               probability=False)
    self.classifier_predict_proba_fn = InMemoryModel(self.classifier.predict_proba,
                                                     examples=self.X,
                                                     probability=True)

    self.string_classifier = LogisticRegression()
    self.string_classifier.fit(self.X, self.y_as_string)
    self.string_classifier_predict_fn = InMemoryModel(self.string_classifier.predict_proba,
                                                      examples=self.X,
                                                      probability=True)
def test_partial_dependence_binary_classification(self):
    # In sklearn's default PDP implementation, an approximation is applied when
    # the number of unique values in a feature space is smaller than the specified
    # grid_resolution. For now, we have decided not to apply that approximation.
    # In V2 we will be benchmarking for performance as well, and will revisit this then.
    # Reference: https://github.com/scikit-learn/scikit-learn/blob/4d9a12d175a38f2bcb720389ad2213f71a3d7697/sklearn/ensemble/tests/test_partial_dependence.py
    # TODO: check on the feature space approximation (V2)

    # Test partial dependence for a classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(self.sample_x, self.sample_y)
    classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=self.sample_x)
    interpreter = Interpretation()
    interpreter.load_data(np.array(self.sample_x), self.sample_feature_name)
    pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                               classifier_predict_fn,
                                                               grid_resolution=5,
                                                               sample=True)
    self.assertEqual(pdp_df.shape[0], len(np.unique(interpreter.data_set['0'])))

    # Now with a user-defined grid
    ud_grid = np.unique(self.sample_x[:, 0])  # input: array([-2, -1, 1, 2])
    # The returned grid should have only the 4 values specified by the user.
    pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                               classifier_predict_fn,
                                                               grid=ud_grid,
                                                               sample=True)
    self.assertEqual(pdp_df.shape[0], 4)
def init_skater(self, target_names=None):
    """
    Initialize Skater. Sets up the Skater interpreter and the in-memory model.

    :return: void (sets the skater_interpreter and skater_model attributes)
    """
    from skater.core.explanations import Interpretation
    from skater.model import InMemoryModel

    if not self.skater_interpreter or not self.skater_model:
        log.info(
            "Initializing Skater - generating new in-memory model."
            " This operation may be time-consuming, so please be patient.")
        self.skater_interpreter = Interpretation(
            training_data=self.X_train_ohe,
            training_labels=self.y_train,
            feature_names=self.features_ohe)
        self.skater_model = InMemoryModel(
            self.model[1].predict_proba,
            examples=self.X_test_ohe,
            target_names=target_names,
            unique_values=self.y_train.unique())
    else:
        log.info("Skater is already initialized.")
def partialDependence(self, feature_names, tick_labels_list=None):
    '''
    Calculates the partial dependence of one or several features on the
    predicted output label.

    feature_names should be a list containing at least one string value,
    each of which is a feature name from the data.
    tick_labels_list should be a list of strings that replace the default
    ticks on the plot.

    Outputs a line plot. The x-axis shows the values of the corresponding
    feature; the y-axis shows the magnitude of the partial dependence.
    '''
    mem_model = InMemoryModel(self.model.predict_proba,
                              examples=self.train_x,
                              target_names=['Probability of no recidivism',
                                            'Probability of recidivism'])
    axes_list = self.interpreter.partial_dependence.plot_partial_dependence(
        feature_names, mem_model, grid_resolution=25, with_variance=True,
        figsize=(8, 4), progressbar=False)
    ax = axes_list[0][1]
    title = 'Dependence between ' + feature_names[0] + ' and predicted label'
    if tick_labels_list is not None:
        ax.set_xticklabels(tick_labels_list)
    ax.set_title(title)
    ax.set_ylim(0, 1)
    ax.plot()
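# A minimal, self-contained sketch of the Skater calls that partialDependence()
# above wraps, using a toy sklearn model. The dataset, feature names, and target
# names here are illustrative assumptions, not part of the original class.
import numpy as np
from sklearn.linear_model import LogisticRegression
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

X = np.random.rand(200, 2)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
clf = LogisticRegression().fit(X, y)

interpreter = Interpretation(X, feature_names=['f0', 'f1'])
mem_model = InMemoryModel(clf.predict_proba, examples=X,
                          target_names=['class 0', 'class 1'])
# Same plotting call as in the method above, on the toy model.
axes_list = interpreter.partial_dependence.plot_partial_dependence(
    ['f0'], mem_model, grid_resolution=10, with_variance=True,
    figsize=(8, 4), progressbar=False)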
def get_permuted_feature_scores(model, data):
    """Compute permuted feature importances, using Skater."""
    interpreter = Interpretation(data.testX, feature_names=data.feature_names)
    pyint_model = InMemoryModel(model.predict, examples=data.testX)
    feature_scores = list(
        interpreter.feature_importance.feature_importance(
            pyint_model, ascending=False, progressbar=False).items())
    return feature_scores
def analyze(model_prediction, X_train, render=False):
    skater_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)
    result = interpreter.feature_importance.feature_importance(skater_model,
                                                               ascending=False)
    if render:
        return render_feature_importance(result)
    else:
        return result
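# A minimal usage sketch for the feature-importance analyze() above. The
# RandomForestRegressor and the toy DataFrame are illustrative assumptions,
# not part of the original module.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

X_train = pd.DataFrame(np.random.rand(100, 3), columns=['f0', 'f1', 'f2'])
y_train = X_train['f0'] * 2 + X_train['f1']
rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)

# Returns Skater's permutation feature importances (a pandas object,
# highest first because ascending=False).
importances = analyze(rf.predict, X_train)
print(importances)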
def __init__(self, random_forest_model, x_train, y_train):
    self.rf_model = random_forest_model
    self.x_train = x_train
    self.y_train = y_train
    self.columns = list(x_train.columns)
    self.explainer = LimeTabularExplainer(x_train.values, feature_names=self.columns)
    self.model = InMemoryModel(self.rf_model.predict_proba, examples=self.x_train)
    self.interpreter = Interpretation(training_data=self.x_train,
                                      feature_names=self.columns,
                                      training_labels=self.y_train)
def analyze(features, model_prediction, X_train, resolution=20, render=False):
    skater_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)
    result = interpreter.partial_dependence.partial_dependence(
        features, skater_model, grid_resolution=resolution)
    result.rename(columns={'predicted_1': 'Prediction'}, inplace=True)
    if render:
        return render_partial_dependence(result, features)
    else:
        return result
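# A hypothetical call to the partial-dependence analyze() above, using a toy
# binary classifier. The data, feature names, and the assumption that the
# 'predicted_1' column it renames is the class-1 probability from predict_proba
# are all for illustration only.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

X_train = pd.DataFrame(np.random.rand(200, 2), columns=['f0', 'f1'])
y_train = (X_train['f0'] > 0.5).astype(int)
clf = LogisticRegression().fit(X_train, y_train)

pdp_df = analyze(['f0'], clf.predict_proba, X_train, resolution=10)
print(pdp_df.head())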
def test_issues_161_and_189(self):
    """Ensure DataManager(data).data == data."""
    X, y = load_breast_cancer(return_X_y=True)
    X, y = X[15:40], y[15:40]
    model = KNeighborsClassifier(weights='distance', p=2, n_neighbors=10).fit(X, y)
    skater_model = InMemoryModel(model.predict_proba, examples=X, probability=True)
    assert skater_model.probability is True
    assert skater_model.model_type == StaticTypes.model_types.classifier
def plot_partial_dependence_skater(estimator, X_train, feature_names):
    # Initialize names and interpreter class (which serves as a 'data manager')
    interpreter = Interpretation()
    interpreter.load_data(X_train, feature_names=feature_names)
    model = InMemoryModel(estimator.predict_proba, examples=X_train)

    # Plot partial dependence plots
    pdplots = interpreter.partial_dependence.plot_partial_dependence(
        feature_names, model, n_samples=100, n_jobs=3, grid_resolution=50,
        figsize=(10, 15))
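# A minimal sketch of how plot_partial_dependence_skater() above might be
# invoked, assuming a fitted classifier exposing predict_proba. The toy data
# and feature names are illustrative assumptions.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

X_train = np.random.rand(150, 3)
y_train = (X_train[:, 0] > 0.5).astype(int)
est = GradientBoostingClassifier(n_estimators=10).fit(X_train, y_train)

plot_partial_dependence_skater(est, X_train, ['f0', 'f1', 'f2'])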
def explain(skater_exp: Explanation, training_df, test_df, explanation_target, prefix_target):
    job = skater_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]

    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    interpreter = Interpretation(training_df, feature_names=features)

    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values

    model_inst = InMemoryModel(model.predict,
                               examples=X_train,
                               model_type=model._estimator_type,
                               unique_values=[1, 2],
                               feature_names=features,
                               target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)
    surrogate_explainer.fit(X_train, Y_train, use_oracle=True, prune='post',
                            scorer_type='default')
    surrogate_explainer.class_names = features

    viz = dtreeviz(surrogate_explainer.estimator_, X_train, Y_train,
                   target_name='label',
                   feature_names=features,
                   orientation="TD",
                   class_names=list(surrogate_explainer.class_names),
                   fancy=True,
                   X=None,
                   label_fontsize=12,
                   ticks_fontsize=8,
                   fontname="Arial")
    name = create_unique_name("skater_plot.svg")
    viz.save(name)

    if os.path.getsize(name) > 15000000:
        return 'The file size is too big'
    with open(name, "r") as f:
        response = f.read()
    os.remove(name)
    if os.path.isfile(name.split('.svg')[0]):
        os.remove(name.split('.svg')[0])
    return response
def test_compute_default_scores(self):
    # For classification, the default scorer is the weighted F1-score
    model_inst = InMemoryModel(self.classifier_est.predict,
                               examples=self.X_train,
                               model_type='classifier',
                               unique_values=[0, 1, 2])
    scorer = model_inst.scorers.get_scorer_function(scorer_type='default')
    self.assertEqual(scorer.name, 'f1-score')
    scorer = model_inst.scorers.get_scorer_function(scorer_type='f1')
    self.assertEqual(scorer.name, 'f1-score')
    y_hat = self.classifier_est.predict(self.X_test)
    value = scorer(self.y_test, y_hat, average='weighted')
    self.assertGreater(value, 0)
def test_compute_log_loss(self):
    model_inst = InMemoryModel(self.classifier_est.predict_proba,
                               examples=self.X_train,
                               probability=True,
                               model_type='classifier')
    scorer = model_inst.scorers.get_scorer_function(scorer_type='default')
    self.assertEqual(scorer.name, 'cross-entropy')
    scorer = model_inst.scorers.get_scorer_function(scorer_type='cross_entropy')
    self.assertEqual(scorer.name, 'cross-entropy')
    y_hat = self.classifier_est.predict_proba(self.X_test)
    value = scorer(self.y_test, y_hat)
    self.assertGreater(value, 0)
def analyze(model_prediction, X_train, y_train):
    skater_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)
    surrogate_explainer = interpreter.tree_surrogate(skater_model, seed=5)
    surrogate_explainer.fit(X_train, y_train, use_oracle=True, prune='post',
                            scorer_type='default')
    surrogate_explainer.plot_global_decisions(
        colors=['coral', 'lightsteelblue', 'darkkhaki'],
        file_name='simple_tree_pre.png')
    return Image(filename='simple_tree_pre.png')
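# A hypothetical usage of the tree-surrogate analyze() above, assuming the
# module's own imports (skater, IPython's Image) are in place. The fitted
# classifier and toy data below are assumptions for illustration.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X_train = pd.DataFrame(np.random.rand(300, 4), columns=['f0', 'f1', 'f2', 'f3'])
y_train = (X_train['f0'] + X_train['f1'] > 1).astype(int)
rf = RandomForestClassifier(n_estimators=20).fit(X_train, y_train)

# Fits a single decision tree that mimics the random forest, then renders it.
img = analyze(rf.predict_proba, X_train, y_train)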
def understanding_interaction():
    pyint_model = InMemoryModel(estimator.predict_proba, examples=X_test,
                                target_names=features)
    # ['worst area', 'mean perimeter'] --> list(feature_selection.value)
    # Two-way interaction
    interpreter.partial_dependence.plot_partial_dependence(
        ["mass_tag_tag_max_mass", "maxDeltaEta_jet_jet"], pyint_model,
        grid_resolution=grid_resolution.value, with_variance=True)

    # Let's understand the interaction via a 2-way interaction plot over the
    # same covariates
    # feature_selection.value --> ('worst area', 'mean perimeter')
    axes_list = interpreter.partial_dependence.plot_partial_dependence(
        ["mass_tag_tag_max_mass", "maxDeltaEta_jet_jet"], pyint_model,
        grid_resolution=grid_resolution.value, with_variance=True)
def test_surrogate_with_cross_entropy(self):
    model_inst = InMemoryModel(self.classifier_est.predict_proba,
                               examples=self.X_train_c,
                               model_type='classifier',
                               feature_names=self.X_c.columns,
                               target_names=self.target_names,
                               log_level=_INFO,
                               probability=True)
    surrogate_explainer = self.interpreter.tree_surrogate(oracle=model_inst, seed=5)
    result = surrogate_explainer.fit(self.X_train_c, self.y_train_c,
                                     use_oracle=True, prune='post',
                                     scorer_type='default')
    self.assertEqual(surrogate_explainer.scorer_name_, 'cross-entropy')
    self.assertNotEqual(result, 0)
def setUpClass(cls):
    # Classification use-case
    cls.X_c, cls.y_c = make_moons(1000, noise=0.5)
    cls.X_c = pd.DataFrame(cls.X_c, columns=['F1', 'F2'])
    cls.target_names = ['class 0', 'class 1']
    cls.X_train_c, cls.X_test_c, cls.y_train_c, cls.y_test_c = train_test_split(
        cls.X_c, cls.y_c)
    cls.classifier_est = DecisionTreeClassifier(max_depth=5, random_state=5)
    cls.classifier_est.fit(cls.X_train_c, cls.y_train_c)
    cls.interpreter = Interpretation(cls.X_train_c, feature_names=cls.X_c.columns)
    cls.model_inst = InMemoryModel(cls.classifier_est.predict,
                                   examples=cls.X_train_c,
                                   model_type='classifier',
                                   unique_values=[0, 1],
                                   feature_names=cls.X_c.columns,
                                   target_names=cls.target_names,
                                   log_level=_INFO)
def general_explanation_using_skater(all_roles_scores, labels_training_set,
                                     labels_test_set, df_train_set, df_test_set,
                                     alpha):
    '''
    Show the weights that most influenced a decision, using the Skater framework.
    ----------------------------------------------------------------
    Params:
    all_roles_scores = list of all the marks present in the test and train sets for each role
    labels_training_set
    labels_test_set
    df_train_set
    df_test_set
    '''
    le = preprocessing.LabelEncoder()
    le.fit(all_roles_scores)
    train_encoded_values = le.transform(labels_training_set)
    test_encoded_values = le.transform(labels_test_set)

    # boost_classifier = XGBClassifier(gamma=gamma, max_depth=maxde, min_child_weight=minchild)
    # boost_classifier.fit(df_train_set, train_encoded_values)
    # predictions = boost_classifier.predict(df_test_set)
    # predictions = predictions.astype('int')
    model_ordinal = LogisticAT(alpha=alpha)
    model_ordinal.fit(df_train_set.values, train_encoded_values)
    predictions = model_ordinal.predict(df_test_set)

    interpreter = Interpretation(df_train_set, feature_names=list(df_train_set.columns))
    model = InMemoryModel(model_ordinal.predict_proba, examples=df_train_set[:10])
    plots = interpreter.feature_importance.feature_importance(model, ascending=True)
    # fig, ax = plt.subplots(figsize=(5, 35))
    # plots = interpreter.feature_importance.plot_feature_importance(model, ascending=True, ax=ax)
    return plots
def handle(self, *args, **kwargs):
    # Get the model
    TARGET_MODEL = 71
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    model = joblib.load(job.predictive_model.model_path)[0]

    # Load the data
    training_df, test_df = get_encoded_logs(job)

    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    interpreter = Interpretation(training_df, feature_names=features)

    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values

    model_inst = InMemoryModel(model.predict,
                               examples=X_train,
                               model_type='classifier',
                               unique_values=[1, 2],
                               feature_names=features,
                               target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)
    surrogate_explainer.fit(X_train, Y_train, use_oracle=True, prune='post',
                            scorer_type='default')
    surrogate_explainer.class_names = features

    viz = dtreeviz(surrogate_explainer.estimator_, X_train, Y_train,
                   target_name='label',
                   feature_names=features,
                   orientation="TD",
                   class_names=list(surrogate_explainer.class_names),
                   fancy=True,
                   X=None,
                   label_fontsize=12,
                   ticks_fontsize=8,
                   fontname="Arial")
    viz.save("skater_plot_train_2_2.svg")
def _create_skater_stuff(mdl, test_x, test_z):
    from skater.model import InMemoryModel
    from skater.core.explanations import Interpretation
    from hassbrain_algorithm.benchmark.interpretation import ModelWrapper
    from hassbrain_algorithm.benchmark.interpretation import _boolean2str

    wrapped_model = ModelWrapper(mdl)
    class_names = mdl.get_state_lbl_lst()
    feature_names = mdl.get_obs_lbl_lst()

    # This has to be done in order for Skater to recognize the values as
    # categorical and not numerical.
    test_x = _boolean2str(test_x)

    # Create the interpretation
    interpreter = Interpretation(test_x,
                                 #class_names=class_names,
                                 feature_names=feature_names)

    # Create the model; supports classifiers with or without probability scores
    examples = test_x[:10]
    skater_model = InMemoryModel(wrapped_model.predict,
                                 #target_names=class_names,
                                 feature_names=feature_names,
                                 model_type='classifier',
                                 unique_values=class_names,
                                 probability=False,
                                 examples=examples)

    interpreter.load_data(test_x,
                          training_labels=test_z,
                          feature_names=feature_names)

    # TODO: flag for deletion (3 lines below) if this can safely be deleted
    tmp = interpreter.data_set.feature_info
    for key, val in tmp.items():
        val['numeric'] = False
    return skater_model, interpreter
def part_dep_plot(features):
    for feature in features:
        interpreter = Interpretation()
        interpreter.load_data(import_quest_demos, feature_names=[feature])
        model = InMemoryModel(rf_final.predict_proba, examples=import_quest_demos)
        pdplots = interpreter.partial_dependence.plot_partial_dependence(
            [feature], model, n_samples=100, n_jobs=-1, grid_resolution=50,
            figsize=(15, 15))
        name = "images/pdp_" + feature + ".png"
        plt.title("Partial Dependency Plot of Question " + feature, fontsize=20)
        plt.ylabel("Average Predicted Probability of Attrition by Question Value (*0.1)",
                   fontsize=15)
        plt.xlabel("Question " + feature + " Response Value", fontsize=15)
        plt.savefig(name)
        plt.close()
def __init__(self, predictive_model, feature_labels=None, dataset=None):
    if isinstance(predictive_model, MatPipe):
        self.predictive_model = predictive_model
        self.feature_labels = predictive_model.learner.features
        self.target = predictive_model.learner.fitted_target
        self.dataset = predictive_model.post_fit_df.drop([self.target], axis=1)

    if feature_labels is not None:
        self.feature_labels = feature_labels
    if dataset is not None:
        self.dataset = dataset

    self.interpreter = Interpretation(self.dataset, feature_names=self.feature_labels)

    def predict_func(x):
        prediction = self.predictive_model.learner.predict(x, self.target)
        return prediction[self.target + " predicted"].values

    self.model = InMemoryModel(predict_func, examples=self.dataset)
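# Hypothetical construction of the wrapper above. MatPipe comes from
# automatminer; the pipeline file name and the wrapper class name
# (ModelInterpreter) are assumptions for illustration only.
from automatminer import MatPipe

pipe = MatPipe.load("my_fitted_pipeline.p")  # assumed: a previously fitted pipe
interp = ModelInterpreter(pipe)              # wrapper class name assumed
print(interp.feature_labels[:5])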
def featureImportance(self):
    '''
    Calculates the importance of each feature in the training set and the
    effect of that feature's value on the resulting output.

    Outputs a horizontal bar chart. The x-axis shows the feature importance;
    the y-axis displays the name of the corresponding feature.
    '''
    # Load the model into Skater's memory
    mem_model = InMemoryModel(self.model.predict_proba, examples=self.train_x)

    # Generate feature importance plots
    f = plt.figure(figsize=(10, 3))
    plots = self.interpreter.feature_importance.plot_feature_importance(
        mem_model, ascending=True, progressbar=False)
    figure = plots[0]
    figure.set_size_inches(15, 8)
    ax = plots[1]
    ax.set_title("Feature importance of Random Forest model, trained on COMPAS dataset")
    ax.plot()
        loss=loss, batch_size=256, epochs=35, verbose=1))
}

## Applying Model Agnostic Interpretation to Ensemble Models
# source:
# - https://github.com/datascienceinc/Skater/blob/master/examples/ensemble_model.ipynb
interpreter = Interpretation(X_test, feature_names=features)
estimator = binary_pipe['kerasclassifier']
estimator.fit(X_train, y_train)
model = InMemoryModel(estimator.predict_proba, examples=X_train)

# Model-agnostic variable importance for global interpretation
plots = interpreter.feature_importance.plot_feature_importance(model, ascending=True)

# Use partial dependence to understand the relationship between a variable
# and the model's predictions
model = InMemoryModel(estimator.predict_proba,
                      examples=X_test,
                      #unique_values=model.classes_
                      target_names=list(set(y_train)))
# Let's understand interaction using a 2-way interaction over the same covariates
# feature_selection
# Partial dependence plots for global interpretation: a visualization technique
# that can be used to understand and estimate the dependence
print(classification_report(test_target, prediction))
for model, title in zip(models, titles):
    clf = model.fit(train_data, train_target)
    prediction = clf.predict(test_data)
    print(f"{title}")
    print(classification_report(test_target, prediction))
    print(f"Confusion Matrix: \n {confusion_matrix(test_target, prediction)}")
    # ax = axs[modelno - 1, fold - 1]
    interpreter = Interpretation(test_data, feature_names=featureNames[1:9])
    # model_no_proba = InMemoryModel(model.predict, examples=test_data, unique_values=model.classes_)
    pyint_model = InMemoryModel(model.predict_proba,
                                examples=test_data,
                                target_names=["CYT", "ME3", "MIT", "NUC"])
    # interpreter.feature_importance.plot_feature_importance(pyint_model, ascending=False, ax=ax,
    #                                                        progressbar=False)
    # ax.set_title(f"{title} on fold {fold}")
    # print("\n")

    ## To avoid clutter, I only produce plots for gradient boosting and one fold
    if fold == 2 and modelno == 5:
        # Plot PDPs of the variable "alm", since it is the most important feature
        # for 3 of the 4 models.
        ## "alm" is not the most important feature for Gaussian Naive Bayes though; explain that.
        # For other variables just change the name;
        # for other models just change the number.
        # interpreter.partial_dependence.plot_partial_dependence(["alm"],
        #     pyint_model, grid_resolution=30, with_variance=True)
        pass
def fail_func():
    self.interpreter.partial_dependence.partial_dependence(
        self.features[:1],
        InMemoryModel(self.string_classifier.predict, examples=self.X),
        grid_resolution=10)
def run_explanations(csv_path, csv_columns, target_column, zero_value):
    # Read the dataset from the provided CSV and print out information about it.
    df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True, skiprows=1)
    #df = df.drop('Target', axis=1)
    input_features = [name for name in csv_columns if name != target_column]
    #data, labels = shap.datasets.adult(display=True)

    if target_column not in csv_columns:
        print("target column error")
        return ("target column error")
    elif zero_value not in df[target_column].tolist():
        if str.isdecimal(zero_value) and (
                np.int64(zero_value) in df[target_column].tolist()
                or np.float64(zero_value) in df[target_column].tolist()):
            print("happy")
            zero_value = np.int64(zero_value)
        else:
            print(zero_value, df[target_column].tolist(), df[target_column].dtype)
            return ("zero value error")

    labels = df[target_column].tolist()
    #labels = np.array([int(label) for label in labels])
    labels2 = []
    for label in labels:
        if label == zero_value:
            labels2.append(0)
        else:
            labels2.append(1)
    labels = np.array(labels2)

    data = df[input_features]
    for feature in input_features:
        if data[feature].dtype is not np.dtype(np.int64) \
                and data[feature].dtype is not np.dtype(np.float64) \
                and data[feature].dtype is not np.dtype(np.float32):
            data[feature] = data[feature].astype('category')
    cat_cols = data.select_dtypes(['category']).columns
    data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                        test_size=0.3,
                                                        random_state=42)
    data_disp, labels_disp = shap.datasets.adult(display=True)
    X_train_disp, X_test_disp, y_train_disp, y_test_disp = train_test_split(
        data_disp, labels_disp, test_size=0.3, random_state=42)

    xgc = xgb.XGBClassifier(n_estimators=500, max_depth=5, base_score=0.5,
                            objective='binary:logistic', random_state=42)
    xgc.fit(X_train, y_train)
    predictions = xgc.predict(X_test)

    fig = plt.figure(figsize=(16, 12))
    title = fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)

    ax1 = fig.add_subplot(2, 2, 1)
    xgb.plot_importance(xgc, importance_type='weight', ax=ax1)
    t = ax1.set_title("Feature Importance - Feature Weight")

    ax2 = fig.add_subplot(2, 2, 2)
    xgb.plot_importance(xgc, importance_type='gain', ax=ax2)
    t = ax2.set_title("Feature Importance - Split Mean Gain")

    ax3 = fig.add_subplot(2, 2, 3)
    xgb.plot_importance(xgc, importance_type='cover', ax=ax3)
    t = ax3.set_title("Feature Importance - Sample Coverage")
    #plt.savefig('static/explanations.png')

    explanation = eli5.explain_weights(xgc.get_booster())
    explanation_html = eli5.formatters.html.format_as_html(explanation)
    print(explanation_html)
    with open("templates/explanation.html", "a+") as file:
        file.write(explanation_html)

    doc_num = 0
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    #eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num],
    #                     feature_names=list(data.columns), show_feature_values=True)
    explanation2 = eli5.explain_prediction(xgc.get_booster(), X_test.iloc[doc_num],
                                           feature_names=list(data.columns))
    explanation_html2 = eli5.formatters.html.format_as_html(explanation2)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html2)

    doc_num = 2
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    #eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num],
    #                     feature_names=list(data.columns), show_feature_values=True)
    explanation3 = eli5.explain_prediction(xgc.get_booster(), X_test.iloc[doc_num],
                                           feature_names=list(data.columns))
    explanation_html3 = eli5.formatters.html.format_as_html(explanation3)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html3)

    #target_names = ['$50K or less', 'More than $50K']
    interpreter = Interpretation(training_data=X_test, training_labels=y_test,
                                 feature_names=list(data.columns))
    im_model = InMemoryModel(xgc.predict_proba, examples=X_train)
    plots = interpreter.feature_importance.plot_feature_importance(
        im_model, ascending=True, n_samples=23000)
    plots[0].savefig('skater.png')

    features_pdp = input_features
    xgc_np = xgb.XGBClassifier(n_estimators=500, max_depth=5, base_score=0.5,
                               objective='binary:logistic', random_state=42)
    xgc_np.fit(X_train.values, y_train)

    # In[ ]:
    from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer
    exp = LimeTabularExplainer(X_test.values,
                               feature_names=list(data.columns),
                               discretize_continuous=True)

    doc_num = 0
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    instance = exp.explain_instance(X_test.iloc[doc_num].values, xgc_np.predict_proba)
    instance.save_to_file('templates/lime.html', show_all=False)

    doc_num = 2
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    instance2 = exp.explain_instance(X_test.iloc[doc_num].values, xgc_np.predict_proba)
    instance2.save_to_file('templates/lime2.html', show_all=False)

    explainer = shap.TreeExplainer(xgc)
    shap_values = explainer.shap_values(X_test)
    pd.DataFrame(shap_values).head()
    #shap.force_plot(explainer.expected_value, shap_values[:,], X_test_disp.iloc[:,],
    #                show=False, matplotlib=True)
    #plt.savefig("static/force_plot.png")
    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    plt.savefig("static/summary_plot.png")
    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig("static/summary_plot2.png")
    return "Everyone Happy"
                          predicted_labels=wtp_dnn_predictions,
                          classes=['red', 'white'])

# # Model Interpretation
# ## View Feature importances

# In[14]:

from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

wtp_interpreter = Interpretation(wtp_test_SX, feature_names=wtp_features.columns)
wtp_im_model = InMemoryModel(wtp_lr.predict_proba,
                             examples=wtp_train_SX,
                             target_names=wtp_lr.classes_)
plots = wtp_interpreter.feature_importance.plot_feature_importance(wtp_im_model,
                                                                   ascending=False)

# ## View model ROC curve

# In[15]:

meu.plot_model_roc_curve(wtp_lr, wtp_test_SX, wtp_test_y)

# ## Visualize Model Decision Surface

# In[59]:

feature_indices = [