def test_partial_dependence_multiclass(self):
        """Check that multiclass partial dependence output has the feature column.

        The same request (first iris feature, grid_resolution=25, sample=True)
        is run against a gradient boosting classifier and against an SVC, and
        each resulting frame must contain the formatted feature column.
        """
        # Iris data classes: ['setosa', 'versicolor', 'virginica']
        iris = datasets.load_iris()

        # 1. Gradient boosting classifier
        boosted = GradientBoostingClassifier(n_estimators=10, random_state=1)
        boosted.fit(iris.data, iris.target)
        boosted_model = InMemoryModel(boosted.predict_proba, examples=iris.data)
        boosted_interpreter = Interpretation()
        boosted_interpreter.load_data(iris.data, iris.feature_names)
        boosted_pdp = boosted_interpreter.partial_dependence.partial_dependence(
            [iris.feature_names[0]], boosted_model, grid_resolution=25, sample=True)

        expected_feature_name = PartialDependence.feature_column_name_formatter('sepal length (cm)')

        boosted_columns = boosted_pdp.columns.values
        self.assertIn(expected_feature_name,
                      boosted_columns,
                      "{0} not in columns {1}".format(expected_feature_name, boosted_columns))

        # 2. Support vector classifier
        from sklearn import svm
        # SVC only exposes predict_proba when probability=True (it is off by default)
        support_vectors = svm.SVC(probability=True)
        support_vectors.fit(iris.data, iris.target)
        svc_model = InMemoryModel(support_vectors.predict_proba, examples=iris.data)
        svc_interpreter = Interpretation()
        svc_interpreter.load_data(iris.data, iris.feature_names)
        svc_pdp = svc_interpreter.partial_dependence.partial_dependence(
            [iris.feature_names[0]], svc_model, grid_resolution=25, sample=True)

        svc_columns = svc_pdp.columns.values
        self.assertIn(expected_feature_name,
                      svc_columns,
                      "{} not in columns {}".format(expected_feature_name, svc_columns))
    def setUp(self):
        """Create the data sets, fitted estimators and skater models used by the tests.

        Sizes, seed and the debug flag come from the command-line parser
        returned by create_parser().
        """
        cli_args = create_parser().parse_args()
        self.seed, self.n, self.dim = cli_args.seed, cli_args.n, cli_args.dim
        self.features = list(map(str, range(self.dim)))
        self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array(list(map(str, self.y_as_int)))
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]

        # Second, tiny two-feature data set
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = list(map(str, range(self.sample_x.shape[1])))

        # Only pass log_level when debugging; otherwise the default level ('WARNING') applies
        interpreter_options = {'log_level': 'DEBUG'} if cli_args.debug else {}
        self.interpreter = Interpretation(**interpreter_options)
        self.interpreter.load_data(self.X, feature_names=self.features)

        # Regression on the continuous target
        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

        # Classification on the rounded target, wrapped both as labels and as probabilities
        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(
            self.classifier.predict,
            examples=self.X,
            unique_values=self.classifier.classes_)
        self.classifier_predict_proba_fn = InMemoryModel(
            self.classifier.predict_proba,
            examples=self.X)

        # Classification on string labels
        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(
            self.string_classifier.predict_proba,
            examples=self.X)

        # Third data set: one categorical column and one numeric column
        self.sample_x_categorical = np.array([['B', -1], ['A', -1], ['A', -2], ['C', 1], ['C', 2], ['A', 1]])
        self.sample_y_categorical = np.array(['A', 'A', 'A', 'B', 'B', 'B'])
        self.categorical_feature_names = ['Letters', 'Numbers']
        self.categorical_transformer = MultiColumnLabelBinarizer()
        self.categorical_transformer.fit(self.sample_x_categorical)
        self.sample_x_categorical_transormed = self.categorical_transformer.transform(
            self.sample_x_categorical)
        self.categorical_classifier = LogisticRegression()
        self.categorical_classifier.fit(self.sample_x_categorical_transormed,
                                        self.sample_y_categorical)
        # The prediction function transforms raw rows before asking for probabilities
        self.categorical_predict_fn = lambda x: self.categorical_classifier.predict_proba(self.categorical_transformer.transform(x))
        self.categorical_model = InMemoryModel(self.categorical_predict_fn,
                                               examples=self.sample_x_categorical)
    def setUp(self):
        """Create the data sets, fitted estimators and skater models used by the tests.

        Sizes, seed and the debug flag come from the command-line parser
        returned by create_parser().
        """
        cli_args = create_parser().parse_args()
        self.seed, self.n, self.dim = cli_args.seed, cli_args.n, cli_args.dim
        self.features = list(map(str, range(self.dim)))
        self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array(list(map(str, self.y_as_int)))
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]

        # Second, tiny two-feature data set
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = list(map(str, range(self.sample_x.shape[1])))

        # Only pass log_level when debugging; otherwise the default level ('WARNING') applies
        interpreter_options = {'log_level': 'DEBUG'} if cli_args.debug else {}
        self.interpreter = Interpretation(training_data=self.X,
                                          feature_names=self.features,
                                          **interpreter_options)

        # Regression on the continuous target
        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

        # Classification on the rounded target, wrapped both as labels and as probabilities
        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(
            self.classifier.predict,
            examples=self.X,
            unique_values=self.classifier.classes_,
            probability=False)
        self.classifier_predict_proba_fn = InMemoryModel(
            self.classifier.predict_proba,
            examples=self.X,
            probability=True)

        # Classification on string labels
        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(
            self.string_classifier.predict_proba,
            examples=self.X,
            probability=True)
    def test_partial_dependence_binary_classification(self):
        """Check the number of grid rows returned for a binary classifier.

        First with grid_resolution=5 (expects one row per unique value of
        feature '0'), then with a user-supplied grid of the 4 unique values.
        """
        # In the default implementation of pdp on sklearn, there is an approx. done
        # if the number of unique values for a feature space < grid_resolution specified.
        # For now, we have decided to not have that approximation. In V2, we will be benchmarking for
        # performance as well. Around that time we will revisit the same.
        # Reference: https://github.com/scikit-learn/scikit-learn/blob/4d9a12d175a38f2bcb720389ad2213f71a3d7697/sklearn/ensemble/tests/test_partial_dependence.py
        # TODO: check on the feature space approximation (V2)
        # Test partial dependence for classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(self.sample_x, self.sample_y)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=self.sample_x)
        interpreter = Interpretation()
        interpreter.load_data(np.array(self.sample_x), self.sample_feature_name)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                                   classifier_predict_fn,
                                                                   grid_resolution=5,
                                                                   sample=True)

        # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertEqual(pdp_df.shape[0], len(np.unique(interpreter.data_set['0'])))

        # now with our own grid
        ud_grid = np.unique(self.sample_x[:, 0])
        # input: array([-2, -1,  1,  2])
        # the returned grid should have only 4 values as specified by the user
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'], classifier_predict_fn,
                                                                   grid=ud_grid, sample=True)
        self.assertEqual(pdp_df.shape[0], 4)
Beispiel #5
0
    def init_skater(self, target_names=None):
        """
        Build the skater interpreter and in-memory model unless both already exist.

        :param target_names: forwarded to InMemoryModel as ``target_names``
        :return: void (sets ``skater_interpreter`` and ``skater_model`` on the instance)
        """
        from skater.core.explanations import Interpretation
        from skater.model import InMemoryModel

        # Nothing to do when both objects are already in place.
        if self.skater_interpreter and self.skater_model:
            log.info("Skater is already initialized.")
            return

        log.info(
            "Initializing Skater - generating new in-memory model."
            " This operation may be time-consuming so please be patient.")

        # Interpreter is built from the one-hot encoded training data ...
        self.skater_interpreter = Interpretation(
            training_data=self.X_train_ohe,
            training_labels=self.y_train,
            feature_names=self.features_ohe)

        # ... while the model wrapper uses the encoded test data as examples.
        self.skater_model = InMemoryModel(
            self.model[1].predict_proba,
            examples=self.X_test_ohe,
            target_names=target_names,
            unique_values=self.y_train.unique())
    def partialDependence(self, feature_names, tick_labels_list = None):
        '''
        Plots the partial dependence of one or several features
        with the output label prediction.

        feature_names should be a list containing at least one string value
        which is a feature name from the data; the first entry is used in the
        plot title.
        tick_labels_list, when given, should be a list of strings that replace
        the default x ticks on the plot.

        Outputs line plot.
        X-axis show the values of the corresponding feature.
        Y-axis show the magnitude of partial dependence (limited to [0, 1]).
        '''

        mem_model = InMemoryModel(self.model.predict_proba, examples = self.train_x,
                         target_names=['Probability of no recidive', 'Probability of recidive'])

        axes_list = self.interpreter.partial_dependence.plot_partial_dependence(feature_names, mem_model, grid_resolution=25,
                                                                   with_variance=True, figsize = (8, 4), progressbar=False)
        # Second element of the first entry is used as the axes object.
        ax = axes_list[0][1]

        title = 'Dependence between ' + feature_names[0] + ' and predicted label'

        # Identity check: `!= None` is evaluated element-wise for array-like
        # labels and would make this condition ambiguous.
        if tick_labels_list is not None:
            ax.set_xticklabels(tick_labels_list)

        ax.set_title(title)
        ax.set_ylim(0, 1)
        ax.plot()
Beispiel #7
0
def get_permuted_feature_scores(model, data):
    """Compute permuted feature importances using skater.

    Uses ``data.testX`` both as interpreter data and as model examples and
    returns the importances as a list of ``(key, value)`` pairs.
    """
    interpreter = Interpretation(data.testX, feature_names=data.feature_names)
    skater_model = InMemoryModel(model.predict, examples=data.testX)
    importances = interpreter.feature_importance.feature_importance(
        skater_model, ascending=False, progressbar=False)
    return list(importances.items())
Beispiel #8
0
def analyze(model_prediction, X_train, render=False):
    """Compute skater feature importance for a prediction function.

    Returns the raw importance result, or the output of
    ``render_feature_importance`` when ``render`` is truthy.
    """
    wrapped_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)

    importance = interpreter.feature_importance.feature_importance(
        wrapped_model, ascending=False)

    return render_feature_importance(importance) if render else importance
Beispiel #9
0
 def __init__(self, random_forest_model, x_train, y_train):
     """Store the model and training data and build the LIME and skater helpers."""
     feature_columns = list(x_train.columns)

     self.rf_model = random_forest_model
     self.x_train, self.y_train = x_train, y_train
     self.columns = feature_columns

     # LIME explainer works on the raw values of the training frame
     self.explainer = LimeTabularExplainer(
         x_train.values, feature_names=feature_columns)

     # skater model wrapper and interpreter share the same training frame
     self.model = InMemoryModel(
         random_forest_model.predict_proba, examples=x_train)
     self.interpreter = Interpretation(
         training_data=x_train,
         feature_names=feature_columns,
         training_labels=y_train)
Beispiel #10
0
def analyze(features, model_prediction, X_train, resolution=20, render=False):
    """Compute skater partial dependence for ``features``.

    The ``predicted_1`` column of the result is renamed to ``Prediction``.
    Returns the result frame, or the output of ``render_partial_dependence``
    when ``render`` is truthy.
    """
    wrapped_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)

    pdp_frame = interpreter.partial_dependence.partial_dependence(
        features, wrapped_model, grid_resolution=resolution)
    pdp_frame.rename(columns={'predicted_1': 'Prediction'}, inplace=True)

    return render_partial_dependence(pdp_frame, features) if render else pdp_frame
Beispiel #11
0
 def test_issues_161_and_189(self):
     """
     Ensure an InMemoryModel built from predict_proba with probability=True
     reports itself as a probabilistic classifier.
     """
     # return_X_y is passed by keyword: newer scikit-learn releases reject
     # it as a positional argument.
     X, y = load_breast_cancer(return_X_y=True)
     # A small slice keeps the test fast.
     X, y = X[15:40], y[15:40]
     model = KNeighborsClassifier(weights='distance', p=2,
                                  n_neighbors=10).fit(X, y)
     skater_model = InMemoryModel(model.predict_proba,
                                  examples=X,
                                  probability=True)
     assert skater_model.probability is True
     assert skater_model.model_type == StaticTypes.model_types.classifier
Beispiel #12
0
def plot_partial_dependence_skater(estimator, X_train, feature_names):
    """Draw skater partial dependence plots for every name in ``feature_names``."""
    # The interpreter acts as the data manager for the training set
    data_interpreter = Interpretation()
    data_interpreter.load_data(X_train, feature_names=feature_names)
    proba_model = InMemoryModel(estimator.predict_proba, examples=X_train)

    plot_options = dict(n_samples=100, n_jobs=3, grid_resolution=50, figsize=(10, 15))
    data_interpreter.partial_dependence.plot_partial_dependence(
        feature_names, proba_model, **plot_options)
Beispiel #13
0
def explain(skater_exp: Explanation, training_df, test_df, explanation_target,
            prefix_target):
    """Fit a skater tree surrogate for the job's model and return it as SVG text.

    The model is loaded from ``skater_exp.job.predictive_model.model_path``,
    a surrogate tree is fitted on ``training_df`` (without the ``trace_id``
    and ``label`` columns) and rendered with dtreeviz into a temporary SVG.

    :param skater_exp: explanation object carrying the job
    :param training_df: frame with ``trace_id``, ``label`` and feature columns
    :param test_df: unused here
    :param explanation_target: unused here
    :param prefix_target: unused here
    :return: the SVG file content, or the string 'The file size is too big'
             when the rendered file exceeds 15000000 bytes
    """
    job = skater_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]

    # axis is passed by keyword: pandas 2 no longer accepts it positionally.
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values
    features = list(X_train.columns.values)
    interpreter = Interpretation(training_df, feature_names=features)

    model_inst = InMemoryModel(model.predict,
                               examples=X_train,
                               model_type=model._estimator_type,
                               unique_values=[1, 2],
                               feature_names=features,
                               target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)

    surrogate_explainer.fit(X_train,
                            Y_train,
                            use_oracle=True,
                            prune='post',
                            scorer_type='default')
    surrogate_explainer.class_names = features

    viz = dtreeviz(surrogate_explainer.estimator_,
                   X_train,
                   Y_train,
                   target_name='label',
                   feature_names=features,
                   orientation="TD",
                   class_names=list(surrogate_explainer.class_names),
                   fancy=True,
                   X=None,
                   label_fontsize=12,
                   ticks_fontsize=8,
                   fontname="Arial")
    name = create_unique_name("skater_plot.svg")
    viz.save(name)
    try:
        if os.path.getsize(name) > 15000000:
            return 'The file size is too big'
        # Context manager closes the handle, which the plain open() never did.
        with open(name, "r") as svg_file:
            return svg_file.read()
    finally:
        # Clean up on every path, including the oversized one: the SVG itself
        # and the extension-less sibling (presumably an intermediate file
        # written by viz.save -- confirm).
        for leftover in (name, name.split('.svg')[0]):
            if os.path.isfile(leftover):
                os.remove(leftover)
Beispiel #14
0
    def test_compute_default_scores(self):
        """Check the default and explicit 'f1' scorers for a label-predicting classifier."""
        # For classification default scorer is weighted F1-score
        model_inst = InMemoryModel(self.classifier_est.predict,
                                   examples=self.X_train,
                                   model_type='classifier',
                                   unique_values=[0, 1, 2])
        # Compare the values directly so a failure reports the actual name
        # instead of "False != True".
        scorer = model_inst.scorers.get_scorer_function(scorer_type='default')
        self.assertEqual(scorer.name, 'f1-score')

        scorer = model_inst.scorers.get_scorer_function(scorer_type='f1')
        self.assertEqual(scorer.name, 'f1-score')

        y_hat = self.classifier_est.predict(self.X_test)
        value = scorer(self.y_test, y_hat, average='weighted')
        # assertGreater replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertGreater(value, 0)
Beispiel #15
0
    def test_compute_log_loss(self):
        """Check the default and explicit 'cross_entropy' scorers for a probabilistic classifier."""
        model_inst = InMemoryModel(self.classifier_est.predict_proba,
                                   examples=self.X_train,
                                   probability=True,
                                   model_type='classifier')
        # Compare the values directly so a failure reports the actual name
        # instead of "False != True".
        scorer = model_inst.scorers.get_scorer_function(scorer_type='default')
        self.assertEqual(scorer.name, 'cross-entropy')

        scorer = model_inst.scorers.get_scorer_function(
            scorer_type='cross_entropy')
        self.assertEqual(scorer.name, 'cross-entropy')

        y_hat = self.classifier_est.predict_proba(self.X_test)
        value = scorer(self.y_test, y_hat)
        # assertGreater replaces the deprecated assertEquals alias (removed in Python 3.12)
        self.assertGreater(value, 0)
def analyze(model_prediction, X_train, y_train):
    """Fit a skater tree surrogate and return an image of its decisions.

    The plot is written to 'simple_tree_pre.png' and returned wrapped in
    ``Image``.
    """
    oracle = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)

    tree_explainer = interpreter.tree_surrogate(oracle, seed=5)
    fit_options = dict(use_oracle=True, prune='post', scorer_type='default')
    tree_explainer.fit(X_train, y_train, **fit_options)

    palette = ['coral', 'lightsteelblue', 'darkkhaki']
    tree_explainer.plot_global_decisions(colors=palette,
                                         file_name='simple_tree_pre.png')

    return Image(filename='simple_tree_pre.png')
def understanding_interaction():
    """Plot the two-way partial dependence of one feature pair, twice.

    The first plot uses the module-level ``model``, the second uses a model
    wrapper created here. All inputs (``estimator``, ``X_test``, ``features``,
    ``interpreter``, ``model``, ``grid_resolution``) are read from the
    enclosing scope.
    """
    feature_pair = ("mass_tag_tag_max_mass", "maxDeltaEta_jet_jet")
    pyint_model = InMemoryModel(estimator.predict_proba, examples=X_test, target_names=features)

    # Two-way interaction, using the pre-existing `model`
    # NOTE(review): `model` is not defined in this function -- confirm it is
    # intended rather than `pyint_model`.
    interpreter.partial_dependence.plot_partial_dependence(
        list(feature_pair),
        model,
        grid_resolution=grid_resolution.value,
        with_variance=True)

    # Same two-way interaction on the same covariates, using the wrapper built above
    axes_list = interpreter.partial_dependence.plot_partial_dependence(
        list(feature_pair),
        pyint_model,
        grid_resolution=grid_resolution.value,
        with_variance=True)
Beispiel #18
0
 def test_surrogate_with_cross_entropy(self):
     """Check that a probabilistic oracle makes the surrogate use cross-entropy scoring."""
     model_inst = InMemoryModel(self.classifier_est.predict_proba,
                                examples=self.X_train_c,
                                model_type='classifier',
                                feature_names=self.X_c.columns,
                                target_names=self.target_names,
                                log_level=_INFO,
                                probability=True)
     surrogate_explainer = self.interpreter.tree_surrogate(
         oracle=model_inst, seed=5)
     result = surrogate_explainer.fit(self.X_train_c,
                                      self.y_train_c,
                                      use_oracle=True,
                                      prune='post',
                                      scorer_type='default')
     # The stray third argument (True) was being used as the failure message.
     self.assertEqual(surrogate_explainer.scorer_name_, 'cross-entropy')
     # assertNotEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
     self.assertNotEqual(result, 0)
Beispiel #19
0
 def setUpClass(cls):
     """Create the shared moons data set, fitted tree, interpreter and model wrapper."""
     # Classification use-case
     moon_points, cls.y_c = make_moons(1000, noise=0.5)
     cls.X_c = pd.DataFrame(moon_points, columns=['F1', 'F2'])
     cls.target_names = ['class 0', 'class 1']

     (cls.X_train_c, cls.X_test_c,
      cls.y_train_c, cls.y_test_c) = train_test_split(cls.X_c, cls.y_c)

     tree = DecisionTreeClassifier(max_depth=5, random_state=5)
     cls.classifier_est = tree
     tree.fit(cls.X_train_c, cls.y_train_c)

     column_names = cls.X_c.columns
     cls.interpreter = Interpretation(cls.X_train_c,
                                      feature_names=column_names)
     cls.model_inst = InMemoryModel(tree.predict,
                                    examples=cls.X_train_c,
                                    model_type='classifier',
                                    unique_values=[0, 1],
                                    feature_names=column_names,
                                    target_names=cls.target_names,
                                    log_level=_INFO)
def general_explanation_using_skater(all_roles_scores, labels_training_set,
                                     labels_test_set, df_train_set,
                                     df_test_set, alpha):
    '''
    Compute skater feature importances for an ordinal logistic model (LogisticAT).

    ----------------------------------------------------------------
    Params:
        all_roles_scores = list of all the marks present in test and train set for each role
        labels_training_set = labels used to fit the model (label-encoded here)
        labels_test_set = test labels (label-encoded, result not used further)
        df_train_set = training features
        df_test_set = test features (predicted on, result not used further)
        alpha = passed to LogisticAT

    Returns the result of skater's feature_importance with ascending=True.
    '''
    encoder = preprocessing.LabelEncoder()
    encoder.fit(all_roles_scores)
    encoded_train_labels = encoder.transform(labels_training_set)
    # Result is not used; the call is kept so the test labels still go
    # through the encoder exactly as before.
    encoder.transform(labels_test_set)

    ordinal_model = LogisticAT(alpha=alpha)
    ordinal_model.fit(df_train_set.values, encoded_train_labels)
    # Result is not used; the call is kept to preserve behavior.
    ordinal_model.predict(df_test_set)

    train_columns = list(df_train_set.columns)
    interpreter = Interpretation(df_train_set, feature_names=train_columns)

    # Only the first 10 training rows serve as examples for the model wrapper
    skater_model = InMemoryModel(ordinal_model.predict_proba,
                                 examples=df_train_set[:10])

    return interpreter.feature_importance.feature_importance(skater_model,
                                                             ascending=True)
Beispiel #21
0
    def handle(self, *args, **kwargs):
        """Fit a skater tree surrogate for one stored job and save it as SVG.

        Loads the model of the job with primary key 71, fits a surrogate tree
        on the job's encoded training log (without ``trace_id`` and ``label``)
        and writes the dtreeviz rendering to 'skater_plot_train_2_2.svg'.
        """
        # get model
        TARGET_MODEL = 71
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)[0]
        # load data
        training_df, test_df = get_encoded_logs(job)

        # axis is passed by keyword: pandas 2 no longer accepts it positionally.
        # The frame is dropped once and reused for the feature names.
        X_train = training_df.drop(['trace_id', 'label'], axis=1)
        Y_train = training_df['label'].values
        features = list(X_train.columns.values)
        interpreter = Interpretation(training_df, feature_names=features)

        model_inst = InMemoryModel(model.predict,
                                   examples=X_train,
                                   model_type='classifier',
                                   unique_values=[1, 2],
                                   feature_names=features,
                                   target_names=['label'])
        surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)

        surrogate_explainer.fit(X_train,
                                Y_train,
                                use_oracle=True,
                                prune='post',
                                scorer_type='default')
        surrogate_explainer.class_names = features

        viz = dtreeviz(surrogate_explainer.estimator_,
                       X_train,
                       Y_train,
                       target_name='label',
                       feature_names=features,
                       orientation="TD",
                       class_names=list(surrogate_explainer.class_names),
                       fancy=True,
                       X=None,
                       label_fontsize=12,
                       ticks_fontsize=8,
                       fontname="Arial")
        viz.save("skater_plot_train_2_2.svg")
Beispiel #22
0
def _create_skater_stuff(mdl, test_x, test_z):
    """Build the skater model wrapper and interpreter for ``mdl``.

    Returns a ``(skater_model, interpreter)`` tuple. Every feature in the
    interpreter's data set is flagged as non-numeric.
    """
    from skater.model import InMemoryModel
    from skater.core.explanations import Interpretation
    from hassbrain_algorithm.benchmark.interpretation import ModelWrapper
    from hassbrain_algorithm.benchmark.interpretation import _boolean2str

    wrapped_model = ModelWrapper(mdl)
    class_names = mdl.get_state_lbl_lst()
    feature_names = mdl.get_obs_lbl_lst()

    # Converted so that skater recognizes the values as categorical rather than numerical
    test_x = _boolean2str(test_x)

    # create interpretation
    # (class_names=class_names is intentionally not passed)
    interpreter = Interpretation(test_x, feature_names=feature_names)

    # create model
    # supports classifiers with or without probability scores
    # (target_names=class_names is intentionally not passed)
    skater_model = InMemoryModel(
        wrapped_model.predict,
        feature_names=feature_names,
        model_type='classifier',
        unique_values=class_names,
        probability=False,
        examples=test_x[:10])

    interpreter.load_data(test_x,
                          training_labels=test_z,
                          feature_names=feature_names)

    # todo flag for deletion (loop below)
    #    if this can safely be deleted
    for info in interpreter.data_set.feature_info.values():
        info['numeric'] = False

    return skater_model, interpreter
Beispiel #23
0
def part_dep_plot(features):
    """Save one partial dependence plot per entry of ``features`` under images/.

    Reads ``import_quest_demos`` and ``rf_final`` from the enclosing scope.
    """
    for question in features:
        output_path = "images/pdp_" + question + ".png"

        question_interpreter = Interpretation()
        question_interpreter.load_data(import_quest_demos, feature_names=[question])
        proba_model = InMemoryModel(rf_final.predict_proba,
                                    examples=import_quest_demos)
        question_interpreter.partial_dependence.plot_partial_dependence(
            [question],
            proba_model,
            n_samples=100,
            n_jobs=-1,
            grid_resolution=50,
            figsize=(15, 15))

        # Decorate, write and release the current figure
        plt.title("Partial Dependency Plot of Question " + question,
                  fontsize=20)
        plt.ylabel(
            "Average Predicted Probability of Attrition by Question Value (*0.1)",
            fontsize=15)
        plt.xlabel("Question " + question + " Response Value", fontsize=15)
        plt.savefig(output_path)
        plt.close()
Beispiel #24
0
    def __init__(self, predictive_model, feature_labels=None, dataset=None):
        """Set up a skater interpreter and in-memory model around a predictive model.

        Args:
            predictive_model: when it is a MatPipe, feature labels, target and
                dataset are taken from it.
            feature_labels: optional override for the feature labels.
            dataset: optional override for the dataset.
        """
        # NOTE(review): for a non-MatPipe model, predictive_model and target are
        # never set here, and dataset/feature_labels only when passed explicitly
        # -- confirm whether non-MatPipe input is meant to be supported.
        if isinstance(predictive_model, MatPipe):
            self.predictive_model = predictive_model
            self.feature_labels = predictive_model.learner.features
            self.target = predictive_model.learner.fitted_target
            # Dataset is the post-fit frame without the target column
            self.dataset = predictive_model.post_fit_df.drop([self.target],
                                                             axis=1)
        # Explicit arguments take precedence over the values read from the pipe
        if feature_labels is not None:
            self.feature_labels = feature_labels
        if dataset is not None:
            self.dataset = dataset

        self.interpreter = Interpretation(self.dataset,
                                          feature_names=self.feature_labels)

        def predict_func(x):
            # Return the "<target> predicted" column of the learner's output as an array
            prediction = self.predictive_model.learner.predict(x, self.target)
            return prediction[self.target + " predicted"].values

        self.model = InMemoryModel(
            predict_func, examples=self.dataset
        )
    def featureImportance(self):
        '''
        Plots the importance of each feature in the training set
        for the model's output.
        Outputs a horizontal bar chart.
        X-axis show the feature importance.
        Y-axis display the name of corresponding feature.
        '''

        # Wrap the model for Skater, using the training data as examples
        skater_model = InMemoryModel(self.model.predict_proba, examples=self.train_x)

        # Figure creation kept from the original flow; its handle is not used afterwards
        plt.figure(figsize=(10, 3))

        # Generate the feature importance plot; element 0 is used as the
        # figure and element 1 as the axes
        importance_plot = self.interpreter.feature_importance.plot_feature_importance(
            skater_model, ascending=True, progressbar=False)
        importance_plot[0].set_size_inches(15, 8)

        bar_axes = importance_plot[1]
        bar_axes.set_title("Feature importance of Random Forest model, trained on COMPAS dataset")
        bar_axes.plot()
                                                      loss=loss,
                                                      batch_size=256, 
                                                      epochs=35,
                                                      verbose=1))
}

## Applying Model Agnostic Interpretation to Ensemble Models
# source:
#   - https://github.com/datascienceinc/Skater/blob/master/examples/ensemble_model.ipynb

# Interpreter is built on the test set
interpreter = Interpretation(X_test, feature_names=features)

# Take the 'kerasclassifier' step out of the pipeline and fit it directly
estimator = binary_pipe['kerasclassifier']
estimator.fit(X_train, y_train)

model = InMemoryModel(estimator.predict_proba, 
                      examples=X_train)

# Model-agnostic variable importance for global interpretation
plots = interpreter.feature_importance.plot_feature_importance(model, ascending=True)

# Use partial dependence to understand the relationship between a variable and a model's predictions
# (the model wrapper is rebuilt here with the test set as examples)
model = InMemoryModel(estimator.predict_proba,
                      examples=X_test,
                      #unique_values=model.classes_
                      target_names=list(set(y_train)))

# Let's understand interaction using 2-way interaction using the same covariates
# feature_selection

# Partial dependence plots for global interpretation
# A visualization technique that can be used to understand and estimate the dependence
Beispiel #27
0
        print(classification_report(test_target, prediction))
        for model, title in zip(models, titles):
            clf = model.fit(train_data, train_target)
            prediction = clf.predict(test_data)
            print(f"{title}")
            print(classification_report(test_target, prediction))
            print(
                f"Confusion Matrix: \n {confusion_matrix(test_target, prediction)}"
            )

            # ax = axs[modelno - 1, fold - 1]
            interpreter = Interpretation(test_data,
                                         feature_names=featureNames[1:9])
            # model_no_proba = InMemoryModel(model.predict, examples=test_data, unique_values=model.classes_)
            pyint_model = InMemoryModel(
                model.predict_proba,
                examples=test_data,
                target_names=["CYT", "ME3", "MIT", "NUC"])
            # interpreter.feature_importance.plot_feature_importance(pyint_model, ascending=False, ax=ax,
            #                                                        progressbar=False)
            # ax.set_title(f"{title} on fold {fold}")
            # print("\n")

            ## To avoid clutter I only produce plots for gradient boosting and one fold only
            if (fold == 2 and modelno == 5):
                # Plot PDPs of variable "alm" since it is the most important feature, for 3 of the 4 models
                ## alm not the most important feature for Gaussian Naive bayes tho, explain that
                # for other variables just change the name
                # for other models just change the number
                # interpreter.partial_dependence.plot_partial_dependence(["alm"],
                #                                                        pyint_model, grid_resolution=30,
                #                                                        with_variance=True)
 def fail_func():
     # Wrap the label-returning predict of the string classifier, then
     # request partial dependence for the first feature only.
     label_model = InMemoryModel(self.string_classifier.predict,
                                 examples=self.X)
     first_feature = self.features[:1]
     self.interpreter.partial_dependence.partial_dependence(
         first_feature, label_model, grid_resolution=10)
Beispiel #29
0
def run_explanations(csv_path, csv_columns, target_column, zero_value):
    """Train an XGBoost binary classifier on a CSV and write explanation artifacts.

    The target column is binarised: rows equal to ``zero_value`` become class 0
    and every other row becomes class 1. After a 70/30 train/test split the
    function writes:

    * ``templates/explanation.html`` -- ELI5 global weights plus per-row
      explanations for test rows 0 and 2 (appended to any existing file),
    * ``skater.png`` -- skater feature importance plot,
    * ``templates/lime.html`` / ``templates/lime2.html`` -- LIME explanations
      for test rows 0 and 2,
    * ``static/summary_plot.png`` / ``static/summary_plot2.png`` -- SHAP bar
      and default summary plots.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
            Its first row is skipped and ``csv_columns`` is used as the header.
        csv_columns: Column names to assign to the CSV, in file order.
        target_column: Name of the label column; must be in ``csv_columns``.
        zero_value: Value of the target column that maps to class 0. A string
            of decimal digits is also accepted when its integer value occurs
            in the column.

    Returns:
        ``"target column error"`` if ``target_column`` is not in
        ``csv_columns``; ``"zero value error"`` if ``zero_value`` cannot be
        matched to a value in the target column; ``"Everyone Happy"`` once all
        artifacts have been written.
    """
    # The file's own header row is discarded in favour of csv_columns.
    df = pd.read_csv(csv_path,
                     names=csv_columns,
                     skipinitialspace=True,
                     skiprows=1)
    input_features = [name for name in csv_columns if name != target_column]

    if target_column not in csv_columns:
        print("target column error")
        return "target column error"

    target_values = df[target_column].tolist()
    if zero_value not in target_values:
        # CSV cells are parsed as numbers while callers may pass text such as
        # "0". Only strings are probed with isdecimal(); calling it on a
        # non-string used to raise TypeError instead of reporting the error.
        is_decimal_text = isinstance(zero_value, str) and zero_value.isdecimal()
        if is_decimal_text and int(zero_value) in target_values:
            print("happy")
            zero_value = np.int64(int(zero_value))
        else:
            print(zero_value, target_values, df[target_column].dtype)
            return "zero value error"

    # Binarise the target: zero_value -> 0, everything else -> 1.
    labels = np.array(
        [0 if label == zero_value else 1 for label in target_values])

    # Copy so the encoding below never writes into a slice of ``df``.
    data = df[input_features].copy()

    # Every column that is not int64/float64/float32 is replaced by its
    # integer category codes.
    passthrough_dtypes = [np.int64, np.float64, np.float32]
    for feature in data.select_dtypes(exclude=passthrough_dtypes).columns:
        data[feature] = data[feature].astype('category').cat.codes

    # Imported here (as in the original) so the validation paths above do not
    # depend on scikit-learn being importable.
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42)

    xgb_params = dict(n_estimators=500,
                      max_depth=5,
                      base_score=0.5,
                      objective='binary:logistic',
                      random_state=42)
    xgc = xgb.XGBClassifier(**xgb_params)
    xgc.fit(X_train, y_train)
    predictions = xgc.predict(X_test)
    feature_names = list(data.columns)

    # Built-in XGBoost importances, one panel per importance type.
    # NOTE: this figure is built but not saved anywhere in this function.
    fig = plt.figure(figsize=(16, 12))
    fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)
    importance_panels = (
        ('weight', "Feature Importance - Feature Weight"),
        ('gain', "Feature Importance - Split Mean Gain"),
        ('cover', "Feature Importance - Sample Coverage"),
    )
    for position, (importance_type, panel_title) in enumerate(
            importance_panels, start=1):
        ax = fig.add_subplot(2, 2, position)
        xgb.plot_importance(xgc, importance_type=importance_type, ax=ax)
        ax.set_title(panel_title)

    # ELI5: global weights followed by explanations for test rows 0 and 2,
    # all appended to the same HTML file.
    booster = xgc.get_booster()
    explanation_html = eli5.formatters.html.format_as_html(
        eli5.explain_weights(booster))
    print(explanation_html)

    explained_rows = (0, 2)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html)
        for doc_num in explained_rows:
            print('Actual Label:', y_test[doc_num])
            print('Predicted Label:', predictions[doc_num])
            row_explanation = eli5.explain_prediction(
                booster, X_test.iloc[doc_num], feature_names=feature_names)
            file.write(eli5.formatters.html.format_as_html(row_explanation))

    # skater model-agnostic feature importance.
    interpreter = Interpretation(training_data=X_test,
                                 training_labels=y_test,
                                 feature_names=feature_names)
    im_model = InMemoryModel(xgc.predict_proba, examples=X_train)
    plots = interpreter.feature_importance.plot_feature_importance(
        im_model, ascending=True, n_samples=23000)
    plots[0].savefig('skater.png')

    # LIME is handed plain arrays (X_test.values), so a second classifier is
    # fitted on the array form of the training data for its predict_proba.
    xgc_np = xgb.XGBClassifier(**xgb_params)
    xgc_np.fit(X_train.values, y_train)

    from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer

    exp = LimeTabularExplainer(X_test.values,
                               feature_names=feature_names,
                               discretize_continuous=True)
    lime_outputs = ((0, 'templates/lime.html'), (2, 'templates/lime2.html'))
    for doc_num, html_path in lime_outputs:
        print('Actual Label:', y_test[doc_num])
        print('Predicted Label:', predictions[doc_num])
        instance = exp.explain_instance(X_test.iloc[doc_num].values,
                                        xgc_np.predict_proba)
        instance.save_to_file(html_path, show_all=False)

    # SHAP summary plots. summary_plot draws on the *current* matplotlib
    # figure, so each plot gets a fresh figure (closed after saving) instead
    # of being overlaid on earlier plots.
    explainer = shap.TreeExplainer(xgc)
    shap_values = explainer.shap_values(X_test)
    shap_outputs = (
        ({"plot_type": "bar"}, "static/summary_plot.png"),
        ({}, "static/summary_plot2.png"),
    )
    for plot_kwargs, image_path in shap_outputs:
        summary_fig = plt.figure()
        shap.summary_plot(shap_values, X_test, show=False, **plot_kwargs)
        plt.savefig(image_path)
        plt.close(summary_fig)

    return "Everyone Happy"
Beispiel #30
0
                                      predicted_labels=wtp_dnn_predictions,
                                      classes=['red', 'white'])

# # Model Interpretation

# ## View Feature importances

# In[14]:

from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

# Build a skater Interpretation over wtp_test_SX, labelling the features with
# the column names of wtp_features.
# NOTE(review): wtp_test_SX / wtp_train_SX are defined outside this view --
# presumably the scaled test/train feature matrices; confirm upstream.
wtp_interpreter = Interpretation(wtp_test_SX,
                                 feature_names=wtp_features.columns)
# Wrap the fitted model's predict_proba so skater can query it; the class
# labels come from the estimator itself (wtp_lr.classes_).
wtp_im_model = InMemoryModel(wtp_lr.predict_proba,
                             examples=wtp_train_SX,
                             target_names=wtp_lr.classes_)
# Plot feature importances for the wrapped model (ascending=False).
plots = wtp_interpreter.feature_importance.plot_feature_importance(
    wtp_im_model, ascending=False)

# ## View model ROC curve

# In[15]:

# Plot the ROC curve of wtp_lr on the test features and labels using the
# project helper module imported as `meu`.
meu.plot_model_roc_curve(wtp_lr, wtp_test_SX, wtp_test_y)

# ## Visualize Model Decision Surface

# In[59]:

feature_indices = [