def fit_anchor(dataset, X_train, X_test, y_train, y_test, X2E):

    class_name = dataset['class_name']
    columns = dataset['columns']
    continuous = dataset['continuous']
    possible_outcomes = dataset['possible_outcomes']
    label_encoder = dataset['label_encoder']

    feature_names = list(columns)
    feature_names.remove(class_name)

    categorical_names = dict()
    idx_discrete_features = list()
    for idx, col in enumerate(feature_names):
        if col == class_name or col in continuous:
            continue
        idx_discrete_features.append(idx)
        categorical_names[idx] = label_encoder[col].classes_

    # Create Anchor Explainer
    explainer = anchor_tabular.AnchorTabularExplainer(possible_outcomes,
                                                      feature_names, X2E,
                                                      categorical_names)
    explainer.fit(X_train, y_train, X_test, y_test)

    return explainer
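
For context, a minimal usage sketch (not part of the original function) showing how the explainer returned by fit_anchor could be applied to a single record, mirroring Example #14 below; blackbox and idx_record2explain are assumed names:

# Hedged usage sketch: `blackbox` is a fitted classifier with a scikit-learn style
# predict(), and `idx_record2explain` is an index into X2E (both assumptions).
explainer = fit_anchor(dataset, X_train, X_test, y_train, y_test, X2E)
exp, info = explainer.explain_instance(X2E[idx_record2explain].reshape(1, -1),
                                       blackbox.predict, threshold=0.95)
print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print('Coverage: %.2f' % exp.coverage())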
Example #2
    def load_data(self,
                  gcs_path,
                  target_idx,
                  features_to_use=None,
                  categorical_features=[],
                  feature_names=None,
                  skip_first=False):

        self.__target_idx = target_idx
        self.__features_to_use = features_to_use
        self.__feature_names = feature_names
        self.__skip_first = skip_first

        self.__numeric_features = list(
            set(features_to_use).difference(set(categorical_features)))

        self.__csv_file = file_io.FileIO(gcs_path, mode='r').read()

        if self.model_type == REGRESSION:
            self.__transform_labels = self.__create_transform_func(
                self.__get_label_values())

            self.__dataset = load_csv_dataset(
                data=StringIO(self.__csv_file),
                feature_names=feature_names,
                skip_first=skip_first,
                target_idx=target_idx,
                categorical_features=categorical_features,
                features_to_use=features_to_use,
                discretize=True,
                feature_transformations={target_idx: self.__transform_labels})
        else:
            self.__dataset = load_csv_dataset(
                data=StringIO(self.__csv_file),
                feature_names=feature_names,
                skip_first=skip_first,
                target_idx=target_idx,
                categorical_features=categorical_features,
                features_to_use=features_to_use,
                discretize=True)

        self.__label_map = {
            self.__dataset.class_names[i]: i
            for i in range(len(self.__dataset.class_names))
        }

        self.__explainer = anchor_tabular.AnchorTabularExplainer(
            self.__dataset.class_names, self.__dataset.feature_names,
            self.__dataset.data, self.__dataset.categorical_names,
            self.__dataset.ordinal_features)

        self.__explainer.fit(self.__dataset.train, self.__dataset.labels_train,
                             self.__dataset.validation,
                             self.__dataset.labels_validation)

        self.__value_mapper = self.__get_value_mapper()
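
A hedged sketch (not in the original class) of how the explainer fitted in load_data might be surfaced; the explain method name, row, and predict_fn are assumptions:

    def explain(self, row, predict_fn, threshold=0.95):
        # Assumed helper: explain one row with the explainer fitted above;
        # predict_fn is the black-box prediction function supplied by the caller.
        exp = self.__explainer.explain_instance(row, predict_fn,
                                                threshold=threshold)
        return {'anchor': exp.names(),
                'precision': exp.precision(),
                'coverage': exp.coverage()}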
Example #3
    def __init__(self, model, data, feature_names, class_names=['0', '1'], categorical_features=None):
        self.model = model
        self.data = np.array(data)
        self.feature_names = feature_names
        self.class_names = class_names
        self.categorical_features = categorical_features
        self.explainer = anchor_tabular.AnchorTabularExplainer(
            self.class_names,
            self.feature_names,
            self.data)
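
A hedged sketch of an explain helper for this wrapper; the method, its parameters, and the assumption that the explainer can sample without a separate fit step (or has been fitted elsewhere) are mine, not the original author's:

    def explain(self, row, threshold=0.95):
        # Assumed helper: explain one row with the wrapped model's predict function.
        exp = self.explainer.explain_instance(np.asarray(row), self.model.predict,
                                              threshold=threshold)
        return ' AND '.join(exp.names())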
Example #4
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 20
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)
        model = model[0]

        # load data
        training_df, test_df = get_encoded_logs(job)

        # get random point in evaluation set
        EXPLANATION_TARGET = 1

        # get the actual explanation
        job.encoding.features.remove('label')
        explainer = anchor_tabular.AnchorTabularExplainer(
            class_names=[True, False],
            feature_names=job.encoding.features,
            # data must be (n_samples, n_features): one row per trace
            data=training_df.drop(['trace_id', 'label'], axis=1),
            categorical_names={
                job.encoding.features.index(item): list(range(max(training_df[item])))
                for item in job.encoding.features
            }
        )
        explainer.fit(
            # fit expects per-row labels, not the list of class names
            training_df.drop(['trace_id', 'label'], axis=1).values,
            training_df['label'],
            test_df.drop(['trace_id', 'label'], axis=1).values,
            test_df['label']
        )

        model_fn = lambda x: model.predict(x)

        # show the explanation for one test instance
        idx = 0
        np.random.seed(1)
        test_matrix = test_df.drop(['trace_id', 'label'], axis=1).values
        print('Prediction: ', explainer.class_names[model_fn(test_matrix[idx].reshape(1, -1))[0]])
        exp = explainer.explain_instance(test_matrix[idx], model_fn, threshold=0.95)
        print('Anchor: %s' % (' AND '.join(exp.names())))
        print('Precision: %.2f' % exp.precision())
        print('Coverage: %.2f' % exp.coverage())

        fit_anchor = np.where(np.all(test_matrix[:, exp.features()] == test_matrix[idx][exp.features()], axis=1))[0]
        print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / float(test_matrix.shape[0])))
        # print('Anchor test precision: %.2f' % (
        #     np.mean(model_fn(test_matrix[fit_anchor]) == model_fn(test_matrix[idx].reshape(1, -1)))))

        print('done')
Example #5
def train_network():
    import numpy as np
    np.random.seed(1)
    import tensorflow as tf
    tf.set_random_seed(1)

    import sklearn
    from DNN.keras import pre_processing
    from DNN.Induction.Anchor import anchor_tabular, utils

    datamanager = pre_processing.Datamanager(dataset="adults",
                                             in_mod="normal",
                                             out_mod="normal")
    dataset = datamanager.ret
    print("state0", np.random.get_state()[1][0])
    # Fit the explainer to the dataset.
    explainer = anchor_tabular.AnchorTabularExplainer(
        dataset.class_names, dataset.feature_names, dataset.data_train,
        dataset.categorical_names)

    explainer.fit(dataset.data_train, dataset.train_labels,
                  dataset.data_validation, dataset.validation_labels)
    print("state1", np.random.get_state()[1][0])
    from DNN.keras import network
    #keras.random.seed(1)
    #print(dataset.categorical_names, dataset.categorical_names.keys())
    n_values = sum([
        len(dataset.categorical_names[i])
        for i in dataset.categorical_names.keys()
    ])
    model = network.NN_adult_3(n_values, 1)
    np.random.seed(1)
    print("state2", np.random.get_state()[1][0])
    tf.set_random_seed(1)
    model.train_anchor(
        explainer.encoder.transform(dataset.data_train).toarray(),
        dataset.train_labels,
        explainer.encoder.transform(dataset.data_validation).toarray(),
        dataset.validation_labels,
        explainer.encoder.transform(dataset.data_test).toarray(),
        dataset.test_labels,
        epochs=200,
        batch_size=120,
        use_gen=True)
    print("state3", np.random.get_state()[1][0])
    predict_fn = lambda x: model.predict(explainer.encoder.transform(x))
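
A hedged continuation sketch for train_network (not in the original): explaining the first test instance, following the same pattern as Examples #9 and #11 from this repository:

    # Assumed continuation: explain one test instance with the trained network.
    idx = 0
    instance = dataset.data_test[idx].reshape(1, -1)
    print("prediction:", explainer.class_names[predict_fn(instance)[0]])
    exp = explainer.explain_instance(instance, model.predict, threshold=0.95)
    print("Anchor: %s" % (" AND ".join(exp.names())))
    print("Precision: %.2f" % exp.precision())
    print("Coverage: %.2f" % exp.coverage())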
Example #6
def explain(anchor_exp: Explanation, training_df, test_df, explanation_target):
    job = Job.objects.filter(pk=anchor_exp.job)[0]

    explainer = anchor_tabular.AnchorTabularExplainer(
        [True, False],  # dataset.class_names
        job.encoding.features,  # = dataset.feature_names
        training_df.drop(['trace_id'], axis=1),  # dataset.data
        {
            job.encoding.features.index(item): list(range(max(training_df[item])))
            for item in job.encoding.features
        })  # categorical_names, keyed by feature index
    explainer.fit(
        training_df,  # dataset.train
        training_df['label'],  # dataset.labels_train (per-row labels)
        test_df,  # dataset.validation
        test_df['label']  # dataset.labels_validation
    )

    # show plot
    idx = 0
    np.random.seed(1)
    print(
        'Prediction: ',
        explainer.class_names[MODEL[job.predictive_model.predictive_model][
            ModelActions.PREDICT.value](job, test_df)[0]])
    exp = explainer.explain_instance(
        test_df.iloc[idx].values,
        MODEL[job.predictive_model.predictive_model][
            ModelActions.PREDICT.value],
        threshold=0.95)
    print('Anchor: %s' % (' AND '.join(exp.names())))
    print('Precision: %.2f' % exp.precision())
    print('Coverage: %.2f' % exp.coverage())
    """
	fit_anchor = np.where(np.all(dataset.test[:, exp.features()] == dataset.test[idx][exp.features()], axis=1))[0]
	print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / float(dataset.test.shape[0])))
	print('Anchor test precision: %.2f' % (
		np.mean(predict_fn(dataset.test[fit_anchor]) == predict_fn(dataset.test[idx].reshape(1, -1))))
		  )
	"""
    return dict(anchor=exp.names(), precision=exp.precision(), coverage=exp.coverage())
Example #7
    # Create DataIO object
    data_io = DataIOFactory.create(args)

    # Load text data as lists of lists of words (sequences) and corresponding list of lists of tags
    data_io = DataIOFactory.create(args)
    dataset, X_train, Y_train, X_dev, Y_dev, X_test, Y_test = data_io.read_train_dev_test(
        args)

    # fit imputation models
    # import ipdb; ipdb.set_trace()
    # tagger.fit_imputation_models(dataset, counterfactual_method = 'conditional_expected_value')

    # sklearn baselines
    explainer = anchor_tabular.AnchorTabularExplainer(
        dataset.class_names, dataset.feature_names, dataset.data,
        dataset.categorical_names)
    explainer.fit(dataset.train, dataset.labels_train, dataset.validation,
                  dataset.labels_validation)

    encode_fn = explainer.encoder.transform

    import sklearn
    import sklearn.linear_model
    import sklearn.neural_network
    import xgboost
    print("Logistic Regression baseline")
    model = sklearn.linear_model.LogisticRegression(C=1e2,
                                                    solver='lbfgs',
                                                    max_iter=300)
    model.fit(encode_fn(X_train), Y_train)
    print("Train acc: %.3f" % model.score(encode_fn(X_train), Y_train))
Example #8
best_features_list = [feature for sublist in top_5 for feature in sublist]


# In[185]:


pd.DataFrame(list(Counter(best_features_list).items()),
             columns=['Feature', 'Count']).sort_values(by='Count', ascending=False)


# In[194]:


from anchor import anchor_tabular


# In[203]:


exp = anchor_tabular.AnchorTabularExplainer(class_names=['notengaged', 'engaged'],
                                            feature_names=features_train_df.columns.tolist(),
                                            data=features_train_df)


# In[209]:


exp.fit(features_train_df, labels_train_df, features_validation_df, labels_validation_df, discretizer='decile')
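
A hedged sketch (not in the original notebook) of explaining one validation row; clf is an assumed fitted classifier whose predict() accepts rows in the same representation the explainer samples:

# In[ ]:


row = features_validation_df.values[0]
anchor_exp = exp.explain_instance(row, clf.predict, threshold=0.95)  # clf is assumed
print('Anchor: %s' % (' AND '.join(anchor_exp.names())))
print('Precision: %.2f' % anchor_exp.precision())
print('Coverage: %.2f' % anchor_exp.coverage())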

Example #9
def complete_test():
    # Load dataset
    import numpy as np
    np.random.seed(1)
    import tensorflow as tf
    tf.set_random_seed(1)

    import sklearn
    from DNN.kera import pre_processing
    from DNN.Induction.Anchor import anchor_tabular, utils

    datamanager = pre_processing.Datamanager(dataset="adults",
                                             in_mod="normal",
                                             out_mod="normal")
    dataset = datamanager.ret

    # Import the network.
    # Fit the explainer to the dataset.
    explainer = anchor_tabular.AnchorTabularExplainer(
        dataset.class_names, dataset.feature_names, dataset.data_train,
        dataset.categorical_names)

    # ! explainer.encoder.transform returns a sparse matrix instead of a dense np.array
    explainer.fit(dataset.data_train, dataset.train_labels,
                  dataset.data_validation, dataset.validation_labels)

    from DNN.kera import network
    #np.random.seed(1)
    #keras.random.seed(1)
    #print(dataset.categorical_names, dataset.categorical_names.keys())
    n_values = sum([
        len(dataset.categorical_names[i])
        for i in dataset.categorical_names.keys()
    ])
    model = network.Model(name="NN-adult-5",
                          c_path="NN-Adult-5/NN-Adult-5-8531.hdf5")
    model.evaluate(
        data_train=explainer.encoder.transform(dataset.data_train).toarray(),
        train_labels=dataset.train_labels,
        data_test=explainer.encoder.transform(dataset.data_test).toarray(),
        test_labels=dataset.test_labels)

    # Try to explain a given prediction.
    # print(datamanager.translate(dataset.data_train[0]))
    predict_fn = lambda x: model.predict(explainer.encoder.transform(x))

    idx = 1
    instance = dataset.data_test[idx].reshape(1, -1)
    prediction = predict_fn(instance)[0]
    print("prediction:", prediction, "=", explainer.class_names[prediction])

    exp = explainer.explain_instance(instance,
                                     model.predict,
                                     threshold=0.98,
                                     verbose=True)

    from DNN import explanation
    from DNN import knowledge_base

    print(exp.exp_map.keys())
    print(datamanager.ret.feature_names)
    # We need to pass in the actual values of the prediction.
    print(instance, instance.flatten())
    #instance = instance.flatten()
    value = [int(instance.flatten()[f]) for f in exp.features()]
    print(value)
    print((' AND '.join(exp.names())))
    print(exp.exp_map)
    exp_1 = explanation.Explanation(**exp.exp_map)
    print(exp_1.features())
    print(exp_1.names())
    print(
        exp_1.get_explanation(dataset.feature_names,
                              dataset.categorical_names))
Example #10
def test_anchors():
    import anchor

    import numpy as np
    # ! Old imports
    #pip install anchor_exp
    from anchor import utils, anchor_tabular
    #from DNN.Induction import Anchor
    #from DNN.Induction.Anchor import anchor_tabular, utils
    dataset_folder = "Data/"

    # Get a Bunch object whose dict contains the interesting parts of the dataset:
    # training set, validation set, test set, feature_names, categories per feature, etc.
    dataset = utils.load_dataset("adult",
                                 balance=True,
                                 dataset_folder=dataset_folder)

    print(dataset.__dict__.keys())
    print(dataset.categorical_names)
    explainer = anchor_tabular.AnchorTabularExplainer(
        dataset.class_names, dataset.feature_names, dataset.data,
        dataset.categorical_names)
    explainer.fit(dataset.train,
                  dataset.labels_train,
                  dataset.validation,
                  dataset.labels_validation,
                  discretizer='quartile')
    exit()
    print(explainer.encoder)
    print(explainer.encoder.transform)
    #print(dataset.__dict__)
    #model = network.Model(name="wine")
    #dataman = Datamanager.Datamanager(dataset="wine")
    import sklearn
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=50, n_jobs=5)
    print(model)
    print(explainer.encoder.transform(dataset.train))  #, dataset.labels_train
    model.fit(explainer.encoder.transform(dataset.train), dataset.labels_train)
    predict_fn = lambda x: model.predict(explainer.encoder.transform(
        x))  # use the explainer.encoder to transform the data first.
    print(
        'Train',
        sklearn.metrics.accuracy_score(dataset.labels_train,
                                       predict_fn(dataset.train)))
    print(
        'Test',
        sklearn.metrics.accuracy_score(dataset.labels_test,
                                       predict_fn(dataset.test)))

    # Anchor
    idx = 0
    np.random.seed(1)
    print("Instance to explain", dataset.test[idx].reshape(1, -1))
    prediction_class = predict_fn(dataset.test[idx].reshape(
        1, -1))[0]  # select first index of prediction matrix.
    print("prediction: ", explainer.class_names[prediction_class])

    exp = explainer.explain_instance(dataset.test[idx],
                                     model.predict,
                                     threshold=0.95)
    print(explainer.explain_instance)
    print(exp.names())
    print("Anchor: %s" % (" AND ".join(exp.names())))
    print("Precision: %.2f" % exp.precision())
    print("Coverage: %.2f" % exp.coverage())
    print(exp.features())

    # Check that the anchor holds for other data points.
    all_np = np.all(
        dataset.test[:, exp.features()] == dataset.test[idx][exp.features()],
        axis=1)
    print(all_np)
    fit_anchor = np.where((all_np))[0]  # select the array of indexes?
    print(fit_anchor, fit_anchor.shape)
    print('Anchor test precision: %.2f' % (np.mean(
        predict_fn(dataset.test[fit_anchor]) == predict_fn(
            dataset.test[idx].reshape(1, -1)))))
    print('Anchor test coverage: %.2f' %
          (fit_anchor.shape[0] / float(dataset.test.shape[0])))

    # Looking at a particular anchor
    print('Partial anchor: %s' %
          (' AND '.join(exp.names(1))))  # get part of the anchor.
    print('Partial precision: %.2f' % exp.precision(1))
    print('Partial coverage: %.2f' % exp.coverage(1))

    print(exp.features())
    print(dataset.test[:,
                       exp.features(1)] == dataset.test[idx][exp.features(1)])
    fit_partial = np.where(
        np.all(dataset.test[:, exp.features(1)] == dataset.test[idx][
            exp.features(1)],
               axis=1))[0]
    print('Partial anchor test precision: %.2f' % (np.mean(
        predict_fn(dataset.test[fit_partial]) == predict_fn(
            dataset.test[idx].reshape(1, -1)))))
    print('Partial anchor test coverage: %.2f' %
          (fit_partial.shape[0] / float(dataset.test.shape[0])))
Example #11
def load_model():
    # Load pretrained model
    import numpy as np
    np.random.seed(1)
    import tensorflow as tf
    tf.set_random_seed(1)

    import sklearn
    from DNN.keras import pre_processing
    from DNN.Induction.Anchor import anchor_tabular, utils

    datamanager = pre_processing.Datamanager(dataset="adults",
                                             in_mod="normal",
                                             out_mod="normal")
    dataset = datamanager.ret

    #dataset.ret.data_train[1]
    #print(dataset.data_train[0])
    #print(datamanager.translate(dataset.data_train[0]))

    #print(datamanager.translate(dataset.data_test[0]))

    #exit()

    # Fit the explainer to the dataset.
    explainer = anchor_tabular.AnchorTabularExplainer(
        dataset.class_names, dataset.feature_names, dataset.data_train,
        dataset.categorical_names)

    # ! explainer.encoder.transform returns a sparse matrix instead of a dense np.array
    explainer.fit(dataset.data_train, dataset.train_labels,
                  dataset.data_validation, dataset.validation_labels)

    from DNN.keras import network
    #np.random.seed(1)
    #keras.random.seed(1)
    #print(dataset.categorical_names, dataset.categorical_names.keys())
    n_values = sum([
        len(dataset.categorical_names[i])
        for i in dataset.categorical_names.keys()
    ])
    model = network.Model(name="NN-adult-5",
                          c_path="NN-Adult-5/NN-Adult-5-8531.hdf5")
    model.evaluate(
        data_train=explainer.encoder.transform(dataset.data_train).toarray(),
        train_labels=dataset.train_labels,
        data_test=explainer.encoder.transform(dataset.data_test).toarray(),
        test_labels=dataset.test_labels)
    #explainer.encoder.transform(dataset.data_train).toarray(), dataset.train_labels,
    #        explainer.encoder.transform(dataset.data_validation).toarray(), dataset.validation_labels,
    #        explainer.encoder.transform(dataset.data_test).toarray(), dataset.test_labels

    # Try to explain a given prediction.
    # print(datamanager.translate(dataset.data_train[0]))
    predict_fn = lambda x: model.predict(explainer.encoder.transform(x))

    np.random.seed(1)
    idx = 1
    instance = dataset.data_test[idx].reshape(1, -1)
    print("instance", instance[0])
    print(datamanager.translate(instance[0]))
    prediction = predict_fn(instance)[0]
    print("prediction:", prediction, "=", explainer.class_names[prediction])
    #print("prediction: ", explainer.class_names[predict_fn(dataset.data_test[idx].reshape(1,-1))[0]]) # predict on the first datapoint

    exp = explainer.explain_instance(instance,
                                     model.predict,
                                     threshold=0.99,
                                     verbose=True)
    #print(exp.names())
    print("Anchor: %s" % (" AND ".join(exp.names())))
    print("Precision: %.2f" % exp.precision())
    print("Coverage: %.2f" % exp.coverage())
    print("Features:", exp.features())

    print("anchor values:", [instance[0][f] for f in exp.features()])

    print(dataset.data_test[:, exp.features()],
          dataset.data_test[:, exp.features()].shape)

    all_np = np.all(dataset.data_test[:, exp.features()] ==
                    dataset.data_test[idx][exp.features()],
                    axis=1)
    fit_anchor = np.where((all_np))[0]  # select the array of indexes?
    #print(dataset.data_test[:,exp.features()][fit_anchor])

    # Of all test points that match the instance on the anchor features, what fraction get the same prediction.
    print('Anchor test precision: %.2f' % (np.mean(
        predict_fn(dataset.data_test[fit_anchor]) == predict_fn(instance))))
    # What percentage of the test set matches the anchor (coverage).
    print('Anchor test coverage: %.2f' %
          (fit_anchor.shape[0] / float(dataset.data_test.shape[0])))

    print("\nPartial anchor 1")
    # Looking at a particular anchor
    print(exp.names(0), exp.names(1))
    print('Partial anchor: %s' % (' AND '.join(exp.names(1))))
    print('Partial precision: %.2f' % exp.precision(1))
    print('Partial coverage: %.2f' % exp.coverage(1))
    print('partial features: {}'.format(exp.features(1)))
    print(instance[0])

    print("partial precision and coverage:")
    all_np = np.all(dataset.data_test[:, exp.features(1)] ==
                    dataset.data_test[idx][exp.features(1)],
                    axis=1)
    fit_anchor = np.where((all_np))[0]  # select the array of indexes?

    # Of all test points that match the instance on the partial anchor, what fraction get the same prediction.
    print('Partial Anchor test precision: %.2f' % (np.mean(
        predict_fn(dataset.data_test[fit_anchor]) == predict_fn(instance))))
    # What percentage of the test set matches the partial anchor (coverage).
    print('Partial Anchor test coverage: %.2f' %
          (fit_anchor.shape[0] / float(dataset.data_test.shape[0])))

    # translation of prediction data.
    print(datamanager.translate(dataset.data_test[idx]))

    print("\n:::TESTING::::")

    print(exp.exp_map['names'], type(exp.exp_map['names']))
    print(exp.exp_map['feature'])
    print(exp.exp_map['precision'])
    print(exp.exp_map['coverage'])
    print(exp.exp_map['mean'])
    print(exp.exp_map['all_precision'])
    print(exp.exp_map['num_preds'])
    print(exp.exp_map['instance'])

    #print(exp.exp_map['examples'])
    print(exp.exp_map.keys())
Example #12
def test_anchor_nn_data():
    import numpy as np
    # ? copy from repository
    from DNN.Induction.Anchor import anchor_tabular, utils

    from DNN.keras import pre_processing
    dataset_folder = "Data/"
    dataset = utils.load_dataset("adult",
                                 balance=True,
                                 dataset_folder=dataset_folder)
    print(dataset.__dict__.keys())
    datamanager = pre_processing.Datamanager(dataset="adults",
                                             in_mod="normal",
                                             out_mod="normal")
    #dataset = datamanager.ret

    #print(dataset_2.categorical_names[11])
    print(dataset.categorical_names[11])

    #print(dataset_2.ordinal_features)
    print(dataset.ordinal_features)

    #print(dataset_2.feature_names)
    print(dataset.feature_names)

    #print(dataset_2.train,type(dataset_2.train),type(dataset_2.train[0]),type(dataset_2.train[0][0]))
    #print(dataset.data_train,type(dataset.data_train),type(dataset.data_train[0]),type(dataset.data_train[0][0]))

    #print(dataset_2.labels_train)
    #print(dataset.train_labels)

    #dataman = preprocessing.datamanager()

    # Fit the explainer to the dataset.
    explainer = anchor_tabular.AnchorTabularExplainer(
        dataset.class_names, dataset.feature_names, dataset.train,
        dataset.categorical_names)

    explainer.fit(dataset.train, dataset.labels_train, dataset.validation,
                  dataset.labels_validation)

    print(explainer.encoder.transform)
    print(explainer.disc)
    #print(dataset.__dict__)
    #model = network.Model(name="wine")
    #dataman = Datamanager.Datamanager(dataset="wine")

    #print(dataset.data_train[0])
    #print(explainer.encoder.transform(dataset.data_train)[0].shape)
    #print(explainer.encoder.transform(dataset.data_train)[0].toarray())
    #print(explainer.encoder.transformers[0])
    import sklearn
    if (True):  # IF network.
        from DNN.keras import network

        nn = network.NN_adult_2(123, 1)
        #dataset_2.train, dataset_2.labels_train, dataset_2.validation, dataset_2.labels_validation
        nn.train_anchor(explainer.encoder.transform(dataset.train),
                        dataset.labels_train,
                        explainer.encoder.transform(dataset.validation),
                        dataset.labels_validation,
                        epochs=1,
                        batch_size=100)

        model = nn
        # ? Load pretrained model..
        #model = network.Model(name="adults")
        predict_fn = lambda i: model.predict(explainer.encoder.transform(
            i))  # use the explainer.encoder to transform the data first.
        print(dataset.train.shape,
              explainer.encoder.transform(dataset.train).shape)
        print(
            predict_fn(dataset.train).shape, type(predict_fn(dataset.train)),
            predict_fn(dataset.train))

        print(
            'Train',
            sklearn.metrics.accuracy_score(dataset.labels_train,
                                           predict_fn(dataset.train)))
        print(
            'Test',
            sklearn.metrics.accuracy_score(dataset.labels_test,
                                           predict_fn(dataset.test)))
    else:
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=50, n_jobs=5)
        print(model)
        #print(explainer.encoder.transform(dataset.data_train))#, dataset.labels_train
        model.fit(explainer.encoder.transform(dataset.data_train),
                  dataset.train_labels)
        predict_fn = lambda x: model.predict(explainer.encoder.transform(
            x))  # use the explainer.encoder to transform the data first.
        print(dataset.data_train.shape,
              explainer.encoder.transform(dataset.data_train).shape)
        print(
            predict_fn(dataset.data_train).shape,
            type(predict_fn(dataset.data_train)),
            predict_fn(dataset.data_train))
        print(
            'Train',
            sklearn.metrics.accuracy_score(dataset.labels_train,
                                           predict_fn(dataset.train)))
        print(
            'Test',
            sklearn.metrics.accuracy_score(dataset.labels_test,
                                           predict_fn(dataset.test)))

    idx = 0
    np.random.seed(1)
    print(predict_fn(dataset.test[idx].reshape(1, -1))[0])
    prediction = predict_fn(dataset.test[idx].reshape(1, -1))[0]
    print(explainer.class_names)
    print("prediction:", explainer.class_names[prediction])

    #print("prediction: ", explainer.class_names[predict_fn(dataset.data_test[idx].reshape(1,-1))[0]]) # predict on the first datapoint
    exp = explainer.explain_instance(dataset.test[idx],
                                     model.predict,
                                     threshold=0.95)
    print(exp.names())
    print("Anchor: %s" % (" AND ".join(exp.names())))
    print("Precision: %.2f" % exp.precision())
    print("Coverage: %.2f" % exp.coverage())
    print(exp.features())

    exit()
    # TODO: put explainer encoder in pre_processor

    model.fit(explainer.encoder.transform(dataset.data_train),
              dataset.train_labels)
    predict_fn = lambda x: model.predict(explainer.encoder.transform(
        x))  # use the explainer.encoder to transform the data first.
    print(
        'Train',
        sklearn.metrics.accuracy_score(dataset.train_labels,
                                       predict_fn(dataset.data_train)))
    print(
        'Test',
        sklearn.metrics.accuracy_score(dataset.test_labels,
                                       predict_fn(dataset.data_test)))
    # Anchor
    idx = 0
    np.random.seed(1)
    print(dataset.test_labels[idx])
    print(dataset.test_labels[idx].reshape(1, -1))

    print("prediction: ",
          explainer.class_names[predict_fn(dataset.data_test[idx].reshape(
              1, -1))[0]])  # predict on the first datapoint
    exp = explainer.explain_instance(dataset.data_test[idx],
                                     model.predict,
                                     threshold=0.95)
    print(exp.names())
    print("Anchor: %s" % (" AND ".join(exp.names())))
    print("Precision: %.2f" % exp.precision())
    print("Coverage: %.2f" % exp.coverage())
    print(exp.features())

    # TODO: list of categories -> encoding -> one_hot_encoding.
    exit()
    # Check that the anchor holds for other data points.
    all_np = np.all(
        dataset.test[:, exp.features()] == dataset.test[idx][exp.features()],
        axis=1)
    print(all_np)
    fit_anchor = np.where((all_np))[0]  # select the array of indexes?
    print(fit_anchor, fit_anchor.shape)
    print('Anchor test precision: %.2f' % (np.mean(
        predict_fn(dataset.test[fit_anchor]) == predict_fn(
            dataset.test[idx].reshape(1, -1)))))
    print('Anchor test coverage: %.2f' %
          (fit_anchor.shape[0] / float(dataset.test.shape[0])))

    # Looking at a particular anchor
    print('Partial anchor: %s' % (' AND '.join(exp.names(1))))
    print('Partial precision: %.2f' % exp.precision(1))
    print('Partial coverage: %.2f' % exp.coverage(1))
Example #13
def anchor_call(xgb,
                sample=None,
                nb_samples=5,
                feats='all',
                nb_features_in_exp=5,
                threshold=0.95):

    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
            resource.getrusage(resource.RUSAGE_SELF).ru_utime

    # we need a way to say which features are categorical;
    # we do not have this information here.
    explainer = anchor_tabular.AnchorTabularExplainer(
        class_names=xgb.target_name,
        feature_names=xgb.feature_names,
        train_data=xgb.X,
        categorical_names=xgb.categorical_names if xgb.use_categorical else {})
    # if (len(xgb.X_test) != 0):
    #     explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_test, xgb.Y_test)
    # else:
    #     explainer.fit(xgb.X_train, xgb.Y_train, xgb.X_train, xgb.Y_train)
    predict_fn_xgb = lambda x: xgb.model.predict(xgb.transform(x)).astype(int)

    f2imap = {}
    for i, f in enumerate(xgb.feature_names):
        f2imap[f.strip()] = i

    if (sample is not None):
        try:
            feat_sample = np.asarray(sample, dtype=np.float32)
        except Exception as inst:
            print("Cannot parse input sample:", sample, inst)
            exit()
        print(
            "\n\n\n Starting Anchor explainer... \nConsidering a sample with features:",
            feat_sample)
        if not (len(feat_sample) == len(xgb.X_train[0])):
            print(
                "Unmatched features are not supported: the sample has {} features "
                "but this benchmark expects {}".format(len(feat_sample),
                                                       len(xgb.X_train[0])))
            exit()

        # compute boost predictions
        feat_sample_exp = np.expand_dims(feat_sample, axis=0)
        feat_sample_exp = xgb.transform(feat_sample_exp)
        y_pred = xgb.model.predict(feat_sample_exp)[0]
        y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
        # hack: testing that we use the same one-hot encoding
        # test_feat_sample_exp = explainer.encoder.transform(feat_sample_exp)
        test_y_pred = xgb.model.predict(feat_sample_exp)[0]
        test_y_pred_prob = xgb.model.predict_proba(feat_sample_exp)[0]
        assert (np.allclose(y_pred_prob, test_y_pred_prob))
        print(
            'Prediction: ',
            explainer.class_names[predict_fn_xgb(feat_sample.reshape(1,
                                                                     -1))[0]])
        # exp = explainer.explain_instance(feat_sample, xgb.model.predict, threshold=threshold)
        exp = explainer.explain_instance(feat_sample,
                                         predict_fn_xgb,
                                         threshold=threshold)
        print('Anchor: %s' % (' AND '.join(exp.names())))
        print('Precision: %.2f' % exp.precision())
        print('Coverage: %.2f' % exp.coverage())
        #print(exp.features())
        #print(exp.names())

        # explanation
        expl = []

        if (xgb.use_categorical):
            for k, v in enumerate(exp.features()):
                expl.append(v)
                print("Clause ", k, end=": ")
                print("feature (",
                      v,
                      ",",
                      explainer.feature_names[v],
                      end="); ")
                print("value (", feat_sample[v], ",",
                      explainer.categorical_names[v][int(feat_sample[v])], ")")
        else:
            print(
                "We only support datasets with categorical features for Anchor. Please pre-process your data."
            )
            exit()

        timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
                resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
        print('  time: {0:.2f}'.format(timer))

        return sorted(expl)

    ###################################### TESTING
    max_sample = nb_samples
    y_pred_prob = xgb.model.predict_proba(xgb.X_test)
    y_pred = xgb.model.predict(xgb.X_test)

    nb_tests = min(max_sample, len(xgb.Y_test))
    top_labels = 1
    for sample in range(nb_tests):
        np.set_printoptions(precision=2)
        feat_sample = xgb.X_test[sample]
        print("Considering a sample with features:", feat_sample)
        if (False):
            feat_sample[4] = 3000
            y_pred_prob_sample = xgb.model.predict_proba([feat_sample])
            print(y_pred_prob_sample)
            print("\t Predictions:", y_pred_prob[sample])
        exp = explainer.explain_instance(feat_sample,
                                         predict_fn_xgb,
                                         num_features=xgb.num_class,
                                         top_labels=1,
                                         labels=list(range(xgb.num_class)))
        for i in range(xgb.num_class):
            if (i != y_pred[sample]):
                continue
            print("\t \t Explanations for the winner class", i,
                  " (xgboost confidence = ", y_pred_prob[sample][i], ")")
            print("\t \t Features in explanations: ", exp.as_list(label=i))
    timer = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime + \
            resource.getrusage(resource.RUSAGE_SELF).ru_utime - timer
    print('  time: {0:.2f}'.format(timer))
    return
Example #14
def main():

    dataset_name = 'german_credit.csv'
    path_data = './datasets/'
    dataset = prepare_german_dataset(dataset_name, path_data)

    X, y = dataset['X'], dataset['y']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    blackbox = RandomForestClassifier(n_estimators=20)
    blackbox.fit(X_train, y_train)

    X2E = X_test
    idx_record2explain = 9

    class_name = dataset['class_name']
    columns = dataset['columns']
    continuous = dataset['continuous']
    possible_outcomes = dataset['possible_outcomes']
    label_encoder = dataset['label_encoder']

    feature_names = list(columns)
    feature_names.remove(class_name)

    categorical_names = dict()
    idx_discrete_features = list()
    for idx, col in enumerate(feature_names):
        if col == class_name or col in continuous:
            continue
        idx_discrete_features.append(idx)
        categorical_names[idx] = label_encoder[col].classes_

    # Create Anchor Explainer
    explainer = anchor_tabular.AnchorTabularExplainer(possible_outcomes,
                                                      feature_names, X2E,
                                                      categorical_names)
    explainer.fit(X_train, y_train, X_test, y_test)

    print(
        'Prediction: ',
        possible_outcomes[blackbox.predict(X2E[idx_record2explain].reshape(
            1, -1))[0]])

    exp, info = explainer.explain_instance(X2E[idx_record2explain].reshape(
        1, -1),
                                           blackbox.predict,
                                           threshold=0.95)

    print('Anchor: %s' % (' AND '.join(exp.names())))
    print('Precision: %.2f' % exp.precision())
    print('Coverage: %.2f' % exp.coverage())

    # Get test examples where the anchor applies
    fit_anchor = np.where(
        np.all(X2E[:,
                   exp.features()] == X2E[idx_record2explain][exp.features()],
               axis=1))[0]
    print('Anchor test coverage: %.2f' %
          (fit_anchor.shape[0] / float(X2E.shape[0])))
    print('Anchor test precision: %.2f' % (np.mean(
        blackbox.predict(X2E[fit_anchor]) == blackbox.predict(
            X2E[idx_record2explain].reshape(1, -1)))))

    print(blackbox.predict(info['state']['raw_data']))