Example #1
    def test_gpx_classify(self):

        MY_INST = 33
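        # Train the black-box model whose behavior Gpx will explain locally.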
        x_varied, y_varied = make_moons(n_samples=500, random_state=170)
        model = MLPClassifier()
        model.fit(x_varied, y_varied)
        my_predict = model.predict_proba

        gpx = Gpx(my_predict,
                  x_train=x_varied,
                  y_train=y_varied,
                  random_state=42,
                  num_samples=250)

        y_hat_gpx = gpx.explaining(x_varied[MY_INST, :])
        y_hat_bb = my_predict(x_varied[MY_INST, :].reshape(1, -1))
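
        # understand() reports how faithfully the GP surrogate mimics the
        # black box on the locally sampled points (checked against 0.9 below).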

        acc = gpx.understand(metric='accuracy')

        gpx.logger.info('{} / y_hat_gpx: {} / y_hat_bb: {}'.format(
            self.test_gpx_classify.__name__, type(y_hat_gpx), type(y_hat_bb)))

        self.assertEqual(np.sum(y_hat_gpx), np.sum(y_hat_bb),
                         'GPX failed to reproduce the black-box prediction')

        gpx.logger.info('test accuracy: {}'.format(acc))
        self.assertGreater(
            acc, 0.9,
            'Accuracy from the understand() method of the Gpx class fell below 0.9!')
Example #2
    def test_feature_sensitivity(self):
        x, y = make_moons(n_samples=1500, noise=.4, random_state=17)
        clf = MLPClassifier()
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=.8,
                                                            test_size=.2,
                                                            random_state=17)
        clf.fit(x_train, y_train)

        gpx = Gpx(clf.predict_proba,
                  x_train=x,
                  y_train=y,
                  random_state=42,
                  feature_names=['x', 'y'])
        gpx.explaining(x_test[30, :])
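
        # feature_sensitivity() maps each feature to sensitivity values for
        # the explained instance; their sums are checked below.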

        dict_sens = gpx.feature_sensitivity()

        for key in dict_sens:
            sens = dict_sens[key][0]
            gpx.logger.info(
                'test_feature_sensitivity sensitivity sum: {}'.format(
                    np.sum(sens)))
            self.assertGreater(np.sum(sens), 1)
Example #3
    def test_feature_names(self):
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        f_names = load_breast_cancer().feature_names

        model = RandomForestClassifier()
        model.fit(X_train, y_train)

        gp_hyper_parameters = {
            'population_size': 200,
            'generations': 200,
            'stopping_criteria': 0.0001,
            'p_crossover': 0.7,
            'p_subtree_mutation': 0.1,
            'p_hoist_mutation': 0.05,
            'p_point_mutation': 0.1,
            'const_range': (-1, 1),
            'parsimony_coefficient': 0.0005,
            'init_depth': (3, 6),
            'n_jobs': -1,
            'function_set': ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs',
                             'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan'),
            'feature_names': f_names
        }
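
        # These gplearn-style settings override the Gpx defaults; the
        # feature_names entry lets the evolved program print readable names.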

        gpx = Gpx(model.predict_proba,
                  x_train=X_train,
                  y_train=y_train,
                  random_state=42,
                  gp_hyper_parameters=gp_hyper_parameters)
        gpx.explaining(X_test[30, :])

        print(gpx.gp_model._program)
Example #4
    def test_understand(self):
        x, y = make_moons(n_samples=1500, noise=.4, random_state=17)
        clf = MLPClassifier()
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=.8,
                                                            test_size=.2,
                                                            random_state=17)
        clf.fit(x_train, y_train)

        gpx = Gpx(clf.predict_proba,
                  x_train=x,
                  y_train=y,
                  feature_names=['x', 'y'])
        gpx.explaining(x_test[30, :])

        y_proba = clf.predict_proba(x_test)
        gpx.logger.info(gpx.proba_transform(y_proba))
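
        # The 'loss' metric is not available for every setup; fall back to
        # accuracy when understand() raises ValueError.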

        try:
            u = gpx.understand(metric='loss')
        except ValueError as e:
            gpx.logger.exception(e)
            u = gpx.understand(metric='accuracy')
            gpx.logger.info('test_understand accuracy {}'.format(u))
            self.assertGreater(u, .9, 'test_understand accuracy {}'.format(u))
Example #5
    def test_features_distribution(self):
        x_varied, y_varied = make_moons(n_samples=500, random_state=170)
        model = MLPClassifier()
        model.fit(x_varied, y_varied)
        my_predict = model.predict_proba

        gpx = Gpx(my_predict,
                  x_train=x_varied,
                  y_train=y_varied,
                  random_state=42,
                  num_samples=250)

        gpx.explaining(x_varied[13, :])
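        # features_distribution() returns per-feature statistics of the
        # evolved program; the test expects x_1 to outweigh x_0 here.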
        d = gpx.features_distribution()

        gpx.logger.info('distribution-> program:{} / x_0: {} / x_1: {}'.format(
            gpx.gp_model._program, d['x_0'], d['x_1']))
        self.assertLess(
            d['x_0'], d['x_1'],
            'Unexpected features_distribution() output: x_0 is not less than x_1!'
        )
Example #6
    def test_gpx_regression(self):

        INSTANCE: int = 13
        reg = RandomForestRegressor()
        x, y = load_boston(return_X_y=True)
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=.8,
                                                            test_size=.2,
                                                            random_state=42)
        reg.fit(x_train, y_train)
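
        # problem='regression' switches Gpx to a regression surrogate; its
        # local mse (understand) is compared to the global model mse below.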

        gpx = Gpx(predict=reg.predict,
                  x_train=x_train,
                  y_train=y_train,
                  problem='regression',
                  random_state=42)
        gpx.explaining(x_test[INSTANCE, :])
        y_hat = reg.predict(x_test)
        mse = mean_squared_error(y_test, y_hat)

        d = gpx.features_distribution()

        # self.assertEqual(max(list(d.values())), d['x_2'])

        self.assertLess(
            gpx.understand(metric='mse'), mse,
            '{}: local mse from understand() is not less than the global '
            'black-box mse'.format(self.test_gpx_regression.__name__))
Example #7
    def setUp(self) -> None:
        self.PRINT: bool = False
        self.NUN_SAMPLES: int = 250
        self.INSTANCE: int = 74
        x, y = make_moons(n_samples=500, noise=.1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=.3)

        self.clf = RandomForestClassifier(random_state=42)
        self.clf.fit(self.x_train, self.y_train)

        gpx = Gpx(self.clf.predict_proba,
                  x_train=self.x_train,
                  y_train=self.y_train,
                  feature_names=['x', 'y'],
                  num_samples=self.NUN_SAMPLES)
        self.ns = NoiseSet(gpx)

        self.labels = gpx.labels
Example #8
    def test_grafic_sensibility(self):
        INSTANCE: int = 74
        x, y = make_moons(n_samples=1500, noise=.4, random_state=17)
        clf = MLPClassifier()
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=.8,
                                                            test_size=.2,
                                                            random_state=17)
        clf.fit(x_train, y_train)

        gpx = Gpx(clf.predict_proba,
                  x_train=x,
                  y_train=y,
                  random_state=42,
                  feature_names=['x', 'y'])
        gpx.explaining(x_test[INSTANCE, :])
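
        # x_around / y_around hold the neighborhood sample generated for the
        # explanation; it is reused here to plot the surrogate's surface.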

        x, y = gpx.x_around[:, 0], gpx.x_around[:, 1]
        y_proba = gpx.proba_transform(gpx.y_around)

        resolution = 0.02
        x1_min, x1_max = x.min() - 1, x.max() + 1
        x2_min, x2_max = y.min() - 1, y.max() + 1
        xm1, xm2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                               np.arange(x2_min, x2_max, resolution))
        Z_bb = gpx.gp_prediction(np.array([xm1.ravel(), xm2.ravel()]).T)

        fig, ax = plt.subplots()
        ax.set_xlim(x1_min, x1_max)
        ax.set_ylim(x2_min, x2_max)
        scat = plt.scatter(x, y, c=y_proba)

        def func(data):
            k, j = data
            scat.set_offsets(k)
            scat.set_array(j)

        mmm = gpx.max_min_matrix(noise_range=10)

        gen = []
        for n in mmm[:, 0]:
            aux = gpx.x_around.copy()
            aux[:, 0] = n
            gen.append((aux.copy(), gpx.gp_prediction(aux.copy())))

        animation = ani.FuncAnimation(fig,
                                      func,
                                      gen,
                                      interval=200,
                                      save_count=200)

        plt.contourf(xm1, xm2, Z_bb.reshape(xm1.shape), alpha=0.4)
        plt.scatter(x, y, c=y_proba)

        plt.show()

        writergif = ani.PillowWriter(fps=5)
        animation.save('sens_x_2.gif', writer=writergif)

        sens_gpx = gpx.feature_sensitivity()
        print(sens_gpx)
Example #9
    def test_gradient(self):

        INSTANCE: int = 15
        wine = load_wine()
        X, y = load_wine(return_X_y=True)

        X = X[y != 2]
        y = y[y != 2]

        clf = RandomForestClassifier()
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=.5,
                                                            test_size=.5,
                                                            random_state=42)

        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)

        clf.fit(x_train, y_train)

        gp_hyper_parameters = {
            'population_size': 100,
            'generations': 100,
            'stopping_criteria': 0.00001,
            'p_crossover': 0.7,
            'p_subtree_mutation': 0.1,
            'p_hoist_mutation': 0.05,
            'p_point_mutation': 0.1,
            'const_range': (-1, 1),
            'parsimony_coefficient': 0.0001,
            'init_depth': (3, 6),
            'n_jobs': -1,
            'function_set': ('add', 'mul', 'sin', 'cos', 'tan', 'sqrt'),
            'random_state': 42,
        }

        gpx = Gpx(clf.predict_proba,
                  x_train=x_train,
                  gp_hyper_parameters=gp_hyper_parameters,
                  y_train=y_train,
                  feature_names=wine.feature_names,
                  num_samples=5000,
                  k_neighbor=10)

        gpx.explaining(scaler.transform(x_test[INSTANCE, :].reshape(1, -1)))
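
        # program2sympy() exposes the evolved program as a sympy-compatible
        # expression; gradient_analysis() yields its partials per feature.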

        prog = gpx.program2sympy()

        print(sp.latex(sp.sympify(prog)))

        print(prog)

        part_dic = gpx.gradient_analysis()

        print('\n\n')
        print(part_dic)

        my_subs = zip(wine.feature_names, x_test[INSTANCE, :])
        my_subs_list = []

        for s_name, s_value in my_subs:
            if s_name in prog:
                my_subs_list.append((s_name, s_value))

        print('\n\n')

        y_0 = []
        y_1 = []
        names = []
        for k, v in part_dic.items():
            partial = v.subs(my_subs_list)
            print('feature: {}  gradient: {}'.format(k, partial))
            if partial >= 0:
                y_0.append(partial)
            else:
                y_1.append(partial)
            names.append(str(k).upper())
        x_0 = range(len(y_0))
        x_1 = range(len(y_0), len(part_dic))

        fig, ax = plt.subplots()
        ax.barh(x_0, y_0, color='b')
        ax.barh(x_1, y_1, color='r')
        width = 0.3
        ind = np.arange(len(part_dic))
        ax.set_yticks(ind + width / 2)
        ax.set_yticklabels(names, minor=False)
        # for i, v in enumerate(y_0 + y_1):
        #     ax.text(v + 3, i + .25, str(v), color='blue', fontweight='bold')
        plt.show()
Example #10
    def test_multi_class(self):

        INSTANCE: int = 20
        iris = load_iris()
        X, y = load_iris(return_X_y=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            random_state=42)

        scaler = Normalizer()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)

        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_train, y_train)

        gp_hyper_parameters = {
            'population_size': 100,
            'generations': 100,
            'stopping_criteria': 0.0001,
            'p_crossover': 0.7,
            'p_subtree_mutation': 0.1,
            'p_hoist_mutation': 0.05,
            'p_point_mutation': 0.1,
            'const_range': (-1, 1),
            'parsimony_coefficient': 0.0005,
            'init_depth': (3, 6),
            'n_jobs': -1,
            'random_state': 42,
            'function_set': ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs',
                             'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan')
        }
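
        # With three iris classes, the surrogate's labels are checked against
        # clf.predict on the same neighborhood sample (accuracy logged below).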

        gpx = Gpx(clf.predict_proba,
                  gp_hyper_parameters=gp_hyper_parameters,
                  x_train=X_train,
                  y_train=y_train,
                  feature_names=iris.feature_names,
                  num_samples=500,
                  k_neighbor=50,
                  random_state=42)

        y_hat = gpx.explaining(
            scaler.transform(X_test[INSTANCE, :].reshape(1, -1)))

        x_around = gpx.x_around

        gpx_y = gpx.gp_prediction(x_around)
        bb_y = clf.predict(x_around)

        gpx.logger.info('Multiclass gpx_y:{} / bb_y {}'.format(gpx_y, bb_y))
        gpx.logger.info('test_understand mult-class accuracy {}'.format(
            accuracy_score(gpx_y, bb_y)))
Example #11
import time

NUN_SAMPLES: int = 250
INSTANCE: int = 74
x, y = make_moons(n_samples=500, noise=.1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)

clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)

gpx = Gpx(clf.predict_proba,
          x_train=x_train,
          y_train=y_train,
          feature_names=['x', 'y'],
          num_samples=NUN_SAMPLES)


ns = NoiseSet(gpx)


def test_k_neighbor_opt(benchmark):
    benchmark.pedantic(ns.k_neighbor_adapter, args=(x_test[13, :], 4), iterations=3, rounds=30)


def test_k_neighbor_nor(benchmark):
    benchmark.pedantic(ns.noise_k_neighbor, args=(x_test[13, :], 4), iterations=3, rounds=30)
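
# These are pytest-benchmark fixtures: benchmark.pedantic() calls the target
# with fixed iterations/rounds for stable timings. A run along the lines of
#   pytest --benchmark-only
# should execute just these benchmarks (a standard pytest-benchmark flag).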