Exemple #1
0
    def score(self, X, y):
        """Return the balanced accuracy of the classifier on (X, y).

        Samples with a strictly positive decision value are predicted as
        class 1, all others as class 0, and the predictions are scored
        against ``y`` with balanced accuracy.
        """
        decision = self.decision_function(X)
        predictions = (decision > 0).astype(int)
        return metrics.balanced_accuracy_score(y, predictions)
Exemple #2
0
                # Test-fold feature matrix for variant C, densified with
                # .toarray() before being fed to the MLP below.
                X_test_c = extractor_c.transform(corpus_c_test).toarray()

                # Build classifiers
                # Three identically-seeded MLPs, one per feature-extraction
                # variant (a, b, c), so score differences come from the
                # features rather than the model initialisation.
                clf_a = MLPClassifier(random_state=1410)
                clf_b = MLPClassifier(random_state=1410)
                clf_c = MLPClassifier(random_state=1410)

                clf_a.fit(X_train_a, y_train)
                clf_b.fit(X_train_b, y_train)
                clf_c.fit(X_train_c, y_train)

                # Establish predictions
                y_pred_a = clf_a.predict(X_test_a)
                y_pred_b = clf_b.predict(X_test_b)
                y_pred_c = clf_c.predict(X_test_c)

                # Calculate scores
                score_a = balanced_accuracy_score(y_test, y_pred_a)
                score_b = balanced_accuracy_score(y_test, y_pred_b)
                score_c = balanced_accuracy_score(y_test, y_pred_c)

                print(score_a, score_b, score_c)

                # Store into the results tensor. The first index flattens
                # (fold, repeat) as fold + 2 * repeat — NOTE(review): this
                # assumes exactly 2 folds per repeat; confirm against the CV
                # split defined earlier in the file.
                scores[fold + 2 * repeat, 0, i, j] = score_a
                scores[fold + 2 * repeat, 1, i, j] = score_b
                scores[fold + 2 * repeat, 2, i, j] = score_c

                print(scores)

# Persist the full score tensor for later analysis.
np.save("n_gram_scores", scores)
Exemple #3
0
    def fit(self, X, y):
        """Fit the base ensemble and build diversity-pruned sub-ensembles.

        Pipeline:
          1. Fit a clone of ``self.base_estimator`` on (X, y) and keep its
             member classifiers in ``self.ensemble_``.
          2. For every member, measure how removing it shifts each of five
             diversity measures (entropy E, interrater agreement k,
             Kohavi-Wolpert variance, disagreement, Q statistic); the
             resulting 5 x n_members matrix is ``self.diversity_space``.
          3. For each measure and each cluster count in [2, max_clusters],
             cluster the members' diversity contributions with KMeans and
             keep, per cluster, the member with the highest training-set
             balanced accuracy -> ``self.pruned_ensembles``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Notes
        -----
        The matplotlib density-plot sections below are debugging artefacts.
        The "ICCS" block was previously live (its string delimiters were
        commented out) and ended in ``exit()``, so ``fit`` terminated the
        process instead of returning; it is now disabled like the "PHD"
        block before it.
        """
        X, y = check_X_y(X, y)
        self.classes_ = np.unique(y)
        self.n_features = X.shape[1]
        self.X_, self.y_ = X, y

        # Base clf, ensemble and subspaces
        self.clf_ = clone(self.base_estimator).fit(self.X_, self.y_)
        self.ensemble_ = self.clf_.estimators_
        # self.subspaces_ = self.clf_.subspaces

        # Mean member accuracy on the training set — the "p" input of the
        # diversity measures.
        p = np.mean(
            np.array([
                accuracy_score(self.y_, member_clf.predict(self.X_))
                for clf_ind, member_clf in enumerate(self.ensemble_)
            ]))

        # All measures for whole ensemble
        self.e, self.k, self.kw, self.dis, self.q = calc_diversity_measures(
            self.X_, self.y_, self.ensemble_, p)

        # Diversity space: entry [m, i] is (measure m of the full ensemble)
        # minus (measure m of the ensemble with member i removed).
        self.diversity_space = np.zeros((5, len(self.ensemble_)))
        for i in range(len(self.ensemble_)):
            temp_ensemble = self.ensemble_.copy()
            temp_ensemble.pop(i)
            # temp_subspaces = self.subspaces_[np.arange(len(self.subspaces_))!=1]

            # NOTE(review): p is recomputed over the FULL ensemble here, not
            # over temp_ensemble, so it is identical on every iteration. It
            # looks like temp_ensemble was intended — confirm against the
            # contract of calc_diversity_measures before changing.
            p = np.mean(
                np.array([
                    accuracy_score(self.y_, member_clf.predict(self.X_))
                    for clf_ind, member_clf in enumerate(self.ensemble_)
                ]))

            temp_e, temp_k, temp_kw, temp_dis, temp_q = calc_diversity_measures(
                self.X_, self.y_, temp_ensemble, p)
            self.diversity_space[0, i] = self.e - temp_e
            self.diversity_space[1, i] = self.k - temp_k
            self.diversity_space[2, i] = self.kw - temp_kw
            self.diversity_space[3, i] = self.dis - temp_dis
            self.diversity_space[4, i] = self.q - temp_q
        """
        # Density estimation plots PHD
        import matplotlib.pyplot as plt
        import matplotlib as mplt
        from matplotlib import rcParams
        rcParams['font.family'] = 'monospace'
        rcParams['font.size'] = 12

        fig, ax = plt.subplots(2, 3, figsize=(20,10))
        ax1 = plt.subplot2grid(shape=(2,6), loc=(0,0), colspan=2)
        ax2 = plt.subplot2grid((2,6), (0,2), colspan=2)
        ax3 = plt.subplot2grid((2,6), (0,4), colspan=2)
        ax4 = plt.subplot2grid((2,6), (1,1), colspan=2)
        ax5 = plt.subplot2grid((2,6), (1,3), colspan=2)
        axes = [ax1, ax2, ax3, ax4, ax5]
        dist_diversity_measures = ["The entropy measure E", "Measurement of interrater agreement k", "Kohavi-Wolpert variance", "The disagreement measure", "The Q statistics"]
        for i in range(self.diversity_space.shape[0]):
            axes[i].set_title(dist_diversity_measures[i])
            axes[i].set_xlabel("M measure")
            sns.distplot(self.diversity_space[i], hist=True, kde=True, color = (0.6015625,0.203125,0.17578125),
             hist_kws={'edgecolor':'black', 'color':'#d6adab', 'alpha':1.0},
             kde_kws={'linewidth': 4}, ax=axes[i], bins=8)
            sns.despine(top=True, right=True, left=False, bottom=False)
        # plt.show()
        plt.tight_layout()
        plt.savefig("density.png")
        plt.savefig("density.eps")
        exit()
        """

        """
        # Density estimation plots ICCS — debug-only code, disabled.
        # FIX: this block used to be live (the triple-quote delimiters were
        # commented out) and called exit(), so fit() saved the figures and
        # killed the process instead of returning. Re-enable manually only
        # when regenerating the plots.
        import matplotlib.pyplot as plt
        import matplotlib as mplt
        from matplotlib import rcParams
        rcParams['font.family'] = 'monospace'
        rcParams['font.size'] = 18

        fig, ax = plt.subplots(2, 2, figsize=(18, 10))
        ax = ax.ravel()
        dist_diversity_measures = [
            "The entropy measure E", "Measurement of interrater agreement k",
            "Kohavi-Wolpert variance", "The disagreement measure",
            "The Q statistics"
        ]
        for indx, i in enumerate([0, 1, 2, 4]):
            ax[indx].set_title(dist_diversity_measures[i])
            ax[indx].set_xlabel("M measure")
            sns.distplot(self.diversity_space[i],
                         hist=True,
                         kde=True,
                         color=(0.6015625, 0.203125, 0.17578125),
                         hist_kws={
                             'edgecolor': 'black',
                             'color': '#d6adab',
                             'alpha': 1.0
                         },
                         kde_kws={'linewidth': 4},
                         ax=ax[indx],
                         bins=8)
            sns.despine(top=True, right=True, left=False, bottom=False)
        # plt.show()
        plt.tight_layout()
        plt.savefig("density.png")
        plt.savefig("density.eps")
        exit()
        """

        # Clustering of each diversity row into 2..max_clusters clusters.
        # indexes: DIV x CLUSTERS x CLFS — cluster label of every member.
        self.indexes = np.zeros(
            (5, self.max_clusters - 1, len(self.ensemble_)))

        for div_inxd, div in enumerate(self.diversity_space):
            for clu_indx, n_clusters in enumerate(
                    range(2, self.max_clusters + 1)):
                self.kmeans = KMeans(n_clusters=n_clusters,
                                     random_state=self.random_state)
                # Each member contributes a single scalar per measure, hence
                # the reshape to an (n_members, 1) feature matrix.
                self.indexes[div_inxd, clu_indx] = self.kmeans.fit_predict(
                    div.reshape(-1, 1))

        # Balanced accuracy of every member on the training set — used to
        # pick the best representative of each cluster.
        base_scores = np.array([
            balanced_accuracy_score(self.y_, member_clf.predict(self.X_))
            for clf_ind, member_clf in enumerate(self.ensemble_)
        ])

        # Pruned ensembles: for every (measure, cluster count) pair keep the
        # best-scoring member of each cluster.
        self.pruned_ensembles = []
        ensemble_ = np.array(self.ensemble_)

        for div_inxd in range(5):
            for cluster_indx, n_clusters in enumerate(
                    range(2, self.max_clusters + 1)):
                self.pruned_ensemble_ = []
                for j in range(n_clusters):
                    cluster_ensemble = ensemble_[self.indexes[
                        div_inxd, cluster_indx] == j]
                    cluster_scores = base_scores[self.indexes[
                        div_inxd, cluster_indx] == j]
                    best = np.argmax(cluster_scores)
                    self.pruned_ensemble_.append(cluster_ensemble[best])
                self.pruned_ensembles.append(self.pruned_ensemble_)
        # NOTE(review): the inner lists have different lengths (2..max
        # clusters), so this builds a ragged array; recent NumPy requires
        # dtype=object for that — confirm the NumPy version in use.
        self.pruned_ensembles = np.array(self.pruned_ensembles)

        # Single measures
        """
        # Calculate chosen diversity measure for whole ensemble
        self.whole_diversity = calc_diversity_measures2(self.X_, self.y_, self.ensemble_, self.subspaces_, p, self.diversity)

        # Calculate diversity space
        self.diversity_space = np.zeros((len(self.ensemble_)))
        for i in range(len(self.ensemble_)):
            temp_ensemble = self.ensemble_.copy()
            temp_ensemble.pop(i)
            temp_subspaces = self.subspaces_[np.arange(len(self.subspaces_))!=1]

            if self.diversity == "k":
                p = np.mean(np.array([ accuracy_score(self.y_,member_clf.predict(X[:, self.subspaces_[clf_ind]])) for clf_ind, member_clf in enumerate(self.ensemble_)]))

            temp_diversity_space = self.whole_diversity - calc_diversity_measures2(self.X_, self.y_, temp_ensemble, temp_subspaces, p, self.diversity)
            self.diversity_space[i] = temp_diversity_space

        import matplotlib.pyplot as plt
        import matplotlib as mplt
        mplt.rcParams['axes.spines.right'] = False
        mplt.rcParams['axes.spines.top'] = False
        mplt.rcParams['axes.spines.left'] = False
        plt.figure(figsize=(8,1))
        plt.ylim(0, 0.2)
        plt.yticks([])
        # plt.xlim(-0.125, 0.075)
        plt.tight_layout()
        plt.vlines(0.05*self.diversity_space, 0, .2, color=(0.6015625,0.203125,0.17578125))
        plt.savefig("foo.png")
        # plt.show()
        exit()
        """

        return self
Exemple #4
0
"""
WORDS
"""
print("##############WORDS##############")
# Keys
for key in keys:
    print("%s fold scores:" % key)
    fold_scores = []
    for repeat in range(n_repeats):
        y_repeat = y[repeat]
        proba = np.load("probas_bert/%i_%s_old.npy" % (repeat, key))
        pred = np.argmax(proba, axis=2)

        for split in range(n_splits):
            fold_score = balanced_accuracy_score(y_repeat[split], pred[split])
            fold_scores.append(fold_score)
            print("%.3f" % fold_score)
    fold_scores = np.array(fold_scores)
    key_mean_score = np.mean(fold_scores)
    print("%s mean score: %.3f" % (key, key_mean_score))
    print("\n")

# Ensemble
fold_scores = []
print("Ensemble fold scores:")
for repeat in range(n_repeats):
    y_repeat = y[repeat]

    probas = []
    for key in keys:
Exemple #5
0
                # Load the stored probability predictions for this
                # (repeat, fold, key, iteration) combination into the
                # gathered tensor; the literal 4 is part of the file-name
                # scheme (meaning not visible from here).
                proba = np.load('probas_new/%i_%i_%i_%s_%s.npy' %
                                (repeat, 4, fold, key, i_s[i]))
                gathered[key_id, i, repeat, fold] = proba

print(gathered.shape)

# ALL ensemble
# REPEATS x SPLITS x SAMPLES x CLASSES
# NOTE(review): averaging over axes (0, 1, 4) vs. the 4-D shape stated above
# cannot be confirmed from this fragment — check gathered's construction.
mean_proba = np.mean(gathered, axis=(0, 1, 4))
print(mean_proba.shape)
# REPEATS x SPLITS x SAMPLES
pred = np.argmax(mean_proba, axis=3)
print(pred.shape)

# Scores
y = np.load("all_y_new_af.npy")
# Flatten (repeats, splits) into a single fold axis so folds can be scored
# in one loop against the matching rows of y.
pred = pred.reshape(n_repeats * n_splits, pred.shape[2])
print(y.shape)
print(pred.shape)

fold_scores = []
print("Ensemble fold scores:")
for fold in range(pred.shape[0]):
    score = balanced_accuracy_score(y[fold], pred[fold])
    fold_scores.append(score)
    print("%.3f" % score)

fold_scores = np.array(fold_scores)
ensemble_mean_score = np.mean(fold_scores)
print("Ensemble mean score: %.3f" % (ensemble_mean_score))
Exemple #6
0
        # print(means_a, means_b)
        # print(stds_a, stds_b)

        # Gaussian Naive Bayes on the 1-D summed-vector feature of each
        # variant; reshape(-1, 1) turns the per-sample sums into an
        # (n_samples, 1) feature matrix as the estimator expects.
        gaus_a = GaussianNB().fit(vecsum_a.reshape(-1, 1), y_train)
        gaus_b = GaussianNB().fit(vecsum_b.reshape(-1, 1), y_train)

        y_pred_ga = gaus_a.predict(vecsum_a_test.reshape(-1, 1))
        y_pred_gb = gaus_b.predict(vecsum_b_test.reshape(-1, 1))
        # Soft-vote fusion of the two models: sum their class-probability
        # matrices and take the argmax class per sample.
        y_pred_gc = np.argmax(np.sum(np.array([
            gaus_a.predict_proba(vecsum_a_test.reshape(-1, 1)),
            gaus_b.predict_proba(vecsum_b_test.reshape(-1, 1))
        ]),
                                     axis=0),
                              axis=1)

        # Balanced accuracy of model A, model B and the fused prediction.
        score_ga = balanced_accuracy_score(y_test, y_pred_ga)
        score_gb = balanced_accuracy_score(y_test, y_pred_gb)
        score_gc = balanced_accuracy_score(y_test, y_pred_gc)

        # print("GSCORES %.3f - %.3f" % (score_ga, score_gb))
        # exit()

        # Build classifiers
        # Two identically-seeded MLPs trained on the full feature matrices.
        clf_a = MLPClassifier(random_state=1410)
        clf_b = MLPClassifier(random_state=1410)

        clf_a.fit(X_train_a, y_train)
        clf_b.fit(X_train_b, y_train)

        # Make ensemble
        esm = np.array(