def score(self, X, y):
    supports = self.decision_function(X)
    y_pred = (supports > 0).astype(int)
    score = metrics.balanced_accuracy_score(y, y_pred)
    return score
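# Minimal standalone sketch of the same scoring rule (illustrative values,
# not part of the pipeline): threshold continuous supports at zero and
# score the resulting 0/1 predictions with balanced accuracy. Kept in a
# quoted block, following this file's convention for optional snippets.
"""
import numpy as np
from sklearn import metrics

supports = np.array([-1.2, 0.3, 2.1, -0.4])  # decision_function outputs
y_true = np.array([0, 1, 1, 1])
y_pred = (supports > 0).astype(int)          # -> [0, 1, 1, 0]
print(metrics.balanced_accuracy_score(y_true, y_pred))  # (1.0 + 2/3) / 2 = 0.833...
"""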
X_test_c = extractor_c.transform(corpus_c_test).toarray()

# Build classifiers
clf_a = MLPClassifier(random_state=1410)
clf_b = MLPClassifier(random_state=1410)
clf_c = MLPClassifier(random_state=1410)
clf_a.fit(X_train_a, y_train)
clf_b.fit(X_train_b, y_train)
clf_c.fit(X_train_c, y_train)

# Establish predictions
y_pred_a = clf_a.predict(X_test_a)
y_pred_b = clf_b.predict(X_test_b)
y_pred_c = clf_c.predict(X_test_c)

# Calculate scores
score_a = balanced_accuracy_score(y_test, y_pred_a)
score_b = balanced_accuracy_score(y_test, y_pred_b)
score_c = balanced_accuracy_score(y_test, y_pred_c)
print(score_a, score_b, score_c)

scores[fold + 2 * repeat, 0, i, j] = score_a
scores[fold + 2 * repeat, 1, i, j] = score_b
scores[fold + 2 * repeat, 2, i, j] = score_c

print(scores)
np.save("n_gram_scores", scores)
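# Later analysis sketch (assumed axis layout): np.save above writes
# "n_gram_scores.npy" as a 4-D array whose first axis indexes folds, so
# averaging over that axis yields one mean score per configuration. The
# meaning of axes i and j depends on the enclosing loops, which this
# fragment omits.
loaded_scores = np.load("n_gram_scores.npy")
mean_scores = np.mean(loaded_scores, axis=0)  # average over folds
print(mean_scores.shape)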
def fit(self, X, y):
    X, y = check_X_y(X, y)
    self.classes_ = np.unique(y)
    self.n_features = X.shape[1]
    self.X_, self.y_ = X, y

    # Base clf, ensemble and subspaces
    self.clf_ = clone(self.base_estimator).fit(self.X_, self.y_)
    self.ensemble_ = self.clf_.estimators_
    # self.subspaces_ = self.clf_.subspaces

    # Calculate mean accuracy on training set
    p = np.mean(
        np.array([
            accuracy_score(self.y_, member_clf.predict(self.X_))
            for clf_ind, member_clf in enumerate(self.ensemble_)
        ]))

    # All measures for whole ensemble
    self.e, self.k, self.kw, self.dis, self.q = calc_diversity_measures(
        self.X_, self.y_, self.ensemble_, p)

    # Calculate diversity space for all measures: column i holds the
    # leave-one-out change of every measure after removing member i.
    self.diversity_space = np.zeros((5, len(self.ensemble_)))
    for i in range(len(self.ensemble_)):
        temp_ensemble = self.ensemble_.copy()
        temp_ensemble.pop(i)
        # temp_subspaces = self.subspaces_[np.arange(len(self.subspaces_)) != 1]

        # Mean accuracy of the reduced ensemble (the original code averaged
        # over the full ensemble here, which left p unchanged).
        p = np.mean(
            np.array([
                accuracy_score(self.y_, member_clf.predict(self.X_))
                for clf_ind, member_clf in enumerate(temp_ensemble)
            ]))
        temp_e, temp_k, temp_kw, temp_dis, temp_q = calc_diversity_measures(
            self.X_, self.y_, temp_ensemble, p)
        self.diversity_space[0, i] = self.e - temp_e
        self.diversity_space[1, i] = self.k - temp_k
        self.diversity_space[2, i] = self.kw - temp_kw
        self.diversity_space[3, i] = self.dis - temp_dis
        self.diversity_space[4, i] = self.q - temp_q

    """
    # Density estimation plots PHD
    import matplotlib.pyplot as plt
    import matplotlib as mplt
    from matplotlib import rcParams
    rcParams['font.family'] = 'monospace'
    rcParams['font.size'] = 12
    fig, ax = plt.subplots(2, 3, figsize=(20, 10))
    ax1 = plt.subplot2grid(shape=(2, 6), loc=(0, 0), colspan=2)
    ax2 = plt.subplot2grid((2, 6), (0, 2), colspan=2)
    ax3 = plt.subplot2grid((2, 6), (0, 4), colspan=2)
    ax4 = plt.subplot2grid((2, 6), (1, 1), colspan=2)
    ax5 = plt.subplot2grid((2, 6), (1, 3), colspan=2)
    axes = [ax1, ax2, ax3, ax4, ax5]
    dist_diversity_measures = ["The entropy measure E",
                               "Measurement of interrater agreement k",
                               "Kohavi-Wolpert variance",
                               "The disagreement measure",
                               "The Q statistics"]
    for i in range(self.diversity_space.shape[0]):
        axes[i].set_title(dist_diversity_measures[i])
        axes[i].set_xlabel("M measure")
        sns.distplot(self.diversity_space[i], hist=True, kde=True,
                     color=(0.6015625, 0.203125, 0.17578125),
                     hist_kws={'edgecolor': 'black', 'color': '#d6adab',
                               'alpha': 1.0},
                     kde_kws={'linewidth': 4}, ax=axes[i], bins=8)
        sns.despine(top=True, right=True, left=False, bottom=False)
    # plt.show()
    plt.tight_layout()
    plt.savefig("density.png")
    plt.savefig("density.eps")
    exit()
    """

    # """
    # Density estimation plots ICCS
    # NOTE: this debug block ends with exit(), so fit() stops here while it
    # is enabled. Remove the leading '#' from the surrounding quote markers
    # to turn it back into a string and skip it.
    import matplotlib.pyplot as plt
    import matplotlib as mplt
    from matplotlib import rcParams
    rcParams['font.family'] = 'monospace'
    rcParams['font.size'] = 18
    fig, ax = plt.subplots(2, 2, figsize=(18, 10))
    # ax1 = plt.subplot2grid(shape=(2, 6), loc=(0, 0), colspan=2)
    # ax2 = plt.subplot2grid((2, 6), (0, 2), colspan=2)
    # ax3 = plt.subplot2grid((2, 6), (0, 4), colspan=2)
    # ax4 = plt.subplot2grid((2, 6), (1, 1), colspan=2)
    # ax5 = plt.subplot2grid((2, 6), (1, 3), colspan=2)
    # axes = [ax1, ax2, ax3, ax4, ax5]
    ax = ax.ravel()
    dist_diversity_measures = [
        "The entropy measure E",
        "Measurement of interrater agreement k",
        "Kohavi-Wolpert variance",
        "The disagreement measure",
        "The Q statistics"
    ]
    for indx, i in enumerate([0, 1, 2, 4]):
        ax[indx].set_title(dist_diversity_measures[i])
        ax[indx].set_xlabel("M measure")
        sns.distplot(self.diversity_space[i], hist=True, kde=True,
                     color=(0.6015625, 0.203125, 0.17578125),
                     hist_kws={
                         'edgecolor': 'black',
                         'color': '#d6adab',
                         'alpha': 1.0
                     },
                     kde_kws={'linewidth': 4}, ax=ax[indx], bins=8)
        sns.despine(top=True, right=True, left=False, bottom=False)
    # plt.show()
    plt.tight_layout()
    plt.savefig("density.png")
    plt.savefig("density.eps")
    exit()
    # """

    # Clustering
    # DIV x CLUSTERS x CLFS
    self.indexes = np.zeros(
        (5, self.max_clusters - 1, len(self.ensemble_)))
    for div_inxd, div in enumerate(self.diversity_space):
        for clu_indx, n_clusters in enumerate(
                range(2, self.max_clusters + 1)):
            self.kmeans = KMeans(n_clusters=n_clusters,
                                 random_state=self.random_state)
            self.indexes[div_inxd, clu_indx] = self.kmeans.fit_predict(
                div.reshape(-1, 1))
            # print(div_inxd, clu_indx)
            # print(self.indexes[div_inxd, clu_indx])

    # Plots
    # import matplotlib.pyplot as plt
    # import matplotlib as mplt
    # mplt.rcParams['axes.spines.right'] = False
    # mplt.rcParams['axes.spines.top'] = False
    # mplt.rcParams['axes.spines.left'] = False
    # plt.figure(figsize=(8, 1))
    # plt.ylim(0, 0.2)
    # plt.yticks([])
    # # plt.xlim(-0.125, 0.075)
    # plt.tight_layout()
    # colors = ["red", "blue", "green", "orange", "cyan", "pink", "black", "yellow"]
    # for j in range(self.max_clusters):
    #     plt.vlines(0.05 * self.diversity_space[0][self.kmeans.labels_ == j], 0, .2, color=colors[j])
    # plt.savefig("foo.png")

    # Calculate base models bac
    base_scores = np.array([
        balanced_accuracy_score(self.y_, member_clf.predict(self.X_))
        for clf_ind, member_clf in enumerate(self.ensemble_)
    ])
    # print(base_scores)
    # exit()

    # DIV x CLU x CLU
    # self.pruned_ensembles = np.zeros((5, self.max_clusters - 1, self.max_clusters - 1))
    self.pruned_ensembles = []
    # self.pruned_subspaces_ = []
    ensemble_ = np.array(self.ensemble_)
    for div_inxd in range(5):
        for cluster_indx, n_clusters in enumerate(
                range(2, self.max_clusters + 1)):
            self.pruned_ensemble_ = []
            for j in range(n_clusters):
                cluster_ensemble = ensemble_[self.indexes[
                    div_inxd, cluster_indx] == j]
                # cluster_subspaces = self.subspaces_[indexes == j]
                # print(cluster_ensemble.shape)
                cluster_scores = base_scores[self.indexes[
                    div_inxd, cluster_indx] == j]
                # print(cluster_scores)
                # Keep the most accurate member of each cluster
                best = np.argmax(cluster_scores)
                # print(best)
                # exit()
                self.pruned_ensemble_.append(cluster_ensemble[best])
            self.pruned_ensembles.append(self.pruned_ensemble_)
            # print(len(self.pruned_ensemble_))
            # self.pruned_subspaces_.append(cluster_subspaces[best])

    # dtype=object: the pruned ensembles have different lengths (one per
    # cluster count), so the resulting array is ragged.
    self.pruned_ensembles = np.array(self.pruned_ensembles, dtype=object)
    # print(self.pruned_ensembles.shape)
    # exit()

    # Single measures
    """
    # Calculate chosen diversity measure for whole ensemble
    self.whole_diversity = calc_diversity_measures2(
        self.X_, self.y_, self.ensemble_, self.subspaces_, p, self.diversity)

    # Calculate diversity space
    self.diversity_space = np.zeros((len(self.ensemble_)))
    for i in range(len(self.ensemble_)):
        temp_ensemble = self.ensemble_.copy()
        temp_ensemble.pop(i)
        temp_subspaces = self.subspaces_[np.arange(len(self.subspaces_)) != 1]
        if self.diversity == "k":
            p = np.mean(np.array([
                accuracy_score(self.y_, member_clf.predict(X[:, self.subspaces_[clf_ind]]))
                for clf_ind, member_clf in enumerate(self.ensemble_)
            ]))
            temp_diversity_space = self.whole_diversity - calc_diversity_measures2(
                self.X_, self.y_, temp_ensemble, temp_subspaces, p, self.diversity)
        self.diversity_space[i] = temp_diversity_space

    import matplotlib.pyplot as plt
    import matplotlib as mplt
    mplt.rcParams['axes.spines.right'] = False
    mplt.rcParams['axes.spines.top'] = False
    mplt.rcParams['axes.spines.left'] = False
    plt.figure(figsize=(8, 1))
    plt.ylim(0, 0.2)
    plt.yticks([])
    # plt.xlim(-0.125, 0.075)
    plt.tight_layout()
    plt.vlines(0.05 * self.diversity_space, 0, .2,
               color=(0.6015625, 0.203125, 0.17578125))
    plt.savefig("foo.png")
    # plt.show()
    exit()
    """

    return self
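# Usage sketch (hypothetical names): the surrounding estimator is not named
# in this fragment, so it is referred to below as PruningEnsemble; it assumes
# base_estimator exposes `estimators_` after fitting, as sklearn's
# BaggingClassifier does. Kept in a quoted block, following the file's
# convention for optional snippets.
"""
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=1410)
model = PruningEnsemble(  # hypothetical name for the class defined here
    base_estimator=BaggingClassifier(DecisionTreeClassifier(),
                                     n_estimators=20,
                                     random_state=1410),
    max_clusters=5,
    random_state=1410).fit(X, y)
# One pruned member subset per (diversity measure, cluster count) pair:
print(len(model.pruned_ensembles))  # 5 * (max_clusters - 1)
"""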
""" WORDS """ print("##############WORDS##############") # Keys for key in keys: print("%s fold scores:" % key) fold_scores = [] for repeat in range(n_repeats): y_repeat = y[repeat] proba = np.load("probas_bert/%i_%s_old.npy" % (repeat, key)) pred = np.argmax(proba, axis=2) for split in range(n_splits): fold_score = balanced_accuracy_score(y_repeat[split], pred[split]) fold_scores.append(fold_score) print("%.3f" % fold_score) fold_scores = np.array(fold_scores) key_mean_score = np.mean(fold_scores) print("%s mean score: %.3f" % (key, key_mean_score)) print("\n") # Ensemble fold_scores = [] print("Ensemble fold scores:") for repeat in range(n_repeats): y_repeat = y[repeat] probas = [] for key in keys:
proba = np.load('probas_new/%i_%i_%i_%s_%s.npy' %
                (repeat, 4, fold, key, i_s[i]))
gathered[key_id, i, repeat, fold] = proba

print(gathered.shape)

# ALL ensemble
# REPEATS x SPLITS x SAMPLES x CLASSES
mean_proba = np.mean(gathered, axis=(0, 1, 4))
print(mean_proba.shape)

# REPEATS x SPLITS x SAMPLES
pred = np.argmax(mean_proba, axis=3)
print(pred.shape)

# Scores
y = np.load("all_y_new_af.npy")
pred = pred.reshape(n_repeats * n_splits, pred.shape[2])
print(y.shape)
print(pred.shape)

fold_scores = []
print("Ensemble fold scores:")
for fold in range(pred.shape[0]):
    score = balanced_accuracy_score(y[fold], pred[fold])
    fold_scores.append(score)
    print("%.3f" % score)

fold_scores = np.array(fold_scores)
ensemble_mean_score = np.mean(fold_scores)
print("Ensemble mean score: %.3f" % (ensemble_mean_score))
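# The fusion above is plain soft voting. A self-contained miniature with
# illustrative values only: average per-model class probabilities, then
# take the argmax per sample.
demo_p1 = np.array([[0.8, 0.2], [0.4, 0.6]])
demo_p2 = np.array([[0.6, 0.4], [0.1, 0.9]])
demo_fused = np.mean([demo_p1, demo_p2], axis=0)  # -> [[0.7, 0.3], [0.25, 0.75]]
print(np.argmax(demo_fused, axis=1))              # -> [0 1]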
# print(means_a, means_b)
# print(stds_a, stds_b)

gaus_a = GaussianNB().fit(vecsum_a.reshape(-1, 1), y_train)
gaus_b = GaussianNB().fit(vecsum_b.reshape(-1, 1), y_train)
y_pred_ga = gaus_a.predict(vecsum_a_test.reshape(-1, 1))
y_pred_gb = gaus_b.predict(vecsum_b_test.reshape(-1, 1))
y_pred_gc = np.argmax(np.sum(np.array([
    gaus_a.predict_proba(vecsum_a_test.reshape(-1, 1)),
    gaus_b.predict_proba(vecsum_b_test.reshape(-1, 1))
]), axis=0), axis=1)
score_ga = balanced_accuracy_score(y_test, y_pred_ga)
score_gb = balanced_accuracy_score(y_test, y_pred_gb)
score_gc = balanced_accuracy_score(y_test, y_pred_gc)
# print("GSCORES %.3f - %.3f" % (score_ga, score_gb))
# exit()

# Build classifiers
clf_a = MLPClassifier(random_state=1410)
clf_b = MLPClassifier(random_state=1410)
clf_a.fit(X_train_a, y_train)
clf_b.fit(X_train_b, y_train)

# Make ensemble
esm = np.array(