# Train/test split and logistic regression fit
labels = pd.unique(y)
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)

clf = LogisticRegression(random_state=RANDOM_STATE)
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)

# Record train/test accuracy for this sampling / feature-selection combination
train_accuracy = metrics.accuracy_score(trnY, prd_trn)
test_accuracy = metrics.accuracy_score(tstY, prd_tst)
text = key
if do_feature_eng:
    text += ' with FS'
best_accuracies[text] = [train_accuracy, test_accuracy]

# Per-combination performance plot and confusion matrix
ds.plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
plt.suptitle('QOT Log Regression - ' + key + ' - Performance & Confusion matrix')
plt.savefig(subDir + 'QOT Log Regression - ' + key + ' - Performance & Confusion matrix')
plt.close("all")

# Summary bar chart comparing all combinations
plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(['Train', 'Test'], best_accuracies, ylabel='Accuracy')
plt.suptitle('QOT Sampling & Feature Selection')
plt.savefig(graphsDir + 'QOT Sampling & Feature Selection')
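# The ds.* calls above are project-specific plotting helpers. As a hedged sketch
# (not the project's actual helper code), the same train/test evaluation can be
# reproduced with plain scikit-learn; X, y and the 0.7 split mirror the code above:
#
#   from sklearn import metrics
#   from sklearn.linear_model import LogisticRegression
#   from sklearn.model_selection import train_test_split
#
#   trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)
#   clf = LogisticRegression(random_state=42).fit(trnX, trnY)
#   prd_tst = clf.predict(tstX)
#   print('test accuracy:', metrics.accuracy_score(tstY, prd_tst))
#   print('confusion matrix:\n', metrics.confusion_matrix(tstY, prd_tst))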
# Overfitting chart for each distance metric (one subplot per entry in dist)
d = dist[k]
ds.multiple_line_chart(nvalues, overfitting_values[d], ax=axs[0, k],
                       title='Overfitting for dist = %s' % (d),
                       xlabel='K Neighbours', ylabel='accuracy', percentage=True)
plt.suptitle('QOT Overfitting - KNN')
plt.savefig(subDir + 'QOT Overfitting - KNN')

# Refit KNN with the best (n_neighbors, metric) pair found during the sweep
clf = KNeighborsClassifier(n_neighbors=best[0], metric=best[1])
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
ds.plot_evaluation_results(["negative", "positive"], trnY, prd_trn, tstY, prd_tst)
plt.suptitle('QOT KNN - ' + key + ' - Performance & Confusion matrix - %d neighbors and %s' % (best[0], best[1]))
plt.savefig(subDir + 'QOT KNN - ' + key + ' - Performance & Confusion matrix')
plt.close("all")

# Summary bar charts comparing all sampling / feature-selection combinations
plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(['Train', 'Test'], best_accuracies, ylabel='Accuracy')
plt.suptitle('QOT Sampling & Feature Selection')
plt.savefig(graphsDir + 'QOT Sampling & Feature Selection')

plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(['Train', 'Test'], recalls, ylabel='Recall')
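# The sweep that fills nvalues, dist, overfitting_values and best is not part of
# this excerpt. A minimal, hedged sketch of how such a structure is typically built
# (the metric names and k values below are assumptions, not the project's settings):
#
#   dist = ['manhattan', 'euclidean', 'chebyshev']
#   nvalues = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
#   overfitting_values = {}
#   best, last_best = (0, ''), 0
#   for d in dist:
#       overfitting_values[d] = {'Train': [], 'Test': []}
#       for n in nvalues:
#           knn = KNeighborsClassifier(n_neighbors=n, metric=d)
#           knn.fit(trnX, trnY)
#           overfitting_values[d]['Train'].append(metrics.accuracy_score(trnY, knn.predict(trnX)))
#           tst_acc = metrics.accuracy_score(tstY, knn.predict(tstX))
#           overfitting_values[d]['Test'].append(tst_acc)
#           if tst_acc > last_best:
#               best, last_best = (n, d), tst_acc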
# Grid search over learning rate and number of estimators for gradient boosting
values = {}
for lr in learning_rate:
    yvalues = []
    for n in n_estimators:
        # New model per (d, lr, n) combination so the grid actually varies the hyperparameters
        gb = GradientBoostingClassifier(n_estimators=n, max_depth=d, learning_rate=lr)
        gb.fit(trnX, trnY)
        prdY = gb.predict(tstX)
        yvalues.append(metrics.accuracy_score(tstY, prdY))
        if yvalues[-1] > last_best:
            best = (d, lr, n)
            last_best = yvalues[-1]
            best_tree = gb
    values[lr] = yvalues

# Evaluate the best model found across the grid
prd_trn = best_tree.predict(trnX)
prd_tst = best_tree.predict(tstX)
ds.plot_evaluation_results(pd.unique(y), trnY, prd_trn, tstY, prd_tst, str(samp))
print(str(samp) + ': ' + str(last_best))

'''
UNDERSAMPLING
allknn = AllKNN()
trnX, trnY = allknn.fit_resample(trnX, trnY)

nm = NearMiss()
trnX, trnY = nm.fit_resample(trnX, trnY)

OVERSAMPLING
smt = SMOTE()
trnX, trnY = smt.fit_resample(trnX, trnY)

ada = ADASYN(random_state=42)
trnX, trnY = ada.fit_resample(trnX, trnY)
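NOTE (hedged sketch, not part of the original listing): all of these resamplers
should be applied to the training split only, after train_test_split, and through
fit_resample (fit_sample is no longer available in newer imbalanced-learn releases).
Assuming feature matrix X and target y, the oversampling step looks like:

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)
print('class balance before:', Counter(trnY))
smt = SMOTE(random_state=42)
trnX, trnY = smt.fit_resample(trnX, trnY)   # only the training split is resampled
print('class balance after: ', Counter(trnY))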