# 0.952 (+/-0.057) for {'alpha': 0.0001, 'hidden_layer_sizes': (30, 11)}
# 0.947 (+/-0.036) for {'alpha': 0.001, 'hidden_layer_sizes': (30, 11)}
#
# temporary values to be replaced
ideal = [0, 0, 0]  # best hidden-layer configuration found so far
maxi = 0.          # best mean F1 score found so far
# graph = np.zeros(hlayers.count)

# Evaluate each candidate hidden-layer size with repeated cross-validation
# and keep the configuration with the highest mean F1 score.
# NOTE(review): `hlayers`, `data` and `target` are defined earlier in the
# file (outside this chunk) — confirm before running standalone.
print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity")
for x in hlayers:
    temp = metrics.repeatedCrossValidatedScores(
        data, target,
        MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=x,
                      random_state=1),
        iterations=100, cv=10)
    metrics.printAverages(x, temp)
    mean_f1 = np.average(temp['test_f1'])  # hoisted: computed once per config
    if mean_f1 > maxi:
        maxi = mean_f1
        ideal = x

# BUG FIX: the tracked metric is F1 (test_f1), so label the summary
# accordingly — it previously claimed "% accuracy".
print(str(ideal) + " gives " + str(maxi * 100) + "% F1 score")
#print("The standard deviation was " + str(maxi[1] * 100) + "%")
import numpy as np
from sklearn.ensemble import RandomForestClassifier

import metrics
import matplotlib.pyplot as plt

# Best mean cross-validated accuracy seen so far and the feature count
# that produced it.
maxi = 0
ideal = (0, 0)
num_features = []
accs = []

# Evaluate a 256-tree random forest for each candidate number of
# selected features.
for features in [5, 6, 13]:
    data, target = metrics.preprocess(k=features, fsiter=100)
    temp = metrics.repeatedCrossValidatedScores(
        data, target, RandomForestClassifier(n_estimators=256),
        iterations=100, cv=10)
    metrics.printAverages(features, temp)
    num_features.append(features)
    acc_mean = np.average(temp['test_accuracy'])
    accs.append(acc_mean)
    # BUG FIX: maxi/ideal were never updated, so the summary line always
    # reported "(0, 0) gives 0% accuracy".  Track the best result here.
    if acc_mean > maxi:
        maxi = acc_mean
        ideal = features

# print the best feature count and its accuracy as a percentage
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Random Forest", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
import arff
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import metrics

# Select the 10 best features, then score a logistic-regression classifier
# with repeated 10-fold cross-validation.
data, target = metrics.preprocess(k=10, fsiter=1000)

print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")

# Logistic regression with weak regularisation (large C).
model = LogisticRegression(C=1000)

# PERF FIX: the original scored an identical LogisticRegression(C=1000)
# model twice (2 x 50 CV iterations), doubling the runtime for no extra
# information.  One repeated-CV run now feeds both outputs.
results = metrics.repeatedCrossValidatedScores(
    data, target, model, cv=10, iterations=50)

# prints the per-metric averages
metrics.printAverages(1000, results)
# prints mean accuracy with a ~95% interval (2 standard deviations)
print("Accuracy: %0.2f (+/- %0.2f)" %
      (results['test_accuracy'].mean() * 100,
       results['test_accuracy'].std() * 200))
from sklearn.ensemble import RandomForestClassifier

import metrics
import matplotlib.pyplot as plt

# Sweep the number of trees in the forest and plot mean accuracy vs. size.
data, target = metrics.preprocess(k=13, fsiter=1000)
C_range = range(64, 258, 8)
accuracy_scores = []

for c in C_range:
    rf = RandomForestClassifier(n_estimators=c)
    scores = metrics.repeatedCrossValidatedScores(data, target, rf,
                                                  cv=10, iterations=50)
    # BUG FIX: scores were previously appended only when above 0.9, which
    # left accuracy_scores shorter than C_range and crashed plt.plot()
    # with a dimension mismatch.  Record every score instead.
    accuracy_scores.append(scores['test_accuracy'].mean())
    metrics.printAverages(c, scores)

plt.plot(C_range, accuracy_scores)
plt.title('Random Forest Optimization', size=11, fontweight='bold')
plt.xlabel('Number of Estimators', size=8)
plt.ylabel('Accuracy', size=8)
plt.show()
import numpy as np
from sklearn import tree

import metrics
import matplotlib.pyplot as plt

# Best mean cross-validated accuracy seen so far and the feature count
# that produced it.
maxi = 0
ideal = (0, 0)
num_features = []
accs = []

# Evaluate a decision tree for every candidate number of selected features.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    temp = metrics.repeatedCrossValidatedScores(
        data, target, tree.DecisionTreeClassifier(),
        iterations=100, cv=10)
    metrics.printAverages(features, temp)
    num_features.append(features)
    acc_mean = np.average(temp['test_accuracy'])
    accs.append(acc_mean)
    # BUG FIX: maxi/ideal were never updated, so the summary line always
    # reported "(0, 0) gives 0% accuracy".  Track the best result here.
    if acc_mean > maxi:
        maxi = acc_mean
        ideal = features

# print the best feature count and its accuracy as a percentage
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Decision Trees", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 6, 8, 12, 16, 20, 24])
plt.show()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib

import metrics

# Build the final 1-nearest-neighbour classifier on the 8 best features,
# report its repeated-cross-validation scores, then persist the model.
data, target = metrics.preprocess(k=8, fsiter=1000)
model = KNeighborsClassifier(n_neighbors=1)

cv_results = metrics.repeatedCrossValidatedScores(
    data, target, model, iterations=1000, cv=10)
metrics.printAverages('clf', cv_results)

# Fit on the full data set and serialise the trained model to disk.
model.fit(data, target)
joblib.dump(model, 'classifier.pkl', compress=9)
import numpy as np
from sklearn import svm

import metrics
import warnings

warnings.filterwarnings("ignore")

# temporary values to be replaced
ideal = [0, 0, 0]  # best number of features found so far
maxi = 0.          # best mean F1 score found so far
# graph = np.zeros(hlayers.count)

# Evaluate a linear SVM for each candidate number of selected features and
# keep the feature count with the highest mean F1 score.
print("#features/tp/tn/fp/fn/f1/precision/sensitivity/specificity")
for x in range(1, 25):
    data, target = metrics.preprocess(k=x)
    temp = metrics.repeatedCrossValidatedScores(
        data, target,
        svm.SVC(C=1, kernel='linear', decision_function_shape='ovo',
                random_state=6))
    metrics.printAverages(x, temp)
    mean_f1 = np.average(temp['test_f1'])  # hoisted: computed once per count
    if mean_f1 > maxi:
        maxi = mean_f1
        ideal = x

# BUG FIX: the tracked metric is F1 (test_f1), so label the summary
# accordingly — it previously claimed "% accuracy".
print(str(ideal) + " gives " + str(maxi * 100) + "% F1 score")
#print("The standard deviation was " + str(maxi[1] * 100) + "%")
import numpy as np  # BUG FIX: np.average is used below but was never imported
from sklearn.neighbors import KNeighborsClassifier

import metrics
import matplotlib.pyplot as plt

# Best mean cross-validated accuracy seen so far and the
# (features, neighbours) pair that produced it.
maxi = 0
ideal = (0, 0)
num_features = []
accs = []

# Evaluate 1-NN for every candidate number of selected features.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    for neighbours in [1]:
        temp = metrics.repeatedCrossValidatedScores(
            data, target, KNeighborsClassifier(n_neighbors=neighbours),
            iterations=100, cv=10)
        metrics.printAverages((features, neighbours), temp)
        num_features.append(features)
        acc_mean = np.average(temp['test_accuracy'])
        accs.append(acc_mean)
        # BUG FIX: maxi/ideal were never updated, so the summary line
        # always reported "(0, 0) gives 0% accuracy".
        if acc_mean > maxi:
            maxi = acc_mean
            ideal = (features, neighbours)

# print the best configuration and its accuracy as a percentage
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for KNN", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
'Decision Tree': metrics.preprocess(k=6, fsiter=1000, scaling=False),
'Random Forest': metrics.preprocess(k=13, fsiter=1000, scaling=False),
'Logistic Regression': metrics.preprocess(k=11, fsiter=1000),
'Neural Network': metrics.preprocess(k=8, fsiter=1000),
'Nearest Neighbours': metrics.preprocess(k=8, fsiter=1000)
}
# Per-classifier mean scores, each offset by -0.9 — presumably so the bars
# below plot against a 0.9 baseline that the y-tick labels render as
# 90%-100%; confirm against the (off-screen) ax.bar keyword arguments.
accuracies = []
sensitivities = []
specificities = []
# Score every classifier on its own preselected feature set.
# NOTE(review): `classifiers` and `datasets` are defined above this chunk
# (the dict literal closed above is the tail of one of them).
for key in classifiers.keys():
    print(key)
    data, target = datasets[key]
    temp = metrics.repeatedCrossValidatedScores(data, target,
                                                classifiers[key],
                                                iterations=100, cv=10)
    metrics.printAverages(key, temp)
    accuracies.append(np.average(temp['test_accuracy']) - 0.9)
    sensitivities.append(np.average(temp['test_sensitivity']) - 0.9)
    specificities.append(np.average(temp['test_specificity']) - 0.9)
# Grouped bar chart: one x-position cluster per algorithm.
plt.figure()
ax = plt.subplot()
plt.xticks([2, 6, 10, 14, 18, 22], classifiers.keys(), size=5.5)
plt.yticks([0.9, 0.925, 0.95, 0.975, 1.0],
           ["90%", "92.5%", "95%", "97.5%", "100%"])
plt.title('Relative Success with optimal features')
plt.xlabel('Algorithm', size=8)
# Statement continues past this chunk (remaining ax.bar arguments off-screen).
sens = ax.bar([1, 5, 9, 13, 17, 21],
import numpy as np  # BUG FIX: np.average is used below but was never imported
from sklearn.svm import SVC  # BUG FIX: SVC is used below but was never imported

import metrics
import matplotlib.pyplot as plt

# Best mean cross-validated accuracy seen so far and the feature count
# that produced it.
maxi = 0
ideal = (0, 0)
num_features = []
accs = []

# Evaluate a linear SVM for every candidate number of selected features.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=100)
    temp = metrics.repeatedCrossValidatedScores(
        data, target,
        SVC(C=1, kernel='linear', decision_function_shape='ovo',
            random_state=1),
        iterations=100, cv=10)
    metrics.printAverages(features, temp)
    num_features.append(features)
    acc_mean = np.average(temp['test_accuracy'])
    accs.append(acc_mean)
    # BUG FIX: maxi/ideal were never updated, so the summary line always
    # reported "(0, 0) gives 0% accuracy".  Track the best result here.
    if acc_mean > maxi:
        maxi = acc_mean
        ideal = features

# print the best feature count and its accuracy as a percentage
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Support Vector Machine", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
import numpy as np
from sklearn.linear_model import LogisticRegression

import metrics
import matplotlib.pyplot as plt

# Best mean cross-validated accuracy seen so far and the feature count
# that produced it.
maxi = 0
ideal = (0, 0)
num_features = []
accs = []

# Evaluate logistic regression for every candidate number of selected
# features.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    temp = metrics.repeatedCrossValidatedScores(
        data, target, LogisticRegression(C=1000),
        iterations=100, cv=10)
    metrics.printAverages(features, temp)
    num_features.append(features)
    acc_mean = np.average(temp['test_accuracy'])
    accs.append(acc_mean)
    # BUG FIX: maxi/ideal were never updated, so the summary line always
    # reported "(0, 0) gives 0% accuracy".  Track the best result here.
    if acc_mean > maxi:
        maxi = acc_mean
        ideal = features

# print the best feature count and its accuracy as a percentage
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Logistic Regression", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
plt.xticks([0, 4, 8, 11, 12, 16, 20, 24])