# --- Esempio n. 1 (Example 1) ---
# 0
# 0.952 (+/-0.057) for {'alpha': 0.0001, 'hidden_layer_sizes': (30, 11)}
# 0.947 (+/-0.036) for {'alpha': 0.001, 'hidden_layer_sizes': (30, 11)}
#

# temporary values to be replaced
ideal = [0, 0, 0]
maxi = 0.
# graph = np.zeros(hlayers.count)

# Grid-search the candidate hidden-layer sizes in ``hlayers``: for each
# size, run 100 iterations of 10-fold cross-validation on an MLP and keep
# the configuration with the best mean F1 score.
# NOTE(review): ``hlayers``, ``data`` and ``target`` are defined earlier
# in the original script, outside this chunk.
print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity")
for x in hlayers:
    temp = metrics.repeatedCrossValidatedScores(data,
                                                target,
                                                MLPClassifier(
                                                    solver='lbfgs',
                                                    alpha=0.001,
                                                    hidden_layer_sizes=x,
                                                    random_state=1),
                                                iterations=100,
                                                cv=10)
    metrics.printAverages(x, temp)

    # Compute the mean once instead of twice per candidate.
    mean_f1 = np.average(temp['test_f1'])
    if mean_f1 > maxi:
        maxi = mean_f1
        ideal = x

# print the best layer size and its F1 score
# BUG FIX: the tracked metric is F1, not accuracy — label it correctly.
print(str(ideal) + " gives " + str(maxi * 100) + "% F1 score")
#print("The standard deviation was " + str(maxi[1] * 100) + "%")
# --- Esempio n. 2 (Example 2) ---
# 0
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import metrics
import matplotlib.pyplot as plt

# Best mean CV accuracy seen so far and the feature count that produced it.
maxi = 0
ideal = (0, 0)

num_features = []
accs = []

# Evaluate a 256-tree Random Forest on three candidate feature counts.
for features in [5, 6, 13]:
    data, target = metrics.preprocess(k=features, fsiter=100)
    temp = metrics.repeatedCrossValidatedScores(
        data,
        target,
        RandomForestClassifier(n_estimators=256),
        iterations=100,
        cv=10)
    metrics.printAverages(features, temp)

    mean_acc = np.average(temp['test_accuracy'])
    num_features.append(features)
    accs.append(mean_acc)

    # BUG FIX: maxi/ideal were never updated, so the summary line below
    # always printed "(0, 0) gives 0% accuracy".
    if mean_acc > maxi:
        maxi = mean_acc
        ideal = features

# BUG FIX: accuracy is a fraction; scale to percent to match the "%" label.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Random Forest", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
# --- Esempio n. 3 (Example 3) ---
# 0
import arff
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# NOTE(review): Imputer was removed in scikit-learn >= 0.22 and is unused
# here — this import will fail on modern sklearn; confirm before cleanup.
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import metrics

# Select the 10 best features (1000 feature-selection iterations).
data, target = metrics.preprocess(k=10, fsiter=1000)

print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")

# Run the repeated cross-validation once and reuse the result.
# BUG FIX: the original ran the identical 50-iteration, 10-fold CV twice
# (once for printAverages, once for the accuracy line), doubling the work.
model = LogisticRegression(C=1000)
results = metrics.repeatedCrossValidatedScores(
    data, target, model, cv=10, iterations=50)  # gives average accuracy
metrics.printAverages(1000, results)
# Report mean accuracy with a +/- 2-standard-deviation band, in percent.
print("Accuracy: %0.2f (+/- %0.2f)" %
      (results['test_accuracy'].mean() * 100,
       results['test_accuracy'].std() * 200))
# --- Esempio n. 4 (Example 4) ---
# 0
from sklearn.ensemble import RandomForestClassifier
import metrics
import matplotlib.pyplot as plt

data, target = metrics.preprocess(k=13, fsiter=1000)

# Sweep the number of trees (64..256 in steps of 8) and record the mean
# repeated-cross-validation accuracy for each setting.
C_range = range(64, 258, 8)
accuracy_scores = []
for c in C_range:
    rf = RandomForestClassifier(n_estimators=c)
    scores = metrics.repeatedCrossValidatedScores(data,
                                                  target,
                                                  rf,
                                                  cv=10,
                                                  iterations=50)
    # BUG FIX: scores were appended only when > 0.9, so accuracy_scores
    # could be shorter than C_range and plt.plot() below would raise a
    # length-mismatch ValueError. Record every score instead.
    accuracy_scores.append(scores['test_accuracy'].mean())
    metrics.printAverages(c, scores)

plt.plot(C_range, accuracy_scores)
plt.title('Random Forest Optimization', size=11, fontweight='bold')
plt.xlabel('Number of Estimators', size=8)
plt.ylabel('Accuracy', size=8)
plt.show()
# --- Esempio n. 5 (Example 5) ---
# 0
import numpy as np
from sklearn import tree
import metrics
import matplotlib.pyplot as plt

# Best mean CV accuracy seen so far and the feature count that produced it.
maxi = 0
ideal = (0, 0)

num_features = []
accs = []

# Evaluate a plain decision tree for every feature-selection size 1..24.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    temp = metrics.repeatedCrossValidatedScores(data, target,
                                                tree.DecisionTreeClassifier(),
                                                iterations=100, cv=10)
    metrics.printAverages(features, temp)

    mean_acc = np.average(temp['test_accuracy'])
    num_features.append(features)
    accs.append(mean_acc)

    # BUG FIX: maxi/ideal were never updated, so the summary line below
    # always printed "(0, 0) gives 0% accuracy".
    if mean_acc > maxi:
        maxi = mean_acc
        ideal = features

# BUG FIX: accuracy is a fraction; scale to percent to match the "%" label.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Decision Trees", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 6, 8, 12, 16, 20, 24])
plt.show()
from sklearn.neighbors import KNeighborsClassifier
# BUG FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21
# and removed in 0.23; use the standalone joblib package (a hard sklearn
# dependency), falling back for very old installations.
try:
    import joblib
except ImportError:  # scikit-learn < 0.23
    from sklearn.externals import joblib
import metrics

# Train a 1-nearest-neighbour classifier on the 8 best features,
# report its repeated-cross-validation metrics, then persist it to disk.
data, target = metrics.preprocess(k=8, fsiter=1000)
clf = KNeighborsClassifier(n_neighbors=1)

scores = metrics.repeatedCrossValidatedScores(data,
                                              target,
                                              clf,
                                              iterations=1000,
                                              cv=10)

metrics.printAverages('clf', scores)

clf.fit(data, target)

# compress=9: smallest file, slowest dump — fine for a one-off artifact.
joblib.dump(clf, 'classifier.pkl', compress=9)
import numpy as np
from sklearn import svm
import metrics
import warnings
warnings.filterwarnings("ignore")

# temporary values to be replaced
ideal = [0, 0, 0]
maxi = 0.

# For every feature count k in 1..24, cross-validate a linear SVM and keep
# the k that yields the best mean F1 score.
print("#features/tp/tn/fp/fn/f1/precision/sensitivity/specificity")
for x in range(1, 25):
    data, target = metrics.preprocess(k=x)
    temp = metrics.repeatedCrossValidatedScores(
        data, target,
        svm.SVC(C=1,
                kernel='linear',
                decision_function_shape='ovo',
                random_state=6))
    metrics.printAverages(x, temp)

    # Compute the mean once instead of twice per candidate.
    mean_f1 = np.average(temp['test_f1'])
    if mean_f1 > maxi:
        maxi = mean_f1
        ideal = x

# print the best feature count and its F1 score
# BUG FIX: the tracked metric is F1, not accuracy — label it correctly.
print(str(ideal) + " gives " + str(maxi * 100) + "% F1 score")
# --- Esempio n. 8 (Example 8) ---
# 0
import numpy as np  # BUG FIX: np.average was used below but numpy was never imported
from sklearn.neighbors import KNeighborsClassifier
import metrics
import matplotlib.pyplot as plt

# Best mean CV accuracy seen so far and the (features, neighbours) pair
# that produced it.
maxi = 0
ideal = (0, 0)

num_features = []
accs = []

# Evaluate 1-NN for every feature-selection size 1..24.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    for neighbours in [1]:
        temp = metrics.repeatedCrossValidatedScores(
            data,
            target,
            KNeighborsClassifier(n_neighbors=neighbours),
            iterations=100,
            cv=10)
        metrics.printAverages((features, neighbours), temp)

        mean_acc = np.average(temp['test_accuracy'])
        num_features.append(features)
        accs.append(mean_acc)

        # BUG FIX: maxi/ideal were never updated, so the summary line
        # below always printed "(0, 0) gives 0% accuracy".
        if mean_acc > maxi:
            maxi = mean_acc
            ideal = (features, neighbours)

# BUG FIX: accuracy is a fraction; scale to percent to match the "%" label.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for KNN", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
# --- Esempio n. 9 (Example 9) ---
# 0
    # NOTE(review): this snippet is truncated — the opening of this dict
    # literal (presumably ``datasets = {``) and the ``classifiers`` mapping
    # it is keyed against below are missing from this chunk.
    'Decision Tree': metrics.preprocess(k=6, fsiter=1000, scaling=False),
    'Random Forest': metrics.preprocess(k=13, fsiter=1000, scaling=False),
    'Logistic Regression': metrics.preprocess(k=11, fsiter=1000),
    'Neural Network': metrics.preprocess(k=8, fsiter=1000),
    'Nearest Neighbours': metrics.preprocess(k=8, fsiter=1000)
}

# Cross-validate every classifier on its own preselected dataset and
# collect mean accuracy / sensitivity / specificity. Each value is offset
# by 0.9 — presumably so the bars pair with the 90%-based y-ticks below;
# confirm against the missing ``ax.bar`` bottom argument.
accuracies = []
sensitivities = []
specificities = []
for key in classifiers.keys():
    print(key)
    data, target = datasets[key]
    temp = metrics.repeatedCrossValidatedScores(data,
                                                target,
                                                classifiers[key],
                                                iterations=100,
                                                cv=10)
    metrics.printAverages(key, temp)
    accuracies.append(np.average(temp['test_accuracy']) - 0.9)
    sensitivities.append(np.average(temp['test_sensitivity']) - 0.9)
    specificities.append(np.average(temp['test_specificity']) - 0.9)

# Grouped bar chart, one cluster of bars per algorithm.
plt.figure()
ax = plt.subplot()
plt.xticks([2, 6, 10, 14, 18, 22], classifiers.keys(), size=5.5)
plt.yticks([0.9, 0.925, 0.95, 0.975, 1.0],
           ["90%", "92.5%", "95%", "97.5%", "100%"])
plt.title('Relative Success with optimal features')
plt.xlabel('Algorithm', size=8)
# NOTE(review): the ``ax.bar(...)`` call below is cut off mid-argument —
# the remainder of this snippet was lost in extraction.
sens = ax.bar([1, 5, 9, 13, 17, 21],
# --- Esempio n. 10 (Example 10) ---
# 0
import numpy as np  # BUG FIX: np.average was used below but numpy was never imported
from sklearn.svm import SVC  # BUG FIX: SVC was used below but never imported
import metrics
import matplotlib.pyplot as plt

# Best mean CV accuracy seen so far and the feature count that produced it.
maxi = 0
ideal = (0, 0)

num_features = []
accs = []

# Evaluate a linear SVM for every feature-selection size 1..24.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=100)
    temp = metrics.repeatedCrossValidatedScores(
        data,
        target,
        SVC(C=1,
            kernel='linear',
            decision_function_shape='ovo',
            random_state=1),
        iterations=100,
        cv=10)
    metrics.printAverages(features, temp)

    mean_acc = np.average(temp['test_accuracy'])
    num_features.append(features)
    accs.append(mean_acc)

    # BUG FIX: maxi/ideal were never updated, so the summary line below
    # always printed "(0, 0) gives 0% accuracy".
    if mean_acc > maxi:
        maxi = mean_acc
        ideal = features

# BUG FIX: accuracy is a fraction; scale to percent to match the "%" label.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Support Vector Machine", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
import numpy as np
from sklearn.linear_model import LogisticRegression
import metrics
import matplotlib.pyplot as plt

# Best mean CV accuracy seen so far and the feature count that produced it.
maxi = 0
ideal = (0, 0)

num_features = []
accs = []

# Evaluate logistic regression for every feature-selection size 1..24.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    temp = metrics.repeatedCrossValidatedScores(data,
                                                target,
                                                LogisticRegression(C=1000),
                                                iterations=100,
                                                cv=10)
    metrics.printAverages(features, temp)

    mean_acc = np.average(temp['test_accuracy'])
    num_features.append(features)
    accs.append(mean_acc)

    # BUG FIX: maxi/ideal were never updated, so the summary line below
    # always printed "(0, 0) gives 0% accuracy".
    if mean_acc > maxi:
        maxi = mean_acc
        ideal = features

# BUG FIX: accuracy is a fraction; scale to percent to match the "%" label.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Logistic Regression", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
plt.xticks([0, 4, 8, 11, 12, 16, 20, 24])