Ejemplo n.º 1
0
from sklearn.ensemble import RandomForestClassifier
import metrics
import matplotlib.pyplot as plt

# Load the dataset through the project's metrics.preprocess helper.
# NOTE(review): k looks like the number of selected features -- confirm
# against metrics.preprocess.
data, target = metrics.preprocess(k=13, fsiter=1000)

# Forest sizes to evaluate: 64, 72, ..., 256 trees.
C_range = range(64, 258, 8)

# Only scores above 0.9 are plotted. The forest size belonging to each kept
# score is recorded as well, so the x and y series always have equal length
# (plotting the full C_range against a filtered score list raises an error
# as soon as one configuration falls below the threshold).
plotted_estimators = []
accuracy_scores = []
for c in C_range:
    rf = RandomForestClassifier(n_estimators=c)
    scores = metrics.repeatedCrossValidatedScores(data,
                                                  target,
                                                  rf,
                                                  cv=10,
                                                  iterations=50)
    temp = scores['test_accuracy'].mean()
    if temp > 0.9:
        plotted_estimators.append(c)
        accuracy_scores.append(temp)
    # Every configuration is reported, including those left out of the plot.
    metrics.printAverages(c, scores)

plt.plot(plotted_estimators, accuracy_scores)
plt.title('Random Forest Optimization', size=11, fontweight='bold')
plt.xlabel('Number of Estimators', size=8)
plt.ylabel('Accuracy', size=8)
plt.show()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib
import metrics

# Load the dataset through the project's metrics.preprocess helper.
data, target = metrics.preprocess(k=8, fsiter=1000)

# Single-nearest-neighbour classifier.
clf = KNeighborsClassifier(n_neighbors=1)

# Score the classifier with the project's repeated cross-validation helper
# (cv=10, 1000 iterations) and print the averaged metrics.
scores = metrics.repeatedCrossValidatedScores(
    data, target, clf, cv=10, iterations=1000)
metrics.printAverages('clf', scores)

# Fit on the complete dataset, then write the fitted model to disk with
# joblib at compression level 9.
clf.fit(data, target)
joblib.dump(clf, 'classifier.pkl', compress=9)
Ejemplo n.º 3
0
import arff
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import metrics

# Load the dataset through the project's metrics.preprocess helper.
data, target = metrics.preprocess(k=10, fsiter=1000)

# Column header for the line emitted by metrics.printAverages below.
print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")

# First evaluation: a fresh LogisticRegression(C=1000), scored with the
# project's repeated cross-validation helper and reported under label 1000.
temp = metrics.repeatedCrossValidatedScores(data,
                                            target,
                                            LogisticRegression(C=1000),
                                            cv=10,
                                            iterations=50)
metrics.printAverages(1000, temp)

# Second evaluation: an identically configured classifier instance, scored
# with the same settings.
model = LogisticRegression(C=1000)
results = metrics.repeatedCrossValidatedScores(data,
                                               target,
                                               model,
                                               cv=10,
                                               iterations=50)

# Report the mean accuracy as a percentage, with a spread of two standard
# deviations also expressed in percent (std * 2 * 100).
test_accuracy = results['test_accuracy']
print("Accuracy: %0.2f (+/- %0.2f)" %
      (test_accuracy.mean() * 100, test_accuracy.std() * 200))
import metrics
import warnings
# Silence all warnings for the remainder of the script.
warnings.filterwarnings("ignore")

# Load the dataset through the project's metrics.preprocess helper, with
# scaling switched off.
data, target = metrics.preprocess(k=8, fsiter=1000, scaling=False)

# Best hidden-layer width found so far and its mean F1 score.
ideal = 0
maxi = 0

# Try every single-hidden-layer width from 1 to 99 and keep the one with the
# highest mean F1 score.
print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")
for x in range(1, 100):
    temp = metrics.repeatedCrossValidatedScores(data,
                                                target,
                                                MLPClassifier(
                                                    solver='lbfgs',
                                                    alpha=1e-5,
                                                    hidden_layer_sizes=x,
                                                    random_state=1,
                                                ),
                                                iterations=20,
                                                cv=10)
    metrics.printAverages(x, temp)
    # Compute the mean once and reuse it for both the comparison and the update.
    mean_f1 = np.average(temp['test_f1'])
    if mean_f1 > maxi:
        maxi = mean_f1
        ideal = x

# Report the winner. The selection metric is the mean F1 score (as returned by
# the scoring helper), so it is labelled as such rather than as "% accuracy".
print(str(ideal) + " gives a mean F1 score of " + str(maxi))
# Oversample with SMOTE before the search.
# NOTE(review): newer imbalanced-learn releases name this method
# fit_resample -- confirm against the installed version.
data, target = SMOTE().fit_sample(data, target)

# Best configuration found so far, its mean F1 score, and every configuration
# whose mean F1 score exceeds 0.8.
ideal = (0, 0, 0)
maxi = 0
best = []

# Try every three-hidden-layer configuration with strictly decreasing layer
# sizes x > y > z, for x up to 49.
for x in range(3, 50):
    for y in range(2, x):
        for z in range(1, y):
            # NOTE(review): this block calls metrics.aveaccuracy while the
            # other experiments use metrics.repeatedCrossValidatedScores --
            # confirm that both helpers return the same score dictionary.
            temp = metrics.aveaccuracy(data,
                                       target,
                                       MLPClassifier(solver='lbfgs',
                                                     alpha=1e-5,
                                                     hidden_layer_sizes=(x, y,
                                                                         z),
                                                     random_state=1),
                                       iterations=10)
            metrics.printAverages((x, y, z), temp)
            mean_f1 = np.average(temp['test_f1'])
            if mean_f1 > maxi:
                maxi = mean_f1
                ideal = (x, y, z)
            if mean_f1 > 0.8:
                # Keep each configuration as a tuple. np.append without an
                # axis flattens its input, which would turn the list of
                # configurations into a flat run of individual layer sizes.
                best.append((x, y, z))

# Report the winner. The selection metric is the mean F1 score (as returned by
# the scoring helper), so it is labelled as such rather than as "% accuracy".
print(str(ideal) + " gives a mean F1 score of " + str(maxi))
# List every configuration that cleared the 0.8 threshold, one per line.
for i in best:
    print(i)
Ejemplo n.º 6
0
import numpy as np
from sklearn import tree
import metrics
import matplotlib.pyplot as plt

# Best mean accuracy seen so far and the feature count that produced it.
maxi = 0
ideal = 0

# x and y series for the plot.
num_features = []
accs = []

# Evaluate a decision tree for every value of k from 1 to 24.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    temp = metrics.repeatedCrossValidatedScores(data, target,
                                                tree.DecisionTreeClassifier(),
                                                iterations=100, cv=10)
    metrics.printAverages(features, temp)

    mean_accuracy = np.average(temp['test_accuracy'])
    num_features.append(features)
    accs.append(mean_accuracy)

    # Track the best configuration so the summary line below reports a real
    # result instead of the untouched defaults.
    if mean_accuracy > maxi:
        maxi = mean_accuracy
        ideal = features

# Accuracy is treated as a fraction (the plot ticks below map 0.80 to "80%"),
# so it is converted to a percentage for the summary.
print(str(ideal) + " gives " + str(round(maxi * 100, 2)) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Decision Trees", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 6, 8, 12, 16, 20, 24])
plt.show()
# Replace missing values ('NaN') with the mean of their column.
# NOTE(review): Imputer with these arguments is the legacy scikit-learn API --
# confirm it is available in the installed version.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# fit_transform learns the column means and fills the gaps in one step, so a
# separate fit call beforehand is unnecessary.
data = imp.fit_transform(data)

# Oversample with SMOTE before the sweep.
# NOTE(review): newer imbalanced-learn releases name this method
# fit_resample -- confirm against the installed version.
data, target = SMOTE().fit_sample(data, target)

# alpha_range = [float(i) / 100000 for i in range(1, 1001)]
alpha_range = [0.0005, 0.001, 0.005, 0.01]
alpha_accuracy = []
alpha_sensitivity = []
alpha_specificity = []
for x in alpha_range:
    temp = metrics.repeatedCrossValidatedScores(
        data, target,
        MLPClassifier(solver='lbfgs', alpha=x, hidden_layer_sizes=43,
                      random_state=1),
        iterations=1000, cv=10)
    metrics.printAverages('%.5f' % x, temp)
    alpha_accuracy.append(np.average(temp['test_accuracy']))
    # alpha_sensitivity.append(np.average(temp['test_sensitivity']))
    # alpha_specificity.append(np.average(temp['test_specificity']))

# The alphas are plotted at evenly spaced positions 1..N rather than at their
# numeric values. N follows the length of alpha_range so the plot keeps
# working when the list of alphas is edited.
plt.plot(range(1, len(alpha_range) + 1), alpha_accuracy)
plt.xlabel('Value of Alpha')
plt.ylabel('Cross-Validation Accuracy')
# Call grid() to enable the grid; assigning to plt.grid would only overwrite
# the function and leave the figure without one.
plt.grid(True)
plt.show()
Ejemplo n.º 8
0
# Best mean accuracy seen so far and the (features, neighbours) pair that
# produced it.
maxi = 0
ideal = (0, 0)

# x and y series for the plot.
num_features = []
accs = []

# Evaluate k-nearest-neighbours for every value of k (passed to
# metrics.preprocess) from 1 to 24. Only n_neighbors=1 is tried, but the inner
# loop keeps the sweep easy to extend.
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    for neighbours in [1]:
        temp = metrics.repeatedCrossValidatedScores(
            data,
            target,
            KNeighborsClassifier(n_neighbors=neighbours),
            iterations=100,
            cv=10)
        metrics.printAverages((features, neighbours), temp)

        mean_accuracy = np.average(temp['test_accuracy'])
        num_features.append(features)
        accs.append(mean_accuracy)

        # Track the best configuration so the summary line below reports a
        # real result instead of the untouched defaults.
        if mean_accuracy > maxi:
            maxi = mean_accuracy
            ideal = (features, neighbours)

# Accuracy is treated as a fraction (the plot ticks below map 0.80 to "80%"),
# so it is converted to a percentage for the summary.
print(str(ideal) + " gives " + str(round(maxi * 100, 2)) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for KNN", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.80, 0.85, 0.90, 0.95, 1], ["80%", "85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
plt.show()
Ejemplo n.º 9
0
    'Neural Network': metrics.preprocess(k=8, fsiter=1000),
    'Nearest Neighbours': metrics.preprocess(k=8, fsiter=1000)
}

# Per-classifier mean scores, stored as the amount by which they exceed 0.9.
# NOTE(review): the 0.9 offset presumably matches a bar baseline set in the
# plotting call that follows -- confirm.
accuracies = []
sensitivities = []
specificities = []
for key, classifier in classifiers.items():
    print(key)
    # Each classifier is scored on the dataset registered under the same key.
    data, target = datasets[key]
    temp = metrics.repeatedCrossValidatedScores(
        data, target, classifier, iterations=100, cv=10)
    metrics.printAverages(key, temp)
    for collected, score_name in ((accuracies, 'test_accuracy'),
                                  (sensitivities, 'test_sensitivity'),
                                  (specificities, 'test_specificity')):
        collected.append(np.average(temp[score_name]) - 0.9)

# Set up the figure: one tick label per classifier on the x axis and
# percentage labels from 90% to 100% on the y axis.
plt.figure()
ax = plt.subplot()
plt.xticks([2, 6, 10, 14, 18, 22], classifiers.keys(), size=5.5)
plt.yticks([0.9, 0.925, 0.95, 0.975, 1.0],
           ["90%", "92.5%", "95%", "97.5%", "100%"])
plt.title('Relative Success with optimal features')
plt.xlabel('Algorithm', size=8)
sens = ax.bar([1, 5, 9, 13, 17, 21],
              sensitivities,
              width=0.8,
              color='red',
# Feature counts evaluated, and the best mean accuracy reached for each.
sens_range = []
neuron_accuracy = []
for features in range(1, 25):
    data, target = metrics.preprocess(k=features, fsiter=1000)
    # Highest mean accuracy over hidden-layer widths 40..59 for this k.
    maxacc = 0.0
    for neuron in range(40, 60):
        network = MLPClassifier(solver='lbfgs',
                                alpha=0.001,
                                hidden_layer_sizes=neuron,
                                random_state=1)
        temp = metrics.repeatedCrossValidatedScores(
            data, target, network, iterations=100, cv=10)
        metrics.printAverages((features, neuron), temp)
        maxacc = max(maxacc, np.average(temp['test_accuracy']))
    neuron_accuracy.append(maxacc)
    sens_range.append(features)

# Print one (feature count, best accuracy) pair per line.
for pair in zip(sens_range, neuron_accuracy):
    print(pair)

acc, = plt.plot(sens_range, neuron_accuracy, label='Accuracy')
plt.title("Feature Selection for Neural Networks", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Maximum Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
plt.xticks([0, 4, 8, 12, 16, 20, 24])
plt.show()