"""Sweep feature-subset sizes for a Random Forest and plot accuracy.

For each candidate number of selected features, preprocess the data,
score a 256-tree Random Forest with repeated 10-fold cross-validation,
report per-size averages, and plot accuracy against feature count.
"""
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import metrics
import matplotlib.pyplot as plt

# Best mean accuracy seen so far and the feature count that produced it.
# (Kept as module-level names for any code later in the project.)
maxi = 0
ideal = (0, 0)

num_features = []
accs = []
for features in [5, 6, 13]:
    # metrics.preprocess selects `k` features; fsiter controls the
    # feature-selection repetitions (see metrics module).
    data, target = metrics.preprocess(k=features, fsiter=100)
    temp = metrics.repeatedCrossValidatedScores(
        data, target,
        RandomForestClassifier(n_estimators=256),
        iterations=100, cv=10)
    metrics.printAverages(features, temp)
    num_features.append(features)
    mean_acc = np.average(temp['test_accuracy'])
    accs.append(mean_acc)
    # BUG FIX: maxi/ideal were never updated, so the summary line below
    # always printed "(0, 0) gives 0% accuracy" regardless of results.
    if mean_acc > maxi:
        maxi = mean_acc
        ideal = features

# mean_acc is a 0-1 fraction; scale to a true percentage for the report.
print(str(ideal) + " gives " + str(maxi * 100) + "% accuracy")

acc, = plt.plot(num_features, accs, label='Accuracy')
plt.title("Feature Selection for Random Forest", fontsize=14)
plt.xlabel('Number of Features')
plt.ylabel('Repeated-Cross-Validation Accuracy (%)')
plt.yticks([0.85, 0.90, 0.95, 1], ["85", "90", "95", "100"])
"""Plot the relative importance of each of the 24 dataset features.

Runs the project's feature-selection routine and renders the resulting
importances as a labelled bar chart.
"""
import matplotlib.pyplot as plt
import metrics
import warnings

warnings.filterwarnings("ignore")

data, target = metrics.preprocess()
importances = metrics.FeatureSelection(data, target)

# Human-readable names for the 24 features, in dataset column order.
feature_names = [
    'age', 'blood pressure', 'specific gravity', 'albumin', 'sugar',
    'red blood cells', 'pus cell', 'pus cell clumps', 'bacteria',
    'blood glucose random', 'blood urea', 'serum creatinine', 'sodium',
    'potassium', 'hemoglobin', 'packed cell volume',
    'white blood cell count', 'red blood cell count', 'hypertension',
    'diabetes mellitus', 'coronary artery disease', 'appetite',
    'pedal edema', 'anemia'
]

positions = range(1, 25)

fig, ax = plt.subplots()
plt.bar(positions, importances)
# Extra bottom margin so the rotated feature labels stay visible.
plt.subplots_adjust(bottom=0.28, left=0.1)
plt.yticks([0, 0.05, 0.1, 0.15, 0.20],
           ["0%", "5%", "10%", "15%", "20%"], size=7)
plt.xticks(positions, feature_names, rotation=270, size=7)
plt.title('Importance of Each Feature', size=16)
plt.ylabel('Relative Importance (%)', size=8)
plt.show()
# Grid-search setup for tuning an MLP classifier's hidden-layer sizes
# and L2 penalty (alpha) on the preprocessed 8-feature dataset.
import numpy as np
from sklearn.neural_network import MLPClassifier
import metrics
import warnings

warnings.filterwarnings("ignore")

# 8 selected features; fsiter=1000 feature-selection repetitions.
data, target = metrics.preprocess(k=8, fsiter=1000)

# 16, 14, 11 is the best so far, 6,3 was the best for 2 layers
# Candidate hidden-layer shapes: one single-layer (6) and one
# two-layer (12, 5) configuration.
hlayers = [6, (12, 5)]
param_grid = [{
    'hidden_layer_sizes': hlayers,
    'alpha': [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005]
}]

# metrics.OptimizeClassifier(data, target, MLPClassifier(solver='lbfgs', random_state=1), param_grid)

# Previous grid-search results, kept for reference:
# Current Best: 0.931 (+/-0.068) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': 73}
# 0.938 (+/-0.069) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 73}
# 0.951 (+/-0.057) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 43}
# 0.945 (+/-0.085) for {'activation': 'relu', 'alpha': 1e-06, 'hidden_layer_sizes': 43}
# 0.948 (+/-0.057) for {'alpha': 0.01, 'hidden_layer_sizes': (35, 26)}
# 0.942 (+/-0.074) for {'alpha': 5e-05, 'hidden_layer_sizes': (30, 11)}
# 0.952 (+/-0.057) for {'alpha': 0.0001, 'hidden_layer_sizes': (30, 11)}
# 0.947 (+/-0.036) for {'alpha': 0.001, 'hidden_layer_sizes': (30, 11)}
#
# temporary values to be replaced
# Best-so-far trackers, presumably updated by code past this chunk —
# TODO confirm against the rest of the script.
ideal = [0, 0, 0]
maxi = 0.
# graph = np.zeros(hlayers.count)
# ROC analysis of a 1-nearest-neighbour classifier using stratified
# 10-fold cross-validation. NOTE(review): the inner loop body is past
# the end of this chunk; only the setup is visible here.
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import metrics
from sklearn.preprocessing import label_binarize

# #############################################################################
# Data IO and generation
X, y = metrics.preprocess(k=8, fsiter=1000)
# Labels arrive as strings "0"/"1"; binarize and flatten to a 1-D array.
y = label_binarize(y, classes=["0", "1"]).ravel()

# #############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=10, shuffle=True)
classifier = KNeighborsClassifier(n_neighbors=1)

# Per-fold interpolated true-positive rates and AUC values, accumulated
# across repetitions for an averaged ROC curve.
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
# 10000 repetitions of the 10-fold split (shuffle=True gives a fresh
# partition each repetition); loop body continues beyond this chunk.
for iterations in range(10000):
    for train, test in cv.split(X, y):
# Sweep single-hidden-layer sizes (1..99) for an lbfgs MLP and track the
# configuration with the best repeated-cross-validation F1 score.
# NOTE(review): the if-body continues past this chunk (presumably the
# matching `ideal` update) — only the sweep itself is fully visible.
import arff
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import metrics
import warnings

warnings.filterwarnings("ignore")

# scaling disabled here — TODO confirm why this sweep skips scaling
# while the other scripts use the default preprocessing.
data, target = metrics.preprocess(k=8, fsiter=1000, scaling=False)

# default values
ideal = [0]
maxi = 0

# check a lot of hidden layer configurations for sets with high accuracy
# Header for the per-configuration metrics printed by printAverages.
print("hlayers/tp/tn/fp/fn/f1/precision/sensitivity/specificity/accuracy")
for x in range(1, 100):
    temp = metrics.repeatedCrossValidatedScores(data, target,
                                                MLPClassifier(
                                                    solver='lbfgs',
                                                    alpha=1e-5,
                                                    hidden_layer_sizes=x,
                                                    random_state=1,
                                                ),
                                                iterations=20, cv=10)
    metrics.printAverages(x, temp)
    # Keep the best mean F1 across all layer sizes tried so far.
    if np.average(temp['test_f1']) > maxi:
        maxi = np.average(temp['test_f1'])
# Setup for comparing six classifier families on the preprocessed
# dataset. The `classifiers` dict is consumed by code past this chunk —
# keys are the display names, values are the (pre-tuned) estimators.
import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

data, target = metrics.preprocess()

# Hyperparameters below appear to come from the tuning scripts elsewhere
# in this project — TODO confirm they match the latest sweep results.
classifiers = {
    'Support Vector Machine': svm.SVC(C=1, kernel='linear',
                                      decision_function_shape='ovo',
                                      random_state=6),
    'Random Forest': RandomForestClassifier(n_estimators=184),
    'Logistic Regression': LogisticRegression(C=1000),
    'Nearest Neighbours': KNeighborsClassifier(n_neighbors=1),
    'Decision Tree': tree.DecisionTreeClassifier(),
    'Neural Network': MLPClassifier(solver='lbfgs', alpha=0.001,
                                    hidden_layer_sizes=54, random_state=1)
}