def validationCurve(dataFile, clf, nClasses=2, score=None):
    '''
    Choose a classifier and plot its validation curve.

    The classifier is scored against different values of one selected
    hyperparameter to show the influence of that single hyperparameter.
    The data set is balanced first: every class is randomly down-sampled to
    the size of the smallest class, then the rows are shuffled.

    dataFile: tab-separated file without a header row; the last column holds
              the class label (0 .. nClasses-1), all other columns are the
              features. In the 3-class case the labels are
              0=noise, 1=male, 2=female.
    clf:      classifier to consider. A hyperparameter sweep is configured
              for 'SVM' (gamma), 'MLP' (alpha) and 'kNN' (n_neighbors).
              'GaussianNB', 'GP', 'DT', 'RF', 'Boost', 'XGB' and 'GMM' are
              recognised but have no sweep configured yet.
    nClasses: number of classes in the data set (integer >= 2)
    score:    customise the scoring (default in sklearn is 'accuracy').
              When None, an F-beta scorer with beta=2 is used; for more than
              two classes it is macro-averaged.

    Raises ValueError if nClasses is not an integer >= 2, if clf is unknown
    or has no sweep configured, or if one of the classes has no rows in
    dataFile.
    '''
    # Validate the arguments before any file I/O so that mistakes surface
    # immediately and with a readable message.
    n_labels = int(nClasses)
    if n_labels != nClasses or n_labels < 2:
        raise ValueError(
            "nClasses must be an integer >= 2, got %r" % (nClasses,))

    # Pick the estimator and the hyperparameter to sweep.
    param_name = None
    param_range = None
    if clf == 'GaussianNB':
        from sklearn.naive_bayes import GaussianNB
        estimator = GaussianNB()
    elif clf == 'SVM':
        from sklearn.svm import SVC
        estimator = SVC(C=1)
        param_name = "gamma"
        param_range = np.logspace(-6, 1, 10)
        # Alternative sweep: param_name = "C",
        # param_range = np.linspace(0.01, 1, 5)
    elif clf == 'MLP':
        estimator = MLPClassifier()
        param_name = "alpha"
        param_range = 10.0**-np.arange(1, 7)
        # Alternative sweep: param_name = "max_iter",
        # param_range = [100, 200, 300, 400, 500]
    elif clf == 'kNN':
        estimator = KNeighborsClassifier()
        param_name = "n_neighbors"
        param_range = [1, 2, 3, 4, 5, 6]
    elif clf == 'GP':
        estimator = GaussianProcessClassifier(1.0 * RBF(1.0))
    elif clf == 'DT':
        estimator = DecisionTreeClassifier(max_depth=5)
    elif clf == 'RF':
        estimator = RandomForestClassifier(max_depth=5, n_estimators=10,
                                           max_features=2)
    elif clf == 'Boost':
        estimator = AdaBoostClassifier()
    elif clf == 'XGB':
        estimator = xgb.XGBClassifier()
    elif clf == 'GMM':
        estimator = GaussianMixture(n_components=2,
                                    covariance_type='spherical', max_iter=20)
    else:
        raise ValueError("Unknown classifier %r" % (clf,))

    # Without this check the classifiers that have no sweep would fail later
    # with an UnboundLocalError on param_name.
    if param_name is None:
        raise ValueError(
            "No hyperparameter sweep is configured for classifier %r; "
            "validation curves are available for 'SVM', 'MLP' and 'kNN'"
            % (clf,))

    # Let's use fB(F2) score
    if score is None:
        from sklearn.metrics import fbeta_score, make_scorer
        if n_labels == 2:
            score = make_scorer(fbeta_score, beta=2)
        else:
            # fbeta_score defaults to average='binary', which is rejected
            # for multiclass targets; the classes are balanced below, so a
            # macro average weights them equally.
            score = make_scorer(fbeta_score, beta=2, average='macro')

    d = pd.read_csv(dataFile, sep="\t", header=None)
    data = d.values

    # Balance the data set
    targets = data[:, -1]
    data = data[:, 0:-1]
    # The binary case samples the positive class (1) before the negative
    # class (0); this keeps the sequence of random draws unchanged for a
    # seeded run.
    if n_labels == 2:
        labels = (1, 0)
    else:
        labels = tuple(range(n_labels))
    rows_per_class = [np.flatnonzero(targets == label) for label in labels]
    # randomly select n rows of every class
    n = min(len(rows) for rows in rows_per_class)
    if n == 0:
        raise ValueError(
            "%r has no rows for at least one of the class labels %r"
            % (dataFile, sorted(labels)))
    inds = []
    for rows in rows_per_class:
        inds.extend(random.sample(rows.tolist(), n))
    data = data[inds, :]
    targets = targets[inds]

    # Shuffle so the classes are no longer stored as contiguous runs.
    indices = np.arange(targets.shape[0])
    np.random.shuffle(indices)
    data, targets = data[indices], targets[indices]

    title = "Validation Curves - " + clf
    v = Learning.Validate(estimator, title, data, targets,
                          param_name=param_name, param_range=param_range,
                          scoring=score)
    plot = v.plot_validation_curve()
    plot.show()
def learninigCurve(dataFile, clf, score=None):
    '''
    Choose a classifier and plot its learning curve.

    The data set is balanced first: the positive (1) and negative (0) classes
    are randomly down-sampled to the size of the smaller one, then the rows
    are shuffled.

    dataFile: tab-separated file without a header row; the last column holds
              the binary target (1 = positive, 0 = negative), all other
              columns are the features
    clf:      classifier to consider, one of 'GaussianNB', 'SVM', 'MLP',
              'kNN', 'GP', 'DT', 'RF', 'Boost', 'XGB', 'GMM'
    score:    customise the scoring (default in sklearn is 'accuracy').
              When None, an F-beta scorer with beta=2 is used.

    Raises ValueError if clf is not one of the names above or if dataFile
    has no rows for one of the two classes.
    '''
    # Pick the estimator first: an unknown name is reported right away
    # instead of as an UnboundLocalError after the data has been loaded.
    if clf == 'GaussianNB':
        from sklearn.naive_bayes import GaussianNB
        estimator = GaussianNB()
    elif clf == 'SVM':
        from sklearn.svm import SVC
        estimator = SVC(gamma=0.0077)
    elif clf == 'MLP':
        estimator = MLPClassifier(hidden_layer_sizes=(250, ), max_iter=100,
                                  early_stopping=True)
    elif clf == 'kNN':
        estimator = KNeighborsClassifier(3)
    elif clf == 'GP':
        estimator = GaussianProcessClassifier(1.0 * RBF(1.0))
    elif clf == 'DT':
        estimator = DecisionTreeClassifier(max_depth=5)
    elif clf == 'RF':
        estimator = RandomForestClassifier(max_depth=5, n_estimators=10,
                                           max_features=2)
    elif clf == 'Boost':
        estimator = AdaBoostClassifier()
    elif clf == 'XGB':
        estimator = xgb.XGBClassifier()
    elif clf == 'GMM':
        estimator = GaussianMixture(n_components=2,
                                    covariance_type='spherical', max_iter=20)
    else:
        raise ValueError("Unknown classifier %r" % (clf,))

    # Let's use fB(F2) score
    if score is None:
        from sklearn.metrics import fbeta_score, make_scorer
        score = make_scorer(fbeta_score, beta=2)

    d = pd.read_csv(dataFile, sep="\t", header=None)
    data = d.values

    # Balance the data set
    targets = data[:, -1]
    data = data[:, 0:-1]
    pos_rows = np.flatnonzero(targets == 1)
    neg_rows = np.flatnonzero(targets == 0)
    # randomly select n rows of each class
    n = min(len(pos_rows), len(neg_rows))
    if n == 0:
        raise ValueError(
            "%r has no rows for at least one of the targets 0 and 1"
            % (dataFile,))
    # Positive rows are sampled before negative rows, as before, so a seeded
    # run draws the same sample.
    inds = random.sample(pos_rows.tolist(), n)
    inds += random.sample(neg_rows.tolist(), n)
    data = data[inds, :]
    targets = targets[inds]

    # Shuffle so the two classes are no longer stored as contiguous runs.
    indices = np.arange(targets.shape[0])
    np.random.shuffle(indices)
    data, targets = data[indices], targets[indices]

    title = "Learning Curves - " + clf
    v = Learning.Validate(estimator, title, data, targets, scoring=score)
    plot = v.plot_learning_curve()
    plot.show()