def testLogisticRegression():
    print("\n\n Testing Logistic Regression:")
    df = pd.read_csv("testData/ex2data1.txt", delimiter=',', header=None)
    df_copy = df.copy()
    df.columns = [0, 1, 2]
    print(df)
    print(df.describe())  # describe() returns a frame; print it so the summary is visible
    df_y = df[2]
    df, mu, sigma = featureScaling(df)
    X = df.values
    y = df_y.values
    y_vector = np.zeros((y.shape[0], 1))
    y_vector[:, 0] = y[:]
    y = y_vector
    theta, cost, X = lg.logisticRegression(400, 0.1, X[:, 0:2], y[:, 0:1])
    plotLogisticRegression(df_copy.values, y, theta, cost)
    print("\nTest successful")
    print("<-------------------->")
    print("Theta:")
    print(theta)
    print("<-------------------->")
    print("Testing for:")
    print("Student with a score of 45 in Exam 1 and a score of 85 in Exam 2")
    test = np.array([1, (45 - mu[0]) / sigma[0], (85 - mu[1]) / sigma[1]])
    predict_prob = lg.sigmoid(np.matmul(test, theta))
    #print(str(test) + " * " + str(theta))
    print("Probability that the student passes: " + str(predict_prob)
          + " Expected value: 0.775 +/- 0.002")
    print("<-------------------->")
    predict_prob_one = lg.sigmoid(np.matmul(X[12:13, :], theta))
    predict_prob_two = lg.sigmoid(np.matmul(X[43:44, :], theta))
    predict_prob_three = lg.sigmoid(np.matmul(X[21:22, :], theta))
    print(str(float(predict_prob_one)) + " <-- Example 14 --> " + str(y[12:13, 0]))
    print(str(float(predict_prob_two)) + " <-- Example 45 --> " + str(y[43:44, 0]))
    print(str(float(predict_prob_three)) + " <-- Example 23 --> " + str(y[21:22, 0]))
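
# --- Hedged sketch (not part of the original file): the test above relies on a
# featureScaling helper that returns the standardized frame together with the
# per-column mean and standard deviation, since mu and sigma are reused to
# standardize the (45, 85) query point. A minimal implementation consistent
# with that usage could look like this:
def featureScaling(df):
    """Z-score standardization; returns (scaled df, column means, column stds)."""
    mu = df.mean()
    sigma = df.std()
    return (df - mu) / sigma, mu, sigma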
    df['Standard Deviation'] = stdvA + stdvE
    df['Range'] = rangeA + rangeE
    df['Inner Quartile Range'] = iqrA + iqrE
    #df['Absolute Value Mean'] = absSumA + absSumE
    df['Output'] = outputSet
    df = df.sample(frac=1).reset_index(drop=True)
    print(df)
    return df


#print(training_df)
df = create_prob_functions()

#linnearRegression = linnearRegression.LinnearRegression()
#linnearRegression.runModel(df)

# running the logistic regression model
logisticRegression = logisticRegression.logisticRegression()
logisticRegression.runModel(df)

"""
kMeans = kMeans.kMeans()
category = kMeans.runModel(df)
"""

#training_df = df.tail(20000)
#testing_df = df.tail(20000)

"""
mlpnn = mlpnn.MLPNN()
model = mlpnn.runModel(training_df, df)
"""
convertToNum(wine_data)
convertToNum(cancer_data)
#wine_data = Stats.removeOutliers(wine_data)
#Stats.normalityOfFeatures(wine_data)
#ratioOnes = Stats.ratioOfOnes(wine_data.iloc[:, -1])

print("-------logistic regression--------------")
"""
wineLR = lR.logisticRegression(wine_data.iloc[:, :-1], wine_data.iloc[:, -1],
                               0.1, 100, 0, 5)
start = time.time()
acc = wineLR.start()
end = time.time()
print("wine")
print(acc)
print(end - start)
"""
cancerLR = lR.logisticRegression(cancer_data.iloc[:, :-1],
                                 cancer_data.iloc[:, -1], 0.1, 100, 0, 5)
start = time.time()
acc = cancerLR.start()
end = time.time()
print("cancer")
print(acc)
print(end - start)

#print("-----------------LDA------------------------------")
print("---------linear discriminant analysis------------")
var = lDA.linearDiscriminantAnalysis(cancer_data.iloc[:, :-1])
var = lDA.linearDiscriminantAnalysis(wine_data.iloc[:, :-1])
wd = wine_data.iloc[:, 0:11]
x0 = wine_data.iloc[3:4, 0:11]
#(ans, inc, cor) = var.predict_A(wd, wine_data.iloc[:, -1])
start = time.time()
myl = var.predict_k_Log_odds(wd, wine_data.iloc[:, -1], 5)
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logisticRegression import logisticRegression

pathname = os.path.dirname(sys.argv[0])
os.chdir(pathname)

data = pd.read_csv("../ex2data1.txt", delimiter=",", header=None)
X = data.values[:, :2]
y = data.values[:, 2]
del data

clf = logisticRegression(lambd=0, tol=1e-6, verbose=True)
clf.fit(X, y)
y_pred = clf.predict(X)
print("train accuracy =", clf.accuracy(y_pred, y))

# add decision boundary
xx = np.linspace(X[:, 0].min(), X[:, 0].max())
yy = -clf.theta[1] / clf.theta[2] * xx - clf.theta[0] / clf.theta[2]
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, alpha=0.8, cmap=plt.cm.Paired)
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, alpha=0.8, cmap=plt.cm.Paired)
plt.plot(xx, yy, 'k-')
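
# Why the plotted line is the decision boundary (a note, not original code):
# the model predicts 1 exactly when sigmoid(theta0 + theta1*x1 + theta2*x2) >= 0.5,
# i.e. when theta0 + theta1*x1 + theta2*x2 >= 0; setting that expression to zero
# and solving for x2 gives x2 = -(theta1/theta2)*x1 - theta0/theta2, the yy above.
# Quick sanity check, assuming clf.theta is laid out as [intercept, coef1, coef2]
# as the plot implies:
mid = np.array([1.0, xx[0], yy[0]])
print("probability on the boundary ~", 1.0 / (1.0 + np.exp(-mid @ clf.theta)))  # ~0.5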
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from analyseData import analyseData
from plotDecisionBoundry import plotDecisionBoundry
from evaluateRegression import evaluateRegression
from kmeans import kmeans
from dbscan import dbscan
from agglomerative import agglomerative
from evaluateClustering import evaluateClustering
from vizualizeData import vizualizeData

# loading pre-processed data
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData()

yClass_lr = logisticRegression(X_train, X_test, yClass_train)
lrAc = accuracy_score(yClass_test, yClass_lr)
X_lr = X_test

# reloading data pre-processed with different parameters
X_train, X_test, yClass_train, yClass_test, yReg_train, yReg_test = processData(
    normalization='mms')

yClass_rf = randomForest(X_train, X_test, yClass_train)
rfAc = accuracy_score(yClass_test, yClass_rf)
X_rf = X_test

yClass_knn = kNN(X_train, X_test, yClass_train)
knnAc = accuracy_score(yClass_test, yClass_knn)
X_knn = X_test
def main():
    # -------------------------- First 1D function
    max_iter = 100
    x_histo, f_histo, grad_histo = optimize(fonc=_test_1, dfonc=_dtest_1,
                                            xinit=[2], eps=0.1, max_iter=100)
    x_histo2, f_histo2, grad_histo2 = optimize(fonc=_test_1, dfonc=_dtest_1,
                                               xinit=[2], eps=0.8, max_iter=100)
    _plot_2D_val_grad_f("xcos(x)\nvalues of f and of the gradient of f\n"
                        "as a function of the iteration count",
                        x_histo, f_histo, grad_histo)
    _plot_2D_compare(title="xcos(x)\nthe function f and the trajectory of the\n"
                     "optimization (the successive values of f(x))",
                     fonc=_test_1, dfonc=_dtest_1, xinit1=[2], xinit2=[2],
                     eps1=0.1, eps2=0.8)
    _plot_courbe("xcos(x), curve(t, log||xt - x*||)",
                 x_histo, x_histo2, max_iter)

    # -------------------------- Second 1D function
    x_histo, f_histo, grad_histo = optimize(fonc=_test_2, dfonc=_dtest_2,
                                            xinit=[2], eps=0.1)
    x_histo2, f_histo2, grad_histo2 = optimize(fonc=_test_2, dfonc=_dtest_2,
                                               xinit=[2], eps=0.8)
    _plot_2D_val_grad_f("-log(x)+x^2\nvalues of f and of the gradient of f\n"
                        "as a function of the iteration count",
                        x_histo, f_histo, grad_histo)
    _plot_2D_compare(title="-log(x)+x^2\nthe function f and the trajectory of the\n"
                     "optimization (the successive values of f(x))",
                     fonc=_test_2, dfonc=_dtest_2, xinit1=[2], xinit2=[2],
                     eps1=0.1, eps2=0.8)
    _plot_courbe("-log(x)+x^2, curve(t, log||xt - x*||)",
                 x_histo, x_histo2, max_iter)

    # -------------------------- 2D function: Rosenbrock (a.k.a. banana)
    x_histo, f_histo, grad_histo = optimize(fonc=_test_3, dfonc=_dtest_3,
                                            xinit=[0, 1], eps=0.1)
    _plot_2D_val_grad_f("Rosenbrock\nvalues of f and of the gradient of f\n"
                        "as a function of the iteration count",
                        x_histo, f_histo, grad_histo)
    _plot_3D(x_histo, f_histo, grad_histo, _test_3)

    # --------------------------- Logistic Regression
    trainx, trainy = load_usps("USPS_train.txt")
    testx, testy = load_usps("USPS_test.txt")
    logisticReg = logisticRegression(loss_g=cost_f_g, max_iter=100, epsilon=0.1)
    # logisticReg.fit(trainx, trainy)

    # Weight matrices: 6 vs 9 and 1 vs 8
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, sharey=True)
    plt.suptitle("Weight matrices")
    weight_matrix(6, 9, fig, logisticReg, ax1)
    weight_matrix(1, 8, fig, logisticReg, ax2)
    # plt.savefig("weight_matrix_qqlexs_LR")

    # Weight matrix: 6 vs All
    matrix_one_vs_all(6, logisticReg)

    # Error curves: 6 vs All
    error_curves(6)

    # --------------------------- Naïve Bayes
    clf_gaussian = GaussianNB()
    test_clf_on_usps(clf_gaussian, 6)
if __name__ == "__main__":
    # download the data from the internet if it doesn't exist yet
    if not os.path.exists("ex2data1.txt"):
        req = requests.get("https://raw.github.com/SaveTheRbtz/ml-class/master/ex2/ex2data1.txt")
        with open("ex2data1.txt", "w") as f:
            f.write(req.text)  # req.content is bytes; req.text is the decoded string

    # load text file and separate into data and label
    textfile = np.loadtxt("ex2data1.txt", delimiter=",")
    data = textfile[:, 0:2]
    label = textfile[:, 2]

    # fit logistic regression
    model = logisticRegression.logisticRegression()
    model.fit(data, label)

    # print information
    model.summary()

    # visualize information
    plt.clf()
    pos = label.astype(bool)
    neg = np.abs(label - 1.0).astype(bool)
    plt.plot(data[pos, 0], data[pos, 1], "ro")
    plt.plot(data[neg, 0], data[neg, 1], "bo")
    X, Y = np.meshgrid(np.arange(25, 105), np.arange(25, 105))
    result = np.zeros((0, 80))
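
    # A plausible continuation of the truncated plot above (a sketch, not the
    # original code): fill `result` row by row with the model's output over the
    # grid, then draw the 0.5 contour as the decision boundary. The vectorized
    # model.predict call over a batch of grid points is an assumption about
    # this class's API.
    for row in range(X.shape[0]):
        grid = np.column_stack([X[row], Y[row]])
        result = np.vstack([result, model.predict(grid)])  # hypothetical batch call
    plt.contour(X, Y, result, levels=[0.5], colors="k")
    plt.show()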
from logisticRegression import logisticRegression
from naiveBayesGaussian import naiveBayesGaussian
from Utils import datasets, plotCompareErrors

num_splits = 10
train_percent = [10, 25, 50, 75, 100]

LR_errors = logisticRegression(num_splits, train_percent)
NB_errors = naiveBayesGaussian(num_splits, train_percent)

names = ["LogisticReg", "NaiveBayes"]
for dataname in datasets.keys():
    error_list = []
    error_list.append(LR_errors[dataname])
    error_list.append(NB_errors[dataname])
    plotCompareErrors(names, dataname, error_list, train_percent)
# Compute Kendall tau correlation
## TODO: Compute the Kendall tau correlation of the features' lists produced
#  by the two feature selection measures
#####################################################################

# Select the top num_feat features
X = X[:, index[:num_feat]]
X = np.transpose(X)

# Start measuring execution time
start = timeit.default_timer()

# Train the classifier
w = logisticRegression(X, Y)

# Print logistic regression learning execution time
stop = timeit.default_timer()
print('Running Time: ' + str(stop - start))

# Load test data and split data from class labels
data = np.loadtxt('test.csv', delimiter=',')
rY = data[:, 0]  # The real class labels
test = np.transpose(data[:, 1:data.shape[1]])

# Keep the features indicated by the feature selection task
if featureSelection:
    test = test[index[:num_feat], :]

# Perform predictions of the test data
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
import random
import logisticRegression


def generate_rand_data(origin_size, rand_size):
    # Mark rand_size distinct indices out of origin_size as True
    rand_array = np.zeros(origin_size, dtype=bool)
    ran_num = random.randint(0, origin_size - 1)
    for i in range(rand_size):
        while rand_array[ran_num]:
            ran_num = random.randint(0, origin_size - 1)
        rand_array[ran_num] = True
    return rand_array


# Load MNIST data set
(x_train, y_train), (x_test, y_test) = load_mnist(flatten=True, normalize=True)  #, one_hot_label=True)

num = np.unique(y_train, axis=0)  # array of the unique values appearing in y
num = num.shape[0]                # the number of unique values in y
# np.eye builds an identity matrix, i.e. one row per unique class;
# indexing it with y picks out the one-hot row for each label.
y_train = np.eye(num)[y_train]

LRmodel = logisticRegression.logisticRegression(x_train, y_train, 'multi')

# train on the data set
LRmodel.learn(learning_rate=0.001, epoch=10)

accuracy = LRmodel.predict(x_test, y_test)
print("accuracy:", accuracy, " score:", int(accuracy * y_test.shape[0]), "/", y_test.shape[0])
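
# Side note (not original code): generate_rand_data is a rejection-sampling way
# of choosing rand_size distinct indices. An equivalent, more idiomatic NumPy
# version would be:
def generate_rand_data_np(origin_size, rand_size):
    mask = np.zeros(origin_size, dtype=bool)
    mask[np.random.choice(origin_size, rand_size, replace=False)] = True
    return mask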
def _runLogRegression(designM, labels, alpha, regParam, iterations):
    theta = logisticRegression(designM, labels, alpha, regParam, iterations)
    a = accuracy(logHypo(theta, designM), labels)
    return theta, a
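
# Hedged sketch (not original code): _runLogRegression assumes a sigmoid
# hypothesis logHypo and an accuracy helper. Minimal versions consistent with
# that usage, assuming numpy as np, could look like this:
def logHypo(theta, designM):
    """Predicted probabilities: the sigmoid of the linear scores designM @ theta."""
    return 1.0 / (1.0 + np.exp(-designM @ theta))


def accuracy(probs, labels):
    """Fraction of examples whose thresholded prediction matches the label."""
    return np.mean((probs >= 0.5) == (labels == 1))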
# Evaluation data (the remaining 10% of the whole set)
Xte = myData.X[dtrNum:]
Yte = myData.Y[dtrNum:]
#-------------------

#-------------------
# 3. Standardize the input data
xMean = np.mean(Xtr, axis=0)
xStd = np.std(Xtr, axis=0)
Xtr = (Xtr - xMean) / xStd
Xte = (Xte - xMean) / xStd
#-------------------

#-------------------
# 4. Train and evaluate the logistic regression model
myModel = lr.logisticRegression(Xtr, Ytr)

trLoss = []
teLoss = []
for ite in range(1001):
    trLoss.append(myModel.CE(Xtr, Ytr))
    teLoss.append(myModel.CE(Xte, Yte))

    if ite % 100 == 0:
        print(f"Iteration: {ite}")
        print(f"Model parameters:\nw={myModel.w},\nb={myModel.b}")
        print(f"Mean cross-entropy loss = {myModel.CE(Xte, Yte):.2f}")
        print(f"Accuracy = {myModel.accuracy(Xte, Yte):.2f}")
        print("----------------")
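
# Note (not part of the excerpt): as shown, the loop only records the losses;
# the parameter update presumably happens inside the class. For reference, a
# single plain gradient-descent step for binary logistic regression, in terms
# of the w / b attributes printed above and a learning rate eta, would be:
#
#     p = 1.0 / (1.0 + np.exp(-(Xtr @ w + b)))   # predicted probabilities
#     w -= eta * Xtr.T @ (p - Ytr) / len(Ytr)    # gradient of the mean CE loss
#     b -= eta * np.mean(p - Ytr)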
score_split = train_data.groupby(['score']).count()
score_split.reset_index(inplace=True)
score_split = pd.DataFrame(score_split, columns=['score', 'V2'])
score_split.rename(columns={'V2': 'count'}, inplace=True)

# data preprocessing
preprocess = data_preprocessing.preprocessing(train_data)
categorical_var = preprocess.split_numerical_categorical()[1]
numerical_var = preprocess.split_numerical_categorical()[0]
low_corr_var = preprocess.numerical_feature_selection(numerical_var)

################ Numerical data classifier ####################
# logistic regression classifier
score_LR = {}
for C in [0.1, 1, 10, 100, 1000]:
    regr = logisticRegression.logisticRegression(train_data, low_corr_var, C)
    score_LR[C] = regr.classifier()

score_LR_df = pd.DataFrame(list(score_LR.values()),
                           index=list(score_LR.keys()),
                           columns=['precision', 'recall', 'accuracy'])
score_LR_df[['precision0', 'precision1']] = score_LR_df['precision'].apply(pd.Series)
score_LR_df[['recall0', 'recall1']] = score_LR_df['recall'].apply(pd.Series)
score_LR_df = pd.DataFrame(
    score_LR_df,
    columns=['precision0', 'precision1', 'recall0', 'recall1', 'accuracy'])

fig, ax = plt.subplots(figsize=(8, 5))
ax = sns.heatmap(score_LR_df)
    class_num = np.unique(input, axis=0)  # array of the unique values in the label array
    class_num = class_num.shape[0]        # the number of unique values
    # np.eye builds an identity matrix with one row per unique class; indexing
    # it with the labels picks out the one-hot row for each example.
    return np.eye(class_num)[input], class_num


# Load Iris data set
iris = load_iris()

# Parse the data set
X = iris.data               # iris feature inputs
y = iris.target             # iris target labels: 0, 1, 2
y_name = iris.target_names  # iris target names: Setosa, Versicolor, Virginica

# Divide the data into train and test sets using sklearn's built-in splitter.
# test_size sets the fraction of the data used for testing; shuffle toggles
# shuffling and random_state seeds it.
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=1/15, shuffle=True,
                       random_state=int(time.time()))

y_train, class_num = one_hot_encoding(y_train)

LRmodel = logisticRegression.logisticRegression(X_train, y_train)

# train on the data set
LRmodel.learn(learning_rate=0.001, epoch=10)

accuracy = LRmodel.predict(X_test, y_test)
print("accuracy:", accuracy, " score:", int(accuracy * y_test.shape[0]), "/", y_test.shape[0])
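
# Quick usage check (not original code): with labels [0, 2, 1], the helper
# above returns the one-hot rows and the class count:
#
#   >>> one_hot_encoding(np.array([0, 2, 1]))
#   (array([[1., 0., 0.],
#           [0., 0., 1.],
#           [0., 1., 0.]]), 3)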
import logisticRegression

# Load MNIST data set
(x_train, y_train), (x_test, y_test) = load_mnist(flatten=True, normalize=True,
                                                  one_hot_label=True)

class_num = np.unique(y_train, axis=0)  # array of the unique one-hot rows in y
class_num = class_num.shape[0]          # the number of unique rows in y

# Train one binary logistic regression model per class (one-vs-rest)
LRmodel_arr, cost_arr = [], []
i = 0
while i < class_num:
    print("\n***", i, "th Logistic Regression model ***")
    LRmodel_arr.append(
        logisticRegression.logisticRegression(x_train, y_train[:, i]))
    cost_arr.append(LRmodel_arr[i].learn(learning_rate=0.1, epoch=1))
    i += 1

for graph in cost_arr:
    plt.plot(graph[0], graph[1])
plt.title('Binary-Class Model\'s Loss Graph')
plt.xlabel('number of iterations')
plt.ylabel('cost')
plt.legend(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
           loc='upper right')
plt.tight_layout()
plt.show()

# train on the data set
i = 0
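
# Sketch (not original code): with the ten one-vs-rest models trained above,
# prediction would score each sample under every binary model and take the
# argmax. The per-model probability call is an assumption about this class's
# API (hypothetically a predict_proba-style method):
#
#     scores = np.column_stack([m.predict_proba(x_test) for m in LRmodel_arr])
#     y_pred = np.argmax(scores, axis=1)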
from logisticRegression import logisticRegression
from sklearn.preprocessing import PolynomialFeatures

pathname = os.path.dirname(sys.argv[0])
os.chdir(pathname)

data = pd.read_csv("../ex2data2.txt", delimiter=",", header=None)
X = data.values[:, :2]
y = data.values[:, 2]
del data

poly = PolynomialFeatures(6, include_bias=False)
X_new = poly.fit_transform(X)
print("After polynomial feature transformation, now shape=", X_new.shape)

clf = logisticRegression(lambd=1, verbose=False, tol=1e-10)
clf.fit(X_new, y)
y_pred = clf.predict(X_new)
print("train accuracy = ", clf.accuracy(y_pred, y))

h = .02  # step size in the mesh

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - .2, X[:, 0].max() + .2
y_min, y_max = X[:, 1].min() - .2, X[:, 1].max() + .2
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(poly.fit_transform(np.c_[xx.ravel(), yy.ravel()]))

# Put the result into a color plot
Z = Z.reshape(xx.shape)
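
# A plausible completion of the color plot (a sketch, not the original code),
# using standard matplotlib calls: shade the predicted class over the mesh and
# overlay the training points.
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Paired)
plt.show()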
# sklearn.cross_validation was removed in modern scikit-learn; use model_selection
from sklearn.model_selection import train_test_split
from logisticRegression import logisticRegression
from sklearn import linear_model

data = pd.read_csv('../data/train.csv')
data = data.replace(np.nan, -1)

# min-max scale the numeric columns
for col in ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']:
    col_max = data[col].max()
    col_min = data[col].min()
    data[col] = (data[col] - col_min) / (col_max - col_min)

data = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Survived']]
data['Sex_male'] = data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
data['Sex_female'] = data['Sex'].apply(lambda x: 1 if x == 'female' else 0)

train, test = train_test_split(data, test_size=0.3, random_state=2018)

train_y = train[['Survived']]
train_x = train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Sex_female']]
test_y = test[['Survived']]
test_x = test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Sex_female']]

# 'bacth_size' is kept as spelled; it appears to match the parameter name this
# logisticRegression class defines
clf = logisticRegression(learning_rate=0.01, epoch=100, bacth_size=10,
                         feature_size=7)
clf.fit(train_x, train_y, test_x, test_y)
def crossValidateLogistic(X, Y, regType, kfold, numSamples):
    #%%
    import numpy as np
    from logisticRegression import logisticRegression
    import matplotlib.pyplot as plt
    from numpy import random as rng

    scale = np.sqrt((X**2).mean())

    # drop observations with any NaN in the features or the label
    mskOutNans = (np.sum(np.isnan(X), axis=1) + np.squeeze(np.isnan(Y))) < 1
    X = X[mskOutNans, :]
    Y = Y[mskOutNans]
    numObservations, numFeatures = X.shape

    # compiler.ast.flatten is Python 2 only; build the flat list directly
    lvect = [0] + list(10**np.arange(-5, 2, 0.5))
    l = np.zeros((len(lvect), 2))
    if regType == 'l2':
        l[:, 0] = lvect  # l2-regularization
    elif regType == 'l1':
        l[:, 1] = lvect  # l1-regularization
    else:  # no regularization; a plain list cannot be scaled by a float, so use an array
        lvect = [0., 0.]
        l = np.zeros((1, 2))
    l = l * scale

    perClassErrorTest = np.nan + np.ones((numSamples, l.shape[0]))
    perClassErrorTrain = np.nan + np.ones((numSamples, l.shape[0]))

    for s in range(numSamples):
        ## %%%%%% shuffle trials to break any dependence on the sequence of trials
        shfl = rng.permutation(np.arange(0, numObservations))
        Ys = Y[shfl]
        Xs = X[shfl, :]

        ## %%%%% divide the data into training and testing sets
        YTrain = Ys[np.arange(0, int((kfold - 1.) / kfold * numObservations))]
        YTest = Ys[np.arange(int((kfold - 1.) / kfold * numObservations),
                             numObservations)]
        XTrain = Xs[np.arange(0, int((kfold - 1.) / kfold * numObservations)), :]
        XTest = Xs[np.arange(int((kfold - 1.) / kfold * numObservations),
                             numObservations), :]

        ## %%%%% loop over the candidate regularization values
        for i in range(l.shape[0]):
            w, b, lps, perClassEr, cost, optParams = logisticRegression(
                np.reshape(XTrain, (np.prod(XTrain.shape)), order='F'),
                YTrain, l[i, :])
            perClassErrorTest[s, i] = optParams.perClassErFn(XTest, YTest)
            perClassErrorTrain[s, i] = optParams.perClassErFn(XTrain, YTrain)

        print('cross-validating: %.2f %% completed'
              % ((s + 1.) / (numSamples + 0.) * 100.))

    meanPerClassErrorTrain = np.mean(perClassErrorTrain, axis=0)
    semPerClassErrorTrain = np.std(perClassErrorTrain, axis=0) / np.sqrt(numSamples)
    meanPerClassErrorTest = np.mean(perClassErrorTest, axis=0)
    semPerClassErrorTest = np.std(perClassErrorTest, axis=0) / np.sqrt(numSamples)

    # keep every l whose mean error is within one SE of the minimum; take the
    # largest as lbest (do not overwrite the full grid, so the boolean mask
    # below has the right length for indexing the error arrays)
    ix = np.argmin(meanPerClassErrorTest)
    candidates = l[meanPerClassErrorTest <= (meanPerClassErrorTest[ix]
                                             + semPerClassErrorTest[ix]), :]
    lbest = candidates[-1, :]  # best regularization term by the minError+SE criterion
    ix = np.sum(l == lbest, 1) == 2

    ## %%%%%% plot cross-validation results
    plt.figure('cross validation')
    plt.fill_between(lvect, meanPerClassErrorTrain - semPerClassErrorTrain,
                     meanPerClassErrorTrain + semPerClassErrorTrain,
                     alpha=0.5, edgecolor='k', facecolor='k')
    plt.fill_between(lvect, meanPerClassErrorTest - semPerClassErrorTest,
                     meanPerClassErrorTest + semPerClassErrorTest,
                     alpha=0.5, edgecolor='r', facecolor='r')
    plt.plot(lvect, meanPerClassErrorTrain, 'k', label='training')
    plt.plot(lvect, meanPerClassErrorTest, 'r', label='validation')
    plt.plot(np.array(lvect)[ix], meanPerClassErrorTest[ix], 'bo')
    plt.xlim([lvect[1], lvect[-1]])
    plt.xscale('log')
    plt.xlabel('regularization parameter')
    plt.ylabel('classification error (%)')
    plt.legend()

    return lbest
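
# For reference (an assumption about the logisticRegression call above, which
# takes a 2-vector of regularization weights): a cost of the mixed form
#
#     J(w) = mean cross-entropy + l2 * ||w||_2^2 + l1 * ||w||_1
#
# would match passing l[i, :] = [l2, l1], with the 'l2'/'l1' branches above
# zeroing out one of the two terms.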