def cv_test():
    """Run the cross validation on noisy sinc data and plot both fits.

    Needs a working krr class (``imp.krr``) and ``imp.cv``.

    Two panels are drawn in one figure. Both search the same Gaussian kernel
    width grid; the first also searches a logarithmic regularization grid,
    the second fixes the regularization grid to ``[0]`` (which the panel
    title refers to as efficient LOOCV). For each panel the selected kernel
    parameter and regularization are printed, and the training data and the
    prediction on a dense grid over [-pi, pi) are plotted.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]
    # The same estimator instance is handed to both cross validations.
    krr = imp.krr()

    # (regularization grid, panel title) for the left and right subplot.
    panels = [
        (np.logspace(-2, 2, 10), "CV with fixed regularization"),
        ([0], "CV with efficient LOOCV"),
    ]

    pl.figure()
    for position, (regularization, title) in enumerate(panels, start=1):
        pl.subplot(1, 2, position)
        params = [
            "kernel", ["gaussian"],
            "kernelparam", np.logspace(-2, 2, 10),
            "regularization", regularization,
        ]
        cvkrr = imp.cv(Xtr, Ytr, krr, params,
                       loss_function=squared_error_loss, nrepetitions=2)
        cvkrr.predict(Xte)
        # print() with a single argument behaves the same on Python 2 and 3.
        print(cvkrr.kernelparameter)
        print(cvkrr.regularization)
        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
        pl.title(title)

    print("\n(time the test takes on my notebook: approx. 6 seconds)")
def fold_cross_validation(self):
    """
    # Exercise 3c
    Five-fold cross validation of KRR on 2500 randomly chosen training
    samples. Perform split_data before (reads self.Xtr, self.Ytr, self.Xte
    and self.Yte).

    The Gaussian kernel width candidates are the 0.1, 0.5 and 0.9 quantiles
    of the pairwise distances of the chosen samples. The fitted cross
    validation object is stored in self.cvkrr, and the test MAE plus the
    selected hyperparameters are printed.
    """
    # Draw 2500 distinct row indices out of the first 5000.
    chosen = random.sample(range(5000), 2500)
    subset_x = self.Xtr[chosen]
    subset_y = self.Ytr[chosen]

    # Pairwise distance matrix of the subset
    # (assumes subset_x is 2-D, samples along axis 0 -- TODO confirm).
    pairwise_diff = subset_x[None, :] - subset_x[:, None]
    distances = np.linalg.norm(pairwise_diff, axis=2)
    widths = np.quantile(distances, [0.1, 0.5, 0.9])

    grid = dict(
        kernel=['gaussian'],
        kernelparameter=widths,
        regularization=np.logspace(-7, 0, 10),
    )
    self.cvkrr = imp.cv(
        subset_x,
        subset_y,
        imp.krr,
        grid,
        loss_function=mean_absolute_error,
        nfolds=5,
    )

    predictions = self.cvkrr.predict(self.Xte)
    test_mae = mean_absolute_error(self.Yte, predictions)

    print("The mean absolute error is: {} ".format(round(test_mae, 2)))
    print("The best regularzation parameter C is: {}".format(
        self.cvkrr.regularization))
    print("The best kernelparameter sigma is: {}".format(
        self.cvkrr.kernelparameter))
def test_cv(self):
    """Cross-validate Gaussian KRR on noisy sinc data and plot two fits.

    Left panel: kernel widths and regularization are both searched on
    logarithmic grids. Right panel: the regularization grid is fixed to
    ``[0]``. For each panel the selected parameters are printed and the
    training data and the prediction on a dense grid over [-pi, pi) are
    plotted; the figure is shown at the end.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01).reshape(-1, 1)

    def cross_validate(kernel_widths, regularizers):
        # Run the grid search and return the fitted object and its prediction.
        grid = {
            'kernel': ['gaussian'],
            'kernelparameter': kernel_widths,
            'regularization': regularizers,
        }
        fitted = imp.cv(Xtr, Ytr, imp.krr, grid,
                        loss_function=squared_error_loss, nrepetitions=2)
        return fitted, fitted.predict(Xte)

    pl.figure()

    # Left panel: search over both hyperparameters.
    pl.subplot(1, 2, 1)
    cvkrr, ypred = cross_validate(np.logspace(-4, 4, 20),
                                  np.logspace(-2, 2, 10))
    # NOTE(review): the message below quotes 10**-4 .. 10**4, which matches
    # the kernel parameter grid above, not the regularization grid
    # (10**-2 .. 10**2) -- confirm which one was intended.
    print('Regularization range: 10**-4 .. 10**4')
    print('Gaussian kernel parameter: ', cvkrr.kernelparameter)
    print('Regularization paramter: ', cvkrr.regularization)
    pl.plot(Xtr, Ytr)
    pl.plot(Xte, ypred)

    # Right panel: regularization grid fixed to [0].
    pl.subplot(1, 2, 2)
    cvkrr, ypred = cross_validate(np.logspace(-2, 2, 10), [0])
    print('Regularization via efficient leave on out')
    print('Kernel parameter: ', cvkrr.kernelparameter)
    print('Regularization paramter: ', cvkrr.regularization)
    pl.plot(Xtr, Ytr)
    pl.plot(Xte, ypred)

    pl.show()
def cv_test():
    '''
    Exercises the cross validation on noisy sinc data; needs a working krr
    class.

    Draws two side-by-side panels that share the Gaussian kernel width grid:
    the left one also searches a logarithmic regularization grid, the right
    one uses the regularization grid [0]. Each panel shows the training data
    and the prediction on a dense grid over [-pi, pi).
    '''
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]
    # One estimator instance is reused by both cross validation runs.
    krr = imp.krr()

    pl.figure()
    for panel in (1, 2):
        pl.subplot(1, 2, panel)
        widths = np.logspace(-2, 2, 10)
        # Only the regularization grid differs between the two panels.
        if panel == 1:
            regularizers = np.logspace(-2, 2, 10)
        else:
            regularizers = [0]
        params = ['kernel', ['gaussian'],
                  'kernelparam', widths,
                  'regularization', regularizers]
        cvkrr = imp.cv(Xtr, Ytr, krr, params,
                       loss_function=squared_error_loss, nrepetitions=2)
        cvkrr.predict(Xte)
        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
def krr_app(reg=False):
    """Apply KRR to all benchmark data sets and keep the best kernel for each.

    For every data set the training and test files are loaded from
    ``ps3_datasets/`` and a cross validation is run once per kernel
    ('gaussian', 'polynomial', 'linear') with the regularization grid fixed
    to ``[0]``. The result with the smallest ``cvloss`` is kept.

    Args:
        reg: Currently unused; kept so existing callers keep working.

    Returns:
        dict mapping the data set name to a dict with the keys 'cvloss',
        'kernel', 'kernelparameter', 'regularization' and 'ypred' of the
        kernel with the smallest cross validation loss. This function does
        not write the results to disk itself.
    """
    # For console runs this list was trimmed by hand, e.g. to ['image'].
    datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']
    path = 'ps3_datasets/'
    kernels = ['gaussian', 'polynomial', 'linear']
    kernel_params = [np.logspace(-2, 2, 10), np.arange(10), np.arange(10)]

    results = dict()
    for data in datasets:
        Xtr = np.loadtxt(path + 'U04_' + data + '-xtrain.dat')
        Ytr = np.loadtxt(path + 'U04_' + data + '-ytrain.dat')
        Xte = np.loadtxt(path + 'U04_' + data + '-xtest.dat')
        # Unpacking also guards against 1-D input, as the original did.
        d, _ = Xtr.shape
        # Single pre-formatted strings keep the output identical on
        # Python 2 and Python 3.
        print('%s  was loaded with %d dimensions' % (data, d))

        # One estimator instance per data set, shared by all kernels.
        krr = imp.krr()
        candidates = []
        for kernel, grid in zip(kernels, kernel_params):
            params = ['kernel', [kernel],
                      'kernelparam', grid,
                      'regularization', [0]]
            cvkrr = imp.cv(Xtr, Ytr.reshape(1, -1), krr, params,
                           loss_function=squared_error_loss, nrepetitions=2)
            cvkrr.predict(Xte)
            candidates.append({
                'cvloss': cvkrr.cvloss,
                'kernel': kernel,
                'kernelparameter': cvkrr.kernelparameter,
                'regularization': cvkrr.regularization,
                'ypred': cvkrr.ypred,
            })
            print('finished %s kernel on %s' % (kernel, data))

        CVloss = np.zeros(len(kernels))
        for i, candidate in enumerate(candidates):
            CVloss[i] = candidate['cvloss']
        print('CVloss for dataset %s %s' % (data, CVloss))
        # np.argmin picks the first kernel on ties.
        results[data] = candidates[np.argmin(CVloss)]

    return results
def apply_krr(reg=False):
    """Apply KRR to every data set found in the data directory.

    For each data set a cross validation is run for a linear, a polynomial
    and a Gaussian kernel. The model with the smallest cvloss is used to
    predict the test set, its ROC curve is plotted (with the AUC in the
    title), and its parameters and predictions are stored in a dictionary.
    At the end the dictionary is pickled to 'results.p' (reg truthy) or
    'resultsLOOCV.p' (reg falsy).

    Usage: It is important to adapt the path of the datasets.

    Input:
        reg : boolean indicating whether the regularization constant is
              drawn from a provided range (True) or estimated by LOOCV
              (False, the regularization grid is then [0]).

    Data sets are discovered from files named 'U04_<name>-<suffix>.dat' and
    processed in sorted order so that runs are reproducible.

    Author: Till Rohrmann, [email protected]
    """
    # IMPORTANT: Adapt path to where the data sets have been stored
    path = 'ps3_datasets'
    test_suffix = 'xtest'
    train_x_suffix = 'xtrain'
    train_y_suffix = 'ytrain'

    dataset_names = set()
    for filename in os.listdir(path):
        if not os.path.isfile(os.path.join(path, filename)):
            continue
        # Raw string: '\.' is not a valid escape in a normal string literal.
        match = re.search(r'U04_([^\.]*)\.dat', filename)
        if match is not None:
            stem = match.group(1)
            # Drop the trailing '-xtrain' / '-ytrain' / '-xtest' part.
            dataset_names.add(stem[:stem.rfind('-')])

    regularization = np.logspace(-2, 2, 10) if reg else [0]
    # (kernel name, kernel parameter grid); the order decides ties in argmin.
    kernel_grids = [
        ('linear', [0]),
        ('polynomial', np.arange(1, 10)),
        ('gaussian', np.logspace(-2, 2, 10)),
    ]
    nrep = 5
    nfs = 10
    title_prefix = '' if reg else 'LOOCV '

    result = {}
    for dataset in sorted(dataset_names):
        print('Dataset: ' + dataset)

        # training phase
        filename_x = 'U04_' + dataset + '-' + train_x_suffix + '.dat'
        filename_y = 'U04_' + dataset + '-' + train_y_suffix + '.dat'
        filename_test_x = 'U04_' + dataset + '-' + test_suffix + '.dat'
        X = np.loadtxt(os.path.join(path, filename_x), dtype=float)
        Y = np.loadtxt(os.path.join(path, filename_y),
                       dtype=float)[np.newaxis, :]
        test_x = np.loadtxt(os.path.join(path, filename_test_x), dtype=float)
        print('Shape: ' + str(X.shape))

        # One timed cross validation per kernel. imp.cv's return value is
        # ignored; the selected parameters and cvloss are read from the
        # estimator instance that was passed in.
        models = []
        durations = []
        for kernel_name, kernel_params in kernel_grids:
            start_time = time.time()
            model = imp.krr()
            params = ['kernel', [kernel_name],
                      'kernelparam', kernel_params,
                      'regularization', regularization]
            imp.cv(X, Y, model, params, nrepetitions=nrep, nfolds=nfs)
            durations.append(time.time() - start_time)
            models.append(model)

        best = int(np.argmin([model.cvloss for model in models]))
        krr = models[best]
        min_time = durations[best]

        krr.predict(test_x)

        dictionary = {
            'kernel': krr.kernel,
            'kernelparameter': krr.kernelparameter,
            'regularization': krr.regularization,
            'cvloss': krr.cvloss,
            'ypred': krr.ypred,
        }
        result[dataset] = dictionary

        # plot ROC curve and calculate AUC
        params = ['kernel', [krr.kernel],
                  'kernelparam', [krr.kernelparameter],
                  'regularization', [krr.regularization]]
        roc_krr = imp.krr()
        imp.cv(X, Y, roc_krr, params, loss_function=roc_fun,
               nrepetitions=nrep, nfolds=nfs)
        true_positive_rate = roc_krr.cvloss[0]
        false_positive_rate = roc_krr.cvloss[1]

        # Trapezoidal rule for integrating TPR over FPR
        xdiff = false_positive_rate[1:] - false_positive_rate[:-1]
        ysum = (true_positive_rate[1:] + true_positive_rate[:-1]) / 2
        AUC = np.dot(ysum, xdiff)

        pl.figure()
        pl.plot(false_positive_rate, true_positive_rate)
        pl.ylabel('True positive rate')
        pl.xlabel('False positive rate')
        pl.title(
            '%sROC-Curve Dataset:%s AUC=%.3f cvloss:%.3f time:%.1fs'
            '\n Kernel:%s parameter:%.3f regularization:%.3f' % (
                title_prefix, dataset, AUC, dictionary['cvloss'], min_time,
                dictionary['kernel'], dictionary['kernelparameter'],
                dictionary['regularization']))

        print('Dataset:%s kernel:%s cvloss:%s AUC:%s time:%.1f' % (
            dataset, dictionary['kernel'], dictionary['cvloss'], AUC,
            min_time))

    filename = 'results.p' if reg else 'resultsLOOCV.p'
    # Context manager so the file is flushed and closed deterministically.
    with open(filename, 'wb') as handle:
        pickle.dump(result, handle)
def assignment_4():
    """Run assignment 4 (parts b-d) on the five U04 benchmark data sets.

    The data files are read from ``<current working directory>/data``.

    4b: For every data set, cross-validate KRR over two parameter grids
        (linear/polynomial and linear/gaussian kernels, regularization grid
        ``[0]``) with ``zero_one_loss``, keep the model with the lower
        ``cvloss`` and pickle all winners to ``results.p``.
    4c: Re-run the cross validation of each winning configuration with
        ``roc_fun`` as loss and plot the resulting ROC curve.
    4d: Re-run the cross validation of each winning kernel over an explicit
        regularization grid and collect the ``cvloss`` values.
    """
    all_datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']
    # "setting an array element with a sequence" is raised when the number of
    # training samples is not evenly divisible by nfolds. Working around it
    # would mean padding the unequal folds, and the padded indices would
    # either be invalid or reuse a data point, so nfolds is simply chosen per
    # data set such that it divides the number of training samples.
    folds = [10, 9, 9, 10, 10]
    data_dir = os.path.join(os.getcwd(), 'data')

    def load_split(dataset, split):
        # Load 'U04_<dataset>-<split>.dat' from the data directory.
        path_to_data = os.path.join(data_dir,
                                    'U04_%s-%s.dat' % (dataset, split))
        assert os.path.exists(path_to_data), "The path does not exist."
        return np.loadtxt(path_to_data)

    def report_shapes(xtrain, ytrain, xtest=None, ytest=None):
        # Print the array shapes; the test arrays are optional.
        print('Xtrain\n', xtrain.shape)
        print('ytrain\n', ytrain.shape)
        if xtest is not None:
            print('Xtest\n', xtest.shape)
        if ytest is not None:
            print('ytest\n', ytest.shape)

    # 4b - load data
    xtrain_data = []
    ytrain_data = []
    xtest_data = []
    ytest_data = []
    for dataset in all_datasets:
        xtrain_data.append(load_split(dataset, 'xtrain'))
        ytrain_data.append(load_split(dataset, 'ytrain'))
        xtest_data.append(load_split(dataset, 'xtest'))
        ytest_data.append(load_split(dataset, 'ytest'))

    # 4b - GENERATE DICTIONARY RESULTS FOR EACH DATASET
    param_grids = [
        {
            'kernel': ['linear', 'polynomial'],
            'kernelparameter': [1, 2, 3],
            'regularization': [0]
        },
        {
            'kernel': ['linear', 'gaussian'],
            'kernelparameter': [0.1, 0.5, 0.9],
            'regularization': [0]
        },
    ]
    results = {}
    for params in param_grids:
        for (xtrain, ytrain, xtest, ytest, dataset,
             fold) in zip(xtrain_data, ytrain_data, xtest_data, ytest_data,
                          all_datasets, folds):
            report_shapes(xtrain, ytrain, xtest, ytest)
            cvkrr = imp.cv(xtrain.T, ytrain, imp.krr, params,
                           loss_function=zero_one_loss, nfolds=fold,
                           nrepetitions=5)
            y_pred = cvkrr.predict(xtest.T)
            best = results.get(dataset)
            # The first grid always sets the entry; the second grid only
            # replaces it when its loss is strictly lower.
            if best is None or best['cvloss'] > cvkrr.cvloss:
                results[dataset] = {
                    'cvloss': cvkrr.cvloss,
                    'kernel': cvkrr.kernel,
                    'kernelparameter': cvkrr.kernelparameter,
                    'regularization': cvkrr.regularization,
                    'y_pred': y_pred
                }

    # Remove the kernel parameter from linear solutions. This used to be
    # hard-coded for 'flare-solar'; it now follows the kernel that actually
    # won, so a non-linear winner keeps the parameter needed in 4c and 4d.
    for best in results.values():
        if str(best['kernel']) == 'linear':
            best['kernelparameter'] = None

    # Store the results; the context manager closes the file.
    with open('results.p', 'wb') as handle:
        pickle.dump(results, handle)

    # 4c - PLOT ROC CURVES FOR VARYING BIASES
    for (xtrain, ytrain, dataset, fold) in zip(xtrain_data, ytrain_data,
                                               all_datasets, folds):
        report_shapes(xtrain, ytrain)
        best = results[dataset]
        params = {
            'kernel': [str(best['kernel'])],
            'kernelparameter': [best['kernelparameter']],
            'regularization': [best['regularization']]
        }
        cvkrr = imp.cv(xtrain.T, ytrain, imp.krr, params,
                       loss_function=roc_fun, nfolds=fold, nrepetitions=4)
        loss = cvkrr.cvloss
        # Pad both rates with the end points (1, 1) and (0, 0).
        fpr = np.append(loss[0], 0)
        fpr = np.insert(fpr, 0, 1)
        tpr = np.append(loss[1], 0)
        tpr = np.insert(tpr, 0, 1)

        # plot ROC fun
        plt.figure(figsize=(4.5, 4.5))
        plt.plot(fpr, tpr, label='KRR algorithm')
        plt.plot(np.arange(0, 1.1, 0.1), np.arange(0, 1.1, 0.1),
                 label='Random guess')
        plt.ylabel('True Positive Rate (TPR)')
        plt.xlabel('False Positive Rate (FPR)')
        plt.title("%s dataset's average ROC curve from a varying bias"
                  % dataset)
        plt.legend()

    # 4d - COMPARE LOOCV TO CV REGULARISATION
    # NOTE(review): cv_regularisation is collected but neither returned nor
    # used in the visible code -- confirm what should happen with it.
    cv_regularisation = []
    for (xtrain, ytrain, xtest, ytest, dataset,
         fold) in zip(xtrain_data, ytrain_data, xtest_data, ytest_data,
                      all_datasets, folds):
        report_shapes(xtrain, ytrain, xtest, ytest)
        best = results[dataset]
        params = {
            'kernel': [best['kernel']],
            'kernelparameter': [best['kernelparameter']],
            'regularization': np.logspace(-5, 5, 11)
        }
        cvkrr = imp.cv(xtrain.T, ytrain, imp.krr, params,
                       loss_function=zero_one_loss, nfolds=fold,
                       nrepetitions=5)
        # The prediction is not used further; the call is kept as in the
        # original in case predict has side effects on cvkrr.
        cvkrr.predict(xtest.T)
        cv_regularisation.append(cvkrr.cvloss)
def plot_energies_for_1000(self):
    """
    Exercise 3e: under-, well- and overfit for 1000 training samples.

    Reads self.X and self.y. The data is split with a fixed seed into 1000
    training samples and a hold-out rest. A five-fold cross validation on
    the training samples selects the Gaussian kernel width and the
    regularization; the hold-out MAE and the selected parameters are
    printed. Three models are then fitted and their predictions plotted
    against the true values for the training and the hold-out samples:

    1. linear kernel with the cross-validated regularization,
    2. Gaussian kernel with the cross-validated width and regularization,
    3. Gaussian kernel with width 1 and regularization 0.

    The scatter plots use the hold-out split created here (not self.Xte /
    self.Yte), so the plotted 'test' points are disjoint from the training
    points and match the data the printed MAE was computed on.
    """
    # Random partitioning with a fixed seed. Shuffling a plain index list
    # yields the same permutation as shuffling any other sequence of that
    # length with random.Random(4).
    order = list(range(len(self.X)))
    random.Random(4).shuffle(order)
    # Explicit int dtype keeps an empty hold-out slice usable as an index.
    order = np.asarray(order, dtype=int)
    train_idx = order[:1000]
    test_idx = order[1000:]
    Xtr1000 = self.X[train_idx]
    Xte1000 = self.X[test_idx]
    Ytr1000 = self.y[train_idx]
    Yte1000 = self.y[test_idx]

    # Parameters for the good fit: five-fold cross validation, with kernel
    # width candidates taken from the quantiles of the pairwise distances.
    D1000 = np.linalg.norm(Xtr1000[None, :] - Xtr1000[:, None], axis=2)
    quantiles = np.quantile(D1000, [0.1, 0.5, 0.9])
    params = {
        'kernel': ['gaussian'],
        'kernelparameter': quantiles,
        'regularization': np.logspace(-7, 0, 10)
    }
    cvkrr = imp.cv(Xtr1000, Ytr1000, imp.krr, params,
                   loss_function=mean_absolute_error, nfolds=5)
    y_pred1000 = cvkrr.predict(Xte1000)
    MAE = mean_absolute_error(Yte1000, y_pred1000)

    # result of CV
    print("The mean absolute error is: {} ".format(round(MAE, 2)))
    print("The best regularzation parameter C is: {}".format(
        cvkrr.regularization))
    print("The best kernelparameter sigma is: {}".format(
        cvkrr.kernelparameter))
    print("The cvloss: {}".format(cvkrr.cvloss))

    # (kernel, kernelparameter, regularization) of the three models.
    fits = [
        ('linear', False, cvkrr.regularization),
        ('gaussian', cvkrr.kernelparameter, cvkrr.regularization),
        ('gaussian', 1, 0),
    ]

    # plot
    plt.figure(figsize=(10, 6))
    for panel, (kernel, kernelparameter, regularization) in enumerate(
            fits, start=1):
        model = imp.krr(kernel, kernelparameter, regularization)
        model.fit(Xtr1000, Ytr1000)
        y_pred_train = model.predict(Xtr1000)
        y_pred_test = model.predict(Xte1000)
        plt.subplot(1, 3, panel)
        plt.plot(Yte1000, y_pred_test, 'bo')
        plt.plot(Ytr1000, y_pred_train, 'ro')
        plt.xlabel("y_true")
        plt.ylabel("y_pred")
        plt.legend(labels=['test', 'train'])
    plt.tight_layout(pad=3.0)