def test_krr(self):
    """Plot kernel ridge regression fits on a noisy sinc sample.

    Draws a 2x3 grid: one column per kernel (gaussian, polynomial, linear).
    The top row fits with the fixed regularization value, the bottom row
    fits with ``regularization=0`` (the row the plot labels as
    regularization chosen by efficient cross-validation).
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01).reshape(-1, 1)
    pl.figure()

    kernels = ['gaussian', 'polynomial', 'linear']
    titles = ['gaussian', 'polynomial', 'linear']
    params = [0.5, 6, 0]
    regularizations = [0.01, 0.01, 0.01]
    row_labels = ('fixed regularization', 'reg. by efficent cv')

    columns = zip(kernels, titles, params, regularizations)
    for col, (kernel, title, kernel_param, fixed_reg) in enumerate(columns):
        for row, row_label in enumerate(row_labels):
            pl.subplot(2, 3, 1 + col + 3 * row)
            # Row 0 uses the fixed value, row 1 passes 0 to the estimator.
            chosen_reg = fixed_reg if row == 0 else 0
            model = imp.krr(kernel=kernel,
                            kernelparameter=kernel_param,
                            regularization=chosen_reg)
            model.fit(Xtr, Ytr)
            ypred = model.predict(Xte)
            pl.plot(Xtr, Ytr)
            pl.plot(Xte, ypred)
            # Only the left-most panel of each row carries the row label.
            if col == 0:
                pl.ylabel(row_label)
            pl.title(title)
    pl.show()
def krr_test():
    """Test the class krr by plotting fits on a noisy sinc sample.

    Draws a 2x3 grid: one column per kernel (gaussian, polynomial, linear).
    The top row fits with a fixed regularization value, the bottom row fits
    with ``regularization=0`` (the row the plot labels as regularization
    chosen by efficient cross-validation). After every fit the
    regularization stored on the model is printed.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]
    pl.figure()
    kernels = ["gaussian", "polynomial", "linear"]
    titles = ["gaussian", "polynomial", "linear"]
    params = [0.5, 4, 0]
    regularizations = [0.01, 0.01, 0.01]
    for i in range(3):
        for j in range(2):
            pl.subplot(2, 3, 1 + i + 3 * j)
            krr = imp.krr()
            if j == 0:
                regularization, tag = regularizations[i], "reg_fixed: "
            else:
                regularization, tag = 0, "reg_loocv: "
            krr.fit(Xtr, Ytr, kernel=kernels[i], kernelparameter=params[i],
                    regularization=regularization)
            # Single-argument print(...) parses on Python 2 and 3 alike; the
            # extra blank reproduces the separator of the old print statement.
            print("%s %s" % (tag, krr.regularization))
            krr.predict(Xte)
            pl.plot(Xtr.T, Ytr.T)
            pl.plot(Xte.T, krr.ypred.T)
            if j == 0 and i == 0:
                pl.ylabel("fixed regularization")
            if j == 1 and i == 0:
                pl.ylabel("reg. by efficent cv")
            pl.title(titles[i])
    print("\n(time the test takes on my notebook: approx. 400 milliseconds)")
def plot_MAE_for_different_nsamples(self):
    """Exercise 3d: plot the test MAE for different training-set sizes.

    Reads the kernel, kernel parameter and regularization from
    ``self.cvkrr``, so the cross-validation must have been run before.
    For every size a random subset of the training data is drawn, a fresh
    model is fitted on it and the mean absolute error on
    ``self.Xte`` / ``self.Yte`` is recorded.
    """
    n_samples = [
        100, 300, 600, 900, 1200, 1700, 2000, 2700, 3000, 3900, 4200, 4500,
        4700, 4800, 4900, 4950, 5000
    ]
    MAE = []
    for i in tqdm(n_samples):
        # NOTE(review): the index pool is hard-coded to 5000 rows; confirm
        # that self.Xtr always holds exactly that many samples.
        train_samples = random.sample(range(0, 5000), i)
        Xtr_nsample = self.Xtr[train_samples]
        Ytr_nsample = self.Ytr[train_samples]
        # All three hyper-parameters are passed as plain values; the
        # regularization used to be wrapped in a one-element list.
        model = imp.krr(self.cvkrr.kernel,
                        self.cvkrr.kernelparameter,
                        self.cvkrr.regularization)
        model.fit(Xtr_nsample, Ytr_nsample)
        y_pred = model.predict(self.Xte)
        MAE.append(mean_absolute_error(self.Yte, y_pred))
    plt.figure(figsize=(8, 6))
    plt.plot(n_samples, MAE, 'bo')
    plt.xlabel("n training samples")
    plt.ylabel("Mean Absolute Error [kcal/mol]")
def cv_test():
    """Test the cross validation. Needs a working krr class!

    Runs ``imp.cv`` twice on a noisy sinc sample with a gaussian kernel:
    once over a grid of regularization values and once with the single
    value 0 (the panel titled "CV with efficient LOOCV"). For each run the
    selected kernel parameter and regularization are printed and the
    prediction is plotted next to the training data.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]
    krr = imp.krr()
    pl.figure()
    panels = [
        (np.logspace(-2, 2, 10), "CV with fixed regularization"),
        ([0], "CV with efficient LOOCV"),
    ]
    for position, (regularization, title) in enumerate(panels, 1):
        pl.subplot(1, 2, position)
        params = ["kernel", ["gaussian"],
                  "kernelparam", np.logspace(-2, 2, 10),
                  "regularization", regularization]
        cvkrr = imp.cv(Xtr, Ytr, krr, params,
                       loss_function=squared_error_loss, nrepetitions=2)
        cvkrr.predict(Xte)
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(cvkrr.kernelparameter)
        print(cvkrr.regularization)
        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
        pl.title(title)
    print("\n(time the test takes on my notebook: approx. 6 seconds)")
def cv_test():
    '''Test the cross validation. Needs a working krr class!

    Two side-by-side panels are drawn for a gaussian kernel on a noisy
    sinc sample: the left one cross-validates over a grid of
    regularization values, the right one passes the single value 0.
    '''
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]
    krr = imp.krr()
    pl.figure()

    for panel in (1, 2):
        pl.subplot(1, 2, panel)
        # Left panel: searched regularization grid; right panel: only 0.
        if panel == 1:
            reg_candidates = np.logspace(-2, 2, 10)
        else:
            reg_candidates = [0]
        params = [
            'kernel', ['gaussian'],
            'kernelparam', np.logspace(-2, 2, 10),
            'regularization', reg_candidates,
        ]
        cvkrr = imp.cv(Xtr, Ytr, krr, params,
                       loss_function=squared_error_loss, nrepetitions=2)
        cvkrr.predict(Xte)
        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
def krr_test():
    '''Test the class krr by plotting fits on a noisy sinc sample.

    Draws a 2x3 grid: one column per kernel (gaussian, polynomial, linear).
    The top row fits with a fixed regularization value; the bottom row fits
    with ``regularization=0`` and prints the regularization stored on the
    model afterwards.
    '''
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]
    pl.figure()
    kernels = ['gaussian', 'polynomial', 'linear']
    titles = ['gaussian', 'polynomial', 'linear']
    params = [0.5, 4, 0]
    regularizations = [0.01, 0.01, 0.01]
    for i in range(3):
        for j in range(2):
            pl.subplot(2, 3, 1 + i + 3 * j)
            krr = imp.krr()
            if j == 0:
                krr.fit(Xtr, Ytr, kernel=kernels[i],
                        kernelparameter=params[i],
                        regularization=regularizations[i])
            if j == 1:
                krr.fit(Xtr, Ytr, kernel=kernels[i],
                        kernelparameter=params[i], regularization=0)
                # Single-argument print(...) works on Python 2 and 3.
                print(krr.regularization)
            krr.predict(Xte)
            pl.plot(Xtr.T, Ytr.T)
            pl.plot(Xte.T, krr.ypred.T)
            if j == 0 and i == 0:
                pl.ylabel('fixed regularization')
            if j == 1 and i == 0:
                pl.ylabel('reg. by efficent cv')
            pl.title(titles[i])
def krr_app(reg=False):
    '''Cross-validate krr with three kernels on every benchmark data set.

    For each data set the gaussian, polynomial and linear kernels are
    cross-validated with ``regularization=[0]``; the entry with the
    smallest cross-validation loss is stored in a local ``results`` dict
    under the data set name.

    NOTE(review): the original summary says the result is saved to a file,
    but the code visible here neither writes ``results`` to disk nor
    returns it, and ``reg`` is never read -- confirm whether the tail of
    this function is missing.
    '''
    datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']
    # dataset = ['image']
    # for computing the results via console, the dataset was changed manually
    path = 'ps3_datasets/'
    results = dict()
    for data in datasets:
        Xtr = np.loadtxt(path + 'U04_' + data + '-xtrain.dat')
        Ytr = np.loadtxt(path + 'U04_' + data + '-ytrain.dat')
        Xte = np.loadtxt(path + 'U04_' + data + '-xtest.dat')
        d, _ = Xtr.shape
        # Single-argument print(...) parses on Python 2 and 3; the doubled
        # blank reproduces the output of the former print statement.
        print('%s  was loaded with %d dimensions' % (data, d))
        krr = imp.krr()
        kernels = ['gaussian', 'polynomial', 'linear']
        kernel_params = [np.logspace(-2, 2, 10), np.arange(10), np.arange(10)]
        tmp_results = dict()
        for i, (kernel, kernel_grid) in enumerate(zip(kernels, kernel_params)):
            params = ['kernel', [kernel],
                      'kernelparam', kernel_grid,
                      'regularization', [0]]
            cvkrr = imp.cv(Xtr, Ytr.reshape(1, -1), krr, params,
                           loss_function=squared_error_loss, nrepetitions=2)
            cvkrr.predict(Xte)
            tmp_results[i] = {
                'cvloss': cvkrr.cvloss,
                'kernel': kernel,
                'kernelparameter': cvkrr.kernelparameter,
                'regularization': cvkrr.regularization,
                'ypred': cvkrr.ypred,
            }
            print('finished %s kernel on %s' % (kernel, data))
        CVloss = np.zeros(len(kernels))
        for i in tmp_results:
            CVloss[i] = tmp_results[i]['cvloss']
        print('CVloss for dataset %s %s' % (data, CVloss))
        # Keep only the kernel with the smallest cross-validation loss.
        results[data] = tmp_results[np.argmin(CVloss)]
# NOTE(review): this is a headerless fragment; xTrain, xTest, yTest and
# `file` are defined by code that is not visible here.
yTrain = np.loadtxt("ps3_datasets/U04_banana-ytrain.dat")
n = xTest.shape[1]
# Two-valued label row: 0.5 where yTest is non-negative, 1 elsewhere.
lables = np.ones((1, n))
lables[yTest >= 0] = 0.5
#
# pl.scatter(xTest[0,:],xTest[1,:],c=lables, cmap = cm.jet);
# pl.title('Prediction banana test data set');
#
# pl.figure();
# pl.scatter(xTrain[0,:],xTrain[1,:],c=yTrain, cmap=cm.jet);
# pl.title('Banana training data set');
# Refit a krr with the hyper-parameters stored for the banana data set.
krr = imp.krr(file['banana']['kernel'],
              file['banana']['kernelparameter'],
              file['banana']['regularization'])
krr.fit(xTrain, yTrain)
# Evaluate the model on a square grid over [-3, 3] x [-3, 3]; the grid is
# flattened into a 2 x (number of grid points) matrix for predict().
xInput = np.linspace(-3, 3)
(x, y) = np.meshgrid(xInput, xInput)
x = x.reshape((1, x.size))
y = y.reshape((1, y.size))
X = np.vstack((x, y))
krr.predict(X)
Z = krr.ypred
# np.sqrt returns a float, which reshape() rejects as a dimension on
# current NumPy, so the grid side length is converted to an int first.
sn = int(round(np.sqrt(x.size)))
Z = Z.reshape((sn, sn))
pl.figure()
def apply_krr(reg=False):
    '''
    Apply krr to the provided data sets in order to find a good
    classification. The results are collected in a dictionary which is
    pickled at the end ('results.p' or 'resultsLOOCV.p').

    For every data set a linear, a polynomial and a gaussian kernel are
    cross-validated; the model with the smallest cross-validation loss is
    used to predict the test set, and a ROC curve with its AUC is plotted.

    Usage:
        It is important to adapt the path of the datasets.
    Input:
        reg : boolean variable indicating whether the regularization
              constant shall be estimated by LOOCV or drawn from a provided
              range. True means that the provided range is used and False
              means that the LOOCV will be used.
    Author:
        Till Rohrmann, [email protected]
    '''
    # IMPORTANT: Adapt path to where the data sets have been stored
    path = 'ps3_datasets'
    testSuffix = 'xtest'
    trainXSuffix = 'xtrain'
    trainYSuffix = 'ytrain'

    # Collect the data set names from files called U04_<name>-<suffix>.dat.
    files = [f for f in os.listdir(path)
             if os.path.isfile(os.path.join(path, f))]
    datasetNames = set()
    for filename in files:
        m = re.search(r'U04_([^\.]*)\.dat', filename)
        if m is None:
            continue
        stem = m.group(1)
        # A stem without '-' carries no suffix to strip; slicing with
        # rfind() == -1 would silently drop its last character instead.
        if '-' in stem:
            datasetNames.add(stem.rpartition('-')[0])

    result = {}
    if reg:
        regularization = np.logspace(-2, 2, 10)
    else:
        regularization = [0]
    gaussianKernelParams = np.logspace(-2, 2, 10)
    polynomialKernelParams = np.arange(1, 10)
    nrep = 5
    nfs = 10

    # sorted() makes the processing order independent of set hashing.
    for dataset in sorted(datasetNames):
        print('Dataset: ' + dataset)
        # training phase
        filenameX = 'U04_' + dataset + '-' + trainXSuffix + '.dat'
        filenameY = 'U04_' + dataset + '-' + trainYSuffix + '.dat'
        filenameTestX = 'U04_' + dataset + '-' + testSuffix + '.dat'
        X = np.loadtxt(os.path.join(path, filenameX), dtype=float)
        Y = np.loadtxt(os.path.join(path, filenameY),
                       dtype=float)[np.newaxis, :]
        testX = np.loadtxt(os.path.join(path, filenameTestX), dtype=float)
        print('Shape: ' + str(X.shape))

        # Cross-validate one model per kernel (linear, polynomial,
        # gaussian) and time each run; imp.cv updates the model in place.
        searchSpaces = [
            ['kernel', ['linear'], 'kernelparam', [0],
             'regularization', regularization],
            ['kernel', ['polynomial'], 'kernelparam', polynomialKernelParams,
             'regularization', regularization],
            ['kernel', ['gaussian'], 'kernelparam', gaussianKernelParams,
             'regularization', regularization],
        ]
        models = []
        timings = []
        for cvParams in searchSpaces:
            startTime = time.time()
            model = imp.krr()
            imp.cv(X, Y, model, cvParams, nrepetitions=nrep, nfolds=nfs)
            timings.append(time.time() - startTime)
            models.append(model)

        best = int(np.argmin([model.cvloss for model in models]))
        krr = models[best]
        minTime = timings[best]
        krr.predict(testX)

        dictionary = dict()
        dictionary['kernel'] = krr.kernel
        dictionary['kernelparameter'] = krr.kernelparameter
        dictionary['regularization'] = krr.regularization
        dictionary['cvloss'] = krr.cvloss
        dictionary['ypred'] = krr.ypred
        result[dataset] = dictionary

        # plot ROC curve and calculate AUC
        params = ['kernel', [krr.kernel],
                  'kernelparam', [krr.kernelparameter],
                  'regularization', [krr.regularization]]
        rocKRR = imp.krr()
        imp.cv(X, Y, rocKRR, params, loss_function=roc_fun,
               nrepetitions=nrep, nfolds=nfs)
        truePositiveRate = rocKRR.cvloss[0]
        falsePositiveRate = rocKRR.cvloss[1]

        # Trapezoidal rule for integration
        xdiff = falsePositiveRate[1:] - falsePositiveRate[:-1]
        ysum = (truePositiveRate[1:] + truePositiveRate[:-1]) / 2
        AUC = np.dot(ysum, xdiff)

        pl.figure()
        pl.plot(falsePositiveRate, truePositiveRate)
        pl.ylabel('True positive rate')
        pl.xlabel('False positive rate')
        title = ('ROC-Curve Dataset:' + dataset
                 + ' AUC=' + ('%.3f' % AUC)
                 + ' cvloss:' + ('%.3f' % (dictionary['cvloss']))
                 + ' time:' + str('%.1f' % minTime) + 's'
                 + '\n Kernel:' + dictionary['kernel']
                 + ' parameter:' + ('%.3f' % dictionary['kernelparameter'])
                 + ' regularization:'
                 + ('%.3f' % dictionary['regularization']))
        if not reg:
            title = 'LOOCV ' + title
        pl.title(title)

        print('Dataset:' + dataset
              + ' kernel:' + dictionary['kernel']
              + ' cvloss:' + str(dictionary['cvloss'])
              + ' AUC:' + str(AUC)
              + ' time:' + ('%.1f' % minTime))

    if reg:
        filename = 'results.p'
    else:
        filename = 'resultsLOOCV.p'
    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as handle:
        pickle.dump(result, handle)
def plot_energies_for_1000(self):
    """Exercise 3e: show an under-, well- and overfit for 1000 training samples.

    Splits ``self.X`` / ``self.y`` with a seeded shuffle into 1000 training
    samples and a remainder, cross-validates a gaussian krr on the training
    part, prints the outcome and then plots predicted against true values
    for three hyper-parameter settings.
    """
    # Random partitioning with a fixed seed: shuffle the positions, then
    # take the first 1000 for training and the rest for testing.
    n_total = len(self.X)
    X_pos = np.linspace(0, n_total - 1, n_total)
    random.Random(4).shuffle(X_pos)
    order = X_pos.astype('int')
    train_idx, test_idx = order[:1000], order[1000:]
    Xtr1000, Xte1000 = self.X[train_idx], self.X[test_idx]
    Ytr1000, Yte1000 = self.y[train_idx], self.y[test_idx]

    # Kernel widths to try: quantiles of the pairwise training distances.
    pairwise_diff = Xtr1000[None, :] - Xtr1000[:, None]
    D1000 = np.linalg.norm(pairwise_diff, axis=2)
    quantiles = np.quantile(D1000, [0.1, 0.5, 0.9])

    # Five-fold cross-validation to find the parameters of the good fit.
    search_grid = {
        'kernel': ['gaussian'],
        'kernelparameter': quantiles,
        'regularization': np.logspace(-7, 0, 10)
    }
    cvkrr = imp.cv(Xtr1000, Ytr1000, imp.krr, search_grid,
                   loss_function=mean_absolute_error, nfolds=5)
    y_pred1000 = cvkrr.predict(Xte1000)
    MAE = mean_absolute_error(Yte1000, y_pred1000)

    # Report the result of the cross-validation.
    print("The mean absolute error is: {} ".format(round(MAE, 2)))
    print("The best regularzation parameter C is: {}".format(
        cvkrr.regularization))
    print("The best kernelparameter sigma is: {}".format(
        cvkrr.kernelparameter))
    print("The cvloss: {}".format(cvkrr.cvloss))

    # One (kernel, kernel parameter, regularization) triple per panel.
    settings = [
        ('linear', False, cvkrr.regularization),
        ('gaussian', cvkrr.kernelparameter, cvkrr.regularization),
        ('gaussian', 1, 0),
    ]

    plt.figure(figsize=(10, 6))
    for panel, (kernel, kernel_param, reg) in enumerate(settings, start=1):
        model = imp.krr(kernel, kernel_param, reg)
        model.fit(Xtr1000, Ytr1000)
        y_pred_train = model.predict(Xtr1000)
        # NOTE(review): the test curve uses self.Xte / self.Yte rather than
        # the Xte1000 / Yte1000 split made above -- confirm this is intended.
        y_pred = model.predict(self.Xte)
        plt.subplot(1, 3, panel)
        plt.plot(self.Yte, y_pred, 'bo')
        plt.plot(Ytr1000, y_pred_train, 'ro')
        plt.xlabel("y_true")
        plt.ylabel("y_pred")
        plt.legend(labels=['test', 'train'])
    plt.tight_layout(pad=3.0)