# Cross-validated estimate of the coefficient of determination (R2) of a
# Gaussian process model on the diabetes dataset.
# `datasets` is used below, so it is imported here alongside the other names.
from scikits.learn import datasets
from scikits.learn.gaussian_process import GaussianProcess
from scikits.learn.cross_val import cross_val_score, KFold
from scikits.learn.metrics import r2_score

# Load the dataset from scikits' data sets
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Instantiate a GP model
gp = GaussianProcess(regr='constant', corr='absolute_exponential',
                     theta0=[1e-4] * 10, thetaL=[1e-12] * 10,
                     thetaU=[1e-2] * 10, nugget=1e-2, optimizer='Welch')

# Fit the GP model to the data performing maximum likelihood estimation
gp.fit(X, y)

# Deactivate maximum likelihood estimation for the cross-validation loop
gp.theta0 = gp.theta  # Given correlation parameter = MLE
gp.thetaL, gp.thetaU = None, None  # None bounds deactivate MLE

# Perform a cross-validation estimate of the coefficient of determination using
# the cross_val module using all CPUs available on the machine
K = 20  # folds
R2 = cross_val_score(gp, X, y=y, cv=KFold(y.size, K), n_jobs=-1).mean()
print("The %d-Folds estimate of the coefficient of determination is R2 = %s"
      % (K, R2))
################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([transform], svm.SVC())

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (10, 20, 30, 40, 50, 60, 70, 80, 90, 100)

for percentile in percentiles:
    # The transform is shared with the pipeline, so updating it here changes
    # the number of features the pipeline keeps.
    transform._set_params(percentile=percentile)
    this_scores = cross_val.cross_val_score(clf, X, y)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))

pl.title(
    'Performance of the SVM-Anova varying the percentile of features selected')
pl.xlabel('Percentile')
# The curve shows cross-validation scores, not error rates, so the axis is
# labelled accordingly.
pl.ylabel('Prediction rate')

pl.axis('tight')
pl.show()
def _score_linear_svc(training, targets, labels):
    """Cross-validate a linear SVC, leaving one label group out per fold.

    Returns a ``(accuracy, interval)`` pair: the mean of the per-fold
    cross-validation scores and the value returned by
    ``stats_binom.wilson_score_interval`` for that accuracy.
    """
    clf = svm.SVC(kernel='linear')
    lolo = cv.LeaveOneLabelOut(labels)
    accuracy = mean(cv.cross_val_score(clf, training, targets, cv=lolo))
    # The interval helper receives accuracy * n and n; presumably that is the
    # number of correct predictions out of n -- confirm against stats_binom.
    interval = stats_binom.wilson_score_interval(
        accuracy * len(targets), len(targets), 0.1)
    return accuracy, interval


def classify(data='All_10.dr',plot=[],fig=[],sort=1,L1=''):
    """Score linear-SVC task classification for several netmat feature sets.

    For every task in ``tasks`` the file ``<task>_<data>/out.mat`` is loaded
    and one feature set is built per ``netmat<id>`` variable.  Derived sets
    ('Corr+Amp' and three 'Corr + <name>' concatenations) are appended.  Each
    feature set is then evaluated with leave-one-label-out cross-validation:
    every pair of tasks on the raw features ("No sub"), one multi-class
    problem on mean-subtracted features, and every pair of tasks on the
    mean-subtracted features.

    Parameters
    ----------
    data : str
        Suffix of the per-task directories that hold ``out.mat``.
    plot : object
        When truthy, the accuracies are drawn with ``multibar``.  The default
        is never mutated.
    fig : object
        Figure handed to ``multibar``; a new one is created when this is
        left at ``[]``.
    sort : object
        Forwarded unchanged to ``multibar``.
    L1 : str
        Not used by the active code; it is only referenced by the disabled
        L1Cov / L1Prec feature sets kept as comments below.

    Returns
    -------
    tuple
        ``(results, errorb, netmatnames, xlabels)``: per-feature-set lists of
        accuracies and of interval values, the feature-set names, and the
        comparison labels.
    """
    # reading and parsing labels
    # NOTE(review): label/blk/ind are not used by the rest of this function;
    # the read is kept so a missing label file still fails loudly.
    with open("/home/fs0/madugula/scratch/FC/covarscript/fgcutshncl2.txt") as f:
        label = f.read().splitlines()
    # list(...) keeps this working where map() returns an iterator.
    label = array(list(map(int, label)))
    blk = concatenate(([1], diff(label)))
    blk[blk != 0] = 1
    ind = nonzero(blk)[0]

    mats = {}
    results = []
    errorb = []
    xlabels = []

    tasks = ['r', 't', 'v', 'vt', 'vtbw']
    netmats = ['1', '0', '0a', '2', '3', '4', '5', '6', '7', '8', '9', '10',
               '11', '12']
    #netmats=['1','0','0a','2','5']
    # One name per entry of ``netmats`` (a missing comma used to merge
    # 'ICOV20' and 'ICOV40' into a single name, shifting all later labels).
    netmatnames = ['Corr', 'Cov', 'Amp', 'ICOV', 'ICOV0.1', 'ICOV1', 'ICOV10',
                   'ICOV20', 'ICOV40', 'ICOV60', 'ICOV80', 'ICOV100',
                   'ICOV150', 'ICOV200']
    #netmatnames=['Corr','Cov','Amp','ICOV10']

    for nmm, mm in enumerate(netmats):
        mats[nmm] = []
        targetslist = []
        results.append([])
        errorb.append([])

        for i, task in enumerate(tasks):
            # tmp_task=scipy.io.loadmat(tasks[i]+'_'+data+'/out_tpts_all_150.mat')
            tmp_task = scipy.io.loadmat(task + '_' + data + '/out.mat')
            tmp2 = tmp_task['netmat' + mm]
            mats[nmm].append(tmp2)
            # One target value (the task index) per row of the matrix.
            targetslist.append(ones((tmp2.shape[0])) * i)

    ###
    # create new feature sets
    ###

    nmm = len(mats)
    mats[nmm] = []
    results.append([])
    errorb.append([])

    ### # L1cov
    ###
    ### if L1 != '':
    ###     for i in range(len(tasks)):
    ###         tmp = loadtxt('r_'+data+'/all_conds/cov_' + tasks[i] + '.txt')
    ###         mats[nmm].append((tmp))
    ###     netmatnames.append('L1Cov')
    ###
    ###     nmm+=1
    ###     mats[nmm]=[]
    ###
    ###     results.append([])
    ###
    ### # L1prec
    ###
    ### if L1 != '':
    ###     for i in range(len(tasks)):
    ###         tmp = loadtxt('r_'+data+'/all_conds/prec_' + tasks[i] + '.txt')
    ###         mats[nmm].append((tmp))
    ###
    ###     netmatnames.append('L1Prec')
    ###
    ###     nmm+=1
    ###     mats[nmm]=[]
    ###
    ###     results.append([])

    # corr+amp
    shp = mats[0][0].shape
    nsubs = shp[0]
    nels = shp[1]
    # reshape needs integer dimensions; each row is treated as a flattened
    # square matrix whose diagonal positions are collected in ``diagels``.
    side = int(round(nels ** .5))
    diagels = diag(reshape(arange(nels), (side, side)))

    for i in range(len(tasks)):
        # Work on a copy so the plain 'Corr' features (mats[0]) are not
        # overwritten while building the combined set.
        corrs = mats[0][i].copy()
        for ii in range(nsubs):
            corrs[ii, diagels] = mats[2][i][ii, :]
        mats[nmm].append(corrs)
    netmatnames.append('Corr+Amp')

    # corr+ICOVs
    for ii in r_[1, arange(3, 5)]:
        nmm += 1
        mats[nmm] = []
        results.append([])
        errorb.append([])
        for i in range(len(tasks)):
            mats[nmm].append(c_[mats[0][i], mats[ii][i]])
        netmatnames.append('Corr + ' + netmatnames[ii])

    # now, prediction
    # NOTE(review): xlabels receives the same comparison labels once per
    # feature set, so it is longer than one row of results -- confirm that
    # multibar expects this.
    for nmm in range(len(mats)):
        print(nmm)
        subnum = mats[nmm][0].shape[0]
        ntasks = len(mats[nmm])

        #for i in arange(len(mats[nmm])-1)+1:
        #    matsub.append(mats[nmm][i]-mats[nmm][0])
        # Subtract the across-task mean; it is the same for every task, so
        # compute it once.
        taskmean = mean(mats[nmm], 0)
        matsub = [mats[nmm][i] - taskmean for i in range(ntasks)]

        # Pairwise task comparisons use the same targets and labels.
        pair_targets = concatenate((0 * ones((subnum)), 1 * ones((subnum))))
        pair_labels = concatenate([arange(subnum), arange(subnum)])

        # print("vs rest, no sub ")
        for x, y in combinations(range(ntasks), 2):
            training = concatenate((mats[nmm][x], mats[nmm][y]))
            accuracy, interval = _score_linear_svc(
                training, pair_targets, pair_labels)
            results[nmm].append(accuracy)
            errorb[nmm].append(interval)
            xlabels.append(tasks[x] + ' vs ' + tasks[y] +' No sub' )
            # results=validate(clf,K,training,targets)
            # print(task[x]+" versus "+task[y]+": "+str(results[nmm][-1]))

        # multi-label (the last task is left out, as before)
        training = concatenate(matsub[0:-1])
        targets = concatenate(targetslist[0:-1])
        labels = tile(arange(subnum), [1, len(mats[nmm][0:-1])]).flatten()
        # The interval is now computed from accuracy * n like the pairwise
        # cases; previously the bare accuracy was passed here.
        accuracy, interval = _score_linear_svc(training, targets, labels)
        results[nmm].append(accuracy)
        errorb[nmm].append(interval)
        xlabels.append('Multi' )

        ### # print("Ranking")
        ### for x in range(len(matsub)):
        ###
        ###     training=concatenate([matsub[x],-matsub[x]])
        ###     targets=concatenate((0*ones((subnum)),1*ones((subnum))))
        ###     labels=concatenate([arange(subnum),arange(subnum)])
        ###     clf=svm.SVC(kernel='linear')
        ###     lolo=cv.LeaveOneLabelOut(labels)
        ###
        ###     results[nmm].append(mean(cv.cross_val_score(clf,training,targets,cv=lolo)))
        ###     xlabels.append(tasks[x+1] + ' vs r')
        ###     # results=validate(clf,K,training,targets)
        ###     # print(tasks[x]+"from Rest: "+str(results))

        # print("Subtractions")
        for x, y in combinations(range(len(matsub)), 2):
            training = concatenate((matsub[x], matsub[y]))
            accuracy, interval = _score_linear_svc(
                training, pair_targets, pair_labels)
            results[nmm].append(accuracy)
            errorb[nmm].append(interval)
            xlabels.append(tasks[x] + ' vs ' + tasks[y])
            # results=validate(clf,K,training,targets)
            # print(task[x]+" versus "+task[y]+": "+str(results[nmm][-1]))

    if plot:
        if fig == []:
            fig = plt.figure()
        multibar(array(results), fig, sort=sort, xlabels=xlabels,
                 condlabels=netmatnames, title='Rest Measures',
                 ylabel='Accuracy')

    return results, errorb, netmatnames, xlabels
# Cross-validation score of a Lasso model on the diabetes dataset as a
# function of the regularisation parameter alpha.
from scikits.learn import cross_val, datasets, linear_model

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

lasso = linear_model.Lasso()

alphas = np.logspace(-4, -1, 20)

scores = list()
scores_std = list()

for alpha in alphas:
    lasso.alpha = alpha
    this_scores = cross_val.cross_val_score(lasso, X, y, n_jobs=-1)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

pl.figure(1, figsize=(2.5, 2))
pl.clf()
pl.axes([.1, .25, .8, .7])
pl.semilogx(alphas, scores)
# Dashed band: the fold standard deviation scaled down by 20.
pl.semilogx(alphas, np.array(scores) + np.array(scores_std) / 20, 'b--')
pl.semilogx(alphas, np.array(scores) - np.array(scores_std) / 20, 'b--')
pl.yticks(())
pl.ylabel('CV score')
pl.xlabel('alpha')

# Annotate the best score with its actual value instead of a hard-coded
# number; the leading zero is stripped to keep the '.489'-style label.
best_score = np.max(scores)
pl.axhline(best_score, linestyle='--', color='.5')
pl.text(2e-4, best_score + 1e-4, ('%.3f' % best_score).lstrip('0'))
# Gaussian process model used for the goodness-of-fit study.
gp = GaussianProcess(
    regr='constant',
    corr='absolute_exponential',
    theta0=[1e-4] * 10,
    thetaL=[1e-12] * 10,
    thetaU=[1e-2] * 10,
    nugget=1e-2,
    optimizer='Welch',
    verbose=False)

# Fit the GP model to the data
gp.fit(X, y)

# Reuse the fitted theta as the starting value and clear both bounds before
# the cross-validation runs.
gp.theta0 = gp.theta
gp.thetaL = gp.thetaU = None
gp.verbose = False

# Estimate the leave-one-out predictions using the cross_val module
n_jobs = 2  # the distributing capacity available on the machine
loo_values = cross_val.cross_val_score(
    gp, X, y=y, cv=cross_val.LeaveOneOut(y.size), n_jobs=n_jobs)
y_pred = y + loo_values.ravel()

# Compute the empirical explained variance
Q2 = metrics.explained_variance_score(y, y_pred)

# Goodness-of-fit plot: predictions against observations, with the identity
# line drawn between the smallest and largest observation.
y_range = [y.min(), y.max()]
pl.figure()
pl.title('Goodness-of-fit plot (Q2 = %1.2e)' % Q2)
pl.plot(y, y_pred, 'r.', label='Leave-one-out')
pl.plot(y, gp.predict(X), 'k.', label='Whole dataset (nugget=1e-2)')
pl.plot(y_range, y_range, 'k--')
pl.xlabel('Observations')
pl.ylabel('Predictions')
pl.legend(loc='upper left')
pl.axis('tight')
# Digits dataset: feature matrix and class targets.
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction

# Cross-validation scores of the pipeline with its current parameters.
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)

from scikits.learn.grid_search import GridSearchCV

# Candidate values searched for the 'pca' and 'logistic' pipeline steps.
n_components = [10, 15, 20, 30, 40, 50, 64]
Cs = np.logspace(-4, 4, 16)
param_grid = {'pca__n_components': n_components, 'logistic__C': Cs}

estimator = GridSearchCV(pipe, param_grid, n_jobs=-1)
estimator.fit(X_digits, y_digits)
################################################################################
# Build the estimator: percentile-based univariate feature selection
# ('anova') followed by a support vector classifier ('svc').
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
pipeline_steps = [('anova', transform), ('svc', svm.SVC())]
clf = Pipeline(pipeline_steps)

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = []
score_stds = []
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf._set_params(anova__percentile=percentile)
    # Cross-validation is run with n_jobs=1.
    fold_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(fold_scores.mean())
    score_stds.append(fold_scores.std())

# Mean score per percentile with the fold standard deviation as error bar.
pl.errorbar(percentiles, score_means, np.array(score_stds))
pl.title(
    'Performance of the SVM-Anova varying the percentile of features selected')
pl.xlabel('Percentile')
pl.ylabel('Prediction rate')
pl.axis('tight')
pl.show()
# Author: Vincent Dubourg <*****@*****.**>
# License: BSD style

from scikits.learn import datasets
from scikits.learn.gaussian_process import GaussianProcess
from scikits.learn.cross_val import cross_val_score, KFold

# Diabetes data shipped with the scikit: features X and target y.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

# Instantiate a GP model
gp = GaussianProcess(
    regr='constant',
    corr='absolute_exponential',
    theta0=[1e-4] * 10,
    thetaL=[1e-12] * 10,
    thetaU=[1e-2] * 10,
    nugget=1e-2,
    optimizer='Welch')

# Fit once on the full data, performing maximum likelihood estimation.
gp.fit(X, y)

# Deactivate maximum likelihood estimation for the cross-validation loop:
# start from the fitted correlation parameter and clear both bounds
# (None bounds deactivate MLE).
gp.theta0 = gp.theta
gp.thetaL = None
gp.thetaU = None

# K-fold cross-validation estimate of the coefficient of determination,
# using all CPUs available on the machine.
K = 20  # folds
folds = KFold(y.size, K)
fold_scores = cross_val_score(gp, X, y=y, cv=folds, n_jobs=-1)
R2 = fold_scores.mean()

print("The %d-Folds estimate of the coefficient of determination is R2 = %s"
      % (K, R2))
# Cross-validation score of a Lasso model on the diabetes dataset as a
# function of the regularisation parameter alpha.
from scikits.learn import cross_val, datasets, linear_model

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

lasso = linear_model.Lasso()

alphas = np.logspace(-4, -1, 20)

scores = list()
scores_std = list()

for alpha in alphas:
    lasso.alpha = alpha
    this_scores = cross_val.cross_val_score(lasso, X, y, n_jobs=-1)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

pl.figure(1, figsize=(2.5, 2))
pl.clf()
pl.axes([.1, .25, .8, .7])
pl.semilogx(alphas, scores)
# Dashed band: the fold standard deviation scaled down by 20.
pl.semilogx(alphas, np.array(scores) + np.array(scores_std) / 20, 'b--')
pl.semilogx(alphas, np.array(scores) - np.array(scores_std) / 20, 'b--')
pl.yticks(())
pl.ylabel('CV score')
pl.xlabel('alpha')

# Annotate the best score with its actual value instead of a hard-coded
# number; the leading zero is stripped to keep the '.489'-style label.
best_score = np.max(scores)
pl.axhline(best_score, linestyle='--', color='.5')
pl.text(2e-4, best_score + 1e-4, ('%.3f' % best_score).lstrip('0'))
### Keep only the samples whose target is non-zero.
nonzero_target = y != 0
X, y, session = X[nonzero_target], y[nonzero_target], session[nonzero_target]
n_samples, n_features = X.shape
n_conditions = np.size(np.unique(y))

### Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel and C=1
clf = SVC(kernel='linear', C=1.)

### Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 500
feature_selection = SelectKBest(f_classif, k=500)

### We combine the dimension reduction and the prediction function
anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)])

### Define the cross-validation scheme used for validation.
# Here we use a LeaveOneLabelOut cross-validation on the session, which
# corresponds to a leave-one-session-out
cv = LeaveOneLabelOut(session)

### Compute the prediction accuracy for the different folds (i.e. session)
cv_scores = cross_val_score(anova_svc, X, y, cv=cv, n_jobs=-1,
                            verbose=1, iid=True)

### Return the corresponding mean prediction accuracy
classification_accuracy = np.sum(cv_scores) / float(n_samples)

# A single-argument print call produces the same text whether print is a
# statement or a function; the two parts are joined by the single space the
# print statement used to insert between them.
accuracy_text = "Classification accuracy: %f" % classification_accuracy
chance_text = " / Chance level: %f" % (1. / n_conditions)
print(accuracy_text + " " + chance_text)
import numpy as np from scikits.learn import cross_val, datasets, svm digits = datasets.load_digits() X = digits.data y = digits.target svc = svm.SVC() gammas = np.logspace(-6, -1, 10) scores = list() scores_std = list() for gamma in gammas: svc.gamma = gamma this_scores = cross_val.cross_val_score(svc, X, y, n_jobs=-1) scores.append(np.mean(this_scores)) scores_std.append(np.std(this_scores)) import pylab as pl pl.figure(1, figsize=(2.5, 2)) pl.clf() pl.axes([.1, .25, .8, .7]) pl.semilogx(gammas, scores) pl.semilogx(gammas, np.array(scores) + np.array(scores_std), 'b--') pl.semilogx(gammas, np.array(scores) - np.array(scores_std), 'b--') pl.yticks(()) pl.ylabel('CV score') pl.xlabel('gamma') pl.ylim(0, 1.1) #pl.axhline(np.max(scores), linestyle='--', color='.5') pl.text(gammas[np.argmax(scores)], .9*np.max(scores), '%.3f' % np.max(scores),