def pls_kfold( sample_set, kfold_group_count, max_components, preprocess ): print "load..."; l = AttrDict(load('linre_big'+sample_set+'.npz')) disa = l.disa expa = l.expa Y = disa[:,None] X = l.flum.T X, Y, expa = shuffle(X, Y, expa, random_state=1) print "fix..."; X_err, X = find_peaks(X,l.exa) pls = PLSRegression( scale=False, algorithm='svd' ) pls.fit(X=X,Y=Y) PC = pls.transform(X.copy()) PC1 = PC[:,0] good = PC1 > -PC1.std()*2 X, Y, expa = X[good,:], Y[good,:], expa[good] if preprocess: X[X<0.5]=0.5 X = X**0.25 #save? print "cross-validation..."; group_count = kfold_group_count(len(disa)) Ypred4n_components = empty((len(Y),max_components)) for n_components in arange(max_components)+1: Ypred = empty_like(Y) loo = KFold( n=len(Y), k=group_count, indices=False ) for fit, test in loo: pls = PLSRegression( scale=False, algorithm='svd', n_components=n_components ) pls.fit( X=X[fit].copy(), Y=Y[fit].copy() ) Ypred[test] = pls.predict(X[test].copy()) Ypred4n_components[:,n_components-1] = Ypred[:,0] print "done for "+str(n_components)+" components" savez('out23/'+preprocess+'pred.npz', X=X, Y=Y, expa=expa, Ypred4n_components=Ypred4n_components )
# NOTE(review): mangled/truncated script fragment -- the original newlines were
# lost and the body of the trailing `if preprocess:` is missing from this chunk.
# It appears to be a script-form duplicate of the pls_kfold pipeline (load ->
# shuffle -> peak fix -> PLS-score outlier rejection -> power preprocessing),
# with extra diagnostics (plot_scores of T1/T2, printing rejected expa).
# Left byte-identical; recover the full text from version control before editing.
print "load..." l = load('linre_big'+sample_set+'.npz') flum = l['flum'] disa = l['disa'] exa = l['exa'] expa = l['expa'] Y = disa[:,None] X = flum.T X, Y, expa = shuffle(X, Y, expa, random_state=1) print "fix peaks..." X_err, X = find_peaks(X,exa) print "fix outliers..." pls = PLSRegression( scale=False, algorithm='svd' ) pls.fit(X=X,Y=Y) PC = pls.transform(X.copy()) PC1, PC2 = PC[:,0], PC[:,1] good = PC1 > -PC1.std()*2 plot_scores(fn='_bad_1', expa=expa, x=PC1,y=PC2, xl='T1',yl='T2', title=', bad') print expa[logical_not(good)] X, Y, expa = X[good,:], Y[good,:], expa[good] print "preprocess with power..." if preprocess:
"""
samples_in_testing_set = 5
n_components = 14
from numpy import load, arange, where
from linre_tools import find_peaks, PCA
from scipy.linalg import lstsq
l = load('linre_big.npz')
flum = l['flum']
disa = l['disa']
exa = l['exa']
expa = l['expa']
X_orig = flum.T
X_err, X_orig = find_peaks(X_orig,exa)
## exclude outliers
PC = PCA(n_components=2).fit_transform(X_orig.copy()) #mean inside
PC1 = PC[:,0]
good_std = PC1 < PC1.std()
a4fit = arange(len(X_orig)) >= samples_in_testing_set
ia4fit, = where( a4fit & good_std )
X4fit = X_orig[ia4fit,:]
disa4fit = disa[ia4fit]
pca = PCA(n_components=n_components)
PC = pca.fit_transform(X4fit.copy())
dis_mean = disa4fit.mean()
#print PC.shape,(disa4fit-dis_mean).shape
(a,residues,rank,s) = lstsq(PC,disa4fit-dis_mean)
def pca_kfold(sample_set,test_by_good_only,kfold_group_count): l = load('linre_big'+sample_set+'.npz') flum = l['flum'] disa = l['disa'] exa = l['exa'] expa = l['expa'] X_orig = flum.T X_err, X_orig = find_peaks(X_orig,exa) X_orig, disa, expa = shuffle(X_orig, disa, expa, random_state=0) ## exclude outliers PC = PCA(n_components=2).fit_transform(X_orig.copy()) #mean inside PC1 = PC[:,0] good_std = PC1 < PC1.std() print expa[~good_std] if test_by_good_only: good_idxa, = where(good_std) good_std = good_std[good_idxa] X_orig = X_orig[good_idxa,:] disa = disa[good_idxa] expa = expa[good_idxa] def pca_calc(ia4fit): X4fit = X_orig[ia4fit,:] pca = PCA(n_components=max_components) PC = pca.fit_transform(X4fit.copy()) return (pca,PC) cache = dict() def cached(f,l): k = tuple(l) if k not in cache: cache[k] = f(l) return cache[k] def make(n_components,ia4fit,ia4test): disa4fit = disa[ia4fit] X4test = X_orig[ia4test,:] (pca,PC) = cached(pca_calc,ia4fit) PC = PC[:,:n_components].copy() dis_mean = disa4fit.mean() (a,residues,rank,s) = lstsq(PC,disa4fit-dis_mean) PC = pca.transform(X4test.copy())[:,:n_components] return PC.dot(a[:,None])[:,0] + dis_mean #returns prediced dis x4plot, y4plot = [], [] group_count = kfold_group_count(len(disa)) disa_pred4n_components = empty((max_components,len(disa))) is_loo = group_count == len(disa) title_method = 'LOO' if is_loo else 'K-Fold '+str(group_count)+' groups' title_n = str(len(disa))+' samples' title_bad = '' if test_by_good_only else ' (inc. 
outliers)' for n_components in arange(max_components)+1: disa_pred = empty_like(disa) loo = KFold( n=len(disa), k=group_count, indices=False ) for train, test in loo: ia4fit, = where( train & good_std ) ia4test, = where( test ) if len(ia4test): disa_pred[ia4test] = make(n_components,ia4fit,ia4test) disa_pred4n_components[n_components-1] = disa_pred RMSEP = sqrt( power((disa_pred-disa),2).sum(axis=0) / len(disa) ) print n_components, RMSEP x4plot.append(n_components) y4plot.append(RMSEP) print 'plot start' plt.grid(True) plt.title(title_method+', '+title_n+title_bad) plt.xlabel('PC Count') plt.ylabel('RMSEP, mg/L') plt.plot(x4plot,y4plot) res_dir = "out18"; res_name = "ts"+sample_set+"g"+str(test_by_good_only)+"k"+str(group_count); savez(res_dir+'/'+res_name+".npz", disa = disa, disa_pred4n_components = disa_pred4n_components, expa = expa ) plt.savefig(res_dir+'/png/'+res_name+".png") plt.savefig(res_dir+'/pdf/'+res_name+".pdf") plt.cla() print 'plot finish'
peaks +-60 are observable with next values ~0; its about to be a noise, we've marked it; its possible to browse samples sorted by dis after pressing '/'; no facts was found found from this; """ sample_set = '2' from numpy import load l = load('linre_big'+sample_set+'.npz') ema, exa, flum, expa, disa = l['ema'], l['exa'], l['flum'], l['expa'], l['disa'] X = flum.T #mark errors from linre_tools import find_peaks X_err, X_wo_peaks = find_peaks(X,exa) #X = X_wo_peaks from linre_explorer import freq3d_explore pga = [e+' '+str(d) for e, d in zip(expa,disa)] is_sorted_by_dis = [False] dis_idxa = list(enumerate(zip(expa,disa))) dis_idxa.sort(key=lambda p:(p[1][0][0],p[1][1])) def pg_indexer(i): return dis_idxa[i][0] if is_sorted_by_dis[0] else i def on_key_inner(k): if k == '/': is_sorted_by_dis[0] = not is_sorted_by_dis[0] def mplot_inner(ax,gi): erra = X_err[gi] ax.plot(exa[erra],ema[erra],0,'ro') ax.set_zlabel ('Fluorescence')