def pls_train(self, X, Y, verbose=True):
    Xn = simple_normalize(X)
    pls = PLSRegression()
    if verbose:
        print 'fitting canonical pls...'
    pls.fit(Xn, Y)
    return pls
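# A minimal, self-contained sketch of the same pattern as pls_train above:
# normalize X, fit a default PLSRegression, then reuse the fitted model.
# simple_normalize is not shown in this snippet, so plain mean/std scaling
# stands in for it; the import path below is the one used by current
# scikit-learn (older releases, as elsewhere in these snippets, had sklearn.pls).
import numpy as np
from sklearn.cross_decomposition import PLSRegression

X_demo = np.random.normal(size=(50, 5))
Y_demo = X_demo[:, 0] - X_demo[:, 1] + np.random.normal(size=50)
Xn_demo = (X_demo - X_demo.mean(0)) / X_demo.std(0)  # stand-in for simple_normalize
pls_demo = PLSRegression().fit(Xn_demo, Y_demo)
Y_hat_demo = pls_demo.predict(Xn_demo)  # shape (50, 1)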
def plss(X, y, cv, n_components=1):
    """Cross-validated sum of squared errors, per response column, for PLS regression."""
    pls = PLSRegression(n_components=n_components)
    sse = np.zeros(y.shape[1])
    for train, test in cv:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        y0 = y_train.mean(0)
        X0 = X_train.mean(0)
        pls.fit(X_train - X0, y_train - y0)
        sse += np.sum((y_test - y0 - pls.predict(X_test - X0)) ** 2, 0)
    return sse
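# A hedged example of how plss above could be driven: cv is expected to yield
# (train_index, test_index) pairs and y must be 2-D (plss reads y.shape[1]).
# The data here is synthetic, and KFold is taken from sklearn.model_selection
# as in current scikit-learn; older releases exposed it from sklearn.cross_validation.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.cross_decomposition import PLSRegression  # needed by plss itself

X_demo = np.random.normal(size=(60, 8))
y_demo = np.random.normal(size=(60, 2))
cv_demo = list(KFold(n_splits=5).split(X_demo))
sse_per_target = plss(X_demo, y_demo, cv_demo, n_components=2)  # shape (2,)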
def pls_kfold(sample_set, kfold_group_count, max_components, preprocess):
    print "load..."
    l = AttrDict(load('linre_big' + sample_set + '.npz'))
    disa = l.disa
    expa = l.expa
    Y = disa[:, None]
    X = l.flum.T
    X, Y, expa = shuffle(X, Y, expa, random_state=1)

    print "fix..."
    X_err, X = find_peaks(X, l.exa)
    pls = PLSRegression(scale=False, algorithm='svd')
    pls.fit(X=X, Y=Y)
    PC = pls.transform(X.copy())
    PC1 = PC[:, 0]
    good = PC1 > -PC1.std() * 2
    X, Y, expa = X[good, :], Y[good, :], expa[good]

    if preprocess:
        X[X < 0.5] = 0.5
        X = X ** 0.25  # save?

    print "cross-validation..."
    group_count = kfold_group_count(len(disa))
    Ypred4n_components = empty((len(Y), max_components))
    for n_components in arange(max_components) + 1:
        Ypred = empty_like(Y)
        loo = KFold(n=len(Y), k=group_count, indices=False)
        for fit, test in loo:
            pls = PLSRegression(scale=False, algorithm='svd', n_components=n_components)
            pls.fit(X=X[fit].copy(), Y=Y[fit].copy())
            Ypred[test] = pls.predict(X[test].copy())
        Ypred4n_components[:, n_components - 1] = Ypred[:, 0]
        print "done for " + str(n_components) + " components"
    savez('out23/' + preprocess + 'pred.npz', X=X, Y=Y, expa=expa,
          Ypred4n_components=Ypred4n_components)
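# How pls_kfold above might be invoked (hypothetical values, not from the
# original project): sample_set is a tag appended to the .npz filename,
# kfold_group_count is a callable mapping the sample count to the number of
# CV groups (it is called as kfold_group_count(len(disa))), and preprocess is
# a string that both enables the power preprocessing and tags the output file.
# pls_kfold(sample_set='A', kfold_group_count=lambda n: min(n, 11),
#           max_components=20, preprocess='pow_')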
pl.xticks(())
pl.yticks(())
pl.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5
pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coefs with B
print("Estimated B")
print(np.round(pls2.coefs, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
def dict2mean(X, dict):
    # Rebuild a PLSRegression predictor from stored fit attributes
    # (centering means and coefficients) without refitting.
    plsca = PLSRegression(n_components=np.shape(dict['coefs'])[0])
    plsca.x_mean_ = dict['x_mean']
    plsca.y_mean_ = dict['y_mean']
    plsca.coefs = dict['coefs']
    return plsca.predict(X)
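# A sketch of the round trip dict2mean above appears intended for: persist a
# fitted model's centering means and coefficients, then rebuild a predictor
# from the stored dict. The attribute name coefs follows the old scikit-learn
# API used in these snippets; current releases expose coef_ instead and their
# predict() also consults the stored X scaling, so the exact trick may not
# carry over unchanged.
import numpy as np
from sklearn.cross_decomposition import PLSRegression

X_demo = np.random.normal(size=(40, 6))
Y_demo = np.random.normal(size=(40, 2))
fitted = PLSRegression(n_components=2).fit(X_demo, Y_demo)
stored = {'x_mean': X_demo.mean(0), 'y_mean': Y_demo.mean(0), 'coefs': fitted.coef_}
# Y_hat = dict2mean(X_demo, stored)  # relies on the old attribute layout noted above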
np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
pl.legend()
pl.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5
pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print "True B (such that: Y = XB + Err)"
print B
# compare pls2.coefs with B
print "Estimated B"
print np.round(pls2.coefs, 1)
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
yf = []
ym = []
ypca = []
yplsm = []
yplsf = []
X_fmri = scale(np.concatenate(good_data['fmri'], axis=1))
X_meg = scale(np.concatenate(good_data['meg'], axis=1))
for ncomp in max_comps:
    mse_fmri = []
    mse_meg = []
    mse_pca = []
    mse_plsm = []
    mse_plsf = []
    print 'Trying %d components' % ncomp
    plsca = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = sx.iloc[train].tolist()
        y_test = sx.iloc[test].tolist()
        pca = RandomizedPCA(n_components=ncomp, whiten=True)
        clf = LinearRegression().fit(pca.fit_transform(X_fmri_train), y_train)
        mse_fmri.append(mean_squared_error(clf.predict(pca.transform(X_fmri_test)), y_test))
from sklearn.cross_validation import ShuffleSplit
from sklearn.pls import PLSRegression
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor

nobs = X_meg.shape[0]
max_comps = range(2, 30, 2)
nfolds = 50
cv = ShuffleSplit(nobs, n_iter=nfolds, test_size=.1)

# Trying the prediction with different components
comp_scores = []
dumb_scores = []
for ncomp in max_comps:
    print 'Trying %d components' % ncomp
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')
    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)
        mae += mean_absolute_error(X_meg_test, pred)
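# One plausible continuation of the truncated loop above (a guess, not the
# original code): score the mean-only baseline on the same split, then average
# both errors over the folds once the inner loop finishes.
#         dumb.fit(X_fmri_train, X_meg_train)
#         dumb_pred = dumb.predict(X_fmri_test)
#         dumb_mae += mean_absolute_error(X_meg_test, dumb_pred)
#     comp_scores.append(mae / nfolds)
#     dumb_scores.append(dumb_mae / nfolds)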
from dataset_creator import DatasetCreator

params = {"LAMBDA": 0.4, "dimension": 4096}
c = DatasetCreator(dtk_params=params, encoder_params=[4096, 3])
D = c.get_d()
n = len(D[0])
print(D[1])
train_X, train_Y = D[0][:n / 2], D[1][:n / 2]
test_X, test_Y = D[0][n / 2:], D[1][n / 2:]
pls2 = PLSRegression()
pls2.fit(train_X, train_Y)
# print(pls2.coefs)
pred = pls2.predict(test_X)
mean_err = np.mean((pred - test_Y) ** 2)
print(mean_err)
mean_cos = 0
mean_cos_original = 0
for i, j in zip(pred, test_Y):
    mean_cos = mean_cos + np.dot(i, j) / np.sqrt(np.dot(i, i) * np.dot(j, j))
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor

nobs = X_meg.shape[0]
max_comps = range(5, 30, 5)
nfolds = 50
cv = ShuffleSplit(nobs, n_iter=nfolds, test_size=.1)
y = inatt

# Trying the prediction with different components
comp_scores = []
dumb_scores = []
meg_scores, fmri_scores = [], []
for ncomp in max_comps:
    print 'Trying %d components' % ncomp
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')
    mae = 0
    dumb_mae = 0
    meg_mae, fmri_mae = 0, 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = y[train]
        y_test = y[test]
        X_train = np.hstack([X_fmri_train, X_meg_train])
        X_test = np.hstack([X_fmri_test, X_meg_test])
mds = AttrDict(load("out23/pred.npz"))
X, Y, expa = mds.X, mds.Y, mds.expa
X[X < 0.5] = 0.5  # 0.7
X = X ** 0.25
group_count = 11
n_components_list = range(9, 20)
for n_components in n_components_list:
    Ypred = empty_like(Y)
    loo = KFold(n=len(Y), k=group_count, indices=False)
    for fit, test in loo:
        pls = PLSRegression(scale=False, algorithm='svd', n_components=n_components)
        pls.fit(X=X[fit].copy(), Y=Y[fit].copy())
        Ypred[test] = pls.predict(X[test].copy())
    print n_components, RMSEP(Y[:, 0], Ypred[:, 0])

# n, bins, patches = plt.hist(X.flatten(), 40, range=(0, 2))
# plt.show()

"""
stuff:
print [v for v in X.flatten() if v < -0.4]  # only 3 numbers from X < -0.4
X = (1-X)**2/X*2  # Kubelka-Munk function
"""
"""-
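# RMSEP is not defined in this snippet; a conventional root-mean-squared error
# of prediction that would fit the call above is sketched here under a
# different name, since the project's own RMSEP implementation is not shown.
import numpy as np

def rmsep_sketch(y_true, y_pred):
    """Root mean squared error of prediction (hypothetical stand-in for RMSEP)."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))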
def test_predictions():
    d = load_linnerud()
    X = d.data
    Y = d.target
    tol = 5e-12
    miter = 1000
    num_comp = 2
    Xorig = X.copy()
    Yorig = Y.copy()
    # SSY = np.sum(Yorig**2)
    # center = True
    scale = False

    pls1 = PLSRegression(n_components=num_comp, scale=scale,
                         tol=tol, max_iter=miter, copy=True)
    pls1.fit(Xorig, Yorig)
    Yhat1 = pls1.predict(Xorig)
    SSYdiff1 = np.sum((Yorig - Yhat1) ** 2)
    # print "PLSRegression: R2Yhat = %.4f" % (1 - (SSYdiff1 / SSY))

    # Compare PLSR and sklearn.PLSRegression
    pls3 = PLSR(num_comp=num_comp, center=True, scale=scale,
                tolerance=tol, max_iter=miter)
    pls3.fit(X, Y)
    Yhat3 = pls3.predict(X)
    assert_array_almost_equal(Yhat1, Yhat3, decimal=5,
                              err_msg="PLSR gives wrong prediction")
    SSYdiff3 = np.sum((Yorig - Yhat3) ** 2)
    # print "PLSR : R2Yhat = %.4f" % (1 - (SSYdiff3 / SSY))
    assert abs(SSYdiff1 - SSYdiff3) < 0.00005

    pls2 = PLSCanonical(n_components=num_comp, scale=scale,
                        tol=tol, max_iter=miter, copy=True)
    pls2.fit(Xorig, Yorig)
    Yhat2 = pls2.predict(Xorig)
    SSYdiff2 = np.sum((Yorig - Yhat2) ** 2)
    # print "PLSCanonical : R2Yhat = %.4f" % (1 - (SSYdiff2 / SSY))

    # Compare PLSC and sklearn.PLSCanonical
    pls4 = PLSC(num_comp=num_comp, center=True, scale=scale,
                tolerance=tol, max_iter=miter)
    pls4.fit(X, Y)
    Yhat4 = pls4.predict(X)
    SSYdiff4 = np.sum((Yorig - Yhat4) ** 2)
    # print "PLSC : R2Yhat = %.4f" % (1 - (SSYdiff4 / SSY))

    # Compare O2PLS and sklearn.PLSCanonical
    pls5 = O2PLS(num_comp=[num_comp, 1, 0], center=True, scale=scale,
                 tolerance=tol, max_iter=miter)
    pls5.fit(X, Y)
    Yhat5 = pls5.predict(X)
    SSYdiff5 = np.sum((Yorig - Yhat5) ** 2)
    # print "O2PLS : R2Yhat = %.4f" % (1 - (SSYdiff5 / SSY))

    assert abs(SSYdiff2 - SSYdiff4) < 0.00005
    assert SSYdiff2 > SSYdiff5
def pls(coords, intens):
    print PLSRegression().fit(coords, intens)
disa = l['disa']
exa = l['exa']
expa = l['expa']
Y = disa[:, None]
X = flum.T
X, Y, expa = shuffle(X, Y, expa, random_state=1)

print "fix peaks..."
X_err, X = find_peaks(X, exa)

print "fix outliers..."
pls = PLSRegression(scale=False, algorithm='svd')
pls.fit(X=X, Y=Y)
PC = pls.transform(X.copy())
PC1, PC2 = PC[:, 0], PC[:, 1]
good = PC1 > -PC1.std() * 2
plot_scores(fn='_bad_1', expa=expa, x=PC1, y=PC2, xl='T1', yl='T2', title=', bad')
print expa[logical_not(good)]
X, Y, expa = X[good, :], Y[good, :], expa[good]

print "preprocess with power..."
if preprocess:
    X[X < 0.5] = 0.5
    X = X ** 0.25

print "fit..."
reader.next()
data = np.array(filter(lambda row: '' not in row and 'NA' not in row and '?' not in row,
                       [[convert(row[i]) for i in good_cols] for row in reader if 'evd' in row])).astype(float)
data = data[~np.isnan(data).any(axis=1)]
loc = data[:, 9]
tumor = data[:, 4]
# data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
y = data[:, 7]
x = np.delete(data, 7, 1)
x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
print x.shape

pls1 = PLSRegression(n_components=x.shape[1])
pls1.fit(x, y)
cfs = np.nan_to_num(np.log(pls1.coefs))

fig, (coeffs, dist) = plt.subplots(nrows=1, ncols=2)
coeffs.barh(range(len(cfs)), cfs, edgecolor='k',
            color=['r' if x < 0 else 'g' for x in cfs], linewidth=1)
artist.adjust_spines(coeffs)
coeffs.axvline(x=0, color='k', linestyle='--', linewidth=2)
coeffs.axvline(x=-5.3, color='r', linestyle='--', linewidth=1)
coeffs.axvline(x=5.3, color='r', linestyle='--', linewidth=1)
coeffs.set_xlabel(r'\Large \textbf{Importance} $\left(\log \beta\right)$')
coeffs.set_yticks(range(len(labels)))