def getTStat(X,y,alpha,lam,nSamp=100):
    """Residual-bootstrap t-statistic for elastic net coefficients.

    The model is fit once at (alpha, lam); its centred residuals are
    resampled with replacement (st.sampleWR) and added back to the fitted
    values to create nSamp synthetic responses.  The model is refit on
    each one and the coefficient mean and standard deviation over the
    refits give the statistic.

    X     - regressor matrix (rows are observations, cols are regressors)
    y     - response vector
    alpha - elastic net balance parameter passed to enet.fit
    lam   - elastic net penalty; a single value
    nSamp - number of bootstrap samples

    Returns an array with one entry per column of X holding
    abs(mean coef / sd coef).
    """
    nObs,nRegs = X.shape
    # sd is done by residual bootstrap, so we need the residuals
    enm = enet.fit(X,y,alpha,lambdas=[lam])
    yHat = enm.predict(X)[:,0]
    res = y - yHat
    resCent = res - np.mean(res)

    # draw every bootstrap response before any refit
    ySample = np.zeros((nObs,nSamp))
    for i in range(nSamp):
        ySample[:,i] = yHat + st.sampleWR(resCent)

    # accumulate sum and sum of squares of the coefficients
    sc = np.zeros(nRegs)
    sSqc = np.zeros(nRegs)
    for i in range(nSamp):
        # the active set changes between fits, so map the coefficients
        # back to the original column positions through the indices
        tmpEnm = enet.fit(X,ySample[:,i],alpha,lambdas=[lam])
        sc[tmpEnm.indices] = sc[tmpEnm.indices] + tmpEnm.coef[:,0]
        sSqc[tmpEnm.indices] = sSqc[tmpEnm.indices] + tmpEnm.coef[:,0]**2

    aveCoef = sc/float(nSamp)
    # E[c^2] - E[c]^2 can come out slightly negative through rounding
    # (e.g. a coefficient that is identical on every sample); sqrt would
    # then give NaN, which the floor below cannot catch because NaN
    # comparisons are False.  Clamp at zero first.
    varCoef = np.maximum(sSqc/float(nSamp) - aveCoef**2, 0.0)
    sdCoef = np.sqrt(varCoef)

    # due to the sparsity of lasso it is possible for a coef to be zero
    # on all samples, thus a zero st error; floor it to avoid dividing
    # by zero
    sdCoef[sdCoef<1E-52] = 1E-52
    tStat = np.abs(aveCoef/sdCoef)
    return tStat
def estStErr(self,nSamp=100):
    """Estimate error and coefficient statistics by residual bootstrap.

    Centred residuals (y - yHat) are resampled with st.sampleWR and added
    to the stored fitted values to build nSamp synthetic responses, kept
    in self._ySample.

    If self._notEmpty is true, the regressors are restricted to
    self._coefIndex and, per sample, a 10 fold CV error (fitSampling), a
    null-model CV error (fitSamplingNull) and a refit (enet.fit) are
    computed.  Results are stored in _aveErr, _sdErr, _aveNullErr,
    _sdNullErr, _aveCoef, _sdCoef and _pSup (also _Xhat and _sdXhat).

    Otherwise only the null-model error is estimated and it is reported
    as both the model error and the null error.

    nSamp - number of bootstrap samples
    """
    X = self._X
    y = self._y
    nObs,nRegs = X.shape
    lam = self._lam
    yHat = self._yHat
    # fetched for parity with the original code; not used below
    intercept = self._intercept
    globalCoef = self._globalCoef
    coefIndex = self._coefIndex
    notEmpty = self._notEmpty
    alpha = self._alpha

    # bootstrap residual response samples
    centred = y - yHat
    centred = centred - np.mean(centred)
    ySample = np.zeros((nObs,nSamp))
    self._ySample = ySample
    for col in range(nSamp):
        ySample[:,col] = yHat + st.sampleWR(centred)

    if not notEmpty:
        # empty model: only the null model error can be estimated
        nullTot = 0
        nullSqTot = 0
        for col in range(nSamp):
            nullErr,_ = fitSamplingNull(ySample[:,col],10, method='cv')
            nullTot = nullTot + nullErr
            nullSqTot = nullSqTot + nullErr**2
        meanNull = nullTot/nSamp
        spreadNull = np.sqrt(nullSqTot/nSamp - meanNull**2)
        self._aveNullErr = meanNull
        self._sdNullErr = spreadNull
        self._aveErr = meanNull
        self._sdErr = spreadNull
        return

    # work on the selected subset of regressors
    Xhat = X[:,coefIndex]
    self._Xhat = Xhat
    nObs,nRegsHat = Xhat.shape
    self._sdXhat = np.sqrt(np.var(Xhat,0))

    errTot = 0
    errSqTot = 0
    nullTot = 0
    nullSqTot = 0
    coefTot = np.zeros(nRegsHat)
    coefSqTot = np.zeros(nRegsHat)
    supportTot = np.zeros(nRegsHat)
    for col in range(nSamp):
        ySim = ySample[:,col]

        # cv error of the model on this sample
        err,_,_ = fitSampling(Xhat,ySim,alpha,10,method='cv',lambdas=[lam])
        errTot = errTot + err.mErr[0]
        errSqTot = errSqTot + err.mErr[0]**2

        # cv error of the null model on this sample
        nullErr,_ = fitSamplingNull(ySim,10, method='cv')
        nullTot = nullTot + nullErr
        nullSqTot = nullSqTot + nullErr**2

        # coefficients; the active set varies so go through the indices
        refit = enet.fit(Xhat,ySim, alpha,lambdas=[lam])
        active = refit.indices
        coefs = refit.coef[:,0]
        coefTot[active] = coefTot[active] + coefs
        coefSqTot[active] = coefSqTot[active] + coefs**2

        # support: 1 where the coefficient is (numerically) non zero
        present = np.zeros(len(coefs))
        present[abs(coefs)>1E-25] = 1.0
        supportTot[active] = supportTot[active] + present

    # averages and standard deviations
    meanErr = errTot/nSamp
    self._aveErr = meanErr
    self._sdErr = np.sqrt(errSqTot/nSamp - meanErr**2)
    meanNull = nullTot/nSamp
    self._aveNullErr = meanNull
    self._sdNullErr = np.sqrt(nullSqTot/nSamp - meanNull**2)
    meanCoef = coefTot/nSamp
    self._aveCoef = meanCoef
    self._sdCoef = np.sqrt(coefSqTot/nSamp - meanCoef**2)
    self._pSup = supportTot/nSamp
def fitSampling(regressors, response, alpha, nSamp, method='cv',
        memlimit=None, largest=None, **kwargs):
    """Performs an elastic net constrained linear regression (see
    enet.fit) and estimates its error with the selected sampling
    method, using nSamp samplings.

    methods:
        'cv'     cross validation with nSamp number of folds
        'bs'     bootstrap
        'bs632'  bootstrap 632 (weighted average of bs and training error)

    Extra keyword arguments are forwarded to enet.fit.  A 'lambdas'
    keyword is used for the full fit only; the lambdas reported by the
    full fit are then forced on every resampled fit.

    Returns (err, fullEnm, allVals):
        err      enet.ENetTrainError holding the mean and variance of the
                 mean squared error for every lambda
        fullEnm  the model fit on all the data, with its errors set
        allVals  array (number of lambdas, nSamp) of the per-sample mse

    Raises ValueError if method is not one of the names above.
    """
    # fail before doing any fitting if the method name is unusable
    if method not in ('cv', 'bs', 'bs632'):
        raise ValueError('Sampling method not correct')

    nObs,nRegs = regressors.shape
    # get the full model fit
    fullEnm = enet.fit(regressors, response, alpha, memlimit,
        largest, **kwargs)
    # get the lambda values determined in the full fit
    # (going to force these lambdas for all the resampled fits)
    lam = fullEnm.lambdas
    # the lambdas may have been user defined; drop them so they are not
    # passed twice below ('in'/pop instead of the Python-2-only has_key)
    kwargs.pop('lambdas', None)

    # partition the data via our sampling method
    if method=='cv':
        t,v = st.kFoldCV(range(nObs),nSamp,randomise=True)
    else:
        t,v = st.kRoundBS(range(nObs),nSamp)

    # our error is the mean squared error; we want its expectation and
    # the corresponding variance over the different samplings
    nModels = len(lam)
    smse = np.zeros(nModels)
    sSqmse = np.zeros(nModels)
    allVals = np.zeros((nModels,nSamp))

    # loop through the folds
    for i in range(nSamp):
        # fit on the training part
        X = regressors[t[i]]
        y = response[t[i]]
        enm = enet.fit(X, y, alpha, memlimit,
            largest, lambdas=lam, **kwargs)
        # predict on the validation part
        Xval = regressors[v[i]]
        Yval = response[v[i]]
        nVal = float(len(Yval))
        Yhat = enm.predict(Xval)
        # the transpose makes the rows the models and the cols the
        # observations, so summing along axis 1 gives one mse per model
        mse = np.sum((Yhat.T-Yval)**2,1)/nVal
        smse = smse + mse
        sSqmse = sSqmse + mse**2
        allVals[:,i] = mse

    # average and put the errors in a container
    nSampFlt = float(nSamp)
    meanmse = smse/nSampFlt
    varmse = sSqmse/nSampFlt - meanmse**2
    if method=='bs632':
        # blend the bootstrap error with the resubstitution error
        yhat = fullEnm.predict(regressors)
        resubmse = np.sum((yhat.T-response)**2,1)/float(nObs)
        meanmse = 0.632*meanmse+(1-0.632)*resubmse

    err = enet.ENetTrainError(lam,nSamp,meanmse,varmse,[0],[0],alpha)
    err.setParamName('lambda')

    fullEnm.setErrors(err.mErr)

    return err, fullEnm, allVals
def estModel(XFull,y,nSamp=100,alphaList=np.array([1]),estErr=True,estImp=False,reduceX=False,params=[]):
    """Estimate a mean and standard deviation for an elastic net
    model using bootstrap residual.

    NOTE(review): a later definition of estModel in this file (the one
    taking an extra indType argument) rebinds this name, so at import
    time this version is replaced.

    Note: Bootstrap resampling is used to select model parameters
    (via select), then the bootstrap residual at these params is used
    on the regressor matrix to calculate means and standard errors.

    Note: if estErr then 10 fold CV is used to estimate the prediction
    error at each iteration of the bs.  This is ten extra fits at each
    bs res sample, but reduces the bias in prediction error.  The mean
    and standard deviation of the CV error are then reported.

    Note: if estImp then, for each selected regressor, the CV error
    with that regressor left out (errOut) and with only that regressor
    (errIn) is estimated.

    Note: if reduceX then only the columns with a non zero coefficient
    in the selected model are kept for the bootstrap.

    Note: If params are passed then we assume it is a pair with the
    (lambda,alpha) model parameters.  In this case model selection is
    bypassed and these params are used.

    Returns (solution, enm) where solution is a dict of the statistics
    ('aveCoef', 'sdCoef', 'medCoef', 'pSup', 'indices', plus the error
    keys when estErr and 'errOut'/'errIn' when estImp) and enm is the
    selected model.
    """
    nObs,nRegsFull = XFull.shape
    # select full model values
    if len(params)==2:
        lam,alpha = params
        enm = enet.fit(XFull,y,alpha,lambdas=[lam])[0]
    else:
        enm = select(XFull,y,nSamp,alphaList)
    lam = enm.lambdas[0]
    yHat = enm.predict(XFull)
    # intercept and globalCoef are computed but not used below
    intercept = enm.intercept[0]
    globalCoef =enm.coef[np.abs(enm.coef)>1E-21]
    coefIndex = enm.indices[np.abs(enm.coef)>1E-21]
    alpha = enm.alpha
    # now is when we reduce the x if we need to!
    if reduceX:
        nRegs = len(coefIndex)
        # with nothing selected X stays unset; the nRegs > 0 test below
        # then routes to the empty-model branch
        if nRegs > 0:
            X = XFull[:,coefIndex]
            nObs, _ = X.shape
    else:
        X = XFull
        nRegs = nRegsFull

    # get the bootstrap residual response samples
    res = y - yHat
    resCent = res-np.mean(res)
    ySample = np.zeros((nObs,nSamp))
    for i in range(nSamp):
        resSample = st.sampleWR(resCent)
        ySample[:,i] = yHat+resSample

    if nRegs > 0:
        # residual bs time
        if estErr:
            sumErr = 0
            sumSqErr = 0
            sumNullErr = 0
            sumSqNullErr = 0

        sc = np.zeros(nRegs)
        sSqc = np.zeros(nRegs)
        # one column of coefficients per sample, kept for the median
        ac = lil_matrix((nRegs,nSamp))
        sumSup = np.zeros(nRegs)

        for i in range(nSamp):
            # cv to get the errors
            if estErr:
                err,tmpEnm,tmpallVals = fitSampling(X,ySample[:,i],alpha,10,method='cv',lambdas=[lam])
                sumErr = err.mErr[0] + sumErr
                sumSqErr = err.mErr[0]**2 + sumSqErr
                # cv over this thing to get the null model errors
                nullErr,a = fitSamplingNull(ySample[:,i],10, method='cv')
                sumNullErr = sumNullErr + nullErr
                sumSqNullErr = sumSqNullErr + nullErr**2

            # need the coef
            # they change so we need to map them back to the original
            tmpEnm = enet.fit(X,ySample[:,i], alpha,lambdas=[lam])
            sc[tmpEnm.indices] = sc[tmpEnm.indices] + tmpEnm.coef[:,0]
            sSqc[tmpEnm.indices] = sSqc[tmpEnm.indices] + tmpEnm.coef[:,0]**2
            if len(tmpEnm.indices)>0:
                ac[tmpEnm.indices,i] = tmpEnm.coef
            # find supports (1 where the coefficient is non zero)
            occur = np.zeros(len(tmpEnm.coef[:,0]))
            occur[abs(tmpEnm.coef[:,0])>1E-25] = 1.0
            sumSup[tmpEnm.indices] = sumSup[tmpEnm.indices] + occur

        # get averages and variances
        if estErr:
            aveErr = sumErr/nSamp
            sdErr = np.sqrt(sumSqErr/nSamp - aveErr**2)
            aveNullErr = sumNullErr/nSamp
            sdNullErr = np.sqrt(sumSqNullErr/nSamp - aveNullErr**2)

        aveCoef = sc/nSamp
        sdCoef = np.sqrt(sSqc/nSamp - aveCoef**2)
        # the median of the dense matrix comes back as a matrix, so it
        # is converted to an array and flattened to 1-d
        medCoef = np.array(np.median(ac.todense(),1))[:,0]
        pSup = sumSup/nSamp
        # regressors whose median bootstrap coefficient is non zero;
        # positions are relative to X (reduced space when reduceX)
        indices = np.arange(nRegs)[np.abs(medCoef)>1E-21]

        # put it in a dict for simplicity
        solution = {}
        if estErr:
            solution['aveErr'] = aveErr
            solution['sdErr'] = sdErr
            solution['aveNullErr'] = aveNullErr
            solution['sdNullErr'] = sdNullErr

        if reduceX:
            # need to go back to the original indices
            solution['aveCoef'] = np.zeros(nRegsFull)
            solution['sdCoef'] = np.zeros(nRegsFull)
            solution['medCoef'] = np.zeros(nRegsFull)
            solution['pSup'] = np.zeros(nRegsFull)
            solution['aveCoef'][coefIndex] = aveCoef
            solution['sdCoef'][coefIndex] = sdCoef
            solution['medCoef'][coefIndex] = medCoef
            solution['pSup'][coefIndex] = pSup
            solution['indices'] = coefIndex[indices]
        else:
            solution['aveCoef'] = aveCoef
            solution['sdCoef'] = sdCoef
            solution['medCoef'] = medCoef
            solution['pSup'] = pSup
            solution['indices'] = indices

        nRegsHat = len(indices)
        if nRegsHat>0 and estImp:
            Xhat = X[:,indices]
            # lets do the leave one out importance deal
            errOutHat = np.zeros(nRegsHat)
            if nRegsHat>1:
                for j in range(nRegsHat):
                    Xprime = np.delete(Xhat,j,axis=1)
                    # residual bs time
                    sumErr = 0
                    sumSqErr = 0
                    for i in range(nSamp):
                        # cv to get the errors
                        err,tmpenm,tmpallVals = fitSampling(Xprime,ySample[:,i],alpha,10,method='cv',lambdas=[lam])
                        sumErr = err.mErr[0] + sumErr
                        sumSqErr = err.mErr[0]**2 + sumSqErr
                    errOutHat[j] = sumErr/nSamp
            elif nRegsHat==1:
                # NOTE(review): aveNullErr is only assigned when estErr
                # is true, so this line raises NameError for
                # estErr=False - confirm and fix
                errOutHat[0] = aveNullErr

            # lets do leave only one
            errInHat = np.zeros(nRegsHat)
            for j in range(nRegsHat):
                Xprime = np.zeros((nObs,1))
                Xprime[:,0] = Xhat[:,j]
                # residual bs time
                sumErr = 0
                sumSqErr = 0
                for i in range(nSamp):
                    # cv to get the errors
                    err,tmpenm,tmpallVals = fitSampling(Xprime,ySample[:,i],alpha,10,method='cv',lambdas=[lam])
                    sumErr = err.mErr[0] + sumErr
                    sumSqErr = err.mErr[0]**2 + sumSqErr
                errInHat[j] = sumErr/nSamp

            # NOTE(review): errOut/errIn are sized and indexed in the
            # space of X, so with reduceX they do not line up with
            # solution['indices'], which is in original column space
            errOut = np.zeros(nRegs)
            errOut[indices] = errOutHat
            solution['errOut'] = errOut
            errIn = np.zeros(nRegs)
            errIn[indices] = errInHat
            solution['errIn'] = errIn

    else:
        # no regressors to work with: report the null model only
        solution = {}
        if estErr:
            sumNullErr = 0
            sumSqNullErr = 0
            for i in range(nSamp):
                # cv over this thing to get the null model errors
                nullErr,a = fitSamplingNull(ySample[:,i],10, method='cv')
                sumNullErr = sumNullErr + nullErr
                sumSqNullErr = sumSqNullErr + nullErr**2
            # get averages and variances
            aveNullErr = sumNullErr/nSamp
            sdNullErr = np.sqrt(sumSqNullErr/nSamp - aveNullErr**2)
            # the model error is the null error in this case
            aveErr = aveNullErr
            sdErr = sdNullErr
            solution['aveErr'] = aveErr
            solution['sdErr'] = sdErr
            solution['aveNullErr'] = aveNullErr
            solution['sdNullErr'] = sdNullErr
        solution['aveCoef'] = np.zeros(nRegsFull)
        solution['sdCoef'] = np.zeros(nRegsFull)
        solution['medCoef'] = np.zeros(nRegsFull)
        solution['pSup'] = np.zeros(nRegsFull)
        solution['indices'] = np.array([])

    return solution, enm
def permModelSimple(X,y,nSamp=100,alphaList=np.array([1]),nPerms=1000,reselect=True):
    """Fit an elastic net linear model (default alpha of 1, i.e. LASSO)
    and attach permutation p-values to the estModel solution.

    The model and its bootstrap statistics come from estModel (called
    with estImp=True).  The response is then permuted nPerms times; on
    each permutation the model is either re-selected with select
    (reselect true) or refit at the lambda and alpha of the full model.
    The absolute permuted coefficients form the null distribution that
    gpdPerm.est turns into a p-value; when that gives NaN or 0 the plain
    permutation estimate is used instead.

    NOTE: no standard error is calculated over the permutations, so the
    coefficient is not scaled; the test statistic is simply the absolute
    value of the coefficient itself.

    Returns (solution, enm); solution gains the keys 'coef', 'p' and
    'medPermCoef' (the median permuted statistic of each regressor).
    """
    nObs,nRegs = X.shape
    solution, enm = estModel(X,y,nSamp,alphaList,estImp=True)
    selected = solution['indices']

    fullCoef = np.zeros(nRegs)
    fullCoef[enm.indices] = enm.coef
    solution['coef'] = fullCoef

    lam = enm.lambdas[0]
    alpha = enm.alpha

    pVals = np.ones(nRegs)
    medPermCoef = np.zeros(nRegs)
    if len(selected)>0:
        # observed statistic: abs of the full model coefficient
        stat = np.zeros(nRegs)
        stat[enm.indices] = np.abs(enm.coef)

        # rows are regressors, cols are permutations
        statPerm = lil_matrix((nRegs,nPerms))
        for k in range(nPerms):
            # permute the response (repeats are not tracked)
            yPerm = np.random.permutation(y)
            if reselect:
                enmPerm = select(X,yPerm,nSamp,alphaList)
            else:
                enmPerm = enet.fit(X,yPerm,alpha,lambdas=[lam])[0]
            active = enmPerm.indices
            if len(active)>0:
                # the sparse matrix wants a column, not a 1-d array
                statPerm[active,k] = np.array(np.abs(enmPerm.coef),ndmin=2).T

        for r in range(nRegs):
            # gpdPerm expects a plain vector, not a 1xn matrix
            nullStats = np.array(statPerm[r,:].todense())[0,:]
            medPermCoef[r] = np.median(nullStats)
            pVals[r] = gpdPerm.est(stat[r],nullStats)
            # use standard permutation if this fails
            if np.isnan(pVals[r]) or pVals[r] == 0:
                hits = np.sum(nullStats>=stat[r])+1
                pVals[r] = float(hits)/(float(nPerms))
                if pVals[r]>1.0:
                    pVals[r] = 1.0

    solution['p'] = pVals
    solution['medPermCoef'] = medPermCoef

    return solution, enm
def netTTestPermute(regressors,response,lam,alpha,nperm=1000):
    """Calculates p (significance) values for the regressors in an
    elastic net linear fit; the null assumption is that the regressor
    coefficient is zero.  Calculates a t statistic and performs a
    permutation test; gpdPerm.est is used to turn the permuted
    statistics into a p-value.

    regressors - matrix of regression variables
                 (col-regressors row-observation)
    response   - vector of the response variable
    lam        - scalar float; elastic net lambda (penalty) parameter
    alpha      - scalar float; elastic net alpha (balance) parameter
    nperm      - scalar number of permutations

    Returns (p, tStat, tStatPerm, s):
        p          p values corresponding to the cols of regressors
        tStat      the test statistic, abs(coef)/s
        tStatPerm  test statistics for the random permutations;
                   the rows are regressors, the cols are permutations
        s          the coefficient standard error estimates used to
                   scale tStat (computed from the inverse of X'X and
                   the residual sum of squares)

    Raises ValueError when the number of observations cannot supply
    nperm distinct permutations (n! < nperm).
    """
    import elasticNetLinReg as enet
    n,m = regressors.shape
    # check to see if we have enough observations
    if math.factorial(n)<nperm:
        # adjacent literals instead of a backslash inside the string,
        # which leaked source indentation into the message
        raise ValueError("Not enough observations "
            "for {} permutations".format(nperm))

    # get the enet coef estimates
    coefs = np.zeros(m)
    enm = enet.fit(regressors,response,alpha,lambdas=[lam])
    coefs[enm.indices] = enm.coef

    yHat = enm.predict(regressors)
    # sum of the squared residuals
    srs = np.sum((response.T-yHat)**2)
    # diagonal of the inverse of X'X; depends on the regressors only,
    # so it is reused for every permutation below
    cInv = np.linalg.inv(np.dot(regressors.T,regressors))
    d = np.diag(cInv)
    # coef error estimates
    s = np.sqrt(np.abs((1.0/(n-1))*srs*d))

    # t-statistic
    tStat = np.abs(coefs)/s

    tStatPerm = np.ones((m,nperm))
    for i in range(nperm):
        # permute the response (repeats are not tracked)
        responsePerm = np.random.permutation(response)
        # repeat the calculation of the statistic on the permuted data
        coefsPerm = np.zeros(m)
        enmPerm = enet.fit(regressors,responsePerm,alpha,lambdas=[lam])
        coefsPerm[enmPerm.indices] = enmPerm.coef
        yHatPerm = enmPerm.predict(regressors)
        srsPerm = np.sum((responsePerm.T-yHatPerm)**2)
        sPerm = np.sqrt(np.abs((1.0/(n-1))*srsPerm*d))
        tStatPerm[:,i] = np.abs(coefsPerm)/sPerm

    p = np.ones(m)*2
    for i in range(m):
        p[i] = gpdPerm.est(tStat[i],tStatPerm[i,:])

    return p, tStat, tStatPerm, s
def run(X,y,name):
    """Select an elastic net model by bootstrap, estimate its statistics
    by residual bootstrap and write them to 'SLR2run_<name>.dat'.

    X    - regressor matrix (rows are observations, cols are regressors)
    y    - response vector
    name - tag used in the output file name

    The file holds one tab separated record per line, in this order:
    lambda, alpha, intercept, mean/sd CV error, mean/sd null model CV
    error, sd of y and - only when the selected model is not empty -
    the selected indices, sd of the selected regressors, the full fit
    coefficients, mean/sd bootstrap coefficients, support frequency,
    leave-one-out error, leave-only-one error, p-values and the
    standard errors returned by regStat.netTTestPermute.
    """
    nSamp = 100
    alphaList = np.array([1])#np.arange(.1,1.1,.1)
    nObs,nRegs = X.shape
    sdY = np.sqrt(np.var(y))

    # selection via bootstrap: keep the alpha/lambda with smallest error
    bestMin = 1E10
    for a in alphaList:
        tmpErr,tmpEnm,allVals = fitSampling(X,y,a,nSamp,method='bs')
        tmpErrV = tmpErr.mErr
        tmpMin = np.min(tmpErrV)
        # call form prints the same text under Python 2 and 3
        print(tmpMin)
        if tmpMin < bestMin:
            bestMin = tmpMin
            modelIndex = np.argmin(tmpErrV)
            enm = tmpEnm
            err = tmpErr
            alpha = a

    # important values
    lam = enm.lambdas[modelIndex]
    yHat = enm.predict(X)[:,modelIndex]
    intercept = enm.intercept[modelIndex]
    nonZero = np.abs(enm.coef[:,modelIndex])>1E-21
    globalCoef = enm.coef[nonZero,modelIndex]
    coefIndex = enm.indices[nonZero]
    notEmpty = len(coefIndex) > 0

    # get the bootstrap residual response samples
    res = y - yHat
    resCent = res-np.mean(res)
    ySample = np.zeros((nObs,nSamp))
    for i in range(nSamp):
        ySample[:,i] = yHat + st.sampleWR(resCent)

    def cvErr(Xsub):
        # mean 10 fold CV error of Xsub over all bootstrap responses
        total = 0
        for k in range(nSamp):
            e,_,_ = fitSampling(Xsub,ySample[:,k],alpha,10,method='cv',lambdas=[lam])
            total = e.mErr[0] + total
        return total/nSamp

    if notEmpty:
        # working on subset now
        Xhat = X[:,coefIndex]
        nObs,nRegsHat = Xhat.shape
        sdXhat = np.sqrt(np.var(Xhat,0))

        # residual bs time
        sumErr = 0
        sumSqErr = 0
        sumNullErr = 0
        sumSqNullErr = 0
        sc = np.zeros(nRegsHat)
        sSqc = np.zeros(nRegsHat)
        sumSup = np.zeros(nRegsHat)
        for i in range(nSamp):
            # cv to get the errors
            err,tmpEnm,tmpallVals = fitSampling(Xhat,ySample[:,i],alpha,10,method='cv',lambdas=[lam])
            sumErr = err.mErr[0] + sumErr
            sumSqErr = err.mErr[0]**2 + sumSqErr
            # cv over this thing to get the null model errors
            nullErr,_ = fitSamplingNull(ySample[:,i],10, method='cv')
            sumNullErr = sumNullErr + nullErr
            sumSqNullErr = sumSqNullErr + nullErr**2
            # the active set changes, so map the coefficients back
            # through the indices
            tmpEnm = enet.fit(Xhat,ySample[:,i], alpha,lambdas=[lam])
            sc[tmpEnm.indices] = sc[tmpEnm.indices] + tmpEnm.coef[:,0]
            sSqc[tmpEnm.indices] = sSqc[tmpEnm.indices] + tmpEnm.coef[:,0]**2
            # find supports
            occur = np.zeros(len(tmpEnm.coef[:,0]))
            occur[abs(tmpEnm.coef[:,0])>1E-25] = 1.0
            sumSup[tmpEnm.indices] = sumSup[tmpEnm.indices] + occur

        # get averages and variances
        aveErr = sumErr/nSamp
        sdErr = np.sqrt(sumSqErr/nSamp - aveErr**2)
        aveNullErr = sumNullErr/nSamp
        sdNullErr = np.sqrt(sumSqNullErr/nSamp - aveNullErr**2)
        aveCoef = sc/nSamp
        sdCoef = np.sqrt(sSqc/nSamp - aveCoef**2)
        pSup = sumSup/nSamp

        # leave one out importance: error without regressor j
        codN = np.zeros(nRegsHat)
        if nRegsHat>1:
            for j in range(nRegsHat):
                codN[j] = cvErr(np.delete(Xhat,j,axis=1))
        elif nRegsHat==1:
            # removing the only regressor leaves the null model
            codN[0] = aveNullErr

        # leave only one: error with regressor j alone
        cod1 = np.zeros(nRegsHat)
        for j in range(nRegsHat):
            Xprime = np.zeros((nObs,1))
            Xprime[:,0] = Xhat[:,j]
            cod1[j] = cvErr(Xprime)

        # p-values: permutation test scaled by the ols std err, rather
        # than running a bootstrap for every permutation
        p,tStat,tStatPerm,olsSE = regStat.netTTestPermute(Xhat,y,lam,alpha,nperm=1000)
        n,m = tStatPerm.shape
        # a nan most likely means the gpd fit of the tail failed; use
        # the direct permutation estimate in that case
        for i in range(n):
            if np.isnan(p[i]):
                z = tStatPerm[i,:]
                tmp = np.sum(z>tStat[i])
                p[i] = float(tmp)/float(m)

    else:
        # empty model: only the null model error can be estimated
        sumNullErr = 0
        sumSqNullErr = 0
        for i in range(nSamp):
            nullErr,_ = fitSamplingNull(ySample[:,i],10, method='cv')
            sumNullErr = sumNullErr + nullErr
            sumSqNullErr = sumSqNullErr + nullErr**2
        aveNullErr = sumNullErr/nSamp
        sdNullErr = np.sqrt(sumSqNullErr/nSamp - aveNullErr**2)
        aveErr = aveNullErr
        sdErr = sdNullErr

    # we have it all, lets print it (one record per line, fixed order)
    records = [lam, alpha, intercept, aveErr, sdErr,
        aveNullErr, sdNullErr, sdY]
    if notEmpty:
        records += [coefIndex, sdXhat, globalCoef, aveCoef, sdCoef,
            pSup, codN, cod1, p, olsSE]
    # the with block closes the file even if a write fails
    with open('SLR2run_'+name+'.dat','w') as f:
        for rec in records:
            rec.tofile(f,sep="\t")
            f.write("\n")
def estModel(XFull,y,nSamp=100,alphaList=np.array([1]),indType='coef',estErr=True,estImp=True,reduceX=False,params=(),):
    """Estimate a mean, median and standard deviation for an elastic net
    model using bootstrap residual.

    Bootstrap resampling is used to select model parameters (select),
    then the bootstrap residual at these params is used on the regressor
    matrix to calculate the stats.  nSamp is used for selection and for
    the stat estimates.

    Options
    *indType* determines which stat to use for the indices.  Indices
        report the non zero entries in the sparse regression model.
        Possible types:
        coef - use coefs from the full fit after the selection (default)
        ave  - use the average coefs after the bs; typically includes
               many more regressors, not recommended as the average
               removes the sparsity benefit
        med  - use the median value after the bs; typically fewer
               regressors are chosen than with 'coef'
    if *estErr* then 10 fold CV is used to estimate the prediction
        error at each iteration of the bs.  This is ten extra fits at
        each bs res sample, but reduces the bias in prediction error.
        The mean and standard deviation of the CV error are reported.
    If *estImp* then the importance of each selected regressor is
        estimated.  errOut is the error if the regressor is removed
        (multivariate error); errIn is the error if the regressor is
        alone (univariate error).
    If *reduceX* then the regressor matrix is reduced based on the full
        model fit after selection.  Only non zero coef are kept; much
        faster, but biases the other stats.  Every array in the solution
        ('indices' included) is reported in the column space of XFull.
    If *params* are passed then we assume it is a pair with the
        (lambda,alpha) model parameters.  In this case model selection
        is bypassed and these params are used.

    Returns (solution, enm): the dict of statistics and the selected
    model.  Raises ValueError for an unknown indType.
    """
    nObs,nRegsFull = XFull.shape
    # select full model values
    if len(params)==2:
        lam,alpha = params
        enm = enet.fit(XFull,y,alpha,lambdas=[lam])[0]
    else:
        enm = select(XFull,y,nSamp,alphaList)
    lam = enm.lambdas[0]
    yHat = enm.predict(XFull)
    coefIndex = enm.indices[np.abs(enm.coef)>1E-21]
    alpha = enm.alpha

    # now is when we reduce the x if we need to
    if reduceX:
        nRegs = len(coefIndex)
        # with nothing selected X stays unset and the empty-model
        # branch below is taken
        if nRegs > 0:
            X = XFull[:,coefIndex]
            nObs, _ = X.shape
    else:
        X = XFull
        nRegs = nRegsFull

    # get the bootstrap residual response samples
    res = y - yHat
    resCent = res-np.mean(res)
    ySample = np.zeros((nObs,nSamp))
    for i in range(nSamp):
        ySample[:,i] = yHat + st.sampleWR(resCent)

    if nRegs > 0:
        # residual bs time
        if estErr:
            sumErr = 0
            sumSqErr = 0
            sumNullErr = 0
            sumSqNullErr = 0

        sc = np.zeros(nRegs)
        sSqc = np.zeros(nRegs)
        # one column of coefficients per sample, kept for the median
        ac = lil_matrix((nRegs,nSamp))
        sumSup = np.zeros(nRegs)

        for i in range(nSamp):
            # cv to get the errors
            if estErr:
                err,tmpEnm,tmpallVals = fitSampling(X,ySample[:,i],alpha,10,method='cv',lambdas=[lam])
                sumErr = err.mErr[0] + sumErr
                sumSqErr = err.mErr[0]**2 + sumSqErr
                # cv over this thing to get the null model errors
                nullErr,_ = fitSamplingNull(ySample[:,i],10, method='cv')
                sumNullErr = sumNullErr + nullErr
                sumSqNullErr = sumSqNullErr + nullErr**2

            # the active set changes, so map the coefficients back
            # through the indices
            tmpEnm = enet.fit(X,ySample[:,i], alpha,lambdas=[lam])
            sc[tmpEnm.indices] = sc[tmpEnm.indices] + tmpEnm.coef[:,0]
            sSqc[tmpEnm.indices] = sSqc[tmpEnm.indices] + tmpEnm.coef[:,0]**2
            if len(tmpEnm.indices)>0:
                ac[tmpEnm.indices,i] = tmpEnm.coef
            # find supports (1 where the coefficient is non zero)
            occur = np.zeros(len(tmpEnm.coef[:,0]))
            occur[abs(tmpEnm.coef[:,0])>1E-25] = 1.0
            sumSup[tmpEnm.indices] = sumSup[tmpEnm.indices] + occur

        # get averages and variances
        if estErr:
            aveErr = sumErr/nSamp
            sdErr = np.sqrt(sumSqErr/nSamp - aveErr**2)
            aveNullErr = sumNullErr/nSamp
            sdNullErr = np.sqrt(sumSqNullErr/nSamp - aveNullErr**2)

        aveCoef = sc/nSamp
        sdCoef = np.sqrt(sSqc/nSamp - aveCoef**2)
        # the median of the dense matrix comes back as a matrix, so it
        # is converted to an array and flattened to 1-d
        medCoef = np.array(np.median(ac.todense(),1))[:,0]
        pSup = sumSup/nSamp

        # lets do the selection; `indices` are positions in X
        if indType=='coef':
            if reduceX:
                # X already holds exactly the coefIndex columns, so in
                # its own space every column is selected (using
                # coefIndex here would index X and coefIndex with
                # full-space positions)
                indices = np.arange(nRegs)
            else:
                indices = coefIndex
        elif indType=='med':
            indices = np.arange(nRegs)[np.abs(medCoef)>1E-21]
        elif indType=='ave':
            indices = np.arange(nRegs)[np.abs(aveCoef)>1E-21]
        else:
            raise ValueError('The indType '+indType+' is not valid.')

        # put it in a dict for simplicity
        solution = {}
        if estErr:
            solution['aveErr'] = aveErr
            solution['sdErr'] = sdErr
            solution['aveNullErr'] = aveNullErr
            solution['sdNullErr'] = sdNullErr

        if reduceX:
            # need to go back to the original indices
            solution['aveCoef'] = np.zeros(nRegsFull)
            solution['sdCoef'] = np.zeros(nRegsFull)
            solution['medCoef'] = np.zeros(nRegsFull)
            solution['pSup'] = np.zeros(nRegsFull)
            solution['aveCoef'][coefIndex] = aveCoef
            solution['sdCoef'][coefIndex] = sdCoef
            solution['medCoef'][coefIndex] = medCoef
            solution['pSup'][coefIndex] = pSup
            solution['indices'] = coefIndex[indices]
        else:
            solution['aveCoef'] = aveCoef
            solution['sdCoef'] = sdCoef
            solution['medCoef'] = medCoef
            solution['pSup'] = pSup
            solution['indices'] = indices

        nRegsHat = len(indices)
        if nRegsHat>0 and estImp:
            Xhat = X[:,indices]

            def cvErr(Xsub):
                # mean 10 fold CV error of Xsub over all bs responses
                total = 0
                for k in range(nSamp):
                    e,_,_ = fitSampling(Xsub,ySample[:,k],alpha,10,method='cv',lambdas=[lam])
                    total = e.mErr[0] + total
                return total/nSamp

            # leave one out importance: error without regressor j
            errOutHat = np.zeros(nRegsHat)
            if nRegsHat>1:
                for j in range(nRegsHat):
                    errOutHat[j] = cvErr(np.delete(Xhat,j,axis=1))
            else:
                # removing the only regressor leaves the null model;
                # its error is only available from above when estErr
                # is set, so estimate it here otherwise
                if not estErr:
                    sumNullErr = 0
                    for i in range(nSamp):
                        nullErr,_ = fitSamplingNull(ySample[:,i],10, method='cv')
                        sumNullErr = sumNullErr + nullErr
                    aveNullErr = sumNullErr/nSamp
                errOutHat[0] = aveNullErr

            # leave only one: error with regressor j alone
            errInHat = np.zeros(nRegsHat)
            for j in range(nRegsHat):
                Xprime = np.zeros((nObs,1))
                Xprime[:,0] = Xhat[:,j]
                errInHat[j] = cvErr(Xprime)

            # report in the column space of XFull, like every other
            # entry; without reduceX this equals the space of X
            errOut = np.zeros(nRegsFull)
            errOut[solution['indices']] = errOutHat
            solution['errOut'] = errOut
            errIn = np.zeros(nRegsFull)
            errIn[solution['indices']] = errInHat
            solution['errIn'] = errIn

    else:
        # no regressors to work with: report the null model only
        solution = {}
        if estErr:
            sumNullErr = 0
            sumSqNullErr = 0
            for i in range(nSamp):
                # cv over this thing to get the null model errors
                nullErr,_ = fitSamplingNull(ySample[:,i],10, method='cv')
                sumNullErr = sumNullErr + nullErr
                sumSqNullErr = sumSqNullErr + nullErr**2
            # get averages and variances
            aveNullErr = sumNullErr/nSamp
            sdNullErr = np.sqrt(sumSqNullErr/nSamp - aveNullErr**2)
            # the model error is the null error in this case
            solution['aveErr'] = aveNullErr
            solution['sdErr'] = sdNullErr
            solution['aveNullErr'] = aveNullErr
            solution['sdNullErr'] = sdNullErr
        solution['aveCoef'] = np.zeros(nRegsFull)
        solution['sdCoef'] = np.zeros(nRegsFull)
        solution['medCoef'] = np.zeros(nRegsFull)
        solution['pSup'] = np.zeros(nRegsFull)
        solution['indices'] = np.array([])

    return solution, enm