def pls_cal(dbfile, maskfile, outpath, which_elem, testfold, nc, normtype=1,
            mincomp=0, maxcomp=100, plstype='mlpy', keepfile=None,
            removefile=None, cal_dir=None, masterlist_file=None,
            compfile=None, name_sub_file=None, foldfile=None, nfolds=7,
            seed=None, n_bag=None, skscale=False, n_boost=None,
            max_samples=0.1, n_elems=9):
    """Cross-validate, train and test PLS models for one element.

    The spectra read from ``dbfile`` are filtered, masked and normalized,
    assigned to folds, and split into a test set (fold ``testfold``) and a
    training set (all other non-zero folds).  For every number of components
    from 1 to ``nc`` the function runs leave-one-fold-out cross validation on
    the training set, trains a model on the full training set, and predicts
    the training set, the test set and all spectra.  Results are written to
    files whose names start with ``outpath + which_elem``.

    Parameters
    ----------
    dbfile : passed to ``ccam.read_db``.
    maskfile : passed to ``ccam.mask``.
    outpath : string prepended to every output file name.
    which_elem : label selecting the composition column to model.
    testfold : fold value held out as the test set.
    nc : maximum number of PLS components.
    normtype : passed to ``ccam.normalize``.
    mincomp, maxcomp : passed to ``ccam.choose_spectra`` and ``cal_rmses``.
    plstype : ``'mlpy'`` or ``'sklearn'``; any other value trains nothing.
    keepfile, removefile : passed to ``ccam.choose_spectra``.
    cal_dir : if not None, cal-target data are read from here with
        ``ccam.read_ccs`` and predicted with every model.
    masterlist_file, name_sub_file : passed to ``ccam.target_lookup``.
    compfile : passed to ``ccam.target_comp_lookup``.
    foldfile : if not None, folds come from ``ccam.folds``; otherwise from
        ``ccam.random_folds(names, nfolds, seed=seed)``.
    n_bag : if not None (sklearn only), also fit a ``BaggingRegressor`` with
        this many estimators and ``max_samples``.
    n_boost : if not None (sklearn only), also fit an ``AdaBoostRegressor``
        with this many estimators.
    skscale : if truthy, skip mean centering and pass ``scale=True`` to
        ``PLSRegression``.
    n_elems : passed to ``ccam.read_db``.

    Returns
    -------
    None.  Output is written to CSV, pickle and plot files.
    """
    use_bag = n_bag is not None
    use_boost = n_boost is not None
    have_cal = cal_dir is not None

    # Later suffixes deliberately override earlier ones, as before.
    plstype_string = plstype
    if use_bag:
        plstype_string = plstype + '_bag'
    if use_boost:
        plstype_string = plstype + '_boost'
    if skscale:
        plstype_string = plstype + '_scale'

    # Common stem shared by almost every output file.
    prefix = (outpath + which_elem + '_' + plstype_string + '_nc' + str(nc)
              + '_norm' + str(normtype) + '_' + str(mincomp) + '-'
              + str(maxcomp))
    component_numbers = list(range(1, nc + 1))

    def write_csv(path, rows):
        # The csv module requires text-mode files opened with newline=''.
        with open(path, 'w', newline='') as writefile:
            csv.writer(writefile, delimiter=',').writerows(rows)

    def per_sample_rows(comp_label, sample_names, indices, fold_ids,
                        true_comps, values):
        # Header plus one row per spectrum: identifiers, then one column
        # per number of components.
        yield ['Sample', 'Spectrum', 'Fold', comp_label] + component_numbers
        for name, index, fold_id, comp, vals in zip(
                sample_names, indices, fold_ids, true_comps, values):
            yield [name, index, fold_id, comp] + list(vals)

    def per_component_rows(label, values):
        yield ['NC', label]
        for i in range(nc):
            yield [i + 1, values[i]]

    print('Reading database')
    sys.stdout.flush()
    spectra, comps, spect_index, names, labels, wvl = ccam.read_db(
        dbfile, compcheck=True, n_elems=n_elems)
    oxides = labels[2:]
    compindex = numpy.where(oxides == which_elem)[0]

    print('Choosing spectra')
    which_removed = prefix + '_removed.csv'
    spectra, names, spect_index, comps = ccam.choose_spectra(
        spectra, spect_index, names, comps, compindex, mincomp=mincomp,
        maxcomp=maxcomp, keepfile=keepfile, removefile=removefile,
        which_removed=which_removed)

    print('Masking spectra')
    spectra, wvl = ccam.mask(spectra, wvl, maskfile)

    print('Normalizing spectra')
    spectra = ccam.normalize(spectra, wvl, normtype=normtype)

    print('Assigning Folds')
    if foldfile is not None:
        # if a fold file is specified, use it
        folds = ccam.folds(foldfile, names)
    else:
        # otherwise, define random folds
        folds = ccam.random_folds(names, nfolds, seed=seed)

    # Append the samples not assigned to any fold (fold 0) to the
    # removed-spectra file.
    unassigned = folds == 0
    with open(which_removed, 'a', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        for name, index in zip(names[unassigned], spect_index[unassigned]):
            writer.writerow([name, index, 'No Fold'])

    # remove spectra that are not assigned to any fold
    assigned = folds != 0
    spectra = spectra[assigned, :]
    spect_index = spect_index[assigned]
    names = names[assigned]
    comps = comps[assigned, :]
    folds = folds[assigned]

    print('Defining Training and Test Sets')
    in_train = folds != testfold
    in_test = folds == testfold
    spectra_train = spectra[in_train]
    spect_index_train = spect_index[in_train]
    names_train = names[in_train]
    comps_train = comps[in_train, compindex]
    folds_train = folds[in_train]
    folds_train_unique = numpy.unique(folds_train)
    spectra_test = spectra[in_test]
    spect_index_test = spect_index[in_test]
    names_test = names[in_test]
    comps_test = comps[in_test, compindex]
    folds_test = folds[in_test]

    print('Do Leave One Label Out (LOLO) cross validation with all folds '
          'but the test set')
    train_predict_cv = numpy.zeros((len(names_train), nc))
    RMSECV = numpy.zeros(nc)
    for fold in folds_train_unique:
        print('Holding out fold #' + str(fold))
        held_in = folds_train != fold
        held_out = folds_train == fold
        if skscale:
            X_cv_in = spectra_train[held_in, :]
            X_cv_out = spectra_train[held_out, :]
            Y_cv_in = comps_train[held_in]
            Y_cv_in_mean = 0
        else:
            # Center the held-out spectra with the mean of those left in.
            X_cv_in, X_cv_in_mean = ccam.meancenter(
                spectra_train[held_in, :])
            X_cv_out = ccam.meancenter(
                spectra_train[held_out, :], X_mean=X_cv_in_mean)[0]
            Y_cv_in, Y_cv_in_mean = ccam.meancenter(comps_train[held_in])

        for j in component_numbers:
            print('Training Model for ' + str(j) + ' components')
            if plstype == 'mlpy':
                PLS1model = ccam.mlpy_pls.PLS(j)
                PLS1model.learn(X_cv_in, Y_cv_in)
                train_predict_cv[held_out, j - 1] = (
                    PLS1model.pred(X_cv_out) + Y_cv_in_mean)
            if plstype == 'sklearn':
                PLS1model = PLSRegression(n_components=j, scale=skscale)
                if not use_bag and not use_boost:
                    PLS1model.fit(X_cv_in, Y_cv_in)
                    train_predict_cv[held_out, j - 1] = numpy.squeeze(
                        PLS1model.predict(X_cv_out) + Y_cv_in_mean)
                if use_bag:
                    PLS1bagged = ensemble.BaggingRegressor(
                        PLS1model, n_estimators=n_bag,
                        max_samples=max_samples, verbose=1)
                    PLS1bagged.fit(X_cv_in, Y_cv_in)
                    train_predict_cv[held_out, j - 1] = numpy.squeeze(
                        PLS1bagged.predict(X_cv_out) + Y_cv_in_mean)
                if use_boost:
                    PLS1boosted = ensemble.AdaBoostRegressor(
                        PLS1model, n_estimators=n_boost)
                    PLS1boosted.fit(X_cv_in, Y_cv_in)
                    train_predict_cv[held_out, j - 1] = numpy.squeeze(
                        PLS1boosted.predict(X_cv_out) + Y_cv_in_mean)

    # calculate RMSECV
    for i in range(nc):
        RMSECV[i] = numpy.sqrt(
            numpy.mean((train_predict_cv[:, i] - comps_train) ** 2.0))

    # mean center full model
    if skscale:
        X = spectra_train
        X_test = spectra_test
        X_all = spectra
        Y = comps_train
        Y_mean = 0
    else:
        X, X_mean = ccam.meancenter(spectra_train)
        X_test = ccam.meancenter(spectra_test, X_mean=X_mean)[0]
        X_all = ccam.meancenter(spectra, X_mean=X_mean)[0]
        Y, Y_mean = ccam.meancenter(comps_train)

    # create arrays for results and RMSEs
    trainset_results = numpy.zeros((len(names_train), nc))
    testset_results = numpy.zeros((len(names_test), nc))
    results = numpy.zeros((len(names), nc))
    RMSEP = numpy.zeros(nc)
    RMSEC = numpy.zeros(nc)
    beta = numpy.zeros((len(X[0, :]), nc))
    Q_res = numpy.zeros((len(X[:, 0]), nc))
    T2 = numpy.zeros((len(X[:, 0]), nc))

    if have_cal:
        print('Reading cal target data')
        cal_data, cal_wvl, cal_filelist = ccam.read_ccs(cal_dir)
        cal_data, cal_wvl = ccam.mask(cal_data, cal_wvl, maskfile)
        cal_data = ccam.normalize(cal_data, cal_wvl, normtype=normtype)
        if skscale:
            cal_data_centered = cal_data
        else:
            cal_data_centered = ccam.meancenter(cal_data, X_mean=X_mean)[0]
        RMSEP_cal_good = numpy.zeros(nc)
        RMSEP_KGAMEDS = numpy.zeros(nc)
        RMSEP_MACUSANITE = numpy.zeros(nc)
        RMSEP_NAU2HIS = numpy.zeros(nc)
        RMSEP_NAU2LOS = numpy.zeros(nc)
        RMSEP_NAU2MEDS = numpy.zeros(nc)
        RMSEP_NORITE = numpy.zeros(nc)
        RMSEP_PICRITE = numpy.zeros(nc)
        RMSEP_SHERGOTTITE = numpy.zeros(nc)
        targets, dists, amps, nshots = ccam.target_lookup(
            cal_filelist, masterlist_file, name_sub_file)
        target_comps = ccam.target_comp_lookup(targets, compfile, which_elem)
        cal_results = numpy.zeros((len(targets), nc))

    def store_predictions(j, predict):
        # Record the j-component predictions for every data set and, when
        # cal targets were loaded, their per-target RMSEs.
        trainset_results[:, j - 1] = predict(X)
        testset_results[:, j - 1] = predict(X_test)
        results[:, j - 1] = predict(X_all)
        if have_cal:
            cal_results[:, j - 1] = predict(cal_data_centered)
            (RMSEP_KGAMEDS[j - 1], RMSEP_MACUSANITE[j - 1],
             RMSEP_NAU2HIS[j - 1], RMSEP_NAU2LOS[j - 1],
             RMSEP_NAU2MEDS[j - 1], RMSEP_NORITE[j - 1],
             RMSEP_PICRITE[j - 1], RMSEP_SHERGOTTITE[j - 1],
             RMSEP_cal_good[j - 1]) = cal_rmses(
                 targets, nc, target_comps, j, cal_data_centered, Y_mean,
                 mincomp, maxcomp, cal_results)

    model_list = []
    # Now step through each # of components with the full model
    for j in component_numbers:
        print('Training full model for ' + str(j) + ' components')
        if plstype == 'mlpy':
            PLS1model = ccam.mlpy_pls.PLS(j)
            PLS1model.learn(X, Y)
            beta[:, j - 1] = PLS1model.beta()
            model_list.append([PLS1model])
            store_predictions(
                j, lambda data, m=PLS1model: m.pred(data) + Y_mean)

        if plstype == 'sklearn':
            PLS1model = PLSRegression(n_components=j, scale=skscale)
            if not use_bag and not use_boost:
                PLS1model.fit(X, Y)
                # Hotelling T2 per sample: t_k (T'T)^-1 t_k', with the
                # inverse computed once rather than once per sample.
                T = PLS1model.x_scores_
                scores_inv = numpy.linalg.inv(numpy.dot(T.transpose(), T))
                T2[:, j - 1] = numpy.sum(numpy.dot(T, scores_inv) * T, axis=1)
                # Q residual per sample: squared norm of each row of the
                # part of X not reconstructed from scores and loadings.
                E = X - numpy.dot(PLS1model.x_scores_,
                                  PLS1model.x_loadings_.transpose())
                Q_res[:, j - 1] = numpy.sum(E * E, axis=1)
                # NOTE(review): `coefs` is the attribute the original code
                # reads; confirm it exists in the installed scikit-learn.
                beta[:, j - 1] = numpy.squeeze(PLS1model.coefs)
                model_list.append([PLS1model])
                store_predictions(
                    j, lambda data, m=PLS1model:
                    numpy.squeeze(m.predict(data) + Y_mean))
            if use_bag:
                PLS1bagged = ensemble.BaggingRegressor(
                    PLS1model, n_estimators=n_bag, max_samples=max_samples,
                    verbose=1)
                PLS1bagged.fit(X, Y)
                # An ensemble has no single coefficient vector.
                beta[:, j - 1] = numpy.nan
                model_list.append([PLS1bagged])
                store_predictions(
                    j, lambda data, m=PLS1bagged:
                    numpy.squeeze(m.predict(data) + Y_mean))
            if use_boost:
                PLS1boosted = ensemble.AdaBoostRegressor(
                    PLS1model, n_estimators=n_boost)
                PLS1boosted.fit(X, Y)
                beta[:, j - 1] = numpy.nan
                model_list.append([PLS1boosted])
                store_predictions(
                    j, lambda data, m=PLS1boosted:
                    numpy.squeeze(m.predict(data) + Y_mean))

        RMSEC[j - 1] = numpy.sqrt(
            numpy.mean((trainset_results[:, j - 1] - comps_train) ** 2.0))
        RMSEP[j - 1] = numpy.sqrt(
            numpy.mean((testset_results[:, j - 1] - comps_test) ** 2.0))

    # The pickle name has no '_nc' part, unlike the other outputs.
    pickle_path = (outpath + which_elem + '_' + plstype_string + '_norm'
                   + str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp)
                   + '.pkl')
    with open(pickle_path, 'wb') as picklefile:
        pickle.dump(model_list, picklefile)

    # if cal_dir is specified, summarize and write the cal target RMSEs
    if have_cal:
        per_target = [RMSEP_KGAMEDS, RMSEP_MACUSANITE, RMSEP_NAU2HIS,
                      RMSEP_NAU2LOS, RMSEP_NAU2MEDS, RMSEP_NORITE,
                      RMSEP_PICRITE, RMSEP_SHERGOTTITE]
        # Count the targets whose one-component RMSE is non-zero.
        n_good_cal = numpy.sum(numpy.array(per_target)[:, 0] != 0)
        print(n_good_cal)
        RMSEP_cal = (RMSEP_KGAMEDS + RMSEP_MACUSANITE + RMSEP_NAU2HIS
                     + RMSEP_NAU2LOS + RMSEP_NAU2MEDS + RMSEP_NORITE
                     + RMSEP_PICRITE + RMSEP_SHERGOTTITE) / n_good_cal
        RMSEP_single_cals = per_target + [RMSEP_cal]

        cal_rows = [['File', 'Target', 'Laser Energy', 'True_Comp']
                    + component_numbers]
        for i in range(len(targets)):
            cal_rows.append(
                [cal_filelist[i], targets[i], amps[i], target_comps[i]]
                + list(cal_results[i, :]))
        write_csv(prefix + '_caltargets_predict.csv', cal_rows)
        write_csv(prefix + '_RMSEP_caltargets.csv',
                  per_component_rows('RMSEP Cal Targets (wt.%)', RMSEP_cal))

        ccam.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs',
                  prefix + '_RMSE_plot_cal.png',
                  RMSEP_cals=RMSEP_single_cals)
        ccam.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs',
                  prefix + '_RMSE_plot_cal_good.png',
                  RMSEP_good=RMSEP_cal_good)

    # plot RMSEs
    ccam.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs',
              prefix + '_RMSE_plot.png')

    # Write output info to files
    write_csv(prefix + '_Q_res.csv',
              per_sample_rows('True Comp', names_train, spect_index_train,
                              folds_train, comps_train, Q_res))

    elem_comps = comps[:, compindex]
    write_csv(outpath + which_elem + '_' + str(mincomp) + '-' + str(maxcomp)
              + '_quartiles.csv',
              [[which_elem],
               ['Min', numpy.percentile(elem_comps, 0)],
               ['1st Quartile', numpy.percentile(elem_comps, 25)],
               ['Median', numpy.percentile(elem_comps, 50)],
               ['3rd Quartile', numpy.percentile(elem_comps, 75)],
               ['Max', numpy.percentile(elem_comps, 100)]])

    write_csv(prefix + '_HotellingT2.csv',
              per_sample_rows('True Comp', names_train, spect_index_train,
                              folds_train, comps_train, T2))
    write_csv(prefix + '_RMSECV.csv',
              per_component_rows('RMSECV (wt.%)', RMSECV))
    write_csv(prefix + '_RMSEC.csv',
              per_component_rows('RMSEC (wt.%)', RMSEC))
    write_csv(prefix + '_RMSEP.csv',
              per_component_rows('RMSEP (wt.%)', RMSEP))
    write_csv(prefix + '_cv_predict.csv',
              per_sample_rows('True_Comp', names_train, spect_index_train,
                              folds_train, comps_train, train_predict_cv))
    write_csv(prefix + '_train_predict.csv',
              per_sample_rows('True_Comp', names_train, spect_index_train,
                              folds_train, comps_train, trainset_results))
    write_csv(prefix + '_test_predict.csv',
              per_sample_rows('True_Comp', names_test, spect_index_test,
                              folds_test, comps_test, testset_results))
    write_csv(prefix + '_all_predict.csv',
              per_sample_rows('True_Comp', names, spect_index, folds,
                              elem_comps, results))

    beta_rows = [['wvl'] + component_numbers]
    for i in range(len(wvl)):
        beta_rows.append([wvl[i]] + list(beta[i, :]))
    write_csv(prefix + '_beta_coeffs.csv', beta_rows)

    # Mean-centering values only exist when the spectra were centered here.
    if not skscale:
        center_rows = [[which_elem + ' mean', Y_mean]]
        for i in range(len(wvl)):
            center_rows.append([wvl[i], X_mean[i]])
        write_csv(prefix + '_meancenters.csv', center_rows)

    write_csv(prefix + '_inputinfo.csv',
              [['Spectral database =', dbfile],
               ['Spectra Kept =', keepfile],
               ['Spectra Removed =', which_removed],
               ['Fold Definition =', foldfile],
               ['Test Fold =', testfold],
               ['Mask File =', maskfile],
               ['Algorithm =', plstype_string],
               ['# of components =', nc],
               ['Normalization Type =', normtype],
               ['Composition Min. =', mincomp],
               ['Composition Max. =', maxcomp]])
def pls_cal(dbfile, foldfile, maskfile, outpath, which_elem, testfold, nc,
            normtype=3, mincomp=0, maxcomp=100, plstype='mlpy', keepfile=None,
            removefile=None, cal_dir=None, masterlist_file=None,
            compfile=None, name_sub_file=None):
    """Cross-validate, train and test PLS models for one element.

    The spectra read from ``dbfile`` are filtered, masked and normalized,
    assigned to folds from ``foldfile``, and split into a test set (fold
    ``testfold``) and a training set (all other non-zero folds).  For every
    number of components from 1 to ``nc`` the function runs
    leave-one-fold-out cross validation on the training set, trains a model
    on the full training set, and predicts the training and test sets.
    Results are written to files whose names start with
    ``outpath + which_elem``.

    Parameters
    ----------
    dbfile : passed to ``ccam.read_db``.
    foldfile : passed to ``ccam.folds``.
    maskfile : passed to ``ccam.mask``.
    outpath : string prepended to every output file name.
    which_elem : label selecting the composition column to model.
    testfold : fold value held out as the test set.
    nc : maximum number of PLS components.
    normtype : passed to ``ccam.normalize``.
    mincomp, maxcomp : passed to ``ccam.choose_spectra``; cal targets whose
        composition lies outside this range are zeroed before scoring.
    plstype : ``'mlpy'`` or ``'sklearn'``; any other value trains nothing.
    keepfile, removefile : passed to ``ccam.choose_spectra``.
    cal_dir : if not None, cal-target data are read from here with
        ``ccam.read_ccs`` and predicted with ``ccam.pls_unk``.
    masterlist_file, name_sub_file : passed to ``ccam.target_lookup``.
    compfile : passed to ``ccam.target_comp_lookup``.

    Returns
    -------
    None.  Output is written to CSV and plot files.
    """
    # Common stem shared by most output files.
    prefix = (outpath + which_elem + '_' + plstype + '_nc' + str(nc)
              + '_norm' + str(normtype) + '_' + str(mincomp) + '-'
              + str(maxcomp))
    # The cal-target outputs put the composition range before the algorithm.
    cal_prefix = (outpath + which_elem + '_' + str(mincomp) + '-'
                  + str(maxcomp) + '_' + plstype + '_nc' + str(nc) + '_norm'
                  + str(normtype))
    component_numbers = list(range(1, nc + 1))

    def write_csv(path, rows):
        # The csv module requires text-mode files opened with newline=''.
        with open(path, 'w', newline='') as writefile:
            csv.writer(writefile, delimiter=',').writerows(rows)

    def per_sample_rows(sample_names, indices, fold_ids, true_comps, values):
        # Header plus one row per spectrum: identifiers, then one column
        # per number of components.
        yield ['Sample', 'Spectrum', 'Fold', 'True_Comp'] + component_numbers
        for name, index, fold_id, comp, vals in zip(
                sample_names, indices, fold_ids, true_comps, values):
            yield [name, index, fold_id, comp] + list(vals)

    def per_component_rows(label, values):
        yield ['NC', label]
        for i in range(nc):
            yield [i + 1, values[i]]

    print('Reading database')
    sys.stdout.flush()
    spectra, comps, spect_index, names, labels, wvl = ccam.read_db(
        dbfile, compcheck=True)
    oxides = labels[2:]
    compindex = numpy.where(oxides == which_elem)[0]

    print('Choosing spectra')
    which_removed = prefix + '_removed.csv'
    spectra, names, spect_index, comps = ccam.choose_spectra(
        spectra, spect_index, names, comps, compindex, mincomp=mincomp,
        maxcomp=maxcomp, keepfile=keepfile, removefile=removefile,
        which_removed=which_removed)

    print('Masking spectra')
    spectra, wvl = ccam.mask(spectra, wvl, maskfile)

    print('Normalizing spectra')
    spectra = ccam.normalize(spectra, wvl, normtype=normtype)

    print('Assigning Folds')
    folds = ccam.folds(foldfile, names)

    # Append the samples not assigned to any fold (fold 0) to the
    # removed-spectra file.
    unassigned = folds == 0
    with open(which_removed, 'a', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        for name, index in zip(names[unassigned], spect_index[unassigned]):
            writer.writerow([name, index, 'No Fold'])

    # remove spectra that are not assigned to any fold
    assigned = folds != 0
    spectra = spectra[assigned, :]
    spect_index = spect_index[assigned]
    names = names[assigned]
    comps = comps[assigned, :]
    folds = folds[assigned]

    print('Defining Training and Test Sets')
    in_train = folds != testfold
    in_test = folds == testfold
    spectra_train = spectra[in_train]
    spect_index_train = spect_index[in_train]
    names_train = names[in_train]
    comps_train = comps[in_train, compindex]
    folds_train = folds[in_train]
    folds_train_unique = numpy.unique(folds_train)
    spectra_test = spectra[in_test]
    spect_index_test = spect_index[in_test]
    names_test = names[in_test]
    comps_test = comps[in_test, compindex]
    folds_test = folds[in_test]

    print('Do Leave One Label Out (LOLO) cross validation with all folds '
          'but the test set')
    train_predict_cv = numpy.zeros((len(names_train), nc))
    RMSECV = numpy.zeros(nc)
    for fold in folds_train_unique:
        print('Holding out fold #' + str(fold))
        held_in = folds_train != fold
        held_out = folds_train == fold
        # Center the held-out spectra with the mean of those left in.
        X_cv_in, X_cv_in_mean = ccam.meancenter(spectra_train[held_in, :])
        X_cv_out = ccam.meancenter(
            spectra_train[held_out, :], X_mean=X_cv_in_mean)[0]
        Y_cv_in, Y_cv_in_mean = ccam.meancenter(comps_train[held_in])

        for j in component_numbers:
            print('Training PLS Model for ' + str(j) + ' components')
            if plstype == 'mlpy':
                PLS1model = mlpy.pls.PLS(j)
                PLS1model.learn(X_cv_in, Y_cv_in)
                train_predict_cv[held_out, j - 1] = (
                    PLS1model.pred(X_cv_out) + Y_cv_in_mean)
            if plstype == 'sklearn':
                # Column j-1 must come from the j-component model, not nc.
                PLS1model = PLSRegression(n_components=j)
                PLS1model.fit(X_cv_in, Y_cv_in)
                train_predict_cv[held_out, j - 1] = numpy.squeeze(
                    PLS1model.predict(X_cv_out) + Y_cv_in_mean)

    # calculate RMSECV
    for i in range(nc):
        RMSECV[i] = numpy.sqrt(
            numpy.mean((train_predict_cv[:, i] - comps_train) ** 2.0))

    # mean center full model
    X, X_mean = ccam.meancenter(spectra_train)
    X_test = ccam.meancenter(spectra_test, X_mean=X_mean)[0]
    Y, Y_mean = ccam.meancenter(comps_train)

    # create arrays for results and RMSEs
    trainset_results = numpy.zeros((len(names_train), nc))
    testset_results = numpy.zeros((len(names_test), nc))
    RMSEP = numpy.zeros(nc)
    RMSEC = numpy.zeros(nc)
    beta = numpy.zeros((len(X_mean), nc))

    # Now step through each # of components with the full model
    for j in component_numbers:
        print('Training full model for ' + str(j) + ' components')
        if plstype == 'mlpy':
            PLS1model = mlpy.pls.PLS(j)
            PLS1model.learn(X, Y)
            beta[:, j - 1] = PLS1model.beta()
            trainset_results[:, j - 1] = PLS1model.pred(X) + Y_mean
            testset_results[:, j - 1] = PLS1model.pred(X_test) + Y_mean
        if plstype == 'sklearn':
            PLS1model = PLSRegression(n_components=j)
            PLS1model.fit(X, Y)
            trainset_results[:, j - 1] = numpy.squeeze(
                PLS1model.predict(X) + Y_mean)
            testset_results[:, j - 1] = numpy.squeeze(
                PLS1model.predict(X_test) + Y_mean)
            # NOTE(review): beta is not filled for sklearn models, so the
            # beta file and the cal-target predictions below (which use
            # beta) stay at zero for this plstype.
        RMSEC[j - 1] = numpy.sqrt(
            numpy.mean((trainset_results[:, j - 1] - comps_train) ** 2.0))
        RMSEP[j - 1] = numpy.sqrt(
            numpy.mean((testset_results[:, j - 1] - comps_test) ** 2.0))

    # if cal_dir is specified, read cal target data and calculate RMSEs
    if cal_dir is not None:
        cal_data, cal_wvl, cal_filelist = ccam.read_ccs(cal_dir)
        cal_data, cal_wvl = ccam.mask(cal_data, cal_wvl, maskfile)
        cal_data = ccam.normalize(cal_data, cal_wvl, normtype=normtype)
        targets, dists, amps = ccam.target_lookup(
            cal_filelist, masterlist_file, name_sub_file)
        target_comps = ccam.target_comp_lookup(targets, compfile, which_elem)
        cal_results = numpy.zeros((len(targets), nc))

        cal_target_names = ['KGAMEDS', 'MACUSANITE', 'NAU2HIS', 'NAU2LOS',
                            'NAU2MEDS', 'NORITE', 'PICRITE', 'SHERGOTTITE']
        per_target_rmsep = [numpy.zeros(nc) for _ in cal_target_names]

        # Targets outside [mincomp, maxcomp] are zeroed in both the true
        # and the predicted values, so they contribute no error.
        out_of_range = (target_comps < mincomp) | (target_comps > maxcomp)
        comps_copy = copy.copy(target_comps)
        comps_copy[out_of_range] = 0

        for i in range(nc):
            cal_results[:, i] = ccam.pls_unk(
                cal_data, i + 1, beta=beta[:, i], X_mean=X_mean,
                Y_mean=Y_mean)
            cal_results[out_of_range, i] = 0
            for target_name, rmsep in zip(cal_target_names,
                                          per_target_rmsep):
                is_target = targets == target_name
                rmsep[i] = numpy.sqrt(numpy.mean(
                    (cal_results[is_target, i] - comps_copy[is_target]) ** 2))

        # Distinct composition values, minus one for the zeroed entries.
        n_good_cal = len(numpy.unique(comps_copy)) - 1
        RMSEP_cal = sum(per_target_rmsep) / n_good_cal
        RMSEP_single_cals = per_target_rmsep + [RMSEP_cal]

        cal_rows = [['File', 'Target', 'Laser Energy', 'True_Comp']
                    + component_numbers]
        for i in range(len(targets)):
            cal_rows.append(
                [cal_filelist[i], targets[i], amps[i], target_comps[i]]
                + list(cal_results[i, :]))
        write_csv(cal_prefix + '_caltargets_predict.csv', cal_rows)
        write_csv(cal_prefix + '_RMSECP_caltargets.csv',
                  per_component_rows('RMSECP Cal Targets (wt.%)', RMSEP_cal))

        ccam.plots.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs',
                        cal_prefix + '_RMSE_plot_cal.png',
                        RMSEP_cals=RMSEP_single_cals)

    # plot RMSEs
    # NOTE(review): this uses a different plotting entry point than the
    # cal-target plot above; confirm both exist.
    ccam_plots.ccam_plot_RMSE(RMSECV, RMSEP, RMSEC, which_elem + 'RMSEs',
                              prefix + '_RMSE_plot.png')

    # Write output info to files
    print(prefix + '_RMSECV.csv')
    write_csv(prefix + '_RMSECV.csv',
              per_component_rows('RMSECV (wt.%)', RMSECV))
    write_csv(prefix + '_RMSEC.csv',
              per_component_rows('RMSEC (wt.%)', RMSEC))
    write_csv(prefix + '_RMSEP.csv',
              per_component_rows('RMSEP (wt.%)', RMSEP))
    write_csv(prefix + '_cv_predict.csv',
              per_sample_rows(names_train, spect_index_train, folds_train,
                              comps_train, train_predict_cv))
    write_csv(prefix + '_train_predict.csv',
              per_sample_rows(names_train, spect_index_train, folds_train,
                              comps_train, trainset_results))
    write_csv(prefix + '_test_predict.csv',
              per_sample_rows(names_test, spect_index_test, folds_test,
                              comps_test, testset_results))

    beta_rows = [['wvl'] + component_numbers]
    for i in range(len(wvl)):
        beta_rows.append([wvl[i]] + list(beta[i, :]))
    write_csv(prefix + '_beta_coeffs.csv', beta_rows)

    center_rows = [[which_elem + ' mean', Y_mean]]
    for i in range(len(wvl)):
        center_rows.append([wvl[i], X_mean[i]])
    write_csv(prefix + '_meancenters.csv', center_rows)

    write_csv(prefix + '_inputinfo.csv',
              [['Spectral database =', dbfile],
               ['Spectra Kept =', keepfile],
               ['Spectra Removed =', which_removed],
               ['Fold Definition =', foldfile],
               ['Test Fold =', testfold],
               ['Mask File =', maskfile],
               ['Algorithm =', plstype],
               ['# of components =', nc],
               ['Normalization Type =', normtype],
               ['Composition Min. =', mincomp],
               ['Composition Max. =', maxcomp]])