Ejemplo n.º 1
0
def pls_cal(dbfile,maskfile,outpath,which_elem,testfold,nc,normtype=1,mincomp=0,maxcomp=100,
            plstype='mlpy',keepfile=None,removefile=None,cal_dir=None,masterlist_file=None,
            compfile=None,name_sub_file=None,foldfile=None,nfolds=7,seed=None,n_bag=None,
            skscale=False,n_boost=None,max_samples=0.1,n_elems=9):
    """Train PLS calibration models for one element and write the results to disk.

    The function reads the spectral database, selects/masks/normalizes the
    spectra, assigns folds, holds `testfold` out as the test set, runs a
    leave-one-fold-out cross validation over the remaining folds for
    1..`nc` components, trains a full model per number of components on all
    training folds, optionally predicts calibration-target spectra, and
    finally writes a pickle of the models, several CSV files and RMSE plots.
    All output paths start with ``outpath + which_elem``.

    Args:
        dbfile: Passed to ``ccam.read_db``; also recorded in the input-info file.
        maskfile: Passed to ``ccam.mask`` for the database and cal-target spectra.
        outpath: String prefix prepended (by plain concatenation) to every output file.
        which_elem: Label compared against ``labels[2:]`` to pick the composition column.
        testfold: Fold value held out as the test set.
        nc: Maximum number of components; models are built for 1..nc.
        normtype: Passed to ``ccam.normalize``.
        mincomp, maxcomp: Passed to ``ccam.choose_spectra`` and ``cal_rmses``.
        plstype: ``'mlpy'`` or ``'sklearn'``; any other value trains no model.
        keepfile, removefile: Passed to ``ccam.choose_spectra``.
        cal_dir: If not None, cal-target spectra are read from it with
            ``ccam.read_ccs`` and scored with every full model.
        masterlist_file, name_sub_file: Passed to ``ccam.target_lookup``.
        compfile: Passed to ``ccam.target_comp_lookup``.
        foldfile: If not None, folds come from ``ccam.folds``; otherwise
            ``ccam.random_folds(names, nfolds, seed=seed)`` is used.
        nfolds, seed: Used only when `foldfile` is None.
        n_bag: If not None (sklearn only), wrap the PLS model in a
            ``BaggingRegressor`` with this many estimators.
        skscale: If true, spectra and compositions are NOT mean centered here
            and ``scale=skscale`` is passed to ``PLSRegression``.
        n_boost: If not None (sklearn only), wrap the PLS model in an
            ``AdaBoostRegressor`` with this many estimators.
        max_samples: ``max_samples`` argument of the ``BaggingRegressor``.
        n_elems: Passed to ``ccam.read_db``.

    Returns:
        None. All results are written to files.
    """
    # Algorithm tag used in file names. The checks are independent, so when
    # several options are set the last applicable suffix wins.
    plstype_string=plstype
    if n_bag is not None:
        plstype_string=plstype+'_bag'
    if n_boost is not None:
        plstype_string=plstype+'_boost'
    if skscale:
        plstype_string=plstype+'_scale'

    comp_range=str(mincomp)+'-'+str(maxcomp)
    # Common stem of (almost) every output file
    prefix=outpath+which_elem+'_'+plstype_string+'_nc'+str(nc)+'_norm'+str(normtype)+'_'+comp_range
    component_numbers=list(range(1,nc+1))

    def write_rows(path,rows):
        # Write an iterable of rows to a comma-delimited text file
        with open(path,'w',newline='') as writefile:
            csv.writer(writefile,delimiter=',').writerows(rows)

    def sample_table(header,ids,indices,fold_ids,truth,values):
        # One header row followed by one row per sample: four id columns
        # and then one value per number of components
        rows=[header+component_numbers]
        for i in range(len(ids)):
            rows.append([ids[i],indices[i],fold_ids[i],truth[i]]+list(values[i,:]))
        return rows

    def rmse_table(label,values):
        # Two-column table: number of components, RMSE
        return [['NC',label]]+[[i+1,values[i]] for i in range(nc)]

    def make_ensembles(base_model):
        # Ensemble regressors requested through n_bag / n_boost, in the order
        # bagging then boosting (both are built when both are requested)
        built=[]
        if n_bag is not None:
            built.append(ensemble.BaggingRegressor(base_model,n_estimators=n_bag,max_samples=max_samples,verbose=1))
        if n_boost is not None:
            built.append(ensemble.AdaBoostRegressor(base_model,n_estimators=n_boost))
        return built

    print('Reading database')
    sys.stdout.flush()
    spectra,comps,spect_index,names,labels,wvl=ccam.read_db(dbfile,compcheck=True,n_elems=n_elems)
    oxides=labels[2:]
    compindex=numpy.where(oxides==which_elem)[0]

    print('Choosing spectra')
    which_removed=prefix+'_removed.csv'
    spectra,names,spect_index,comps=ccam.choose_spectra(spectra,spect_index,names,comps,compindex,mincomp=mincomp,maxcomp=maxcomp,keepfile=keepfile,removefile=removefile,which_removed=which_removed)

    print('Masking spectra')
    spectra,wvl=ccam.mask(spectra,wvl,maskfile)

    print('Normalizing spectra')
    spectra=ccam.normalize(spectra,wvl,normtype=normtype)

    print('Assigning Folds')
    if foldfile is not None:
        #if a fold file is specified, use it
        folds=ccam.folds(foldfile,names)
    else:
        #otherwise, define random folds
        folds=ccam.random_folds(names,nfolds,seed=seed)

    #append the samples not assigned to folds (fold value 0) to the "removed" file.
    #csv.writer needs a text-mode file in Python 3; newline='' lets it control line endings.
    no_fold=(folds==0)
    with open(which_removed,'a',newline='') as writefile:
        writer=csv.writer(writefile,delimiter=',')
        for sample_name,sample_index in zip(names[no_fold],spect_index[no_fold]):
            writer.writerow([sample_name,sample_index,'No Fold'])

    #remove spectra that are not assigned to any fold
    has_fold=(folds!=0)
    spectra=spectra[has_fold,:]
    spect_index=spect_index[has_fold]
    names=names[has_fold]
    comps=comps[has_fold,:]
    folds=folds[has_fold]

    print('Defining Training and Test Sets')
    in_train=(folds!=testfold)
    in_test=(folds==testfold)

    spectra_train=spectra[in_train]
    spect_index_train=spect_index[in_train]
    names_train=names[in_train]
    comps_train=comps[in_train,compindex]
    folds_train=folds[in_train]
    folds_train_unique=numpy.unique(folds_train)

    spectra_test=spectra[in_test]
    spect_index_test=spect_index[in_test]
    names_test=names[in_test]
    comps_test=comps[in_test,compindex]
    folds_test=folds[in_test]

    print('Do Leave One Label Out (LOLO) cross validation with all folds but the test set')
    #arrays to hold cross validation predictions and RMSEs (one column per number of components)
    train_predict_cv=numpy.zeros((len(names_train),nc))
    RMSECV=numpy.zeros(nc)

    for i in folds_train_unique:
        print('Holding out fold #'+str(i))
        left_in=(folds_train!=i)
        left_out=(folds_train==i)

        if not skscale:
            #mean center those spectra left in
            X_cv_in,X_cv_in_mean=ccam.meancenter(spectra_train[left_in,:])
            #and center those left out with the mean of the ones left in
            X_cv_out=ccam.meancenter(spectra_train[left_out,:],X_mean=X_cv_in_mean)[0]
            #mean center compositions left in
            Y_cv_in,Y_cv_in_mean=ccam.meancenter(comps_train[left_in])
        else:
            #no centering here; scale=skscale is handed to PLSRegression instead
            X_cv_in=spectra_train[left_in,:]
            X_cv_out=spectra_train[left_out,:]
            Y_cv_in=comps_train[left_in]
            Y_cv_in_mean=0

        #step through each number of components
        for j in range(1,nc+1):
            col=j-1
            print('Training Model for '+str(j)+' components')
            if plstype=='mlpy':
                PLS1model=ccam.mlpy_pls.PLS(j)
                PLS1model.learn(X_cv_in,Y_cv_in)
                #predict the samples held out
                train_predict_cv[left_out,col]=PLS1model.pred(X_cv_out)+Y_cv_in_mean

            if plstype=='sklearn':
                PLS1model=PLSRegression(n_components=j,scale=skscale)
                if n_bag is None and n_boost is None:
                    PLS1model.fit(X_cv_in,Y_cv_in)
                    train_predict_cv[left_out,col]=numpy.squeeze(PLS1model.predict(X_cv_out)+Y_cv_in_mean)
                for ensemble_model in make_ensembles(PLS1model):
                    ensemble_model.fit(X_cv_in,Y_cv_in)
                    train_predict_cv[left_out,col]=numpy.squeeze(ensemble_model.predict(X_cv_out)+Y_cv_in_mean)

    #calculate RMSECV
    for i in range(0,nc):
        sqerr=(train_predict_cv[:,i]-comps_train)**2.0
        RMSECV[i]=numpy.sqrt(numpy.mean(sqerr))

    #mean center full model (test set and full set use the training mean)
    if not skscale:
        X,X_mean=ccam.meancenter(spectra_train)
        X_test=ccam.meancenter(spectra_test,X_mean=X_mean)[0]
        X_all=ccam.meancenter(spectra,X_mean=X_mean)[0]
        Y,Y_mean=ccam.meancenter(comps_train)
    else:
        X=spectra_train
        X_test=spectra_test
        X_all=spectra
        Y=comps_train
        Y_mean=0

    #create arrays for results and RMSEs
    n_train_rows,n_channels=X.shape[0],X.shape[1]
    trainset_results=numpy.zeros((len(names_train),nc))
    testset_results=numpy.zeros((len(names_test),nc))
    results=numpy.zeros((len(names),nc))

    RMSEP=numpy.zeros(nc)
    RMSEC=numpy.zeros(nc)
    beta=numpy.zeros((n_channels,nc))
    #Q residuals and Hotelling T2 are only filled in for plain sklearn models; otherwise they stay 0
    Q_res=numpy.zeros((n_train_rows,nc))
    T2=numpy.zeros((n_train_rows,nc))

    if cal_dir is not None:
        print('Reading cal target data')
        cal_data,cal_wvl,cal_filelist=ccam.read_ccs(cal_dir)
        cal_data,cal_wvl=ccam.mask(cal_data,cal_wvl,maskfile)
        cal_data=ccam.normalize(cal_data,cal_wvl,normtype=normtype)
        if skscale:
            cal_data_centered=cal_data
        else:
            cal_data_centered=ccam.meancenter(cal_data,X_mean=X_mean)[0]

        RMSEP_cal_good=numpy.zeros(nc)
        RMSEP_KGAMEDS=numpy.zeros(nc)
        RMSEP_MACUSANITE=numpy.zeros(nc)
        RMSEP_NAU2HIS=numpy.zeros(nc)
        RMSEP_NAU2LOS=numpy.zeros(nc)
        RMSEP_NAU2MEDS=numpy.zeros(nc)
        RMSEP_NORITE=numpy.zeros(nc)
        RMSEP_PICRITE=numpy.zeros(nc)
        RMSEP_SHERGOTTITE=numpy.zeros(nc)

        targets,dists,amps,nshots=ccam.target_lookup(cal_filelist,masterlist_file,name_sub_file)
        target_comps=ccam.target_comp_lookup(targets,compfile,which_elem)
        cal_results=numpy.zeros((len(targets),nc))

    def score_cal_targets(j,predicted):
        # Store the cal-target predictions of the j-component model and record
        # the nine RMSE values returned by cal_rmses (defined outside this function).
        # Only called when cal_dir is not None, i.e. when the arrays above exist.
        col=j-1
        cal_results[:,col]=predicted
        (RMSEP_KGAMEDS[col],RMSEP_MACUSANITE[col],RMSEP_NAU2HIS[col],
         RMSEP_NAU2LOS[col],RMSEP_NAU2MEDS[col],RMSEP_NORITE[col],
         RMSEP_PICRITE[col],RMSEP_SHERGOTTITE[col],
         RMSEP_cal_good[col])=cal_rmses(targets,nc,target_comps,j,cal_data_centered,Y_mean,mincomp,maxcomp,cal_results)

    model_list=[]
    #Now step through each # of components with the full model
    for j in range(1,nc+1):
        col=j-1
        print('Training full model for '+str(j)+' components')
        if plstype=='mlpy':
            PLS1model=ccam.mlpy_pls.PLS(j)
            PLS1model.learn(X,Y)
            beta[:,col]=PLS1model.beta()
            model_list.append([PLS1model])
            trainset_results[:,col]=PLS1model.pred(X)+Y_mean
            testset_results[:,col]=PLS1model.pred(X_test)+Y_mean
            results[:,col]=PLS1model.pred(X_all)+Y_mean
            if cal_dir is not None:
                score_cal_targets(j,PLS1model.pred(cal_data_centered)+Y_mean)

        if plstype=='sklearn':
            PLS1model=PLSRegression(n_components=j,scale=skscale)

            if n_bag is None and n_boost is None:
                PLS1model.fit(X,Y)
                T=PLS1model.x_scores_
                #Hotelling T2 per sample: t_k (T'T)^-1 t_k; the inverse is the same for every sample
                TtT_inv=numpy.linalg.inv(numpy.dot(T.transpose(),T))
                for k in range(n_train_rows):
                    T2[k,col]=numpy.dot(T[k,:],numpy.dot(TtT_inv,T[k,:]))

                #Q residual per sample: squared row norm of X minus its reconstruction from scores and loadings
                E=X-numpy.dot(T,PLS1model.x_loadings_.transpose())
                Q_res[:,col]=numpy.sum(E*E,axis=1)

                trainset_results[:,col]=numpy.squeeze(PLS1model.predict(X)+Y_mean)
                testset_results[:,col]=numpy.squeeze(PLS1model.predict(X_test)+Y_mean)
                results[:,col]=numpy.squeeze(PLS1model.predict(X_all)+Y_mean)
                # NOTE(review): `coefs` is the attribute name this code was written
                # against; confirm it exists in the installed scikit-learn version.
                beta[:,col]=numpy.squeeze(PLS1model.coefs)
                model_list.append([PLS1model])

                if cal_dir is not None:
                    score_cal_targets(j,numpy.squeeze(PLS1model.predict(cal_data_centered)+Y_mean))

            for ensemble_model in make_ensembles(PLS1model):
                ensemble_model.fit(X,Y)
                trainset_results[:,col]=numpy.squeeze(ensemble_model.predict(X)+Y_mean)
                testset_results[:,col]=numpy.squeeze(ensemble_model.predict(X_test)+Y_mean)
                results[:,col]=numpy.squeeze(ensemble_model.predict(X_all)+Y_mean)
                #no regression vector is recorded for ensemble models
                beta[:,col]=numpy.nan
                model_list.append([ensemble_model])
                if cal_dir is not None:
                    score_cal_targets(j,numpy.squeeze(ensemble_model.predict(cal_data_centered)+Y_mean))

        RMSEC[col]=numpy.sqrt(numpy.mean((trainset_results[:,col]-comps_train)**2.0))
        RMSEP[col]=numpy.sqrt(numpy.mean((testset_results[:,col]-comps_test)**2.0))

    #the pickle file name has no '_nc' part, unlike the other outputs
    with open(outpath+which_elem+'_'+plstype_string+'_norm'+str(normtype)+'_'+comp_range+'.pkl','wb') as picklefile:
        pickle.dump(model_list,picklefile)

    #if cal_dir is specified, average the per-target RMSEs and write the cal target outputs
    if cal_dir is not None:
        per_target=[RMSEP_KGAMEDS,RMSEP_MACUSANITE,RMSEP_NAU2HIS,RMSEP_NAU2LOS,RMSEP_NAU2MEDS,RMSEP_NORITE,RMSEP_PICRITE,RMSEP_SHERGOTTITE]
        #number of targets whose 1-component RMSEP is non-zero
        n_good_cal=numpy.sum(numpy.array(per_target)[:,0]!=0)
        print(n_good_cal)
        RMSEP_cal=(RMSEP_KGAMEDS+RMSEP_MACUSANITE+RMSEP_NAU2HIS+RMSEP_NAU2LOS+RMSEP_NAU2MEDS+RMSEP_NORITE+RMSEP_PICRITE+RMSEP_SHERGOTTITE)/n_good_cal
        RMSEP_single_cals=per_target+[RMSEP_cal]

        write_rows(prefix+'_caltargets_predict.csv',
                   sample_table(['File','Target','Laser Energy','True_Comp'],cal_filelist,targets,amps,target_comps,cal_results)
                   if len(targets)==len(cal_filelist) else
                   [['File','Target','Laser Energy','True_Comp']+component_numbers]+
                   [[cal_filelist[i],targets[i],amps[i],target_comps[i]]+list(cal_results[i,:]) for i in range(len(targets))])
        write_rows(prefix+'_RMSEP_caltargets.csv',rmse_table('RMSEP Cal Targets (wt.%)',RMSEP_cal))
        ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',prefix+'_RMSE_plot_cal.png',RMSEP_cals=RMSEP_single_cals)
        ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',prefix+'_RMSE_plot_cal_good.png',RMSEP_good=RMSEP_cal_good)

    # plot RMSEs
    ccam.RMSE(RMSECV,RMSEP,RMSEC,which_elem+' RMSEs',prefix+'_RMSE_plot.png')

    #Write output info to files
    train_header=['Sample','Spectrum','Fold','True_Comp']
    diag_header=["Sample","Spectrum","Fold","True Comp"]

    write_rows(prefix+'_Q_res.csv',
               sample_table(diag_header,names_train,spect_index_train,folds_train,comps_train,Q_res))

    #the quartiles file name only carries the element and the composition range
    elem_comps=comps[:,compindex]
    write_rows(outpath+which_elem+'_'+comp_range+'_quartiles.csv',
               [[which_elem],
                ['Min',numpy.percentile(elem_comps,0)],
                ['1st Quartile',numpy.percentile(elem_comps,25)],
                ['Median',numpy.percentile(elem_comps,50)],
                ['3rd Quartile',numpy.percentile(elem_comps,75)],
                ['Max',numpy.percentile(elem_comps,100)]])

    write_rows(prefix+'_HotellingT2.csv',
               sample_table(diag_header,names_train,spect_index_train,folds_train,comps_train,T2))

    write_rows(prefix+'_RMSECV.csv',rmse_table('RMSECV (wt.%)',RMSECV))
    write_rows(prefix+'_RMSEC.csv',rmse_table('RMSEC (wt.%)',RMSEC))
    write_rows(prefix+'_RMSEP.csv',rmse_table('RMSEP (wt.%)',RMSEP))

    write_rows(prefix+'_cv_predict.csv',
               sample_table(train_header,names_train,spect_index_train,folds_train,comps_train,train_predict_cv))
    write_rows(prefix+'_train_predict.csv',
               sample_table(train_header,names_train,spect_index_train,folds_train,comps_train,trainset_results))
    write_rows(prefix+'_test_predict.csv',
               sample_table(train_header,names_test,spect_index_test,folds_test,comps_test,testset_results))
    write_rows(prefix+'_all_predict.csv',
               sample_table(train_header,names,spect_index,folds,elem_comps,results))

    write_rows(prefix+'_beta_coeffs.csv',
               [['wvl']+component_numbers]+
               [[wvl[i]]+list(beta[i,:]) for i in range(len(wvl))])

    #the centering vectors only exist when this function did the centering
    if not skscale:
        write_rows(prefix+'_meancenters.csv',
                   [[which_elem+' mean',Y_mean]]+
                   [[wvl[i],X_mean[i]] for i in range(len(wvl))])

    write_rows(prefix+'_inputinfo.csv',
               [['Spectral database =',dbfile],
                ['Spectra Kept =',keepfile],
                ['Spectra Removed =',which_removed],
                ['Fold Definition =',foldfile],
                ['Test Fold =',testfold],
                ['Mask File =',maskfile],
                ['Algorithm =',plstype_string],
                ['# of components =',nc],
                ['Normalization Type =',normtype],
                ['Composition Min. =',mincomp],
                ['Composition Max. =',maxcomp]])
Ejemplo n.º 2
0
def pls_cal(dbfile,
            foldfile,
            maskfile,
            outpath,
            which_elem,
            testfold,
            nc,
            normtype=3,
            mincomp=0,
            maxcomp=100,
            plstype='mlpy',
            keepfile=None,
            removefile=None,
            cal_dir=None,
            masterlist_file=None,
            compfile=None,
            name_sub_file=None):

    print 'Reading database'
    sys.stdout.flush()
    spectra, comps, spect_index, names, labels, wvl = ccam.read_db(
        dbfile, compcheck=True)
    oxides = labels[2:]
    compindex = numpy.where(oxides == which_elem)[0]

    print 'Choosing spectra'
    which_removed = outpath + which_elem + '_' + plstype + '_nc' + str(
        nc) + '_norm' + str(normtype) + '_' + str(mincomp) + '-' + str(
            maxcomp) + '_removed.csv'
    spectra, names, spect_index, comps = ccam.choose_spectra(
        spectra,
        spect_index,
        names,
        comps,
        compindex,
        mincomp=mincomp,
        maxcomp=maxcomp,
        keepfile=keepfile,
        removefile=removefile,
        which_removed=which_removed)

    print 'Masking spectra'
    spectra, wvl = ccam.mask(spectra, wvl, maskfile)

    print 'Normalizing spectra'
    spectra = ccam.normalize(spectra, wvl, normtype=normtype)

    print 'Assigning Folds'
    folds = ccam.folds(foldfile, names)
    names_nofold = names[(folds == 0)]
    spect_index_nofold = spect_index[(folds == 0)]
    #write a file containing the samples not assigned to folds
    with open(which_removed, 'ab') as writefile:
        writer = csv.writer(
            writefile,
            delimiter=',',
        )
        for i in range(len(names_nofold)):
            writer.writerow(
                [names_nofold[i], spect_index_nofold[i], 'No Fold'])

    #remove spectra that are not assigned to any fold
    spectra = spectra[(folds != 0), :]
    spect_index = spect_index[(folds != 0)]
    names = names[(folds != 0)]
    comps = comps[(folds != 0), :]
    folds = folds[(folds != 0)]

    print 'Defining Training and Test Sets'
    spectra_train = spectra[(folds != testfold)]
    spect_index_train = spect_index[(folds != testfold)]
    names_train = names[(folds != testfold)]
    comps_train = comps[(folds != testfold), compindex]
    folds_train = folds[(folds != testfold)]
    folds_train_unique = numpy.unique(folds_train)

    spectra_test = spectra[(folds == testfold)]
    spect_index_test = spect_index[(folds == testfold)]
    names_test = names[(folds == testfold)]
    comps_test = comps[(folds == testfold), compindex]
    folds_test = folds[(folds == testfold)]

    print 'Do Leave One Label Out (LOLO) cross validation with all folds but the test set'
    #define array to hold cross validation predictions and RMSEs
    train_predict_cv = numpy.zeros((len(names_train), nc))
    RMSECV = numpy.zeros(nc)

    for i in folds_train_unique:
        print 'Holding out fold #' + str(i)
        #mean center those spectra left in
        #X_cv_in1,X_cv_in_mean1=meancenter.ccam_meancenter(spectra_train[(folds_train!=i),:])
        X_cv_in, X_cv_in_mean = ccam.meancenter(
            spectra_train[(folds_train != i), :])

        #and those left out
        X_cv_out = ccam.meancenter(spectra_train[(folds_train == i), :],
                                   X_mean=X_cv_in_mean)[0]

        #mean center compositions left in
        Y_cv_in, Y_cv_in_mean = ccam.meancenter(
            comps_train[(folds_train != i)])

        #step through each number of components
        for j in range(1, nc + 1):
            print 'Training PLS Model for ' + str(j) + ' components'
            #train the model
            if plstype == 'mlpy':
                PLS1model = mlpy.pls.PLS(j)
                PLS1model.learn(X_cv_in, Y_cv_in)

                #predict the samples held out
                train_predict_cv[(folds_train == i), j -
                                 1] = PLS1model.pred(X_cv_out) + Y_cv_in_mean
            if plstype == 'sklearn':
                PLS1model = PLSRegression(n_components=nc)
                PLS1model.fit(X_cv_in, Y_cv_in)
                train_predict_cv[
                    (folds_train == i),
                    j - 1] = PLS1model.predict(X_cv_out) + Y_cv_in_mean
    #calculate RMSECV
    for i in range(0, nc):
        sqerr = (train_predict_cv[:, i] - comps_train)**2.0
        RMSECV[i] = numpy.sqrt(numpy.mean(sqerr))

    #mean center full model
    X, X_mean = ccam.meancenter(spectra_train)
    X_test = ccam.meancenter(spectra_test, X_mean=X_mean)[0]

    Y, Y_mean = ccam.meancenter(comps_train)

    #create arrays for results and RMSEs
    trainset_results = numpy.zeros((len(names_train), nc))
    testset_results = numpy.zeros((len(names_test), nc))
    RMSEP = numpy.zeros(nc)
    RMSEC = numpy.zeros(nc)
    beta = numpy.zeros((len(X_mean), nc))

    #Now step through each # of components with the full model
    for j in range(1, nc + 1):
        print 'Training full model for ' + str(j) + ' components'
        if plstype == 'mlpy':
            PLS1model = mlpy.pls.PLS(j)
            PLS1model.learn(X, Y)
            beta[:, j - 1] = PLS1model.beta()
            trainset_results[:, j - 1] = PLS1model.pred(X) + Y_mean
            testset_results[:, j - 1] = PLS1model.pred(X_test) + Y_mean
        if plstype == 'sklearn':
            PLS1model = PLSRegression(n_components=nc)
            PLS1model.fit(X, Y)
            print 'stop'

        RMSEC[j - 1] = numpy.sqrt(
            numpy.mean((trainset_results[:, j - 1] - comps_train)**2.0))
        RMSEP[j - 1] = numpy.sqrt(
            numpy.mean((testset_results[:, j - 1] - comps_test)**2.0))

#if cal_dir is specified, read cal target data and calculate RMSEs
    if cal_dir != None:
        cal_data, cal_wvl, cal_filelist = ccam.read_ccs(cal_dir)
        cal_data, cal_wvl = ccam.mask(cal_data, cal_wvl, maskfile)
        cal_data = ccam.normalize(cal_data, cal_wvl, normtype=normtype)

        RMSEP_cal = numpy.zeros(nc)
        RMSEP_KGAMEDS = numpy.zeros(nc)
        RMSEP_MACUSANITE = numpy.zeros(nc)
        RMSEP_NAU2HIS = numpy.zeros(nc)
        RMSEP_NAU2LOS = numpy.zeros(nc)
        RMSEP_NAU2MEDS = numpy.zeros(nc)
        RMSEP_NORITE = numpy.zeros(nc)
        RMSEP_PICRITE = numpy.zeros(nc)
        RMSEP_SHERGOTTITE = numpy.zeros(nc)

        targets, dists, amps = ccam.target_lookup(cal_filelist,
                                                  masterlist_file,
                                                  name_sub_file)
        target_comps = ccam.target_comp_lookup(targets, compfile, which_elem)
        cal_results = numpy.zeros((len(targets), nc))

        for i in range(nc):
            comps_copy = copy.copy(target_comps)
            cal_results[:, i] = ccam.pls_unk(cal_data,
                                             i + 1,
                                             beta=beta[:, i],
                                             X_mean=X_mean,
                                             Y_mean=Y_mean)
            #RMSEP_cal[i]=numpy.sqrt(numpy.mean((cal_results[:,i]-target_comps)**2))
            cal_results[(comps_copy < mincomp), i] = 0
            cal_results[(comps_copy > maxcomp), i] = 0
            comps_copy[(comps_copy < mincomp)] = 0
            comps_copy[(comps_copy > maxcomp)] = 0
            RMSEP_KGAMEDS[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'KGAMEDS'), i] -
                            comps_copy[(targets == 'KGAMEDS')])**2))
            RMSEP_MACUSANITE[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'MACUSANITE'), i] -
                            comps_copy[(targets == 'MACUSANITE')])**2))
            RMSEP_NAU2HIS[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'NAU2HIS'), i] -
                            comps_copy[(targets == 'NAU2HIS')])**2))
            RMSEP_NAU2LOS[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'NAU2LOS'), i] -
                            comps_copy[(targets == 'NAU2LOS')])**2))
            RMSEP_NAU2MEDS[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'NAU2MEDS'), i] -
                            comps_copy[(targets == 'NAU2MEDS')])**2))
            RMSEP_NORITE[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'NORITE'), i] -
                            comps_copy[(targets == 'NORITE')])**2))
            RMSEP_PICRITE[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'PICRITE'), i] -
                            comps_copy[(targets == 'PICRITE')])**2))
            RMSEP_SHERGOTTITE[i] = numpy.sqrt(
                numpy.mean((cal_results[(targets == 'SHERGOTTITE'), i] -
                            comps_copy[(targets == 'SHERGOTTITE')])**2))
        n_good_cal = len(numpy.unique(comps_copy)) - 1
        RMSEP_cal = (RMSEP_KGAMEDS + RMSEP_MACUSANITE + RMSEP_NAU2HIS +
                     RMSEP_NAU2LOS + RMSEP_NAU2MEDS + RMSEP_NORITE +
                     RMSEP_PICRITE + RMSEP_SHERGOTTITE) / n_good_cal
        RMSEP_single_cals = [
            RMSEP_KGAMEDS, RMSEP_MACUSANITE, RMSEP_NAU2HIS, RMSEP_NAU2LOS,
            RMSEP_NAU2MEDS, RMSEP_NORITE, RMSEP_PICRITE, RMSEP_SHERGOTTITE,
            RMSEP_cal
        ]

        with open(
                outpath + which_elem + '_' + str(mincomp) + '-' +
                str(maxcomp) + '_' + plstype + '_nc' + str(nc) + '_norm' +
                str(normtype) + '_caltargets_predict.csv', 'wb') as writefile:
            writer = csv.writer(writefile, delimiter=',')
            row = ['File', 'Target', 'Laser Energy', 'True_Comp']
            row.extend(range(1, nc + 1))
            writer.writerow(row)
            for i in range(0, len(targets)):
                row = [cal_filelist[i], targets[i], amps[i], target_comps[i]]
                row.extend(cal_results[i, :])
                writer.writerow(row)
        with open(
                outpath + which_elem + '_' + str(mincomp) + '-' +
                str(maxcomp) + '_' + plstype + '_nc' + str(nc) + '_norm' +
                str(normtype) + '_RMSECP_caltargets.csv', 'wb') as writefile:
            writer = csv.writer(writefile, delimiter=',')
            writer.writerow(['NC', 'RMSECP Cal Targets (wt.%)'])
            for i in range(0, nc):
                writer.writerow([i + 1, RMSEP_cal[i]])
        ccam.plots.RMSE(RMSECV,
                        RMSEP,
                        RMSEC,
                        which_elem + ' RMSEs',
                        outpath + which_elem + '_' + str(mincomp) + '-' +
                        str(maxcomp) + '_' + plstype + '_nc' + str(nc) +
                        '_norm' + str(normtype) + '_RMSE_plot_cal.png',
                        RMSEP_cals=RMSEP_single_cals)

    # plot RMSEs
    ccam_plots.ccam_plot_RMSE(
        RMSECV, RMSEP, RMSEC, which_elem + 'RMSEs', outpath + which_elem +
        '_' + plstype + '_nc' + str(nc) + '_norm' + str(normtype) + '_' +
        str(mincomp) + '-' + str(maxcomp) + '_RMSE_plot.png')

    #Write output info to files
    print outpath + which_elem + '_' + plstype + '_nc' + str(
        nc) + '_norm' + str(normtype) + '_' + str(mincomp) + '-' + str(
            maxcomp) + '_RMSECV.csv'
    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_RMSECV.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['NC', 'RMSECV (wt.%)'])
        for i in range(0, nc):
            writer.writerow([i + 1, RMSECV[i]])

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_RMSEC.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['NC', 'RMSEC (wt.%)'])
        for i in range(0, nc):
            writer.writerow([i + 1, RMSEC[i]])

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_RMSEP.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['NC', 'RMSEP (wt.%)'])
        for i in range(0, nc):
            writer.writerow([i + 1, RMSEP[i]])

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_cv_predict.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        row = ['Sample', 'Spectrum', 'Fold', 'True_Comp']
        row.extend(range(1, nc + 1))
        writer.writerow(row)
        for i in range(0, len(names_train)):
            row = [
                names_train[i], spect_index_train[i], folds_train[i],
                comps_train[i]
            ]
            row.extend(train_predict_cv[i, :])
            writer.writerow(row)

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_train_predict.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        row = ['Sample', 'Spectrum', 'Fold', 'True_Comp']
        row.extend(range(1, nc + 1))
        writer.writerow(row)
        for i in range(0, len(names_train)):
            row = [
                names_train[i], spect_index_train[i], folds_train[i],
                comps_train[i]
            ]
            row.extend(trainset_results[i, :])
            writer.writerow(row)

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_test_predict.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        row = ['Sample', 'Spectrum', 'Fold', 'True_Comp']
        row.extend(range(1, nc + 1))
        writer.writerow(row)
        for i in range(0, len(names_test)):
            row = [
                names_test[i], spect_index_test[i], folds_test[i],
                comps_test[i]
            ]
            row.extend(testset_results[i, :])
            writer.writerow(row)

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_beta_coeffs.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        row = ['wvl']
        row.extend(range(1, nc + 1))
        writer.writerow(row)
        for i in range(0, len(wvl)):
            row = [wvl[i]]
            row.extend(beta[i, :])
            writer.writerow(row)

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_meancenters.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow([which_elem + ' mean', Y_mean])
        for i in range(0, len(wvl)):
            row = [wvl[i], X_mean[i]]
            writer.writerow(row)

    with open(
            outpath + which_elem + '_' + plstype + '_nc' + str(nc) + '_norm' +
            str(normtype) + '_' + str(mincomp) + '-' + str(maxcomp) +
            '_inputinfo.csv', 'wb') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(['Spectral database =', dbfile])
        writer.writerow(['Spectra Kept =', keepfile])
        writer.writerow(['Spectra Removed =', which_removed])
        writer.writerow(['Fold Definition =', foldfile])
        writer.writerow(['Test Fold =', maskfile])
        writer.writerow(['Mask File =', maskfile])
        writer.writerow(['Algorithm =', plstype])
        writer.writerow(['# of components =', nc])
        writer.writerow(['Normalization Type =', normtype])
        writer.writerow(['Composition Min. =', mincomp])
        writer.writerow(['Composition Max. =', maxcomp])