def pls_predict(data, nc, wvl, maskfile, coeff_file=None, mean_file=None,
                loadfile=None):
    """Mask and normalize spectra, then predict with a stored PLS model.

    The normalization type is inferred from the model file name: a name
    containing 'norm1' selects normtype 1 and a name containing 'norm3'
    selects normtype 3 (if both substrings occur, 'norm3' wins).
    ``coeff_file`` is examined first and ``loadfile`` second, so a tag found
    in ``loadfile`` overrides the one taken from ``coeff_file``.

    Parameters
    ----------
    data, wvl : passed to ``ccam.mask`` and then ``ccam.normalize``.
    nc : forwarded to ``ccam.pls_unk`` / ``ccam.pls_unk_load``.
    maskfile : forwarded to ``ccam.mask``.
    coeff_file : optional file name; used with ``ccam.pls_unk`` when
        ``loadfile`` is None.
    mean_file : optional file name forwarded as ``means_file``.
    loadfile : optional file name; when given, ``ccam.pls_unk_load`` is used
        instead of ``ccam.pls_unk``.

    Returns
    -------
    ``(y, normtype)`` on success.  If a supplied file name does not reveal
    the normalization, an error message is printed and ``None`` is returned.
    """
    normtype = 0

    # Resolve the normalization from the file names before touching the data,
    # so that a badly named file is reported without any masking work.
    if coeff_file is not None:
        if 'norm1' in coeff_file:
            normtype = 1
        if 'norm3' in coeff_file:
            normtype = 3
        if normtype == 0:
            print('Error: Cant determine normalization from coeff file name!')
            return

    if loadfile is not None:
        if 'norm1' in loadfile:
            normtype = 1
        if 'norm3' in loadfile:
            normtype = 3
        # normtype may already have been set from coeff_file; only complain
        # when neither name supplied a tag.
        if normtype == 0:
            print('Error: Cant determine normalization from loadfile name!')
            return

    data, wvl = ccam.mask(data, wvl, maskfile)
    data_norm = ccam.normalize(data, wvl, normtype=normtype)

    if loadfile is None:
        y = ccam.pls_unk(data_norm, nc, coeff_file=coeff_file,
                         means_file=mean_file)
    else:
        y = ccam.pls_unk_load(data_norm, nc, loadfile, means_file=mean_file)

    return y, normtype
# NOTE(review): presumably a composition threshold for the "high" submodel;
# it is not referenced by the statements visible here -- confirm its use.
high_cutoff = 15

# Read every CCS file found under searchdir.
data, wvl, filelist = ccam.read_ccs(searchdir)

# Development aid (disabled): data, wvl and filelist were previously cached
# with pickle.dump / pickle.load ("ccamdata.pkl", "ccamwvl.pkl",
# "ccamfilelist.pkl") to skip re-reading the files.

# Look up target name, distance and laser amplitude for each file.
targetlist, distslist, amplist = ccam.target_lookup(
    filelist, masterlist, name_subs)

# Run the four submodels in order (full, low, mid, high).  Only the mid
# model takes the norm1 spectra; the others take the norm3 spectra.
y_full, y_low, y_mid, y_high = (
    ccam.pls_unk(spectra, n_comp, coeff_file=coeffs, means_file=means)
    for spectra, n_comp, coeffs, means in (
        (data_norm3, nc_full, coeff_file_full, means_file_full),
        (data_norm3, nc_low, coeff_file_low, means_file_low),
        (data_norm1, nc_mid, coeff_file_mid, means_file_mid),
        (data_norm3, nc_high, coeff_file_high, means_file_high),
    )
)

# Output array for the blended prediction, shaped like the submodel results.
y_combined = numpy.zeros_like(y_high)
def _pls_cal_write_csv(path, rows, mode='wb'):
    """Write *rows* to *path* as comma-delimited CSV."""
    # Binary modes ('wb'/'ab') are kept from the original code; they are what
    # the csv module expects under Python 2, which this file is written for.
    with open(path, mode) as writefile:
        csv.writer(writefile, delimiter=',').writerows(rows)


def _pls_cal_rmse(predicted, actual):
    """Return the root-mean-square difference between two arrays."""
    return numpy.sqrt(numpy.mean((predicted - actual) ** 2.0))


def _pls_cal_table(labels, label_columns, values, nc):
    """Build CSV rows: a header, then one row per item with its nc values."""
    rows = [list(labels) + list(range(1, nc + 1))]
    for i, ids in enumerate(zip(*label_columns)):
        rows.append(list(ids) + list(values[i, :]))
    return rows


def _pls_cal_rmse_rows(label, rmse, nc):
    """Build CSV rows listing an RMSE value for each number of components."""
    return [['NC', label]] + [[i + 1, rmse[i]] for i in range(nc)]


def pls_cal(dbfile, foldfile, maskfile, outpath, which_elem, testfold, nc,
            normtype=3, mincomp=0, maxcomp=100, plstype='mlpy',
            keepfile=None, removefile=None, cal_dir=None,
            masterlist_file=None, compfile=None, name_sub_file=None):
    """Train and evaluate PLS1 models for one element and write the results.

    Steps, in order:

    1. Read the database (``ccam.read_db``), select spectra
       (``ccam.choose_spectra``), mask and normalize them.
    2. Assign folds (``ccam.folds``); samples in fold 0 are appended to the
       "removed" CSV with the reason 'No Fold' and dropped.
    3. Split into a training set (folds != ``testfold``) and a test set
       (fold == ``testfold``).
    4. Cross-validate on the training set, holding out one fold at a time,
       for 1..``nc`` components (RMSECV).
    5. Train on the whole training set for 1..``nc`` components and predict
       the training set (RMSEC) and the test set (RMSEP).
    6. If ``cal_dir`` is given, predict the spectra read from it and compute
       per-target RMSEPs for a fixed list of eight target names.
    7. Plot the RMSEs and write CSV files (RMSECV, RMSEC, RMSEP, cv/train/
       test predictions, beta coefficients, mean centers, input info).

    Parameters
    ----------
    dbfile : spectral database file, passed to ``ccam.read_db``.
    foldfile : fold definition file, passed to ``ccam.folds``.
    maskfile : mask file, passed to ``ccam.mask``.
    outpath : string prefix prepended to every output file name.
    which_elem : label of the composition column to model.
    testfold : fold number reserved as the test set.
    nc : maximum number of PLS components.
    normtype : forwarded to ``ccam.normalize``.
    mincomp, maxcomp : composition range forwarded to
        ``ccam.choose_spectra``; cal-target entries outside it are zeroed.
    plstype : 'mlpy' or 'sklearn'.
    keepfile, removefile : forwarded to ``ccam.choose_spectra``.
    cal_dir : optional directory of calibration-target data.
    masterlist_file, name_sub_file : forwarded to ``ccam.target_lookup``.
    compfile : forwarded to ``ccam.target_comp_lookup``.

    Raises
    ------
    ValueError
        If ``plstype`` is not 'mlpy' or 'sklearn' (previously this silently
        produced all-zero predictions).

    Notes
    -----
    Progress messages use single-argument ``print(...)``, which behaves the
    same as a Python 2 print statement and as the Python 3 function.
    """
    if plstype not in ('mlpy', 'sklearn'):
        raise ValueError("plstype must be 'mlpy' or 'sklearn', got %r"
                         % (plstype,))

    # Two file-name stems are in use; they differ only in where the
    # composition range appears, and both are kept exactly as before.
    run_tag = plstype + '_nc' + str(nc) + '_norm' + str(normtype)
    comp_tag = str(mincomp) + '-' + str(maxcomp)
    out_stem = outpath + which_elem + '_' + run_tag + '_' + comp_tag
    cal_stem = outpath + which_elem + '_' + comp_tag + '_' + run_tag

    print('Reading database')
    sys.stdout.flush()
    spectra, comps, spect_index, names, labels, wvl = ccam.read_db(
        dbfile, compcheck=True)
    oxides = labels[2:]
    compindex = numpy.where(oxides == which_elem)[0]

    print('Choosing spectra')
    which_removed = out_stem + '_removed.csv'
    spectra, names, spect_index, comps = ccam.choose_spectra(
        spectra, spect_index, names, comps, compindex,
        mincomp=mincomp, maxcomp=maxcomp, keepfile=keepfile,
        removefile=removefile, which_removed=which_removed)

    print('Masking spectra')
    spectra, wvl = ccam.mask(spectra, wvl, maskfile)

    print('Normalizing spectra')
    spectra = ccam.normalize(spectra, wvl, normtype=normtype)

    print('Assigning Folds')
    folds = ccam.folds(foldfile, names)

    # Append the samples without a fold (fold 0) to the "removed" file.
    unassigned = (folds == 0)
    _pls_cal_write_csv(
        which_removed,
        [[name, index, 'No Fold']
         for name, index in zip(names[unassigned], spect_index[unassigned])],
        mode='ab')

    # Drop the samples that are not assigned to any fold.
    assigned = (folds != 0)
    spectra = spectra[assigned, :]
    spect_index = spect_index[assigned]
    names = names[assigned]
    comps = comps[assigned, :]
    folds = folds[assigned]

    print('Defining Training and Test Sets')
    in_train = (folds != testfold)
    in_test = (folds == testfold)

    spectra_train = spectra[in_train]
    spect_index_train = spect_index[in_train]
    names_train = names[in_train]
    comps_train = comps[in_train, compindex]
    folds_train = folds[in_train]
    folds_train_unique = numpy.unique(folds_train)

    spectra_test = spectra[in_test]
    spect_index_test = spect_index[in_test]
    names_test = names[in_test]
    comps_test = comps[in_test, compindex]
    folds_test = folds[in_test]

    print('Do Leave One Label Out (LOLO) cross validation with all folds but the test set')
    # Column j-1 holds the cross-validation predictions for j components.
    train_predict_cv = numpy.zeros((len(names_train), nc))
    RMSECV = numpy.zeros(nc)

    for i in folds_train_unique:
        print('Holding out fold #' + str(i))
        held_out = (folds_train == i)
        kept = (folds_train != i)

        # Mean-center the retained spectra, then center the held-out spectra
        # with the SAME mean so both live in the model's coordinate system.
        X_cv_in, X_cv_in_mean = ccam.meancenter(spectra_train[kept, :])
        X_cv_out = ccam.meancenter(spectra_train[held_out, :],
                                   X_mean=X_cv_in_mean)[0]
        Y_cv_in, Y_cv_in_mean = ccam.meancenter(comps_train[kept])

        for j in range(1, nc + 1):
            print('Training PLS Model for ' + str(j) + ' components')
            if plstype == 'mlpy':
                PLS1model = mlpy.pls.PLS(j)
                PLS1model.learn(X_cv_in, Y_cv_in)
                train_predict_cv[held_out, j - 1] = (
                    PLS1model.pred(X_cv_out) + Y_cv_in_mean)
            else:
                # Fixed: the model must use j components, not nc, otherwise
                # every column holds the same nc-component prediction.
                PLS1model = PLSRegression(n_components=j)
                PLS1model.fit(X_cv_in, Y_cv_in)
                # predict() may return an (n, 1) array; flatten it so it
                # fits the 1-D column slice.
                train_predict_cv[held_out, j - 1] = (
                    numpy.ravel(PLS1model.predict(X_cv_out)) + Y_cv_in_mean)

    for i in range(nc):
        RMSECV[i] = _pls_cal_rmse(train_predict_cv[:, i], comps_train)

    # Mean-center for the full model; the test set reuses the training mean.
    X, X_mean = ccam.meancenter(spectra_train)
    X_test = ccam.meancenter(spectra_test, X_mean=X_mean)[0]
    Y, Y_mean = ccam.meancenter(comps_train)

    trainset_results = numpy.zeros((len(names_train), nc))
    testset_results = numpy.zeros((len(names_test), nc))
    RMSEP = numpy.zeros(nc)
    RMSEC = numpy.zeros(nc)
    beta = numpy.zeros((len(X_mean), nc))

    for j in range(1, nc + 1):
        print('Training full model for ' + str(j) + ' components')
        if plstype == 'mlpy':
            PLS1model = mlpy.pls.PLS(j)
            PLS1model.learn(X, Y)
            beta[:, j - 1] = PLS1model.beta()
            trainset_results[:, j - 1] = PLS1model.pred(X) + Y_mean
            testset_results[:, j - 1] = PLS1model.pred(X_test) + Y_mean
        else:
            # Fixed: use j components and actually store the predictions.
            # Previously this branch only fitted the model and printed a
            # debug message, so RMSEC/RMSEP were computed from zeros.
            PLS1model = PLSRegression(n_components=j)
            PLS1model.fit(X, Y)
            trainset_results[:, j - 1] = (
                numpy.ravel(PLS1model.predict(X)) + Y_mean)
            testset_results[:, j - 1] = (
                numpy.ravel(PLS1model.predict(X_test)) + Y_mean)
            # NOTE(review): beta is still not populated for 'sklearn', so
            # the beta-coefficient file and the cal-target predictions
            # (which are computed from beta) remain zero-based in this mode.
        RMSEC[j - 1] = _pls_cal_rmse(trainset_results[:, j - 1], comps_train)
        RMSEP[j - 1] = _pls_cal_rmse(testset_results[:, j - 1], comps_test)

    # If cal_dir is specified, read cal target data and calculate RMSEs.
    if cal_dir is not None:
        cal_data, cal_wvl, cal_filelist = ccam.read_ccs(cal_dir)
        cal_data, cal_wvl = ccam.mask(cal_data, cal_wvl, maskfile)
        cal_data = ccam.normalize(cal_data, cal_wvl, normtype=normtype)

        targets, dists, amps = ccam.target_lookup(
            cal_filelist, masterlist_file, name_sub_file)
        target_comps = ccam.target_comp_lookup(targets, compfile, which_elem)

        # Compositions outside [mincomp, maxcomp] are zeroed in a copy, and
        # the matching predictions are zeroed too, so those entries add no
        # error.  This does not depend on the component count, so it is done
        # once instead of on every loop pass.
        cal_comps = copy.copy(target_comps)
        out_of_range = (cal_comps < mincomp) | (cal_comps > maxcomp)
        cal_comps[out_of_range] = 0

        cal_target_names = ['KGAMEDS', 'MACUSANITE', 'NAU2HIS', 'NAU2LOS',
                            'NAU2MEDS', 'NORITE', 'PICRITE', 'SHERGOTTITE']
        target_masks = [(targets == name) for name in cal_target_names]
        rmsep_by_target = [numpy.zeros(nc) for _ in cal_target_names]

        cal_results = numpy.zeros((len(targets), nc))
        for i in range(nc):
            cal_results[:, i] = ccam.pls_unk(cal_data, i + 1,
                                             beta=beta[:, i], X_mean=X_mean,
                                             Y_mean=Y_mean)
            cal_results[out_of_range, i] = 0
            for rmsep, is_target in zip(rmsep_by_target, target_masks):
                rmsep[i] = _pls_cal_rmse(cal_results[is_target, i],
                                         cal_comps[is_target])

        # Number of distinct composition values minus one; the "- 1" is
        # presumably meant to discount the zero used for excluded entries.
        n_good_cal = len(numpy.unique(cal_comps)) - 1
        # Built-in sum keeps the original left-to-right addition order.
        RMSEP_cal = sum(rmsep_by_target) / n_good_cal
        RMSEP_single_cals = rmsep_by_target + [RMSEP_cal]

        _pls_cal_write_csv(
            cal_stem + '_caltargets_predict.csv',
            _pls_cal_table(['File', 'Target', 'Laser Energy', 'True_Comp'],
                           [cal_filelist, targets, amps, target_comps],
                           cal_results, nc))
        _pls_cal_write_csv(
            cal_stem + '_RMSECP_caltargets.csv',
            _pls_cal_rmse_rows('RMSECP Cal Targets (wt.%)', RMSEP_cal, nc))
        ccam.plots.RMSE(RMSECV, RMSEP, RMSEC, which_elem + ' RMSEs',
                        cal_stem + '_RMSE_plot_cal.png',
                        RMSEP_cals=RMSEP_single_cals)

    # Plot RMSEs.
    # NOTE(review): this call uses ccam_plots.ccam_plot_RMSE while the
    # cal-target plot above uses ccam.plots.RMSE -- confirm that ccam_plots
    # is still imported by this file.
    ccam_plots.ccam_plot_RMSE(RMSECV, RMSEP, RMSEC, which_elem + 'RMSEs',
                              out_stem + '_RMSE_plot.png')

    # Write output info to files.
    print(out_stem + '_RMSECV.csv')
    _pls_cal_write_csv(out_stem + '_RMSECV.csv',
                       _pls_cal_rmse_rows('RMSECV (wt.%)', RMSECV, nc))
    _pls_cal_write_csv(out_stem + '_RMSEC.csv',
                       _pls_cal_rmse_rows('RMSEC (wt.%)', RMSEC, nc))
    _pls_cal_write_csv(out_stem + '_RMSEP.csv',
                       _pls_cal_rmse_rows('RMSEP (wt.%)', RMSEP, nc))

    sample_header = ['Sample', 'Spectrum', 'Fold', 'True_Comp']
    train_columns = [names_train, spect_index_train, folds_train, comps_train]
    test_columns = [names_test, spect_index_test, folds_test, comps_test]
    _pls_cal_write_csv(
        out_stem + '_cv_predict.csv',
        _pls_cal_table(sample_header, train_columns, train_predict_cv, nc))
    _pls_cal_write_csv(
        out_stem + '_train_predict.csv',
        _pls_cal_table(sample_header, train_columns, trainset_results, nc))
    _pls_cal_write_csv(
        out_stem + '_test_predict.csv',
        _pls_cal_table(sample_header, test_columns, testset_results, nc))

    _pls_cal_write_csv(out_stem + '_beta_coeffs.csv',
                       _pls_cal_table(['wvl'], [wvl], beta, nc))

    _pls_cal_write_csv(
        out_stem + '_meancenters.csv',
        [[which_elem + ' mean', Y_mean]]
        + [[wvl[i], X_mean[i]] for i in range(len(wvl))])

    _pls_cal_write_csv(
        out_stem + '_inputinfo.csv',
        [
            ['Spectral database =', dbfile],
            ['Spectra Kept =', keepfile],
            ['Spectra Removed =', which_removed],
            ['Fold Definition =', foldfile],
            # Fixed: this row previously reported maskfile as the test fold.
            ['Test Fold =', testfold],
            ['Mask File =', maskfile],
            ['Algorithm =', plstype],
            ['# of components =', nc],
            ['Normalization Type =', normtype],
            ['Composition Min. =', mincomp],
            ['Composition Max. =', maxcomp],
        ])