Exemple #1
0
def _normtype_from_filename(filename, normtype=0):
    """Return the normalization type implied by *filename*.

    The name is searched for the substrings 'norm1' and 'norm3'.  When both
    occur, 'norm3' wins (it is checked last).  When neither occurs, the
    *normtype* passed in is returned unchanged.
    """
    if 'norm1' in filename:
        normtype = 1
    if 'norm3' in filename:
        normtype = 3
    return normtype


def pls_predict(data,nc,wvl,maskfile,coeff_file=None,mean_file=None,loadfile=None):
    """Mask, normalize and run a PLS prediction on *data*.

    The normalization type is taken from the name of *coeff_file* and/or
    *loadfile* (substring 'norm1' -> 1, 'norm3' -> 3).  A type found in
    *loadfile* overrides one found in *coeff_file*.

    Parameters:
        data, wvl: passed through ``ccam.mask`` and ``ccam.normalize``.
        nc: forwarded to the ``ccam`` prediction routine.
        maskfile: forwarded to ``ccam.mask``.
        coeff_file: forwarded to ``ccam.pls_unk`` when *loadfile* is None.
        mean_file: forwarded as ``means_file`` to the prediction routine.
        loadfile: when given, ``ccam.pls_unk_load`` is used instead of
            ``ccam.pls_unk``.

    Returns:
        ``(y, normtype)`` on success.  ``None`` (after printing an error)
        when a file name was supplied but no normalization type could be
        determined from it.
    """
    data, wvl = ccam.mask(data, wvl, maskfile)

    normtype = 0
    if coeff_file is not None:
        normtype = _normtype_from_filename(coeff_file, normtype)
        if normtype == 0:
            print('Error: Cant determine normalization from coeff file name!')
            return None

    if loadfile is not None:
        # Starts from whatever coeff_file yielded, so this error can only
        # trigger when coeff_file was not supplied.
        normtype = _normtype_from_filename(loadfile, normtype)
        if normtype == 0:
            print('Error: Cant determine normalization from loadfile name!')
            return None

    # NOTE(review): with neither coeff_file nor loadfile, normtype stays 0 and
    # is passed to ccam.normalize as-is -- confirm that is a supported value.
    data_norm = ccam.normalize(data, wvl, normtype=normtype)
    if loadfile is None:
        y = ccam.pls_unk(data_norm, nc, coeff_file=coeff_file,
                         means_file=mean_file)
    else:
        y = ccam.pls_unk_load(data_norm, nc, loadfile, means_file=mean_file)
    return y, normtype

    
# Script excerpt: read ChemCam data, look up targets and run four PLS
# sub-model predictions.  Several names used here (searchdir, masterlist,
# name_subs, data_norm1, data_norm3, nc_*, coeff_file_*, means_file_*) are
# defined outside the visible part of the script.

# NOTE(review): high_cutoff is not referenced in the visible code; presumably
# a threshold used when the sub-model predictions are combined -- confirm.
high_cutoff = 15

# Load the data set found under searchdir.
data, wvl, filelist = ccam.read_ccs(searchdir)
# Disabled: cache the read_ccs results as pickles in the working directory.
#pickle.dump(data, open( "ccamdata.pkl", "wb" ))
#pickle.dump(wvl,open( "ccamwvl.pkl", "wb" ))
#pickle.dump(filelist,open( "ccamfilelist.pkl", "wb" ))

# Disabled: reload the cached pickles from a hard-coded local path.  As
# written, the loaded objects would not be assigned to anything.
#pickle.load(open( r"C:\Users\rbanderson\Documents\MSL\ChemCam\Data Processing\Working\ccam\ccamdata.pkl", "rb" ))
#pickle.load(open( r"C:\Users\rbanderson\Documents\MSL\ChemCam\Data Processing\Working\ccam\ccamwvl.pkl", "rb" ))
#pickle.load(open( r"C:\Users\rbanderson\Documents\MSL\ChemCam\Data Processing\Working\ccam\ccamfilelist.pkl", "rb" ))

# Look up per-file target information for every file that was read.
targetlist, distslist, amplist = ccam.target_lookup(filelist, masterlist,
                                                    name_subs)

# Run the four sub-models.  Note that the "mid" model is fed data_norm1 while
# the full, low and high models are fed data_norm3.
y_full = ccam.pls_unk(data_norm3,
                      nc_full,
                      coeff_file=coeff_file_full,
                      means_file=means_file_full)
y_low = ccam.pls_unk(data_norm3,
                     nc_low,
                     coeff_file=coeff_file_low,
                     means_file=means_file_low)
y_mid = ccam.pls_unk(data_norm1,
                     nc_mid,
                     coeff_file=coeff_file_mid,
                     means_file=means_file_mid)
y_high = ccam.pls_unk(data_norm3,
                      nc_high,
                      coeff_file=coeff_file_high,
                      means_file=means_file_high)

# Zero-filled array with the shape and dtype of y_high; presumably filled in
# by blending logic that lies beyond this excerpt.
y_combined = numpy.zeros_like(y_high)
Exemple #3
0
def _write_csv(path, rows, mode='wb'):
    """Write each sequence in *rows* as one comma-delimited line of *path*.

    The file is opened in binary mode ('wb' by default, 'ab' to append),
    which is what the Python 2 ``csv`` module used by this file expects.
    """
    with open(path, mode) as writefile:
        writer = csv.writer(writefile, delimiter=',')
        for row in rows:
            writer.writerow(row)


def _rmse_rows(label, rmse):
    """Yield a ['NC', label] header, then [component count, RMSE] rows."""
    yield ['NC', label]
    for n_comp, value in enumerate(rmse, 1):
        yield [n_comp, value]


def _table_rows(header, leading_columns, values, nc):
    """Yield a header row, then one row per sample.

    The header is *header* followed by the component counts 1..nc.  Each data
    row holds the sample's entry from every sequence in *leading_columns*,
    followed by that sample's row of *values*.
    """
    yield list(header) + list(range(1, nc + 1))
    for lead, vals in zip(zip(*leading_columns), values):
        yield list(lead) + list(vals)


def pls_cal(dbfile,
            foldfile,
            maskfile,
            outpath,
            which_elem,
            testfold,
            nc,
            normtype=3,
            mincomp=0,
            maxcomp=100,
            plstype='mlpy',
            keepfile=None,
            removefile=None,
            cal_dir=None,
            masterlist_file=None,
            compfile=None,
            name_sub_file=None):
    """Train and evaluate PLS models with 1..nc components for one element.

    The spectral database is read, filtered, masked and normalized, then
    split by fold into a training set and a test set (fold == testfold).
    Leave-one-fold-out cross validation is run on the training folds, a full
    model is trained on the whole training set, and RMSECV / RMSEC / RMSEP are
    computed for every component count.  Optionally the models are applied to
    calibration-target spectra.  All results are written to CSV files and
    RMSE plots whose names start with ``outpath + which_elem``.

    Parameters:
        dbfile: spectral database, read with ``ccam.read_db``.
        foldfile: fold definition, read with ``ccam.folds``.
        maskfile: mask definition, applied with ``ccam.mask``.
        outpath: string prepended verbatim to every output file name.
        which_elem: label selecting the composition column (matched against
            ``labels[2:]`` from the database); also used in file names.
        testfold: fold value held out as the test set.
        nc: maximum number of PLS components.
        normtype: forwarded to ``ccam.normalize``.
        mincomp, maxcomp: composition limits forwarded to
            ``ccam.choose_spectra``; calibration targets whose composition is
            outside this range are zeroed before their RMSE is computed.
        plstype: 'mlpy' or 'sklearn'.
        keepfile, removefile: forwarded to ``ccam.choose_spectra``.
        cal_dir: when not None, calibration-target data read from here with
            ``ccam.read_ccs`` is predicted and scored as well.
        masterlist_file, name_sub_file: forwarded to ``ccam.target_lookup``.
        compfile: forwarded to ``ccam.target_comp_lookup``.

    Returns:
        None.  Results are written to disk.
    """
    # Two naming schemes are in use; both are reproduced exactly.
    run_tag = plstype + '_nc' + str(nc) + '_norm' + str(normtype)
    comp_range = str(mincomp) + '-' + str(maxcomp)
    out_prefix = outpath + which_elem + '_' + run_tag + '_' + comp_range
    cal_prefix = outpath + which_elem + '_' + comp_range + '_' + run_tag

    print('Reading database')
    sys.stdout.flush()
    spectra, comps, spect_index, names, labels, wvl = ccam.read_db(
        dbfile, compcheck=True)
    oxides = labels[2:]
    compindex = numpy.where(oxides == which_elem)[0]

    print('Choosing spectra')
    which_removed = out_prefix + '_removed.csv'
    spectra, names, spect_index, comps = ccam.choose_spectra(
        spectra,
        spect_index,
        names,
        comps,
        compindex,
        mincomp=mincomp,
        maxcomp=maxcomp,
        keepfile=keepfile,
        removefile=removefile,
        which_removed=which_removed)

    print('Masking spectra')
    spectra, wvl = ccam.mask(spectra, wvl, maskfile)

    print('Normalizing spectra')
    spectra = ccam.normalize(spectra, wvl, normtype=normtype)

    print('Assigning Folds')
    folds = ccam.folds(foldfile, names)

    # Append the samples that have no fold (fold == 0) to the "removed" file.
    unassigned = (folds == 0)
    _write_csv(which_removed,
               ([name, index, 'No Fold'] for name, index in zip(
                   names[unassigned], spect_index[unassigned])),
               mode='ab')

    # Drop spectra that are not assigned to any fold.
    assigned = (folds != 0)
    spectra = spectra[assigned, :]
    spect_index = spect_index[assigned]
    names = names[assigned]
    comps = comps[assigned, :]
    folds = folds[assigned]

    print('Defining Training and Test Sets')
    in_train = (folds != testfold)
    spectra_train = spectra[in_train]
    spect_index_train = spect_index[in_train]
    names_train = names[in_train]
    comps_train = comps[in_train, compindex]
    folds_train = folds[in_train]
    folds_train_unique = numpy.unique(folds_train)

    in_test = (folds == testfold)
    spectra_test = spectra[in_test]
    spect_index_test = spect_index[in_test]
    names_test = names[in_test]
    comps_test = comps[in_test, compindex]
    folds_test = folds[in_test]

    print('Do Leave One Label Out (LOLO) cross validation with all folds but the test set')
    # Cross-validation predictions: one row per training sample, one column
    # per component count.
    train_predict_cv = numpy.zeros((len(names_train), nc))
    RMSECV = numpy.zeros(nc)

    for i in folds_train_unique:
        print('Holding out fold #' + str(i))
        held_out = (folds_train == i)
        kept_in = (folds_train != i)

        # Mean-center the spectra left in, and center the held-out spectra
        # with the mean of those left in.
        X_cv_in, X_cv_in_mean = ccam.meancenter(spectra_train[kept_in, :])
        X_cv_out = ccam.meancenter(spectra_train[held_out, :],
                                   X_mean=X_cv_in_mean)[0]

        # Mean-center the compositions left in.
        Y_cv_in, Y_cv_in_mean = ccam.meancenter(comps_train[kept_in])

        for j in range(1, nc + 1):
            print('Training PLS Model for ' + str(j) + ' components')
            if plstype == 'mlpy':
                PLS1model = mlpy.pls.PLS(j)
                PLS1model.learn(X_cv_in, Y_cv_in)
                train_predict_cv[held_out, j - 1] = (
                    PLS1model.pred(X_cv_out) + Y_cv_in_mean)
            elif plstype == 'sklearn':
                # Fix: use j components for column j (previously always nc).
                PLS1model = PLSRegression(n_components=j)
                PLS1model.fit(X_cv_in, Y_cv_in)
                # ravel() so a column-shaped prediction fits the 1-D slice.
                train_predict_cv[held_out, j - 1] = (
                    numpy.ravel(PLS1model.predict(X_cv_out)) + Y_cv_in_mean)

    # RMSECV for every component count.
    for i in range(0, nc):
        sqerr = (train_predict_cv[:, i] - comps_train)**2.0
        RMSECV[i] = numpy.sqrt(numpy.mean(sqerr))

    # Mean-center for the full model; the test set uses the training mean.
    X, X_mean = ccam.meancenter(spectra_train)
    X_test = ccam.meancenter(spectra_test, X_mean=X_mean)[0]
    Y, Y_mean = ccam.meancenter(comps_train)

    trainset_results = numpy.zeros((len(names_train), nc))
    testset_results = numpy.zeros((len(names_test), nc))
    RMSEP = numpy.zeros(nc)
    RMSEC = numpy.zeros(nc)
    beta = numpy.zeros((len(X_mean), nc))

    # Train the full model for each number of components.
    for j in range(1, nc + 1):
        print('Training full model for ' + str(j) + ' components')
        if plstype == 'mlpy':
            PLS1model = mlpy.pls.PLS(j)
            PLS1model.learn(X, Y)
            beta[:, j - 1] = PLS1model.beta()
            trainset_results[:, j - 1] = PLS1model.pred(X) + Y_mean
            testset_results[:, j - 1] = PLS1model.pred(X_test) + Y_mean
        elif plstype == 'sklearn':
            # Fix: use j components and store the predictions, mirroring the
            # cross-validation branch, so RMSEC/RMSEP are not computed from
            # all-zero results.
            PLS1model = PLSRegression(n_components=j)
            PLS1model.fit(X, Y)
            trainset_results[:, j - 1] = (
                numpy.ravel(PLS1model.predict(X)) + Y_mean)
            testset_results[:, j - 1] = (
                numpy.ravel(PLS1model.predict(X_test)) + Y_mean)
            # NOTE(review): beta is still not populated for sklearn, so the
            # beta_coeffs file and the calibration-target predictions below
            # are not meaningful for this plstype.

        RMSEC[j - 1] = numpy.sqrt(
            numpy.mean((trainset_results[:, j - 1] - comps_train)**2.0))
        RMSEP[j - 1] = numpy.sqrt(
            numpy.mean((testset_results[:, j - 1] - comps_test)**2.0))

    # If cal_dir is specified, read cal target data and calculate RMSEs.
    if cal_dir is not None:
        cal_data, cal_wvl, cal_filelist = ccam.read_ccs(cal_dir)
        cal_data, cal_wvl = ccam.mask(cal_data, cal_wvl, maskfile)
        cal_data = ccam.normalize(cal_data, cal_wvl, normtype=normtype)

        targets, dists, amps = ccam.target_lookup(cal_filelist,
                                                  masterlist_file,
                                                  name_sub_file)
        target_comps = ccam.target_comp_lookup(targets, compfile, which_elem)
        cal_results = numpy.zeros((len(targets), nc))

        # Targets whose composition is outside [mincomp, maxcomp] get both
        # their true value and their prediction set to 0, so they contribute
        # an error of zero.  The masks do not depend on the component count,
        # so they are computed once.
        too_low = (target_comps < mincomp)
        too_high = (target_comps > maxcomp)
        comps_in_range = copy.copy(target_comps)
        comps_in_range[too_low] = 0
        comps_in_range[too_high] = 0

        cal_target_names = ('KGAMEDS', 'MACUSANITE', 'NAU2HIS', 'NAU2LOS',
                            'NAU2MEDS', 'NORITE', 'PICRITE', 'SHERGOTTITE')
        target_masks = [(targets == name) for name in cal_target_names]
        rmsep_by_target = [numpy.zeros(nc) for _ in cal_target_names]

        for i in range(nc):
            cal_results[:, i] = ccam.pls_unk(cal_data,
                                             i + 1,
                                             beta=beta[:, i],
                                             X_mean=X_mean,
                                             Y_mean=Y_mean)
            cal_results[too_low, i] = 0
            cal_results[too_high, i] = 0
            for rmsep, is_target in zip(rmsep_by_target, target_masks):
                rmsep[i] = numpy.sqrt(
                    numpy.mean((cal_results[is_target, i] -
                                comps_in_range[is_target])**2))

        # NOTE(review): the "- 1" assumes exactly one of the unique values is
        # the 0 placeholder of an out-of-range target; a target missing from
        # cal_dir yields NaN for its RMSE -- confirm both are intended.
        n_good_cal = len(numpy.unique(comps_in_range)) - 1
        RMSEP_cal = sum(rmsep_by_target) / n_good_cal
        RMSEP_single_cals = rmsep_by_target + [RMSEP_cal]

        _write_csv(
            cal_prefix + '_caltargets_predict.csv',
            _table_rows(['File', 'Target', 'Laser Energy', 'True_Comp'],
                        [cal_filelist, targets, amps, target_comps],
                        cal_results, nc))
        _write_csv(cal_prefix + '_RMSECP_caltargets.csv',
                   _rmse_rows('RMSECP Cal Targets (wt.%)', RMSEP_cal))
        ccam.plots.RMSE(RMSECV,
                        RMSEP,
                        RMSEC,
                        which_elem + ' RMSEs',
                        cal_prefix + '_RMSE_plot_cal.png',
                        RMSEP_cals=RMSEP_single_cals)

    # Plot RMSEs.
    # NOTE(review): this uses ccam_plots.ccam_plot_RMSE while the calibration
    # branch uses ccam.plots.RMSE -- confirm both entry points exist.
    ccam_plots.ccam_plot_RMSE(RMSECV, RMSEP, RMSEC, which_elem + 'RMSEs',
                              out_prefix + '_RMSE_plot.png')

    # Write output info to files.
    print(out_prefix + '_RMSECV.csv')
    _write_csv(out_prefix + '_RMSECV.csv',
               _rmse_rows('RMSECV (wt.%)', RMSECV))
    _write_csv(out_prefix + '_RMSEC.csv', _rmse_rows('RMSEC (wt.%)', RMSEC))
    _write_csv(out_prefix + '_RMSEP.csv', _rmse_rows('RMSEP (wt.%)', RMSEP))

    sample_header = ['Sample', 'Spectrum', 'Fold', 'True_Comp']
    train_columns = [names_train, spect_index_train, folds_train, comps_train]
    test_columns = [names_test, spect_index_test, folds_test, comps_test]
    _write_csv(
        out_prefix + '_cv_predict.csv',
        _table_rows(sample_header, train_columns, train_predict_cv, nc))
    _write_csv(
        out_prefix + '_train_predict.csv',
        _table_rows(sample_header, train_columns, trainset_results, nc))
    _write_csv(
        out_prefix + '_test_predict.csv',
        _table_rows(sample_header, test_columns, testset_results, nc))

    _write_csv(out_prefix + '_beta_coeffs.csv',
               _table_rows(['wvl'], [wvl], beta, nc))

    meancenter_rows = [[which_elem + ' mean', Y_mean]]
    meancenter_rows.extend(
        [wvl[i], X_mean[i]] for i in range(0, len(wvl)))
    _write_csv(out_prefix + '_meancenters.csv', meancenter_rows)

    _write_csv(out_prefix + '_inputinfo.csv', [
        ['Spectral database =', dbfile],
        ['Spectra Kept =', keepfile],
        ['Spectra Removed =', which_removed],
        ['Fold Definition =', foldfile],
        # Fix: this row previously recorded maskfile instead of the test fold.
        ['Test Fold =', testfold],
        ['Mask File =', maskfile],
        ['Algorithm =', plstype],
        ['# of components =', nc],
        ['Normalization Type =', normtype],
        ['Composition Min. =', mincomp],
        ['Composition Max. =', maxcomp],
    ])