def predict(self, x):
    """Run every fitted submodel on its own input frame.

    ``x`` is a list of data frames, one per submodel and in the same order
    as ``self.submodels``.  Passing a separate frame for each submodel
    allows a different normalization to be used with each of them.

    Each frame is restricted to its 'wvl' columns, mean-centered with the
    mean vector stored for that submodel (``self.mean_vects``), and then
    handed to the submodel's ``predict``.

    Returns a list holding the prediction output of each submodel, in
    submodel order.
    """
    results = []
    for idx, submodel in enumerate(self.submodels):
        # Keep the 'wvl' level on the columns so meancenter can address it.
        spectra = x[idx].xs('wvl', axis=1, level=0, drop_level=False)
        # Re-use the stored mean vector rather than computing a new one.
        centered, _ = meancenter(spectra, 'wvl',
                                 previous_mean=self.mean_vects[idx])
        results.append(submodel.predict(centered['wvl']))
    return results
def fit(self, trainsets, ranges, ncs, ycol, figpath=None):
    """Fit one PLS submodel per range.

    Parameters
    ----------
    trainsets : sequence
        Training data, one entry per range.  ``trainsets[i]`` is filtered
        with ``within_range.within_range(trainsets[i], ranges[i], ycol)``
        before fitting.
    ranges : sequence
        One range per submodel.  ``rangei[0]`` and ``rangei[1]`` are used
        to label the diagnostic plot.
    ncs : sequence of int
        Number of PLS components for each submodel (``ncs[i]``).
    ycol : str
        Name of the column under 'meta' that is regressed on; it is also
        concatenated into the plot title and file name.
    figpath : str, optional
        If given, a Q-residual vs. leverage scatter plot is saved for each
        submodel as
        ``figpath + '/' + ycol + '_' + <lo>-<hi> + 'Qres_vs_Leverage.png'``.

    Side effects
    ------------
    Sets ``self.ranges``, ``self.ncs``, ``self.ycol``, ``self.submodels``
    and ``self.mean_vects``.  When ``figpath`` is given, ``self.leverage``
    and ``self.Q_res`` are also set; they are overwritten on every pass and
    therefore hold the values of the last submodel.
    """
    self.ranges = ranges
    self.ncs = ncs
    self.ycol = ycol

    submodels = []
    mean_vects = []
    for i, rangei in enumerate(ranges):
        data_tmp = within_range.within_range(trainsets[i], rangei, ycol)
        x = data_tmp.xs('wvl', axis=1, level=0, drop_level=False)
        y = data_tmp['meta'][ycol]
        x_centered, x_mean_vect = meancenter(x, 'wvl')  # mean center training data

        pls = PLSRegression(n_components=ncs[i], scale=False)
        # Train on the mean-centered spectra.  predict() centers its input
        # with the stored mean vector before calling the submodel, and
        # PLSRegression additionally subtracts the mean it saw during fit,
        # so a model fitted on raw spectra would have the mean removed
        # twice at prediction time.
        pls.fit(x_centered, y)
        submodels.append(pls)
        mean_vects.append(x_mean_vect)

        if figpath is not None:
            # Spectral residuals: centered data minus its PLS reconstruction.
            E = x_centered - np.dot(pls.x_scores_, pls.x_loadings_.transpose())
            # Q residual of each sample = diagonal of E E^T.
            Q_res = np.dot(E, E.transpose()).diagonal()

            # Leverage = diagonal of T (T^T T)^-1 T^T built from the scores.
            T = pls.x_scores_
            leverage = np.diag(T @ np.linalg.inv(T.transpose() @ T) @ T.transpose())

            range_label = str(rangei[0]) + '-' + str(rangei[1])
            plot.figure()
            plot.scatter(leverage, Q_res, color='r', edgecolor='k')
            plot.title(ycol + ' (' + range_label + ')')
            plot.xlabel('Leverage')
            plot.ylabel('Q')
            plot.ylim([0, 1.1 * np.max(Q_res)])
            plot.xlim([0, 1.1 * np.max(leverage)])
            plot.savefig(figpath + '/' + ycol + '_' + range_label + 'Qres_vs_Leverage.png',
                         dpi=600)

            # Only defined when diagnostics were requested.
            self.leverage = leverage
            self.Q_res = Q_res

    self.submodels = submodels
    self.mean_vects = mean_vects
def fit(self, trainsets, ranges, ncs, ycol, figpath=None):
    """Train a separate PLS regression for each entry of ``ranges``.

    Parameters
    ----------
    trainsets : sequence
        One training set per range; entry ``i`` is passed through
        ``within_range.within_range`` together with ``ranges[i]`` and
        ``ycol`` before it is used.
    ranges : sequence
        Range associated with each submodel.  The first two elements of
        each range appear in the plot title and file name.
    ncs : sequence of int
        ``ncs[i]`` is the number of components of submodel ``i``.
    ycol : str
        Column under 'meta' used as the regression target (also used in
        the plot title and file name).
    figpath : str, optional
        When not ``None``, a Q-residual vs. leverage plot is written for
        every submodel to
        ``figpath + '/' + ycol + '_' + <lo>-<hi> + 'Qres_vs_Leverage.png'``.

    Side effects
    ------------
    Stores ``ranges``, ``ncs``, ``ycol``, the fitted ``submodels`` and
    their ``mean_vects`` on ``self``.  With ``figpath`` set, ``self.Q_res``
    and ``self.leverage`` are stored as well and end up holding the values
    of the last submodel processed.
    """
    self.ranges = ranges
    self.ncs = ncs
    self.ycol = ycol

    submodels = []
    mean_vects = []
    for i, rangei in enumerate(ranges):
        data_tmp = within_range.within_range(trainsets[i], rangei, ycol)
        x = data_tmp.xs('wvl', axis=1, level=0, drop_level=False)
        y = data_tmp['meta'][ycol]
        x_centered, x_mean_vect = meancenter(x, 'wvl')  # mean center training data

        pls = PLSRegression(n_components=ncs[i], scale=False)
        # The submodel must be trained on the centered spectra: predict()
        # passes mean-centered data to it, and PLSRegression subtracts its
        # own training mean again when predicting.  Fitting on the raw
        # spectra would therefore remove the mean twice at prediction time.
        pls.fit(x_centered, y)
        submodels.append(pls)
        mean_vects.append(x_mean_vect)

        if figpath is not None:
            # Residual of the centered spectra after PLS reconstruction.
            E = x_centered - np.dot(pls.x_scores_, pls.x_loadings_.transpose())
            # Per-sample Q residual: diagonal of E E^T.
            Q_res = np.dot(E, E.transpose()).diagonal()

            # Per-sample leverage: diagonal of T (T^T T)^-1 T^T.
            T = pls.x_scores_
            leverage = np.diag(T @ np.linalg.inv(T.transpose() @ T) @ T.transpose())

            span = str(rangei[0]) + '-' + str(rangei[1])
            plot.figure()
            plot.scatter(leverage, Q_res, color='r', edgecolor='k')
            plot.title(ycol + ' (' + span + ')')
            plot.xlabel('Leverage')
            plot.ylabel('Q')
            plot.ylim([0, 1.1 * np.max(Q_res)])
            plot.xlim([0, 1.1 * np.max(leverage)])
            plot.savefig(figpath + '/' + ycol + '_' + span + 'Qres_vs_Leverage.png',
                         dpi=600)

            # These attributes exist only when diagnostics were requested.
            self.leverage = leverage
            self.Q_res = Q_res

    self.submodels = submodels
    self.mean_vects = mean_vects
def pls_cv(Train, Test=None, nc=20, nfolds=5, ycol='SiO2', doplot=True, outpath='.', plotfile='pls_cv.png'):
    """Cross-validate PLS regression for 1..nc components and report RMSEs.

    For every number of components the function
      1. runs a leave-one-fold-out cross validation inside ``Train`` using
         the fold labels in ``Train[('meta', 'Folds')]`` (RMSECV),
      2. fits on the whole of ``Train`` and predicts ``Train`` (RMSEC),
      3. if ``Test`` is given, predicts ``Test`` with that model (RMSEP).

    Parameters
    ----------
    Train : DataFrame
        Training data with 'wvl' and 'meta' top-level columns.  It is
        modified in place: the columns ``('meta', ycol + '_cv_PLS_nc<i>')``
        and ``('meta', ycol + '_PLS_nc<i>')`` are added for every ``i``.
    Test : DataFrame, optional
        Test data; receives a ``('meta', ycol + '_PLS_nc<i>')`` column for
        every ``i``.
    nc : int
        Largest number of PLS components to evaluate.
    nfolds : int
        Not used by this function; the folds come from the 'Folds' column.
    ycol : str
        Name of the target column under 'meta'.
    doplot : bool
        When truthy, save a plot of the RMSE curves.
    outpath, plotfile : str
        The plot is saved to ``outpath + '/' + plotfile``.

    Returns
    -------
    dict
        ``{'RMSEC': array, 'RMSECV': array}`` plus ``'RMSEP'`` when ``Test``
        is given; each array has one entry per number of components.
    """
    def rmse(frame, predicted_col):
        # Root-mean-square difference between the target and a prediction column.
        return np.sqrt(np.mean(np.subtract(frame[('meta', ycol)], frame[predicted_col]) ** 2, axis=0))

    # create empty arrays for the RMSE values
    pls_rmsecv = np.empty(nc)
    pls_rmsec = np.empty(nc)
    # If there is a test set provided, create the RMSEP array to hold test set errors
    if Test is not None:
        pls_rmsep = np.empty(nc)

    # loop through each number of components
    for i in range(1, nc + 1):
        print('nc=' + str(i))
        cv_col = ('meta', ycol + '_cv_PLS_nc' + str(i))
        fit_col = ('meta', ycol + '_PLS_nc' + str(i))

        # Result columns are created as floats (0.0 rather than 0) so the
        # float predictions can be stored without a dtype change.
        Train[cv_col] = 0.0  # PLS cross validation results for this nc
        Train[fit_col] = 0.0  # PLS training set results for this nc
        if Test is not None:
            Test[fit_col] = 0.0  # PLS test set results for this nc

        # Do the cross validation: one iteration per fold label in the training data
        cv_iterator = LeaveOneLabelOut(Train[('meta', 'Folds')])
        for train, holdout in cv_iterator:
            cv_train = Train.iloc[train]
            cv_holdout = Train.iloc[holdout]

            # mean center the training folds and apply the same centering to the holdout
            cv_train_centered, cv_train_mean_vect = meancenter(cv_train)
            cv_holdout_centered, cv_holdout_mean_vect = meancenter(
                cv_holdout, previous_mean=cv_train_mean_vect)

            pls = PLSRegression(n_components=i, scale=False)
            pls.fit(cv_train_centered['wvl'], cv_train_centered['meta'][ycol])
            y_pred_holdout = pls.predict(cv_holdout_centered['wvl'])
            # DataFrame.set_value was removed from pandas; .loc is the
            # supported equivalent.  ravel() accepts both (n, 1) and (n,)
            # prediction arrays.
            Train.loc[Train.index[holdout], cv_col] = np.ravel(y_pred_holdout)

        pls_rmsecv[i - 1] = rmse(Train, cv_col)

        # Do train and test set PLS predictions for this number of components
        Train_centered, Train_mean_vect = meancenter(Train)
        pls = PLSRegression(n_components=i, scale=False)
        pls.fit(Train_centered['wvl'], Train_centered['meta'][ycol])
        Train[fit_col] = np.ravel(pls.predict(Train_centered['wvl']))
        pls_rmsec[i - 1] = rmse(Train, fit_col)

        if Test is not None:
            # Center the test set with the training mean; the returned mean
            # vector is not needed again.
            Test_centered, _ = meancenter(Test, previous_mean=Train_mean_vect)
            Test[fit_col] = np.ravel(pls.predict(Test_centered['wvl']))
            pls_rmsep[i - 1] = rmse(Test, fit_col)

    if doplot:
        components = range(1, nc + 1)
        plot.figure()
        plot.title(ycol)
        plot.xlabel('# of components')
        plot.ylabel(ycol + ' RMSE (wt.%)')
        plot.plot(components, pls_rmsecv, label='RMSECV', color='r')
        plot.plot(components, pls_rmsec, label='RMSEC', color='b')
        if Test is not None:
            plot.plot(components, pls_rmsep, label='RMSEP', color='g')
        plot.legend(loc=0, fontsize=6)
        plot.savefig(outpath + '/' + plotfile, dpi=600)

    rmses = {'RMSEC': pls_rmsec, 'RMSECV': pls_rmsecv}
    if Test is not None:
        rmses['RMSEP'] = pls_rmsep
    return rmses