def test_sm_blend():
    """Blend PLS submodel predictions with ``sm`` and check RMSE and ranges.

    Three PLS models (3, 5 and 4 components) are fit on the same data and
    their predictions are blended twice: first with the fixed blend ranges,
    then with ``truevals`` supplied so the ranges are optimized.
    """
    df = pd.read_csv(get_path('test_data.csv'), header=[0, 1])
    x = df['wvl']
    y = df[('comp', 'SiO2')]

    # Fit each submodel in turn and store its predictions in the data frame.
    for name, n_components in (('model1', 3), ('model2', 5), ('model3', 4)):
        submodel = reg.regression(
            method=['PLS'],
            params=[{'n_components': n_components, 'scale': False}])
        submodel.fit(x, y)
        df[('predict', name)] = submodel.predict(x)

    # Order matters: one prediction set per blending slot.
    blend_order = ('model2', 'model1', 'model3', 'model1')
    predictions = [df[('predict', name)] for name in blend_order]

    def _rmse(blended):
        return np.sqrt(np.average((blended - df[('comp', 'SiO2')])**2))

    blendranges = [[-9999, 30], [20, 60], [50, 9999]]
    sm_obj = sm.sm(blendranges)

    # Without optimization.
    blended_predictions = sm_obj.do_blend(np.array(predictions))
    np.testing.assert_almost_equal(
        _rmse(blended_predictions), 12.703434300128926, decimal=5)

    # With optimization against the true values.
    blended_predictions = sm_obj.do_blend(
        np.array(predictions), truevals=np.array(df[('comp', 'SiO2')]))
    rmse = _rmse(blended_predictions)
    expected_blendranges = [
        -9999., 36.5198746, 47.98157746, 56.2537253, 118.94036468, 9999.
    ]
    np.testing.assert_almost_equal(rmse, 9.954065920454982, decimal=5)
    np.testing.assert_allclose(
        expected_blendranges, sm_obj.blendranges, rtol=1e-5)
def setup(self):
    """Register an (unfitted) regression model built from the widget state.

    Reads the selected algorithm, x/y variables and y range from the UI,
    asks the algorithm's sub-widget for its parameters, and stores a new
    ``regression.regression`` object plus its x/y variable lists under a
    descriptive model key. ``'Model Coefficients'`` is appended to
    ``self.datakeys`` if it is not already there.

    The whole registration step is best-effort: any ordinary exception is
    swallowed so that setup never interrupts the caller.
    """
    self.setComboBox(self.chooseDataComboBox, self.datakeys)
    method = self.chooseAlgorithmComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [
        self.yMinDoubleSpinBox.value(),
        self.yMaxDoubleSpinBox.value()
    ]
    try:
        params, modelkey = self.alg[method].run()
        try:
            modelkey = "{} - {} - ({}, {}) {}".format(
                method, yvars[0][-1], yrange[0], yrange[1], modelkey)
        except IndexError:
            # yvars is empty when no y variable is selected.
            modelkey = "Problem naming model - make sure you have selected a y variable"
        self.list_amend(self.modelkeys, self.curr_count, modelkey)
        self.models[modelkey] = regression.regression([method], [yrange],
                                                      [params])
        self.model_xvars[modelkey] = xvars
        self.model_yvars[modelkey] = yvars
        if 'Model Coefficients' not in self.datakeys:
            self.datakeys.append('Model Coefficients')
    except Exception:
        # Deliberately best-effort, but no longer swallows
        # KeyboardInterrupt / SystemExit as the bare ``except:`` did.
        pass
def test_OLS():
    """Fit OLS with an intercept and check the RMSE of its own predictions."""
    model = regression(method=['OLS'], params=[{'fit_intercept': True}])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 5.604104598379565)
def test_OMP():
    """Fit OMP with an intercept and check the RMSE of its own predictions."""
    model = regression(method=['OMP'], params=[{'fit_intercept': True}])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 9.835802028648189)
def run(self):
    """Train a regression model from the widget state and record its coefficients.

    Steps, in order:

    1. Make sure ``'Model Coefficients'`` is listed in ``self.datakeys``
       (bumping ``Modules.data_count`` when it has to be added) and bump
       ``Modules.model_count``.
    2. Read method, data set, x/y variables and y range from the UI and the
       algorithm parameters from the selected algorithm's sub-widget.
    3. Fit the model on the rows whose y value lies strictly inside the
       y range.
    4. Best-effort: append the fitted coefficients (and intercept, when the
       model has one) to ``self.data['Model Coefficients']``.
    """
    if 'Model Coefficients' not in self.datakeys:
        Modules.data_count += 1
        self.list_amend(self.datakeys, Modules.data_count,
                        'Model Coefficients')
    Modules.model_count += 1
    self.count = Modules.model_count

    method = self.chooseAlgorithmComboBox.currentText()
    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [
        self.yMinDoubleSpinBox.value(),
        self.yMaxDoubleSpinBox.value()
    ]
    params, modelkey = self.alg[
        self.chooseAlgorithmComboBox.currentText()].run()
    modelkey = "{} - {} - ({}, {}) {}".format(method, yvars[0][-1],
                                              yrange[0], yrange[1], modelkey)
    self.list_amend(self.modelkeys, self.count, modelkey)
    self.models[modelkey] = regression.regression([method], [yrange],
                                                  [params])

    x = np.array(self.data[datakey].df[xvars])
    y = np.array(self.data[datakey].df[yvars])
    # Keep only the samples whose y value is strictly within the range.
    ymask = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    y = y[ymask]
    x = x[ymask, :]
    self.models[modelkey].fit(x, y)
    self.model_xvars[modelkey] = xvars
    self.model_yvars[modelkey] = yvars

    try:
        coef = np.squeeze(self.models[modelkey].model.coef_)
        coef = pd.DataFrame(coef)
        coef.index = pd.MultiIndex.from_tuples(
            self.data[datakey].df[xvars].columns.values)
        coef = coef.T
        coef[('meta', 'Model')] = modelkey
        try:
            coef[('meta',
                  'Intercept')] = self.models[modelkey].model.intercept_
        except Exception:
            # No usable intercept on this model: leave the column out.
            pass
        try:
            self.data['Model Coefficients'] = spectral_data(
                pd.concat([self.data['Model Coefficients'].df, coef]))
        except Exception:
            # No existing coefficient table to extend: start a new one.
            self.data['Model Coefficients'] = spectral_data(coef)
    except Exception:
        # Coefficient export is best-effort (e.g. the model exposes no
        # ``coef_``); unlike a bare ``except:`` this does not swallow
        # KeyboardInterrupt / SystemExit.
        pass
def test_KRR():
    """Construct a KRR ``regression`` object; nothing is fit or asserted."""
    settings = {
        'alpha': 0,
        'kernel': 'linear',
        'gamma': 'None',
        'degree': 3.0,
        'coef0': 1.0,
        'kernel_params': 'None',
    }
    model = regression(method=['KRR'], yrange=[0.0, 100.0], params=[settings])
def test_badfit():
    """A PLS fit requesting 300 components must leave ``goodfit`` false."""
    settings = {'n_components': 300, 'scale': False}
    model = regression(method=['PLS'], params=[settings])
    model.fit(x, y)
    assert model.goodfit == False
def test_LASSO():
    """Construct a LASSO ``regression`` object; nothing is fit or asserted."""
    settings = {
        'alpha': 1.0,
        'fit_intercept': True,
        'max_iter': 1000,
        'tol': 0.0001,
        'positive': False,
        'selection': 'random',
    }
    model = regression(method=['LASSO'], yrange=[0.0, 100.0],
                       params=[settings])
def test_Ridge():
    """Construct a Ridge ``regression`` object; nothing is fit or asserted."""
    settings = {
        'alpha': 1.0,
        'copy_X': True,
        'fit_intercept': True,
        'max_iter': 'None',
        'normalize': False,
        'solver': 'auto',
        'tol': 0.0,
        'random_state': '',
    }
    model = regression(method=['Ridge'], yrange=[0.0, 100.0],
                       params=[settings])
def test_LARS2_CV_true():
    """Construct a LARS ``regression`` object with ``'CV': True``; no fit."""
    settings = {
        'fit_intercept': True,
        'positive': False,
        'verbose': False,
        'normalize': False,
        'precompute': True,
        'copy_X': True,
        'eps': 2.220445,
        'CV': True,
    }
    model = regression(method=['LARS'], yrange=[0.0, 100.0],
                       params=[settings])
def test_Lasso_LARS_model_none():
    """Construct a 'Lasso LARS' ``regression`` object with ``'model': None``."""
    settings = {
        'fit_intercept': True,
        'positive': False,
        'verbose': False,
        'normalize': True,
        'copy_X': True,
        'precompute': 'Auto',
        'max_iter': 500,
        'model': None,
        'eps': 2.220446,
    }
    model = regression(method=['Lasso LARS'], yrange=[0.0, 100.0],
                       params=[settings])
def test_LARS():
    """Construct a LARS ``regression`` object; nothing is fit or asserted."""
    settings = {
        'n_nonzero_coefs': 500,
        'fit_intercept': True,
        'positive': False,
        'verbose': False,
        'normalize': False,
        'precompute': True,
        'copy_X': True,
        'eps': 2.220445,
        'fit_path': True,
    }
    model = regression(method=['LARS'], yrange=[0.0, 100.0],
                       params=[settings])
def test_Ridge():
    """Fit Ridge (alpha=1.0) and check the RMSE of its own predictions."""
    settings = {'alpha': 1.0, 'fit_intercept': True}
    model = regression(method=['Ridge'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 19.29172384871638)
def test_LASSO():
    """Fit LASSO (alpha=1.0) and check the RMSE of its own predictions."""
    settings = {'alpha': 1.0, 'fit_intercept': True, 'positive': False}
    model = regression(method=['LASSO'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 22.815757879917708)
def test_SVR():
    """Construct an SVR ``regression`` object; nothing is fit or asserted."""
    settings = {
        'C': 1.0,
        'epsilon': 0.1,
        'kernel': 'rbf',
        'degree': 0,
        'gamma': 'auto',
        'coef0': 0.0,
        'shrinking': False,
        'tol': 0.001,
        'cache_size': 200,
        'verbose': False,
        'max_iter': -1,
    }
    model = regression(method=['SVR'], yrange=[0.0, 100.0], params=[settings])
def test_Elastic_Net_CV_true():
    """Construct an 'Elastic Net' ``regression`` object with ``'CV': True``."""
    settings = {
        'l1_ratio': 0.5,
        'fit_intercept': True,
        'normalize': False,
        'precompute': 'False',
        'max_iter': 1000,
        'copy_X': True,
        'tol': 0.0001,
        'positive': False,
        'selection': 'cyclic',
        'random_state': 'None',
        'CV': True,
    }
    model = regression(method=['Elastic Net'], yrange=[0.0, 100.0],
                       params=[settings])
def test_Elastic_Net():
    """Fit Elastic Net and check the RMSE of its own predictions."""
    settings = {
        'alpha': 1.0,
        'l1_ratio': 0.5,
        'fit_intercept': True,
        'positive': False,
    }
    model = regression(method=['Elastic Net'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 22.800420060822468)
def test_Bayesian_Ridge():
    """Construct a 'Bayesian Ridge' ``regression`` object; no fit, no asserts."""
    settings = {
        'n_iter': 300,
        'tol': 0.001,
        'alpha_1': 0.001,
        'alpha_2': 1e-06,
        'lambda_1': 1e-06,
        'lambda_2': 1e-06,
        'compute_score': False,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'verbose': False,
    }
    model = regression(method=['Bayesian Ridge'], yrange=[0.0, 100.0],
                       params=[settings])
def test_ARD():
    """Construct an ARD ``regression`` object; nothing is fit or asserted."""
    settings = {
        'n_iter': 300,
        'tol': 0.001,
        'alpha_1': 0.001,
        'alpha_2': 1e-06,
        'lambda_1': 1e-06,
        'lambda_2': 1e-06,
        'compute_score': False,
        'threshold_lambda': 100000,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'verbose': False,
    }
    model = regression(method=['ARD'], yrange=[0.0, 100.0], params=[settings])
def test_KRR():
    """Fit linear-kernel KRR and check the RMSE to two decimal places."""
    settings = {
        'alpha': 0,
        'kernel': 'linear',
        'gamma': 'None',
        'degree': 3.0,
        'coef0': 1.0,
        'kernel_params': 'None',
    }
    model = regression(method=['KRR'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 5.603702809509191, decimal=2)
def test_LARS():
    """Fit LARS (5 non-zero coefficients) and check the training RMSE."""
    settings = {
        'n_nonzero_coefs': 5,
        'fit_intercept': True,
        'normalize': False,
        'precompute': True,
        'copy_X': True,
        'eps': 2.220445,
        'fit_path': True,
    }
    model = regression(method=['LARS'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 21.952591101815294)
def test_Bayesian_Ridge():
    """Fit the 'BRR' method and check the RMSE of its own predictions."""
    settings = {
        'n_iter': 300,
        'tol': 0.001,
        'alpha_1': 0.001,
        'alpha_2': 1e-06,
        'lambda_1': 1e-06,
        'lambda_2': 1e-06,
        'compute_score': False,
        'fit_intercept': True,
        'normalize': False,
    }
    model = regression(method=['BRR'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 6.3894201026386135)
def test_SVR():
    """Fit RBF-kernel SVR and check the RMSE of its own predictions."""
    settings = {
        'C': 1.0,
        'epsilon': 0.1,
        'kernel': 'rbf',
        'degree': 0,
        'gamma': 'auto',
        'coef0': 0.0,
        'shrinking': False,
        'tol': 0.001,
        'cache_size': 200,
        'verbose': False,
        'max_iter': -1,
    }
    model = regression(method=['SVR'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 23.740048198035947)
def test_ARD():
    """Fit ARD regression and check the RMSE of its own predictions."""
    settings = {
        'n_iter': 300,
        'tol': 0.001,
        'alpha_1': 0.001,
        'alpha_2': 1e-06,
        'lambda_1': 1e-06,
        'lambda_2': 1e-06,
        'compute_score': False,
        'threshold_lambda': 100000,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'verbose': False,
    }
    model = regression(method=['ARD'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 6.714452573751844)
def test_PLS():
    """Fit 3-component PLS; check RMSE, Q residuals and leverage.

    After the RMSE check, ``calc_Qres_Lev`` is run on the same data and the
    first ten ``Q_res`` and ``leverage`` values are compared with stored ones.
    """
    settings = {'n_components': 3, 'scale': False}
    model = regression(method=['PLS'], params=[settings])
    model.fit(x, y)
    residuals = np.squeeze(model.predict(x)) - y
    rmse = np.sqrt(np.average(residuals**2))
    np.testing.assert_almost_equal(rmse, 9.568890617713505)

    model.calc_Qres_Lev(x)

    Qres_expected = [
        0.04055878, 0.04188589, 0.04159104, 0.04374264, 0.04080776,
        0.04072383, 0.04057845, 0.04053754, 0.04056575, 0.04077855
    ]
    np.testing.assert_array_almost_equal(model.Q_res[0:10], Qres_expected)

    leverage_expected = [
        0.01225164, 0.01219529, 0.01431885, 0.03043435, 0.05013193,
        0.01418457, 0.01055998, 0.00554777, 0.00891671, 0.00912439
    ]
    np.testing.assert_array_almost_equal(model.leverage[0:10],
                                         leverage_expected)
def test_OLS():
    """Construct an OLS ``regression`` object; nothing is fit or asserted."""
    settings = {'fit_intercept': True}
    model = regression(method=['OLS'], yrange=[0.0, 100.0], params=[settings])
def test_OMP_CV_false():
    """Construct an OMP ``regression`` object with ``'CV': False``; no fit."""
    settings = {'fit_intercept': True, 'CV': False}
    model = regression(method=['OMP'], yrange=[0.0, 100.0], params=[settings])
def test_Ridge_CV_true():
    """Construct a Ridge ``regression`` object with ``'CV': True``; no fit."""
    settings = {'fit_intercept': True, 'normalize': False, 'CV': True}
    model = regression(method=['Ridge'], yrange=[0.0, 100.0],
                       params=[settings])
def test_OMP():
    """Construct an OMP ``regression`` object; nothing is fit or asserted."""
    settings = {'fit_intercept': True, 'precompute': True}
    model = regression(method=['OMP'], yrange=[0.0, 100.0], params=[settings])
def do_cv(self, Train, xcols='wvl', ycol=('comp', 'SiO2'), method='PLS', yrange=None):
    """Cross-validate every parameter permutation in ``self.paramgrid``.

    For each permutation the model is fit once per predefined fold (leaving
    that fold out) and once on the full training set. Hold-out and
    calibration predictions are written into ``Train`` as new ``'predict'``
    columns.

    Parameters
    ----------
    Train : data frame with the spectra, the y column and ``('meta', 'Folds')``.
    xcols : column key selecting the predictor columns.
    ycol : column key of the quantity being predicted.
    method : regression method name passed to ``regression``.
    yrange : two-element sequence passed to ``regression`` and used in the
        model key; defaults to ``[0, 100]``.

    Returns
    -------
    (Train, output, models, modelkeys) where ``output`` is a data frame of
    the parameter grid plus RMSEC, RMSECV and per-fold RMSE columns (all
    under a ``'cv'`` top-level column), and ``models`` / ``modelkeys`` hold
    only the permutations whose full-data fit was good.

    Raises
    ------
    KeyError
        If ``Train`` has no ``('meta', 'Folds')`` column. The hint is still
        printed first; previously execution continued and failed later with
        an unrelated ``NameError``.
    """
    if yrange is None:
        # Fresh list per call instead of a shared mutable default argument.
        yrange = [0, 100]

    try:
        folds = Train[('meta', 'Folds')]
    except KeyError:
        print('***No folds found! Did you remember to define folds before running cross validation?***')
        raise
    # create an iterator for cross validation based on the predefined folds
    cv_iterator = LeaveOneLabelOut(folds)

    rmsecv_folds = []
    rmsec = []
    rmsecv = []
    models = []
    modelkeys = []

    # loop through the grid of parameters, do cross validation for each permutation
    for params in self.paramgrid:
        print(params)
        model = regression([method], [yrange], [params])
        # NOTE(review): ycol[0][-1] yields the y name only when ycol is a
        # list of tuples; with the default tuple it is the last character of
        # 'comp' -- confirm against callers.
        modelkey = "{} - {} - ({}, {}) {}".format(method, ycol[0][-1],
                                                  yrange[0], yrange[1],
                                                  params)
        # name of the column in which hold-out predictions are stored
        cvcol = ('predict', '"' + method + '-CV-' + str(params) + '"')

        fold_rmses = []  # RMSECV for each fold of this permutation
        for train, holdout in cv_iterator:
            cv_train = Train.iloc[train]  # data used to create the model
            cv_holdout = Train.iloc[holdout]  # data held out of the model

            model.fit(cv_train[xcols], cv_train[ycol])
            if model.goodfit:
                y_pred_holdout = model.predict(cv_holdout[xcols])
            else:
                y_pred_holdout = cv_holdout[ycol] * np.nan
            # NOTE(review): DataFrame.set_value was removed in pandas 1.0;
            # left as-is because the replacement depends on the shape that
            # model.predict returns.
            Train.set_value(Train.index[holdout], cvcol, y_pred_holdout)
            fold_rmses.append(RMSE(y_pred_holdout, cv_holdout[ycol]))

        rmsecv_folds.append(fold_rmses)
        rmsecv.append(RMSE(Train[ycol], Train[cvcol]))

        # refit on the full training set for the calibration error
        model.fit(Train[xcols], Train[ycol])
        if model.goodfit:
            models.append(model)
            modelkeys.append(modelkey)
            ypred_train = model.predict(Train[xcols])
        else:
            ypred_train = Train[ycol] * np.nan
        calcol = ('predict', '"' + method + '-Cal-' + str(params) + '"')
        Train[calcol] = ypred_train
        rmsec.append(RMSE(ypred_train, Train[ycol]))

    output = pd.DataFrame(self.paramgrid)
    output['RMSEC'] = rmsec
    output['RMSECV'] = rmsecv
    rmsecv_folds = np.array(rmsecv_folds)
    for fold_idx in range(len(rmsecv_folds[0, :])):
        output['Fold' + str(fold_idx)] = rmsecv_folds[:, fold_idx]

    # put every output column under a 'cv' top-level label
    output.columns = pd.MultiIndex.from_tuples(
        [('cv', name) for name in output.columns.values])

    return Train, output, models, modelkeys
def do_cv(self, Train, xcols='wvl', ycol=('comp', 'SiO2'), method='PLS', yrange=None, calc_path=False, alphas=None):
    """Cross-validate every parameter permutation in ``self.paramgrid``.

    For each permutation, each predefined fold in ``Train[('meta', 'Folds')]``
    is held out in turn, predicted, and scored; then a model is fit on the
    full training set. Predictions are written into ``Train`` as new
    ``'predict'`` columns.

    Parameters
    ----------
    Train : data frame with the spectra, the y column and ``('meta', 'Folds')``.
    xcols : column key selecting the predictor columns.
    ycol : column key of the quantity being predicted.
    method : regression method name; ``'Local Regression'`` is handled by
        ``local_regression.LocalRegression``, everything else by ``regression``.
    yrange : two-element sequence used only in the model keys; defaults to
        the min and max of ``Train[ycol]``.
    calc_path : when true, ``path_calc`` is used to evaluate every value in
        ``alphas`` at once instead of fitting a single model.
    alphas : alpha values passed to ``path_calc`` and recorded in the output.

    Returns
    -------
    ``0`` if ``Train`` has no ``('meta', 'Folds')`` column (after printing a
    hint); otherwise ``(Train, output, models, modelkeys, predictkeys)``.
    ``output`` has one row per permutation (per alpha when ``alphas`` is
    given) with all columns under a ``'cv'`` top-level label. With an empty
    parameter grid ``output`` is an empty data frame.

    Notes
    -----
    For ``'Local Regression'`` the ``'n_neighbors'`` entry is popped out of
    ``self.paramgrid[i]`` on first use, i.e. the grid entry is modified.
    """
    models = []
    modelkeys = []
    predictkeys = []
    output = None  # accumulated results; stays None until the first permutation

    if yrange is None:
        # Fixed: the original called Train(ycol), which raises TypeError.
        yrange = [np.min(Train[ycol]), np.max(Train[ycol])]

    n_permutations = len(self.paramgrid)
    for i in range(n_permutations):
        print('Permutation ' + str(i + 1) + ' of ' + str(n_permutations))
        print('; '.join(key + ': ' + str(self.paramgrid[i][key])
                        for key in self.paramgrid[i].keys()))

        try:
            # create an iterator for cross validation based on the predefined folds
            cv_iterator = LeaveOneGroupOut().split(
                Train[xcols], Train[ycol], Train[('meta', 'Folds')])
            n_folds = LeaveOneGroupOut().get_n_splits(
                groups=Train[('meta', 'Folds')])
        except KeyError:
            print(
                '***No folds found! Did you remember to define folds before running cross validation?***'
            )
            return 0

        # create an empty output data frame to serve as template
        output_tmp = pd.DataFrame()
        # add columns for RMSEC, RMSECV, and RMSE for the folds
        output_tmp['RMSEC'] = 0
        output_tmp['RMSECV'] = 0
        for f in range(1, n_folds + 1):
            output_tmp['Fold ' + str(f)] = 0
        # fill in the output template based on the current permutation parameters
        for k in self.paramgrid[i].keys():
            output_tmp.at[0, k] = self.paramgrid[i][k]
        if alphas is not None:
            output_tmp = pd.concat([output_tmp] * len(alphas))
            output_tmp['alphas'] = alphas
        output_tmp['Method'] = method

        foldcount = 1
        for train, holdout in cv_iterator:
            cv_train = Train.iloc[train]  # data used to create the model
            cv_holdout = Train.iloc[holdout]  # data held out of the model

            if calc_path:
                # get X and y data
                X = cv_train[xcols]
                y = cv_train[ycol]
                # do the path calculation
                (path_alphas, path_coefs, intercepts, path_n_iters,
                 y_pred_holdouts, fold_rmses, cvcols) = path_calc(
                     X, y, cv_holdout[xcols], cv_holdout[ycol], alphas,
                     self.paramgrid[i], yname=ycol[0][-1], method=method)
                output_tmp['Fold ' + str(foldcount)] = fold_rmses
                for n in range(len(path_alphas)):
                    Train.at[Train.index[holdout],
                             cvcols[n]] = y_pred_holdouts[n]
            else:
                if method == 'Local Regression':
                    params = self.paramgrid[i]
                    # On the first pass, pop n_neighbors out of the grid
                    # entry so it can be passed separately; later passes
                    # reuse the value already captured.
                    if 'n_neighbors' in params:
                        n_neighbors = params.pop('n_neighbors')
                    cvcols = [('predict',
                               '"' + method + '- CV -' +
                               str(self.paramgrid[i]) + ' n_neighbors: ' +
                               str(n_neighbors) + '"')]
                    model = local_regression.LocalRegression(
                        params, n_neighbors=n_neighbors)
                    y_pred_holdout, coeffs, intercepts = model.fit_predict(
                        cv_train[xcols], cv_train[ycol], cv_holdout[xcols])
                else:
                    cvcols = [('predict', '"' + method + '- CV -' +
                               str(self.paramgrid[i]) + '"')]
                    # fit the model and predict the held-out data
                    model = regression([method], [self.paramgrid[i]])
                    model.fit(cv_train[xcols], cv_train[ycol])
                    if model.goodfit:
                        y_pred_holdout = model.predict(cv_holdout[xcols])
                    else:
                        y_pred_holdout = cv_holdout[ycol] * np.nan
                # add the predictions to the appropriate column in the training data
                Train.at[Train.index[holdout], cvcols[0]] = y_pred_holdout
                # record the RMSE for this fold
                output_tmp['Fold ' + str(foldcount)] = RMSE(
                    y_pred_holdout, cv_holdout[ycol])

            foldcount = foldcount + 1

        # now that all the folds have been held out and predicted, calculate
        # the overall rmsecv and add it to the output
        rmsecv = []
        for col in cvcols:
            rmsecv.append(RMSE(Train[col], Train[ycol]))
            predictkeys.append(col[-1])
        output_tmp['RMSECV'] = rmsecv

        # fit the model on the full training set using the current settings
        if calc_path:
            X = Train[xcols]
            y = Train[ycol]
            (path_alphas, path_coefs, intercepts, path_n_iters, ypred_train,
             rmsec_train, cols) = path_calc(
                 X, y, X, y, alphas, self.paramgrid[i], colname='Cal',
                 yname=ycol[0][-1], method=method)
            for n in range(len(path_alphas)):
                # put the training set predictions in the data frame
                Train[cols[n]] = ypred_train[n]
                predictkeys.append(cols[n][-1])
                # create the model and manually set its parameters based on
                # the path results rather than training it
                model = regression([method], [self.paramgrid[i]])
                model.model.set_params(alpha=path_alphas[n])
                setattr(model.model, 'intercept_', intercepts[n])
                setattr(model.model, 'coef_', np.squeeze(path_coefs)[:, n])
                setattr(model.model, 'n_iter_', path_n_iters[n])
                # add the model and its name to the list
                models.append(model)
                modelkey = "{} - {} - ({}, {}) Alpha: {}, {}".format(
                    method, ycol[0][-1], yrange[0], yrange[1],
                    path_alphas[n], self.paramgrid[i])
                modelkeys.append(modelkey)
            output_tmp['RMSEC'] = rmsec_train
        else:
            if method == 'Local Regression':
                model = local_regression.LocalRegression(
                    self.paramgrid[i], n_neighbors=n_neighbors)
                modelkey = "{} - {} - ({}, {}) {} n_neighbors: {}".format(
                    method, ycol[0][-1], yrange[0], yrange[1],
                    self.paramgrid[i], n_neighbors)
            else:
                model = regression([method], [self.paramgrid[i]])
                modelkey = "{} - {} - ({}, {}) {}".format(
                    method, ycol[0][-1], yrange[0], yrange[1],
                    self.paramgrid[i])
            models.append(model)
            modelkeys.append(modelkey)

            # default to NaN predictions; overwritten when the fit succeeds
            ypred_train = Train[ycol] * np.nan
            if method == 'Local Regression':
                ypred_train, coeffs, intercepts = model.fit_predict(
                    Train[xcols], Train[ycol], Train[xcols])
            else:
                model.fit(Train[xcols], Train[ycol])
                # if the fit is good, then predict the training set
                if model.goodfit:
                    ypred_train = model.predict(Train[xcols])
                else:
                    # bad fit: drop the model that was just appended
                    models = models[:-1]
                    modelkeys = modelkeys[:-1]

            # add the calibration predictions to the appropriate column
            if method == 'Local Regression':
                calcol = ('predict', '"' + method + '- Cal -' +
                          str(self.paramgrid[i]) + ' n_neighbors: ' +
                          str(n_neighbors) + '"')
            else:
                calcol = ('predict', '"' + method + '- Cal -' +
                          str(self.paramgrid[i]) + '"')
            predictkeys.append(calcol[-1])
            Train[calcol] = ypred_train
            # record the RMSEC for the current settings
            output_tmp['RMSEC'] = RMSE(ypred_train, Train[ycol])

        # Accumulate explicitly rather than relying on a NameError caught by
        # a bare ``except:`` on the first permutation.
        if output is None:
            output = output_tmp
        else:
            output = pd.concat((output, output_tmp))

    if output is None:
        # empty parameter grid: nothing was evaluated
        output = pd.DataFrame()
    else:
        # make the columns of the output data frame multi-indexed
        output.columns = pd.MultiIndex.from_tuples(
            [('cv', name) for name in output.columns.values])

    return Train, output, models, modelkeys, predictkeys
def test_PLS():
    """Construct a PLS ``regression`` object with zero components; no fit."""
    settings = {'n_components': 0, 'scale': False}
    model = regression(method=['PLS'], yrange=[0.0, 100.0], params=[settings])