def run(self):
    """Stratify the chosen data set into folds and split it into train/test sets.

    Reserves two slots in the shared data index (train and test), assigns each
    row to a fold stratified on the chosen composition column, then registers
    '<datakey>-Train' and '<datakey>-Test' data sets.
    """
    # Reserve two unique slots in the shared data index.
    Modules.data_count += 1
    self.train_ind = Modules.data_count
    Modules.data_count += 1
    self.test_ind = Modules.data_count

    datakey = self.chooseDataToStratifyComboBox.currentText()
    nfolds = self.nFoldsSpinBox.value()
    try:
        testfold = int(self.testFoldsSpinBox.value())
    except (TypeError, ValueError):
        # Bug fix: a bare except also swallowed KeyboardInterrupt/SystemExit.
        # Fall back to the first fold if the spin-box value is unusable.
        testfold = 1
    colname = ('comp', self.chooseVarComboBox.currentText())
    # Assign each row to a fold, stratified on the chosen column.
    self.data[datakey] = spectral_data(
        stratified_folds(self.data[datakey].df, nfolds=nfolds,
                         sortby=colname))
    # Train = every fold except the held-out test fold; Test = that fold.
    self.data[datakey + '-Train'] = spectral_data(
        rows_match(self.data[datakey].df, ('meta', 'Folds'), [testfold],
                   invert=True))
    self.data[datakey + '-Test'] = spectral_data(
        rows_match(self.data[datakey].df, ('meta', 'Folds'), [testfold]))
    self.list_amend(self.datakeys, self.train_ind, datakey + '-Train')
    self.list_amend(self.datakeys, self.test_ind, datakey + '-Test')
    print(self.datakeys)
    print('Test set: ' + str(self.data[datakey + '-Test'].df.index.shape[0]))
    print('Training set: ' +
          str(self.data[datakey + '-Train'].df.index.shape[0]))
def run(self):
    # Train a regression model on the selected data set and archive its
    # coefficients into the shared 'Model Coefficients' data set.
    if 'Model Coefficients' in self.datakeys:
        pass
    else:
        # First model trained: reserve a slot for the coefficients table.
        Modules.data_count += 1
        self.list_amend(self.datakeys, Modules.data_count,
                        'Model Coefficients')
    Modules.model_count += 1
    self.count = Modules.model_count
    method = self.chooseAlgorithmComboBox.currentText()
    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [
        self.yMinDoubleSpinBox.value(),
        self.yMaxDoubleSpinBox.value()
    ]
    # Each algorithm widget returns its parameter dict and a key fragment.
    params, modelkey = self.alg[
        self.chooseAlgorithmComboBox.currentText()].run()
    modelkey = "{} - {} - ({}, {}) {}".format(method, yvars[0][-1],
                                              yrange[0], yrange[1],
                                              modelkey)
    self.list_amend(self.modelkeys, self.count, modelkey)
    self.models[modelkey] = regression.regression([method], [yrange],
                                                  [params])
    x = self.data[datakey].df[xvars]
    y = self.data[datakey].df[yvars]
    x = np.array(x)
    y = np.array(y)
    # Keep only rows whose y value lies strictly inside the chosen range.
    ymask = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    y = y[ymask]
    x = x[ymask, :]
    self.models[modelkey].fit(x, y)
    self.model_xvars[modelkey] = xvars
    self.model_yvars[modelkey] = yvars
    try:
        # Not every estimator exposes coef_; skip bookkeeping if absent.
        coef = np.squeeze(self.models[modelkey].model.coef_)
        coef = pd.DataFrame(coef)
        coef.index = pd.MultiIndex.from_tuples(
            self.data[datakey].df[xvars].columns.values)
        coef = coef.T
        coef[('meta', 'Model')] = modelkey
        try:
            coef[('meta',
                  'Intercept')] = self.models[modelkey].model.intercept_
        except:
            # Some models have no intercept; best-effort only.
            pass
        try:
            # Append to the running coefficients table, creating it on
            # first use.
            self.data['Model Coefficients'] = spectral_data(
                pd.concat([self.data['Model Coefficients'].df, coef]))
        except:
            self.data['Model Coefficients'] = spectral_data(coef)
    except:
        pass
def run(self):
    """Remove the baseline from the selected data set.

    Stores the baseline-corrected spectra and the baseline itself under two
    new data keys that encode the algorithm and any changed parameters.
    """
    algorithm = self.chooseAlgorithmComboBox.currentText()
    source_key = self.chooseDataComboBox.currentText()
    # Return the method parameters and the subset the user changed
    # (the changed set becomes part of the new data-key names).
    algo_params, _changed = self.getMethodParams(
        self.chooseAlgorithmComboBox.currentIndex())
    removed_key = f"{source_key}-Baseline Removed-{algorithm}{_changed}"
    baseline_key = f"{source_key}-Baseline-{algorithm}{_changed}"
    self.datakeys.append(removed_key)
    self.datakeys.append(baseline_key)
    # Work on a deep copy so the original data set is left untouched.
    self.data[removed_key] = self.data[source_key].df.copy(deep=True)
    corrected, baseline = remove_baseline(self.data[removed_key], algorithm,
                                          segment=True, params=algo_params)
    self.data[removed_key] = spectral_data(corrected)
    self.data[baseline_key] = spectral_data(baseline)
def run(self):
    # Standardize one data set's spectral channels using the per-channel
    # mean and standard deviation of another, and archive the scaling
    # vectors in the shared 'Standardization Vectors' data set.
    if 'Standardization Vectors' in self.datakeys:
        pass
    else:
        # First run: reserve a slot for the standardization-vector table.
        Modules.data_count += 1
        self.list_amend(self.datakeys, Modules.data_count,
                        'Standardization Vectors')
    datakey_to_scale = self.chooseDataComboBox.currentText()
    datakey_to_fit = self.comboBox.currentText()
    try:
        scaler = StandardScaler()
        # Fit on one data set, then transform the (possibly different) one.
        scaler.fit(self.data[datakey_to_fit].df['wvl'])
        self.data[datakey_to_scale].df['wvl'] = scaler.transform(
            self.data[datakey_to_scale].df['wvl'])
        print(
            datakey_to_scale +
            " standardized using spectral channel mean and standard deviations from "
            + datakey_to_fit)
        try:
            # Archive per-channel variance and mean as a row tagged with
            # the data set the scaler was fit on.
            scaler_out = pd.DataFrame(
                np.vstack((scaler.var_, scaler.mean_)).T)
            scaler_out.index = [
                ('wvl', x)
                for x in self.data[datakey_to_fit].df['wvl'].columns.values
            ]
            scaler_out = scaler_out.T
            scaler_out[('meta', 'Dataset')] = datakey_to_fit
            try:
                self.data['Standardization Vectors'] = spectral_data(
                    pd.concat([
                        self.data['Standardization Vectors'].df,
                        scaler_out
                    ]))
            except:
                # First entry: no existing table to append to.
                self.data['Standardization Vectors'] = spectral_data(
                    scaler_out)
        except:
            # Best-effort archival; the scaling itself already succeeded.
            pass
    except Exception as e:
        print(e)
def update_dataname(self):
    """Rename the current data set entry and refresh its 2-row preview.

    Reads the key name and file name from the line edits, updates the shared
    data-key list at this module's slot, and best-effort reloads the first
    two rows of the file as a cheap preview.
    """
    keyname = self.dataSetNameLineEdit.text()
    filename = self.fileNameLineEdit.text()
    self.list_amend(self.datakeys, self.curr_count, keyname)
    try:
        # nrows=2: load only a preview, not the whole file.
        self.data[keyname] = spectral_data(
            pd.read_csv(filename, header=[0, 1], verbose=False, nrows=2))
    except Exception:
        # Bug fix: bare except also swallowed KeyboardInterrupt/SystemExit.
        # An unreadable/missing file is not fatal here.
        pass
def run(self, filename=None, keyname=None):
    """Load a CSV data file into the shared data dictionary.

    Parameters
    ----------
    filename : str, optional
        Path to the CSV file; defaults to the file-name line edit.
    keyname : str, optional
        Key under which to store the data; defaults to the name line edit.
    """
    # Bug fix: compare to None with `is`, not `==` (PEP 8).
    if filename is None:
        filename = self.fileNameLineEdit.text()
    if keyname is None:
        keyname = self.dataSetNameLineEdit.text()
    print('Loading data file: ' + str(filename))
    # Two header rows -> MultiIndex columns, e.g. ('wvl', ...), ('meta', ...).
    self.data[keyname] = spectral_data(
        pd.read_csv(filename, header=[0, 1], verbose=False))
    self.list_amend(self.datakeys, self.curr_count, keyname)
def run(self):
    """Merge metadata from the configured lookup file into the selected data set."""
    self.lookupfilename = self.lookupfile.text()
    self.read_lookupdata()
    left_key = self.left_on.currentText()
    right_key = self.right_on.currentText()
    selected = self.choosedata.currentText()
    # Join the loaded lookup table onto the data frame and store the result
    # back under the same key.
    merged = lookup.lookup(self.data[selected].df,
                           lookupdf=self.lookupdata,
                           left_on=left_key,
                           right_on=right_key)
    self.data[selected] = spectral_data(merged)
def combine_data(self):
    """Concatenate two selected data sets into a new data set.

    Does nothing when the output key is empty; on concat failure the data
    dictionary is left unchanged (the key is still registered, matching the
    original behavior).
    """
    dataSet1 = self.dataSet1ComboBox.currentText()
    dataSet2 = self.dataSet2ComboBox.currentText()
    newkey = self.outputToDataSetLineEdit.text()
    if newkey != '':
        self.datakeys.append(newkey)
        try:
            self.data[newkey] = spectral_data(
                pd.concat([self.data[dataSet1].df, self.data[dataSet2].df],
                          ignore_index=True))
        except Exception:
            # Bug fix: bare except also swallowed KeyboardInterrupt.
            # Missing keys / incompatible frames leave data untouched.
            pass
def run(self):
    # Stratify the data into folds, split into train/test on the held-out
    # fold, and save a histogram of the stratification variable per fold.
    datakey = self.chooseDataToStratifyComboBox.currentText()
    nfolds = self.nFoldsSpinBox.value()
    try:
        testfold = int(self.testFoldsSpinBox.value())
    except:
        # Fall back to the first fold if the spin-box value is unusable.
        testfold = 1
    colname = ('comp', self.chooseVarComboBox.currentText())
    # Assign each row to a fold, stratified on the chosen column.
    self.data[datakey] = spectral_data(
        stratified_folds(self.data[datakey].df, nfolds=nfolds,
                         sortby=colname))
    # Train = every fold except the held-out test fold; Test = that fold.
    self.data[datakey + '-Train'] = spectral_data(
        rows_match(self.data[datakey].df, ('meta', 'Folds'), [testfold],
                   invert=True))
    self.data[datakey + '-Test'] = spectral_data(
        rows_match(self.data[datakey].df, ('meta', 'Folds'), [testfold]))
    self.datakeys.append(datakey + '-Train')
    self.datakeys.append(datakey + '-Test')
    print(self.data.keys())
    print(self.data[datakey + '-Test'].df.index.shape)
    print(self.data[datakey + '-Train'].df.index.shape)
    #self.stratifiedfoldshist()
    folds = self.data[datakey].df[('meta', 'Folds')]
    # Drop NaN fold assignments before plotting.
    folds_unique = folds.unique()[np.isfinite(folds.unique())]
    for fold in folds_unique:
        # Histogram of the stratification variable within this fold,
        # saved as <outpath>//<var>_fold<N>_hist.png.
        dat_col_folds = self.data[datakey].df[colname][folds == fold]
        plt.hist(dat_col_folds, bins=20)
        plt.xlabel(colname[1])
        plt.ylabel('Frequency')
        plt.title('Histogram of Fold ' + str(int(fold)))
        #plt.axis([0, 100, 0, 100])
        #plt.grid(True)
        # plt.show()
        plt.savefig(self.outpath + '//' + colname[1] + '_fold' +
                    str(int(fold)) + '_hist.png')
        plt.clf()
def run(self):
    # Train a local (per-neighborhood) LASSO model on one data set, predict
    # a second, and archive predictions plus per-sample coefficients.
    method = self.chooseAlgorithmComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    # NOTE(review): these two locals are unused; the params dict below
    # re-reads the same checkboxes directly.
    fit_intercept = self.fit_intercept.isChecked()
    force_positive = self.forcepositive.isChecked()
    params = {
        'fit_intercept': self.fit_intercept.isChecked(),
        'max_iter': 10000,
        'positive': self.forcepositive.isChecked(),
        'selection': 'random',
        'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
    }
    localmodel = local_regression.LocalRegression(
        params, n_neighbors=self.n_neighbors_spin.value())
    traindata = self.data[self.choosedata_train.currentText()]
    predictdata = self.data[self.choosedata_predict.currentText()]
    x_train = np.array(traindata.df[xvars])
    y_train = np.array(traindata.df[yvars])
    x_predict = np.array(predictdata.df[xvars])
    # Fit a separate model in the neighborhood of each prediction sample.
    predictions, coefs, intercepts = localmodel.fit_predict(
        x_train, y_train, x_predict)
    predictname = ('predict', 'Local LASSO - ' +
                   self.choosedata_predict.currentText() + ' - Predict')
    self.data[self.choosedata_predict.currentText(
    )].df[predictname] = predictions
    # One coefficient row per predicted sample; columns follow xvars.
    coefs = pd.DataFrame(coefs,
                         columns=pd.MultiIndex.from_tuples(
                             self.data[self.choosedata_predict.currentText(
                             )].df[xvars].columns.values))
    coefs[('meta', 'Intercept')] = intercepts
    try:
        # Append to the running coefficients table, creating it on first
        # use.
        self.data['Model Coefficients'] = spectral_data(
            pd.concat([self.data['Model Coefficients'].df, coefs]))
    except:
        self.data['Model Coefficients'] = spectral_data(coefs)
    self.datakeys.append('Model Coefficients')
def plot_spect_update_list(self, obj):
    """Repopulate a list widget with row labels from the chosen data set.

    De-duplicates the chosen label column (so repeated names become unique)
    and adds each resulting value to *obj*. Best-effort: any failure leaves
    the list cleared/unchanged.

    Parameters
    ----------
    obj : Qt list-like widget supporting clear() and addItem().
    """
    try:
        obj.clear()
        # Make label values unique before offering them for plotting.
        self.data[self.chooseDataComboBox.currentText()] = spectral_data(
            enumerate_duplicates(
                self.data[self.chooseDataComboBox.currentText()].df,
                self.chooseColumnComboBox.currentText()))
        rowchoices = self.data[self.chooseDataComboBox.currentText()].df[(
            'meta', self.chooseColumnComboBox.currentText())]
        for i in rowchoices:
            obj.addItem(i)
    except Exception:
        # Bug fix: bare except also swallowed KeyboardInterrupt/SystemExit.
        pass
def test_combine_datasets(qtbot):
    """GUI test: combining two identical data sets equals their concat."""
    form = QtWidgets.QWidget()
    gui = CombineDataSets()
    gui.setupUi(form)
    key1 = 'test1'
    key2 = 'test2'
    outkey = 'data'
    # Locate dataset.csv next to this test file regardless of cwd.
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    filename = os.path.join(__location__, 'dataset.csv')
    gui.data[key1] = spectral_data(
        pd.read_csv(filename, header=[0, 1], verbose=True))
    gui.data[key2] = spectral_data(
        pd.read_csv(filename, header=[0, 1], verbose=True))
    gui.dataSet1ComboBox.addItem(key1)
    gui.dataSet1ComboBox.setItemText(0, key1)
    gui.dataSet2ComboBox.addItem(key2)
    gui.dataSet2ComboBox.setItemText(0, key2)
    gui.outputToDataSetLineEdit.setText(outkey)
    gui.run()
    print(gui.dataSet1ComboBox.currentText(),
          gui.dataSet2ComboBox.currentText())
    print(gui.data)
    # Bug fix: the old try/`assert True`/except/`assert False` pattern
    # discarded assert_frame_equal's informative diff; let it raise directly.
    assert_frame_equal(
        gui.data['data'].df,
        spectral_data(pd.concat([gui.data[key1].df,
                                 gui.data[key2].df])).df)
def setup(self):
    """
    The setup here is only doing the first 2 rows of our dataset
    This will cut down on time to load.
    :return:
    """
    try:
        filename = self.fileNameLineEdit.text()
        keyname = self.dataSetNameLineEdit.text()
        # nrows=2: cheap preview load only.
        self.data[keyname] = spectral_data(
            pd.read_csv(filename, header=[0, 1], verbose=False, nrows=2))
        self.list_amend(self.datakeys, self.curr_count, keyname)
    except Exception:
        # Bug fix: bare except also swallowed KeyboardInterrupt/SystemExit.
        # Best-effort preview; failures are non-fatal at setup time.
        pass
def run(self):
    """Stratify the chosen data set into folds and split train/test sets.

    Assigns each row to a fold stratified on the chosen composition column,
    then registers '<datakey>-Train' (all folds except the test fold) and
    '<datakey>-Test' (the held-out fold) data sets.
    """
    datakey = self.chooseDataToStratifyComboBox.currentText()
    nfolds = self.nFoldsSpinBox.value()
    try:
        testfold = int(self.testFoldsSpinBox.value())
    except (TypeError, ValueError):
        # Bug fix: a bare except also swallowed KeyboardInterrupt/SystemExit.
        # Fall back to the first fold if the spin-box value is unusable.
        testfold = 1
    colname = ('comp', self.chooseVarComboBox.currentText())
    self.data[datakey] = spectral_data(
        stratified_folds(self.data[datakey].df, nfolds=nfolds,
                         sortby=colname))
    self.data[datakey + '-Train'] = spectral_data(
        rows_match(self.data[datakey].df, ('meta', 'Folds'), [testfold],
                   invert=True))
    self.data[datakey + '-Test'] = spectral_data(
        rows_match(self.data[datakey].df, ('meta', 'Folds'), [testfold]))
    self.datakeys.append(datakey + '-Train')
    self.datakeys.append(datakey + '-Test')
    print(self.data.keys())
    print(self.data[datakey + '-Test'].df.index.shape)
    print(self.data[datakey + '-Train'].df.index.shape)
def run(self):
    """Split the selected data set into one new data set per unique value
    of the chosen (non-spectral) column."""
    datakey = self.chooseDataComboBox.currentText()
    colname = self.splitOnUniqueValuesOfComboBox.currentText()
    vars_level0 = self.data[datakey].df.columns.get_level_values(0)
    vars_level1 = self.data[datakey].df.columns.get_level_values(1)
    # Resolve the full (level0, level1) column name among non-'wvl' columns.
    vars_level1 = list(vars_level1[vars_level0 != 'wvl'])
    vars_level0 = list(vars_level0[vars_level0 != 'wvl'])
    colname = (vars_level0[vars_level1.index(colname)], colname)
    coldata = np.array([str(i) for i in self.data[datakey].df[colname]])
    unique_values = np.unique(coldata)
    for i in unique_values:
        new_datakey = datakey + ' - ' + str(i)
        self.datakeys.append(new_datakey)
        # Bug fix: DataFrame.ix was removed in pandas 1.0; select the
        # matching rows with a boolean mask via .loc instead.
        self.data[new_datakey] = spectral_data(
            self.data[datakey].df.loc[coldata == i])
def run(self):
    """Concatenate two data sets, tagging each row with its source name.

    Registers the new key in the shared data index; on failure the data
    dictionary is left unchanged.
    """
    Modules.data_count += 1
    self.count = Modules.data_count
    dataSet1 = self.dataSet1ComboBox.currentText()
    dataSet2 = self.dataSet2ComboBox.currentText()
    newkey = self.outputToDataSetLineEdit.text()
    if newkey != '':
        self.list_amend(self.datakeys, self.count, newkey)
        try:
            # Bug fix: operate on copies so adding the ('meta', 'Dataset')
            # tag column does not mutate the source data sets in place.
            data1 = self.data[dataSet1].df.copy()
            data2 = self.data[dataSet2].df.copy()
            data1[('meta', 'Dataset')] = dataSet1
            data2[('meta', 'Dataset')] = dataSet2
            newdata = spectral_data(
                pd.concat([data1, data2], ignore_index=True))
            self.data[newkey] = newdata
        except Exception:
            # Bug fix: bare except also swallowed KeyboardInterrupt.
            pass
def run(self, filename=None, keyname=None):
    """Load a CSV data file, generating a unique data key if needed.

    Parameters
    ----------
    filename : str, optional
        Path to the CSV file; defaults to the file-name line edit.
    keyname : str, optional
        Desired data key; defaults to the name line edit. If the key is
        already taken, ' - 2', ' - 3', ... is appended until unique.
    """
    Modules.data_count += 1
    self.count = Modules.data_count
    # Bug fix: compare to None with `is`, not `==` (PEP 8).
    if filename is None:
        filename = self.fileNameLineEdit.text()
    if keyname is None:
        keyname = self.dataSetNameLineEdit.text()
    # If the datakey exists, add a number to it to make it unique.
    number = 1
    while keyname in self.datakeys:
        number += 1
        keyname = keyname + ' - ' + str(number)
    print('Loading data file: ' + str(filename))
    self.data[keyname] = spectral_data(
        pd.read_csv(filename, header=[0, 1], verbose=False))
    self.list_amend(self.datakeys, self.count, keyname)
def run(self):
    """Drop rows from the chosen data set that match ALL active filter ops.

    Each operation widget yields a boolean match vector; hidden operations
    are skipped. Rows where every vector is True are removed.
    """
    match_vectors = []
    logic_list = []
    datakey = self.chooseData.currentText()
    for i in self.operations:
        values_tmp = i.GetValues()
        # Bug fix: identity comparison with None (`is`, not `==`).
        if i.hidden is None:
            match_vectors.append(self.evaluate_operation(datakey, values_tmp))
            logic_list.append(values_tmp['logic'])
        else:
            if not i.hidden.isChecked():
                match_vectors.append(
                    self.evaluate_operation(datakey, values_tmp))
                logic_list.append(values_tmp['logic'])
    # AND across operations: a row must match every one to be dropped.
    match_combined = np.all(match_vectors, axis=0)
    print(self.data[datakey].df.shape)
    # Bug fix: DataFrame.ix was removed in pandas 1.0; use .loc with the
    # boolean mask to keep the non-matching rows.
    self.data[datakey] = spectral_data(
        self.data[datakey].df.loc[~match_combined],
        dim_red=self.data[datakey].dim_red)
    print(self.data[datakey].df.shape)
def jsc_batch(directory, LUT_files, searchstring='*.txt', to_csv=None):
    """Batch-read JSC LIBS files and return them as one spectral_data object.

    Parameters
    ----------
    directory : str
        Directory to search for data files.
    LUT_files : list
        Lookup-table files used to expand filename metadata.
    searchstring : str
        Glob pattern for the files to read.
    to_csv : str, optional
        If given, the combined data frame is also written to this path.

    Returns
    -------
    spectral_data wrapping the concatenated data.
    """
    # Read in the lookup tables used to expand filename metadata.
    refdata = read_refdata(LUT_files)
    # Files matching the search string in the given directory.
    filelist = file_search(directory, searchstring)
    # Cleanup: the original also collected spectrometer IDs, timestamps and
    # locations here, but never used them - dead locals removed.
    # LIBS ID is the first underscore-separated token of each file name.
    libsIDs = []
    for file in filelist:
        libsIDs.append(os.path.basename(file).split('_')[0])
    libsIDs_unique = np.unique(libsIDs)
    alldata = []
    # Loop through each LIBS ID.
    for ID in libsIDs_unique:
        print('Working on : ' + str(ID))
        sublist = filelist[np.in1d(libsIDs, ID)]
        # Location is the second underscore-separated token.
        locs = []
        for file in sublist:
            locs.append(os.path.basename(file).split('_')[1])
        locs_unique = np.unique(locs)
        # Loop through each location for that LIBS ID.
        for loc in locs_unique:
            print(loc)
            # Files for this LIBS ID and location.
            sub_sublist = sublist[np.in1d(locs, loc)]
            data = JSC(sub_sublist, refdata)
            alldata.append(data)
    combined = pd.concat(alldata)
    if to_csv is not None:
        print('Writing combined data to: ' + to_csv)
        combined.to_csv(to_csv)
    return spectral_data(combined)
def update_datakeys(self, setup=False):
    """Register one data key per unique value of the chosen split column.

    When *setup* is False the corresponding per-value data sets are also
    created; when True only the keys are registered.
    """
    datakey = self.chooseDataComboBox.currentText()
    split_col = self.splitOnUniqueValuesOfComboBox.currentText()
    level0 = self.data[datakey].df.columns.get_level_values(0)
    level1 = self.data[datakey].df.columns.get_level_values(1)
    # Resolve the full (level0, level1) name of the split column among the
    # non-spectral (non-'wvl') columns.
    non_wvl_level1 = list(level1[level0 != 'wvl'])
    non_wvl_level0 = list(level0[level0 != 'wvl'])
    colname = (non_wvl_level0[non_wvl_level1.index(split_col)], split_col)
    coldata = np.array([str(i) for i in self.data[datakey].df[colname]])
    for value in np.unique(coldata):
        new_datakey = datakey + ' - ' + str(value)
        if new_datakey not in self.datakeys:
            # New key: reserve a slot in the shared data index.
            Modules.data_count += 1
            self.list_amend(self.datakeys, Modules.data_count, new_datakey)
        if not setup:
            self.data[new_datakey] = spectral_data(
                self.data[datakey].df.iloc[coldata == value])
def run(self, filename=None, keyname=None):
    # Load a CSV data file, making the data key unique if taken, and
    # best-effort drop any 'wvl' columns whose labels are not numeric
    # wavelengths before storing the result.
    Modules.data_count += 1
    self.count = Modules.data_count
    if filename == None:
        filename = self.fileNameLineEdit.text()
    if keyname == None:
        keyname = self.dataSetNameLineEdit.text()
    #if the datakey exists, add a number to it to make it unique
    number = 1
    while keyname in self.datakeys:
        number += 1
        keyname = keyname + ' - ' + str(number)
    print('Loading data file: ' + str(filename))
    data = pd.read_csv(filename, header=[0, 1], verbose=False)
    try:
        #remove duplicate wvl values
        data_wvl = data['wvl']
        data_no_wvl = data.drop(columns='wvl')
        # Keep only wvl columns whose second-level label parses as a float.
        good_wvls = []
        for i in data_wvl.columns:
            try:
                i = float(i)
                good_wvls.append(True)
            except:
                print("Removing column " + str(i))
                good_wvls.append(False)
        data_wvl = data_wvl.iloc[:, good_wvls]
        # Rebuild the MultiIndex with numeric wavelength labels.
        data_wvl.columns = pd.MultiIndex.from_tuples([
            ('wvl', float(i)) for i in data_wvl.columns
        ])
        data = pd.merge(data_no_wvl, data_wvl, left_index=True,
                        right_index=True)
    except:
        # Best-effort cleanup; fall back to the raw frame on any failure.
        pass
    self.data[keyname] = spectral_data(data)
    self.list_amend(self.datakeys, self.count, keyname)
    # Remember the source file for this data set.
    self.datafiles[keyname] = os.path.basename(filename)
def run(self):
    """Run cross validation for the selected algorithm over a parameter grid.

    Filters rows to the requested y range, cross-validates on the predefined
    folds, stores the trained models/coefficients, and registers the CV
    results under a unique data key.
    """
    method = self.chooseAlgorithmComboBox.currentText()
    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text()))
             for y in self.yVariableList.selectedItems()]
    yrange = [
        self.yMinDoubleSpinBox.value(),
        self.yMaxDoubleSpinBox.value()
    ]
    # Warning: Params passing through cv.cv(params) needs to be in lists
    # Example: {'n_components': [4], 'scale': [False]}
    params, modelkey = self.alg[
        self.chooseAlgorithmComboBox.currentText()].run()
    # If the method supports it, separate alpha from the other parameters
    # and prepare for calculating the regularization path.
    path_methods = ['Elastic Net', 'LASSO']  #, 'Ridge']
    if method in path_methods:
        calc_path = True
        alphas = params.pop('alpha')
    else:
        alphas = None
        calc_path = False
    # Restrict to rows whose y value lies strictly inside the chosen range.
    y = np.array(self.data[datakey].df[yvars])
    match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    # Bug fix: DataFrame.ix was removed in pandas 1.0; use .loc.
    data_for_cv = spectral_data(self.data[datakey].df.loc[match])
    paramgrid = list(
        ParameterGrid(params))  # create a grid of parameter permutations
    cv_obj = cv.cv(paramgrid)
    try:
        # Create an iterator for cross validation based on predefined folds.
        cv_iterator = LeaveOneGroupOut().split(
            data_for_cv.df[xvars], data_for_cv.df[yvars],
            data_for_cv.df[('meta', 'Folds')])
        n_folds = LeaveOneGroupOut().get_n_splits(
            groups=data_for_cv.df[('meta', 'Folds')])
    except Exception:
        print('***No folds found! Did you remember to define folds '
              'before running cross validation?***')
        # Bug fix: previously execution fell through and crashed with a
        # NameError on cv_iterator; abort cleanly instead.
        return
    self.data[datakey].df, self.cv_results, cvmodels, cvmodelkeys, \
        cvpredictkeys = cv_obj.do_cv(data_for_cv.df, cv_iterator,
                                     xcols=xvars, ycol=yvars,
                                     yrange=yrange, method=method,
                                     alphas=alphas, calc_path=calc_path,
                                     n_folds=n_folds)
    for key in cvpredictkeys:
        self.list_amend(self.predictkeys, len(self.predictkeys), key)
    for n, key in enumerate(cvmodelkeys):
        # Bug fix: list_amend already inserts the key; the old extra
        # self.modelkeys.append(key) duplicated every model entry.
        self.list_amend(self.modelkeys, len(self.modelkeys), key)
        self.models[key] = cvmodels[n]
        self.model_xvars[key] = xvars
        self.model_yvars[key] = yvars
        if method != 'GP':
            coef = np.squeeze(cvmodels[n].model.coef_)
            coef = pd.DataFrame(coef)
            coef.index = pd.MultiIndex.from_tuples(
                self.data[datakey].df[xvars].columns.values)
            coef = coef.T
            coef[('meta', 'Model')] = key
            try:
                coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
            except AttributeError:
                # Model has no intercept; best-effort only.
                pass
            try:
                self.data['Model Coefficients'] = spectral_data(
                    pd.concat([self.data['Model Coefficients'].df, coef]))
            except KeyError:
                # First entry: create the coefficients table.
                self.data['Model Coefficients'] = spectral_data(coef)
            # Bug fix: register the key once, not once per model.
            if 'Model Coefficients' not in self.datakeys:
                self.datakeys.append('Model Coefficients')
    # Store the CV results under a unique data key.
    number = 1
    cvid = str('CV Results ' + modelkey + ' - ' + yvars[0][1])
    while cvid in self.datakeys:
        number += 1
        cvid = str('CV Results ' + modelkey + ' - ' +
                   yvars[0][1]) + ' - ' + str(number)
    self.datakeys.append(cvid)
    self.data[cvid] = self.cv_results
def setup(self):
    # Pre-populate model/predict/data keys and dummy columns so that
    # downstream UI elements can reference them before cross validation is
    # actually run. Key strings here must match those generated by run().
    try:
        method = self.chooseAlgorithmComboBox.currentText()
        datakey = self.chooseDataComboBox.currentText()
        xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
        yvars = [('comp', str(y.text()))
                 for y in self.yVariableList.selectedItems()]
        yrange = [
            self.yMinDoubleSpinBox.value(),
            self.yMaxDoubleSpinBox.value()
        ]
        # Warning: Params passing through cv.cv(params) needs to be in lists
        # Example: {'n_components': [4], 'scale': [False]}
        params, modelkey = self.alg[
            self.chooseAlgorithmComboBox.currentText()].run()
        #if the method supports it, separate out alpha from the other parameters and prepare for calculating path
        path_methods = ['Elastic Net', 'LASSO']  #, 'Ridge']
        if method in path_methods:
            alphas = params.pop('alpha')
        else:
            alphas = None
        paramgrid = list(ParameterGrid(
            params))  # create a grid of parameter permutations
        # NOTE(review): cv_obj is created but not used in setup().
        cv_obj = cv.cv(paramgrid)
        cvpredictkeys = []
        cvmodelkeys = []
        # Build the predict/model key strings for every grid permutation
        # (one 'CV' and one 'Cal' predict key per alpha when paths apply).
        for i in range(len(paramgrid)):
            if alphas is not None:
                for j in range(len(alphas)):
                    keytemp = '"' + method + ' - ' + yvars[0][
                        -1] + ' - CV - Alpha:' + str(
                            alphas[j]) + ' - ' + str(paramgrid[i]) + '"'
                    cvpredictkeys.append(keytemp)
                    keytemp = '"' + method + ' - ' + yvars[0][
                        -1] + ' - Cal - Alpha:' + str(
                            alphas[j]) + ' - ' + str(paramgrid[i]) + '"'
                    cvpredictkeys.append(keytemp)
                    modelkeytemp = "{} - {} - ({}, {}) Alpha: {}, {}".format(
                        method, yvars[0][-1], yrange[0], yrange[1],
                        alphas[j], paramgrid[i])
                    cvmodelkeys.append(modelkeytemp)
            else:
                keytemp = '"' + method + '- Cal -' + str(
                    paramgrid[i]) + '"'
                cvpredictkeys.append(keytemp)
                keytemp = '"' + method + '- Cal -' + str(
                    paramgrid[i]) + '"'
                cvpredictkeys.append(keytemp)
                modelkeytemp = "{} - {} - ({}, {}) {}".format(
                    method, yvars[0][-1], yrange[0], yrange[1],
                    paramgrid[i])
                cvmodelkeys.append(modelkeytemp)
        for key in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys), key)
            self.data[datakey].df[(
                'predict', key
            )] = 9999  #Need to fill the data frame with dummy values until CV is actually run
        for n, key in enumerate(cvmodelkeys):
            self.list_amend(self.modelkeys, len(self.modelkeys), key)
            # NOTE(review): this append looks redundant with the
            # list_amend above — the key appears to be added twice; confirm.
            self.modelkeys.append(key)
            self.model_xvars[key] = xvars
            self.model_yvars[key] = yvars
            if method != 'GP':
                coef = self.data[datakey].df[xvars[
                    0]].columns.values * 0.0 + 9999  #Fill with dummy coeffs before model is run
                coef = pd.DataFrame(coef)
                coef.index = pd.MultiIndex.from_tuples(
                    self.data[datakey].df[xvars].columns.values)
                coef = coef.T
                coef[('meta', 'Model')] = key
                try:
                    coef[(
                        'meta', 'Intercept'
                    )] = 0  #Fill intercept with zeros prior to model run
                except:
                    pass
                try:
                    # Append to the running coefficients table, creating
                    # it on first use.
                    self.data['Model Coefficients'] = spectral_data(
                        pd.concat(
                            [self.data['Model Coefficients'].df, coef]))
                except:
                    self.data['Model Coefficients'] = spectral_data(coef)
                self.datakeys.append('Model Coefficients')
        self.list_amend(self.datakeys, len(self.datakeys),
                        'CV Results ' + modelkey)
    except:
        # Best-effort setup: failures here only delay key registration.
        pass
def run(self):
    # Cross-validate calibration-transfer methods: build a parameter grid
    # from every checked method, then evaluate each permutation by holding
    # out one matched spectrum at a time and scoring the RMSE between the
    # transformed A spectrum and its B counterpart.
    datakeyA = self.chooseDataA.currentText()
    datakeyB = self.chooseDataB.currentText()
    dataAmatchcol = self.chooseDataAMatch.currentText()
    dataBmatchcol = self.chooseDataBMatch.currentText()
    # 'None' is always evaluated first as the do-nothing baseline.
    paramgrid = [{'method': 'None'}]
    if self.PDScheckbox.isChecked():
        paramgrid.extend(
            list(ParameterGrid(self.alg['PDS - Piecewise DS'][0].run())))
    if self.PDSPLScheckBox.isChecked():
        paramgrid.extend(
            list(
                ParameterGrid(
                    self.alg['PDS-PLS - PDS using Partial Least Squares']
                    [0].run())))
    if self.DScheckbox.isChecked():
        paramgrid.extend(
            list(
                ParameterGrid(
                    self.alg['DS - Direct Standardization'][0].run())))
    if self.LASSODScheckbox.isChecked():
        paramgrid.extend(list(ParameterGrid(
            self.alg['LASSO DS'][0].run())))
    if self.Ratiocheckbox.isChecked():
        paramgrid.extend([{'method': 'Ratio'}])
    if self.SparseDScheckBox.isChecked():
        paramgrid.extend(
            list(ParameterGrid(self.alg['Sparse Low Rank DS'][0].run())))
    if self.RidgeDScheckBox.isChecked():
        paramgrid.extend(list(ParameterGrid(
            self.alg['Ridge DS'][0].run())))
    if self.CCAcheckBox.isChecked():
        paramgrid.extend(
            list(
                ParameterGrid(
                    self.alg['CCA - Canonical Correlation Analysis']
                    [0].run())))
    if self.NewCCAcheckBox.isChecked():
        paramgrid.extend(list(ParameterGrid(self.alg['New CCA'][0].run())))
    if self.ForwardBackwardcheckBox.isChecked():
        paramgrid.extend(
            list(ParameterGrid(self.alg['Forward Backward DS'][0].run())))
    if self.IPDDScheckBox.isChecked():
        paramgrid.extend(
            list(
                ParameterGrid(
                    self.alg['Incremental Proximal Descent DS'][0].run())))
    #get the data sets
    A = self.data[datakeyA].df
    B = self.data[datakeyB].df
    # Align/average A and B so rows correspond via the match columns.
    A_mean, B_mean = caltran_utils.prepare_data(A, B, dataAmatchcol,
                                                dataBmatchcol)
    #prepare for cross validation
    uniquevals = np.unique(A_mean[('meta', dataAmatchcol)])
    cv_results = pd.DataFrame()
    ind = 0
    for params in paramgrid:  #step through all the different permutations
        print(params)
        transformed_datakey = datakeyA + '-' + str(params)
        for key in params.keys():
            # store parameters in the results file
            cv_results.loc[ind, key] = params[key]
        ct_obj = cal_tran.cal_tran(
            params)  #create a caltran object using the current parameters
        A_mean_transformed = copy.deepcopy(A_mean)
        # Zero the spectra; they are filled per held-out value below.
        A_mean_transformed['wvl'] = A_mean_transformed['wvl'] * 0
        rmses = []
        for val in uniquevals:  #hold out each unique spectrum in turn
            print(val)
            # define the validation data (the held out spectrum)
            # and the training data (the spectra that are not held out)
            # for both data sets
            val_data_A = np.squeeze(
                np.array(
                    A_mean[A_mean[('meta', dataAmatchcol)] == val]['wvl'],
                    dtype='float'))
            train_data_A = np.squeeze(
                np.array(
                    A_mean[A_mean[('meta', dataAmatchcol)] != val]['wvl'],
                    dtype='float'))
            val_data_B = np.squeeze(
                np.array(
                    B_mean[B_mean[('meta', dataBmatchcol)] == val]['wvl'],
                    dtype='float'))
            train_data_B = np.squeeze(
                np.array(
                    B_mean[B_mean[('meta', dataBmatchcol)] != val]['wvl'],
                    dtype='float'))
            ct_obj.derive_transform(
                train_data_A, train_data_B
            )  #derive the transform based on the training data
            val_data_A_transformed = ct_obj.apply_transform(
                val_data_A
            )  #apply the transform to the held out spectrum from A
            if self.keep_spectra_checkBox.isChecked():
                A_mean_transformed.loc[
                    A_mean_transformed[('meta', dataAmatchcol)] == val,
                    'wvl'] = val_data_A_transformed  #this step is very slow, can we speed it up?
            rmses.append(mismatch_rmse(val_data_A_transformed, val_data_B))
            cv_results.loc[ind, val + '_RMSE'] = rmses[
                -1]  #record the RMSE for the held out spectrum
        cv_results.loc[ind, 'average_RMSE'] = np.mean(rmses)
        if self.keep_spectra_checkBox.isChecked():
            # Register the transformed spectra as a new data set.
            Modules.data_count += 1
            self.index = Modules.data_count
            self.list_amend(self.datakeys, self.index, transformed_datakey)
            self.data[transformed_datakey] = spectral_data.spectral_data(
                A_mean_transformed)
        ind = ind + 1
    # Nest result columns under a 'cv' top level for the MultiIndex.
    cv_results.columns = pd.MultiIndex.from_tuples([
        ('cv', col) for col in cv_results.columns
    ])
    # Find a unique data key for the results table.
    cvid = 'Caltran CV Results'
    number = 1
    while cvid in self.datakeys:
        number += 1
        cvid = cvid + ' - ' + str(number)
    Modules.data_count += 1
    self.index = Modules.data_count
    self.list_amend(self.datakeys, self.index, cvid)
    self.data[cvid] = cv_results
def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None,
               ave=True, progressbar=None):
    # Batch-read ChemCam spectra (.csv or .SAV) from a directory, keep only
    # the highest processing version per spacecraft clock value, and return
    # the combined result as a spectral_data object.
    # Determine if the file is a .csv or .SAV
    if '.sav' in searchstring.lower():
        is_sav = True
    else:
        is_sav = False
    filelist = file_search(directory, searchstring)
    basenames = np.zeros_like(filelist)
    sclocks = np.zeros_like(filelist)
    P_version = np.zeros_like(filelist, dtype='int')
    # Extract the sclock and version for each file and ensure that only one
    # file per sclock is being read, and that it is the one with the highest
    # version number
    for i, name in enumerate(filelist):
        basenames[i] = os.path.basename(name)
        sclocks[i] = basenames[i][4:13]  # extract the sclock
        P_version[i] = basenames[i][-5:-4]  # extract the version
    sclocks_unique = np.unique(sclocks)  # find unique sclocks
    filelist_new = np.array([], dtype='str')
    for i in sclocks_unique:
        match = (sclocks == i)  # find all instances with matching sclocks
        maxP = P_version[match] == max(
            P_version[match])  # find the highest version among these files
        filelist_new = np.append(
            filelist_new,
            filelist[match][maxP])  # keep only the file with the highest version
    filelist = filelist_new
    # Should add a progress bar for importing large numbers of files
    # NOTE(review): dt collects per-file read timings but is never used.
    dt = []
    if progressbar:
        progressbar.setWindowTitle('ChemCam data progress')
        progressbar.setRange(0, filelist.size)
        progressbar.show()
    filecount = 0
    for i in filelist:
        filecount = filecount + 1
        print(i)
        try:
            if is_sav:
                t = time.time()
                tmp = CCAM_SAV(i, ave=ave)
                dt.append(time.time() - t)
            else:
                t = time.time()
                tmp = CCAM_CSV(i)
                dt.append(time.time() - t)
            if i == filelist[0]:
                # First file initializes the combined frame.
                combined = tmp
            else:
                # This ensures that rounding errors are not causing
                # mismatches in columns
                cols1 = list(combined['wvl'].columns)
                cols2 = list(tmp['wvl'].columns)
                if set(cols1) == set(cols2):
                    combined = pd.concat([combined, tmp])
                else:
                    print("Wavelengths don't match!")
        except:
            # Best-effort: skip files that fail to read.
            pass
        if progressbar:
            progressbar.setValue(filecount)
            QtCore.QCoreApplication.processEvents()
        pass
    # Make the sclock metadata numeric for downstream joins.
    combined.loc[:, ('meta',
                     'sclock')] = pd.to_numeric(combined.loc[:, ('meta',
                                                                 'sclock')])
    if lookupfile is not None:
        combined = lookup(combined, lookupfile=lookupfile)
    if to_csv is not None:
        combined.to_csv(to_csv)
    return spectral_data(combined)
def run(self):
    """Concatenate the two selected data sets into a new data set."""
    first_key = self.dataSet1ComboBox.currentText()
    second_key = self.dataSet2ComboBox.currentText()
    combined_key = self.outputToDataSetLineEdit.text()
    self.datakeys.append(combined_key)
    # Stack the two frames, discarding their original row indices.
    frames = [self.data[first_key].df, self.data[second_key].df]
    self.data[combined_key] = spectral_data(
        pd.concat(frames, ignore_index=True))
def ccam_batch(directory, searchstring='*.csv', to_csv=None, lookupfile=None,
               ave=True, progressbar=None, left_on='sclock',
               right_on='Spacecraft Clock', versioncheck=True):
    # Batch-read ChemCam spectra (.csv or .SAV) from a directory with
    # optional version filtering and metadata lookup, returning the combined
    # result as a spectral_data object (or None if no files are found).
    # Determine if the file is a .csv or .SAV
    if 'sav' in searchstring.lower():
        is_sav = True
    else:
        is_sav = False
    filelist = file_search(directory, searchstring)
    if len(filelist) == 0:
        print('No files found in ' + directory + ' using search string ' +
              searchstring)
        return
    basenames = np.zeros_like(filelist)
    sclocks = np.zeros_like(filelist)
    P_version = np.zeros_like(filelist, dtype='int')
    if versioncheck == True:
        # Extract the sclock and version for each file and ensure that only
        # one file per sclock is being read, and that it is the one with the
        # highest version number
        for i, name in enumerate(filelist):
            basenames[i] = os.path.basename(name)
            sclocks[i] = basenames[i][4:13]  # extract the sclock
            P_version[i] = basenames[i][-5:-4]  # extract the version
        sclocks_unique = np.unique(sclocks)  # find unique sclocks
        filelist_new = np.array([], dtype='str')
        for i in sclocks_unique:
            match = (sclocks == i)  # find all instances with matching sclocks
            maxP = P_version[match] == max(
                P_version[match])  # find the highest version among these files
            filelist_new = np.append(
                filelist_new, filelist[match]
                [maxP])  # keep only the file with the highest version
        filelist = filelist_new
    # Should add a progress bar for importing large numbers of files
    # NOTE(review): dt is never appended to below — appears unused.
    dt = []
    if progressbar:
        from PyQt5 import QtCore  # only rely on PyQt5 if a progressbar object has been passed
        progressbar.setWindowTitle('ChemCam data progress')
        progressbar.setRange(0, filelist.size)
        progressbar.show()
    filecount = 0
    workinglist = []
    subcount = 0
    for i, file in enumerate(filelist):
        filecount = filecount + 1
        print('File #' + str(filecount))
        print(file)
        if is_sav:
            tmp = CCAM_SAV(file, ave=ave)
        else:
            tmp = CCAM_CSV(file, ave=ave)
        try:
            # This ensures that rounding errors are not causing mismatches
            # in columns
            cols1 = list(combined['wvl'].columns)
            cols2 = list(tmp['wvl'].columns)
            if set(cols1) == set(cols2):
                combined = pd.concat([combined, tmp])
            else:
                print("Wavelengths don't match!")
        except:
            # First iteration (or after a chunk flush): 'combined' does not
            # exist yet, so start it from this file's data.
            combined = tmp
        # if doing single shots, save out the data every 50 files so that
        # the program doesn't run out of memory
        if filecount % 50 == 0 and ave == False:
            workingfilename = 'temporary_data_files_' + str(
                subcount) + '-' + str(filecount) + '.csv'
            workinglist.append(workingfilename)
            combined.to_csv(workingfilename)
            subcount = filecount
            del combined
            gc.collect()
        if progressbar:
            progressbar.setValue(filecount)
            QtCore.QCoreApplication.processEvents()
        pass
    if ave == False:
        # NOTE(review): re-reading the temporary chunk files back in is
        # unimplemented — this loop is a no-op stub; confirm intent.
        for f in workinglist:
            pass
    try:
        # Make the sclock metadata numeric for downstream joins.
        combined.loc[:, ('meta',
                         'sclock')] = pd.to_numeric(combined.loc[:, ('meta',
                                                                     'sclock')])
    except:
        pass
    if lookupfile is not None:
        # Try the lookup with a one-row header skip first, then without.
        try:
            combined = lookup(combined, lookupfile=lookupfile,
                              left_on=left_on, right_on=right_on,
                              skiprows=1)
        except:
            combined = lookup(combined, lookupfile=lookupfile,
                              left_on=left_on, right_on=right_on,
                              skiprows=0)
    if to_csv is not None:
        combined.to_csv(to_csv)
    return spectral_data(combined)
def run(self):
    """Cross-validate every checked regression algorithm on the chosen data.

    Builds a parameter grid per checked algorithm, restricts the data to rows
    whose y value lies inside the selected range, runs cv.cv.do_cv for each
    method, accumulates results in self.cv_results_combined, registers the
    resulting models/predictions, and stores model coefficients plus the CV
    results and predictions back into self.data.
    """
    self.cv_results_combined = None  # clear previous results in case of re-run
    if 'Model Coefficients' not in self.datakeys:
        Modules.data_count += 1
        self.coef_index = Modules.data_count
        self.list_amend(self.datakeys, self.coef_index, 'Model Coefficients')
    # Reserve a slot for the CV results data set.
    Modules.data_count += 1
    self.results_index = Modules.data_count

    # (key, checkbox) pairs in the original registration order — insertion
    # order into paramgrids determines the order methods are cross-validated.
    alg_checkboxes = [
        ('ARD', self.ARDcheckbox),
        ('BRR', self.BRRcheckbox),
        ('Elastic Net', self.ENetcheckbox),
        ('LARS', self.LARScheckbox),
        ('LASSO', self.LASSOcheckBox),
        ('OLS', self.OLScheckBox),
        ('OMP', self.OMPcheckBox),
        ('PLS', self.PLScheckBox),
        ('Ridge', self.RidgecheckBox),
        ('SVR', self.SVRcheckBox),
        ('Local Regression', self.LocalcheckBox),
        ('GBR', self.GBRcheckBox),
        ('RF', self.RFcheckBox),
    ]
    # if self.GPcheckBox.isChecked():
    #     paramgrids.append(list(ParameterGrid(self.alg['GP - Gaussian Processes'][0].run())))
    paramgrids = {}
    for key, checkbox in alg_checkboxes:
        if not checkbox.isChecked():
            continue
        if key in ('Elastic Net', 'LASSO'):
            # These dialogs return (params, alphas); fold alpha into the grid.
            result = self.alg[key][0].run()
            params = result[0]
            params['alpha'] = result[1]
            paramgrids[key] = list(ParameterGrid(params))
        else:
            paramgrids[key] = list(ParameterGrid(self.alg[key][0].run()))

    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text())) for y in self.yVariableList.selectedItems()]
    yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]
    # Keep only rows whose (first) y value lies strictly inside the range.
    y = np.array(self.data[datakey].df[yvars])
    match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    data_for_cv = spectral_data(self.data[datakey].df.loc[match])

    for method in paramgrids.keys():
        print('===== Cross validating ' + method + ' =====')
        paramgrid = paramgrids[method]
        cv_obj = cv.cv(paramgrid)
        data_for_cv_out, cv_results, cvmodels, cvmodelkeys, cvpredictkeys = cv_obj.do_cv(
            data_for_cv.df, xcols=xvars, ycol=yvars, yrange=yrange, method=method)
        try:
            cv_results[('cv', 'Data_file')] = self.datafiles[datakey]
        except (KeyError, AttributeError):
            pass  # no data file recorded for this data set
        cv_results[('cv', 'ymin')] = yrange[0]
        cv_results[('cv', 'ymax')] = yrange[1]
        cv_results[('cv', 'ycol')] = yvars[0][1]
        # Feed the CV output (with prediction columns) into the next method.
        data_for_cv = spectral_data(data_for_cv_out)
        # pd.concat silently drops the initial None.
        self.cv_results_combined = pd.concat((self.cv_results_combined, cv_results))
        for predictkey in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys), predictkey)
        for n, modelkey in enumerate(cvmodelkeys):
            Modules.model_count += 1
            self.list_amend(self.modelkeys, Modules.model_count, modelkey)
            self.models[modelkey] = cvmodels[n]
            self.model_xvars[modelkey] = xvars
            self.model_yvars[modelkey] = yvars
            if method != 'GP':
                try:
                    # One row of coefficients per model, columns matching xvars.
                    coef = np.squeeze(cvmodels[n].model.coef_)
                    coef = pd.DataFrame(coef)
                    coef.index = pd.MultiIndex.from_tuples(
                        self.data[datakey].df[xvars].columns.values)
                    coef = coef.T
                    coef[('meta', 'Model')] = modelkey
                    try:
                        coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
                    except AttributeError:
                        pass  # model has no intercept term
                    if 'Model Coefficients' in self.data:
                        self.data['Model Coefficients'] = spectral_data(
                            pd.concat([self.data['Model Coefficients'].df, coef]))
                    else:
                        self.data['Model Coefficients'] = spectral_data(coef)
                except Exception:
                    pass  # best effort: some models expose no usable coef_

    # Find a unique name for the CV results data set.
    number = 1
    cvid = 'CV Results - ' + yvars[0][1]
    while cvid in self.datakeys:
        number += 1
        cvid = 'CV Results - ' + yvars[0][1] + ' - ' + str(number)
    self.list_amend(self.datakeys, self.results_index, cvid)
    self.data[cvid] = spectral_data(self.cv_results_combined)
    Modules.data_count += 1
    new_datakey = datakey + '-' + str(yvars) + ' ' + str(yrange) + '-CV Predictions'
    self.list_amend(self.datakeys, Modules.data_count, new_datakey)
    self.data[new_datakey] = spectral_data(data_for_cv_out)
def run(self):
    """Cross-validate the checked regression algorithms, with path calculation.

    Like the other CV run method, but Elastic Net / LASSO keep their alphas
    separate so do_cv can compute the regularization path, and do_cv writes
    its output back into self.data[datakey].df.
    """
    # (key, checkbox) pairs in the original registration order — insertion
    # order into paramgrids determines the order methods are cross-validated.
    alg_checkboxes = [
        ('ARD', self.ARDcheckbox),
        ('BRR', self.BRRcheckbox),
        ('Elastic Net', self.ENetcheckbox),
        ('LARS', self.LARScheckbox),
        ('LASSO', self.LASSOcheckBox),
        ('OLS', self.OLScheckBox),
        ('OMP', self.OMPcheckBox),
        ('PLS', self.PLScheckBox),
        ('Ridge', self.RidgecheckBox),
        ('SVR', self.SVRcheckBox),
        ('Local Regression', self.LocalcheckBox),
    ]
    # if self.GPcheckBox.isChecked():
    #     paramgrids.append(list(ParameterGrid(self.alg['GP - Gaussian Processes'][0].run())))
    paramgrids = {}
    for key, checkbox in alg_checkboxes:
        if not checkbox.isChecked():
            continue
        if key in ('Elastic Net', 'LASSO'):
            # These dialogs return (params, alphas); keep alphas separate so
            # the regularization path can be calculated below.
            result = self.alg[key][0].run()
            paramgrids[key] = {'alphas': result[1],
                               'params': list(ParameterGrid(result[0]))}
        else:
            paramgrids[key] = list(ParameterGrid(self.alg[key][0].run()))

    datakey = self.chooseDataComboBox.currentText()
    xvars = [str(x.text()) for x in self.xVariableList.selectedItems()]
    yvars = [('comp', str(y.text())) for y in self.yVariableList.selectedItems()]
    yrange = [self.yMinDoubleSpinBox.value(), self.yMaxDoubleSpinBox.value()]
    # Keep only rows whose (first) y value lies strictly inside the range.
    y = np.array(self.data[datakey].df[yvars])
    match = np.squeeze((y > yrange[0]) & (y < yrange[1]))
    # .ix was removed from pandas (1.0); .loc with a boolean mask is equivalent.
    data_for_cv = spectral_data(self.data[datakey].df.loc[match])

    for method in paramgrids.keys():
        print('===== Cross validating ' + method + ' =====')
        # If the method supports it, separate alpha from the other parameters
        # and prepare for calculating the regularization path.
        path_methods = ['Elastic Net', 'LASSO']  # , 'Ridge']
        if method in path_methods:
            calc_path = True
            alphas = paramgrids[method]['alphas']
            paramgrid = paramgrids[method]['params']
        else:
            alphas = None
            calc_path = False
            paramgrid = paramgrids[method]
        progbar = QtWidgets.QProgressBar()
        cv_obj = cv.cv(paramgrid, progressbar=progbar)
        # do_cv writes its augmented frame back into the source data set.
        self.data[datakey].df, cv_results, cvmodels, cvmodelkeys, cvpredictkeys = cv_obj.do_cv(
            data_for_cv.df, xcols=xvars, ycol=yvars, yrange=yrange, method=method,
            alphas=alphas, calc_path=calc_path)
        try:
            self.cv_results_combined = pd.concat((self.cv_results_combined, cv_results))
        except AttributeError:
            # First method cross-validated: nothing accumulated yet.
            self.cv_results_combined = cv_results
        for predictkey in cvpredictkeys:
            self.list_amend(self.predictkeys, len(self.predictkeys), predictkey)
        for n, modelkey in enumerate(cvmodelkeys):
            self.list_amend(self.modelkeys, len(self.modelkeys), modelkey)
            self.models[modelkey] = cvmodels[n]
            self.model_xvars[modelkey] = xvars
            self.model_yvars[modelkey] = yvars
            if method != 'GP':
                # One row of coefficients per model, columns matching xvars.
                coef = np.squeeze(cvmodels[n].model.coef_)
                coef = pd.DataFrame(coef)
                coef.index = pd.MultiIndex.from_tuples(
                    self.data[datakey].df[xvars].columns.values)
                coef = coef.T
                coef[('meta', 'Model')] = modelkey
                try:
                    coef[('meta', 'Intercept')] = cvmodels[n].model.intercept_
                except AttributeError:
                    pass  # model has no intercept term
                if 'Model Coefficients' in self.data:
                    self.data['Model Coefficients'] = spectral_data(
                        pd.concat([self.data['Model Coefficients'].df, coef]))
                else:
                    self.data['Model Coefficients'] = spectral_data(coef)
                # Register the key once, not once per model (the unconditional
                # append here previously filled datakeys with duplicates).
                if 'Model Coefficients' not in self.datakeys:
                    self.datakeys.append('Model Coefficients')

    # Find a unique name for the CV results data set.
    number = 1
    cvid = 'CV Results - ' + yvars[0][1]
    while cvid in self.datakeys:
        number += 1
        cvid = 'CV Results - ' + yvars[0][1] + ' - ' + str(number)
    self.datakeys.append(cvid)
    self.data[cvid] = spectral_data(self.cv_results_combined)