def do_machine_learning(self, prediction_alg, x_train, y_train, cbn_name='temp'):
    """
    This performs common SL learning

    :param prediction_alg: A name of SL learning; one of
        'GradientBoostingRegressor', 'RandomForestRegressor',
        'GaussianProcessRegressor', 'LinearRegression' or
        'ContinuousBNRegressor'
    :param x_train: Training data for X variables
    :param y_train: Training data for a y variable
    :param cbn_name: A special parameter for continuous BN learning
        (only passed on for 'ContinuousBNRegressor')
    :return: A SL learning object
    :raises ValueError: If prediction_alg is not one of the supported names
    """
    supported = (
        'GradientBoostingRegressor',
        'RandomForestRegressor',
        'GaussianProcessRegressor',
        'LinearRegression',
        'ContinuousBNRegressor',
    )

    # Names are matched by equality. Identity (`is`) only works by accident
    # when both strings happen to be the same interned object.
    if prediction_alg not in supported:
        raise ValueError(
            f'Unknown prediction algorithm {prediction_alg!r}; '
            f'expected one of {supported}')

    learner = RegressionML(self.metrics)

    if prediction_alg == 'ContinuousBNRegressor':
        # The continuous BN learner is the only one that takes cbn_name.
        return learner.ContinuousBNRegressor_ML(cbn_name, x_train, y_train)
    if prediction_alg == 'GradientBoostingRegressor':
        return learner.GradientBoostingRegressor_ML(x_train, y_train)
    if prediction_alg == 'RandomForestRegressor':
        return learner.RandomForestRegressor_ML(x_train, y_train)
    if prediction_alg == 'GaussianProcessRegressor':
        return learner.GaussianProcessRegressor_ML(x_train, y_train)
    # Only 'LinearRegression' is left at this point.
    return learner.LinearRegressor_ML(x_train, y_train)
def __init__(self):
    """Set up the configuration and the empty result columns for the Excel output."""
    # Configuration
    # (a shorter variable set, ['DIC', 'pH', 'Phosphate'], was previously used
    # for 'selected_Xs')
    self.cf = {
        'sheet': 'Sheet1',
        'selected_Xs': ['Depth (µm)', 'Concentration', 'DIC', 'pH', 'Phosphate'],
        'distance': 'Depth (µm)',
        # 'Concentration' is not part of the removal targets
        'removal_targets': ['DIC', 'pH', 'Phosphate'],
    }

    # Number of repeated experiments (2 was previously used)
    self.size_experiments = 5

    # Result columns that are later written to Excel
    self.excel = {}
    for column in ('Data', 'Target', 'Removed', 'Normalization',
                   'Train Size', 'Test Size'):
        self.excel[column] = []

    self.ml = RegressionML()

    # All '-MAE' columns come first, then all '-MAE-STD' columns.
    regressor_names = [type(reg).__name__ for reg in self.ml.regressors]
    for suffix in ('-MAE', '-MAE-STD'):
        for reg_name in regressor_names:
            self.excel[reg_name + suffix] = []

    # Model info: the X variables column starts with the selected variable names
    self.excel['X Variables'] = list(self.cf['selected_Xs'])
def __init__(self, regression=True):
    """Extend the base configuration with MAE / MAE-STD result columns.

    :param regression: Not used in this method; the base initializer is
        called without it
    """
    super().__init__()

    # Alternative targets: 'OS (Oil separation)', 'Turbidity'
    self.cf['targets'] = ['OV (Oily value)']
    self.size_experiments = 10

    self.ml = RegressionML()

    # All '-MAE' columns are registered first, then all '-MAE-STD' columns.
    regressor_names = [type(reg).__name__ for reg in self.ml.regressors]
    for suffix in ('-MAE', '-MAE-STD'):
        for reg_name in regressor_names:
            self.excel[reg_name + suffix] = []
def __init__(self, regression=True):
    """Extend the base configuration with one '-Score' result column per regressor.

    :param regression: Not used in this method; the base initializer is
        called without it
    """
    super().__init__()

    # Alternative targets: 'OS (Oil separation)', 'Turbidity'
    self.cf['targets'] = ['OV (Oily value)']

    # Normalization switch (False disables it)
    self.normalization = True

    # Metric name; 'RMSE' is the alternative
    self.metrics = 'MAE'

    self.ml = RegressionML()
    for reg_name in [type(reg).__name__ for reg in self.ml.regressors]:
        self.excel[reg_name + '-Score'] = []
def do_prediction(self, model_name, model, X, y=None, cbn_name='temp'):
    """
    This performs prediction using a given SL learning

    :param model_name: A name of SL learning
    :param model: An object of SL learning
    :param X: Data for X variables
    :param y: Data for a y variable
    :param cbn_name: A special parameter for continuous BN learning
        (only passed on when model_name is 'ContinuousBNRegressor')
    :return: The pair (yPredicted, r2) produced by the RegressionML
        prediction call (the second value is named r2 here; presumably it
        is the score for self.metrics -- confirm against RegressionML)
    """
    predictor = RegressionML(self.metrics)

    # Compare by equality. An identity check (`is`) against a string literal
    # only succeeds when both strings are the same interned object.
    if model_name == 'ContinuousBNRegressor':
        yPredicted, r2 = predictor.ContinuousBNRegressor_prediction(
            cbn_name, model, X, y)
    else:
        yPredicted, r2 = predictor.prediction(model, X, y)

    return yPredicted, r2
def __init__(self, regression=True):
    """Configure a single-target experiment that only uses RandomForestRegressor.

    :param regression: Not used in this method; the base initializer is
        called without it
    """
    super().__init__()

    # Alternative targets: 'OV (Oily value)', 'OS (Oil separation)'
    self.cf['target'] = 'Turbidity'

    # Normalization switch (False disables it)
    self.normalization = True

    # Metric name; 'RMSE' is the alternative
    self.metrics = 'MAE'

    self.ml = RegressionML()

    # Use only RandomForestRegressor
    self.ml.regressors = [RandomForestRegressor(n_estimators=100)]

    for reg_name in [type(reg).__name__ for reg in self.ml.regressors]:
        self.excel[reg_name + '-Score'] = []
def __init__(self, regression=True):
    """Build the shared experiment configuration and pick the ML back end.

    :param regression: True selects RegressionML; any other value selects
        ClassificationML
    """
    input_columns = [
        'Critical micelle concentration (CMClog) (ppm)',
        'Equilibrium surface tension (ST) above CMC (air) (mN/M)',
        'Equilibrium interfacial tension (IFT) above CMC with NSBM (mN/M)',
        'Micelle size (nm)',
        'Zeta potential (mV)',
        'Alkalinity (mg CaCO3/L)',
        "Surfactant's Initial pH at 7CMC",
        'Surfactant concentration (ppm)',
        'pH',
        'Suspended solids concentration (ppm)',
        'Salinity (ppm)',
        'Temperature (°C)',
    ]

    self.cf = {
        'file': '../data/20200203_UCF_Env_Data_DD_updated_for_paper_11-10-2020.xlsx',
        'sheet': 'Image analysis',
        # A single-target variant would be ['Turbidity']
        'targets': ['OV (Oily value)', 'OS (Oil separation)', 'Turbidity'],
        # None is the other value that has been used for 'target'
        'target': 'OV (Oily value)',
        'selected_Xs': input_columns,
        'regression': regression,  # False means 'classification'
        'normalization': [False],  # False means 'non-normalization'
    }

    # Only the literal True selects the regression back end.
    learner_cls = RegressionML if self.cf['regression'] is True else ClassificationML
    self.ml = learner_cls()

    # Define the number of data split
    self.cf['n_splits'] = 10

    # Define excel data for storing all data to Excel
    self.init_excel_file()
class RegressionExperiment(Model_Setting):
    """Regression experiment: k-fold cross validation plus one hold-out evaluation per target.

    Relies on ``self.cf``, ``self.excel`` and ``save_excel_file`` being provided
    by ``Model_Setting``, which is defined outside this block.
    """

    def __init__(self, regression=True):
        """Configure target, normalization and metric, and add one '-Score' column per regressor.

        :param regression: Not used in this method; the base initializer is
            called without it
        """
        super().__init__()

        # Alternative targets: 'OS (Oil separation)', 'Turbidity'
        self.cf['targets'] = ['OV (Oily value)']

        # Normalization switch (False disables step 5 in run_experiment)
        self.normalization = True

        # Metric name; 'RMSE' is the alternative
        self.metrics = 'MAE'

        self.ml = RegressionML()
        for clf in self.ml.regressors:
            self.excel[type(clf).__name__ + '-Score'] = []

    def run_experiment(self, excel_data):
        """Run cross validation and a final train/test evaluation for every configured target.

        :param excel_data: DataFrame that contains the selected X columns and
            every target column
        """
        for target in self.cf['targets']:
            print('target: ', target)
            self.cf['target'] = target

            # 2. Select variables
            data = excel_data[self.cf['selected_Xs'] + [target]]

            # 3. Remove data if it contains 'nan'
            # The explicit copy makes the filtered frame independent, so the
            # column assignment in step 4 does not raise pandas'
            # SettingWithCopyWarning.
            data = data[~data[target].isna()].copy()

            # 4. Normalization for the target variable
            data[target] = MinMaxScaler(feature_range=(0, 1)).fit_transform(
                data[target].to_frame())

            # 5. Normalization is performed (only for the literal True)
            if self.normalization is True:
                scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
                data = pd.DataFrame(scaled, index=data.index, columns=data.columns)

            # 6. Separate Xs and y
            df_X, df_y = data[self.cf['selected_Xs']], data[target]

            # 9. Split into training and test part
            X_train, X_test, y_train, y_test = train_test_split(
                df_X, df_y, test_size=.2, random_state=42)

            # 10. Split data for cross validation
            n_splits = self.cf['n_splits']
            if n_splits > 0:
                cv_results = {}
                # Loop-invariant: the training target as a one-column frame
                y_train_frame = pd.DataFrame(data=y_train)

                for train_index, val_index in KFold(n_splits=n_splits).split(X_train):
                    train_x, val_x = X_train.iloc[train_index], X_train.iloc[val_index]
                    train_y = y_train_frame.iloc[train_index]
                    val_y = y_train_frame.iloc[val_index]

                    # 11. Set data X and y for cross validation
                    self.ml.set_train_test_data(train_x, val_x, train_y, val_y)

                    # 12. Perform ML for cross validation and collect one
                    # score per regressor and fold
                    for name, score in self.ml.perform_ML().items():
                        cv_results.setdefault(name, []).append(score)

                for name, scores in cv_results.items():
                    print(f'[{name}] mean-std [{round(mean(scores), 4)} ({round(stdev(scores), 4)})')

            # 13. Set data X and y for ML
            self.ml.set_train_test_data(X_train, X_test, y_train, y_test)

            # 14. Perform ML
            results = self.ml.perform_ML(file_save=f'../output/{target}')

            # 15. Set all results for the excel output
            for clf in self.ml.regressors:
                name = type(clf).__name__
                score = round(results[name], 4)
                self.excel[name + '-Score'].append(score)
                print(name, score)

            self.excel['Train Size'].append(len(X_train))
            self.excel['Test Size'].append(len(X_test))
            self.excel['Target'].append(target)

    def run(self):
        """Load the configured Excel sheet, run the experiment and save the results."""
        # 1. Load data from xlsx
        excel_data = pd.read_excel(self.cf['file'], self.cf['sheet'])

        self.run_experiment(excel_data)
        self.save_excel_file()
class OneVariableRemoval:
    """Sensitivity experiment that retrains the regressors with one X variable removed at a time."""

    def __init__(self):
        """Set up the configuration and the empty result columns for the Excel output."""
        # Sets configuration
        # (a shorter variable set, ['DIC', 'pH', 'Phosphate'], was previously
        # used for 'selected_Xs')
        self.cf = {}
        self.cf['sheet'] = 'Sheet1'
        self.cf['selected_Xs'] = ['Depth (µm)', 'Concentration', 'DIC', 'pH', 'Phosphate']
        self.cf['distance'] = 'Depth (µm)'
        # 'Concentration' is not part of the removal targets
        self.cf['removal_targets'] = ['DIC', 'pH', 'Phosphate']

        # Number of repeated train/test experiments (2 was previously used)
        self.size_experiments = 5

        # Define excel data for storing all data to Excel
        self.excel = {}
        self.excel['Data'] = []
        self.excel['Target'] = []
        self.excel['Removed'] = []
        self.excel['Normalization'] = []
        self.excel['Train Size'] = []
        self.excel['Test Size'] = []

        self.ml = RegressionML()

        # All '-MAE' columns come first, then all '-MAE-STD' columns.
        for clf in self.ml.regressors:
            self.excel[type(clf).__name__ + '-MAE'] = []
        for clf in self.ml.regressors:
            self.excel[type(clf).__name__ + '-MAE-STD'] = []

        # set model info
        self.excel['X Variables'] = list(self.cf['selected_Xs'])

    def append_excel_column(self):
        """Append each column's own name to that column (a header-like row)."""
        for column in ('Data', 'Target', 'Removed', 'Normalization',
                       'Train Size', 'Test Size', 'X Variables'):
            self.excel[column].append(column)

        for clf in self.ml.regressors:
            column = type(clf).__name__ + '-MAE'
            self.excel[column].append(column)
        for clf in self.ml.regressors:
            column = type(clf).__name__ + '-MAE-STD'
            self.excel[column].append(column)

    def save_excel_file(self):
        """Write all collected columns to a time-stamped result workbook."""
        experiment_time = datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
        excel_file = f"../data_sensitivity_analysis/[Result]{experiment_time}.xlsx"
        excel_experiment = SaveResults(excel_file)
        for column, values in self.excel.items():
            excel_experiment.insert(column, values)
        excel_experiment.save()

    def run(self, f):
        """Run the one-variable-removal experiments for one input workbook.

        For every target and every removal target, the regressors are trained
        ``self.size_experiments`` times on random 80/20 splits, and the mean and
        standard deviation of their scores are appended to ``self.excel``.

        :param f: Path of the input Excel file; when its base name contains
            'pH' the target is 'pH_output', otherwise 'Cs', 'J' and 'K'
        """
        base_name = os.path.basename(f)
        print(base_name)
        self.cf['input_file'] = f
        self.cf['base_name'] = base_name

        if 'pH' in base_name:
            self.cf['targets'] = ['pH_output']
        else:
            self.cf['targets'] = ['Cs', 'J', 'K']

        # 1. Loads data from xlsx
        # Read once: the sheet is the same for every target/removed pair, and
        # the steps below only derive new frames from it.
        raw_data = pd.read_excel(self.cf['input_file'], self.cf['sheet'])

        for target in self.cf['targets']:
            self.cf['target'] = target

            for removed in self.cf['removal_targets']:
                selected_Xs = self.cf['selected_Xs'].copy()
                selected_Xs.remove(removed)

                # 2. Selects variables
                excel_data = raw_data[selected_Xs + [target]]

                # 3. Removes all duplicated
                excel_data = excel_data.drop_duplicates()

                # 5. Normalization is performed
                scaled_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(excel_data)
                excel_data = pd.DataFrame(
                    scaled_df, index=excel_data.index, columns=excel_data.columns)

                # 6. Separate Xs and y
                df_X, df_y = excel_data[selected_Xs], excel_data[target]

                # 9. Perform several ML experiments
                all_results = {}
                for _ in range(self.size_experiments):
                    # 9. Split into training and test part
                    X_train, X_test, y_train, y_test = train_test_split(
                        df_X, df_y, test_size=.2)

                    # 13. Set data X and y for ML
                    self.ml.set_train_test_data(X_train, X_test, y_train, y_test)

                    # 14. Perform ML
                    results = self.ml.perform_ML()

                    # Collect one score per regressor and repetition. Only
                    # names seen in the first result set are tracked.
                    if not all_results:
                        all_results = {name: [score] for name, score in results.items()}
                    else:
                        for name, history in all_results.items():
                            if name in results:
                                history.append(results[name])

                # 15. Set all results for the excel output
                for clf in self.ml.regressors:
                    name = type(clf).__name__
                    self.excel[name + '-MAE'].append(round(mean(all_results[name]), 4))
                    self.excel[name + '-MAE-STD'].append(round(stdev(all_results[name]), 4))

                self.excel['Data'].append(self.cf['base_name'])
                self.excel['Normalization'].append('True')
                # Sizes are taken from the last split of the repetition loop
                self.excel['Train Size'].append(len(X_train))
                self.excel['Test Size'].append(len(X_test))
                self.excel['Target'].append(target)
                self.excel['Removed'].append(removed)
class SensitivityAnalysisExperiment(Model_Setting):
    """Sensitivity analysis: F-test ranking of the X variables and leave-one-variable-out retraining.

    Relies on ``self.cf``, ``self.excel`` and ``save_excel_file`` being provided
    by ``Model_Setting``, which is defined outside this block.
    """

    def __init__(self, regression=True):
        """Extend the base configuration with MAE / MAE-STD result columns.

        :param regression: Not used in this method; the base initializer is
            called without it
        """
        super().__init__()

        # Alternative targets: 'OS (Oil separation)', 'Turbidity'
        self.cf['targets'] = ['OV (Oily value)']
        self.size_experiments = 10

        self.ml = RegressionML()

        # All '-MAE' columns are registered first, then all '-MAE-STD' columns.
        for clf in self.ml.regressors:
            self.excel[type(clf).__name__ + '-MAE'] = []
        for clf in self.ml.regressors:
            self.excel[type(clf).__name__ + '-MAE-STD'] = []

    def show_heatmap_matrix(self, df, columns):
        """Show an annotated heatmap of the correlation coefficients between the given columns.

        :param df: DataFrame that contains the columns
        :param columns: Column names to correlate; also used as axis tick labels
        """
        cm = np.corrcoef(df[columns].values.T)
        sns.set(font_scale=0.8)
        sns.heatmap(cm,
                    annot=True,
                    square=True,
                    fmt='.2f',
                    annot_kws={'size': 15},
                    yticklabels=columns,
                    xticklabels=columns)
        plt.show()
        plt.close()

    def get_f_regression(self, X_train, y_train):
        """Run ``f_regression`` and report the X variables ranked by F statistic.

        :param X_train: DataFrame of X variables
        :param y_train: Target values
        :return: One '<name>\\t<F value>\\r' entry per variable, highest F
            value first, concatenated into a single string
        """
        f_test, _ = f_regression(X_train, y_train)

        # Map column name -> F statistic, then order by statistic (descending)
        scores = dict(zip(X_train.columns.values, f_test))
        ranked = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

        print("======================")
        print("= F-test =")

        entries = []
        for name, score in ranked.items():
            entries.append(f'{name}\t{score}\r')
            print(f'{name}\t{score}')

        return ''.join(entries)

    def run_ml_removing_sensitivity_analysis(self):
        """Retrain the regressors with each X variable removed in turn and save the scores.

        For every removed variable and every target, the regressors are trained
        ``self.size_experiments`` times on random 80/20 splits; the mean and the
        standard deviation of their scores are appended to ``self.excel``.
        """
        # 1. Load data from xlsx
        excel_data = pd.read_excel(self.cf['file'], self.cf['sheet'])

        for X in self.cf['selected_Xs']:
            selected_Xs = self.cf['selected_Xs'].copy()
            selected_Xs.remove(X)

            print(f'******************** removed {X} ********************')

            for target in self.cf['targets']:
                print('target: ', target)
                self.cf['target'] = target

                # 2. Select variables
                data = excel_data[selected_Xs + [target]]

                # 3. Remove data if it contains 'nan'
                # The explicit copy keeps the column assignment in step 4 from
                # raising pandas' SettingWithCopyWarning.
                data = data[~data[target].isna()].copy()

                if len(data) == 0:
                    continue

                # 4. Surfactant is converted to a numeric variable
                if 'Surfactant name' in data.columns.values:
                    data['Surfactant name'] = LabelEncoder().fit_transform(
                        data['Surfactant name'])

                # 5. Normalization is performed
                scaled_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
                data = pd.DataFrame(scaled_df, index=data.index, columns=data.columns)

                # 6. Separate Xs and y
                df_X, df_y = data[selected_Xs], data[target]

                # 9. Perform several ML experiments
                all_results = {}
                for _ in range(self.size_experiments):
                    # 9. Split into training and test part
                    X_train, X_test, y_train, y_test = train_test_split(
                        df_X, df_y, test_size=.2)

                    # 13. Set data X and y for ML
                    self.ml.set_train_test_data(X_train, X_test, y_train, y_test)

                    # 14. Perform ML
                    results = self.ml.perform_ML()

                    # Collect one score per regressor and repetition. Only
                    # names seen in the first result set are tracked.
                    if not all_results:
                        all_results = {name: [score] for name, score in results.items()}
                    else:
                        for name, history in all_results.items():
                            if name in results:
                                history.append(results[name])

                # 15. Set all results for the excel output
                for clf in self.ml.regressors:
                    name = type(clf).__name__
                    self.excel[name + '-MAE'].append(
                        round(mean(all_results[name]), 4))
                    self.excel[name + '-MAE-STD'].append(
                        round(stdev(all_results[name]), 4))

                # Sizes are taken from the last split of the repetition loop
                self.excel['Train Size'].append(len(X_train))
                self.excel['Test Size'].append(len(X_test))
                self.excel['Target'].append(target)
                # NOTE(review): 'X Removed' and 'F Results' are not created in
                # this class; presumably the base class sets them up -- confirm.
                self.excel['X Removed'].append(X)
                self.excel['F Results'].append('')

        self.save_excel_file()

    def run_experiment(self, excel_data):
        """Print the F-test ranking of the selected X variables for every configured target.

        :param excel_data: DataFrame that contains the selected X columns and
            every target column
        """
        for target in self.cf['targets']:
            print('target: ', target)
            self.cf['target'] = target

            # 2. Select variables
            data = excel_data[self.cf['selected_Xs'] + [target]]

            # 3. Remove data if it contains 'nan'
            data = data[~data[target].isna()]

            # 5. Normalization is performed
            scaled_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
            data = pd.DataFrame(scaled_df, index=data.index, columns=data.columns)

            # 6. Separate Xs and y
            df_X, df_y = data[self.cf['selected_Xs']], data[target]

            # 8. Perform Sensitivity Analysis
            # The ranking is printed by get_f_regression; the returned string
            # is not stored anywhere by this method.
            self.get_f_regression(df_X, df_y)

    def run(self):
        """Load the configured Excel sheet, run the F-test analysis and save the results."""
        # 1. Load data from xlsx
        excel_data = pd.read_excel(self.cf['file'], self.cf['sheet'])

        self.run_experiment(excel_data)
        self.save_excel_file()