コード例 #1
0
    def do_machine_learning(self,
                            prediction_alg,
                            x_train,
                            y_train,
                            cbn_name='temp'):
        """
        This trains the SL learning selected by its name
        :param prediction_alg: A name of SL learning; one of
            'GradientBoostingRegressor', 'RandomForestRegressor',
            'GaussianProcessRegressor', 'LinearRegression' or
            'ContinuousBNRegressor'
        :param x_train: Training data for X variables
        :param y_train: Training data for a y variable
        :param cbn_name: A special parameter for continuous BN learning;
            only passed on when prediction_alg is 'ContinuousBNRegressor'
        :return: Whatever the matching RegressionML training method returns
        :raises ValueError: If prediction_alg is not one of the names above
        """
        # Algorithm name -> name of the RegressionML method that trains it.
        trainer_names = {
            'GradientBoostingRegressor': 'GradientBoostingRegressor_ML',
            'RandomForestRegressor': 'RandomForestRegressor_ML',
            'GaussianProcessRegressor': 'GaussianProcessRegressor_ML',
            'LinearRegression': 'LinearRegressor_ML',
            'ContinuousBNRegressor': 'ContinuousBNRegressor_ML',
        }

        # Names are matched by value. The previous identity test (`is`) only
        # matched when both strings happened to be the same interned object,
        # and an unmatched name ended in an UnboundLocalError on `model`.
        if prediction_alg not in trainer_names:
            raise ValueError(
                f'Unsupported prediction_alg: {prediction_alg!r}')

        train = getattr(RegressionML(self.metrics),
                        trainer_names[prediction_alg])

        # The continuous BN trainer is the only one that takes cbn_name.
        if prediction_alg == 'ContinuousBNRegressor':
            model = train(cbn_name, x_train, y_train)
        else:
            model = train(x_train, y_train)
        return model
コード例 #2
0
    def __init__(self):
        """
        This sets the configuration, the number of repeated experiments and
        the empty result columns that are later written to Excel
        """
        # Sets configuration
        self.cf = {
            'sheet': 'Sheet1',
            'selected_Xs': ['Depth (µm)', 'Concentration', 'DIC', 'pH', 'Phosphate'],
            'distance': 'Depth (µm)',
            'removal_targets': ['DIC', 'pH', 'Phosphate'],
        }

        # Number of repetitions stored for later experiments
        self.size_experiments = 5

        # Define excel data for storing all data to Excel
        fixed_columns = ('Data', 'Target', 'Removed', 'Normalization',
                         'Train Size', 'Test Size')
        self.excel = {column: [] for column in fixed_columns}

        # One '-MAE' column per regressor first, then one '-MAE-STD' column
        # per regressor, so the column order stays grouped by suffix
        self.ml = RegressionML()
        for suffix in ('-MAE', '-MAE-STD'):
            for regressor in self.ml.regressors:
                self.excel[type(regressor).__name__ + suffix] = []

        # set model info: the column starts out holding the X variable names
        self.excel['X Variables'] = list(self.cf['selected_Xs'])
コード例 #3
0
    def __init__(self, regression=True):
        """
        This extends the parent settings with the target, the number of
        repeated experiments and the MAE result columns
        :param regression: Accepted but not used here; the parent initializer
            is called without it
        """
        super().__init__()

        # Alternative targets toggled here before: 'OS (Oil separation)',
        # 'Turbidity'
        self.cf['targets'] = ['OV (Oily value)']

        self.size_experiments = 10
        self.ml = RegressionML()

        # All '-MAE' columns are created before any '-MAE-STD' column
        for suffix in ('-MAE', '-MAE-STD'):
            for regressor in self.ml.regressors:
                self.excel[type(regressor).__name__ + suffix] = []
コード例 #4
0
    def __init__(self, regression=True):
        """
        This extends the parent settings with the target, the normalization
        flag, the metric name and one score column per regressor
        :param regression: Accepted but not used here; the parent initializer
            is called without it
        """
        super().__init__()

        # Alternative targets toggled here before: 'OS (Oil separation)',
        # 'Turbidity'
        self.cf['targets'] = ['OV (Oily value)']

        # Set to False to skip normalization
        self.normalization = True

        # 'RMSE' was the alternative toggled here before
        self.metrics = 'MAE'

        self.ml = RegressionML()
        self.excel.update({
            type(regressor).__name__ + '-Score': []
            for regressor in self.ml.regressors
        })
コード例 #5
0
    def do_prediction(self, model_name, model, X, y=None, cbn_name='temp'):
        """
        This performs prediction using a given SL learning
        :param model_name: A name of SL learning; 'ContinuousBNRegressor'
            selects the continuous BN prediction, any other name the generic
            prediction
        :param model: An object of SL learning
        :param X: Data for X variables
        :param y: Data for a y variable
        :param cbn_name: A special parameter for continuous BN learning;
            only passed on for 'ContinuousBNRegressor'
        :return: A pair (predicted values, second value returned by
            RegressionML); the second value is called r2 by the original
            code - presumably a score, confirm against RegressionML
        """
        learner = RegressionML(self.metrics)

        # Compare by value: an identity test (`is`) against a literal only
        # matches when both strings are the same interned object, so a name
        # built at runtime silently fell through to the generic prediction.
        if model_name == 'ContinuousBNRegressor':
            y_predicted, r2 = learner.ContinuousBNRegressor_prediction(
                cbn_name, model, X, y)
        else:
            y_predicted, r2 = learner.prediction(model, X, y)
        return y_predicted, r2
    def __init__(self, regression=True):
        """
        This extends the parent settings for a single target and restricts
        the learners to one RandomForestRegressor
        :param regression: Accepted but not used here; the parent initializer
            is called without it
        """
        super().__init__()

        # Alternative targets toggled here before: 'OV (Oily value)',
        # 'OS (Oil separation)'
        self.cf['target'] = 'Turbidity'

        # Set to False to skip normalization
        self.normalization = True

        # 'RMSE' was the alternative toggled here before
        self.metrics = 'MAE'

        # Use only RandomForestRegressor
        self.ml = RegressionML()
        self.ml.regressors = [RandomForestRegressor(n_estimators=100)]

        self.excel.update({
            type(regressor).__name__ + '-Score': []
            for regressor in self.ml.regressors
        })
コード例 #7
0
    def __init__(self, regression=True):
        """
        This sets the shared configuration, selects the learner and prepares
        the Excel result storage
        :param regression: The object True selects RegressionML; any other
            value selects ClassificationML
        """
        input_columns = [
            'Critical micelle concentration (CMClog) (ppm)',
            'Equilibrium surface tension (ST) above CMC (air) (mN/M)',
            'Equilibrium interfacial tension (IFT) above CMC with NSBM (mN/M)',
            'Micelle size (nm)',
            'Zeta potential (mV)',
            'Alkalinity (mg CaCO3/L)',
            "Surfactant's Initial pH at 7CMC",
            'Surfactant concentration (ppm)',
            'pH',
            'Suspended solids concentration (ppm)',
            'Salinity (ppm)',
            'Temperature (°C)',
        ]

        self.cf = {
            'file': '../data/20200203_UCF_Env_Data_DD_updated_for_paper_11-10-2020.xlsx',
            'sheet': 'Image analysis',
            'targets': ['OV (Oily value)', 'OS (Oil separation)', 'Turbidity'],
            'target': 'OV (Oily value)',
            'selected_Xs': input_columns,
            'regression': regression,  # False means 'classification'
            'normalization': [False],  # False means 'non-normalization'
        }

        # Identity check: only the object True selects the regression learner
        self.ml = RegressionML() if regression is True else ClassificationML()

        # ========================================================
        # Define the number of data split
        self.cf['n_splits'] = 10

        # Define excel data for storing all data to Excel
        self.init_excel_file()
コード例 #8
0
class RegressionExperiment(Model_Setting):
    """
    Regression experiment that trains every regressor held by RegressionML
    on each configured target and records one score per regressor.

    This class uses self.cf, self.excel and save_excel_file without defining
    them; they are presumably provided by Model_Setting.
    """

    def __init__(self, regression=True):
        """
        This extends the parent settings with the target, the normalization
        flag, the metric name and one score column per regressor
        :param regression: Accepted but not used here; the parent initializer
            is called without it
        """
        super().__init__()

        # Alternative targets toggled here before: 'OS (Oil separation)',
        # 'Turbidity'
        self.cf['targets'] = ['OV (Oily value)']

        # Set to False to skip step 5 (normalization of all columns)
        self.normalization = True

        # 'RMSE' was the alternative toggled here before
        self.metrics = 'MAE'

        self.ml = RegressionML()
        for clf in self.ml.regressors:
            self.excel[type(clf).__name__ + '-Score'] = []

    def _cross_validate(self, X_train, y_train, n_splits):
        """
        This runs K-fold cross validation on the training part and prints
        mean and standard deviation of the results per result name
        :param X_train: Training data for X variables
        :param y_train: Training data for a y variable
        :param n_splits: Number of folds given to KFold
        """
        cv_results = {}

        # Loop-invariant: the frame form of y is the same for every fold
        df_y_train = pd.DataFrame(data=y_train)

        for train_index, val_index in KFold(n_splits=n_splits).split(X_train):
            train_x, val_x = X_train.iloc[train_index], X_train.iloc[val_index]
            train_y = df_y_train.iloc[train_index]
            val_y = df_y_train.iloc[val_index]

            # 11. Set data X and y for cross validation
            self.ml.set_train_test_data(train_x, val_x, train_y, val_y)

            # 12. Perform ML for cross validation
            results = self.ml.perform_ML()

            for name, score in results.items():
                cv_results.setdefault(name, []).append(score)

        for name, scores in cv_results.items():
            print(f'[{name}] mean-std [{round(mean(scores), 4)} ({round(stdev(scores), 4)})')

    def run_experiment(self, excel_data):
        """
        This runs the experiment for every target in self.cf['targets'] and
        appends the results to self.excel
        :param excel_data: Data frame holding the selected X columns and the
            target columns
        """
        for target in self.cf['targets']:
            print('target: ', target)
            self.cf['target'] = target

            # 2. Select variables
            data = excel_data[self.cf['selected_Xs'] + [target]]

            # 3. Remove data if it contains 'nan'
            # An explicit copy is taken because step 4 assigns a column;
            # assigning on the filtered slice triggers pandas'
            # SettingWithCopyWarning.
            data = data[~data[target].isna()].copy()

            # 4. Normalization for the target variable
            scaled_target_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(
                data[target].to_frame())
            data[target] = scaled_target_df

            # 5. Normalization is performed
            if self.normalization is True:
                scaled_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
                data = pd.DataFrame(scaled_df, index=data.index, columns=data.columns)

            # 6. Separate Xs and y
            df_X, df_y = data[self.cf['selected_Xs']], data[target]

            # 9. Split into training and test part
            X_train, X_test, y_train, y_test = train_test_split(
                df_X, df_y, test_size=.2, random_state=42)

            # 10. Split data for cross validation
            n_splits = self.cf['n_splits']
            if n_splits > 0:
                self._cross_validate(X_train, y_train, n_splits)

            # 13. Set data X and y for ML
            self.ml.set_train_test_data(X_train, X_test, y_train, y_test)

            # 14. Perform ML
            results = self.ml.perform_ML(file_save=f'../output/{target}')

            # 15. Set all results for the excel output
            for clf in self.ml.regressors:
                name = type(clf).__name__
                score = round(results[name], 4)
                self.excel[name + '-Score'].append(score)
                print(name, score)

            self.excel['Train Size'].append(len(X_train))
            self.excel['Test Size'].append(len(X_test))
            self.excel['Target'].append(target)

    def run(self):
        """
        This loads the configured Excel sheet, runs the experiment and saves
        the collected results
        """
        # 1. Load data from xlsx
        excel_data = pd.read_excel(self.cf['file'], self.cf['sheet'])

        self.run_experiment(excel_data)

        self.save_excel_file()
コード例 #9
0
class OneVariableRemoval:
    """
    Experiment that removes one X variable at a time, repeats the regression
    several times on random splits and stores mean and standard deviation of
    the results per regressor for the Excel output.
    """

    def __init__(self):
        """
        This sets the configuration, the number of repeated experiments and
        the empty result columns that are later written to Excel
        """
        # Sets configuration
        self.cf = {}
        self.cf['sheet'] = 'Sheet1'
        self.cf['selected_Xs'] = ['Depth (µm)', 'Concentration', 'DIC', 'pH', 'Phosphate']
        self.cf['distance'] = 'Depth (µm)'
        self.cf['removal_targets'] = ['DIC', 'pH', 'Phosphate']

        # Number of random train/test splits per (target, removed) pair
        self.size_experiments = 5

        # Define excel data for storing all data to Excel
        self.excel = {}
        for column in ('Data', 'Target', 'Removed', 'Normalization',
                       'Train Size', 'Test Size'):
            self.excel[column] = []

        # All '-MAE' columns are created before any '-MAE-STD' column so the
        # column order stays grouped by suffix
        self.ml = RegressionML()
        for suffix in ('-MAE', '-MAE-STD'):
            for clf in self.ml.regressors:
                self.excel[type(clf).__name__ + suffix] = []

        # set model info: the column starts out holding the X variable names
        self.excel['X Variables'] = list(self.cf['selected_Xs'])

    def append_excel_column(self):
        """
        This appends to every result column its own column name
        """
        for column in ('Data', 'Target', 'Removed', 'Normalization',
                       'Train Size', 'Test Size', 'X Variables'):
            self.excel[column].append(column)

        for suffix in ('-MAE', '-MAE-STD'):
            for clf in self.ml.regressors:
                column = type(clf).__name__ + suffix
                self.excel[column].append(column)

    def save_excel_file(self):
        """
        This writes all result columns to a time-stamped Excel file under
        ../data_sensitivity_analysis
        """
        experiment_time = datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
        excel_file = f"../data_sensitivity_analysis/[Result]{experiment_time}.xlsx"
        excel_experiment = SaveResults(excel_file)
        for column, values in self.excel.items():
            excel_experiment.insert(column, values)
        excel_experiment.save()

    def _repeat_experiments(self, df_X, df_y):
        """
        This repeats the ML experiment on self.size_experiments random splits
        :param df_X: Data for X variables
        :param df_y: Data for a y variable
        :return: A tuple (results per name as lists, train size of the last
            split, test size of the last split)
        """
        all_results = {}
        train_size = test_size = 0
        for _ in range(self.size_experiments):
            # 9. Split into training and test part
            X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=.2)
            train_size, test_size = len(X_train), len(X_test)

            # 13. Set data X and y for ML
            self.ml.set_train_test_data(X_train, X_test, y_train, y_test)

            # 14. Perform ML
            results = self.ml.perform_ML()

            if not all_results:
                all_results = {name: [score] for name, score in results.items()}
            else:
                # Only names seen in the first results are collected
                for name, score in results.items():
                    if name in all_results:
                        all_results[name].append(score)

        return all_results, train_size, test_size

    def run(self, f):
        """
        This runs the variable-removal experiment for one input file and
        appends the results to self.excel
        :param f: Path of the input Excel file; a base name containing 'pH'
            selects the target 'pH_output', otherwise 'Cs', 'J' and 'K'
        """
        base_name = os.path.basename(f)
        print(base_name)

        self.cf['input_file'] = f
        self.cf['base_name'] = base_name

        if 'pH' in base_name:
            self.cf['targets'] = ['pH_output']
        else:
            self.cf['targets'] = ['Cs', 'J', 'K']

        # 1. Loads data from xlsx
        # Read once: the sheet does not depend on the target or the removed
        # variable, and the loops below only derive new frames from it.
        raw_data = pd.read_excel(self.cf['input_file'], self.cf['sheet'])

        for target in self.cf['targets']:
            self.cf['target'] = target

            for removed in self.cf['removal_targets']:
                selected_Xs = self.cf['selected_Xs'].copy()
                selected_Xs.remove(removed)

                # 2. Selects variables
                excel_data = raw_data[selected_Xs + [target]]

                # 3. Removes all duplicated
                excel_data = excel_data.drop_duplicates()

                # 5. Normalization is performed
                scaled_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(excel_data)
                excel_data = pd.DataFrame(scaled_df, index=excel_data.index, columns=excel_data.columns)

                # 6. Separate Xs and y
                df_X, df_y = excel_data[selected_Xs], excel_data[target]

                # 9. Perform several ML experiments
                all_results, train_size, test_size = self._repeat_experiments(df_X, df_y)

                # 15. Set all results for the excel output
                for clf in self.ml.regressors:
                    name = type(clf).__name__
                    self.excel[name + '-MAE'].append(round(mean(all_results[name]), 4))
                    self.excel[name + '-MAE-STD'].append(round(stdev(all_results[name]), 4))

                self.excel['Data'].append(self.cf['base_name'])
                self.excel['Normalization'].append('True')
                self.excel['Train Size'].append(train_size)
                self.excel['Test Size'].append(test_size)
                self.excel['Target'].append(target)
                self.excel['Removed'].append(removed)
コード例 #10
0
class SensitivityAnalysisExperiment(Model_Setting):
    """
    Sensitivity analysis that either ranks the X variables with an F-test or
    re-runs the regression with one X variable removed at a time.

    This class uses self.cf, self.excel and save_excel_file without defining
    them; they are presumably provided by Model_Setting.
    """

    def __init__(self, regression=True):
        """
        This extends the parent settings with the target, the number of
        repeated experiments and the MAE result columns
        :param regression: Accepted but not used here; the parent initializer
            is called without it
        """
        super().__init__()

        # Alternative targets toggled here before: 'OS (Oil separation)',
        # 'Turbidity'
        self.cf['targets'] = ['OV (Oily value)']

        self.size_experiments = 10
        self.ml = RegressionML()

        # All '-MAE' columns are created before any '-MAE-STD' column
        for suffix in ('-MAE', '-MAE-STD'):
            for clf in self.ml.regressors:
                self.excel[type(clf).__name__ + suffix] = []

    def show_heatmap_matrix(self, df, columns):
        """
        This shows the correlation coefficients of the given columns as an
        annotated heatmap
        :param df: Data frame holding the columns
        :param columns: Column names used for the matrix and the tick labels
        """
        cm = np.corrcoef(df[columns].values.T)
        sns.set(font_scale=0.8)
        sns.heatmap(cm,
                    annot=True,
                    square=True,
                    fmt='.2f',
                    annot_kws={'size': 15},
                    yticklabels=columns,
                    xticklabels=columns)
        plt.show()
        plt.close()

    def get_f_regression(self, X_train, y_train):
        """
        This ranks the X variables by their f_regression statistic, prints
        the ranking and returns it as text
        :param X_train: Data frame of X variables
        :param y_train: Data for a y variable
        :return: One 'name<TAB>value' entry per variable, each ended by a
            carriage return, ordered from the highest to the lowest value
        """
        f_test, _ = f_regression(X_train, y_train)

        # Going through a dict keeps one entry per column name
        results_f_dict = dict(zip(X_train.columns.values, f_test))
        ranked = sorted(results_f_dict.items(),
                        key=lambda item: item[1],
                        reverse=True)

        print("======================")
        print("=       F-test       =")
        entries = []
        for name, value in ranked:
            entries.append(f'{name}\t{value}\r')
            print(f'{name}\t{value}')

        return ''.join(entries)

    def _repeat_experiments(self, df_X, df_y):
        """
        This repeats the ML experiment on self.size_experiments random splits
        :param df_X: Data for X variables
        :param df_y: Data for a y variable
        :return: A tuple (results per name as lists, train size of the last
            split, test size of the last split)
        """
        all_results = {}
        train_size = test_size = 0
        for _ in range(self.size_experiments):
            # 9. Split into training and test part
            X_train, X_test, y_train, y_test = train_test_split(
                df_X, df_y, test_size=.2)
            train_size, test_size = len(X_train), len(X_test)

            # 13. Set data X and y for ML
            self.ml.set_train_test_data(X_train, X_test, y_train, y_test)

            # 14. Perform ML
            results = self.ml.perform_ML()

            if not all_results:
                all_results = {name: [score] for name, score in results.items()}
            else:
                # Only names seen in the first results are collected
                for name, score in results.items():
                    if name in all_results:
                        all_results[name].append(score)

        return all_results, train_size, test_size

    def run_ml_removing_sensitivity_analysis(self):
        """
        This removes one X variable at a time, repeats the regression for
        every target and saves the collected results
        """
        # 1. Load data from xlsx
        excel_data = pd.read_excel(self.cf['file'], self.cf['sheet'])

        for X in self.cf['selected_Xs']:
            selected_Xs = self.cf['selected_Xs'].copy()
            selected_Xs.remove(X)
            print(f'******************** removed {X} ********************')
            for target in self.cf['targets']:
                print('target: ', target)
                self.cf['target'] = target

                # 2. Select variables
                data = excel_data[selected_Xs + [target]]

                # 3. Remove data if it contains 'nan'
                # An explicit copy is taken because step 4 may assign a
                # column; assigning on the filtered slice triggers pandas'
                # SettingWithCopyWarning.
                data = data[~data[target].isna()].copy()
                if len(data) == 0:
                    continue

                # 4. Surfactant is converted to a numeric variable
                if 'Surfactant name' in data.columns.values:
                    data['Surfactant name'] = LabelEncoder().fit_transform(
                        data['Surfactant name'])

                # 5. Normalization is performed
                scaled_df = MinMaxScaler(feature_range=(0,
                                                        1)).fit_transform(data)
                data = pd.DataFrame(scaled_df,
                                    index=data.index,
                                    columns=data.columns)

                # 6. Separate Xs and y
                df_X, df_y = data[selected_Xs], data[target]

                # 9. Perform several ML experiments
                all_results, train_size, test_size = self._repeat_experiments(
                    df_X, df_y)

                # 15. Set all results for the excel output
                for clf in self.ml.regressors:
                    name = type(clf).__name__
                    self.excel[name + '-MAE'].append(
                        round(mean(all_results[name]), 4))
                    self.excel[name + '-MAE-STD'].append(
                        round(stdev(all_results[name]), 4))

                self.excel['Train Size'].append(train_size)
                self.excel['Test Size'].append(test_size)
                self.excel['Target'].append(target)
                self.excel['X Removed'].append(X)
                self.excel['F Results'].append('')

        self.save_excel_file()

    def run_experiment(self, excel_data):
        """
        This runs the F-test ranking for every target in self.cf['targets']
        :param excel_data: Data frame holding the selected X columns and the
            target columns
        """
        for target in self.cf['targets']:
            print('target: ', target)
            self.cf['target'] = target

            # 2. Select variables
            data = excel_data[self.cf['selected_Xs'] + [target]]

            # 3. Remove data if it contains 'nan'
            data = data[~data[target].isna()]

            # 5. Normalization is performed
            scaled_df = MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
            data = pd.DataFrame(scaled_df,
                                index=data.index,
                                columns=data.columns)

            # 6. Separate Xs and y
            df_X, df_y = data[self.cf['selected_Xs']], data[target]

            # 8. Perform Sensitivity Analysis
            # The ranking is printed by get_f_regression; the returned text
            # is not stored anywhere by this method.
            self.get_f_regression(df_X, df_y)

    def run(self):
        """
        This loads the configured Excel sheet, runs the F-test experiment and
        saves the Excel results
        """
        # 1. Load data from xlsx
        excel_data = pd.read_excel(self.cf['file'], self.cf['sheet'])

        self.run_experiment(excel_data)

        self.save_excel_file()