Example #1
    def test_error(self, test_function='rmse'):
        """
        Computes the error of the current model with the given metric function. The result is stored in the class attribute _test_error.

        Args:
            test_function : string - for options see _test_metric?
        """
        Y_test_data = self.get_data('Y_test')
        X_test_data = self.get_data('X_test')
        Y_test_forecast = nn_out_to_list(self._model.predict(X_test_data[self._current_features]))
        test_rmse = _test_metric(Y_test_data, Y_test_forecast, test_function)
        self._test_error = test_rmse
        print('The RMSE on the test set was: ', test_rmse[0])
        print('The mean percentage error is: ', test_rmse[1], '%.')
        print('\nTo access the results, call get_test_error()')
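Both versions of test_error above unpack a two-element result from _test_metric. That helper is not shown on this page, so the following is only a minimal sketch consistent with how callers use it; the metric names and the percentage-error formula are assumptions:

    import numpy as np

    def _test_metric(y_true, y_pred, test_function='rmse'):
        """Return (error, mean absolute percentage error) for the chosen metric."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        if test_function == 'rmse':
            error = np.sqrt(np.mean((y_true - y_pred) ** 2))
        elif test_function == 'mae':
            error = np.mean(np.abs(y_true - y_pred))
        else:
            raise ValueError('Unknown test_function: %s' % test_function)
        # Callers print result[1] followed by '%', so the second element is
        # assumed to be a mean absolute percentage error (undefined at y_true == 0).
        pct_error = 100.0 * np.mean(np.abs((y_true - y_pred) / y_true))
        return error, pct_error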
Example #2
    def test_error(self, test_function='rmse'):
        """
        Computes the error of the current model with the given metric function. The result is stored in the class attribute _test_error.

        Args:
            test_function : string - for options see _test_metric?
        """
        Y_test_data = self.get_data('Y_test')
        X_test_data = self.get_data('X_test')
        Y_test_forecast = (X_test_data * self.get_latest_params()).sum(axis=1)
        test_rmse = _test_metric(Y_test_data, Y_test_forecast, test_function)
        self._test_error = test_rmse
        print('The RMSE on the test set was: ', test_rmse[0])
        if self._was_regularised:
            print(' ')
            print('NOTE: The model was regularised.')
            print(' ')
        print('The mean percentage error is: ', test_rmse[1], '%.')
        print('\nTo access the results, call get_test_error()')
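The forecast line in Example #2, (X_test_data * self.get_latest_params()).sum(axis=1), is just the matrix-vector product X @ beta written with pandas broadcasting. A tiny self-contained check (the names here are made up for illustration):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({'f1': [1.0, 2.0], 'f2': [3.0, 4.0]})
    beta = pd.Series({'f1': 0.5, 'f2': 2.0})

    # pandas aligns on column names; columns missing from beta would yield NaNs
    manual = (X * beta).sum(axis=1)   # broadcast multiply, then sum over columns
    matmul = X.values @ beta.values   # the same thing as a matrix-vector product
    assert np.allclose(manual.values, matmul)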
Example #3
    def train(self, features=None):
        """
        Train the model on a chosen set of features. If none are chosen, the default is to re-run the model with the current best_features attribute. Note that for this model, training is carried out on the combined training and validation data (X_train and X_val). To access the result, call get_model().

        Args:
            features : list - train model with list of desired features

        Returns:
        """
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError("ERROR: The input training data was not in the form of a pd.DataFrame.")
        print(' ')
        print("Training - ARIMAX")
        print("=================")
        print(" ")
        print("Running ARIMAX model on feauture set:")
        print(" ")
        if features is None:
            features = self.get_best_features()
        pprint.pprint(features)
        print(" ")
        self._current_features = features
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        X_test_data = self.get_data('X_test')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        Y_test_data = self.get_data('Y_test')
        X_train_data_temp = X_train_data[features]
        X_val_data_temp = X_val_data[features]
        X_test_data_temp = X_test_data[features]
        model = SARIMAX(endog=pd.concat([Y_train_data, Y_val_data]),
                        exog=pd.concat([X_train_data_temp, X_val_data_temp]),
                        order=(self.p, self.d, self.q))
        model_fit = model.fit(disp=0)
        self._model = model_fit
        Y_test_pred = model_fit.forecast(
            len(Y_test_data),
            exog=np.array(X_test_data_temp).reshape(
                len(Y_test_data), len(X_test_data_temp.columns)))
        final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')
        self._test_error = final_rmse_test
        print(' ')
        print('The RMSE on the test set was: ', final_rmse_test[0])
        print('The mean percentage error is: ', final_rmse_test[1], '%.')
        print('\nFinished training. To access the most recent classifier, call get_model()')
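Example #3's train() fits a statsmodels SARIMAX with exogenous regressors and then forecasts with one exog row per forecast step. A self-contained sketch of that pattern on synthetic data (all numbers made up):

    import numpy as np
    import pandas as pd
    from statsmodels.tsa.statespace.sarimax import SARIMAX

    rng = np.random.default_rng(0)
    n, train_n = 120, 100
    exog = pd.DataFrame({'x1': rng.normal(size=n)})
    endog = pd.Series(0.8 * exog['x1'] + rng.normal(scale=0.1, size=n))

    model = SARIMAX(endog=endog[:train_n], exog=exog[:train_n], order=(1, 0, 1))
    fit = model.fit(disp=0)
    # forecast() needs exog for every forecast step, hence the reshape in train()
    forecast = fit.forecast(n - train_n, exog=exog[train_n:])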
Example #4
    def train(self,
              features=None,
              epochs=500,
              units=64,
              activation='relu',
              dropout=0.0,
              batch_size=16,
              save=True,
              load=False,
              namespace='SLP_model'):
        """
        Train the model on a chosen set of features. If none are chosen, the default is to re-run the model with the current best_features attribute. Note that the training is carried out on the training data, X_train, only. To access the result, call get_model(); the training history is available via get_history().

        Args:
            features : list - train model with list of desired features
            epochs : int - number of epochs to train for
            units : int - number of units in hidden layer
            activation : str - choose activation function from keras options e.g. 'relu', 'sigmoid' etc.
            dropout : float - dropout rate from hidden layer to output
            batch_size : int - batch size for training
            save : bool - choose whether to save the model after training (This is safer than using the self.get_model() command)
            load : bool - choose whether to load the model from a saved instance
            namespace : str - file name space for saving/loading, will generate/look for files with this namespace and extensions .json/.h5

        Returns:

        Note: The input data for the SLP should be normalised such that each feature lies in the same range e.g. [0,1] to ensure that no one feature dominates. Non-normalised data will result in potentially very poor results.
        """
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        print("Training - Single Layer Perceptron")
        print("==================================")
        print(" ")
        print("Running SLP regression classifier on feauture set:")
        print(" ")
        if features is None:
            features = self.get_best_features()
        pprint.pprint(features)
        print(" ")
        self._current_features = features
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        X_test_data = self.get_data('X_test')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        Y_test_data = self.get_data('Y_test')
        X_train_data_temp = X_train_data[features]
        X_val_data_temp = X_val_data[features]
        X_test_data_temp = X_test_data[features]
        try:
            if not load:
                input_dim = len(X_train_data_temp.columns)
                nn_model = Sequential()
                nn_model.add(
                    Dense(units=units,
                          activation=activation,
                          input_dim=input_dim))
                nn_model.add(Dropout(dropout))
                nn_model.add(Dense(units=1))
                nn_model.compile(loss='mean_squared_error', optimizer='adam')
                history = nn_model.fit(X_train_data_temp,
                                       Y_train_data,
                                       epochs=epochs,
                                       verbose=1,
                                       batch_size=batch_size,
                                       validation_data=(X_val_data_temp,
                                                        Y_val_data),
                                       shuffle=False)
                self.history = history.history
                if save:
                    with open(namespace + '.txt', 'wb') as history_file:
                        pickle.dump(history.history, history_file)
                    with open(namespace + '.json', 'w') as json_file:
                        json_file.write(nn_model.to_json())
                    nn_model.save_weights(namespace + '.h5')
                    print("Saved Model to namespace: ", namespace)
            else:
                try:
                    with open(namespace + '.json', 'r') as json_file:
                        nn_model = model_from_json(json_file.read())
                    nn_model.load_weights(namespace + '.h5')
                    with open(namespace + '.txt', 'rb') as history_file:
                        history = pickle.load(history_file)
                    self.history = history
                    print("Loaded Model from namespace: ", namespace)
                except (OSError, IOError) as e:
                    print("ERROR: Model not found.")
                    raise RuntimeError("Now exiting training.")
            self._model = nn_model
            Y_val_pred = nn_out_to_list(nn_model.predict(X_val_data_temp))
            Y_test_pred = nn_out_to_list(nn_model.predict(X_test_data_temp))
            final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
            self._val_rmse = final_rmse_val
            final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')
            print(' ')
            print('The RMSE on the validation set was: ', final_rmse_val[0])
            print('The mean percentage error is: ', final_rmse_val[1], '%.')
            print(' ')
            print('The RMSE on the test set was: ', final_rmse_test[0])
            print('The mean percentage error is: ', final_rmse_test[1], '%.')
            print(
                '\nFinished training. To access the most recent classifier, call get_model(). To access the training history, use get_history().'
            )
        except RuntimeError as re:
            print(re.args[0])
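The save/load branches in Example #4 persist the architecture as JSON and the weights as HDF5, plus the training history via pickle. A self-contained sketch of the JSON + weights round trip (shown with tf.keras imports, which may differ from the library's own; behaviour can vary slightly across Keras versions):

    from tensorflow.keras.models import Sequential, model_from_json
    from tensorflow.keras.layers import Dense, Dropout

    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=8))
    model.add(Dropout(0.0))
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # Save architecture and weights separately, as train() does.
    with open('SLP_model.json', 'w') as f:
        f.write(model.to_json())
    model.save_weights('SLP_model.h5')  # Keras 3 expects a .weights.h5 suffix

    # Reload: rebuild the model from JSON, then load the weights into it.
    with open('SLP_model.json') as f:
        reloaded = model_from_json(f.read())
    reloaded.load_weights('SLP_model.h5')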
Example #5
    def train(self, features=None, params=None):
        """
        Train the model on a chosen set of features. If none are chosen, the default is to re-run the model with the current best_features attribute. Note that the training is carried out on the training data, X_train, only. To access the result, call get_model().

        Args:
            features : list - train model with list of desired features
            params : dictionary e.g:  {'nthread': 4,
                                      'objective': 'reg:linear',
                                      'learning_rate': 0.02,
                                      'max_depth': 10,
                                      'min_child_weight': 4,
                                      'silent': 1,
                                      'subsample': 0.7,
                                      'colsample_bytree': 0.7,
                                      'n_estimators': 200}

        Returns:

        Note: The input data for the XGBoost model should be normalised such that each feature lies in the same range e.g. [0,1] to ensure that no one feature dominates. Non-normalised data will result in potentially very poor results.
        """
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        print("Training - XGBoost Regression Classifier")
        print("==============================================")
        print(" ")
        print("Running XGBoost regression classifier on feauture set:")
        print(" ")
        if features is None:
            features = self.get_best_features()
        pprint.pprint(features)
        print(" ")
        self._current_features = features
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        X_test_data = self.get_data('X_test')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        Y_test_data = self.get_data('Y_test')
        X_train_data_temp = X_train_data[features]
        X_val_data_temp = X_val_data[features]
        X_test_data_temp = X_test_data[features]
        if params is None:
            params = self._latest_params

        xgb_model = xgboost.XGBRegressor(**params)
        xgb_model.fit(X_train_data_temp, Y_train_data)
        self._model = xgb_model
        Y_val_pred = xgb_model.predict(X_val_data_temp)
        Y_test_pred = xgb_model.predict(X_test_data_temp)
        print("Training r-squared:",
              xgb_model.score(X_train_data_temp, Y_train_data))
        print("Validation r-squared:",
              xgb_model.score(X_val_data_temp, Y_val_data))
        print("Testing r-squared:",
              xgb_model.score(X_test_data_temp, Y_test_data))
        final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
        self._val_rmse = final_rmse_val
        final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')
        print(' ')
        print('The RMSE on the validation set was: ', final_rmse_val[0])
        print('The mean percentage error is: ', final_rmse_val[1], '%.')
        print(' ')
        print('The RMSE on the test set was: ', final_rmse_test[0])
        print('The mean percentage error is: ', final_rmse_test[1], '%.')
        print(
            '\nFinished training. To access the most recent classifier, call get_model()'
        )
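The params dictionary in Example #5's docstring maps directly onto xgboost.XGBRegressor keyword arguments. A hedged, self-contained sketch with made-up data; note that newer xgboost releases rename some keys ('reg:linear' becomes 'reg:squarederror', 'nthread' becomes 'n_jobs') and drop 'silent':

    import numpy as np
    import pandas as pd
    import xgboost

    params = {'objective': 'reg:squarederror',
              'learning_rate': 0.02,
              'max_depth': 10,
              'min_child_weight': 4,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'n_estimators': 200}

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(50, 3)), columns=['a', 'b', 'c'])
    y = 2.0 * X['a'] + rng.normal(scale=0.1, size=50)
    model = xgboost.XGBRegressor(**params).fit(X, y)
    print('training r-squared:', model.score(X, y))  # as printed by train() above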
Example #6
    def feature_selection(self, test_function='rmse'):
        """
        Runs through a feature selection algorithm which enumerates the possible subsets of the input features and attempts to minimise the test_metric error on the validation set after training the classifier on the training data. Updates the self.best_features attribute, which can then be used to run the full model on the training and test data. This is only really appropriate for a relatively small number of features. To limit the computational cost, there is no hyperparameter optimisation; instead, standard parameters are calculated from the data. Once the best feature set has been identified, one can use the additional functionality in the library to tune the hyperparameters.

        Args:
            test_function : string - default is 'rmse', but other metrics are available; see _test_metric? for more information

        Note: the input data for the features must be in the form of a pd.DataFrame
        """
        try:
            if not isinstance(self.get_data('X_train'), pd.DataFrame):
                raise TypeError(
                    "ERROR: The input training data was not in the form of a pd.DataFrame."
                )
            feature_set = list(powerset(self.input_features))
            print("Feature Selection")
            print("=================")
            print(" ")
            print("Running feature selection on a feature set of size: ",
                  len(feature_set) - 1)
            print(" ")
            feature_dict = {}
            list_results = []
            counter = 0
            X_train_data = self.get_data('X_train')
            X_val_data = self.get_data('X_val')
            Y_train_data = self.get_data('Y_train')
            Y_val_data = self.get_data('Y_val')

            if self._latest_params is None:
                print("First optimising parameters over the training set.")
                self.optimise_parameters()

            if (len(feature_set) < 100):
                counter_check = 10
            elif (len(feature_set) < 1000):
                counter_check = 100
            elif (len(feature_set) < 2500):
                counter_check = 250
            elif (len(feature_set) < 5000):
                counter_check = 500
            else:
                counter_check = 1000

            for _features in feature_set[1:]:
                if (counter % counter_check == counter_check - 1):
                    print('-------------------Completed ', counter + 1,
                          ' feature sets out of ',
                          len(feature_set) - 1, '-------------------\n')
                X_train_data_temp = X_train_data[list(_features)]
                X_val_data_temp = X_val_data[list(_features)]
                feature_dict[counter] = list(_features)
                temp_model = xgboost.XGBRegressor(**self._latest_params)
                temp_model.fit(X_train_data_temp, Y_train_data)
                val_forecast = temp_model.predict(X_val_data_temp)
                val_rmse = _test_metric(Y_val_data, val_forecast,
                                        test_function)[0]
                list_results.append(val_rmse)
                counter += 1
            print(
                '-------------------Finished iterating through possible feature sets.-------------------\n'
            )
            results_df = pd.DataFrame({'val_error': list_results})
            lowest_val_error = results_df.sort_values(['val_error'])
            index = lowest_val_error.index
            self.best_features = feature_dict[index[0]]
            X_train_data_temp = X_train_data[feature_dict[index[0]]]
            X_val_data_temp = X_val_data[feature_dict[index[0]]]
            temp_model = xgboost.XGBRegressor(**self._latest_params)
            temp_model.fit(X_train_data_temp, Y_train_data)
            val_forecast = temp_model.predict(X_val_data_temp)
            final_rmse = _test_metric(Y_val_data, val_forecast, test_function)
            print('Lowest Error on validation set with feature set: ',
                  feature_dict[index[0]], '\n\n')
            print(
                'Set best_features attribute to this set. With this choice, the following regression results were obtained on the validation data:\n\n'
            )
            print('The RMSE on the validation set was: ', final_rmse[0])
            print('The mean percentage error is: ', final_rmse[1], '%.')
            print(
                '\nFinished feature selection. To see list of best_features, call get_best_features() on your classifier. To access the regression parameters, call get_latest_params()'
            )

        except TypeError as te:
            print(te.args[0])
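feature_selection() relies on a powerset helper that is not shown in these examples. The standard itertools recipe matches how it is used here (feature_set[0] is the empty tuple, which the loop skips via feature_set[1:]):

    from itertools import chain, combinations

    def powerset(iterable):
        """powerset(['a', 'b']) -> (), ('a',), ('b',), ('a', 'b')"""
        s = list(iterable)
        return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))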
Example #7
    def train(self,
              features=None,
              C=None,
              epsilon=None,
              kernel='rbf',
              gamma=None,
              degree=2):
        """
        Train the model on a chosen set of features. If none are chosen, the default is to re-run the model with the current best_features attribute. Note that the training is carried out on the training data, X_train, only. To access the result, call get_model().

        Args:
            features : list - train model with list of desired features
            C : float - hyperparameter to control the penalty on breaking the epsilon bound
            epsilon : float - hyperparameter to control the buffer zone around the true value
            kernel : str - choice of kernel for the SVR, 'rbf' is the default, another suggested choice is a degree 2 polynomial ('poly')
            gamma : float - hyperparamter for the 'rbf' kernel
            degree : int - only for the 'poly' kernel, defines the degree of the polynomial

        Returns:

        Note: The input data for the SVR should be normalised such that each feature lies in the same range e.g. [0,1] to ensure that no one feature dominates. Non-normalised data will result in potentially very poor results.
        """
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        print("Training - Support Vector Regression Classifier")
        print("===============================================")
        print(" ")
        print("Running support vector regression classifier on feature set:")
        print(" ")
        if features is None:
            features = self.get_best_features()
        pprint.pprint(features)
        print(" ")
        self._current_features = features
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        X_test_data = self.get_data('X_test')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        Y_test_data = self.get_data('Y_test')
        X_train_data_temp = X_train_data[features]
        X_val_data_temp = X_val_data[features]
        X_test_data_temp = X_test_data[features]
        if C is None:
            C = max(np.abs(Y_train_data.mean() + 3 * Y_train_data.std()),
                    np.abs(Y_train_data.mean() - 3 * Y_train_data.std()))
        if epsilon is None:
            epsilon = Y_train_data.std() / len(Y_train_data)
        if gamma is None:
            gamma = 0.3**(1 / len(features))

        svr_model = sklearn.svm.SVR(
            C=C,
            epsilon=epsilon,
            kernel=kernel,
            gamma=gamma,
            degree=degree,
            verbose=False,
        )
        svr_model.fit(X_train_data_temp, Y_train_data)
        self._model = svr_model
        Y_val_pred = svr_model.predict(X_val_data_temp)
        Y_test_pred = svr_model.predict(X_test_data_temp)
        print("Training r-squared:",
              svr_model.score(X_train_data_temp, Y_train_data))
        print("Validation r-squared:",
              svr_model.score(X_val_data_temp, Y_val_data))
        print("Testing r-squared:",
              svr_model.score(X_test_data_temp, Y_test_data))
        final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
        self._val_rmse = final_rmse_val
        final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')
        print(' ')
        print('The RMSE on the validation set was: ', final_rmse_val[0])
        print('The mean percentage error is: ', final_rmse_val[1], '%.')
        print(' ')
        print('The RMSE on the test set was: ', final_rmse_test[0])
        print('The mean percentage error is: ', final_rmse_test[1], '%.')
        print(
            '\nFinished training. To access the most recent classifier, call get_model()'
        )
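Example #7's defaults are simple data-driven heuristics: C spans roughly the target's mean plus or minus three standard deviations, epsilon shrinks with the sample size, and gamma decays towards 1 as the feature count grows. A quick self-contained illustration on made-up data:

    import numpy as np
    import pandas as pd

    Y_train = pd.Series(np.random.default_rng(0).normal(loc=10, scale=2, size=200))
    features = ['a', 'b', 'c']

    C = max(np.abs(Y_train.mean() + 3 * Y_train.std()),
            np.abs(Y_train.mean() - 3 * Y_train.std()))
    epsilon = Y_train.std() / len(Y_train)
    gamma = 0.3 ** (1 / len(features))
    print(C, epsilon, gamma)  # roughly 16, 0.01, 0.67 for this data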
Example #8
    def train(self, features=None, params=None):
        """
        Train the model on a chosen set of features. If none are chosen, the default is to re-run the model with the current best_features attribute. Note that the training is carried out on the training data, X_train, only. To access the result, call get_model().

        Args:
            features : list - train model with list of desired features
            params : dictionary - { n_estimators : n_estimators,
                                    max_features : max_features,
                                    max_depth : max_depth,
                                    min_samples_split : min_samples_split,
                                    min_samples_leaf : min_samples_leaf,
                                    bootstrap : bootstrap }

        Returns:

        Note: The input data for the RFR should be normalised such that each feature lies in the same range e.g. [0,1] to ensure that no one feature dominates. Non-normalised data will result in potentially very poor results.
        """
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        print("Training - Random Forest Regression Classifier")
        print("==============================================")
        print(" ")
        print("Running random forest regression classifier on feauture set:")
        print(" ")
        if features is None:
            features = self.get_best_features()
        pprint.pprint(features)
        print(" ")
        self._current_features = features
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        X_test_data = self.get_data('X_test')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        Y_test_data = self.get_data('Y_test')
        X_train_data_temp = X_train_data[features]
        X_val_data_temp = X_val_data[features]
        X_test_data_temp = X_test_data[features]
        if params is None:
            params = self._latest_params

        rfr_model = RandomForestRegressor(**params)
        rfr_model.fit(X_train_data_temp, Y_train_data)
        self._model = rfr_model
        Y_val_pred = rfr_model.predict(X_val_data_temp)
        Y_test_pred = rfr_model.predict(X_test_data_temp)
        print("Training r-squared:",
              rfr_model.score(X_train_data_temp, Y_train_data))
        print("Validation r-squared:",
              rfr_model.score(X_val_data_temp, Y_val_data))
        print("Testing r-squared:",
              rfr_model.score(X_test_data_temp, Y_test_data))
        final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
        self._val_rmse = final_rmse_val
        final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')
        print(' ')
        print('The RMSE on the validation set was: ', final_rmse_val[0])
        print('The mean percentage error is: ', final_rmse_val[1], '%.')
        print(' ')
        print('The RMSE on the test set was: ', final_rmse_test[0])
        print('The mean percentage error is: ', final_rmse_test[1], '%.')
        print(
            '\nFinished training. To access the most recent classifier, call get_model()'
        )
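As in Example #5, the params dictionary here is unpacked straight into the underlying sklearn estimator. A minimal sketch of a dictionary matching the docstring's keys (the values are illustrative, not recommendations):

    from sklearn.ensemble import RandomForestRegressor

    params = {'n_estimators': 200,
              'max_features': 'sqrt',
              'max_depth': 10,
              'min_samples_split': 2,
              'min_samples_leaf': 1,
              'bootstrap': True}
    rfr_model = RandomForestRegressor(**params)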
Example #9
    def feature_selection(self, test_function='rmse'):
        """
        Runs through a feature selection algorithm which enumerates the possible subsets of the input features and attempts to minimise the test_metric error on the validation set after training the classifier on the training data. Updates the self.best_features attribute which can then be used to run the full model on the training and test data. This is only really appropriate for a relatively small number of features.

        Args:
            test_function : string - default is 'rmse', but other metrics are available; see _test_metric? for more information

        Note: the input data for the features must be in the form of a pd.DataFrame
        """
        try:
            if not isinstance(self.get_data('X_train'), pd.DataFrame):
                raise TypeError(
                    "ERROR: The input training data was not in the form of a pd.DataFrame."
                )
            feature_set = list(powerset(self.input_features))
            print("Feature Selection")
            print("=================")
            print(" ")
            print("Running feature selection on a feature set of size: ",
                  len(feature_set) - 1)
            print(" ")
            feature_dict = {}
            list_results = []
            counter = 0
            X_train_data = self.get_data('X_train')
            X_val_data = self.get_data('X_val')
            Y_train_data = self.get_data('Y_train')
            Y_val_data = self.get_data('Y_val')

            if (len(feature_set) < 100):
                counter_check = 10
            elif (len(feature_set) < 1000):
                counter_check = 100
            elif (len(feature_set) < 2500):
                counter_check = 250
            elif (len(feature_set) < 5000):
                counter_check = 500
            else:
                counter_check = 1000

            for _features in feature_set[1:]:
                if (counter % counter_check == counter_check - 1):
                    print('-------------------Completed ', counter + 1,
                          ' feature sets out of ',
                          len(feature_set) - 1, '-------------------\n')
                X_train_data_temp = X_train_data[list(_features)]
                X_val_data_temp = X_val_data[list(_features)]
                feature_dict[counter] = list(_features)
                lin_model = sm.OLS(Y_train_data, X_train_data_temp)
                result = lin_model.fit()
                val_forecast = (X_val_data_temp * result.params).sum(axis=1)
                val_rmse = _test_metric(Y_val_data, val_forecast,
                                        test_function)[0]
                list_results.append(val_rmse)
                # train_rsquared = result.rsquared
                # list_results.append(train_rsquared)
                counter += 1
            print(
                '-------------------Finished iterating through possible feature sets.-------------------\n'
            )
            results_df = pd.DataFrame({'val_error': list_results})
            lowest_val_error = results_df.sort_values(['val_error'])
            index = lowest_val_error.index
            # test_rsquared_df = pd.DataFrame({'test_rsquared': list_results})
            # highest_test_rsquared = test_rsquared_df.sort_values(['test_rsquared'], ascending=False)
            # index = highest_test_rsquared.index
            X_train_data_temp = X_train_data[feature_dict[index[0]]]
            X_val_data_temp = X_val_data[feature_dict[index[0]]]
            lin_model = sm.OLS(Y_train_data, X_train_data_temp)
            result = lin_model.fit()
            val_forecast = (X_val_data_temp * result.params).sum(axis=1)
            final_rmse = _test_metric(Y_val_data, val_forecast, test_function)
            print('Lowest Error on validation set with feature set: ',
                  feature_dict[index[0]], '\n\n')
            print(
                'Set best_features attribute to this set. With this choice, the following regression results were obtained on the validation data:\n\n'
            )
            self.best_features = feature_dict[index[0]]
            self.__result = result
            self._params = result.params
            self._was_regularised = False
            print(result.summary(), '\n\n')
            print('The RMSE on the validation set was: ', final_rmse[0])
            print('The mean percentage error is: ', final_rmse[1], '%.')
            print(
                '\nFinished feature selection. To see list of best_features, call get_best_features() on your classifier. To access the regression parameters, call get_latest_params()'
            )

        except TypeError as te:
            print(te.args[0])
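The manual forecast used in Example #9, (X * result.params).sum(axis=1), is equivalent to statsmodels' own result.predict(X). A small self-contained check on synthetic data:

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(30, 2)), columns=['a', 'b'])
    y = 1.5 * X['a'] - 0.5 * X['b'] + rng.normal(scale=0.1, size=30)

    result = sm.OLS(y, X).fit()
    manual = (X * result.params).sum(axis=1)
    assert np.allclose(manual, result.predict(X))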
Example #10
    def train(self, features=None, regularised=False, alpha=0.5, l1_wt=0.0):
        """
        Train the model on a chosen set of features. If none are chosen, the default is to re-run the model with the current best_features attribute. Note that the training is carried out on the training data, X_train, only. To access the result, use:

            clf.get_result()

        Args:
            features : list - train model with list of desired features
            regularised : bool - if True, a regularised fit is applied; note also that data normalisation is carried out to run this method
            alpha : float - penalty weight in the regularised case
            l1_wt : float - float between 0 and 1; if 0 the fit is a ridge fit (default), if 1 it is a lasso fit

        Returns:
            result : sm.OLS.fit() object containing regression results

        Note: This also updates the clf.get_latest_params() attribute.
        """
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        print("Training - Linear Regression Classifier")
        print("=======================================")
        print(" ")
        print("Running linear regression classifier on feauture set:")
        print(" ")
        if features is None:
            features = self.get_best_features()
        pprint.pprint(features)
        print(" ")
        self._current_features = features
        X_train_data = self.get_data('X_train')
        X_test_data = self.get_data('X_test')
        Y_train_data = self.get_data('Y_train')
        Y_test_data = self.get_data('Y_test')
        X_train_data_temp = X_train_data[features]
        X_test_data_temp = X_test_data[features]
        if not regularised:
            print("Regularisation not employed.")
            print(" ")
            lin_model = sm.OLS(Y_train_data, X_train_data_temp)
            result = lin_model.fit()
            test_forecast = (X_test_data_temp * result.params).sum(axis=1)
            final_rmse = _test_metric(Y_test_data, test_forecast, "rmse")
            self.__result = result
            self._params = result.params
            print(result.summary(), '\n\n')
            print('The RMSE on the test set was: ', final_rmse[0])
            print('The mean percentage error is: ', final_rmse[1], '%.')
            print(
                '\nFinished training. To see full set of results, call get_result(). To access the regression parameters, call get_latest_params()'
            )
            self._was_regularised = False
        else:
            lin_model = sm.OLS(Y_train_data, X_train_data_temp)
            result = lin_model.fit_regularized(alpha=alpha, L1_wt=l1_wt)
            test_forecast = (X_test_data_temp * result.params).sum(axis=1)
            final_rmse = _test_metric(Y_test_data, test_forecast, "rmse")
            self.__result = None
            self._params = result.params
            print(' ')
            pprint.pprint(result.params)
            print('The RMSE on the test set was: ', final_rmse[0])
            print('The mean percentage error is: ', final_rmse[1], '%.')
            print(
                '\nFinished training. To see full set of results, call get_result(). To access the regression parameters, call get_latest_params()'
            )
            self._was_regularised = True
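The regularised branch in Example #10 calls statsmodels' fit_regularized, where L1_wt interpolates between a ridge fit (0.0) and a lasso fit (1.0) under penalty weight alpha. A self-contained sketch on synthetic data:

    import numpy as np
    import pandas as pd
    import statsmodels.api as sm

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(40, 3)), columns=['a', 'b', 'c'])
    y = 2.0 * X['a'] + rng.normal(scale=0.1, size=40)

    ridge = sm.OLS(y, X).fit_regularized(alpha=0.5, L1_wt=0.0)
    lasso = sm.OLS(y, X).fit_regularized(alpha=0.5, L1_wt=1.0)
    print(ridge.params)  # coefficients shrunk towards zero
    print(lasso.params)  # lasso can drive coefficients exactly to zero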