def test_error(self, test_function='rmse'):
    """
    Score the current model on the test set with the chosen metric.

    The test features are restricted to self._current_features, passed
    through the stored model's predict() and converted with
    nn_out_to_list before being handed to _test_metric. The metric result
    is stored in self._test_error and its first two entries are printed.

    Args:
        test_function : string - metric name forwarded to _test_metric
                        (see _test_metric for the available options)
    """
    targets = self.get_data('Y_test')
    inputs = self.get_data('X_test')

    # Only the columns the model was trained on are fed to the network.
    forecast = nn_out_to_list(
        self._model.predict(inputs[self._current_features])
    )

    metric = _test_metric(targets, forecast, test_function)
    self._test_error = metric

    print('The RMSE on the test set was: ', metric[0])
    print('The mean percentage error is: ', metric[1], '%.')
    print('\nTo access the results, call get_test_error()')
def test_error(self, test_function='rmse'):
    """
    Score the current linear model on the test set with the chosen metric.

    The forecast is the row-wise sum of the test features multiplied by the
    parameters returned from self.get_latest_params(). The metric result is
    stored in self._test_error and its first two entries are printed; an
    extra note is printed when self._was_regularised is truthy.

    Args:
        test_function : string - metric name forwarded to _test_metric
                        (see _test_metric for the available options)
    """
    targets = self.get_data('Y_test')
    inputs = self.get_data('X_test')

    # Linear prediction: weight every feature column, then add across columns.
    weighted_inputs = inputs * self.get_latest_params()
    forecast = weighted_inputs.sum(axis=1)

    metric = _test_metric(targets, forecast, test_function)
    self._test_error = metric

    print('The RMSE on the test set was: ', metric[0])
    if self._was_regularised:
        for note_line in (' ', 'NOTE: The model was regularised.', ' '):
            print(note_line)
    print('The mean percentage error is: ', metric[1], '%.')
    print('\nTo access the results, call get_test_error()')
def train(self, features=None):
    """
    Fit a SARIMAX model on a chosen set of exogenous features.

    If no features are given, the result of self.get_best_features() is
    used. The model is fitted on the training and validation data
    concatenated together (targets as endog, selected features as exog)
    with order (self.p, self.d, self.q), and then forecasts over the
    length of the test set.

    Side effects: sets self._current_features, self._model (the fitted
    results object) and self._test_error (the 'rmse' metric on the test
    set), and prints a short report.

    Args:
        features : list - train model with list of desired features

    Raises:
        TypeError : if the stored X_train data is not a pd.DataFrame
    """
    if not isinstance(self.get_data('X_train'), pd.DataFrame):
        raise TypeError("ERROR: The input training data was not in the form of a pd.DataFrame.")

    print(' ')
    print("Training - ARIMAX")
    print("=================")
    print(" ")
    print("Running ARIMAX model on feauture set:")
    print(" ")

    # Identity check: `features == None` is evaluated element-wise for
    # array-like inputs and then fails inside the `if`.
    if features is None:
        features = self.get_best_features()
    pprint.pprint(features)
    print(" ")
    self._current_features = features

    X_train_data = self.get_data('X_train')
    X_val_data = self.get_data('X_val')
    X_test_data = self.get_data('X_test')
    Y_train_data = self.get_data('Y_train')
    Y_val_data = self.get_data('Y_val')
    Y_test_data = self.get_data('Y_test')

    X_train_data_temp = X_train_data[features]
    X_val_data_temp = X_val_data[features]
    X_test_data_temp = X_test_data[features]

    # Validation rows are appended to the training rows before fitting.
    model = SARIMAX(
        endog=pd.concat([Y_train_data, Y_val_data]),
        exog=pd.concat([X_train_data_temp, X_val_data_temp]),
        order=(self.p, self.d, self.q),
    )
    model_fit = model.fit(disp=0)
    self._model = model_fit

    # The exogenous test block is passed as a plain 2-D array of shape
    # (number of test rows, number of selected features).
    n_test = len(Y_test_data)
    test_exog = np.array(X_test_data_temp).reshape(
        n_test, len(X_test_data_temp.columns))
    Y_test_pred = model_fit.forecast(n_test, exog=test_exog)

    final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')
    self._test_error = final_rmse_test

    print(' ')
    print('The RMSE on the test set was: ', final_rmse_test[0])
    print('The mean percentage error is: ', final_rmse_test[1], '%.')
    print('\nFinished training. To access the most recent classifier, call get_model()')
def train(self,
          features=None,
          epochs=500,
          units=64,
          activation='relu',
          dropout=0.0,
          batch_size=16,
          save=True,
          load=False,
          namespace='SLP_model'):
    """
    Train (or load) a single-hidden-layer perceptron on a chosen feature set.

    If no features are given, the result of self.get_best_features() is
    used. When `load` is False a Sequential network (Dense hidden layer,
    Dropout, single-unit Dense output) is compiled with mean squared error
    loss and the 'adam' optimiser and fitted on the training data, with the
    validation data passed as validation_data. When `load` is True the
    architecture, weights and training history are read back from disk
    instead.

    Side effects: sets self._current_features, self.history, self._model
    and self._val_rmse, optionally writes three files, and prints a report.

    Args:
        features : list - train model with list of desired features
        epochs : int - number of epochs to train for
        units : int - number of units in hidden layer
        activation : str - activation function name passed to the hidden
                     Dense layer, e.g. 'relu', 'sigmoid'
        dropout : float - dropout rate from hidden layer to output
        batch_size : int - batch size for training
        save : bool - if True (and load is False) write the history, the
               model JSON and the weights to disk after training
        load : bool - if True, skip training and load the model from files
        namespace : str - file name stem; the files used are
                    namespace + '.txt' (pickled history),
                    namespace + '.json' (architecture) and
                    namespace + '.h5' (weights)

    Raises:
        TypeError : if the stored X_train data is not a pd.DataFrame

    Note:
        A RuntimeError raised inside the train/load/evaluate section
        (including the one raised when the saved files cannot be opened) is
        caught and only its first argument is printed; the method then
        returns normally.

        The input data for the SLP should be normalised such that each
        feature lies in the same range e.g. [0,1] to ensure that no one
        feature dominates.
    """
    if not isinstance(self.get_data('X_train'), pd.DataFrame):
        raise TypeError(
            "ERROR: The input training data was not in the form of a pd.DataFrame."
        )

    print("Training - Single Layer Perceptron")
    print("==================================")
    print(" ")
    print("Running SLP regression classifier on feauture set:")
    print(" ")

    # Identity check: `features == None` is evaluated element-wise for
    # array-like inputs and then fails inside the `if`.
    if features is None:
        features = self.get_best_features()
    pprint.pprint(features)
    print(" ")
    self._current_features = features

    X_train_data = self.get_data('X_train')
    X_val_data = self.get_data('X_val')
    X_test_data = self.get_data('X_test')
    Y_train_data = self.get_data('Y_train')
    Y_val_data = self.get_data('Y_val')
    Y_test_data = self.get_data('Y_test')

    X_train_data_temp = X_train_data[features]
    X_val_data_temp = X_val_data[features]
    X_test_data_temp = X_test_data[features]

    try:
        if not load:
            input_dim = len(X_train_data_temp.columns)
            nn_model = Sequential()
            nn_model.add(
                Dense(units=units, activation=activation, input_dim=input_dim))
            nn_model.add(Dropout(dropout))
            nn_model.add(Dense(units=1))
            nn_model.compile(loss='mean_squared_error', optimizer='adam')

            history = nn_model.fit(X_train_data_temp,
                                   Y_train_data,
                                   epochs=epochs,
                                   verbose=1,
                                   batch_size=batch_size,
                                   validation_data=(X_val_data_temp, Y_val_data),
                                   shuffle=False)
            self.history = history.history

            if save:
                # `with` guarantees the handles are closed even if the
                # dump/write fails part-way through.
                with open(namespace + '.txt', 'wb') as history_file:
                    pickle.dump(history.history, history_file)
                with open(namespace + '.json', 'w') as json_file:
                    json_file.write(nn_model.to_json())
                nn_model.save_weights(namespace + '.h5')
                print("Saved Model to namespace: ", namespace)
        else:
            try:
                with open(namespace + '.json', 'r') as json_file:
                    nn_model = model_from_json(json_file.read())
                nn_model.load_weights(namespace + '.h5')
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; only load history files that this code produced.
                with open(namespace + '.txt', 'rb') as history_file:
                    history = pickle.load(history_file)
                self.history = history
                print("Loaded Model from namespace: ", namespace)
            except (OSError, IOError):
                print("ERROR: Model not found.")
                raise RuntimeError("Now exiting training.")

        self._model = nn_model

        Y_val_pred = nn_out_to_list(nn_model.predict(X_val_data_temp))
        Y_test_pred = nn_out_to_list(nn_model.predict(X_test_data_temp))

        final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
        self._val_rmse = final_rmse_val
        final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')

        print(' ')
        print('The RMSE on the validation set was: ', final_rmse_val[0])
        print('The mean percentage error is: ', final_rmse_val[1], '%.')
        print(' ')
        print('The RMSE on the test set was: ', final_rmse_test[0])
        print('The mean percentage error is: ', final_rmse_test[1], '%.')
        print(
            '\nFinished training. To access the most recent classifier, call get_model(). To access the training history, use get_history().'
        )
    except RuntimeError as run_err:
        # Best-effort behaviour kept from the original: report and return.
        print(run_err.args[0])
def train(self, features=None, params=None):
    """
    Train an xgboost.XGBRegressor on a chosen set of features.

    If no features are given, the result of self.get_best_features() is
    used. If no params are given, self._latest_params is used. The model is
    fitted on the training data only; validation and test data are used
    for reporting.

    Side effects: sets self._current_features, self._model and
    self._val_rmse, and prints r-squared scores plus the 'rmse' metric for
    the validation and test sets.

    Args:
        features : list - train model with list of desired features
        params : dictionary - keyword arguments for XGBRegressor, e.g:
                 {'nthread': 4,
                  'objective': 'reg:linear',
                  'learning_rate': 0.02,
                  'max_depth': 10,
                  'min_child_weight': 4,
                  'silent': 1,
                  'subsample': 0.7,
                  'colsample_bytree': 0.7,
                  'n_estimators': 200}

    Raises:
        TypeError : if the stored X_train data is not a pd.DataFrame

    Note:
        The original note for this method recommends normalising the input
        data such that each feature lies in the same range e.g. [0,1].
        The fallback self._latest_params must be a mapping, because it is
        unpacked with ** into the regressor.
    """
    if not isinstance(self.get_data('X_train'), pd.DataFrame):
        raise TypeError(
            "ERROR: The input training data was not in the form of a pd.DataFrame."
        )

    print("Training - XGBoost Regression Classifier")
    print("==============================================")
    print(" ")
    print("Running XGBoost regression classifier on feauture set:")
    print(" ")

    # Identity check: `features == None` is evaluated element-wise for
    # array-like inputs and then fails inside the `if`.
    if features is None:
        features = self.get_best_features()
    pprint.pprint(features)
    print(" ")
    self._current_features = features

    X_train_data = self.get_data('X_train')
    X_val_data = self.get_data('X_val')
    X_test_data = self.get_data('X_test')
    Y_train_data = self.get_data('Y_train')
    Y_val_data = self.get_data('Y_val')
    Y_test_data = self.get_data('Y_test')

    X_train_data_temp = X_train_data[features]
    X_val_data_temp = X_val_data[features]
    X_test_data_temp = X_test_data[features]

    if params is None:
        params = self._latest_params

    xgb_model = xgboost.XGBRegressor(**params)
    xgb_model.fit(X_train_data_temp, Y_train_data)
    self._model = xgb_model

    Y_val_pred = xgb_model.predict(X_val_data_temp)
    Y_test_pred = xgb_model.predict(X_test_data_temp)

    print("Training r-squared:",
          xgb_model.score(X_train_data_temp, Y_train_data))
    print("Validation r-squared:",
          xgb_model.score(X_val_data_temp, Y_val_data))
    print("Testing r-squared:",
          xgb_model.score(X_test_data_temp, Y_test_data))

    final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
    self._val_rmse = final_rmse_val
    final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')

    print(' ')
    print('The RMSE on the validation set was: ', final_rmse_val[0])
    print('The mean percentage error is: ', final_rmse_val[1], '%.')
    print(' ')
    print('The RMSE on the test set was: ', final_rmse_test[0])
    print('The mean percentage error is: ', final_rmse_test[1], '%.')
    print(
        '\nFinished training. To access the most recent classifier, call get_model()'
    )
def feature_selection(self, test_function='rmse'):
    """
    Exhaustively search feature subsets for the lowest validation error.

    Every non-empty subset produced by powerset(self.input_features) is
    used to fit an xgboost.XGBRegressor (built from self._latest_params) on
    the training data; the subset whose prediction gives the smallest
    first entry of _test_metric on the validation data is stored in
    self.best_features. The model is then refitted on that subset and its
    validation metric is printed.

    The number of subsets doubles with every input feature, so this is
    only really appropriate for a relatively small number of features.
    No hyperparameter search is done per subset: if self._latest_params is
    None, self.optimise_parameters() is called once up front.

    Args:
        test_function : string - metric name forwarded to _test_metric
                        (see _test_metric for the available options)

    Note:
        The input data for the features must be in the form of a
        pd.DataFrame. Any TypeError raised inside this method (including
        the one raised for non-DataFrame input) is caught and only its
        first argument is printed.
    """
    try:
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )

        feature_set = list(powerset(self.input_features))
        # The first powerset entry is skipped below, hence the "- 1".
        n_candidates = len(feature_set) - 1

        print("Feature Selection")
        print("=================")
        print(" ")
        print("Running feature selection on a feature set of size: ",
              n_candidates)
        print(" ")

        feature_dict = {}
        list_results = []

        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')

        if self._latest_params is None:
            # Announce before the (potentially slow) optimisation starts.
            print("First optimising parameters over training set.")
            self.optimise_parameters()

        # Progress is reported roughly ten times over the whole search.
        if len(feature_set) < 100:
            counter_check = 10
        elif len(feature_set) < 1000:
            counter_check = 100
        elif len(feature_set) < 2500:
            counter_check = 250
        elif len(feature_set) < 5000:
            counter_check = 500
        else:
            counter_check = 1000

        for counter, _features in enumerate(feature_set[1:]):
            if counter % counter_check == counter_check - 1:
                print('-------------------Completed ', counter + 1,
                      ' feature sets out of ', n_candidates,
                      '-------------------\n')

            columns = list(_features)
            feature_dict[counter] = columns

            temp_model = xgboost.XGBRegressor(**self._latest_params)
            temp_model.fit(X_train_data[columns], Y_train_data)
            val_forecast = temp_model.predict(X_val_data[columns])
            list_results.append(
                _test_metric(Y_val_data, val_forecast, test_function)[0])

        print(
            '-------------------Finished iterating through possible feature sets.-------------------\n'
        )

        # Sort the per-subset errors ascending; the first index is the winner.
        test_mse_df = pd.DataFrame({'test_mse': list_results})
        index = test_mse_df.sort_values(['test_mse']).index
        best_columns = feature_dict[index[0]]
        self.best_features = best_columns

        # Refit on the winning subset to report the full metric tuple.
        temp_model = xgboost.XGBRegressor(**self._latest_params)
        temp_model.fit(X_train_data[best_columns], Y_train_data)
        val_forecast = temp_model.predict(X_val_data[best_columns])
        final_rmse = _test_metric(Y_val_data, val_forecast, test_function)

        print('Lowest Error on validation set with feature set: ',
              best_columns, '\n\n')
        print(
            'Set best_features attribute to this set. With this choice, the following regression results were obtained on the training data:\n\n'
        )
        print('The RMSE on the validation set was: ', final_rmse[0])
        print('The mean percentage error is: ', final_rmse[1], '%.')
        print(
            '\nFinished feature selection. To see list of best_features, call get_best_features() on your classifier. To access the regression parameters, call get_latest_params()'
        )
    except TypeError as te:
        print(te.args[0])
def train(self,
          features=None,
          C=None,
          epsilon=None,
          kernel='rbf',
          gamma=None,
          degree=2):
    """
    Train an sklearn.svm.SVR on a chosen set of features.

    If no features are given, the result of self.get_best_features() is
    used. The model is fitted on the training data only; validation and
    test data are used for reporting.

    Side effects: sets self._current_features, self._model and
    self._val_rmse, and prints r-squared scores plus the 'rmse' metric for
    the validation and test sets.

    Args:
        features : list - train model with list of desired features
        C : float - penalty on breaking the epsilon bound; defaults to
            max(|mean + 3*std|, |mean - 3*std|) of the training targets
        epsilon : float - buffer zone around the true value; defaults to
                  std(training targets) / len(training targets)
        kernel : str - kernel for the SVR, 'rbf' is the default, another
                 suggested choice is a degree 2 polynomial ('poly')
        gamma : float - kernel coefficient; defaults to
                0.3 ** (1 / len(features))
        degree : int - only for the 'poly' kernel, defines the degree of
                 the polynomial

    Raises:
        TypeError : if the stored X_train data is not a pd.DataFrame

    Note:
        The input data for the SVR should be normalised such that each
        feature lies in the same range e.g. [0,1] to ensure that no one
        feature dominates.
    """
    if not isinstance(self.get_data('X_train'), pd.DataFrame):
        raise TypeError(
            "ERROR: The input training data was not in the form of a pd.DataFrame."
        )

    print("Training - Support Vector Regression Classifier")
    print("===============================================")
    print(" ")
    print(
        "Running support vector classifier regression classifier on feauture set:"
    )
    print(" ")

    # Identity check: `features == None` is evaluated element-wise for
    # array-like inputs and then fails inside the `if`.
    if features is None:
        features = self.get_best_features()
    pprint.pprint(features)
    print(" ")
    self._current_features = features

    X_train_data = self.get_data('X_train')
    X_val_data = self.get_data('X_val')
    X_test_data = self.get_data('X_test')
    Y_train_data = self.get_data('Y_train')
    Y_val_data = self.get_data('Y_val')
    Y_test_data = self.get_data('Y_test')

    X_train_data_temp = X_train_data[features]
    X_val_data_temp = X_val_data[features]
    X_test_data_temp = X_test_data[features]

    # Data-driven defaults for any hyperparameter the caller left unset.
    if C is None:
        target_mean = Y_train_data.mean()
        target_spread = 3 * Y_train_data.std()
        C = max(np.abs(target_mean + target_spread),
                np.abs(target_mean - target_spread))
    if epsilon is None:
        epsilon = Y_train_data.std() / len(Y_train_data)
    if gamma is None:
        gamma = 0.3**(1 / len(features))

    svr_model = sklearn.svm.SVR(
        C=C,
        epsilon=epsilon,
        kernel=kernel,
        gamma=gamma,
        degree=degree,
        verbose=False,
    )
    svr_model.fit(X_train_data_temp, Y_train_data)
    self._model = svr_model

    Y_val_pred = svr_model.predict(X_val_data_temp)
    Y_test_pred = svr_model.predict(X_test_data_temp)

    print("Training r-squared:",
          svr_model.score(X_train_data_temp, Y_train_data))
    print("Validation r-squared:",
          svr_model.score(X_val_data_temp, Y_val_data))
    print("Testing r-squared:",
          svr_model.score(X_test_data_temp, Y_test_data))

    final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
    self._val_rmse = final_rmse_val
    final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')

    print(' ')
    print('The RMSE on the validation set was: ', final_rmse_val[0])
    print('The mean percentage error is: ', final_rmse_val[1], '%.')
    print(' ')
    print('The RMSE on the test set was: ', final_rmse_test[0])
    print('The mean percentage error is: ', final_rmse_test[1], '%.')
    print(
        '\nFinished training. To access the most recent classifier, call get_model()'
    )
def train(self, features=None, params=None):
    """
    Train a RandomForestRegressor on a chosen set of features.

    If no features are given, the result of self.get_best_features() is
    used. If no params are given, self._latest_params is used. The model is
    fitted on the training data only; validation and test data are used
    for reporting.

    Side effects: sets self._current_features, self._model and
    self._val_rmse, and prints r-squared scores plus the 'rmse' metric for
    the validation and test sets.

    Args:
        features : list - train model with list of desired features
        params : dictionary - keyword arguments for RandomForestRegressor:
                 { n_estimators : n_estimators,
                   max_features : max_features,
                   max_depth : max_depth,
                   min_samples_split : min_samples_split,
                   min_samples_leaf : min_samples_leaf,
                   bootstrap : bootstrap }

    Raises:
        TypeError : if the stored X_train data is not a pd.DataFrame

    Note:
        The original note for this method recommends normalising the input
        data for the RFR such that each feature lies in the same range
        e.g. [0,1]. The fallback self._latest_params must be a mapping,
        because it is unpacked with ** into the regressor.
    """
    if not isinstance(self.get_data('X_train'), pd.DataFrame):
        raise TypeError(
            "ERROR: The input training data was not in the form of a pd.DataFrame."
        )

    print("Training - Random Forest Regression Classifier")
    print("==============================================")
    print(" ")
    print("Running random forest regression classifier on feauture set:")
    print(" ")

    # Identity check: `features == None` is evaluated element-wise for
    # array-like inputs and then fails inside the `if`.
    if features is None:
        features = self.get_best_features()
    pprint.pprint(features)
    print(" ")
    self._current_features = features

    X_train_data = self.get_data('X_train')
    X_val_data = self.get_data('X_val')
    X_test_data = self.get_data('X_test')
    Y_train_data = self.get_data('Y_train')
    Y_val_data = self.get_data('Y_val')
    Y_test_data = self.get_data('Y_test')

    X_train_data_temp = X_train_data[features]
    X_val_data_temp = X_val_data[features]
    X_test_data_temp = X_test_data[features]

    if params is None:
        params = self._latest_params

    rfr_model = RandomForestRegressor(**params)
    rfr_model.fit(X_train_data_temp, Y_train_data)
    self._model = rfr_model

    Y_val_pred = rfr_model.predict(X_val_data_temp)
    Y_test_pred = rfr_model.predict(X_test_data_temp)

    print("Training r-squared:",
          rfr_model.score(X_train_data_temp, Y_train_data))
    print("Validation r-squared:",
          rfr_model.score(X_val_data_temp, Y_val_data))
    print("Testing r-squared:",
          rfr_model.score(X_test_data_temp, Y_test_data))

    final_rmse_val = _test_metric(Y_val_data, Y_val_pred, 'rmse')
    self._val_rmse = final_rmse_val
    final_rmse_test = _test_metric(Y_test_data, Y_test_pred, 'rmse')

    print(' ')
    print('The RMSE on the validation set was: ', final_rmse_val[0])
    print('The mean percentage error is: ', final_rmse_val[1], '%.')
    print(' ')
    print('The RMSE on the test set was: ', final_rmse_test[0])
    print('The mean percentage error is: ', final_rmse_test[1], '%.')
    print(
        '\nFinished training. To access the most recent classifier, call get_model()'
    )
def feature_selection(self, test_function='rmse'):
    """
    Exhaustively search feature subsets for the lowest validation error.

    Each non-empty subset from powerset(self.input_features) is used to fit
    an sm.OLS model on the training data. The validation forecast is the
    row-wise sum of the validation features times the fitted parameters,
    and subsets are ranked by the first entry of _test_metric. The winning
    subset is refitted and stored: self.best_features, the fit result, its
    parameters (self._params) and self._was_regularised = False.

    The number of subsets doubles with every input feature, so this is
    only really appropriate for a relatively small number of features.

    Args:
        test_function : string - metric name forwarded to _test_metric
                        (see _test_metric for the available options)

    Note:
        The input data for the features must be in the form of a
        pd.DataFrame. Any TypeError raised inside this method is caught and
        only its first argument is printed.
    """
    try:
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )

        all_subsets = list(powerset(self.input_features))
        total_subsets = len(all_subsets)

        for banner_line in ("Feature Selection", "=================", " "):
            print(banner_line)
        print("Running feature selection on a feature set of size: ",
              total_subsets - 1)
        print(" ")

        train_inputs = self.get_data('X_train')
        val_inputs = self.get_data('X_val')
        train_targets = self.get_data('Y_train')
        val_targets = self.get_data('Y_val')

        # Pick the progress-report interval from the size of the search.
        for upper_bound, interval in ((100, 10), (1000, 100), (2500, 250),
                                      (5000, 500)):
            if total_subsets < upper_bound:
                report_every = interval
                break
        else:
            report_every = 1000

        candidates = []
        errors = []
        # The first powerset entry is skipped.
        for position, subset in enumerate(all_subsets[1:]):
            if position % report_every == report_every - 1:
                print('-------------------Completed ', position + 1,
                      ' feature sets out of ', total_subsets - 1,
                      '-------------------\n')

            train_slice = train_inputs[list(subset)]
            val_slice = val_inputs[list(subset)]
            candidates.append(list(subset))

            fitted = sm.OLS(train_targets, train_slice).fit()
            forecast = (val_slice * fitted.params).sum(axis=1)
            errors.append(
                _test_metric(val_targets, forecast, test_function)[0])

        print(
            '-------------------Finished iterating through possible feature sets.-------------------\n'
        )

        # Rank the candidates by ascending validation error.
        ranking = pd.DataFrame({'test_mse': errors}).sort_values(
            ['test_mse']).index
        winner = candidates[ranking[0]]

        # Refit on the winning subset so the full result can be stored.
        train_slice = train_inputs[winner]
        val_slice = val_inputs[winner]
        result = sm.OLS(train_targets, train_slice).fit()
        forecast = (val_slice * result.params).sum(axis=1)
        final_rmse = _test_metric(val_targets, forecast, test_function)

        print('Lowest Error on validation set with feature set: ',
              candidates[ranking[0]], '\n\n')
        print(
            'Set best_features attribute to this set. With this choice, the following regression results were obtained on the training data:\n\n'
        )

        self.best_features = candidates[ranking[0]]
        self.__result = result
        self._params = result.params
        self._was_regularised = False

        print(result.summary(), '\n\n')
        print('The RMSE on the validation set was: ', final_rmse[0])
        print('The mean percentage error is: ', final_rmse[1], '%.')
        print(
            '\nFinished feature selection. To see list of best_features, call get_best_features() on your classifier. To access the regression parameters, call get_latest_params()'
        )
    except TypeError as type_err:
        print(type_err.args[0])
def train(self, features=None, regularised=False, alpha=0.5, l1_wt=0.0):
    """
    Train an sm.OLS linear model on a chosen set of features.

    If no features are given, the result of self.get_best_features() is
    used. The model is fitted on the training data only and scored with
    the 'rmse' metric on the test data.

    Side effects: sets self._current_features, self._params (the fitted
    parameters), the private result attribute (the fit result for a plain
    fit, None for a regularised fit) and self._was_regularised, and prints
    a report. Nothing is returned; the printed message directs the user to
    get_result() and get_latest_params().

    Args:
        features : list - train model with list of desired features
        regularised : bool - if True, fit with fit_regularized instead of
                      fit
        alpha : float - penalty weight in the regularised case
        l1_wt : float - passed to fit_regularized as L1_wt; float between
                0 and 1, if 0 the fit is a ridge fit (default), if 1, it
                is a lasso fit

    Raises:
        TypeError : if the stored X_train data is not a pd.DataFrame
    """
    if not isinstance(self.get_data('X_train'), pd.DataFrame):
        raise TypeError(
            "ERROR: The input training data was not in the form of a pd.DataFrame."
        )

    print("Training - Linear Regression Classifier")
    print("=======================================")
    print(" ")
    print("Running linear regression classifier on feauture set:")
    print(" ")

    # Identity check: `features == None` is evaluated element-wise for
    # array-like inputs and then fails inside the `if`.
    if features is None:
        features = self.get_best_features()
    pprint.pprint(features)
    print(" ")
    self._current_features = features

    X_train_data = self.get_data('X_train')
    X_test_data = self.get_data('X_test')
    Y_train_data = self.get_data('Y_train')
    Y_test_data = self.get_data('Y_test')

    X_train_data_temp = X_train_data[features]
    X_test_data_temp = X_test_data[features]

    if not regularised:
        print("Regularisation not employed.")
        print(" ")

    lin_model = sm.OLS(Y_train_data, X_train_data_temp)
    if regularised:
        result = lin_model.fit_regularized(alpha=alpha, L1_wt=l1_wt)
    else:
        result = lin_model.fit()

    # Linear prediction: weight every feature column, then add across columns.
    test_forecast = (X_test_data_temp * result.params).sum(axis=1)
    final_rmse = _test_metric(Y_test_data, test_forecast, "rmse")

    self._params = result.params
    if regularised:
        # The regularised fit result is not kept; only its parameters are.
        self.__result = None
        print(' ')
        pprint.pprint(result.params)
    else:
        self.__result = result
        print(result.summary(), '\n\n')

    # The score is computed on the test data in both branches; the
    # regularised branch previously mislabelled it as "validation set".
    print('The RMSE on the test set was: ', final_rmse[0])
    print('The mean percentage error is: ', final_rmse[1], '%.')
    print(
        '\nFinished training. To see full set of results, call get_result(). To access the regression parameters, call get_latest_params()'
    )
    self._was_regularised = bool(regularised)