def backward_selection(self, max_features, X_train, y_train):
    """
    Greedily prune features until only ``max_features`` of them remain.

    Starting from every column of ``X_train``, each round fits a decision
    tree once per remaining feature with that feature left out, scores the
    fit by the mean squared error on the training data itself, and drops
    the feature whose omission produced the lowest error.

    :param max_features: Number of features to keep.
    :param X_train: Training features as a DataFrame.
    :param y_train: Target values belonging to ``X_train``.
    :return: List with the names of the features that were kept.
    """
    # Begin with the complete feature set.
    selected_features = X_train.columns.tolist()
    learner = RegressionAlgorithms()
    evaluator = RegressionEvaluation()

    # One feature is dropped per round.
    rounds = len(X_train.columns) - max_features
    for _ in range(rounds):
        lowest_mse = sys.float_info.max
        worst_feature = ''

        for candidate in selected_features:
            # Trial selection: everything that is still in, minus the candidate.
            remaining = list(selected_features)
            remaining.remove(candidate)

            # The training data doubles as the "test" input; only the
            # predictions for the training rows are scored.
            train_prediction, _ = learner.decision_tree(
                X_train[remaining], y_train, X_train[remaining])
            mse = evaluator.mean_squared_error(y_train, train_prediction)

            # The lower the error without the candidate, the less it is needed.
            if mse < lowest_mse:
                lowest_mse = mse
                worst_feature = candidate

        # Discard the least useful feature of this round.
        selected_features.remove(worst_feature)

    return selected_features
def gridsearch_reservoir_computing(self, train_X, train_y, test_X, test_y,
                                   per_time_step=False, error='mse',
                                   gridsearch_training_frac=0.7):
    """
    Grid search over the ``a`` and ``reservoir_size`` settings of
    ``self.reservoir_computing``.

    The first ``gridsearch_training_frac`` share of the training rows is used
    to fit every candidate and the remaining training rows are used to score
    it; ``test_X`` and ``test_y`` are never touched by the search.

    :param train_X: Training features (DataFrame); split by row position.
    :param train_y: Training targets aligned with ``train_X``.
    :param test_X: Not used by the search.
    :param test_y: Not used by the search.
    :param per_time_step: Passed through to ``self.reservoir_computing``.
    :param error: ``'mse'`` (lowest wins, scored on ``pred_test_y_prob``) or
        ``'accuracy'`` (highest wins, scored on ``pred_test_y``).
    :param gridsearch_training_frac: Fraction of the training rows used for
        fitting during the search.
    :return: Tuple ``(reservoir_size, a)`` of the best scoring combination.
    :raises IndexError: If no combination was selected, e.g. because
        ``error`` is neither ``'mse'`` nor ``'accuracy'``.
    """
    tuned_parameters = {'a': [0.6, 0.8], 'reservoir_size': [400, 700, 1000]}

    # Materialise the keys as a list: a dict view has no ``index`` method on
    # Python 3, and one fixed list keeps the positions used below in step with
    # the order handed to the combination generator.
    params = list(tuned_parameters.keys())
    combinations = self.generate_parameter_combinations(
        tuned_parameters, params)
    size_pos = params.index('reservoir_size')
    a_pos = params.index('a')

    # Positional split. ``iloc`` excludes the end point, so the row at
    # ``split_point`` ends up in the validation part only.
    n_rows = len(train_X.index)
    split_point = int(gridsearch_training_frac * n_rows)
    train_params_X = train_X.iloc[0:split_point]
    test_params_X = train_X.iloc[split_point:n_rows]
    train_params_y = train_y.iloc[0:split_point]
    test_params_y = train_y.iloc[split_point:n_rows]

    if error == 'mse':
        best_error = sys.float_info.max
    elif error == 'accuracy':
        best_error = 0

    best_combination = []
    for comb in combinations:
        print(comb)
        pred_train_y, pred_test_y, pred_train_y_prob, pred_test_y_prob = \
            self.reservoir_computing(
                train_params_X, train_params_y, test_params_X, test_params_y,
                reservoir_size=comb[size_pos], a=comb[a_pos],
                per_time_step=per_time_step, gridsearch=False)

        if error == 'mse':
            evaluator = RegressionEvaluation()
            mse = evaluator.mean_squared_error(test_params_y, pred_test_y_prob)
            if mse < best_error:
                best_error = mse
                best_combination = comb
        elif error == 'accuracy':
            evaluator = ClassificationEvaluation()
            acc = evaluator.accuracy(test_params_y, pred_test_y)
            if acc > best_error:
                best_error = acc
                best_combination = comb

    print('-------')
    print(best_combination)
    print('-------')
    return best_combination[size_pos], best_combination[a_pos]
def gridsearch_recurrent_neural_network(self, train_X, train_y, test_X, test_y,
                                        error='accuracy',
                                        gridsearch_training_frac=0.7):
    """
    Grid search over ``n_hidden_neurons``, ``iterations`` and ``outputbias``
    for ``self.recurrent_neural_network``.

    Every candidate is fitted on the first ``gridsearch_training_frac`` share
    of the training rows and scored on the remaining training rows;
    ``test_X`` and ``test_y`` are not used.

    :param train_X: Training features (DataFrame); split by row position.
    :param train_y: Training targets aligned with ``train_X``.
    :param test_X: Not used by the search.
    :param test_y: Not used by the search.
    :param error: ``'mse'`` (lowest wins, scored on ``pred_test_y_prob``) or
        ``'accuracy'`` (highest wins, scored on ``pred_test_y``).
    :param gridsearch_training_frac: Fraction of the training rows used for
        fitting during the search.
    :return: Tuple ``(n_hidden_neurons, iterations, outputbias)`` of the best
        scoring combination.
    """
    tuned_parameters = {
        'n_hidden_neurons': [50, 100],
        'iterations': [250, 500],
        'outputbias': [True],
    }
    params = list(tuned_parameters.keys())
    combinations = self.generate_parameter_combinations(
        tuned_parameters, params)

    # Position of each setting inside a combination.
    neurons_at = params.index('n_hidden_neurons')
    iterations_at = params.index('iterations')
    bias_at = params.index('outputbias')

    # Leading rows are used for fitting, trailing rows for scoring.
    row_count = len(train_X.index)
    split_point = int(gridsearch_training_frac * row_count)
    fit_X = train_X.iloc[0:split_point]
    fit_y = train_y.iloc[0:split_point]
    holdout_X = train_X.iloc[split_point:row_count]
    holdout_y = train_y.iloc[split_point:row_count]

    # Initial score that any real result will beat.
    if error == 'mse':
        best_error = sys.float_info.max
    elif error == 'accuracy':
        best_error = 0

    best_combination = []
    for candidate in combinations:
        print(candidate)
        outputs = self.recurrent_neural_network(
            fit_X, fit_y, holdout_X, holdout_y,
            n_hidden_neurons=candidate[neurons_at],
            iterations=candidate[iterations_at],
            outputbias=candidate[bias_at],
            gridsearch=False)
        _, holdout_pred, _, holdout_pred_prob = outputs

        if error == 'mse':
            scorer = RegressionEvaluation()
            score = scorer.mean_squared_error(holdout_y, holdout_pred_prob)
            improved = score < best_error
        elif error == 'accuracy':
            scorer = ClassificationEvaluation()
            score = scorer.accuracy(holdout_y, holdout_pred)
            improved = score > best_error
        else:
            # Unknown metric: nothing is scored, nothing is selected.
            improved = False

        if improved:
            best_error = score
            best_combination = candidate

    print('-------')
    print(best_combination)
    print('-------')
    return (best_combination[neurons_at],
            best_combination[iterations_at],
            best_combination[bias_at])
def forward_selection(
        max_features: int, X_train: pd.DataFrame,
        y_train: pd.Series) -> Tuple[List[str], List[str], List[float]]:
    """
    Select up to ``max_features`` features for regression using forward
    selection.

    Each round tries every feature that is not selected yet, trains a decision
    tree on the current selection plus that feature, and keeps the feature
    giving the lowest mean squared error on the training data. The search
    stops early once every column has been selected or no candidate yields a
    score below the initial sentinel, so the result never contains
    placeholder entries.

    :param max_features: Maximum number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to given features.
    :return: Selected features, the same features in order of selection, and
        the score obtained when each of them was added.
    """
    ordered_features = []
    ordered_scores = []
    # Start with no features
    selected_features = []
    learner = RegressionAlgorithms()
    evaluator = RegressionEvaluation()

    for _ in range(max_features):
        # Walk the candidates in column order rather than through a set, so
        # that ties are broken the same way on every run.
        already_selected = set(selected_features)
        features_left = [col for col in X_train.columns
                         if col not in already_selected]

        best_perf = sys.float_info.max
        best_feature = None
        for f in features_left:
            temp_selected_features = selected_features + [f]

            # Determine the mse of a decision tree learner when adding the
            # feature; the training data is also used as evaluation input.
            pred_y_train, _ = learner.decision_tree(
                X_train[temp_selected_features], y_train,
                X_train[temp_selected_features])
            perf = evaluator.mean_squared_error(y_train, pred_y_train)

            # Lower mse is better
            if perf < best_perf:
                best_perf = perf
                best_feature = f

        # Nothing left to add (or nothing scored): stop instead of recording
        # an empty feature name with a sentinel score.
        if best_feature is None:
            break

        selected_features.append(best_feature)
        ordered_features.append(best_feature)
        ordered_scores.append(best_perf)

    return selected_features, ordered_features, ordered_scores
def gridsearch_time_series(self, train_X, train_y, test_X, test_y,
                           error='mse', gridsearch_training_frac=0.7):
    """
    Grid search over the ``ar``, ``ma`` and ``d`` settings of
    ``self.time_series``.

    Every candidate is fitted on the first ``gridsearch_training_frac`` share
    of the training rows and scored by mean squared error on the remaining
    training rows; ``test_X`` and ``test_y`` are not used.

    :param train_X: Training features (DataFrame); split by row position.
    :param train_y: Training targets aligned with ``train_X``.
    :param test_X: Not used by the search.
    :param test_y: Not used by the search.
    :param error: Kept for signature compatibility. Candidates are always
        ranked by mean squared error, whatever value is passed.
    :param gridsearch_training_frac: Fraction of the training rows used for
        fitting during the search.
    :return: Tuple ``(ar, ma, d)`` of the best scoring combination.
    :raises IndexError: If no combination scored below the initial sentinel.
    """
    tuned_parameters = {'ar': [0, 5], 'ma': [0, 5], 'd': [1]}

    # Materialise the keys as a list: a dict view has no ``index`` method on
    # Python 3, and one fixed list keeps the positions used below in step with
    # the order handed to the combination generator.
    params = list(tuned_parameters.keys())
    tc = TemporalClassificationAlgorithms()
    combinations = tc.generate_parameter_combinations(
        tuned_parameters, params)
    ar_pos = params.index('ar')
    ma_pos = params.index('ma')
    d_pos = params.index('d')

    # Positional split. ``iloc`` excludes the end point, so the row at
    # ``split_point`` ends up in the validation part only.
    n_rows = len(train_X.index)
    split_point = int(gridsearch_training_frac * n_rows)
    train_params_X = train_X.iloc[0:split_point]
    test_params_X = train_X.iloc[split_point:n_rows]
    train_params_y = train_y.iloc[0:split_point]
    test_params_y = train_y.iloc[split_point:n_rows]

    # The score below is always an mse (lower is better), so the starting
    # value must be "worse than anything" regardless of ``error``. Starting
    # at 0 for 'accuracy' would make every candidate lose.
    best_error = sys.float_info.max
    best_combination = []
    evaluator = RegressionEvaluation()

    for comb in combinations:
        print(comb)
        pred_train_y, pred_test_y = self.time_series(
            train_params_X, train_params_y, test_params_X, test_params_y,
            ar=comb[ar_pos], ma=comb[ma_pos], d=comb[d_pos],
            gridsearch=False)

        mse = evaluator.mean_squared_error(test_params_y, pred_test_y)
        if mse < best_error:
            best_error = mse
            best_combination = comb

    print('-------')
    print(best_combination)
    print('-------')
    return (best_combination[ar_pos], best_combination[ma_pos],
            best_combination[d_pos])
def forward_selection(self, max_features, X_train, y_train):
    """
    Select up to ``max_features`` features for regression using forward
    selection.

    Each round tries every feature that is not selected yet, trains a decision
    tree on the current selection plus that feature, and keeps the feature
    giving the lowest mean squared error on the training data. The round
    number is printed as progress output. The search stops early once every
    column has been selected or no candidate yields a score below the initial
    sentinel, so the result never contains placeholder entries.

    :param max_features: Maximum number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to given features.
    :return: Selected features, the same features in order of selection, and
        the score obtained when each of them was added.
    """
    ordered_features = []
    ordered_scores = []
    # Start with no features.
    selected_features = []
    learner = RegressionAlgorithms()
    evaluator = RegressionEvaluation()

    # Select the appropriate number of features.
    for i in range(max_features):
        print(i)

        # Walk the candidates in column order rather than through a set, so
        # that ties are broken the same way on every run.
        already_selected = set(selected_features)
        features_left = [col for col in X_train.columns
                         if col not in already_selected]

        best_perf = sys.float_info.max
        best_feature = None

        # For all features we can still select...
        for f in features_left:
            temp_selected_features = selected_features + [f]

            # Determine the mse of a decision tree learner if we were to add
            # the feature; the training data is also the evaluation input.
            pred_y_train, _ = learner.decision_tree(
                X_train[temp_selected_features], y_train,
                X_train[temp_selected_features])
            perf = evaluator.mean_squared_error(y_train, pred_y_train)

            # We aim for a low mse.
            if perf < best_perf:
                best_perf = perf
                best_feature = f

        # Nothing left to add (or nothing scored): stop instead of recording
        # an empty feature name with a sentinel score.
        if best_feature is None:
            break

        # We select the feature with the best performance.
        selected_features.append(best_feature)
        ordered_features.append(best_feature)
        ordered_scores.append(best_perf)

    return selected_features, ordered_features, ordered_scores
def backward_selection(max_features, X_train, y_train):
    """
    Reduce the feature set to ``max_features`` entries by backward selection.

    All columns of ``X_train`` are selected at first. In every round a
    decision tree is trained once for each selected feature with that feature
    left out, and the mean squared error on the training data is recorded.
    The feature whose removal gives the lowest error is dropped.

    :param max_features: Number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to given features.
    :return: Selected features.
    """
    # Everything is selected at the start
    selected_features = X_train.columns.tolist()
    learner = RegressionAlgorithms()
    evaluator = RegressionEvaluation()

    def training_mse_without(feature):
        # Score the current selection with ``feature`` left out
        reduced = list(selected_features)
        reduced.remove(feature)
        train_prediction, _ = learner.decision_tree(
            X_train[reduced], y_train, X_train[reduced])
        return evaluator.mean_squared_error(y_train, train_prediction)

    n_to_drop = len(X_train.columns) - max_features
    for _ in range(n_to_drop):
        lowest_mse, worst_feature = sys.float_info.max, ''

        for feature in selected_features:
            mse = training_mse_without(feature)
            # A lower mse without the feature marks it as the worst so far
            if mse < lowest_mse:
                lowest_mse, worst_feature = mse, feature

        # Drop the feature that was missed the least
        selected_features.remove(worst_feature)

    return selected_features
performance_tr_svm_std = 0 performance_te_nn = 0 performance_te_nn_std = 0 performance_te_rf = 0 performance_te_rf_std = 0 performance_te_svm = 0 performance_te_svm_std = 0 for repeat in range(0, repeats): regr_train_y, regr_test_y = learner.feedforward_neural_network( selected_train_X, train_y, selected_test_X, gridsearch=True) mean_tr, std_tr = eval.mean_squared_error_with_std( train_y, regr_train_y) mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y) mean_training = eval.mean_squared_error(train_y, regr_train_y) performance_tr_nn += mean_tr performance_tr_nn_std += std_tr performance_te_nn += mean_te performance_te_nn_std += std_te regr_train_y, regr_test_y = learner.random_forest(selected_train_X, train_y, selected_test_X, gridsearch=True) mean_tr, std_tr = eval.mean_squared_error_with_std( train_y, regr_train_y) mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y) performance_tr_rf += mean_tr performance_tr_rf_std += std_tr performance_te_rf += mean_te