def backward_selection(self, max_features, X_train, y_train):
    """
    Greedy backward elimination of features for regression.

    Starts from all columns of X_train and repeatedly drops one feature until max_features remain. In every round
    a decision tree is trained once per remaining feature with that feature left out; the feature whose omission
    gives the lowest mse on the training data is dropped.

    :param max_features: Number of features to keep.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to the given features.
    :return: List with the names of the features that were kept.
    """
    # Begin with the complete feature set.
    selected_features = X_train.columns.tolist()
    learner = RegressionAlgorithms()
    evaluator = RegressionEvaluation()

    # One feature is dropped per round.
    rounds = len(X_train.columns) - max_features
    for _ in range(rounds):
        lowest_mse = sys.float_info.max
        worst_feature = ''

        for candidate in selected_features:
            # The feature set of this round without the candidate.
            remaining = list(selected_features)
            remaining.remove(candidate)

            # Train and predict on the training data only; the second prediction is not needed.
            pred_y_train, pred_y_test = learner.decision_tree(
                X_train[remaining], y_train, X_train[remaining])
            mse = evaluator.mean_squared_error(y_train, pred_y_train)

            # The strict comparison keeps the first candidate when several reach the same mse.
            if mse < lowest_mse:
                lowest_mse, worst_feature = mse, candidate

        # Drop the feature that is missed the least.
        selected_features.remove(worst_feature)

    return selected_features
def forward_selection(max_features: int, X_train: pd.DataFrame,
                      y_train: pd.Series) -> Tuple[List[str], List[str], List[float]]:
    """
    Select the given number of features for regression, that show the best accuracy, using forward selection.
    The method uses the given features and labels to train a decision tree and determine the mse of the
    predictions on the training data. The method returns the selected features as well as the scores.

    At most as many features as X_train has columns are selected; a larger max_features simply returns all of them.
    Features that reach exactly the same mse are ranked in the column order of X_train.

    :param max_features: Number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to given features.
    :return: Tuple (selected features, ordered features, ordered scores). The first two elements are separate lists
        holding the same features in the order they were selected; the scores are the mse values reached after
        adding each of them.
    """
    ordered_features = []
    ordered_scores = []
    # Start with no features
    selected_features = []
    ra = RegressionAlgorithms()
    re = RegressionEvaluation()

    # Select the appropriate number of features
    for _ in range(max_features):
        # Determine the features left to select. The column order of the DataFrame is kept on purpose: iterating
        # over a set of strings would make the winner among equally scoring features depend on the hash seed.
        features_left = [f for f in X_train.columns if f not in selected_features]

        # Nothing left to add, so stop instead of padding the result with placeholder entries
        if not features_left:
            break

        best_perf = sys.float_info.max
        best_feature = ''

        # Iterate over all features left
        for f in features_left:
            temp_selected_features = selected_features + [f]

            # Determine the mse of a decision tree learner when adding the feature (predictions on the second
            # dataset are not needed here)
            pred_y_train, _ = ra.decision_tree(
                X_train[temp_selected_features], y_train, X_train[temp_selected_features])
            perf = re.mean_squared_error(y_train, pred_y_train)

            # If the performance is better than seen so far (aiming for low mse) set the current feature to the
            # best feature and the same for the best performance
            if perf < best_perf:
                best_perf = perf
                best_feature = f

        # Select the feature with the best performance
        selected_features.append(best_feature)
        ordered_features.append(best_feature)
        ordered_scores.append(best_perf)

    return selected_features, ordered_features, ordered_scores
def forward_selection(self, max_features, X_train, y_train):
    """
    Select up to max_features features for regression using greedy forward selection.

    Starting from an empty set, every round trains a decision tree once per remaining feature with that feature
    added, and keeps the feature that yields the lowest mse on the training data. The index of each round is
    printed as a progress indication.

    At most as many features as X_train has columns are selected. Features that reach exactly the same mse are
    ranked in the column order of X_train.

    :param max_features: Number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to the given features.
    :return: Tuple (selected_features, ordered_features, ordered_scores). The first two elements are separate
        lists holding the same features in selection order; ordered_scores holds the mse reached after adding
        each of them.
    """
    ordered_features = []
    ordered_scores = []

    # Start with no features.
    selected_features = []
    ra = RegressionAlgorithms()
    re = RegressionEvaluation()

    # Select the appropriate number of features.
    for i in range(0, max_features):
        # Determine the features left to select, in DataFrame column order so that ties between equally
        # scoring features are resolved the same way on every run (set iteration order is not stable).
        features_left = [f for f in X_train.columns if f not in selected_features]

        # All features are selected already; stop rather than append placeholder entries.
        if not features_left:
            break

        print(i)
        best_perf = sys.float_info.max
        best_feature = ''

        # For all features we can still select...
        for f in features_left:
            temp_selected_features = selected_features + [f]

            # Determine the mse of a decision tree learner if we were to add the feature. Only the
            # predictions on the training data are used.
            pred_y_train, _ = ra.decision_tree(
                X_train[temp_selected_features], y_train, X_train[temp_selected_features])
            perf = re.mean_squared_error(y_train, pred_y_train)

            # If the performance is better than what we have seen so far (we aim for low mse)
            # we set the current feature to the best feature and the same for the best performance.
            if perf < best_perf:
                best_perf = perf
                best_feature = f

        # We select the feature with the best performance.
        selected_features.append(best_feature)
        ordered_features.append(best_feature)
        ordered_scores.append(best_perf)

    return selected_features, ordered_features, ordered_scores
def backward_selection(max_features, X_train, y_train):
    """
    Reduce the features for regression to the given number using backward selection.

    All columns of X_train are selected initially. Per round, a decision tree is trained for every selected
    feature with that feature removed, and the mse of its predictions on the training data is determined. The
    feature whose removal leads to the lowest mse is discarded. This repeats until max_features are left.

    :param max_features: Number of features to select.
    :param X_train: Features as DataFrame.
    :param y_train: True values corresponding to given features.
    :return: Selected features.
    """
    selected_features = X_train.columns.tolist()
    learner = RegressionAlgorithms()
    evaluation = RegressionEvaluation()

    def mse_without(feature):
        # Score the current selection with a single feature taken out
        reduced = selected_features[:]
        reduced.remove(feature)
        pred_y_train, pred_y_test = learner.decision_tree(X_train[reduced], y_train, X_train[reduced])
        return evaluation.mean_squared_error(y_train, pred_y_train)

    # Every round discards exactly one feature
    number_to_discard = len(X_train.columns) - max_features
    for _ in range(number_to_discard):
        worst_feature = ''
        lowest_mse = sys.float_info.max

        for feature in selected_features:
            current_mse = mse_without(feature)
            # A lower mse without the feature than seen so far marks it as the worst feature
            if current_mse < lowest_mse:
                worst_feature = feature
                lowest_mse = current_mse

        # The selection is only changed after all features of the round have been scored
        selected_features.remove(worst_feature)

    return selected_features
def _score_predictions(evaluation, train_y, test_y, pred_train_y, pred_test_y):
    """
    Evaluate predictions on the training and the test set with mean_squared_error_with_std.

    :param evaluation: Evaluation object providing mean_squared_error_with_std(true values, predictions).
    :param train_y: True values of the training set.
    :param test_y: True values of the test set.
    :param pred_train_y: Predictions for the training set.
    :param pred_test_y: Predictions for the test set.
    :return: Tuple (train mean, train std, test mean, test std), the order in which main() reports the scores.
    """
    mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, pred_train_y)
    mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, pred_test_y)
    return mean_tr, std_tr, mean_te, std_te


def main():
    """
    Predict the heart rate on the crowdsignals dataset and compare regression algorithms on several feature sets.

    FLAGS.mode selects what is run: 'selection' prints the Pearson correlations of the features, 'overall'
    compares five regression algorithms on every feature set, 'detail' trains a random forest on all features and
    plots the predictions against the real values, and 'all' runs the three of them.

    :raises IOError: If the dataset produced by the previous chapter cannot be read.
    :raises ValueError: If FLAGS.repeats is smaller than 1 when the algorithm comparison is run.
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the second task, namely the prediction of the heart rate. Therefore create a dataset with the heart
    # rate as target and split using timestamps, because this is considered as a temporal task.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = RegressionAlgorithms()
    evaluation = RegressionEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate', '2016-02-08 18:28:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
                      'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
                      'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking', 'labelStanding',
                      'labelDriving', 'labelEating', 'labelRunning', 'light_phone_lux', 'mag_phone_x',
                      'mag_phone_y', 'mag_phone_z', 'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
                      'press_phone_pressure']
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    time_features = [name for name in dataset.columns if ('temp_' in name and 'hr_watch' not in name)]
    freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the Pearson correlations and see whether features can be selected based on them
        fs = FeatureSelectionRegression()
        print('\n- - - Running feature selection - - -')
        # Only the correlations are reported; the returned feature list is not used
        _, correlations = fs.pearson_selection(10, train_X[features_after_chapter_5], train_y)
        util.print_pearson_correlations(correlations)

    # Select the 10 features with the highest correlation. The feature sets are defined outside the 'selection'
    # branch, because the 'overall' comparison below needs them in every mode.
    selected_features = ['temp_pattern_labelOnTable', 'labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable',
                         'pca_2_temp_mean_ws_120', 'pca_1_temp_mean_ws_120', 'acc_watch_y_temp_mean_ws_120',
                         'pca_2', 'acc_phone_z_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse']
    possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4,
                             features_after_chapter_5, selected_features]
    feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print('\n- - - Running test of all different regression algorithms - - -')
        # First study the importance of the parameter settings. Therefore repeat the experiment a number of times
        # to get a bit more robust data as the initialization of e.g. the NN is random
        REPEATS = FLAGS.repeats
        # The scores are averaged over the repeats, so fail before any training instead of dividing by zero
        if REPEATS < 1:
            raise ValueError(f'Number of repeats has to be at least 1, got {REPEATS}')

        scores_over_all_algs = []

        for feature_name, feature_set in zip(feature_names, possible_feature_sets):
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # Running sums of (train mean, train std, test mean, test std) over the repeats
            totals_nn = [0, 0, 0, 0]
            totals_rf = [0, 0, 0, 0]

            # First run non deterministic classifiers a number of times to average their score. Both learners
            # stay interleaved within one repeat so the order of the training runs is unchanged.
            for repeat in range(0, REPEATS):
                print(f'Training NeuralNetwork run {repeat + 1}/{REPEATS} ... ')
                regr_train_y, regr_test_y = learner.feedforward_neural_network(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                scores = _score_predictions(evaluation, train_y, test_y, regr_train_y, regr_test_y)
                totals_nn = [total + score for total, score in zip(totals_nn, scores)]

                print(f'Training RandomForest run {repeat + 1}/{REPEATS} ... ')
                regr_train_y, regr_test_y = learner.random_forest(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                scores = _score_predictions(evaluation, train_y, test_y, regr_train_y, regr_test_y)
                totals_rf = [total + score for total, score in zip(totals_rf, scores)]

            scores_nn = tuple(total / REPEATS for total in totals_nn)
            scores_rf = tuple(total / REPEATS for total in totals_rf)

            # Run deterministic algorithms:
            print("Support Vector Regressor run 1/1 ... ")
            # Convergence of the SVR does not always occur (even adjusting tolerance and iterations does not help)
            regr_train_y, regr_test_y = learner.support_vector_regression_without_kernel(
                selected_train_X, train_y, selected_test_X, gridsearch=False)
            scores_svm = _score_predictions(evaluation, train_y, test_y, regr_train_y, regr_test_y)

            print("Training Nearest Neighbor run 1/1 ... ")
            regr_train_y, regr_test_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            scores_knn = _score_predictions(evaluation, train_y, test_y, regr_train_y, regr_test_y)

            print("Training Decision Tree run 1/1 ... ")
            regr_train_y, regr_test_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True, export_tree_path=EXPORT_TREE_PATH)
            scores_dt = _score_predictions(evaluation, train_y, test_y, regr_train_y, regr_test_y)

            # Keep the algorithm order in sync with the labels passed to the plot below
            scores_with_sd = [scores_nn, scores_rf, scores_svm, scores_knn, scores_dt]
            util.print_table_row_performances_regression(feature_name, scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        # Plot the results
        DataViz.plot_performances_regression(['NN', 'RF', 'SVM', 'KNN', 'DT'], feature_names, scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        print('\n- - - Running visualization of results - - -')
        regr_train_y, regr_test_y = learner.random_forest(
            train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5],
            gridsearch=False, print_model_details=True)
        DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y,
                                                      test_X.index, test_y, regr_test_y, 'heart rate')
'temp_pattern_labelOnTable(b)labelOnTable', 'pca_2_temp_mean_ws_120', 'pca_1_temp_mean_ws_120', 'acc_watch_y_temp_mean_ws_120', 'pca_2', 'acc_phone_z_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse' ] possible_feature_sets = [ basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features ] feature_names = [ 'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features' ] # Let us first study the importance of the parameter settings. learner = RegressionAlgorithms() eval = RegressionEvaluation() # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. repeats = 5 scores_over_all_algs = [] for i in range(0, len(possible_feature_sets)): selected_train_X = train_X[possible_feature_sets[i]] selected_test_X = test_X[possible_feature_sets[i]] # First we run our non deterministic classifiers a number of times to average their score.