Example #1
    def backward_selection(self, max_features, X_train, y_train):

        # First select all features.
        selected_features = X_train.columns.tolist()
        ra = RegressionAlgorithms()
        re = RegressionEvaluation()

        # Repeatedly remove the worst feature until only max_features remain.
        for i in range(0, (len(X_train.columns) - max_features)):
            best_perf = sys.float_info.max
            worst_feature = ''
            for f in selected_features:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.remove(f)

                # Determine the score without the feature.
                pred_y_train, pred_y_test = ra.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = re.mean_squared_error(y_train, pred_y_train)
                # If we score better (i.e. a lower mse) without the feature than anything seen so far,
                # this feature is the worst one.
                if perf < best_perf:
                    best_perf = perf
                    worst_feature = f
            # Remove the worst feature.
            selected_features.remove(worst_feature)
        return selected_features
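This example (and the three that follow) relies on helper classes `RegressionAlgorithms` and `RegressionEvaluation` that are not shown on this page, plus `sys` and `copy` from the standard library. Below is a minimal stand-in sketch of what the calls appear to assume, built on scikit-learn; the method bodies and tree parameters here are inferred from the usage, not taken from the original library:

    import copy  # used by the selection methods
    import sys   # used by the selection methods

    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import mean_squared_error as sk_mse

    class RegressionAlgorithms:
        # Assumed stand-in: fit a decision tree on the training set and
        # return predictions for both the training and the test set.
        # min_samples_leaf is an assumed parameter; it keeps the tree from
        # fitting the training data perfectly, so the training mse can still
        # differentiate features.
        def decision_tree(self, train_X, train_y, test_X):
            model = DecisionTreeRegressor(min_samples_leaf=25, random_state=0)
            model.fit(train_X, train_y)
            return model.predict(train_X), model.predict(test_X)

    class RegressionEvaluation:
        # Assumed stand-in: plain mean squared error.
        def mean_squared_error(self, y_true, y_pred):
            return sk_mse(y_true, y_pred)

Note that all the selection functions on this page score candidate feature sets on the training data itself; no held-out set is involved in the search.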
Example #2
    def forward_selection(
            max_features: int, X_train: pd.DataFrame,
            y_train: pd.Series) -> Tuple[List[str], List[str], List[float]]:
        """
        Select the given number of features for regression, that show the best accuracy, using forward selection.
        The method uses the given features and labels to train a decision tree and determine the mse of the
        predictions. The method returns the selected features as well as the the scores.

        :param max_features: Number of features to select.
        :param X_train: Features as DataFrame.
        :param y_train: True values corresponding to given features.
        :return: Selected features and scores.
        """

        ordered_features = []
        ordered_scores = []

        # Start with no features
        selected_features = []
        ra = RegressionAlgorithms()
        re = RegressionEvaluation()

        # Select the appropriate number of features
        for i in range(0, max_features):

            # Determine the features left to select
            features_left = list(set(X_train.columns) - set(selected_features))
            best_perf = sys.float_info.max
            best_feature = ''

            # Iterate over all features left
            for f in features_left:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.append(f)

                # Determine the mse of a decision tree learner when adding the feature
                pred_y_train, pred_y_test = ra.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = re.mean_squared_error(y_train, pred_y_train)

                # If the performance is better than the best seen so far (lower mse is better),
                # record this feature and its score as the current best
                if perf < best_perf:
                    best_perf = perf
                    best_feature = f
            # Select the feature with the best performance
            selected_features.append(best_feature)
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores
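A hedged usage sketch for `forward_selection`, reusing the stand-in helpers sketched under Example #1. The column names and data below are invented for illustration, `forward_selection` is assumed to be callable as a plain function (in the original it may be a static method), and the annotations additionally need `import pandas as pd` and `from typing import List, Tuple`:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    X_train = pd.DataFrame(rng.normal(size=(200, 4)),
                           columns=['acc_x', 'acc_y', 'gyr_x', 'noise'])
    # The target depends on only two of the four columns.
    y_train = 2.0 * X_train['acc_x'] - X_train['gyr_x']

    selected, ordered, scores = forward_selection(2, X_train, y_train)
    print(selected)  # typically picks out 'acc_x' and 'gyr_x'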
Example #3
    def forward_selection(self, max_features, X_train, y_train):
        ordered_features = []
        ordered_scores = []

        # Start with no features.
        selected_features = []
        ra = RegressionAlgorithms()
        re = RegressionEvaluation()

        # Select the appropriate number of features.
        for i in range(0, max_features):
            print(i)

            # Determine the features left to select.
            features_left = list(set(X_train.columns) - set(selected_features))
            best_perf = sys.float_info.max
            best_feature = ''

            # For all features we can still select...
            for f in features_left:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.append(f)

                # Determine the mse of a decision tree learner if we were to add
                # the feature.
                pred_y_train, pred_y_test = ra.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = re.mean_squared_error(y_train, pred_y_train)

                # If the performance is better than the best seen so far (lower mse is better),
                # record this feature and its score as the current best.
                if perf < best_perf:
                    best_perf = perf
                    best_feature = f
            # We select the feature with the best performance.
            selected_features.append(best_feature)
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores
Example #4
    def backward_selection(max_features, X_train, y_train):
        """
        Select the given number of features for regression, that show the best accuracy, using backward selection.
        The method uses the given features and labels to train a decision tree and determine the mse of the
        predictions.

        :param max_features: Number of features to select.
        :param X_train: Features as DataFrame.
        :param y_train: True values corresponding to given features.
        :return: Selected features.
        """

        # First select all features
        selected_features = X_train.columns.tolist()
        ra = RegressionAlgorithms()
        re = RegressionEvaluation()

        # Repeatedly remove the worst feature until only max_features remain
        for i in range(0, (len(X_train.columns) - max_features)):
            best_perf = sys.float_info.max
            worst_feature = ''
            for f in selected_features:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.remove(f)

                # Determine the score without the feature
                pred_y_train, pred_y_test = ra.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = re.mean_squared_error(y_train, pred_y_train)
                # If the score without the feature is better (i.e. a lower mse) than seen so far, this is the worst feature
                if perf < best_perf:
                    best_perf = perf
                    worst_feature = f
            # Remove the worst feature
            selected_features.remove(worst_feature)
        return selected_features
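A corresponding hedged sketch for `backward_selection`, again with synthetic, made-up data and the stand-in helpers from Example #1. Keeping 2 of 4 features means the outer loop removes the worst remaining feature twice:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    X_train = pd.DataFrame(rng.normal(size=(200, 4)),
                           columns=['f1', 'f2', 'f3', 'f4'])
    y_train = X_train['f1'] + 0.5 * X_train['f3']

    # Start from all four features and drop the two weakest.
    print(backward_selection(2, X_train, y_train))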
    # Tail of an SVM evaluation block; its preceding lines are cut off above.
    performance_te_svm_std = std_te

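    # Evaluate a k-nearest-neighbour regressor (with grid search) on the selected features.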
    regr_train_y, regr_test_y = learner.k_nearest_neighbor(selected_train_X,
                                                           train_y,
                                                           selected_test_X,
                                                           gridsearch=True)
    mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
    mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
    performance_tr_knn = mean_tr
    performance_tr_knn_std = std_tr
    performance_te_knn = mean_te
    performance_te_knn_std = std_te

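    # Evaluate a decision tree regressor, optionally exporting the fitted tree.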
    regr_train_y, regr_test_y = learner.decision_tree(
        selected_train_X,
        train_y,
        selected_test_X,
        gridsearch=True,
        export_tree_path=export_tree_path)

    mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
    mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
    performance_tr_dt = mean_tr
    performance_tr_dt_std = std_tr
    performance_te_dt = mean_te
    performance_te_dt_std = std_te

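    # Collect the train/test means and standard deviations per model.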
    scores_with_sd = [
        (overall_performance_tr_nn, overall_performance_tr_nn_std,
         overall_performance_te_nn, overall_performance_te_nn_std),
        (overall_performance_tr_rf, overall_performance_tr_rf_std,
         overall_performance_te_rf, overall_performance_te_rf_std),