Example #1
0
    def backward_selection(self, max_features, X_train, y_train):
        """
        Greedily prune features until only ``max_features`` of them remain.

        Every round fits one decision tree per feature that is still kept,
        each time leaving that single feature out, and scores the fit by its
        mse on the training data. The feature whose omission gives the lowest
        mse is dropped for good.

        :param max_features: Number of features to keep.
        :param X_train: Training features; its columns are the candidates.
        :param y_train: Target values belonging to X_train.
        :return: List with the names of the features that were kept.
        """
        kept = X_train.columns.tolist()
        learner = RegressionAlgorithms()
        evaluator = RegressionEvaluation()

        # One feature is dropped per round.
        rounds = len(X_train.columns) - max_features
        for _ in range(rounds):
            lowest_mse = sys.float_info.max
            to_drop = ''

            for candidate in kept:
                # Everything that is currently kept, minus the candidate.
                subset = copy.deepcopy(kept)
                subset.remove(candidate)

                # The tree is fitted and evaluated on the same training data.
                train_pred, _test_pred = learner.decision_tree(
                    X_train[subset], y_train, X_train[subset])
                mse = evaluator.mean_squared_error(y_train, train_pred)

                # Strictly lower mse without the candidate: it is the least
                # useful feature seen so far in this round.
                if mse < lowest_mse:
                    lowest_mse, to_drop = mse, candidate

            kept.remove(to_drop)

        return kept
    def gridsearch_reservoir_computing(self,
                                       train_X,
                                       train_y,
                                       test_X,
                                       test_y,
                                       per_time_step=False,
                                       error='mse',
                                       gridsearch_training_frac=0.7):
        """
        Grid search over the reservoir computing hyper-parameters.

        The training data is split by position: the first
        ``gridsearch_training_frac`` of the rows is used to fit each
        candidate, the remaining rows are used to score it. Every combination
        of 'a' and 'reservoir_size' is tried through
        ``self.reservoir_computing`` and the best scoring one is returned.

        :param train_X: Training features (must support ``.iloc`` slicing).
        :param train_y: Training targets (must support ``.iloc`` slicing).
        :param test_X: Not used by the search; kept for interface compatibility.
        :param test_y: Not used by the search; kept for interface compatibility.
        :param per_time_step: Passed through to ``self.reservoir_computing``.
        :param error: 'mse' (lower is better, scored on the predicted
            probabilities) or 'accuracy' (higher is better, scored on the
            predicted classes).
        :param gridsearch_training_frac: Fraction of the training rows used
            for fitting during the search.
        :return: Tuple ``(reservoir_size, a)`` of the best combination.
        :raises ValueError: If ``error`` is not supported, or if no
            combination could be selected (e.g. an empty grid or only NaN
            scores).
        """
        # Fail before any model is trained instead of with an IndexError after
        # the complete grid has been evaluated.
        if error not in ('mse', 'accuracy'):
            raise ValueError(
                "Unsupported error measure {!r}; expected 'mse' or "
                "'accuracy'.".format(error))

        tuned_parameters = {
            'a': [0.6, 0.8],
            'reservoir_size': [400, 700, 1000]
        }
        # A real list is required: dict views have no .index() on Python 3.
        # The dict is never modified, so this order stays valid throughout.
        params = list(tuned_parameters.keys())
        size_idx = params.index('reservoir_size')
        a_idx = params.index('a')
        combinations = self.generate_parameter_combinations(
            tuned_parameters, params)

        # Positional slicing: .iloc excludes the end point, so the row at
        # split_point ends up in the validation part only.
        n_rows = len(train_X.index)
        split_point = int(gridsearch_training_frac * n_rows)
        train_params_X = train_X.iloc[0:split_point]
        test_params_X = train_X.iloc[split_point:n_rows]
        train_params_y = train_y.iloc[0:split_point]
        test_params_y = train_y.iloc[split_point:n_rows]

        minimise = error == 'mse'
        if minimise:
            evaluator = RegressionEvaluation()
            best_score = float('inf')
        else:
            evaluator = ClassificationEvaluation()
            # Below any attainable accuracy, so a score of 0 is selectable.
            best_score = float('-inf')

        best_combination = None
        for comb in combinations:
            print(comb)
            _, pred_test_y, _, pred_test_y_prob = self.reservoir_computing(
                train_params_X,
                train_params_y,
                test_params_X,
                test_params_y,
                reservoir_size=comb[size_idx],
                a=comb[a_idx],
                per_time_step=per_time_step,
                gridsearch=False)

            if minimise:
                score = evaluator.mean_squared_error(test_params_y,
                                                     pred_test_y_prob)
                improved = score < best_score
            else:
                score = evaluator.accuracy(test_params_y, pred_test_y)
                improved = score > best_score

            if improved:
                best_score = score
                best_combination = comb

        if best_combination is None:
            raise ValueError(
                'Grid search did not yield a usable parameter combination.')

        print('-------')
        print(best_combination)
        print('-------')
        return best_combination[size_idx], best_combination[a_idx]
    def gridsearch_recurrent_neural_network(self,
                                            train_X,
                                            train_y,
                                            test_X,
                                            test_y,
                                            error='accuracy',
                                            gridsearch_training_frac=0.7):
        """
        Grid search over the recurrent neural network hyper-parameters.

        The training data is split by position: the first
        ``gridsearch_training_frac`` of the rows is used to fit each
        candidate, the remaining rows are used to score it. Every combination
        of 'n_hidden_neurons', 'iterations' and 'outputbias' is tried through
        ``self.recurrent_neural_network`` and the best scoring one is
        returned.

        :param train_X: Training features (must support ``.iloc`` slicing).
        :param train_y: Training targets (must support ``.iloc`` slicing).
        :param test_X: Not used by the search; kept for interface compatibility.
        :param test_y: Not used by the search; kept for interface compatibility.
        :param error: 'mse' (lower is better, scored on the predicted
            probabilities) or 'accuracy' (higher is better, scored on the
            predicted classes).
        :param gridsearch_training_frac: Fraction of the training rows used
            for fitting during the search.
        :return: Tuple ``(n_hidden_neurons, iterations, outputbias)`` of the
            best combination.
        :raises ValueError: If ``error`` is not supported, or if no
            combination could be selected (e.g. an empty grid or only NaN
            scores).
        """
        # Fail before any network is trained instead of with an IndexError
        # after the complete grid has been evaluated.
        if error not in ('mse', 'accuracy'):
            raise ValueError(
                "Unsupported error measure {!r}; expected 'mse' or "
                "'accuracy'.".format(error))

        tuned_parameters = {
            'n_hidden_neurons': [50, 100],
            'iterations': [250, 500],
            'outputbias': [True]
        }
        # The dict is never modified, so the key order determined here stays
        # valid for every combination; no need to rebuild it per iteration.
        params = list(tuned_parameters.keys())
        neurons_idx = params.index('n_hidden_neurons')
        iterations_idx = params.index('iterations')
        bias_idx = params.index('outputbias')
        combinations = self.generate_parameter_combinations(
            tuned_parameters, params)

        n_rows = len(train_X.index)
        split_point = int(gridsearch_training_frac * n_rows)
        train_params_X = train_X.iloc[0:split_point]
        test_params_X = train_X.iloc[split_point:n_rows]
        train_params_y = train_y.iloc[0:split_point]
        test_params_y = train_y.iloc[split_point:n_rows]

        minimise = error == 'mse'
        if minimise:
            evaluator = RegressionEvaluation()
            best_score = float('inf')
        else:
            evaluator = ClassificationEvaluation()
            # Below any attainable accuracy, so a score of 0 is selectable.
            best_score = float('-inf')

        best_combination = None
        for comb in combinations:
            print(comb)
            _, pred_test_y, _, pred_test_y_prob = self.recurrent_neural_network(
                train_params_X,
                train_params_y,
                test_params_X,
                test_params_y,
                n_hidden_neurons=comb[neurons_idx],
                iterations=comb[iterations_idx],
                outputbias=comb[bias_idx],
                gridsearch=False)

            if minimise:
                score = evaluator.mean_squared_error(test_params_y,
                                                     pred_test_y_prob)
                improved = score < best_score
            else:
                score = evaluator.accuracy(test_params_y, pred_test_y)
                improved = score > best_score

            if improved:
                best_score = score
                best_combination = comb

        if best_combination is None:
            raise ValueError(
                'Grid search did not yield a usable parameter combination.')

        print('-------')
        print(best_combination)
        print('-------')
        return (best_combination[neurons_idx],
                best_combination[iterations_idx],
                best_combination[bias_idx])
Example #4
0
    def forward_selection(
            max_features: int, X_train: pd.DataFrame,
            y_train: pd.Series) -> Tuple[List[str], List[str], List[float]]:
        """
        Select up to the given number of features for regression using forward selection.

        Starting from an empty selection, each round fits one decision tree per
        remaining feature (current selection plus that feature) and adds the
        feature that yields the lowest mse on the training data. Candidates are
        visited in the column order of X_train, so ties are always resolved in
        favour of the left-most column.

        Fewer than max_features features are returned when X_train has fewer
        columns, or when no remaining feature produces a usable score.

        :param max_features: Maximum number of features to select.
        :param X_train: Features as DataFrame.
        :param y_train: True values corresponding to given features.
        :return: Selected features, the features in the order they were
            selected, and the mse obtained when each of them was added.
        """

        ordered_features = []
        ordered_scores = []

        # Start with no features
        selected_features = []
        ra = RegressionAlgorithms()
        re = RegressionEvaluation()

        # Select the appropriate number of features
        for _ in range(max_features):

            # Keep the column order of the frame (instead of a set difference)
            # so the outcome does not depend on hash ordering.
            features_left = [
                c for c in X_train.columns if c not in selected_features
            ]
            if not features_left:
                # Every column has been selected already.
                break

            best_perf = float('inf')
            best_feature = None

            # Iterate over all features left
            for f in features_left:
                temp_selected_features = selected_features + [f]

                # Determine the mse of a decision tree learner when adding the feature
                pred_y_train, _ = ra.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = re.mean_squared_error(y_train, pred_y_train)

                # Aiming for a low mse: remember the best candidate so far
                if perf < best_perf:
                    best_perf = perf
                    best_feature = f

            if best_feature is None:
                # No candidate gave a comparable score (e.g. only NaN); stop
                # rather than record a placeholder as a feature.
                break

            # Select the feature with the best performance
            selected_features.append(best_feature)
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores
    def gridsearch_time_series(self,
                               train_X,
                               train_y,
                               test_X,
                               test_y,
                               error='mse',
                               gridsearch_training_frac=0.7):
        """
        Grid search over the 'ar', 'ma' and 'd' parameters of the time series model.

        The training data is split by position: the first
        ``gridsearch_training_frac`` of the rows is used to fit each
        candidate, the remaining rows are used to score it. Every combination
        is tried through ``self.time_series`` and the one with the lowest mse
        on the held-out rows is returned.

        :param train_X: Training features (must support ``.iloc`` slicing).
        :param train_y: Training targets (must support ``.iloc`` slicing).
        :param test_X: Not used by the search; kept for interface compatibility.
        :param test_y: Not used by the search; kept for interface compatibility.
        :param error: Only 'mse' is supported, because the mse is the only
            score this search computes.
        :param gridsearch_training_frac: Fraction of the training rows used
            for fitting during the search.
        :return: Tuple ``(ar, ma, d)`` of the best combination.
        :raises ValueError: If ``error`` is not 'mse', or if no combination
            could be selected (e.g. an empty grid or only NaN scores).
        """
        # Candidates are always ranked by mse; any other setting could never
        # select a combination, so reject it before fitting anything.
        if error != 'mse':
            raise ValueError(
                "Unsupported error measure {!r}; the time series grid search "
                "only supports 'mse'.".format(error))

        tuned_parameters = {'ar': [0, 5], 'ma': [0, 5], 'd': [1]}
        # A real list is required: dict views have no .index() on Python 3.
        # The dict is never modified, so this order stays valid throughout.
        params = list(tuned_parameters.keys())
        ar_idx = params.index('ar')
        ma_idx = params.index('ma')
        d_idx = params.index('d')

        tc = TemporalClassificationAlgorithms()
        combinations = tc.generate_parameter_combinations(
            tuned_parameters, params)

        # Positional slicing: .iloc excludes the end point, so the row at
        # split_point ends up in the validation part only.
        n_rows = len(train_X.index)
        split_point = int(gridsearch_training_frac * n_rows)
        train_params_X = train_X.iloc[0:split_point]
        test_params_X = train_X.iloc[split_point:n_rows]
        train_params_y = train_y.iloc[0:split_point]
        test_params_y = train_y.iloc[split_point:n_rows]

        evaluator = RegressionEvaluation()
        best_error = float('inf')
        best_combination = None
        for comb in combinations:
            print(comb)
            _, pred_test_y = self.time_series(
                train_params_X,
                train_params_y,
                test_params_X,
                test_params_y,
                ar=comb[ar_idx],
                ma=comb[ma_idx],
                d=comb[d_idx],
                gridsearch=False)

            mse = evaluator.mean_squared_error(test_params_y, pred_test_y)
            if mse < best_error:
                best_error = mse
                best_combination = comb

        if best_combination is None:
            raise ValueError(
                'Grid search did not yield a usable parameter combination.')

        print('-------')
        print(best_combination)
        print('-------')
        return (best_combination[ar_idx], best_combination[ma_idx],
                best_combination[d_idx])
Example #6
0
    def forward_selection(self, max_features, X_train, y_train):
        """
        Select up to ``max_features`` features for regression using forward selection.

        Starting from an empty selection, each round fits one decision tree
        per remaining feature (current selection plus that feature) and adds
        the feature that yields the lowest mse on the training data.
        Candidates are visited in the column order of X_train, so ties are
        always resolved in favour of the left-most column. The round number
        is printed as a progress indication.

        Fewer than ``max_features`` features are returned when X_train has
        fewer columns, or when no remaining feature produces a usable score.

        :param max_features: Maximum number of features to select.
        :param X_train: Training features; its columns are the candidates.
        :param y_train: Target values belonging to X_train.
        :return: Selected features, the features in the order they were
            selected, and the mse obtained when each of them was added.
        """
        ordered_features = []
        ordered_scores = []

        # Start with no features.
        selected_features = []
        ra = RegressionAlgorithms()
        re = RegressionEvaluation()

        # Select the appropriate number of features.
        for i in range(0, max_features):
            print(i)

            # Keep the column order of the frame (instead of a set difference)
            # so the outcome does not depend on hash ordering.
            features_left = [
                c for c in X_train.columns if c not in selected_features
            ]
            if not features_left:
                # Every column has been selected already.
                break

            best_perf = float('inf')
            best_feature = None

            # For all features we can still select...
            for f in features_left:
                temp_selected_features = selected_features + [f]

                # Determine the mse of a decision tree learner if we were to add
                # the feature.
                pred_y_train, _ = ra.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = re.mean_squared_error(y_train, pred_y_train)

                # We aim for a low mse: remember the best candidate so far.
                if perf < best_perf:
                    best_perf = perf
                    best_feature = f

            if best_feature is None:
                # No candidate gave a comparable score (e.g. only NaN); stop
                # rather than record a placeholder as a feature.
                break

            # We select the feature with the best performance.
            selected_features.append(best_feature)
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores
Example #7
0
    def backward_selection(max_features, X_train, y_train):
        """
        Reduce the feature set to the requested size using backward selection.

        Starting from all columns of X_train, the method repeatedly fits a
        decision tree with one feature left out, measures the mse of the
        predictions on the training data, and permanently removes the feature
        whose absence gives the lowest mse.

        :param max_features: Number of features to keep.
        :param X_train: Features as DataFrame.
        :param y_train: True values corresponding to given features.
        :return: The features that remain after the removals.
        """

        remaining = X_train.columns.tolist()
        tree_learner = RegressionAlgorithms()
        scorer = RegressionEvaluation()

        def mse_without(feature):
            # Training mse of a tree fitted on the remaining features minus one
            reduced = copy.deepcopy(remaining)
            reduced.remove(feature)
            train_pred, _test_pred = tree_learner.decision_tree(
                X_train[reduced], y_train, X_train[reduced])
            return scorer.mean_squared_error(y_train, train_pred)

        # Each pass removes exactly one feature
        removals = len(X_train.columns) - max_features
        done = 0
        while done < removals:
            lowest = sys.float_info.max
            least_useful = ''
            for feature in remaining:
                score = mse_without(feature)
                # A strictly lower mse without the feature marks it as the
                # least useful one seen so far
                if score < lowest:
                    lowest = score
                    least_useful = feature
            remaining.remove(least_useful)
            done += 1
        return remaining
    performance_tr_svm_std = 0
    performance_te_nn = 0
    performance_te_nn_std = 0
    performance_te_rf = 0
    performance_te_rf_std = 0
    performance_te_svm = 0
    performance_te_svm_std = 0

    for repeat in range(0, repeats):
        regr_train_y, regr_test_y = learner.feedforward_neural_network(
            selected_train_X, train_y, selected_test_X, gridsearch=True)

        mean_tr, std_tr = eval.mean_squared_error_with_std(
            train_y, regr_train_y)
        mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
        mean_training = eval.mean_squared_error(train_y, regr_train_y)
        performance_tr_nn += mean_tr
        performance_tr_nn_std += std_tr
        performance_te_nn += mean_te
        performance_te_nn_std += std_te

        regr_train_y, regr_test_y = learner.random_forest(selected_train_X,
                                                          train_y,
                                                          selected_test_X,
                                                          gridsearch=True)
        mean_tr, std_tr = eval.mean_squared_error_with_std(
            train_y, regr_train_y)
        mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
        performance_tr_rf += mean_tr
        performance_tr_rf_std += std_tr
        performance_te_rf += mean_te