    def test_rerf_predictions(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space']
        )

        num_train_points = 51
        x_train_df, y_train_df = self.generate_points_simple_quadratic(num_train_points, len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_train_df, y_train_df)

        # generate new random sample to test predictions
        num_test_points = 50
        x_test_df, y_test_df = self.generate_points_simple_quadratic(num_test_points, len(self.test_case_globals['2d_X_input_space'].dimensions))
        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)
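        # unexplained variance (RSS / TSS) equals 1 - R^2; values near 0 mean a near-perfect fit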
        residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
        total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares

        test_threshold = 10 ** -5
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
    def test_lasso_coefficients(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.input_space,
            output_space=self.output_space)
        num_x = 1000
        np.random.seed(23)
        x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
        x_df = pd.DataFrame(x, columns=['x1', 'x2'])

        # y = 1 -3*X_1 -4*X_2 -0.5*X_1**2 -2*X_2**2
        y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
        poly_reg = PolynomialFeatures(degree=2)
        poly_terms_x = poly_reg.fit_transform(x)
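        # for 2 features at degree 2, PolynomialFeatures orders columns as [1, x1, x2, x1^2, x1*x2, x2^2]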
        y = np.matmul(poly_terms_x, y_coef_true)
        y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

        # fit model with same degree as true y
        rerf.fit(x_df, y_df)

        # test fit coef match known coef
        epsilon = 10**-2
        expected_non_zero_coef = y_coef_true[np.where(y_coef_true != 0.0)[0]]
        fit_poly_coef = [rerf.base_regressor_.intercept_]
        fit_poly_coef.extend(rerf.base_regressor_.coef_)
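        # the lasso base regressor is assumed to keep only the detected non-zero terms,
        # so intercept_ followed by coef_ lines up element-wise with expected_non_zero_coef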
        incorrect_terms = np.where(
            np.abs(fit_poly_coef - expected_non_zero_coef) > epsilon)[0]
        num_incorrect_terms = len(incorrect_terms)
        assert num_incorrect_terms == 0, 'Estimated polynomial coefficients deviated further than expected from known coefficients'

    def test_rerf_categorical_predictions(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space']
        )

        # input space consists of 6 2-d domains, each 5 x 5 units wide; 20 * 20 = 400 random training points are drawn across them
        num_train_x = 20 * 20
        x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x)
        rerf.fit(x_train_df, y_train_df)

        # generate a new random sample to test predictions
        num_test_points = 5 * 5
        x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(num_test_points)

        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)
        residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
        total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        test_threshold = 10 ** -4
        print(unexplained_variance, test_threshold)
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
    def test_lasso_feature_discovery(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.input_space,
            output_space=self.output_space)
        num_x = 100
        np.random.seed(17)
        x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
        x_df = pd.DataFrame(x, columns=['x1', 'x2'])

        # y = 1 -3*X_1 -4*X_2 -0.5*X_1**2 -2*X_2**2
        y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
        poly_reg = PolynomialFeatures(degree=2)
        poly_terms_x = poly_reg.fit_transform(x)
        y = np.matmul(poly_terms_x, y_coef_true)
        y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

        # fit model with same degree as true y
        rerf.fit(x_df, y_df)

        # test if expected non-zero terms were found
        expected_fit_model_terms = {1, 2, 3, 5}
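        # indices into the PolynomialFeatures term order [1, x1, x2, x1^2, x1*x2, x2^2]:
        # terms 1, 2, 3, 5 have non-zero true coefficients; index 4 (x1*x2) is zero and index 0 is the intercept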
        expected_terms_missing = expected_fit_model_terms - set(
            rerf.detected_feature_indices_)
        num_missing_terms = len(expected_terms_missing)
        assert num_missing_terms == 0, 'Base model failed to find expected features'
    def test_lasso_polynomial_gradient_invariants(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space'])

        num_points = 100
        x_df, y_df = self.generate_points_simple_quadratic(
            num_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_df, y_df)

        final_num_features = 2
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial = self.n_choose_k(
            polynomial_degree + final_num_features, final_num_features)
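        # a degree-d polynomial in n variables has C(n + d, d) monomial terms, intercept included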
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial,
                                                       final_num_features),
            'PolynomialFeatures.powers_ shape is incorrect')
        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')
    def test_polynomial_gradient(self):
        print(self.model_config)
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.input_space,
            output_space=self.output_space)
        num_x = 100
        np.random.seed(13)
        x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
        x_df = pd.DataFrame(x, columns=['x1', 'x2'])

        # y = 1 -3*X_1 -4*X_2 -0.5*X_1**2 -2*X_2**2
        y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
        poly_reg = PolynomialFeatures(degree=2)
        poly_terms_x = poly_reg.fit_transform(x)
        y = np.matmul(poly_terms_x, y_coef_true)
        y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

        # fit model with same degree as true y
        rerf.fit(x_df, y_df)

        # test gradient at X
        epsilon = 10**-2
        true_gradient_coef = np.array([[-3, -0.5 * 2, 0, 0, 0, 0],
                                       [-4, -2.0 * 2, 0, 0, 0,
                                        0]]).transpose()
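        # analytic gradient of y: dy/dx1 = -3 - 1.0*x1, dy/dx2 = -4 - 4.0*x2;
        # each column stores the polynomial coefficients of one partial derivative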
        incorrect_terms = np.where(
            np.abs(true_gradient_coef -
                   rerf.root_model_gradient_coef_) > epsilon)[0]
        num_incorrect_terms = len(incorrect_terms)
        assert num_incorrect_terms == 0, 'Estimated gradient coefficients deviated further than expected from known coefficients'
    def test_predictions(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.input_space,
            output_space=self.output_space)
        num_x = 100
        np.random.seed(13)
        x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
        x_df = pd.DataFrame(x, columns=['x1', 'x2'])

        # y = 1 -3*X_1 -4*X_2 -0.5*X_1**2 -2*X_2**2
        y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
        poly_reg = PolynomialFeatures(degree=2)
        poly_terms_x = poly_reg.fit_transform(x)
        y = np.matmul(poly_terms_x, y_coef_true)
        y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

        # fit model with same degree as true y
        rerf.fit(x_df, y_df)

        predictions = rerf.predict(x_df)
        pred_df = predictions.get_dataframe()

        sample_mean_col = Prediction.LegalColumnNames.SAMPLE_MEAN.value
        pred_df['residual'] = y - pred_df[sample_mean_col]
        residual_sum_of_squares = np.sum(pred_df['residual'] ** 2)

        assert residual_sum_of_squares < 10 ** -4

    def test_lasso_feature_discovery(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space']
        )

        num_points = 100
        x_df, y_df = self.generate_points_simple_quadratic(num_points, len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_df, y_df)

        final_num_features = 2
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial = self.n_choose_k(polynomial_degree + final_num_features, final_num_features)
        num_detected_features = len(rerf.detected_feature_indices_)

        assert rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features), 'PolynomialFeatures.powers_ shape is incorrect'
        assert rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, 'Gradient coefficient shape is incorrect'
        assert rerf.fit_X_.shape == (num_points, num_terms_in_polynomial), 'Design matrix shape is incorrect'
        assert rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), 'Hat matrix shape is incorrect'

        # test if expected non-zero terms were found
        expected_fit_model_terms = {1, 2, 3, 5}
        expected_terms_missing = expected_fit_model_terms - set(rerf.detected_feature_indices_)
        num_missing_terms = len(expected_terms_missing)
        assert num_missing_terms == 0, 'Base model failed to find expected features'

    def test_lasso_hierarchical_categorical_predictions(self):
        objective_function_config = objective_function_config_store.get_config_by_name(
            'three_level_quadratic')
        objective_function = ObjectiveFunctionFactory.create_objective_function(
            objective_function_config=objective_function_config)

        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=objective_function.parameter_space,
            output_space=objective_function.output_space)

        # fit model with same degree as true y
        num_train_x = 100
        x_train_df = objective_function.parameter_space.random_dataframe(
            num_samples=num_train_x)
        y_train_df = objective_function.evaluate_dataframe(x_train_df)
        rerf.fit(x_train_df, y_train_df)
        num_detected_features = len(rerf.detected_feature_indices_)
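        # detected_feature_indices_ lists the polynomial terms the lasso retained with non-zero coefficients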

        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_train_x,
                                  rerf.polynomial_features_powers_.shape[0]),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertTrue(rerf.polynomial_features_powers_.shape == (28, 8),
                        'PolynomialFeatures.powers_ shape is incorrect')

        # test predictions
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        num_test_x = 10

        # predictions are made one point at a time to exercise predict() on single-row dataframes
        y_test_list = []
        predicted_y_list = []
        for _ in range(num_test_x):
            x_test_df = objective_function.parameter_space.random_dataframe(
                num_samples=1)
            y_test_df = objective_function.evaluate_dataframe(x_test_df)
            y_test_list.append(y_test_df['y'].values[0])

            predictions = rerf.predict(x_test_df)
            pred_df = predictions.get_dataframe()
            predicted_y_list.append(pred_df[predicted_value_col].values[0])

        predicted_y = np.array(predicted_y_list)
        y_test = np.array(y_test_list)
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        self.assertTrue(unexplained_variance < 10**-4,
                        '1 - R^2 larger than expected')

    def test_lasso_categorical_predictions(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space'])

        # input space consists of 6 2-d domains, each 5 x 5 units wide; 20 * 20 = 400 random training points are drawn across them
        num_train_x = 20 * 20
        x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_train_x)
        rerf.fit(x_train_df, y_train_df)

        num_categorical_levels_expected = len(
            rerf.one_hot_encoder_adapter.get_one_hot_encoded_column_names())
        num_continuous_dimensions = 2  # x1 and x2
        final_num_features = num_categorical_levels_expected + num_continuous_dimensions
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
            polynomial_degree + num_continuous_dimensions,
            num_continuous_dimensions)
        # 1 is added to num_categorical_levels_expected to account for "level 0": the one hot encoder in RERF drops the first level,
        # while the design matrix still contains a polynomial fit for that level.
        # Since it is possible that not all categorical levels are present in the training set, RERF eliminates the zero columns arising from
        # the OneHotEncoder knowing the missing levels are possible.  The list of dropped columns is established in RERF.fit() and used in the
        # RERF.predict() method.
        num_cols_in_design_matrix = num_terms_in_polynomial_per_categorical_level * (num_categorical_levels_expected + 1)\
                                  - len(rerf.categorical_zero_cols_idx_to_delete_)
        num_detected_features = len(rerf.detected_feature_indices_)

        assert rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, 'Gradient coefficient shape is incorrect'
        assert rerf.fit_X_.shape == (
            num_train_x,
            num_cols_in_design_matrix), 'Design matrix shape is incorrect'
        assert rerf.partial_hat_matrix_.shape == (
            num_detected_features,
            num_detected_features), 'Hat matrix shape is incorrect'
        assert rerf.polynomial_features_powers_.shape == (
            num_cols_in_design_matrix,
            final_num_features), 'PolynomialFeatures.powers_ shape is incorrect'

        # generate a new random sample to test predictions
        num_test_points = 5 * 5
        x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_test_points)

        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        test_threshold = 10**-4
        print(unexplained_variance, test_threshold)
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
    def test_lasso_hierarchical_categorical_predictions(self):
        random.seed(11001)
        objective_function_config = objective_function_config_store.get_config_by_name(
            'three_level_quadratic')
        objective_function = ObjectiveFunctionFactory.create_objective_function(
            objective_function_config=objective_function_config)

        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=objective_function.parameter_space,
            output_space=objective_function.output_space)

        # fit model with same degree as true y
        # The input space consists of 3 2-d domains, each 200 x 200 units.  Hence random samples below a certain size produce too few
        # points per domain to train reliable models.
        # TODO: Good place to use a non-random training set design
        num_train_x = 600
        x_train_df = objective_function.parameter_space.random_dataframe(
            num_samples=num_train_x)
        y_train_df = objective_function.evaluate_dataframe(x_train_df)
        rerf.fit(x_train_df, y_train_df)
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_train_x,
                                  rerf.polynomial_features_powers_.shape[0]),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertTrue(rerf.polynomial_features_powers_.shape == (34, 9),
                        'PolynomialFeatures.powers_ shape is incorrect')

        # test predictions
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        num_test_x = 50
        x_test_df = objective_function.parameter_space.random_dataframe(
            num_samples=num_test_x)
        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = objective_function.evaluate_dataframe(
            x_test_df).to_numpy().reshape(-1)
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        test_threshold = 10**-3
        self.assertTrue(
            unexplained_variance < test_threshold,
            f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
        )

    def test_lasso_categorical_predictions(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space'])

        num_train_x = 300
        x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_train_x)
        rerf.fit(x_train_df, y_train_df)

        num_categorical_levels_expected = len(x_train_df['x0'].unique()) * len(
            x_train_df['i0'].unique())
        num_continuous_dimensions = 2  # x1 and x2
        final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
            polynomial_degree + num_continuous_dimensions,
            num_continuous_dimensions)
        num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected
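        # the design matrix contains one full polynomial expansion per categorical level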
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_train_x, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertTrue(
            rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial,
                                                       final_num_features),
            'PolynomialFeatures.powers_ shape is incorrect')

        # generate a new random sample to test predictions
        num_test_points = 50
        x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_test_points)

        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        self.assertTrue(unexplained_variance < 10**-4,
                        '1 - R^2 larger than expected')
    def test_lasso_predictions(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space'])

        num_train_points = 100
        x_train_df, y_train_df = self.generate_points_simple_quadratic(
            num_train_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_train_df, y_train_df)

        final_num_features = 2
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial = self.n_choose_k(
            polynomial_degree + final_num_features, final_num_features)
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial,
                                                       final_num_features),
            'PolynomialFeatures.powers_ shape is incorrect')
        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_train_points, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')

        # generate new random sample to test predictions
        num_test_points = 50
        x_test_df, y_test_df = self.generate_points_simple_quadratic(
            num_test_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares

        test_threshold = 10**-3
        self.assertTrue(
            unexplained_variance < test_threshold,
            f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
        )

    def test_lasso_polynomial_gradient(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space'])

        np.random.seed(13)
        num_points = 100
        x_df, y_df = self.generate_points_simple_quadratic(
            num_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_df, y_df)

        final_num_features = 2
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial = self.n_choose_k(
            polynomial_degree + final_num_features, final_num_features)
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial,
                                                       final_num_features),
            'PolynomialFeatures.powers_ shape is incorrect')
        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')

        # test gradient at X
        epsilon = 10**-2
        true_gradient_coef = np.array([[-3, -0.5 * 2, 0, 0, 0, 0],
                                       [-4, -2.0 * 2, 0, 0, 0,
                                        0]]).transpose()
        incorrect_terms = np.where(
            np.abs(true_gradient_coef -
                   rerf.root_model_gradient_coef_) > epsilon)[0]
        num_incorrect_terms = len(incorrect_terms)
        self.assertTrue(
            num_incorrect_terms == 0,
            'Estimated gradient coefficients deviated further than expected from known coefficients'
        )

    def test_lasso_polynomial_coefficients(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space'])

        np.random.seed(23)
        num_points = 1000
        x_df, y_df = self.generate_points_simple_quadratic(
            num_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_df, y_df)

        final_num_features = 2
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial = self.n_choose_k(
            polynomial_degree + final_num_features, final_num_features)
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial,
                                                       final_num_features),
            'PolynomialFeatures.powers_ shape is incorrect')
        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')

        # test fit coef match known coef
        y_coef_true = self.get_simple_quadratic_coefficients()
        epsilon = 10**-2
        expected_non_zero_coef = y_coef_true[np.where(y_coef_true != 0.0)[0]]
        fit_poly_coef = [rerf.base_regressor_.intercept_]
        fit_poly_coef.extend(rerf.base_regressor_.coef_)
        incorrect_terms = np.where(
            np.abs(fit_poly_coef - expected_non_zero_coef) > epsilon)[0]
        num_incorrect_terms = len(incorrect_terms)
        self.assertTrue(
            num_incorrect_terms == 0,
            'Estimated polynomial coefficients deviated further than expected from known coefficients'
        )

    def __init__(self,
                 model_config: Point,
                 input_space: Hypergrid,
                 output_space: Hypergrid,
                 logger: logging.Logger = None):
        NaiveMultiObjectiveRegressionModel.__init__(
            self,
            model_type=RegressionEnhancedRandomForestRegressionModel,
            model_config=model_config,
            input_space=input_space,
            output_space=output_space,
            logger=logger)

        # We just need to assert that the model config belongs in regression_enhanced_random_forest_config_store.parameter_space.
        # A more elaborate solution might be needed down the road, but for now this simple solution should suffice.
        #
        assert model_config in regression_enhanced_random_forest_config_store.parameter_space

        for output_dimension in output_space.dimensions:
            # We copy the model_config (rather than sharing it across objectives) because perform_initial_random_forest_hyper_parameter_search
            # is set to False after the initial fit() call, so subsequent fit() calls don't pay the cost of the embedded hyper parameter search.
            rerf_model = RegressionEnhancedRandomForestRegressionModel(
                model_config=model_config.copy(),
                input_space=input_space,
                output_space=SimpleHypergrid(
                    name=f"{output_dimension.name}_objective",
                    dimensions=[output_dimension]),
                logger=self.logger)
            self._regressors_by_objective_name[output_dimension.name] = rerf_model

    def test_rerf_hierarchical_categorical_predictions(self):
        random.seed(11001)
        objective_function_config = objective_function_config_store.get_config_by_name('three_level_quadratic')
        objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=objective_function_config)

        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=objective_function.parameter_space,
            output_space=objective_function.output_space
        )

        # fit model with same degree as true y
        # The input space consists of 3 2-d domains, each 200 x 200 units.  Hence random samples below a certain size produce too few
        # points per domain to train reliable models.
        # TODO: Good place to use a non-random training set design
        num_train_x = 300
        x_train_df = objective_function.parameter_space.random_dataframe(num_samples=num_train_x)
        y_train_df = objective_function.evaluate_dataframe(x_train_df)
        rerf.fit(x_train_df, y_train_df)

        # test predictions
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        num_test_x = 50
        x_test_df = objective_function.parameter_space.random_dataframe(num_samples=num_test_x)
        y_test = objective_function.evaluate_dataframe(x_test_df).to_numpy().reshape(-1)

        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()
        predicted_y = pred_df[predicted_value_col].to_numpy()

        residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
        total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        test_threshold = 10**-6
        print(unexplained_variance, test_threshold)
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
    def test_lasso_categorical_gradient(self):
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space'])
        np.random.seed(19)

        num_points = 300
        x_df, y_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_points)
        rerf.fit(x_df, y_df)

        num_categorical_levels_expected = len(x_df['x0'].unique()) * len(
            x_df['i0'].unique())
        num_continuous_dimensions = 2  # x1 and x2
        final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
            polynomial_degree + num_continuous_dimensions,
            num_continuous_dimensions)
        num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected
        num_detected_features = len(rerf.detected_feature_indices_)

        self.assertTrue(
            rerf.root_model_gradient_coef_.shape ==
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertTrue(
            rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertTrue(
            rerf.partial_hat_matrix_.shape == (num_detected_features,
                                               num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertTrue(
            rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial,
                                                       final_num_features),
            'PolynomialFeatures.powers_ shape is incorrect')

        # test gradient coefficients against the analytic gradients of the known categorical quadratic
        true_gradient_coef = np.zeros((36, 7))
        true_gradient_coef[0] = np.array([3, 7, 0, 10, 10, 15, 25])
        true_gradient_coef[1] = np.array([12, -11, 0, -11, -11, -3, -3])
        true_gradient_coef[11] = np.array([12, 12, 0, 12, 12, -7, -7])
        true_gradient_coef[13] = np.array([-3, -11, 0, 0, 0, 2, 2])
        true_gradient_coef[15] = np.array([4, 12, 0, 0, 0, 3, 3])
        true_gradient_coef[17] = np.array([-3, -7, 0, 0, 0, 0, 0])
        true_gradient_coef[19] = np.array([4, 6, 0, 0, 0, 0, 0])
        true_gradient_coef[21] = np.array([0, -7, 0, 0, 0, 0, 0])
        true_gradient_coef[23] = np.array([0, 6, 0, 0, 0, 0, 0])

        epsilon = 10**-2
        estimated_gradient_coef = rerf.root_model_gradient_coef_
        coef_abs_diff = np.abs(true_gradient_coef - estimated_gradient_coef)
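        # where a true coefficient is 0, an exact-zero estimate gives 0/0 = nan relative error,
        # and nan > epsilon evaluates to False, so correctly zeroed terms are not flagged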
        coef_abs_relative_error = np.divide(coef_abs_diff,
                                            np.abs(true_gradient_coef))
        incorrect_terms = np.where(coef_abs_relative_error > epsilon)[0]
        num_incorrect_terms = len(incorrect_terms)

        self.assertTrue(
            num_incorrect_terms == 0,
            'Estimated gradient coefficients deviated further than expected from known coefficients'
        )