def test_rerf_categorical_predictions(self):
        """Check RERF hold-out accuracy on the non-hierarchical categorical input space.

        The model is fitted on points produced by
        ``generate_points_nonhierarchical_categorical_quadratic`` and then asked to
        predict a second, freshly generated set.  The test passes when the
        unexplained variance (1 - R^2) on that second set is below 1e-4.
        """
        model = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space']
        )

        # The input space is described as 6 2-d domains, each 5 x 5 units wide.
        # NOTE(review): 400 is simply the count handed to the generator; how the points
        # are spread over the domains is decided inside the generator - confirm there.
        num_train_x = 20 * 20
        train_features_df, train_target_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x)
        model.fit(train_features_df, train_target_df)

        # Fresh points, generated after fitting, serve as the hold-out set.
        num_test_points = 5 * 5
        test_features_df, test_target_df = self.generate_points_nonhierarchical_categorical_quadratic(num_test_points)

        prediction_df = model.predict(test_features_df).get_dataframe()
        predicted_y = prediction_df[Prediction.LegalColumnNames.PREDICTED_VALUE.value].to_numpy()
        y_test = test_target_df.to_numpy().reshape(-1)

        # 1 - R^2 = RSS / TSS, computed on the hold-out set.
        residuals = y_test - predicted_y
        deviations_from_mean = y_test - y_test.mean()
        unexplained_variance = (residuals ** 2).sum() / (deviations_from_mean ** 2).sum()

        test_threshold = 10 ** -4
        print(unexplained_variance, test_threshold)
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
    def test_rerf_predictions(self):
        """Check RERF hold-out accuracy on the 2-d continuous input space.

        Training and test points both come from ``generate_points_simple_quadratic``.
        The test passes when the unexplained variance (1 - R^2) on the test points
        is below 1e-5.
        """
        model = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space']
        )

        # Fit on 51 generated points.
        num_train_points = 51
        train_features_df, train_target_df = self.generate_points_simple_quadratic(
            num_train_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions)
        )
        model.fit(train_features_df, train_target_df)

        # Score on 50 points generated after the fit.
        num_test_points = 50
        test_features_df, test_target_df = self.generate_points_simple_quadratic(
            num_test_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions)
        )
        prediction_df = model.predict(test_features_df).get_dataframe()

        predicted_y = prediction_df[Prediction.LegalColumnNames.PREDICTED_VALUE.value].to_numpy()
        y_test = test_target_df.to_numpy().reshape(-1)

        # 1 - R^2 = RSS / TSS, computed on the hold-out set.
        residuals = y_test - predicted_y
        deviations_from_mean = y_test - y_test.mean()
        unexplained_variance = (residuals ** 2).sum() / (deviations_from_mean ** 2).sum()

        test_threshold = 10 ** -5
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
Exemple #3
0
    def test_predictions(self):
        """Fit RERF on an exact degree-2 polynomial and check the in-sample residuals.

        The target is built from known coefficients, the model is fitted and then
        asked to predict the same training inputs.  The test passes when the sum of
        squared residuals is below 1e-4.
        """
        model = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.input_space,
            output_space=self.output_space)

        # Seeded uniform sample on [0, 5) in every input dimension.
        num_x = 100
        np.random.seed(13)
        sample_shape = [num_x, len(self.input_space.dimensions)]
        x = np.random.uniform(0, 5, sample_shape)
        x_df = pd.DataFrame(x, columns=['x1', 'x2'])

        # y = 1 -3*X_1 -4*X_2 -0.5*X_1**2 -2*X_2**2
        # The 0.0 entry is the coefficient of the X_1*X_2 cross term.
        y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
        poly_terms_x = PolynomialFeatures(degree=2).fit_transform(x)
        y = np.matmul(poly_terms_x, y_coef_true)
        y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

        # The true target has the same polynomial degree the model is fitted with.
        model.fit(x_df, y_df)

        pred_df = model.predict(x_df).get_dataframe()

        # Residuals are measured against the sample-mean column of the prediction.
        sample_mean_col = Prediction.LegalColumnNames.SAMPLE_MEAN.value
        pred_df['residual'] = y - pred_df[sample_mean_col]
        residual_sum_of_squares = np.sum(pred_df['residual']**2, axis=0)

        assert residual_sum_of_squares < 10**-4
    def test_lasso_hierarchical_categorical_predictions(self):
        """Fit RERF on the 'three_level_quadratic' objective and check shapes and accuracy.

        The model is trained on 100 random samples of the objective function's
        parameter space.  The test then verifies the shapes of the fitted
        attributes and requires the unexplained variance (1 - R^2) over 10
        individually predicted random points to be below 1e-4.
        """
        objective_function_config = objective_function_config_store.get_config_by_name(
            'three_level_quadratic')
        objective_function = ObjectiveFunctionFactory.create_objective_function(
            objective_function_config=objective_function_config)

        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=objective_function.parameter_space,
            output_space=objective_function.output_space)

        # fit model with same degree as true y
        num_train_x = 100
        x_train_df = objective_function.parameter_space.random_dataframe(
            num_samples=num_train_x)
        y_train_df = objective_function.evaluate_dataframe(x_train_df)
        rerf.fit(x_train_df, y_train_df)
        num_detected_features = len(rerf.detected_feature_indices_)

        # assertEqual reports both shapes on failure, unlike assertTrue(a == b).
        self.assertEqual(
            rerf.root_model_gradient_coef_.shape,
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertEqual(
            rerf.fit_X_.shape,
            (num_train_x, rerf.polynomial_features_powers_.shape[0]),
            'Design matrix shape is incorrect')
        self.assertEqual(
            rerf.partial_hat_matrix_.shape,
            (num_detected_features, num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertEqual(rerf.polynomial_features_powers_.shape, (28, 8),
                         'PolynomalFeature.power_ shape is incorrect')

        # test predictions
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        num_test_x = 10

        # Each iteration draws a single random point and predicts it on its own,
        # so predict() is exercised with one-row inputs.
        y_test_list = []
        predicted_y_list = []
        for _ in range(num_test_x):
            x_test_df = objective_function.parameter_space.random_dataframe(
                num_samples=1)
            y_test_df = objective_function.evaluate_dataframe(x_test_df)
            y_test_list.append(y_test_df['y'].values[0])

            predictions = rerf.predict(x_test_df)
            pred_df = predictions.get_dataframe()
            predicted_y_list.append(pred_df[predicted_value_col].values[0])

        predicted_y = np.array(predicted_y_list)
        y_test = np.array(y_test_list)

        # 1 - R^2 = RSS / TSS over the individually predicted points.
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        self.assertLess(unexplained_variance, 10**-4,
                        '1 - R^2 larger than expected')
    def test_lasso_categorical_predictions(self):
        """Fit RERF on the non-hierarchical categorical space; check shapes and accuracy.

        After fitting, the shapes of the gradient coefficients, design matrix, hat
        matrix and polynomial powers are compared with values derived from the
        one-hot encoding and the configured polynomial degree.  Finally the
        unexplained variance (1 - R^2) on a fresh test set must be below 1e-4.
        """
        model = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space'])

        # The input space is described as 6 2-d domains, each 5 x 5 units wide.
        # NOTE(review): 400 is simply the count handed to the generator; how the points
        # are spread over the domains is decided inside the generator - confirm there.
        num_train_x = 20 * 20
        train_features_df, train_target_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x)
        model.fit(train_features_df, train_target_df)

        one_hot_column_names = model.one_hot_encoder_adapter.get_one_hot_encoded_column_names()
        num_categorical_levels_expected = len(one_hot_column_names)
        num_continuous_dimensions = 2  # x1 and x2
        final_num_features = num_categorical_levels_expected + num_continuous_dimensions
        polynomial_degree = self.model_config.max_basis_function_degree
        terms_per_categorical_level = self.n_choose_k(
            polynomial_degree + num_continuous_dimensions, num_continuous_dimensions)

        # Expected design-matrix width:
        #  * The one-hot encoder in RERF drops the first level ("level 0"), yet the design
        #    matrix still holds a polynomial fit for it - hence the "+ 1".
        #  * Levels absent from the training set yield all-zero one-hot columns.  RERF
        #    removes those; the list of removed columns is established in RERF.fit() and
        #    reused in RERF.predict(), so their count is subtracted here.
        num_removed_zero_columns = len(model.categorical_zero_cols_idx_to_delete_)
        num_cols_in_design_matrix = (
            terms_per_categorical_level * (num_categorical_levels_expected + 1) - num_removed_zero_columns
        )
        num_detected_features = len(model.detected_feature_indices_)

        expected_design_matrix_shape = (num_train_x, num_cols_in_design_matrix)
        expected_hat_matrix_shape = (num_detected_features, num_detected_features)
        expected_powers_shape = (num_cols_in_design_matrix, final_num_features)

        assert model.root_model_gradient_coef_.shape == model.polynomial_features_powers_.shape, 'Gradient coefficient shape is incorrect'
        assert model.fit_X_.shape == expected_design_matrix_shape, 'Design matrix shape is incorrect'
        assert model.partial_hat_matrix_.shape == expected_hat_matrix_shape, 'Hat matrix shape is incorrect'
        assert model.polynomial_features_powers_.shape == expected_powers_shape, 'PolynomalFeature.power_ shape is incorrect'

        # Fresh points, generated after fitting, serve as the hold-out set.
        num_test_points = 5 * 5
        test_features_df, test_target_df = self.generate_points_nonhierarchical_categorical_quadratic(num_test_points)

        prediction_df = model.predict(test_features_df).get_dataframe()
        predicted_y = prediction_df[Prediction.LegalColumnNames.PREDICTED_VALUE.value].to_numpy()
        y_test = test_target_df.to_numpy().reshape(-1)

        # 1 - R^2 = RSS / TSS, computed on the hold-out set.
        residuals = y_test - predicted_y
        deviations_from_mean = y_test - y_test.mean()
        unexplained_variance = (residuals**2).sum() / (deviations_from_mean**2).sum()

        test_threshold = 10**-4
        print(unexplained_variance, test_threshold)
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
Exemple #6
0
    def test_lasso_hierarchical_categorical_predictions(self):
        """Fit RERF on the 'three_level_quadratic' objective and check shapes and accuracy.

        The model is trained on 600 random samples of the objective function's
        parameter space.  The test then verifies the shapes of the fitted
        attributes and requires the unexplained variance (1 - R^2) on 50 random
        test points to be below 1e-3.
        """
        # Seeds Python's `random` module; presumably this makes the sampling below
        # reproducible - confirm that random_dataframe() draws from it.
        random.seed(11001)
        objective_function_config = objective_function_config_store.get_config_by_name(
            'three_level_quadratic')
        objective_function = ObjectiveFunctionFactory.create_objective_function(
            objective_function_config=objective_function_config)

        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=objective_function.parameter_space,
            output_space=objective_function.output_space)

        # fit model with same degree as true y
        # The input space consists of 3 2-d domains 200 x 200 units.  Hence random samples smaller than a certain size will produce too few points to
        # train reliable models.
        # TODO: Good place to use a non-random training set design
        num_train_x = 600
        x_train_df = objective_function.parameter_space.random_dataframe(
            num_samples=num_train_x)
        y_train_df = objective_function.evaluate_dataframe(x_train_df)
        rerf.fit(x_train_df, y_train_df)
        num_detected_features = len(rerf.detected_feature_indices_)

        # assertEqual reports both shapes on failure, unlike assertTrue(a == b).
        self.assertEqual(
            rerf.root_model_gradient_coef_.shape,
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertEqual(
            rerf.fit_X_.shape,
            (num_train_x, rerf.polynomial_features_powers_.shape[0]),
            'Design matrix shape is incorrect')
        self.assertEqual(
            rerf.partial_hat_matrix_.shape,
            (num_detected_features, num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertEqual(rerf.polynomial_features_powers_.shape, (34, 9),
                         'PolynomalFeature.power_ shape is incorrect')

        # test predictions
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        num_test_x = 50
        x_test_df = objective_function.parameter_space.random_dataframe(
            num_samples=num_test_x)
        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = objective_function.evaluate_dataframe(
            x_test_df).to_numpy().reshape(-1)

        # 1 - R^2 = RSS / TSS, computed on the test points.
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        test_threshold = 10**-3
        self.assertLess(
            unexplained_variance, test_threshold,
            f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
        )
    def test_lasso_categorical_predictions(self):
        """Fit RERF on the non-hierarchical categorical space; check shapes and accuracy.

        The model is trained on 300 generated points.  Expected shapes of the
        fitted attributes are derived from the number of distinct (x0, i0) level
        combinations seen in the training data and the configured polynomial
        degree.  Finally the unexplained variance (1 - R^2) on 50 fresh test
        points must be below 1e-4.
        """
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['categorical_input_space'],
            output_space=self.test_case_globals['output_space'])

        num_train_x = 300
        x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_train_x)
        rerf.fit(x_train_df, y_train_df)

        # Number of level combinations: distinct x0 values times distinct i0 values.
        num_categorical_levels_expected = len(x_train_df['x0'].unique()) * len(
            x_train_df['i0'].unique())
        num_continuous_dimensions = 2  # x1 and x2
        # The "- 1" presumably reflects one categorical level being dropped by the
        # encoding - confirm against the model implementation.
        final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions
        polynomial_degree = self.model_config.max_basis_function_degree
        num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
            polynomial_degree + num_continuous_dimensions,
            num_continuous_dimensions)
        num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected
        num_detected_features = len(rerf.detected_feature_indices_)

        # assertEqual reports both shapes on failure, unlike assertTrue(a == b).
        self.assertEqual(
            rerf.root_model_gradient_coef_.shape,
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertEqual(
            rerf.fit_X_.shape, (num_train_x, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertEqual(
            rerf.partial_hat_matrix_.shape,
            (num_detected_features, num_detected_features),
            'Hat matrix shape is incorrect')
        self.assertEqual(
            rerf.polynomial_features_powers_.shape,
            (num_terms_in_polynomial, final_num_features),
            'PolynomalFeature.power_ shape is incorrect')

        # generate new random to test predictions
        num_test_points = 50
        x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(
            num_test_points)

        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)

        # 1 - R^2 = RSS / TSS, computed on the test points.
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares
        self.assertLess(unexplained_variance, 10**-4,
                        '1 - R^2 larger than expected')
Exemple #8
0
    def test_lasso_predictions(self):
        """Fit RERF on the 2-d continuous input space; check shapes and accuracy.

        The model is trained on 100 points from ``generate_points_simple_quadratic``.
        Expected shapes of the fitted attributes are derived from the configured
        polynomial degree and the two input features.  Finally the unexplained
        variance (1 - R^2) on 50 fresh test points must be below 1e-3.
        """
        rerf = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=self.test_case_globals['2d_X_input_space'],
            output_space=self.test_case_globals['output_space'])

        num_train_points = 100
        x_train_df, y_train_df = self.generate_points_simple_quadratic(
            num_train_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        rerf.fit(x_train_df, y_train_df)

        final_num_features = 2
        polynomial_degree = self.model_config.max_basis_function_degree
        # Expected number of polynomial terms, obtained from the n_choose_k helper.
        num_terms_in_polynomial = self.n_choose_k(
            polynomial_degree + final_num_features, final_num_features)
        num_detected_features = len(rerf.detected_feature_indices_)

        # assertEqual reports both shapes on failure, unlike assertTrue(a == b).
        self.assertEqual(
            rerf.polynomial_features_powers_.shape,
            (num_terms_in_polynomial, final_num_features),
            'PolynomalFeature.power_ shape is incorrect')
        self.assertEqual(
            rerf.root_model_gradient_coef_.shape,
            rerf.polynomial_features_powers_.shape,
            'Gradient coefficient shape is incorrect')
        self.assertEqual(
            rerf.fit_X_.shape, (num_train_points, num_terms_in_polynomial),
            'Design matrix shape is incorrect')
        self.assertEqual(
            rerf.partial_hat_matrix_.shape,
            (num_detected_features, num_detected_features),
            'Hat matrix shape is incorrect')

        # generate new random sample to test predictions
        num_test_points = 50
        x_test_df, y_test_df = self.generate_points_simple_quadratic(
            num_test_points,
            len(self.test_case_globals['2d_X_input_space'].dimensions))
        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()

        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_y = pred_df[predicted_value_col].to_numpy()
        y_test = y_test_df.to_numpy().reshape(-1)

        # 1 - R^2 = RSS / TSS, computed on the test points.
        residual_sum_of_squares = ((y_test - predicted_y)**2).sum()
        total_sum_of_squares = ((y_test - y_test.mean())**2).sum()
        unexplained_variance = residual_sum_of_squares / total_sum_of_squares

        test_threshold = 10**-3
        self.assertLess(
            unexplained_variance, test_threshold,
            f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
        )
    def test_rerf_hierarchical_categorical_predictions(self):
        """Check RERF hold-out accuracy on the 'three_level_quadratic' objective function.

        The model is trained on 300 random samples of the objective function's
        parameter space and scored on 50 further random samples.  The test passes
        when the unexplained variance (1 - R^2) is below 1e-6.
        """
        random.seed(11001)
        config = objective_function_config_store.get_config_by_name('three_level_quadratic')
        objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=config)

        model = RegressionEnhancedRandomForestRegressionModel(
            model_config=self.model_config,
            input_space=objective_function.parameter_space,
            output_space=objective_function.output_space
        )

        # The model is fitted with the same degree as the true y.
        # The input space consists of 3 2-d domains of 200 x 200 units, so random samples below
        # a certain size leave too few points to train reliable models.
        # TODO: Good place to use a non-random training set design
        num_train_x = 300
        train_features_df = objective_function.parameter_space.random_dataframe(num_samples=num_train_x)
        train_target_df = objective_function.evaluate_dataframe(train_features_df)
        model.fit(train_features_df, train_target_df)

        # Hold-out points and their true objective values.
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        num_test_x = 50
        test_features_df = objective_function.parameter_space.random_dataframe(num_samples=num_test_x)
        test_target_df = objective_function.evaluate_dataframe(test_features_df)
        y_test = test_target_df.to_numpy().reshape(-1)

        prediction_df = model.predict(test_features_df).get_dataframe()
        predicted_y = prediction_df[predicted_value_col].to_numpy()

        # 1 - R^2 = RSS / TSS, computed on the hold-out set.
        residuals = y_test - predicted_y
        deviations_from_mean = y_test - y_test.mean()
        unexplained_variance = (residuals ** 2).sum() / (deviations_from_mean ** 2).sum()

        test_threshold = 10**-6
        print(unexplained_variance, test_threshold)
        assert unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'