def test_rerf_predictions(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['2d_X_input_space'],
        output_space=self.test_case_globals['output_space']
    )
    num_train_points = 51
    x_train_df, y_train_df = self.generate_points_simple_quadratic(
        num_train_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    rerf.fit(x_train_df, y_train_df)

    # generate new random sample to test predictions
    num_test_points = 50
    x_test_df, y_test_df = self.generate_points_simple_quadratic(
        num_test_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()

    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    predicted_y = pred_df[predicted_value_col].to_numpy()
    y_test = y_test_df.to_numpy().reshape(-1)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    test_threshold = 10 ** -5
    assert unexplained_variance < test_threshold, \
        f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
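# A minimal sketch of the generate_points_simple_quadratic helper the fixture-based tests above
# rely on (the real helper is defined elsewhere on this test class). It samples random inputs and
# evaluates the known quadratic y = 1 - 3*x1 - 4*x2 - 0.5*x1**2 - 2*x2**2 recovered by the lasso
# tests below; the uniform [0, 5] sampling range, the '_sketch_' name, and the y column name are
# assumptions for illustration.
@staticmethod
def _sketch_generate_points_simple_quadratic(num_points, num_dimensions):
    x = np.random.uniform(0, 5, [num_points, num_dimensions])
    x_df = pd.DataFrame(x, columns=[f'x{i + 1}' for i in range(num_dimensions)])
    # coefficient order follows sklearn's PolynomialFeatures(degree=2): [1, x1, x2, x1^2, x1*x2, x2^2]
    y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
    poly_terms_x = PolynomialFeatures(degree=2).fit_transform(x)
    y_df = pd.DataFrame(np.matmul(poly_terms_x, y_coef_true), columns=['degree2_polynomial_y'])
    return x_df, y_df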
def test_lasso_coefficients(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.input_space,
        output_space=self.output_space)
    num_x = 1000
    np.random.seed(23)
    x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
    x_df = pd.DataFrame(x, columns=['x1', 'x2'])

    # y = 1 - 3*x1 - 4*x2 - 0.5*x1**2 - 2*x2**2
    y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
    poly_reg = PolynomialFeatures(degree=2)
    poly_terms_x = poly_reg.fit_transform(x)
    y = np.matmul(poly_terms_x, y_coef_true)
    y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

    # fit model with the same degree as the true y
    rerf.fit(x_df, y_df)

    # test that the fitted coefficients match the known coefficients
    epsilon = 10 ** -2
    expected_non_zero_coef = y_coef_true[np.where(y_coef_true != 0.0)[0]]
    fit_poly_coef = [rerf.base_regressor_.intercept_]
    fit_poly_coef.extend(rerf.base_regressor_.coef_)
    incorrect_terms = np.where(np.abs(fit_poly_coef - expected_non_zero_coef) > epsilon)[0]
    num_incorrect_terms = len(incorrect_terms)
    assert num_incorrect_terms == 0
def test_rerf_categorical_predictions(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['categorical_input_space'],
        output_space=self.test_case_globals['output_space']
    )

    # The input space consists of 6 two-dimensional domains, each 5 x 5 units wide.
    num_train_x = 20 * 20
    x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x)
    rerf.fit(x_train_df, y_train_df)

    # generate a new random sample to test predictions
    num_test_points = 5 * 5
    x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(num_test_points)
    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()

    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    predicted_y = pred_df[predicted_value_col].to_numpy()
    y_test = y_test_df.to_numpy().reshape(-1)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    test_threshold = 10 ** -4
    print(unexplained_variance, test_threshold)
    assert unexplained_variance < test_threshold, \
        f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
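# A purely illustrative sketch of what generate_points_nonhierarchical_categorical_quadratic is
# assumed to produce: two categorical dimensions ('x0', 'i0') whose 6 level combinations form the
# "6 two-dimensional domains" noted above, plus continuous x1, x2 on a 5 x 5 range. The level
# counts, offsets, quadratic, and y column name below are hypothetical, not the actual generator.
@staticmethod
def _sketch_generate_points_nonhierarchical_categorical_quadratic(num_points):
    x0 = np.random.choice(['a', 'b'], num_points)  # assumed 2 levels
    i0 = np.random.choice([0, 1, 2], num_points)   # assumed 3 levels -> 6 combinations
    x1 = np.random.uniform(0, 5, num_points)
    x2 = np.random.uniform(0, 5, num_points)
    x_df = pd.DataFrame({'x0': x0, 'i0': i0, 'x1': x1, 'x2': x2})
    # hypothetical per-level intercept shift on top of a shared quadratic
    level_offset = (x0 == 'b') * 10 + i0 * 5
    y = level_offset + 1 - 3 * x1 - 4 * x2 - 0.5 * x1 ** 2 - 2 * x2 ** 2
    return x_df, pd.DataFrame({'y': y})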
def test_lasso_feature_discovery(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.input_space,
        output_space=self.output_space)
    num_x = 100
    np.random.seed(17)
    x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
    x_df = pd.DataFrame(x, columns=['x1', 'x2'])

    # y = 1 - 3*x1 - 4*x2 - 0.5*x1**2 - 2*x2**2
    y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
    poly_reg = PolynomialFeatures(degree=2)
    poly_terms_x = poly_reg.fit_transform(x)
    y = np.matmul(poly_terms_x, y_coef_true)
    y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

    # fit model with the same degree as the true y
    rerf.fit(x_df, y_df)

    # test whether the expected non-zero terms were found
    expected_fit_model_terms = {1, 2, 3, 5}
    # set difference: expected terms the model failed to detect
    expected_symm_diff_found = expected_fit_model_terms - set(rerf.detected_feature_indices_)
    num_diffs = len(expected_symm_diff_found)
    assert num_diffs == 0
def test_lasso_polynomial_gradient_invariants(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['2d_X_input_space'],
        output_space=self.test_case_globals['output_space'])
    num_points = 100
    x_df, y_df = self.generate_points_simple_quadratic(
        num_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    rerf.fit(x_df, y_df)

    final_num_features = 2
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial = self.n_choose_k(polynomial_degree + final_num_features, final_num_features)
    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features),
        'PolynomialFeatures.powers_ shape is incorrect')
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')
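# The shape invariants above rely on a counting fact: a full polynomial of degree d in k variables
# has C(d + k, k) monomials, e.g. degree 2 in 2 variables gives C(4, 2) = 6 terms:
# 1, x1, x2, x1^2, x1*x2, x2^2. A minimal sketch mirroring the n_choose_k helper used above,
# under the assumption that it computes a plain binomial coefficient:
@staticmethod
def _sketch_n_choose_k(n, k):
    from math import comb  # Python 3.8+
    return comb(n, k)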
def test_polynomial_gradient(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.input_space,
        output_space=self.output_space)
    num_x = 100
    np.random.seed(13)
    x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
    x_df = pd.DataFrame(x, columns=['x1', 'x2'])

    # y = 1 - 3*x1 - 4*x2 - 0.5*x1**2 - 2*x2**2
    y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
    poly_reg = PolynomialFeatures(degree=2)
    poly_terms_x = poly_reg.fit_transform(x)
    y = np.matmul(poly_terms_x, y_coef_true)
    y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

    # fit model with the same degree as the true y
    rerf.fit(x_df, y_df)

    # test the gradient at X: dy/dx1 = -3 - 1.0*x1, dy/dx2 = -4 - 4.0*x2,
    # expressed in the polynomial term basis [1, x1, x2, x1^2, x1*x2, x2^2]
    epsilon = 10 ** -2
    true_gradient_coef = np.array([
        [-3, -0.5 * 2, 0, 0, 0, 0],
        [-4, -2.0 * 2, 0, 0, 0, 0]
    ]).transpose()
    incorrect_terms = np.where(np.abs(true_gradient_coef - rerf.root_model_gradient_coef_) > epsilon)[0]
    num_incorrect_terms = len(incorrect_terms)
    assert num_incorrect_terms == 0
def test_predictions(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.input_space,
        output_space=self.output_space)
    num_x = 100
    np.random.seed(13)
    x = np.random.uniform(0, 5, [num_x, len(self.input_space.dimensions)])
    x_df = pd.DataFrame(x, columns=['x1', 'x2'])

    # y = 1 - 3*x1 - 4*x2 - 0.5*x1**2 - 2*x2**2
    y_coef_true = np.array([1, -3, -4, -0.5, 0.0, -2.0])
    poly_reg = PolynomialFeatures(degree=2)
    poly_terms_x = poly_reg.fit_transform(x)
    y = np.matmul(poly_terms_x, y_coef_true)
    y_df = pd.DataFrame(y, columns=['degree2_polynomial_y'])

    # fit model with the same degree as the true y
    rerf.fit(x_df, y_df)

    predictions = rerf.predict(x_df)
    pred_df = predictions.get_dataframe()

    sample_mean_col = Prediction.LegalColumnNames.SAMPLE_MEAN.value
    pred_df['residual'] = y - pred_df[sample_mean_col]
    # the asserted quantity is the residual sum of squares on the training data, not R^2
    residual_sum_of_squares = np.sum(pred_df['residual'] ** 2, axis=0)
    assert residual_sum_of_squares < 10 ** -4
def test_lasso_feature_discovery(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['2d_X_input_space'],
        output_space=self.test_case_globals['output_space']
    )
    num_points = 100
    x_df, y_df = self.generate_points_simple_quadratic(
        num_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    rerf.fit(x_df, y_df)

    final_num_features = 2
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial = self.n_choose_k(polynomial_degree + final_num_features, final_num_features)
    num_detected_features = len(rerf.detected_feature_indices_)
    assert rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features), \
        'PolynomialFeatures.powers_ shape is incorrect'
    assert rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, \
        'Gradient coefficient shape is incorrect'
    assert rerf.fit_X_.shape == (num_points, num_terms_in_polynomial), \
        'Design matrix shape is incorrect'
    assert rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), \
        'Hat matrix shape is incorrect'

    # test whether the expected non-zero terms were found
    expected_fit_model_terms = {1, 2, 3, 5}
    # set difference: expected terms the model failed to detect
    expected_symm_diff_found = expected_fit_model_terms - set(rerf.detected_feature_indices_)
    num_diffs = len(expected_symm_diff_found)
    assert num_diffs == 0, 'Base model failed to find expected features'
def test_lasso_hierarchical_categorical_predictions(self):
    objective_function_config = objective_function_config_store.get_config_by_name('three_level_quadratic')
    objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=objective_function_config)
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=objective_function.parameter_space,
        output_space=objective_function.output_space)

    # fit model with the same degree as the true y
    num_train_x = 100
    x_train_df = objective_function.parameter_space.random_dataframe(num_samples=num_train_x)
    y_train_df = objective_function.evaluate_dataframe(x_train_df)
    rerf.fit(x_train_df, y_train_df)

    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_train_x, rerf.polynomial_features_powers_.shape[0]),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (28, 8),
        'PolynomialFeatures.powers_ shape is incorrect')

    # test predictions
    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    num_test_x = 10

    # predictions are requested one X row at a time so the single-point predict() path is exercised
    y_test_list = []
    predicted_y_list = []
    for _ in range(num_test_x):
        x_test_df = objective_function.parameter_space.random_dataframe(num_samples=1)
        y_test_df = objective_function.evaluate_dataframe(x_test_df)
        y_test_list.append(y_test_df['y'].values[0])

        predictions = rerf.predict(x_test_df)
        pred_df = predictions.get_dataframe()
        predicted_y_list.append(pred_df[predicted_value_col].values[0])

    predicted_y = np.array(predicted_y_list)
    y_test = np.array(y_test_list)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    self.assertTrue(unexplained_variance < 10 ** -4, '1 - R^2 larger than expected')
def test_lasso_categorical_predictions(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['categorical_input_space'],
        output_space=self.test_case_globals['output_space'])

    # The input space consists of 6 two-dimensional domains, each 5 x 5 units wide.
    num_train_x = 20 * 20
    x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x)
    rerf.fit(x_train_df, y_train_df)

    num_categorical_levels_expected = len(rerf.one_hot_encoder_adapter.get_one_hot_encoded_column_names())
    num_continuous_dimensions = 2  # x1 and x2
    final_num_features = num_categorical_levels_expected + num_continuous_dimensions
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
        polynomial_degree + num_continuous_dimensions, num_continuous_dimensions)

    # 1 is added to num_categorical_levels_expected to account for "level 0": the one-hot encoder
    # in RERF drops the first level, while the design matrix still contains a polynomial fit for
    # that level. Since not all categorical levels may be present in the training set, RERF
    # eliminates the zero columns arising from the OneHotEncoder knowing the missing levels are
    # possible. The list of dropped columns is established in RERF.fit() and used in RERF.predict().
    num_cols_in_design_matrix = (
        num_terms_in_polynomial_per_categorical_level * (num_categorical_levels_expected + 1)
        - len(rerf.categorical_zero_cols_idx_to_delete_)
    )
    num_detected_features = len(rerf.detected_feature_indices_)
    assert rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, \
        'Gradient coefficient shape is incorrect'
    assert rerf.fit_X_.shape == (num_train_x, num_cols_in_design_matrix), \
        'Design matrix shape is incorrect'
    assert rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), \
        'Hat matrix shape is incorrect'
    assert rerf.polynomial_features_powers_.shape == (num_cols_in_design_matrix, final_num_features), \
        'PolynomialFeatures.powers_ shape is incorrect'

    # generate a new random sample to test predictions
    num_test_points = 5 * 5
    x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(num_test_points)
    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()

    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    predicted_y = pred_df[predicted_value_col].to_numpy()
    y_test = y_test_df.to_numpy().reshape(-1)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    test_threshold = 10 ** -4
    print(unexplained_variance, test_threshold)
    assert unexplained_variance < test_threshold, \
        f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
def test_lasso_hierarchical_categorical_predictions(self):
    random.seed(11001)
    objective_function_config = objective_function_config_store.get_config_by_name('three_level_quadratic')
    objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=objective_function_config)
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=objective_function.parameter_space,
        output_space=objective_function.output_space)

    # fit model with the same degree as the true y.
    # The input space consists of 3 two-dimensional domains, each 200 x 200 units wide, so random
    # samples smaller than a certain size produce too few points to train reliable models.
    # TODO: Good place to use a non-random training set design
    num_train_x = 600
    x_train_df = objective_function.parameter_space.random_dataframe(num_samples=num_train_x)
    y_train_df = objective_function.evaluate_dataframe(x_train_df)
    rerf.fit(x_train_df, y_train_df)

    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_train_x, rerf.polynomial_features_powers_.shape[0]),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (34, 9),
        'PolynomialFeatures.powers_ shape is incorrect')

    # test predictions
    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    num_test_x = 50
    x_test_df = objective_function.parameter_space.random_dataframe(num_samples=num_test_x)
    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()

    predicted_y = pred_df[predicted_value_col].to_numpy()
    y_test = objective_function.evaluate_dataframe(x_test_df).to_numpy().reshape(-1)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    test_threshold = 10 ** -3
    self.assertTrue(
        unexplained_variance < test_threshold,
        f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})')
def test_lasso_categorical_predictions(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['categorical_input_space'],
        output_space=self.test_case_globals['output_space'])
    num_train_x = 300
    x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x)
    rerf.fit(x_train_df, y_train_df)

    num_categorical_levels_expected = len(x_train_df['x0'].unique()) * len(x_train_df['i0'].unique())
    num_continuous_dimensions = 2  # x1 and x2
    final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
        polynomial_degree + num_continuous_dimensions, num_continuous_dimensions)
    num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected
    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_train_x, num_terms_in_polynomial),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features),
        'PolynomialFeatures.powers_ shape is incorrect')

    # generate a new random sample to test predictions
    num_test_points = 50
    x_test_df, y_test_df = self.generate_points_nonhierarchical_categorical_quadratic(num_test_points)
    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()

    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    predicted_y = pred_df[predicted_value_col].to_numpy()
    y_test = y_test_df.to_numpy().reshape(-1)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    self.assertTrue(unexplained_variance < 10 ** -4, '1 - R^2 larger than expected')
def test_lasso_predictions(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['2d_X_input_space'],
        output_space=self.test_case_globals['output_space'])
    num_train_points = 100
    x_train_df, y_train_df = self.generate_points_simple_quadratic(
        num_train_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    rerf.fit(x_train_df, y_train_df)

    final_num_features = 2
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial = self.n_choose_k(polynomial_degree + final_num_features, final_num_features)
    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features),
        'PolynomialFeatures.powers_ shape is incorrect')
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_train_points, num_terms_in_polynomial),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')

    # generate a new random sample to test predictions
    num_test_points = 50
    x_test_df, y_test_df = self.generate_points_simple_quadratic(
        num_test_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()

    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    predicted_y = pred_df[predicted_value_col].to_numpy()
    y_test = y_test_df.to_numpy().reshape(-1)
    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    test_threshold = 10 ** -3
    self.assertTrue(
        unexplained_variance < test_threshold,
        f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})')
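# Several tests in this file recompute the asserted statistic inline; for reference, the quantity
# being bounded is the unexplained variance 1 - R^2 = RSS / TSS. A hypothetical shared helper
# (not present in the original class) would look like:
@staticmethod
def _unexplained_variance(y_true, y_pred):
    residual_sum_of_squares = ((y_true - y_pred) ** 2).sum()
    total_sum_of_squares = ((y_true - y_true.mean()) ** 2).sum()
    return residual_sum_of_squares / total_sum_of_squares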
def test_lasso_polynomial_gradient(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['2d_X_input_space'],
        output_space=self.test_case_globals['output_space'])
    np.random.seed(13)
    num_points = 100
    x_df, y_df = self.generate_points_simple_quadratic(
        num_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    rerf.fit(x_df, y_df)

    final_num_features = 2
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial = self.n_choose_k(polynomial_degree + final_num_features, final_num_features)
    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features),
        'PolynomialFeatures.powers_ shape is incorrect')
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')

    # test the gradient at X: dy/dx1 = -3 - 1.0*x1, dy/dx2 = -4 - 4.0*x2,
    # expressed in the polynomial term basis [1, x1, x2, x1^2, x1*x2, x2^2]
    epsilon = 10 ** -2
    true_gradient_coef = np.array([
        [-3, -0.5 * 2, 0, 0, 0, 0],
        [-4, -2.0 * 2, 0, 0, 0, 0]
    ]).transpose()
    incorrect_terms = np.where(np.abs(true_gradient_coef - rerf.root_model_gradient_coef_) > epsilon)[0]
    num_incorrect_terms = len(incorrect_terms)
    self.assertTrue(
        num_incorrect_terms == 0,
        'Estimated gradient coefficients deviated further than expected from known coefficients')
def test_lasso_polynomial_coefficients(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['2d_X_input_space'],
        output_space=self.test_case_globals['output_space'])
    np.random.seed(23)
    num_points = 1000
    x_df, y_df = self.generate_points_simple_quadratic(
        num_points,
        len(self.test_case_globals['2d_X_input_space'].dimensions))
    rerf.fit(x_df, y_df)

    final_num_features = 2
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial = self.n_choose_k(polynomial_degree + final_num_features, final_num_features)
    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features),
        'PolynomialFeatures.powers_ shape is incorrect')
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')

    # test that the fitted coefficients match the known coefficients
    y_coef_true = self.get_simple_quadratic_coefficients()
    epsilon = 10 ** -2
    expected_non_zero_coef = y_coef_true[np.where(y_coef_true != 0.0)[0]]
    fit_poly_coef = [rerf.base_regressor_.intercept_]
    fit_poly_coef.extend(rerf.base_regressor_.coef_)
    incorrect_terms = np.where(np.abs(fit_poly_coef - expected_non_zero_coef) > epsilon)[0]
    num_incorrect_terms = len(incorrect_terms)
    self.assertTrue(
        num_incorrect_terms == 0,
        'Estimated polynomial coefficients deviated further than expected from known coefficients')
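# get_simple_quadratic_coefficients (used above) is assumed to return the generating coefficients
# of the test quadratic in PolynomialFeatures(degree=2) term order [1, x1, x2, x1^2, x1*x2, x2^2],
# matching the y_coef_true arrays in the non-fixture tests; a one-line sketch under that assumption:
@staticmethod
def _sketch_get_simple_quadratic_coefficients():
    return np.array([1, -3, -4, -0.5, 0.0, -2.0])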
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger: logging.Logger = None):
    NaiveMultiObjectiveRegressionModel.__init__(
        self,
        model_type=RegressionEnhancedRandomForestRegressionModel,
        model_config=model_config,
        input_space=input_space,
        output_space=output_space,
        logger=logger)

    # We just need to assert that the model config belongs in regression_enhanced_random_forest_config_store.parameter_space.
    # A more elaborate solution might be needed down the road, but for now this simple solution should suffice.
    # assert model_config in regression_enhanced_random_forest_config_store.parameter_space

    for output_dimension in output_space.dimensions:
        # We copy the model_config rather than share it across objectives because
        # perform_initial_random_forest_hyper_parameter_search is set to False after the initial
        # fit() call, so that subsequent fit() calls don't pay the cost of the embedded
        # hyperparameter search.
        rerf_model = RegressionEnhancedRandomForestRegressionModel(
            model_config=model_config.copy(),
            input_space=input_space,
            output_space=SimpleHypergrid(
                name=f"{output_dimension.name}_objective",
                dimensions=[output_dimension]
            ),
            logger=self.logger
        )
        self._regressors_by_objective_name[output_dimension.name] = rerf_model
def test_rerf_hierarchical_categorical_predictions(self):
    random.seed(11001)
    objective_function_config = objective_function_config_store.get_config_by_name('three_level_quadratic')
    objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=objective_function_config)
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=objective_function.parameter_space,
        output_space=objective_function.output_space
    )

    # fit model with the same degree as the true y.
    # The input space consists of 3 two-dimensional domains, each 200 x 200 units wide, so random
    # samples smaller than a certain size produce too few points to train reliable models.
    # TODO: Good place to use a non-random training set design
    num_train_x = 300
    x_train_df = objective_function.parameter_space.random_dataframe(num_samples=num_train_x)
    y_train_df = objective_function.evaluate_dataframe(x_train_df)
    rerf.fit(x_train_df, y_train_df)

    # test predictions
    predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
    num_test_x = 50
    x_test_df = objective_function.parameter_space.random_dataframe(num_samples=num_test_x)
    y_test = objective_function.evaluate_dataframe(x_test_df).to_numpy().reshape(-1)

    predictions = rerf.predict(x_test_df)
    pred_df = predictions.get_dataframe()
    predicted_y = pred_df[predicted_value_col].to_numpy()

    residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum()
    total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum()
    unexplained_variance = residual_sum_of_squares / total_sum_of_squares
    test_threshold = 10 ** -6
    print(unexplained_variance, test_threshold)
    assert unexplained_variance < test_threshold, \
        f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})'
def test_lasso_categorical_gradient(self):
    rerf = RegressionEnhancedRandomForestRegressionModel(
        model_config=self.model_config,
        input_space=self.test_case_globals['categorical_input_space'],
        output_space=self.test_case_globals['output_space'])
    np.random.seed(19)
    num_points = 300
    x_df, y_df = self.generate_points_nonhierarchical_categorical_quadratic(num_points)
    rerf.fit(x_df, y_df)

    num_categorical_levels_expected = len(x_df['x0'].unique()) * len(x_df['i0'].unique())
    num_continuous_dimensions = 2  # x1 and x2
    final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions
    polynomial_degree = self.model_config.max_basis_function_degree
    num_terms_in_polynomial_per_categorical_level = self.n_choose_k(
        polynomial_degree + num_continuous_dimensions, num_continuous_dimensions)
    num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected
    num_detected_features = len(rerf.detected_feature_indices_)
    self.assertTrue(
        rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape,
        'Gradient coefficient shape is incorrect')
    self.assertTrue(
        rerf.fit_X_.shape == (num_points, num_terms_in_polynomial),
        'Design matrix shape is incorrect')
    self.assertTrue(
        rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features),
        'Hat matrix shape is incorrect')
    self.assertTrue(
        rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features),
        'PolynomialFeatures.powers_ shape is incorrect')

    # test gradient coefficients
    true_gradient_coef = np.zeros((36, 7))
    true_gradient_coef[0] = np.array([3, 7, 0, 10, 10, 15, 25])
    true_gradient_coef[1] = np.array([12, -11, 0, -11, -11, -3, -3])
    true_gradient_coef[11] = np.array([12, 12, 0, 12, 12, -7, -7])
    true_gradient_coef[13] = np.array([-3, -11, 0, 0, 0, 2, 2])
    true_gradient_coef[15] = np.array([4, 12, 0, 0, 0, 3, 3])
    true_gradient_coef[17] = np.array([-3, -7, 0, 0, 0, 0, 0])
    true_gradient_coef[19] = np.array([4, 6, 0, 0, 0, 0, 0])
    true_gradient_coef[21] = np.array([0, -7, 0, 0, 0, 0, 0])
    true_gradient_coef[23] = np.array([0, 6, 0, 0, 0, 0, 0])

    epsilon = 10 ** -2
    estimated_gradient_coef = rerf.root_model_gradient_coef_
    coef_abs_diff = np.abs(true_gradient_coef - estimated_gradient_coef)
    # NOTE: true_gradient_coef contains zeros, so the relative error is inf where the estimate
    # drifts from a true zero (correctly flagged below) and nan for exact 0/0 matches (which the
    # > epsilon comparison treats as correct)
    coef_abs_relative_error = np.divide(coef_abs_diff, np.abs(true_gradient_coef))
    incorrect_terms = np.where(coef_abs_relative_error > epsilon)[0]
    num_incorrect_terms = len(incorrect_terms)
    self.assertTrue(
        num_incorrect_terms == 0,
        'Estimated gradient coefficients deviated further than expected from known coefficients')