def test_translating_dataframe_from_categorical_to_discrete_simple_hypergrid(self):
    adapter = CategoricalToDiscreteHypergridAdapter(adaptee=self.simple_hypergrid)
    original_df = self.simple_hypergrid.random_dataframe(num_samples=10000)
    translated_df = adapter.translate_dataframe(original_df, in_place=False)

    # Let's make sure we have a deep copy.
    #
    self.assertTrue(id(original_df) != id(translated_df))  # Make sure that a deep copy was made.
    self.assertFalse(original_df.equals(translated_df))

    # TODO: assert translated df only has numbers
    # Let's copy the translated_df before testing if all is numeric - the test might change the data.
    #
    copied_df = translated_df.copy(deep=True)
    columns = copied_df.columns.values.tolist()
    for column in columns:
        # For each column let's validate that it contains only numerics. We'll do this by coercing all values to numerics.
        # If such coercion fails, it produces a null value, so we can validate that there are no nulls in the output.
        self.assertTrue(pd.to_numeric(copied_df[column], errors='coerce').notnull().all())

    # To make sure the check above is capable of failing, let's try the same trick on the input, where we know there are non-numeric values.
    #
    copied_original_df = original_df.copy(deep=True)
    self.assertFalse(pd.to_numeric(copied_original_df['categorical_mixed_types'], errors='coerce').notnull().all())

    untranslated_df = adapter.untranslate_dataframe(translated_df, in_place=False)
    self.assertTrue(id(original_df) != id(untranslated_df))
    self.assertTrue(original_df.equals(untranslated_df))

    # Let's make sure that translating in place works as expected.
    #
    translated_in_place_df = adapter.translate_dataframe(original_df)
    self.assertTrue(id(original_df) == id(translated_in_place_df))
    self.assertTrue(translated_in_place_df.equals(translated_df))
    untranslated_in_place_df = adapter.untranslate_dataframe(translated_in_place_df)
    self.assertTrue(id(original_df) == id(untranslated_in_place_df))
    self.assertTrue(untranslated_in_place_df.equals(untranslated_df))
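# A tiny, self-contained sketch (the helper name and values below are made up for
# illustration) of why the coercion trick above works: pd.to_numeric(..., errors='coerce')
# turns every non-numeric entry into NaN, so "no nulls after coercion" is equivalent to
# "every value is numeric".
def _sketch_numeric_coercion_check():
    import pandas as pd
    mixed = pd.Series([1, 2.5, "not_a_number"])
    assert pd.to_numeric(mixed, errors='coerce').isnull().any()  # the string became NaN
    assert pd.to_numeric(pd.Series([1, 2, 3]), errors='coerce').notnull().all()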
def test_translating_dataframe_from_categorical_hierarchical_to_discrete_flat_hypergrid(self):
    adapter = CategoricalToDiscreteHypergridAdapter(
        adaptee=HierarchicalToFlatHypergridAdapter(
            adaptee=self.hierarchical_hypergrid
        )
    )
    self.assertFalse(any(isinstance(dimension, CategoricalDimension) for dimension in adapter.dimensions))
    self.assertFalse(any("." in dimension.name for dimension in adapter.dimensions))

    original_df = self.hierarchical_hypergrid.random_dataframe(num_samples=10000)
    translated_df = adapter.translate_dataframe(df=original_df, in_place=False)
    untranslated_df = adapter.untranslate_dataframe(df=translated_df, in_place=False)
    self.assertTrue(original_df.equals(untranslated_df))
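# Illustrative sketch only - this is not how CategoricalToDiscreteHypergridAdapter is
# implemented, and the helper name below is made up. The core idea behind translating a
# categorical column to a discrete one is mapping each category to an integer code and
# back again, which is what makes the translate/untranslate round trips above lossless.
def _sketch_categorical_to_discrete_round_trip():
    import pandas as pd
    original = pd.Series(["low", "high", "medium", "high"])
    codes, categories = pd.factorize(original)   # categories -> integer codes
    restored = pd.Series(categories[codes])      # integer codes -> original categories
    assert original.equals(restored)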
class DecisionTreeRegressionModel(RegressionModel):
    """ Possible extensions:

        * have a tree fit a linear model at each leaf.
    """

    _PREDICTOR_OUTPUT_COLUMNS = [
        Prediction.LegalColumnNames.IS_VALID_INPUT,
        Prediction.LegalColumnNames.PREDICTED_VALUE,
        Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_SIZE,
        Prediction.LegalColumnNames.DEGREES_OF_FREEDOM
    ]

    def __init__(
            self,
            model_config: DecisionTreeRegressionModelConfig,
            input_space: Hypergrid,
            output_space: Hypergrid,
            logger=None
    ):
        if logger is None:
            logger = create_logger("DecisionTreeRegressionModel")
        self.logger = logger

        assert DecisionTreeRegressionModelConfig.contains(model_config)
        RegressionModel.__init__(
            self,
            model_type=type(self),
            model_config=model_config,
            input_space=input_space,
            output_space=output_space
        )

        self._input_space_adapter = CategoricalToDiscreteHypergridAdapter(adaptee=self.input_space)

        self.input_dimension_names = [dimension.name for dimension in self._input_space_adapter.dimensions]
        self.target_dimension_names = [dimension.name for dimension in self.output_space.dimensions]
        self.logger.debug(f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.")

        assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support a single target per tree."

        self._regressor = DecisionTreeRegressor(
            criterion=self.model_config.criterion,
            splitter=self.model_config.splitter,
            max_depth=self.model_config.max_depth_value,
            min_samples_split=self.model_config.min_samples_split,
            min_samples_leaf=self.model_config.min_samples_leaf,
            min_weight_fraction_leaf=self.model_config.min_weight_fraction_leaf,
            max_features=self.model_config.max_features,
            random_state=self.model_config.random_state,
            max_leaf_nodes=self.model_config.max_leaf_nodes_value,
            min_impurity_decrease=self.model_config.min_impurity_decrease,
            ccp_alpha=self.model_config.ccp_alpha
        )

        # These are used to compute the variance in predictions.
        self._observations_per_leaf = dict()
        self._mean_per_leaf = dict()
        self._mean_variance_per_leaf = dict()
        self._sample_variance_per_leaf = dict()
        self._count_per_leaf = dict()

    @property
    def num_observations_used_to_fit(self):
        return self.fit_state.train_set_size

    def should_fit(self, num_samples):
        """ Returns true if the model should be fitted.

        This model should be fitted under the following conditions:
        1) It has not been fitted yet and num_samples is larger than min_samples_to_fit.
        2) The model has been fitted and the number of new samples is larger than n_new_samples_before_refit.

        :param num_samples:
        :return:
        """
        if not self.fitted:
            return num_samples > self.model_config.min_samples_to_fit
        num_new_samples = num_samples - self.num_observations_used_to_fit
        return num_new_samples >= self.model_config.n_new_samples_before_refit

    @trace()
    def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration_number):
        self.logger.debug(f"Fitting a {self.__class__.__name__} with {len(feature_values_pandas_frame.index)} observations.")
        # Let's get the numpy arrays out of the pandas frames.
        #
        feature_values_pandas_frame = self._input_space_adapter.translate_dataframe(feature_values_pandas_frame, in_place=False)
        feature_values = feature_values_pandas_frame[self.input_dimension_names].to_numpy()
        target_values = target_values_pandas_frame[self.target_dimension_names].to_numpy()

        # Clean up state before fitting again.
        self._observations_per_leaf = dict()

        self._regressor.fit(feature_values, target_values)

        # Now that we have fit the model we can augment our tree by computing the variance.
        # TODO: this code can be easily optimized, but premature optimization is the root of all evil.
        # (A standalone sketch of this per-leaf bookkeeping appears after this class.)
        node_indices = self._regressor.apply(feature_values)
        self.logger.debug(f"The resulting tree placed {len(node_indices)} observations into {len(np.unique(node_indices))} leaf nodes.")

        for node_index, sample_target_value in zip(node_indices, target_values):
            observations_at_leaf = self._observations_per_leaf.get(node_index, [])
            observations_at_leaf.append(sample_target_value)
            self._observations_per_leaf[node_index] = observations_at_leaf

        # Now let's compute all predictions.
        for node_index in self._observations_per_leaf:
            # First convert the observations to a numpy array.
            observations_at_leaf = np.array(self._observations_per_leaf[node_index])
            self._observations_per_leaf[node_index] = observations_at_leaf

            leaf_mean = np.mean(observations_at_leaf)
            leaf_sample_variance = np.var(observations_at_leaf, ddof=1)  # ddof = delta degrees of freedom. We want the sample variance.
            leaf_mean_variance = leaf_sample_variance / len(observations_at_leaf)

            # TODO: note that if we change the tree to fit a linear regression at each leaf, these predictions would have
            # to be computed in the .predict() function, though the slope and y-intercept could be computed here.

            self._mean_per_leaf[node_index] = leaf_mean
            self._mean_variance_per_leaf[node_index] = leaf_mean_variance
            self._sample_variance_per_leaf[node_index] = leaf_sample_variance
            self._count_per_leaf[node_index] = len(observations_at_leaf)

        self.fitted = True
        self.last_refit_iteration_number = iteration_number

    @trace()
    def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
        self.logger.debug(f"Creating predictions for {len(feature_values_pandas_frame.index)} samples.")
        # dataframe column shortcuts
        is_valid_input_col = Prediction.LegalColumnNames.IS_VALID_INPUT.value
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_value_var_col = Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE.value
        sample_var_col = Prediction.LegalColumnNames.SAMPLE_VARIANCE.value
        sample_size_col = Prediction.LegalColumnNames.SAMPLE_SIZE.value
        dof_col = Prediction.LegalColumnNames.DEGREES_OF_FREEDOM.value

        valid_rows_index = None
        features_df = None
        if self.fitted:
            features_df = self._input_space_adapter.translate_dataframe(feature_values_pandas_frame, in_place=False)
            features_df = features_df[self.input_dimension_names]

            rows_with_no_nulls_index = features_df.index[features_df.notnull().all(axis=1)]
            if not rows_with_no_nulls_index.empty:
                valid_rows_index = features_df.loc[rows_with_no_nulls_index].index[
                    features_df.loc[rows_with_no_nulls_index].apply(
                        lambda row: Point(**{dim_name: row[i] for i, dim_name in enumerate(self.input_dimension_names)}) in self._input_space_adapter,
                        axis=1
                    )
                ]

        predictions = Prediction(
            objective_name=self.target_dimension_names[0],
            predictor_outputs=self._PREDICTOR_OUTPUT_COLUMNS,
            dataframe_index=valid_rows_index
        )
        prediction_dataframe = predictions.get_dataframe()

        if valid_rows_index is not None and not valid_rows_index.empty:
            prediction_dataframe['leaf_node_index'] = self._regressor.apply(features_df.loc[valid_rows_index].to_numpy())
            prediction_dataframe[predicted_value_col] = prediction_dataframe['leaf_node_index'].map(self._mean_per_leaf)
            prediction_dataframe[predicted_value_var_col] = prediction_dataframe['leaf_node_index'].map(self._mean_variance_per_leaf)
            prediction_dataframe[sample_var_col] = prediction_dataframe['leaf_node_index'].map(self._sample_variance_per_leaf)
            prediction_dataframe[sample_size_col] = prediction_dataframe['leaf_node_index'].map(self._count_per_leaf)
            prediction_dataframe[dof_col] = prediction_dataframe[sample_size_col] - 1
            prediction_dataframe[is_valid_input_col] = True
            prediction_dataframe.drop(columns=['leaf_node_index'], inplace=True)

        predictions.validate_dataframe(prediction_dataframe)

        if not include_only_valid_rows:
            predictions.add_invalid_rows_at_missing_indices(desired_index=feature_values_pandas_frame.index)
        return predictions
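# Standalone sketch (not part of this class; the function name and data below are
# illustrative) of the per-leaf bookkeeping used in fit() and predict() above:
# sklearn's DecisionTreeRegressor.apply() returns the leaf index for every sample, and
# grouping the training targets by leaf yields the mean, sample variance, variance of
# the mean, and count that back the prediction columns.
def _sketch_per_leaf_variance():
    import numpy as np
    from sklearn.tree import DecisionTreeRegressor

    rng = np.random.default_rng(seed=0)
    features = rng.uniform(size=(100, 2))
    targets = features[:, 0] + 0.1 * rng.normal(size=100)

    tree = DecisionTreeRegressor(max_depth=3, min_samples_leaf=5, random_state=0).fit(features, targets)
    leaf_ids = tree.apply(features)                         # leaf index for every training sample

    stats_per_leaf = {}
    for leaf in np.unique(leaf_ids):
        leaf_targets = targets[leaf_ids == leaf]
        sample_variance = leaf_targets.var(ddof=1)          # unbiased sample variance
        stats_per_leaf[leaf] = dict(
            mean=leaf_targets.mean(),
            sample_variance=sample_variance,
            mean_variance=sample_variance / len(leaf_targets),  # variance of the mean estimate
            count=len(leaf_targets),
        )

    # Predicting for new points then reduces to a lookup of the cached leaf statistics.
    new_leaf_ids = tree.apply(rng.uniform(size=(5, 2)))
    return [stats_per_leaf[leaf]["mean"] for leaf in new_leaf_ids]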