Ejemplo n.º 1
0
    def test_translating_dataframe_from_categorical_to_discrete_simple_hypergrid(self):
        """Round-trip a random dataframe through CategoricalToDiscreteHypergridAdapter.

        Verifies that:
        * translating with ``in_place=False`` returns a new object whose contents
          differ from the input and whose columns are all numeric,
        * untranslating with ``in_place=False`` restores the original values,
        * calls that omit ``in_place`` return the very object they were given and
          produce the same values as the copying calls.
        """
        adapter = CategoricalToDiscreteHypergridAdapter(adaptee=self.simple_hypergrid)
        original_df = self.simple_hypergrid.random_dataframe(num_samples=10000)
        translated_df = adapter.translate_dataframe(original_df, in_place=False)

        # With in_place=False we must get a different object back, and the translation
        # must actually have changed the contents.
        self.assertIsNot(original_df, translated_df)
        self.assertFalse(original_df.equals(translated_df))

        # Every translated column must be numeric. Coercing a non-numeric value yields a
        # null, so "no nulls after coercion" means "numeric only". We run the check on a
        # copy so that it cannot disturb translated_df, which is reused further down.
        copied_df = translated_df.copy(deep=True)
        for column in copied_df.columns.values.tolist():
            coerced_column = pd.to_numeric(copied_df[column], errors='coerce')
            self.assertTrue(
                coerced_column.notnull().all(),
                msg=f"Translated column '{column}' contains non-numeric values."
            )

        # To make sure the check above is capable of failing, run the same trick on the
        # input, where we know there are non-numeric values.
        copied_original_df = original_df.copy(deep=True)
        coerced_original_column = pd.to_numeric(copied_original_df['categorical_mixed_types'], errors='coerce')
        self.assertFalse(coerced_original_column.notnull().all())

        # Untranslating (again without touching the argument) must restore the original values.
        untranslated_df = adapter.untranslate_dataframe(translated_df, in_place=False)
        self.assertIsNot(original_df, untranslated_df)
        self.assertTrue(original_df.equals(untranslated_df))

        # Let's make sure that translating in place works as expected: the same object
        # comes back, and its contents match the results of the copying calls above.
        translated_in_place_df = adapter.translate_dataframe(original_df)
        self.assertIs(original_df, translated_in_place_df)
        self.assertTrue(translated_in_place_df.equals(translated_df))
        untranslated_in_place_df = adapter.untranslate_dataframe(translated_in_place_df)
        self.assertIs(original_df, untranslated_in_place_df)
        self.assertTrue(untranslated_in_place_df.equals(untranslated_df))
Ejemplo n.º 2
0
    def test_translating_dataframe_from_categorical_hierarchical_to_discrete_flat_hypergrid(self):
        """Round-trip a dataframe through the stacked flat + discrete adapters.

        The hierarchical hypergrid is wrapped in a HierarchicalToFlatHypergridAdapter,
        which is in turn wrapped in a CategoricalToDiscreteHypergridAdapter. The
        resulting adapter must expose no categorical dimensions and no dotted
        dimension names, and translate followed by untranslate must give back the
        original dataframe.
        """
        flattening_adapter = HierarchicalToFlatHypergridAdapter(adaptee=self.hierarchical_hypergrid)
        adapter = CategoricalToDiscreteHypergridAdapter(adaptee=flattening_adapter)

        # No dimension exposed by the stacked adapter may be categorical ...
        exposes_categorical_dimension = any(
            isinstance(dim, CategoricalDimension) for dim in adapter.dimensions
        )
        self.assertFalse(exposes_categorical_dimension)

        # ... and no dimension name may contain a "." separator.
        exposes_dotted_name = any("." in dim.name for dim in adapter.dimensions)
        self.assertFalse(exposes_dotted_name)

        source_df = self.hierarchical_hypergrid.random_dataframe(num_samples=10000)
        forward_df = adapter.translate_dataframe(df=source_df, in_place=False)
        round_tripped_df = adapter.untranslate_dataframe(df=forward_df, in_place=False)
        self.assertTrue(source_df.equals(round_tripped_df))
Ejemplo n.º 3
0
class DecisionTreeRegressionModel(RegressionModel):
    """Regression model wrapping a single ``DecisionTreeRegressor``.

    Input dataframes are translated with a CategoricalToDiscreteHypergridAdapter
    before they reach the regressor. After every fit, the training targets are
    grouped by the leaf they land in, and the per-leaf mean, sample variance,
    variance of the mean and observation count are stored; ``predict`` looks
    these statistics up for each valid input row.

    Possible extensions:
    * have a tree fit a linear model at each leaf.
    """

    _PREDICTOR_OUTPUT_COLUMNS = [
        Prediction.LegalColumnNames.IS_VALID_INPUT,
        Prediction.LegalColumnNames.PREDICTED_VALUE,
        Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_SIZE,
        Prediction.LegalColumnNames.DEGREES_OF_FREEDOM
    ]

    def __init__(self,
                 model_config: DecisionTreeRegressionModelConfig,
                 input_space: Hypergrid,
                 output_space: Hypergrid,
                 logger=None):
        """Set up the adapter, the regressor and the (empty) per-leaf statistics.

        :param model_config: configuration; must be contained in DecisionTreeRegressionModelConfig.
        :param input_space: hypergrid describing the features.
        :param output_space: hypergrid describing the target; must have exactly one dimension.
        :param logger: optional logger; one named "DecisionTreeRegressionModel" is created if omitted.
        """
        if logger is None:
            logger = create_logger("DecisionTreeRegressionModel")
        self.logger = logger

        assert DecisionTreeRegressionModelConfig.contains(model_config)
        RegressionModel.__init__(self,
                                 model_type=type(self),
                                 model_config=model_config,
                                 input_space=input_space,
                                 output_space=output_space)

        self._input_space_adapter = CategoricalToDiscreteHypergridAdapter(
            adaptee=self.input_space)

        # Feature names come from the adapter (i.e. after translation), not from the raw input space.
        self.input_dimension_names = [
            dimension.name
            for dimension in self._input_space_adapter.dimensions
        ]
        self.target_dimension_names = [
            dimension.name for dimension in self.output_space.dimensions
        ]
        self.logger.debug(
            f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}."
        )

        assert len(
            self.target_dimension_names
        ) == 1, "For now (and perhaps forever) we only support single target per tree."

        self._regressor = DecisionTreeRegressor(
            criterion=self.model_config.criterion,
            splitter=self.model_config.splitter,
            max_depth=self.model_config.max_depth_value,
            min_samples_split=self.model_config.min_samples_split,
            min_samples_leaf=self.model_config.min_samples_leaf,
            min_weight_fraction_leaf=self.model_config.min_weight_fraction_leaf,
            max_features=self.model_config.max_features,
            random_state=self.model_config.random_state,
            max_leaf_nodes=self.model_config.max_leaf_nodes_value,
            min_impurity_decrease=self.model_config.min_impurity_decrease,
            ccp_alpha=self.model_config.ccp_alpha)

        # These are used to compute the variance in predictions.
        # All of them are keyed by the leaf node index returned by self._regressor.apply().
        self._observations_per_leaf = dict()
        self._mean_per_leaf = dict()
        self._mean_variance_per_leaf = dict()
        self._sample_variance_per_leaf = dict()
        self._count_per_leaf = dict()

    @property
    def num_observations_used_to_fit(self):
        """Training set size recorded in ``self.fit_state``."""
        return self.fit_state.train_set_size

    def should_fit(self, num_samples):
        """ Returns true if the model should be fitted.

        This model should be fitted under the following conditions:
        1) It has not been fitted yet and num_samples is larger than min_samples_to_fit
        2) The model has been fitted and the number of new samples is at least n_new_samples_before_refit

        :param num_samples: total number of samples currently available.
        :return: True if the model should be (re)fitted, False otherwise.
        """
        if not self.fitted:
            return num_samples > self.model_config.min_samples_to_fit
        num_new_samples = num_samples - self.num_observations_used_to_fit
        return num_new_samples >= self.model_config.n_new_samples_before_refit

    @trace()
    def fit(self, feature_values_pandas_frame, target_values_pandas_frame,
            iteration_number):
        """Fit the tree and recompute the per-leaf statistics used by predict().

        :param feature_values_pandas_frame: dataframe of features; it is translated with the
            input space adapter into a copy, the caller's frame is not modified.
        :param target_values_pandas_frame: dataframe containing the single target column.
        :param iteration_number: stored as ``last_refit_iteration_number``.
        """
        self.logger.debug(
            f"Fitting a {self.__class__.__name__} with {len(feature_values_pandas_frame.index)} observations."
        )

        # Let's get the numpy arrays out of the panda frames
        #
        feature_values_pandas_frame = self._input_space_adapter.translate_dataframe(
            feature_values_pandas_frame, in_place=False)

        feature_values = feature_values_pandas_frame[
            self.input_dimension_names].to_numpy()
        target_values = target_values_pandas_frame[
            self.target_dimension_names].to_numpy()

        self._regressor.fit(feature_values, target_values)

        # Now that we have fit the model we can augment our tree by computing the variance
        # TODO: this code can be easily optimized, but premature optimization is the root of all evil.
        # node_indices holds, for every training sample, the index of the leaf it lands in.
        node_indices = self._regressor.apply(feature_values)

        # Group the training targets by leaf. Everything is built in fresh local dicts and
        # only swapped in at the end, so no statistic keyed by a node index of the
        # previously fitted tree survives a refit.
        observations_per_leaf = dict()
        for node_index, sample_target_value in zip(node_indices,
                                                   target_values):
            observations_per_leaf.setdefault(node_index,
                                             []).append(sample_target_value)
        self.logger.debug(
            f"The resulting tree has {len(observations_per_leaf)} leaf nodes.")

        mean_per_leaf = dict()
        mean_variance_per_leaf = dict()
        sample_variance_per_leaf = dict()
        count_per_leaf = dict()

        # Now let's compute all predictions
        for node_index, observations in observations_per_leaf.items():
            # First convert the observations to a numpy array.
            observations_at_leaf = np.array(observations)
            observations_per_leaf[node_index] = observations_at_leaf
            num_observations_at_leaf = len(observations_at_leaf)

            leaf_mean = np.mean(observations_at_leaf)
            # ddof = delta degrees of freedom. We want sample variance.
            # NOTE: with a single observation at the leaf numpy yields nan here.
            leaf_sample_variance = np.var(observations_at_leaf, ddof=1)
            leaf_mean_variance = leaf_sample_variance / num_observations_at_leaf

            # TODO: note that if we change the tree to fit a linear regression at each leaf, these predictions would have
            # to be computed in the .predict() function, though the slope and y-intersect could be computed here.
            mean_per_leaf[node_index] = leaf_mean
            mean_variance_per_leaf[node_index] = leaf_mean_variance
            sample_variance_per_leaf[node_index] = leaf_sample_variance
            count_per_leaf[node_index] = num_observations_at_leaf

        self._observations_per_leaf = observations_per_leaf
        self._mean_per_leaf = mean_per_leaf
        self._mean_variance_per_leaf = mean_variance_per_leaf
        self._sample_variance_per_leaf = sample_variance_per_leaf
        self._count_per_leaf = count_per_leaf

        self.fitted = True
        self.last_refit_iteration_number = iteration_number

    @trace()
    def predict(self,
                feature_values_pandas_frame,
                include_only_valid_rows=True):
        """Create predictions from the per-leaf statistics computed in fit().

        A row is treated as valid when, after translation, none of its input dimensions is
        null and the corresponding Point belongs to the input space adapter. If the model
        has not been fitted, no row is treated as valid and no prediction is computed.

        :param feature_values_pandas_frame: dataframe of features; it is translated into a
            copy, the caller's frame is not modified.
        :param include_only_valid_rows: if False, invalid rows are added to the result at
            the indices missing from it.
        :return: a Prediction object for the single target dimension.
        """
        self.logger.debug(
            f"Creating predictions for {len(feature_values_pandas_frame.index)} samples."
        )

        # dataframe column shortcuts
        is_valid_input_col = Prediction.LegalColumnNames.IS_VALID_INPUT.value
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_value_var_col = Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE.value
        sample_var_col = Prediction.LegalColumnNames.SAMPLE_VARIANCE.value
        sample_size_col = Prediction.LegalColumnNames.SAMPLE_SIZE.value
        dof_col = Prediction.LegalColumnNames.DEGREES_OF_FREEDOM.value

        valid_rows_index = None
        features_df = None
        if self.fitted:
            features_df = self._input_space_adapter.translate_dataframe(
                feature_values_pandas_frame, in_place=False)
            features_df = features_df[self.input_dimension_names]

            rows_with_no_nulls_index = features_df.index[
                features_df.notnull().all(axis=1)]
            if not rows_with_no_nulls_index.empty:

                def _belongs_to_input_space(row):
                    # Access values by label: integer keys on a string-labelled Series
                    # (row[i]) rely on a positional fallback that pandas has deprecated.
                    point = Point(**{
                        dim_name: row[dim_name]
                        for dim_name in self.input_dimension_names
                    })
                    return point in self._input_space_adapter

                rows_with_no_nulls = features_df.loc[rows_with_no_nulls_index]
                valid_rows_index = rows_with_no_nulls.index[
                    rows_with_no_nulls.apply(_belongs_to_input_space, axis=1)]

        predictions = Prediction(
            objective_name=self.target_dimension_names[0],
            predictor_outputs=self._PREDICTOR_OUTPUT_COLUMNS,
            dataframe_index=valid_rows_index)
        prediction_dataframe = predictions.get_dataframe()

        if valid_rows_index is not None and not valid_rows_index.empty:
            # 'leaf_node_index' is a scratch column used only for the lookups below;
            # it is dropped again before the dataframe is validated.
            prediction_dataframe['leaf_node_index'] = self._regressor.apply(
                features_df.loc[valid_rows_index].to_numpy())
            leaf_node_indices = prediction_dataframe['leaf_node_index']
            prediction_dataframe[predicted_value_col] = leaf_node_indices.map(
                self._mean_per_leaf)
            prediction_dataframe[predicted_value_var_col] = leaf_node_indices.map(
                self._mean_variance_per_leaf)
            prediction_dataframe[sample_var_col] = leaf_node_indices.map(
                self._sample_variance_per_leaf)
            prediction_dataframe[sample_size_col] = leaf_node_indices.map(
                self._count_per_leaf)
            prediction_dataframe[
                dof_col] = prediction_dataframe[sample_size_col] - 1
            prediction_dataframe[is_valid_input_col] = True
            prediction_dataframe.drop(columns=['leaf_node_index'],
                                      inplace=True)

        predictions.validate_dataframe(prediction_dataframe)
        if not include_only_valid_rows:
            predictions.add_invalid_rows_at_missing_indices(
                desired_index=feature_values_pandas_frame.index)
        return predictions