Example #1
0
    def test_no_unselected_feature(self, get_iris_data):
        """
        Check that, with ``split_unselected_feat=False``, transforming
        "sepal width (cm)" returns values equal to the input column.
        """
        ground_truth = get_iris_data[["sepal width (cm)"]]
        dt_multi = DecisionTreeSupervisedDiscretiserMethod(
            tree_params={
                "max_depth": 3,
                "random_state": 2020
            },
            mode="multi",
            split_unselected_feat=False,
        )
        tree_discretiser = dt_multi.fit(
            feat_names=[
                "sepal length (cm)",
                "sepal width (cm)",
                "petal length (cm)",
                "petal width (cm)",
            ],
            dataframe=get_iris_data,
            target_continuous=False,
            target="target",
        )
        output = tree_discretiser.transform(get_iris_data[["sepal width (cm)"
                                                           ]])

        # NOTE: the builtin ``all(df_a == df_b)`` iterates over the column
        # labels of the boolean frame (non-empty strings, always truthy), so
        # it can never fail. Compare the underlying values instead.
        assert output.shape == ground_truth.shape
        assert (ground_truth.values == output.values).all()
Example #2
0
    def test_single_categorical(self, categorical_data):
        """
        Fit a depth-2 tree discretiser in "single" mode on
        "petal width (cm)" with a categorical target and compare the
        transformed values against pre-computed bins.
        """
        frame = categorical_data.copy(deep=True)
        feature = "petal width (cm)"

        # expected bins were generated by manually fitting a decision tree
        # and extracting its thresholds
        expected = np.array([
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
            [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1],
            [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1],
            [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        ])

        fitted = DecisionTreeSupervisedDiscretiserMethod(
            tree_params={"max_depth": 2},
            mode="single",
        ).fit(
            feat_names=[feature],
            dataframe=frame,
            target_continuous=False,
            target="target",
        )

        # lay the single transformed column out in rows of 15 so it lines
        # up with the expected grid
        actual = fitted.transform(frame[[feature]]).values.reshape(-1, 15)
        assert (expected == actual).all()
Example #3
0
    def test_transform_shuffled_indices(self, get_iris_data):
        """
        Fit and transform on a 50-row random sample (non-sequential index)
        and check that the output has no nulls and matches the
        pre-computed bins.
        """
        iris = get_iris_data.copy(deep=True)
        feature = "sepal length (cm)"

        expected = np.array([
            [0, 0, 1, 0, 1],
            [0, 0, 0, 0, 0],
            [0, 1, 1, 2, 1],
            [2, 1, 1, 0, 1],
            [1, 2, 1, 0, 2],
            [2, 2, 2, 0, 0],
            [1, 1, 0, 2, 2],
            [1, 2, 0, 2, 2],
            [2, 2, 0, 1, 2],
            [1, 1, 2, 1, 1],
        ])

        # sampling leaves the rows with their original, now shuffled, index
        shuffled = iris[[feature, "target"]].sample(50, random_state=2021)

        discretiser = DecisionTreeSupervisedDiscretiserMethod(
            tree_params={"max_depth": 2},
            mode="single",
        )
        discretiser.fit(
            feat_names=[feature],
            dataframe=shuffled,
            target_continuous=False,
            target="target",
        )
        transformed = discretiser.transform(shuffled[[feature]])

        null_count = transformed.isnull().values.sum()
        assert null_count == 0
        assert (expected == transformed.values.reshape(10, 5)).all()
Example #4
0
    def test_multi_fit(self, get_iris_data):
        """
        Fit a depth-3 tree discretiser in "multi" mode on all four iris
        features and compare the transformed petal length and petal width
        columns against pre-computed bins.
        """
        # expected bins were generated by manually fitting a decision tree
        # and extracting its thresholds
        expected_by_feature = {
            "petal length (cm)": np.array([
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                [0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2],
                [2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
                [2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2],
                [2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
            ]),
            "petal width (cm)": np.array([
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
                [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1],
                [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1],
                [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
            ]),
        }

        iris = get_iris_data.copy(deep=True)

        fitted = DecisionTreeSupervisedDiscretiserMethod(
            tree_params={"max_depth": 3, "random_state": 2020},
            mode="multi",
        ).fit(
            feat_names=[
                "sepal length (cm)",
                "sepal width (cm)",
                "petal length (cm)",
                "petal width (cm)",
            ],
            dataframe=iris,
            target_continuous=False,
            target="target",
        )

        # transform each checked feature on its own, petal length first
        actual_by_feature = {
            feature: fitted.transform(iris[[feature]]).values
            for feature in expected_by_feature
        }
        for feature, expected in expected_by_feature.items():
            actual = actual_by_feature[feature].reshape(-1, 15)
            assert (expected == actual).all()
Example #5
0
    def test_single_continuous(self, continuous_data):
        """
        Fit a depth-2 tree discretiser in "single" mode on feature "s6"
        with a continuous target and compare the transformed values
        against pre-computed bins.
        """
        frame = continuous_data.copy(deep=True)
        feature = "s6"

        fitted = DecisionTreeSupervisedDiscretiserMethod(
            tree_params={"max_depth": 2},
            mode="single",
        ).fit(
            feat_names=[feature],
            dataframe=frame,
            target_continuous=True,
            target="target",
        )
        actual = fitted.transform(frame[[feature]]).values

        # expected bins were generated by manually fitting a decision tree
        # and extracting its thresholds
        expected = np.array([
            [1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0],
            [1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 3, 0, 0],
            [1, 2, 0, 2, 1, 0, 1, 1, 0, 1, 1, 1, 1],
            [1, 2, 1, 0, 2, 1, 0, 0, 0, 1, 1, 1, 1],
            [1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1],
            [1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1],
            [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1],
            [1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 1, 1],
            [0, 1, 0, 1, 2, 2, 1, 0, 1, 2, 1, 1, 1],
            [3, 2, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 2],
            [2, 0, 1, 1, 0, 2, 1, 1, 2, 1, 1, 3, 1],
            [0, 1, 0, 1, 2, 1, 0, 1, 1, 2, 1, 1, 2],
            [0, 1, 0, 1, 0, 2, 1, 2, 1, 1, 0, 2, 3],
            [1, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 0],
            [1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1],
            [1, 1, 2, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2],
            [2, 1, 0, 1, 1, 1, 0, 2, 2, 2, 1, 1, 0],
            [0, 1, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 0],
            [2, 2, 1, 1, 1, 2, 1, 2, 0, 0, 1, 0, 0],
            [0, 2, 1, 2, 2, 1, 2, 2, 1, 1, 0, 2, 1],
            [1, 1, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1, 0],
            [2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1],
            [1, 1, 1, 0, 2, 0, 0, 1, 0, 1, 0, 0, 1],
            [1, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 1, 1],
            [1, 1, 0, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1],
            [2, 1, 1, 1, 2, 1, 1, 2, 1, 0, 0, 2, 1],
            [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3],
            [0, 0, 1, 2, 1, 0, 0, 0, 2, 2, 1, 2, 1],
            [2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1],
            [0, 0, 1, 1, 0, 2, 1, 0, 1, 1, 0, 0, 0],
            [2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 1, 2, 2],
            [2, 1, 1, 0, 1, 2, 1, 0, 1, 2, 1, 1, 1],
            [2, 1, 1, 1, 2, 1, 1, 0, 1, 0, 2, 1, 3],
            [0, 1, 1, 2, 1, 1, 0, 0, 1, 2, 1, 1, 1],
        ])

        # lay the single transformed column out in rows of 13 so it lines
        # up with the expected grid
        assert (expected == actual.reshape(-1, 13)).all()
Example #6
0
    def test_transform_all_multi(self, get_iris_data):
        """
        Fit a depth-3 "multi" mode discretiser on all four iris features
        and transform the whole dataframe at once.

        The sepal columns are asserted to equal their input values; the
        petal columns are compared against pre-computed bins.
        """
        iris = get_iris_data.copy(deep=True)

        # reference the sepal columns before fitting; the output is
        # asserted to equal them
        expected_sepal_length = iris["sepal length (cm)"]
        expected_sepal_width = iris["sepal width (cm)"]

        expected_petal_length = np.array([
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
            [0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2],
            [2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
            [2, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2],
            [2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        ])
        expected_petal_width = np.array([
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
            [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1],
            [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1],
            [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        ])

        fitted = DecisionTreeSupervisedDiscretiserMethod(
            mode="multi",
            tree_params={"max_depth": 3, "random_state": 2020},
        ).fit(
            feat_names=[
                "sepal length (cm)",
                "sepal width (cm)",
                "petal length (cm)",
                "petal width (cm)",
            ],
            dataframe=iris,
            target="target",
            target_continuous=False,
        )
        transformed = fitted.transform(iris)

        actual_sepal_length = transformed["sepal length (cm)"].values
        actual_sepal_width = transformed["sepal width (cm)"].values
        actual_petal_length = transformed["petal length (cm)"].values
        actual_petal_width = transformed["petal width (cm)"].values

        assert (actual_sepal_length == expected_sepal_length).all()
        assert (actual_sepal_width == expected_sepal_width).all()
        assert (actual_petal_length.reshape(-1, 15) ==
                expected_petal_length).all()
        assert (actual_petal_width.reshape(-1, 15) ==
                expected_petal_width).all()
Example #7
0
    def test_keep_unselected_feature(self, get_iris_data):
        """
        With ``split_unselected_feat=True`` in "multi" mode, compare the
        transformed "sepal width (cm)" column against pre-computed bins.
        """
        feature = "sepal width (cm)"

        fitted = DecisionTreeSupervisedDiscretiserMethod(
            split_unselected_feat=True,
            mode="multi",
            tree_params={"max_depth": 3, "random_state": 2020},
        ).fit(
            feat_names=[
                "sepal length (cm)",
                "sepal width (cm)",
                "petal length (cm)",
                "petal width (cm)",
            ],
            dataframe=get_iris_data,
            target_continuous=False,
            target="target",
        )
        actual = fitted.transform(get_iris_data[[feature]]).values

        # expected bins were generated by manually fitting a decision tree
        # and extracting its thresholds
        expected = np.array([
            [4, 2, 3, 3, 4, 6, 4, 4, 1, 3, 4, 4, 2, 2, 6],
            [6, 6, 4, 5, 5, 4, 4, 4, 3, 4, 2, 4, 4, 4, 3],
            [3, 4, 6, 6, 3, 3, 4, 4, 2, 4, 4, 0, 3, 4, 5],
            [2, 5, 3, 4, 3, 3, 3, 3, 0, 1, 1, 3, 0, 1, 1],
            [0, 2, 0, 1, 1, 3, 2, 1, 0, 1, 3, 1, 1, 1, 1],
            [2, 1, 2, 1, 1, 0, 0, 1, 1, 2, 4, 3, 0, 2, 1],
            [1, 2, 1, 0, 1, 2, 1, 1, 1, 1, 3, 1, 2, 1, 2],
            [2, 1, 1, 1, 4, 3, 1, 2, 1, 1, 3, 2, 5, 1, 0],
            [3, 1, 1, 1, 3, 3, 1, 2, 1, 2, 1, 5, 1, 1, 1],
            [2, 4, 3, 2, 3, 3, 3, 1, 3, 3, 2, 1, 2, 4, 2],
        ])

        assert (expected == actual.reshape(-1, 15)).all()
Example #8
0
    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.

        Every column listed in `discretiser_alg` is replaced by its
        discretised version. Except for the unsupervised "fixed" method
        (which is applied without fitting), the discretiser is fitted on
        `self._discretise_data` before it is used to transform `X`.

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised version of the input dataframe (a copy; the
            dataframe passed in is left untouched)

        Raises:
            ValueError: if a column is mapped to an algorithm other than
                "unsupervised", "tree" or "mdlp"
        """

        # work on a copy so the caller's dataframe is not mutated
        X = X.copy()

        for col, algorithm in self.discretiser_alg.items():

            if algorithm == "unsupervised":
                col_kwargs = self.discretiser_kwargs[col]
                discretiser = Discretiser(**col_kwargs)

                # every method other than "fixed" is fitted on the stored
                # data first
                if col_kwargs["method"] != "fixed":
                    discretiser = discretiser.fit(
                        self._discretise_data[col].values)

                X[col] = discretiser.transform(X[col].values)
                continue

            if algorithm == "tree":
                discretiser = DecisionTreeSupervisedDiscretiserMethod(
                    mode="single",
                    tree_params=self.discretiser_kwargs[col])

            elif algorithm == "mdlp":
                discretiser = MDLPSupervisedDiscretiserMethod(
                    self.discretiser_kwargs[col])

            else:
                # Without this guard an unknown algorithm would either hit
                # an unbound `discretiser` or silently reuse the one built
                # for the previous column.
                raise ValueError(
                    f"Unsupported discretiser algorithm {algorithm!r} "
                    f"for column {col!r}")

            # supervised discretisers are fitted one feature at a time
            # against the stored target column
            discretiser.fit(
                dataframe=self._discretise_data,
                feat_names=[col],
                target=self._target_name,
                target_continuous=False,
            )

            X[col] = discretiser.transform(X[[col]])

        return X
Example #9
0
 def test_default_args(self):
     """
     A discretiser built with no arguments reports ``max_depth == 2``
     under ``tree_params`` in ``get_params()``.
     """
     default_params = DecisionTreeSupervisedDiscretiserMethod().get_params()
     max_depth = default_params["tree_params"]["max_depth"]
     assert max_depth == 2
Example #10
0
 def test_invalid_mode(self):
     """
     Constructing the discretiser with ``mode="invalid"`` is expected to
     raise ``KeyError``.
     """
     bad_kwargs = {"tree_params": {"max_depth": 2}, "mode": "invalid"}
     with pytest.raises(KeyError):
         DecisionTreeSupervisedDiscretiserMethod(**bad_kwargs)