Esempio n. 1
0
class ValidationStep:
    """Facade that selects a validator from ``params`` and forwards queries to it.

    Only the "kfold" validation type is wired up; any other value is
    rejected at construction time.
    """

    def __init__(self, params):
        """Create the validator selected by ``params["validation_type"]``.

        Args:
            params: mapping of settings. "validation_type" defaults to
                "kfold" when absent; the whole mapping is handed to the
                validator's constructor.

        Raises:
            Exception: if the validation type is anything other than "kfold".
        """
        # k-fold is the default validation technique
        self.validation_type = params.get("validation_type", "kfold")

        # "split" and "with_dataset" validators were sketched here at some
        # point but are not implemented, so only "kfold" is accepted.
        if self.validation_type != "kfold":
            raise Exception("Other validation types are not implemented yet!")

        self.validator = KFoldValidator(params)

    def get_split(self, k):
        """Return whatever the underlying validator reports for split ``k``."""
        selected = self.validator.get_split(k)
        return selected

    def split(self):
        """Delegate to the underlying validator's ``split()``."""
        result = self.validator.split()
        return result

    def get_n_splits(self):
        """Return the split count reported by the underlying validator."""
        count = self.validator.get_n_splits()
        return count
    def test_repeats(self):
        """Check a repeated 2-fold validator built from dumped 4-row data.

        The validator must report the configured fold and repeat counts,
        and every (repeat, fold) split must contain 2 training rows and
        2 validation rows for both X and y.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

        X_path = os.path.join(self._results_path, "X.data")
        y_path = os.path.join(self._results_path, "y.data")

        # Persist the fixture so the validator can read it back from disk paths.
        dump_data(X_path, features)
        dump_data(y_path, target)

        params = {
            "shuffle": True,
            "stratify": False,
            "k_folds": 2,
            "repeats": 10,
            "results_path": self._results_path,
            "X_path": X_path,
            "y_path": y_path,
            "random_seed": 1,
        }
        vl = KFoldValidator(params)

        self.assertEqual(params["k_folds"], vl.get_n_splits())
        self.assertEqual(params["repeats"], vl.get_repeats())

        for repeat in range(vl.get_repeats()):
            for k_fold in range(vl.get_n_splits()):
                train_part, validation_part = vl.get_split(k_fold, repeat)

                pieces = (
                    train_part.get("X"),
                    train_part.get("y"),
                    validation_part.get("X"),
                    validation_part.get("y"),
                )
                # 4 rows split into 2 folds -> every piece holds 2 rows.
                for piece in pieces:
                    self.assertEqual(piece.shape[0], 2)
Esempio n. 3
0
    def test_missing_target_values(self):
        """Check a stratified 2-fold split when the target contains np.nan entries.

        Six rows are written to parquet, the last two with np.nan as the
        target; each fold is expected to hold 2 training and 2 validation
        rows for both X and y.
        """
        features = pd.DataFrame(
            np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
            columns=["a", "b"],
        )
        # NOTE(review): np.array over mixed str/float values yields a string
        # array, so np.nan ends up as the text 'nan' rather than a real
        # missing value - confirm this is what the validator should see.
        target = pd.DataFrame(
            np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"]
        )

        X_train_path = os.path.join(self._results_path, "X_train.parquet")
        y_train_path = os.path.join(self._results_path, "y_train.parquet")

        features.to_parquet(X_train_path, index=False)
        target.to_parquet(y_train_path, index=False)

        params = {
            "shuffle": True,
            "stratify": True,
            "k_folds": 2,
            "results_path": self._results_path,
            "X_train_path": X_train_path,
            "y_train_path": y_train_path,
        }
        vl = KFoldValidator(params)

        self.assertEqual(params["k_folds"], vl.get_n_splits())

        for k_fold in range(vl.get_n_splits()):
            train_part, validation_part = vl.get_split(k_fold)
            pieces = (
                train_part.get("X"),
                train_part.get("y"),
                validation_part.get("X"),
                validation_part.get("y"),
            )
            for piece in pieces:
                self.assertEqual(piece.shape[0], 2)
Esempio n. 4
0
    def test_create(self):
        """Check an unshuffled, unstratified 2-fold validator on 4-row parquet data.

        The validator must report the configured number of folds, and each
        fold must contain 2 training rows and 2 validation rows for both
        X and y.
        """
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"])

        X_train_path = os.path.join(self._results_path, "X_train.parquet")
        y_train_path = os.path.join(self._results_path, "y_train.parquet")

        # The validator is configured with file paths, so write the fixture out.
        features.to_parquet(X_train_path, index=False)
        target.to_parquet(y_train_path, index=False)

        params = {
            "shuffle": False,
            "stratify": False,
            "k_folds": 2,
            "results_path": self._results_path,
            "X_train_path": X_train_path,
            "y_train_path": y_train_path,
        }
        vl = KFoldValidator(params)

        self.assertEqual(params["k_folds"], vl.get_n_splits())

        for k_fold in range(vl.get_n_splits()):
            train_part, validation_part = vl.get_split(k_fold)
            pieces = (
                train_part.get("X"),
                train_part.get("y"),
                validation_part.get("X"),
                validation_part.get("y"),
            )
            # 4 rows split into 2 folds -> every piece holds 2 rows.
            for piece in pieces:
                self.assertEqual(piece.shape[0], 2)