Beispiel #1
0
    def __init__(self, params, data):
        """Configure a train/validation split from ``params``.

        Delegates to ``BaseValidator.__init__`` first, then reads the split
        options from ``self.params`` (presumably populated by the base
        initializer -- confirm), falling back to a default when a key is
        absent.

        Options read:
            train_ratio: default 0.8.
            shuffle: default True.
            stratify: default False.
            random_seed: default 1706.
        """
        BaseValidator.__init__(self, params, data)

        options = self.params
        self.train_ratio = options.get("train_ratio", 0.8)
        self.shuffle = options.get("shuffle", True)
        self.stratify = options.get("stratify", False)
        self.random_seed = options.get("random_seed", 1706)

        debug_message = "SplitValidator, train_ratio: {0}".format(
            self.train_ratio)
        log.debug(debug_message)
    def __init__(self, params, data):
        """Check that explicit validation data was supplied.

        After delegating to ``BaseValidator.__init__``, verifies that
        ``self.data`` has a ``"validation"`` entry and that this entry
        provides both ``"X"`` and ``"y"``.

        Raises:
            WithDatasetValidatorException: if the validation entry, or its
                ``"X"`` or ``"y"`` member, is missing (``None``).
        """
        BaseValidator.__init__(self, params, data)

        validation = self.data.get("validation")
        if validation is None:
            raise WithDatasetValidatorException("Missing validation data")

        # "X" is checked before "y", so a dataset missing both reports "X".
        for part in ("X", "y"):
            if validation.get(part) is None:
                raise WithDatasetValidatorException(
                    "Missing {0} in validation data".format(part))
Beispiel #3
0
    def __init__(self, params):
        """Load user-supplied CV splits and persist them as fold index files.

        The splits are loaded with ``joblib`` from ``params["cv_path"]`` and
        materialized into a list of ``(train_index, validation_index)``
        pairs. If ``<results_path>/folds`` does not exist yet it is created
        and each pair is written as ``fold_<n>_train_indices.npy`` and
        ``fold_<n>_validation_indices.npy``; otherwise the existing
        directory is reused untouched.

        Raises:
            AutoMLException: if ``cv_path`` is missing, or if ``X_path`` or
                ``y_path`` is missing from the params.
        """
        BaseValidator.__init__(self, params)

        cv_path = self.params.get("cv_path")
        if cv_path is None:
            raise AutoMLException(
                "You need to specify `cv` as list or iterable")

        # Materialize the loaded object so it can be enumerated below and
        # remains available on the instance afterwards.
        self.cv = list(joblib.load(cv_path))

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")
        self._sample_weight_path = self.params.get("sample_weight_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in CustomValidator params")

        folds_dir = os.path.join(self._results_path, "folds")

        if os.path.exists(folds_dir):
            log.debug("Folds split already done, reuse it")
            return

        os.mkdir(folds_dir)

        print("Custom validation strategy")
        for fold_cnt, (train_index, validation_index) in enumerate(self.cv):
            # ``.shape[0]`` implies the indices are array-like with a shape
            # attribute (presumably numpy arrays -- confirm against caller).
            print(f"Split {fold_cnt}.")
            print(f"Train {train_index.shape[0]} samples.")
            print(f"Validation {validation_index.shape[0]} samples.")

            np.save(
                os.path.join(folds_dir, f"fold_{fold_cnt}_train_indices.npy"),
                train_index,
            )
            np.save(
                os.path.join(
                    folds_dir, f"fold_{fold_cnt}_validation_indices.npy"),
                validation_index,
            )
    def __init__(self, params):
        """Configure a path-based train/validation split from ``params``.

        Delegates to ``BaseValidator.__init__``, then reads the split options
        and the data locations from ``self.params``.

        Options read:
            train_ratio: default 0.8.
            shuffle: default True.
            stratify: default False.
            random_seed: default 1234.
            results_path, X_path, y_path: no default (``None`` when absent).

        Raises:
            AutoMLException: if ``X_path`` or ``y_path`` is missing.
        """
        BaseValidator.__init__(self, params)

        options = self.params
        self.train_ratio = options.get("train_ratio", 0.8)
        self.shuffle = options.get("shuffle", True)
        self.stratify = options.get("stratify", False)
        self.random_seed = options.get("random_seed", 1234)

        debug_message = "SplitValidator, train_ratio: {0}".format(
            self.train_ratio)
        log.debug(debug_message)

        self._results_path = options.get("results_path")
        self._X_path = options.get("X_path")
        self._y_path = options.get("y_path")

        data_path_missing = self._X_path is None or self._y_path is None
        if data_path_missing:
            raise AutoMLException("No data path set in SplitValidator params")
    def __init__(self, params):
        """Configure a (possibly repeated) train/validation split.

        Delegates to ``BaseValidator.__init__``, then reads the split options
        and the data locations from ``self.params``. When shuffling is
        disabled, a ``repeats`` value above 1 is reset to 1 with a warning.

        Options read:
            train_ratio: default 0.8.
            shuffle: default True.
            stratify: default False.
            random_seed: default 1234.
            repeats: default 1.
            results_path, X_path, y_path: no default (``None`` when absent).

        Raises:
            AutoMLException: if ``X_path`` or ``y_path`` is missing.
        """
        BaseValidator.__init__(self, params)

        options = self.params
        self.train_ratio = options.get("train_ratio", 0.8)
        self.shuffle = options.get("shuffle", True)
        self.stratify = options.get("stratify", False)
        self.random_seed = options.get("random_seed", 1234)
        self.repeats = options.get("repeats", 1)

        # ``repeats`` is only inspected when shuffling is off, matching the
        # short-circuit order of a combined ``not shuffle and repeats > 1``.
        if not self.shuffle:
            if self.repeats > 1:
                warnings.warn(
                    "Disable repeats in validation because shuffle is disabled")
                self.repeats = 1

        self._results_path = options.get("results_path")
        self._X_path = options.get("X_path")
        self._y_path = options.get("y_path")

        data_path_missing = self._X_path is None or self._y_path is None
        if data_path_missing:
            raise AutoMLException("No data path set in SplitValidator params")
    def __init__(self, params, data):
        """Build the k-fold splitter described by ``params``.

        Delegates to ``BaseValidator.__init__``, reads the fold options from
        ``self.params`` and stores the resulting splitter in ``self.skf``:
        a ``StratifiedKFold`` when ``stratify`` is truthy, a plain ``KFold``
        otherwise.

        Options read:
            k_folds: number of splits, default 5.
            shuffle: default True.
            stratify: default False.
            random_seed: default 1706; only used when shuffling.
        """
        BaseValidator.__init__(self, params, data)

        self.k_folds = self.params.get("k_folds", 5)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1706)

        # scikit-learn refuses a non-None random_state when shuffle is off
        # (the seed would have no effect), so pass it only when shuffling.
        random_state = self.random_seed if self.shuffle else None

        splitter_cls = StratifiedKFold if self.stratify else KFold
        self.skf = splitter_cls(
            n_splits=self.k_folds,
            shuffle=self.shuffle,
            random_state=random_state,
        )
    def __init__(self, params):
        """Build a k-fold splitter and persist its fold indices to disk.

        Delegates to ``BaseValidator.__init__``, reads the fold options from
        ``self.params`` and stores the splitter in ``self.skf``
        (``StratifiedKFold`` when ``stratify`` is truthy, ``KFold``
        otherwise). If ``<results_path>/folds`` does not exist yet, it is
        created, X and y are read from parquet, and each fold's indices are
        saved as ``fold_<n>_train_indices.npy`` and
        ``fold_<n>_validation_indices.npy``. An existing directory is reused
        untouched.

        Options read:
            k_folds: number of splits, default 5.
            shuffle: default True.
            stratify: default False.
            random_seed: default 1906; only used when shuffling.
            results_path, X_path, y_path: no default (``None`` when absent).

        Raises:
            AutoMLException: if ``X_path`` or ``y_path`` is missing.
        """
        BaseValidator.__init__(self, params)

        self.k_folds = self.params.get("k_folds", 5)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1906)

        # A seed is passed only when shuffling; scikit-learn rejects a
        # random_state combined with shuffle=False.
        splitter_cls = StratifiedKFold if self.stratify else KFold
        self.skf = splitter_cls(
            n_splits=self.k_folds,
            shuffle=self.shuffle,
            random_state=self.random_seed if self.shuffle else None,
        )

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in KFoldValidator params")

        folds_path = os.path.join(self._results_path, "folds")

        if not os.path.exists(folds_path):

            os.mkdir(folds_path)

            X = pd.read_parquet(self._X_path)
            y = pd.read_parquet(self._y_path)["target"]

            # Inspect the first value by position: ``y[0]`` would be a label
            # lookup and fails when the index has no label 0.
            if len(y) > 0 and isinstance(y.iloc[0], bytes):
                # see https://github.com/scikit-learn/scikit-learn/issues/16980
                y = y.astype(str)

            for fold_cnt, (train_index, validation_index) in enumerate(
                    self.skf.split(X, y)):
                np.save(
                    os.path.join(
                        folds_path, f"fold_{fold_cnt}_train_indices.npy"),
                    train_index,
                )
                np.save(
                    os.path.join(
                        folds_path, f"fold_{fold_cnt}_validation_indices.npy"),
                    validation_index,
                )

            # Drop the loaded frames before returning to keep memory low.
            del X
            del y
            gc.collect()

        else:
            log.debug("Folds split already done, reuse it")
Beispiel #8
0
    def __init__(self, params):
        """Build repeated k-fold splitters and persist their fold indices.

        Delegates to ``BaseValidator.__init__``, reads the fold options from
        ``self.params`` and stores one splitter per repeat in ``self.skf``
        (``StratifiedKFold`` when ``stratify`` is truthy, ``KFold``
        otherwise). Repeat ``r`` is seeded with ``random_seed + r`` when
        shuffling. If ``<results_path>/folds`` does not exist yet, it is
        created, X and y are read from parquet, and each fold's indices are
        saved as ``fold_<n>[_repeat_<r>]_train_indices.npy`` and
        ``fold_<n>[_repeat_<r>]_validation_indices.npy``; the ``_repeat_<r>``
        part is present only when there is more than one repeat. An existing
        directory is reused untouched.

        Options read:
            k_folds: number of splits, default 5.
            shuffle: default True.
            stratify: default False.
            random_seed: default 1906; only used when shuffling.
            repeats: default 1; forced to 1 (with a warning) when shuffling
                is disabled.
            results_path, X_path, y_path, sample_weight_path: no default
                (``None`` when absent).

        Raises:
            AutoMLException: if ``X_path`` or ``y_path`` is missing.
        """
        BaseValidator.__init__(self, params)

        self.k_folds = self.params.get("k_folds", 5)
        self.shuffle = self.params.get("shuffle", True)
        self.stratify = self.params.get("stratify", False)
        self.random_seed = self.params.get("random_seed", 1906)
        self.repeats = self.params.get("repeats", 1)

        if not self.shuffle and self.repeats > 1:
            warnings.warn(
                "Disable repeats in validation because shuffle is disabled")
            self.repeats = 1

        # A seed is passed only when shuffling; scikit-learn rejects a
        # random_state combined with shuffle=False. Each repeat gets its own
        # seed so that repeats produce different partitions.
        splitter_cls = StratifiedKFold if self.stratify else KFold
        self.skf = [
            splitter_cls(
                n_splits=self.k_folds,
                shuffle=self.shuffle,
                random_state=(self.random_seed + r) if self.shuffle else None,
            )
            for r in range(self.repeats)
        ]

        self._results_path = self.params.get("results_path")
        self._X_path = self.params.get("X_path")
        self._y_path = self.params.get("y_path")
        self._sample_weight_path = self.params.get("sample_weight_path")

        if self._X_path is None or self._y_path is None:
            raise AutoMLException("No data path set in KFoldValidator params")

        folds_path = os.path.join(self._results_path, "folds")

        if not os.path.exists(folds_path):

            os.mkdir(folds_path)

            X = pd.read_parquet(self._X_path)
            y = pd.read_parquet(self._y_path)["target"]

            # Inspect the first value by position: ``y[0]`` would be a label
            # lookup and fails when the index has no label 0.
            if len(y) > 0 and isinstance(y.iloc[0], bytes):
                # see https://github.com/scikit-learn/scikit-learn/issues/16980
                y = y.astype(str)

            # File names carry the repeat number only when it is needed to
            # tell repeats apart; this does not change per fold.
            multiple_repeats = len(self.skf) > 1

            for repeat_cnt, skf in enumerate(self.skf):
                repeat_str = f"_repeat_{repeat_cnt}" if multiple_repeats else ""
                for fold_cnt, (train_index, validation_index) in enumerate(
                        skf.split(X, y)):
                    np.save(
                        os.path.join(
                            folds_path,
                            f"fold_{fold_cnt}{repeat_str}_train_indices.npy",
                        ),
                        train_index,
                    )
                    np.save(
                        os.path.join(
                            folds_path,
                            f"fold_{fold_cnt}{repeat_str}_validation_indices.npy",
                        ),
                        validation_index,
                    )

            # Drop the loaded frames before returning to keep memory low.
            del X
            del y
            gc.collect()

        else:
            log.debug("Folds split already done, reuse it")