def __init__(self, params, data):
    """Initialize the validator and read the train/validation split settings.

    Args:
        params: Passed to ``BaseValidator.__init__``; the settings are then
            read back from ``self.params`` (presumably populated by the
            base initializer -- confirm against ``BaseValidator``).
        data: Passed to ``BaseValidator.__init__`` unchanged.

    Settings read from ``self.params`` and their fallbacks:
        ``train_ratio`` (0.8), ``shuffle`` (True), ``stratify`` (False),
        ``random_seed`` (1706).
    """
    BaseValidator.__init__(self, params, data)

    settings = self.params
    self.train_ratio = settings.get("train_ratio", 0.8)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1706)

    message = "SplitValidator, train_ratio: {0}".format(self.train_ratio)
    log.debug(message)
def __init__(self, params, data):
    """Initialize the validator and verify that validation data is present.

    Args:
        params: Passed to ``BaseValidator.__init__`` unchanged.
        data: Passed to ``BaseValidator.__init__``; afterwards ``self.data``
            is expected to hold a ``"validation"`` entry that itself
            provides non-``None`` ``"X"`` and ``"y"`` entries.

    Raises:
        WithDatasetValidatorException: If ``self.data`` has no
            ``"validation"`` entry, or that entry lacks ``"X"`` or ``"y"``.
    """
    BaseValidator.__init__(self, params, data)

    validation = self.data.get("validation")
    if validation is None:
        raise WithDatasetValidatorException("Missing validation data")

    # Both the features and the target must be supplied.
    for key in ("X", "y"):
        if validation.get(key) is None:
            raise WithDatasetValidatorException(
                "Missing {0} in validation data".format(key)
            )
def __init__(self, params):
    """Initialize a validator that uses externally supplied fold indices.

    The folds are loaded from the file named by ``cv_path`` and, unless a
    ``folds`` directory already exists under ``results_path``, each fold's
    train/validation indices are written there as ``.npy`` files.

    Args:
        params: Passed to ``BaseValidator.__init__``. Keys read afterwards
            from ``self.params``: ``cv_path``, ``results_path``, ``X_path``,
            ``y_path`` and ``sample_weight_path``.

    Raises:
        AutoMLException: If ``cv_path`` is missing, or if ``X_path`` or
            ``y_path`` is missing.
    """
    BaseValidator.__init__(self, params)

    cv_path = self.params.get("cv_path")
    if cv_path is None:
        raise AutoMLException("You need to specify `cv` as list or iterable")

    # NOTE(review): joblib.load unpickles the file, so cv_path should only
    # ever point at trusted data.
    loaded_cv = joblib.load(cv_path)
    self.cv = list(loaded_cv)

    self._results_path = self.params.get("results_path")
    self._X_path = self.params.get("X_path")
    self._y_path = self.params.get("y_path")
    self._sample_weight_path = self.params.get("sample_weight_path")

    paths_missing = self._X_path is None or self._y_path is None
    if paths_missing:
        raise AutoMLException("No data path set in CustomValidator params")

    folds_dir = os.path.join(self._results_path, "folds")
    if os.path.exists(folds_dir):
        # An existing directory is taken to mean the indices were saved
        # by an earlier run.
        log.debug("Folds split already done, reuse it")
        return

    os.mkdir(folds_dir)
    print("Custom validation strategy")
    for fold_idx, (train_idx, valid_idx) in enumerate(self.cv):
        print(f"Split {fold_idx}.")
        print(f"Train {train_idx.shape[0]} samples.")
        print(f"Validation {valid_idx.shape[0]} samples.")

        train_file = os.path.join(
            folds_dir, f"fold_{fold_idx}_train_indices.npy"
        )
        valid_file = os.path.join(
            folds_dir, f"fold_{fold_idx}_validation_indices.npy"
        )
        np.save(train_file, train_idx)
        np.save(valid_file, valid_idx)
def __init__(self, params):
    """Initialize the validator with split settings and data locations.

    Args:
        params: Passed to ``BaseValidator.__init__``. Keys read afterwards
            from ``self.params`` (with fallbacks): ``train_ratio`` (0.8),
            ``shuffle`` (True), ``stratify`` (False), ``random_seed``
            (1234), plus ``results_path``, ``X_path`` and ``y_path``.

    Raises:
        AutoMLException: If ``X_path`` or ``y_path`` is missing.
    """
    BaseValidator.__init__(self, params)

    settings = self.params
    self.train_ratio = settings.get("train_ratio", 0.8)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1234)

    message = "SplitValidator, train_ratio: {0}".format(self.train_ratio)
    log.debug(message)

    self._results_path = settings.get("results_path")
    self._X_path = settings.get("X_path")
    self._y_path = settings.get("y_path")

    # Both the features file and the target file are mandatory.
    have_both_paths = self._X_path is not None and self._y_path is not None
    if not have_both_paths:
        raise AutoMLException("No data path set in SplitValidator params")
def __init__(self, params):
    """Initialize the validator with split, repeat and data-path settings.

    Args:
        params: Passed to ``BaseValidator.__init__``. Keys read afterwards
            from ``self.params`` (with fallbacks): ``train_ratio`` (0.8),
            ``shuffle`` (True), ``stratify`` (False), ``random_seed``
            (1234), ``repeats`` (1), plus ``results_path``, ``X_path`` and
            ``y_path``.

    Raises:
        AutoMLException: If ``X_path`` or ``y_path`` is missing.

    Warns:
        Emits a warning and resets ``repeats`` to 1 when more than one
        repeat is requested while shuffling is disabled.
    """
    BaseValidator.__init__(self, params)

    settings = self.params
    self.train_ratio = settings.get("train_ratio", 0.8)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1234)
    self.repeats = settings.get("repeats", 1)

    # The repeat count is only inspected when shuffling is off.
    if not self.shuffle:
        if self.repeats > 1:
            warnings.warn(
                "Disable repeats in validation because shuffle is disabled"
            )
            self.repeats = 1

    self._results_path = settings.get("results_path")
    self._X_path = settings.get("X_path")
    self._y_path = settings.get("y_path")

    have_both_paths = self._X_path is not None and self._y_path is not None
    if not have_both_paths:
        raise AutoMLException("No data path set in SplitValidator params")
def __init__(self, params, data):
    """Initialize the validator and build the k-fold splitter.

    Args:
        params: Passed to ``BaseValidator.__init__``. Keys read afterwards
            from ``self.params`` (with fallbacks): ``k_folds`` (5),
            ``shuffle`` (True), ``stratify`` (False), ``random_seed``
            (1706).
        data: Passed to ``BaseValidator.__init__`` unchanged.

    The splitter is stored in ``self.skf``: a ``StratifiedKFold`` when
    ``stratify`` is truthy, otherwise a ``KFold``.
    """
    BaseValidator.__init__(self, params, data)

    self.k_folds = self.params.get("k_folds", 5)
    self.shuffle = self.params.get("shuffle", True)
    self.stratify = self.params.get("stratify", False)
    self.random_seed = self.params.get("random_seed", 1706)

    # A seed has no effect without shuffling, and scikit-learn (0.24+)
    # raises ValueError when random_state is set while shuffle is False,
    # so only forward the seed when shuffling is enabled.
    random_state = self.random_seed if self.shuffle else None

    splitter_cls = StratifiedKFold if self.stratify else KFold
    self.skf = splitter_cls(
        n_splits=self.k_folds,
        shuffle=self.shuffle,
        random_state=random_state,
    )
def __init__(self, params):
    """Initialize the k-fold validator and persist the fold indices.

    Builds the splitter in ``self.skf`` (``StratifiedKFold`` when
    ``stratify`` is truthy, otherwise ``KFold``). Unless a ``folds``
    directory already exists under ``results_path``, the data is read from
    the parquet files at ``X_path`` / ``y_path`` (target taken from the
    ``"target"`` column) and every fold's train/validation indices are
    saved there as ``.npy`` files.

    Args:
        params: Passed to ``BaseValidator.__init__``. Keys read afterwards
            from ``self.params`` (with fallbacks): ``k_folds`` (5),
            ``shuffle`` (True), ``stratify`` (False), ``random_seed``
            (1906), plus ``results_path``, ``X_path`` and ``y_path``.

    Raises:
        AutoMLException: If ``X_path`` or ``y_path`` is missing.
    """
    BaseValidator.__init__(self, params)

    settings = self.params
    self.k_folds = settings.get("k_folds", 5)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1906)

    # The seed is only forwarded when shuffling is on; otherwise the
    # splitter gets random_state=None, which is its default.
    seed = self.random_seed if self.shuffle else None
    splitter_cls = StratifiedKFold if self.stratify else KFold
    self.skf = splitter_cls(
        n_splits=self.k_folds,
        shuffle=self.shuffle,
        random_state=seed,
    )

    self._results_path = settings.get("results_path")
    self._X_path = settings.get("X_path")
    self._y_path = settings.get("y_path")

    paths_missing = self._X_path is None or self._y_path is None
    if paths_missing:
        raise AutoMLException("No data path set in KFoldValidator params")

    folds_dir = os.path.join(self._results_path, "folds")
    if os.path.exists(folds_dir):
        # An existing directory is taken to mean the indices were saved
        # by an earlier run.
        log.debug("Folds split already done, reuse it")
        return

    os.mkdir(folds_dir)

    features = pd.read_parquet(self._X_path)
    target = pd.read_parquet(self._y_path)["target"]
    if isinstance(target[0], bytes):
        # see https://github.com/scikit-learn/scikit-learn/issues/16980
        target = target.astype(str)

    splits = self.skf.split(features, target)
    for fold_idx, (train_idx, valid_idx) in enumerate(splits):
        train_file = os.path.join(
            folds_dir, f"fold_{fold_idx}_train_indices.npy"
        )
        valid_file = os.path.join(
            folds_dir, f"fold_{fold_idx}_validation_indices.npy"
        )
        np.save(train_file, train_idx)
        np.save(valid_file, valid_idx)

    # Drop the loaded frames right away to release their memory.
    del features, target
    gc.collect()
def __init__(self, params):
    """Initialize the repeated k-fold validator and persist fold indices.

    Builds one splitter per repeat in ``self.skf`` (``StratifiedKFold``
    when ``stratify`` is truthy, otherwise ``KFold``); with shuffling on,
    repeat ``r`` is seeded with ``random_seed + r``. Unless a ``folds``
    directory already exists under ``results_path``, the data is read from
    the parquet files at ``X_path`` / ``y_path`` (target taken from the
    ``"target"`` column) and every fold's train/validation indices are
    saved there as ``.npy`` files. File names carry a ``_repeat_<n>``
    part only when there is more than one splitter.

    Args:
        params: Passed to ``BaseValidator.__init__``. Keys read afterwards
            from ``self.params`` (with fallbacks): ``k_folds`` (5),
            ``shuffle`` (True), ``stratify`` (False), ``random_seed``
            (1906), ``repeats`` (1), plus ``results_path``, ``X_path``,
            ``y_path`` and ``sample_weight_path``.

    Raises:
        AutoMLException: If ``X_path`` or ``y_path`` is missing.

    Warns:
        Emits a warning and resets ``repeats`` to 1 when more than one
        repeat is requested while shuffling is disabled.
    """
    BaseValidator.__init__(self, params)

    settings = self.params
    self.k_folds = settings.get("k_folds", 5)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1906)
    self.repeats = settings.get("repeats", 1)

    # The repeat count is only inspected when shuffling is off.
    if not self.shuffle:
        if self.repeats > 1:
            warnings.warn(
                "Disable repeats in validation because shuffle is disabled"
            )
            self.repeats = 1

    self.skf = []
    for repeat in range(self.repeats):
        # Each repeat gets its own seed; no seed at all without shuffling.
        seed = (self.random_seed + repeat) if self.shuffle else None
        if not self.stratify:
            splitter = KFold(
                n_splits=self.k_folds,
                shuffle=self.shuffle,
                random_state=seed,
            )
        elif self.shuffle:
            splitter = StratifiedKFold(
                n_splits=self.k_folds,
                shuffle=self.shuffle,
                random_state=seed,
            )
        else:
            # Unshuffled stratified folds rely on the splitter defaults
            # (no shuffling, no seed).
            splitter = StratifiedKFold(n_splits=self.k_folds)
        self.skf.append(splitter)

    self._results_path = settings.get("results_path")
    self._X_path = settings.get("X_path")
    self._y_path = settings.get("y_path")
    self._sample_weight_path = settings.get("sample_weight_path")

    paths_missing = self._X_path is None or self._y_path is None
    if paths_missing:
        raise AutoMLException("No data path set in KFoldValidator params")

    folds_dir = os.path.join(self._results_path, "folds")
    if os.path.exists(folds_dir):
        # An existing directory is taken to mean the indices were saved
        # by an earlier run.
        log.debug("Folds split already done, reuse it")
        return

    os.mkdir(folds_dir)

    features = pd.read_parquet(self._X_path)
    target = pd.read_parquet(self._y_path)["target"]
    if isinstance(target[0], bytes):
        # see https://github.com/scikit-learn/scikit-learn/issues/16980
        target = target.astype(str)

    tag_repeats = len(self.skf) > 1
    for repeat_idx, splitter in enumerate(self.skf):
        suffix = f"_repeat_{repeat_idx}" if tag_repeats else ""
        splits = splitter.split(features, target)
        for fold_idx, (train_idx, valid_idx) in enumerate(splits):
            train_file = os.path.join(
                folds_dir, f"fold_{fold_idx}{suffix}_train_indices.npy"
            )
            valid_file = os.path.join(
                folds_dir, f"fold_{fold_idx}{suffix}_validation_indices.npy"
            )
            np.save(train_file, train_idx)
            np.save(valid_file, valid_idx)

    # Drop the loaded frames right away to release their memory.
    del features, target
    gc.collect()