class ThresholdStratifiedKFold(object):
    """Stratified K-fold splitter that stratifies on thresholded targets.

    Each target value is replaced by the number of configured thresholds
    it meets or exceeds, and the wrapped ``StratifiedKFold`` stratifies on
    those counts instead of on the raw target.
    """

    def __init__(self, thresholds, *args, **kwargs):
        """Store the thresholds and build the wrapped splitter.

        Parameters
        ----------
        thresholds : scalar or iterable of scalars
            A single threshold or several thresholds; a non-iterable value
            is wrapped in a one-element list.
        *args, **kwargs
            Forwarded unchanged to ``StratifiedKFold``.
        """
        if isinstance(thresholds, Iterable):
            self.thresholds = list(thresholds)
        else:
            self.thresholds = [thresholds]
        self.stratified = StratifiedKFold(*args, **kwargs)

    def get_n_splits(self, *args, **kwargs):
        """Return the number of splits reported by the wrapped splitter."""
        return self.stratified.get_n_splits(*args, **kwargs)

    def split(self, X, y):
        """Yield ``(train, test)`` pairs stratified on the thresholded ``y``.

        Parameters
        ----------
        X : passed through to the wrapped splitter's ``split``.
        y : array-like
            Target values; converted with ``np.asarray`` so that plain
            lists and tuples are accepted as well as arrays.
        """
        # Coerce first: a plain list has no ``.shape`` and does not support
        # an element-wise ``>=`` comparison against a scalar.
        y = np.asarray(y)
        y_thresh = np.zeros(y.shape)
        for thresh in self.thresholds:
            # Each threshold met adds one to the sample's stratum index.
            y_thresh += y >= thresh
        for train, test in self.stratified.split(X, y_thresh):
            yield train, test
def check_cv2(cv=3, y=None, classifier=False, random_state=None):
    """Input checker utility for building a cross-validator

    NOTE: this is the same as sklearn.model_selection._split.check_cv but
    with an added parameter for random_state so that nested CV splits are
    reproducible.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    random_state : None, int or RandomState
        Seed for shuffling the folds built from an integer ``cv``.
        When it is not None the folds are shuffled with this seed; when it
        is None the folds are not shuffled.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.

    Raises
    ------
    ValueError
        If ``cv`` is a string, or is neither an integer, an object with a
        ``split`` attribute, nor an iterable.
    """
    if cv is None:
        cv = 3

    if isinstance(cv, numbers.Integral):
        # A seed only matters when shuffling. Passing random_state with
        # shuffle=False is ignored by older scikit-learn and rejected with
        # ValueError by newer releases, so enable shuffling with the seed.
        shuffle = random_state is not None
        if (classifier and (y is not None) and
                (type_of_target(y) in ('binary', 'multiclass'))):
            return StratifiedKFold(cv, shuffle=shuffle,
                                   random_state=random_state)
        return KFold(cv, shuffle=shuffle, random_state=random_state)

    # ``str`` has a ``split`` method of its own, hence the explicit checks.
    if not hasattr(cv, 'split') or isinstance(cv, str):
        if not isinstance(cv, Iterable) or isinstance(cv, str):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification
def MLKFoldCrossValid(epoch, data_path="pima-indians-diabetes.csv",
                      n_splits=10, seed=7):
    """Train and evaluate a small dense network with stratified K-fold CV.

    A fresh three-layer ``Sequential`` model is built, compiled, fitted and
    evaluated for every fold. The per-fold accuracy is printed, followed by
    the mean and standard deviation over all folds.

    Parameters
    ----------
    epoch : int
        Number of training epochs passed to ``model.fit`` for each fold.
    data_path : str, optional
        Comma-delimited numeric file read with ``np.loadtxt``. Columns 0-7
        are used as features and column 8 as the label.
    n_splits : int, optional
        Number of folds for ``StratifiedKFold``.
    seed : int, optional
        Seed for ``np.random.seed`` and for the fold shuffling.

    Returns
    -------
    list of float
        The second value returned by ``model.evaluate`` for each fold,
        multiplied by 100.
    """
    np.random.seed(seed)

    dataset = np.loadtxt(data_path, delimiter=",")
    X = dataset[:, 0:8]
    Y = dataset[:, 8]

    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=seed)
    cvscores = []
    for train, test in kfold.split(X, Y):
        # Build a new model per fold so no weights leak between folds.
        layers = [
            Dense(12, input_dim=8, activation='relu',
                  kernel_initializer='uniform'),
            Dense(8, activation='relu', kernel_initializer='uniform'),
            Dense(1, activation='sigmoid', kernel_initializer='uniform'),
        ]
        model = Sequential(layers)
        model.compile(optimizer='adam', loss='binary_crossentropy',
                      metrics=['accuracy'])
        model.fit(X[train], Y[train], epochs=epoch, batch_size=10, verbose=0)

        scores = model.evaluate(X[test], Y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)

    print("%.2f%%(+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
    return cvscores
def check_cv(cv=3, y=None, classifier=False):
    """Dask aware version of ``sklearn.model_selection.check_cv``

    Behaves like the scikit-learn function, but also copes with ``y``
    being a dask object.
    """
    n_folds = 3 if cv is None else cv

    # scikit-learn only inspects ``y`` when ``cv`` is an integer, so every
    # other combination can be delegated as-is, dask object or not.
    needs_dask_handling = (isinstance(y, Base)
                           and isinstance(n_folds, numbers.Integral))
    if not needs_dask_handling:
        return model_selection.check_cv(n_folds, y, classifier)

    if not classifier:
        return KFold(n_folds)

    # ``y`` is a dask object here: compute its target type explicitly.
    kind_of_target = delayed(type_of_target, pure=True)(y).compute()
    if kind_of_target in ('binary', 'multiclass'):
        return StratifiedKFold(n_folds)
    return KFold(n_folds)
# Load the sample index and keep only the stays whose per-stay CSV file
# exists under parameters['dataPath'].
data_csv = pd.read_csv('dataset.csv')
data_csv = data_csv.sort_values(['icustay_id'])
data = np.array([
    itemid for itemid in list(data_csv['icustay_id'])
    if os.path.exists(parameters['dataPath'] + '{}.csv'.format(itemid))
])
data_csv = data_csv[data_csv['icustay_id'].isin(data)]
# From here on ``data`` holds file paths rather than ids.
data = np.array(
    [parameters['dataPath'] + '{}.csv'.format(itemid) for itemid in data])

# Binary labels: 'S' maps to 0, every other value maps to 1.
classes = np.array([0 if c == 'S' else 1 for c in list(data_csv['class'])])
# Independent copy of the same labels, kept under its own name.
classes_for_stratified = classes.copy()

# Vectorised counts; comparing each element against a one-element list
# only worked through numpy broadcasting.
print('S', int(np.count_nonzero(classes == 0)))
print('R', int(np.count_nonzero(classes == 1)))

# Using a seed always will get the same data split even if the training stops
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=15)

# Get input shape
aux = pd.read_csv(data[0])
inputShape = (parameters['dataLength'], len(aux.columns))

# Optional model configuration, loaded only when the file exists.
config = None
if os.path.exists(parameters['modelConfigPath']):
    with open(parameters['modelConfigPath'], 'r') as configHandler:
        config = json.load(configHandler)

i = 0
# ====================== Script that start training new models
# The result file is where the results for each fold are appended.
with open(parameters['resultFilePath'], 'a+') as cvsFileHandler:
    dictWriter = None
def kfold_stratified(self,
                     n_splits=5,
                     n_repeats=0,
                     shuffle=False,
                     random_state=None):
    '''
    Uses sklearn's StratifiedKFold and RepeatedStratifiedKFold facility.
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
    See also method kfold() for folding without balancing.

    The 'X' in this context are sample ids that are eventually
    used by the dataloader to retrieve spectrograms. Recall
    that each sample id stands for one spectrogram snippet. The
    'y' vector in the KFold page is the 'label' column in the
    Sqlite db, which is 1 or 0 for each column (i.e. time bin)
    of the spectrogram.

    All methods on the sklearn [Repeated]StratifiedKFold facility
    are available in this class by the same name.

    After calling this method, calls to next() will return train
    samples. I.e. the current queue is set to self.train_queue

    @param n_splits: number of folds to create
    @type n_splits: int
    @param n_repeats: number times fold splitting should be repeated
        (n-times k-fold cross validation). Set to zero, the method
        uses sklearn's StratifiedKFold class, else it uses
        sklearn's RepeatedStratifiedKFold
    @type n_repeats: int
    @param shuffle: whether or not to shuffle the data before
        splitting. Once split, the data in the folds are not shuffled.
        Only passed on when n_repeats is zero.
    @type shuffle: bool
    @param random_state: if shuffle is set to True, this argument
        allows for repeatability over multiple runs. When n_repeats
        is zero and shuffle is False the value is not forwarded.
    @type random_state: int
    '''
    if n_repeats == 0:
        # A seed is only meaningful when shuffling, and recent
        # scikit-learn versions raise ValueError if random_state is
        # set while shuffle is False. Forward it only when shuffling.
        self.cross_validator = StratifiedKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=random_state if shuffle else None)
    else:
        self.cross_validator = RepeatedStratifiedKFold(
            n_splits=n_splits,
            n_repeats=n_repeats,
            random_state=random_state)

    # The following retrieves *indices* into
    # our list of sample_ids. However, since
    # our sample_ids are just numbers from 0 to n,
    # the indices are equivalent to the sample ids
    # themselves

    # The split method will return a generator
    # object. Each item in this generator is
    # a 2-tuple: a train set array and a validation
    # set array. There will be n_splits such tuples.
    # We grab the first pair:
    all_labels = self.labels_from_db(self.sample_ids)
    self.folds_iter = self.cross_validator.split(self.sample_ids,
                                                 all_labels)
    (self.train_sample_ids, self.validate_sample_ids) = \
        next(self.folds_iter)

    self.train_queue = deque(self.train_sample_ids)
    self.val_queue = deque(self.validate_sample_ids)
    self.train_labels = self.labels_from_db(self.train_sample_ids)
    self.validate_labels = self.labels_from_db(self.validate_sample_ids)

    self.switch_to_split('train')
def __init__(self, thresholds, *args, **kwargs):
    """Store the thresholds as a list and build the wrapped splitter.

    A non-iterable ``thresholds`` value is wrapped in a one-element list.
    All remaining arguments are forwarded to ``StratifiedKFold``.
    """
    self.thresholds = (list(thresholds)
                       if isinstance(thresholds, Iterable)
                       else [thresholds])
    self.stratified = StratifiedKFold(*args, **kwargs)