# Imports assumed by the split implementations collected below (a sketch of
# the module-level setup; `_reset_index` is a project helper defined
# elsewhere in the codebase and is not shown here):
import logging
from typing import Dict, List

import numpy as np
from sklearn.model_selection import _split
from sklearn.utils import indexable, validation
from sklearn.utils.validation import _num_samples

log = logging.getLogger(__name__)


def split(self, X, y=None, groups=None, window_length=4):
    """Yield (train, test) index pairs where the test fold is the group
    located ``window_length`` positions after the training group."""
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    # np.unique returns the sorted group ("era") labels
    group_lst = np.unique(groups)
    n_groups = len(group_lst)
    indices = np.arange(n_samples)
    # slide a window over the ordered groups: train on group i,
    # test on group i + window_length
    for i in range(n_groups - window_length):
        yield (indices[groups == group_lst[i]],
               indices[groups == group_lst[i + window_length]])

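def _demo_sliding_window_split(split_fn=split):
    """Usage sketch for the sliding-window splitter above. ``self`` is not
    used by the body, so None is passed in its place purely for demonstration;
    in the source this method presumably lives on a splitter class whose name
    is not shown here."""
    X = np.random.randn(30, 2)
    eras = np.repeat(np.arange(6), 5)  # 6 ordered eras of 5 rows each
    for train_idx, test_idx in split_fn(None, X, groups=eras, window_length=2):
        # train on era i, test on era i + 2
        print(eras[train_idx][0], "->", eras[test_idx][0])
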
def _split_cv(*arrays, y=None, groups=None, cv=3, random_state=None):
    '''Supervised splitting of one or more arrays.

    y - class labels; if None, no stratification is done
    groups - split by groups
    cv - number of splits

    return
    ----
    generator of lists containing the split arrays, shape = [m*n*k], for one fold:
    [(0train, 0test), (1train, 1test), ...]
    m - index of fold [0 : cv-1]
    n - index of variable/array [0 : n_arrays-1]
    k - index of the train(0)/test(1) set [0:1]
    '''
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    validation.check_consistent_length(*arrays, y, groups)
    arrays = list(arrays)
    if cv == 1:
        # degenerate case: a single "fold" where train and test are the full data
        if y is not None:
            arrays.append(y)
        return [[(i, i) for i in arrays]]
    # get cross validator
    if y is not None:
        arrays.append(y)
        cv = _split.check_cv(cv, y=y, classifier=True)
    else:
        cv = _split.check_cv(cv, classifier=False)
    # set random state
    if hasattr(cv, 'random_state'):
        cv.random_state = random_state
    # reset_index on pandas DataFrames or Series
    arrays = _reset_index(*arrays)
    arrays = _split.indexable(*arrays)
    # get indexing method (assumes an sklearn version where
    # model_selection._split still re-exports safe_indexing)
    safe_index = _split.safe_indexing
    train_test = ([(safe_index(i, train_index), safe_index(i, test_index))
                   for i in arrays]
                  for train_index, test_index in cv.split(arrays[0], y, groups))
    return train_test

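def _demo_split_cv(split_cv=_split_cv):
    """Usage sketch for `_split_cv` (assumes the project's `_reset_index`
    helper is available and that `sklearn.model_selection._split` exposes
    `safe_indexing`, which is only true for older sklearn releases)."""
    X = np.random.randn(20, 3)
    y = np.repeat([0, 1], 10)
    # one entry per fold; each entry holds a (train, test) pair per array
    for fold in split_cv(X, y=y, cv=4, random_state=0):
        (X_train, X_test), (y_train, y_test) = fold
        print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
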
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    group_lst = np.unique(groups)
    n_groups = len(group_lst)
    indices = np.arange(n_samples)
    # number of groups (eras) assigned to each fold
    cutoff_eras = n_groups // self.n_splits
    # shuffle the group order in place before slicing the folds
    np.random.shuffle(group_lst)
    for i in range(self.n_splits):
        # note: train and test cover the same slice of shuffled groups;
        # `groups.isin` assumes `groups` is a pandas Series
        fold_groups = group_lst[i * cutoff_eras:i * cutoff_eras + cutoff_eras]
        yield (indices[groups.isin(fold_groups)],
               indices[groups.isin(fold_groups)])

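def _demo_shuffled_era_split(split_fn=split):
    """Usage sketch for the shuffled per-era splitter above. It reads
    ``n_splits`` from ``self`` and calls ``groups.isin``, so a minimal
    stand-in object and a pandas Series of era labels are used here (the
    real class name is not shown in the source). Each yielded pair covers
    the same rows for train and test."""
    import pandas as pd

    class _StandIn:
        n_splits = 3

    X = np.random.randn(30, 2)
    eras = pd.Series(np.repeat(np.arange(6), 5))  # 6 eras of 5 rows each
    for train_idx, test_idx in split_fn(_StandIn(), X, groups=eras):
        print(sorted(set(eras[train_idx])), sorted(set(eras[test_idx])))
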
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    group_list = np.unique(groups)
    n_groups = len(group_list)
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater"
             " than the number of groups: {1}.").format(n_folds, n_groups))
    indices = np.arange(n_samples)
    test_size = (n_groups // n_folds)
    # test blocks start after the remainder groups and advance by test_size;
    # iterate from the latest block back to the earliest
    test_starts = range(test_size + n_groups % n_folds, n_groups, test_size)
    test_starts = list(test_starts)[::-1]
    for test_start in test_starts:
        # expanding window: train on every group before the test block
        # (`groups.isin` assumes `groups` is a pandas Series)
        yield (indices[groups.isin(group_list[:test_start])],
               indices[groups.isin(group_list[test_start:test_start + test_size])])

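def _demo_group_time_series_split(split_fn=split):
    """Usage sketch for the expanding-window splitter above. The method reads
    ``n_splits`` from ``self`` and calls ``groups.isin``, so a minimal
    stand-in object and a pandas Series of group labels are used here (the
    real class name is not shown in the source)."""
    import pandas as pd

    class _StandIn:
        n_splits = 3

    X = np.random.randn(40, 2)
    groups = pd.Series(np.repeat(np.arange(8), 5))  # 8 ordered groups
    for train_idx, test_idx in split_fn(_StandIn(), X, groups=groups):
        print("train groups:", np.unique(groups[train_idx]),
              "test groups:", np.unique(groups[test_idx]))
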
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
        Note that providing ``y`` is sufficient to generate the splits and
        hence ``np.zeros(n_samples)`` may be used as a placeholder for
        ``X`` instead of actual training data.
    y : array-like, shape (n_samples,)
        The target variable for supervised learning problems.
        Stratification is done based on the y labels.
    groups : object
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.

    Notes
    -----
    Randomized CV splitters may return different results for each call of
    split. You can make the results identical by setting ``random_state``
    to an integer.
    """
    X, y, groups = indexable(X, y, groups)
    # the parent splitter's folds are consumed with their roles swapped:
    # its first index array becomes the test set, its second the training set
    for test, train in super().split(X, y, groups):
        if self.shuffle:
            # reseed on every fold so shuffling is reproducible per split
            np.random.seed(self.random_state)
            np.random.shuffle(train)
        yield train, test

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None")
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    group_gap = self.group_gap
    max_test_group_size = self.max_test_group_size
    max_train_group_size = self.max_train_group_size
    n_folds = n_splits + 1
    group_dict = {}
    # np.unique returns sorted labels; re-sort them by first occurrence
    u, ind = np.unique(groups, return_index=True)
    unique_groups = u[np.argsort(ind)]
    n_groups = _num_samples(unique_groups)
    # map each group label to the row indices belonging to it
    for idx in np.arange(n_samples):
        if groups[idx] in group_dict:
            group_dict[groups[idx]].append(idx)
        else:
            group_dict[groups[idx]] = [idx]
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater than"
             " the number of groups={1}").format(n_folds, n_groups))
    group_test_size = min(n_groups // n_folds, max_test_group_size)
    group_test_starts = range(n_groups - n_splits * group_test_size,
                              n_groups, group_test_size)
    for group_test_start in group_test_starts:
        train_array = []
        test_array = []
        # the training window ends `group_gap` groups before the test block
        # and reaches back at most `max_train_group_size` groups
        group_st = max(0, group_test_start - group_gap - max_train_group_size)
        for train_group_idx in unique_groups[group_st:(group_test_start -
                                                       group_gap)]:
            train_array_tmp = group_dict[train_group_idx]
            train_array = np.sort(
                np.unique(np.concatenate((train_array, train_array_tmp)),
                          axis=None),
                axis=None)
        # number of training samples gathered for this fold
        train_end = train_array.size
        for test_group_idx in unique_groups[group_test_start:group_test_start +
                                            group_test_size]:
            test_array_tmp = group_dict[test_group_idx]
            test_array = np.sort(
                np.unique(np.concatenate((test_array, test_array_tmp)),
                          axis=None),
                axis=None)
        # purge the first `group_gap` samples of the test block
        test_array = test_array[group_gap:]
        if self.verbose > 0:
            # placeholder for optional verbose output
            pass
        yield [int(i) for i in train_array], [int(i) for i in test_array]

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None")
    X, y, groups = indexable(X, y, groups)
    n_folds = self.n_splits + 1
    # np.unique returns sorted groups
    u, ind = np.unique(groups, return_index=True)
    # re-sort unique groups in order of first occurrence
    unique_groups = u[np.argsort(ind)]
    log.debug(f"u={u}, unique_groups={unique_groups}")
    n_samples = _num_samples(X)
    n_groups = _num_samples(unique_groups)
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater than"
             " the number of groups={1}").format(n_folds, n_groups))
    # map each group label to the row indices belonging to it
    group_dict: Dict[int, List[int]] = {}
    for idx in np.arange(n_samples):
        if groups[idx] in group_dict:
            group_dict[groups[idx]].append(idx)
        else:
            group_dict[groups[idx]] = [idx]
    group_test_size = min(n_groups // n_folds, self.max_test_group_size)
    group_test_starts = range(n_groups - self.n_splits * group_test_size,
                              n_groups, group_test_size)
    for group_test_start in group_test_starts:
        train_array = []
        test_array = []
        # the training window ends `group_gap` groups before the test block
        # and reaches back at most `max_train_group_size` groups
        group_st = max(
            0, group_test_start - self.group_gap - self.max_train_group_size)
        log.debug(
            f"group_st={group_st}, group_test_size={group_test_size}, "
            f"group_test_starts={group_test_starts}")
        for train_group_idx in unique_groups[group_st:(group_test_start -
                                                       self.group_gap)]:
            tmp = group_dict[train_group_idx]
            train_array = np.sort(
                np.unique(np.concatenate((train_array, tmp)), axis=None),
                axis=None)
        for test_group_idx in unique_groups[group_test_start:group_test_start +
                                            group_test_size]:
            tmp = group_dict[test_group_idx]
            test_array = np.sort(
                np.unique(np.concatenate((test_array, tmp)), axis=None),
                axis=None)
        yield [int(i) for i in train_array], [int(i) for i in test_array]

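def _demo_purged_group_time_series_split(split_fn=split):
    """Usage sketch for the gapped group time-series splitter above. The
    class name and constructor are not shown in the source, so a minimal
    stand-in carrying the attributes the method reads from ``self`` is used
    here purely for illustration."""
    class _StandIn:
        n_splits = 3
        group_gap = 1
        max_test_group_size = 2
        max_train_group_size = 4

    X = np.random.randn(60, 2)
    groups = np.repeat(np.arange(12), 5)  # 12 ordered groups of 5 rows each
    for train_idx, test_idx in split_fn(_StandIn(), X, groups=groups):
        print("train groups:", sorted(set(groups[train_idx])),
              "test groups:", sorted(set(groups[test_idx])))
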