def _split_blockwise(self, X):
    """Draw train/test indices independently inside every block of ``X``.

    One seed per block is drawn from ``self.random_state``.  Each block is
    split lazily by ``_generate_idx`` and the result is shifted by the
    cumulative length of the preceding blocks before concatenation.

    Parameters
    ----------
    X : object exposing ``chunks``
        Only ``X.chunks[0]`` (block lengths along the first axis) is read.

    Returns
    -------
    train_idx, test_idx
        Two concatenated dask arrays declared with dtype ``'i8'``.
    """
    chunks = X.chunks[0]
    rng = check_random_state(self.random_state)
    seeds = rng.randint(0, 2**32 - 1, size=len(chunks))

    train_pct, test_pct = _maybe_normalize_split_sizes(
        self.train_size, self.test_size)
    sizes = [
        _validate_shuffle_split(length, test_pct, train_pct)
        for length in chunks
    ]

    # One lazy (train, test) pair per block.
    objs = []
    for length, seed, (n_train, n_test) in zip(chunks, seeds, sizes):
        make_idx = dask.delayed(_generate_idx, nout=2)
        objs.append(make_idx(length, seed, n_train, n_test))
    train_objs, test_objs = zip(*objs)

    # Start row of every block; zip() drops the extra trailing entry.
    offsets = np.hstack([0, np.cumsum(chunks)])

    train_parts = []
    for part, (n_train, _), start in zip(train_objs, sizes, offsets):
        train_parts.append(da.from_delayed(part + start, (n_train, ), 'i8'))
    train_idx = da.concatenate(train_parts)

    test_parts = []
    for part, (_, n_test), start in zip(test_objs, sizes, offsets):
        test_parts.append(da.from_delayed(part + start, (n_test, ), 'i8'))
    test_idx = da.concatenate(test_parts)

    return train_idx, test_idx
def _split_blockwise(self, X, seeds):
    """Split each block of ``X`` into train/test indices using given seeds.

    Parameters
    ----------
    X : object exposing ``chunks``
        Only ``X.chunks[0]`` (block lengths along the first axis) is read.
    seeds : iterable
        One seed per block, forwarded to ``_generate_idx``.

    Returns
    -------
    train_idx, test_idx
        Two concatenated dask arrays declared with ``np.dtype("int")``.
    """
    chunks = X.chunks[0]

    train_pct, test_pct = _maybe_normalize_split_sizes(
        self.train_size, self.test_size)
    sizes = [
        _validate_shuffle_split(block_len, test_pct, train_pct)
        for block_len in chunks
    ]

    # Lazily generate a (train, test) index pair for every block.
    objs = []
    for block_len, seed, (n_train, n_test) in zip(chunks, seeds, sizes):
        lazy_generate = dask.delayed(_generate_idx, nout=2)
        objs.append(lazy_generate(block_len, seed, n_train, n_test))
    train_objs, test_objs = zip(*objs)

    # Shift applied to each block's result; zip() ignores the last entry.
    offsets = np.hstack([0, np.cumsum(chunks)])
    index_dtype = np.dtype("int")

    train_blocks = []
    for lazy_idx, (n_train, _), shift in zip(train_objs, sizes, offsets):
        train_blocks.append(
            da.from_delayed(lazy_idx + shift, (n_train, ), index_dtype))
    train_idx = da.concatenate(train_blocks)

    test_blocks = []
    for lazy_idx, (_, n_test), shift in zip(test_objs, sizes, offsets):
        test_blocks.append(
            da.from_delayed(lazy_idx + shift, (n_test, ), index_dtype))
    test_idx = da.concatenate(test_blocks)

    return train_idx, test_idx
def __init__(self, gene_dataset, train_size=0.1, test_size=None, seed=0,
             **data_loaders_kwargs):
    """Create the ``'train'`` and ``'test'`` data loaders from a random split.

    :param gene_dataset: dataset forwarded to the parent constructor
    :param train_size: float, int, or None (default is 0.1)
    :param test_size: float, int, or None (default is None)
    :param seed: value passed to ``np.random.seed`` before shuffling
    :param data_loaders_kwargs: forwarded to the parent constructor
    """
    super(TrainTestDataLoaders, self).__init__(gene_dataset,
                                               **data_loaders_kwargs)

    n_cells = len(self.gene_dataset)
    n_train, n_test = _validate_shuffle_split(n_cells, test_size, train_size)

    # NOTE: this reseeds NumPy's *global* generator.
    np.random.seed(seed=seed)
    shuffled = np.random.permutation(n_cells)

    # The test cells come first in the shuffled order, then the train cells.
    test_part = shuffled[:n_test]
    train_part = shuffled[n_test:(n_test + n_train)]

    loaders = {}
    loaders['train'] = self(indices=train_part)
    loaders['test'] = self(indices=test_part)
    self.data_loaders_dict.update(loaders)
def _iter_indices(self, X, y, groups=None):
    """Yield ``(train, test)`` index arrays for multilabel targets.

    ``y`` is cast to bool and must be recognised by ``type_of_target`` as
    ``"multilabel-indicator"``; otherwise a ``ValueError`` is raised.  For
    each of ``self.n_splits`` rounds the rows are shuffled, handed to
    ``_iterative_stratification`` and the resulting fold labels are mapped
    back to the original row order.
    """
    n_samples = _num_samples(X)
    y = check_array(y, ensure_2d=False, dtype=None)
    y = np.asarray(y, dtype=bool)

    target_kind = type_of_target(y)
    if target_kind != "multilabel-indicator":
        raise ValueError(
            "Supported target type is: multilabel-indicator. Got {!r} instead.".format(
                target_kind
            )
        )

    n_train, n_test = _validate_shuffle_split(
        n_samples, self.test_size, self.train_size
    )

    n_samples = y.shape[0]
    rng = check_random_state(self.random_state)
    y_orig = y.copy()
    # Desired (train, test) proportions handed to the stratifier.
    r = np.array([n_train, n_test]) / (n_train + n_test)

    for _ in range(self.n_splits):
        order = np.arange(n_samples)
        rng.shuffle(order)
        shuffled_y = y_orig[order]

        folds = _iterative_stratification(labels=shuffled_y, r=r,
                                          random_state=rng)

        # argsort(order) undoes the shuffle so the mask is in original order.
        is_test = folds[np.argsort(order)] == 1
        test = np.where(is_test)[0]
        train = np.where(~is_test)[0]
        yield train, test
def _iter_indices(self, X, y, groups=None):  # type: ignore
    """Yield stratified ``(train, test)`` index arrays.

    Per-class train/test counts come from ``_approximate_mode``.  A class
    that would receive no training sample is given one, and its test
    allocation is reduced by one to compensate.

    Raises
    ------
    ValueError
        If the train or the test size is smaller than the number of classes.
    """
    n_samples = _num_samples(X)
    y = check_array(y, ensure_2d=False, dtype=None)
    n_train, n_test = _validate_shuffle_split(
        n_samples,
        self.test_size,
        self.train_size,
        default_test_size=self._default_test_size)

    if y.ndim == 2:
        # Multi-label y: collapse every distinct row into one string label.
        # join() is used because str(row) elides long rows with an ellipsis.
        y = np.array([' '.join(row.astype('str')) for row in y])

    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = classes.shape[0]
    class_counts = np.bincount(y_indices)

    if n_train < n_classes:
        raise ValueError('The train_size = %d should be greater or '
                         'equal to the number of classes = %d' %
                         (n_train, n_classes))
    if n_test < n_classes:
        raise ValueError('The test_size = %d should be greater or '
                         'equal to the number of classes = %d' %
                         (n_test, n_classes))

    # Stable sort of the class ids, cut at the class boundaries, gives the
    # member rows of each class.
    sorted_rows = np.argsort(y_indices, kind='mergesort')
    class_indices = np.split(sorted_rows, np.cumsum(class_counts)[:-1])

    rng = check_random_state(self.random_state)

    for _ in range(self.n_splits):
        # Recomputed on every split so ties in the class counts are broken
        # anew each time.
        n_i = _approximate_mode(class_counts, n_train, rng)
        class_counts_remaining = class_counts - n_i
        t_i = _approximate_mode(class_counts_remaining, n_test, rng)

        train = []
        test = []
        for cls in range(n_classes):
            order = rng.permutation(class_counts[cls])
            shuffled_members = class_indices[cls].take(order, mode='clip')

            take_train = n_i[cls]
            take_test = t_i[cls]
            if take_train == 0:
                # Guarantee one training sample for this class.
                take_train = 1
                take_test = take_test - 1

            train.extend(shuffled_members[:take_train])
            test.extend(shuffled_members[take_train:take_train + take_test])

        train = rng.permutation(train)
        test = rng.permutation(test)
        yield train, test
def train_test(self, model=None, gene_dataset=None, train_size=0.1,
               test_size=None, seed=0, type_class=Posterior):
    """Create a train posterior and a test posterior from a random split.

    :param model: model to use; falls back to ``self.model`` when ``None``
    :param gene_dataset: dataset to split; falls back to
        ``self.gene_dataset`` when ``None``
    :param train_size: float, int, or None (default is 0.1)
    :param test_size: float, int, or None (default is None)
    :param seed: value passed to ``np.random.seed`` before shuffling
    :param type_class: forwarded to ``self.create_posterior``
    :return: tuple ``(train_posterior, test_posterior)``
    """
    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    n = len(gene_dataset)
    n_train, n_test = _validate_shuffle_split(n, test_size, train_size)

    # NOTE: this reseeds NumPy's *global* generator.
    np.random.seed(seed=seed)
    permutation = np.random.permutation(n)
    indices_test = permutation[:n_test]
    indices_train = permutation[n_test:(n_test + n_train)]

    return (
        self.create_posterior(model, gene_dataset, indices=indices_train,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset, indices=indices_test,
                              type_class=type_class),
    )
def __init__(
    self,
    n_splits=10,
    *,
    test_size=None,
    train_size=None,
    random_state=None,
    buffer_width,
    n_blocks,
):
    """Validate ``buffer_width`` and precompute the block-level split sizes.

    ``buffer_width`` and ``n_blocks`` are required keyword-only arguments.
    The train/test sizes are computed over ``n_blocks`` with a default test
    fraction of 0.2.

    Raises
    ------
    ValueError
        If ``buffer_width`` is not a non-negative ``int``.
    """
    width_ok = isinstance(buffer_width, int) and buffer_width >= 0
    if not width_ok:
        raise ValueError(
            f"'buffer_width' must be a non-negative integer; it is used for indexing. Given {buffer_width}"
        )

    super().__init__(
        n_splits=n_splits,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
    )

    self._default_test_size = 0.2
    self._width = buffer_width
    self._n_blocks = n_blocks

    block_split = _validate_shuffle_split(
        self._n_blocks,
        self.test_size,
        self.train_size,
        default_test_size=self._default_test_size,
    )
    self._n_train, self._n_test = block_split
def _train_test_val_split(
    self,
    adata: AnnData,
    train_size: float = 0.9,
    validation_size: Optional[float] = None,
    **kwargs,
):
    """
    Creates data loaders ``train_set``, ``validation_set``, ``test_set``.

    If ``train_size + validation_size < 1`` then ``test_set`` is non-empty.

    Parameters
    ----------
    adata
        Setup AnnData to be split into train, test, validation sets
    train_size
        float, or None (default is 0.9)
    validation_size
        float, or None (default is None)
    **kwargs
        Keyword args for `_make_scvi_dl()`

    Returns
    -------
    Tuple of data loaders in the order (train, validation, test).
    """
    train_size = float(train_size)
    in_range = 0.0 < train_size <= 1.0
    if not in_range:
        raise ValueError(
            "train_size needs to be greater than 0 and less than or equal to 1"
        )

    n = len(adata)
    try:
        n_train, n_val = _validate_shuffle_split(n, validation_size,
                                                 train_size)
    except ValueError:
        # The validator rejects the split; only train_size == 1.0 is
        # tolerated, meaning "train on every observation".
        if train_size != 1.0:
            raise ValueError(
                "Choice of train_size={} and validation_size={} not understood"
                .format(train_size, validation_size))
        n_train, n_val = n, 0

    shuffled = np.random.RandomState(seed=settings.seed).permutation(n)
    val_end = n_val
    train_end = n_val + n_train
    indices_validation = shuffled[:val_end]
    indices_train = shuffled[val_end:train_end]
    indices_test = shuffled[train_end:]

    train_dl = self._make_scvi_dl(adata, indices=indices_train,
                                  shuffle=True, **kwargs)
    validation_dl = self._make_scvi_dl(adata, indices=indices_validation,
                                       shuffle=True, **kwargs)
    test_dl = self._make_scvi_dl(adata, indices=indices_test,
                                 shuffle=True, **kwargs)
    return (train_dl, validation_dl, test_dl)
def get_train_val_split(n_samples, test_size, train_size):
    """Return ``(n_train, n_val)`` for a dataset of ``n_samples`` items.

    Delegates to ``_validate_shuffle_split``.  If that raises ``ValueError``
    and either ``train_size == 1.0`` or ``n_samples == 1``, everything is
    assigned to training and ``(n_samples, 0)`` is returned; otherwise a
    new ``ValueError`` is raised.
    """
    try:
        n_train, n_val = _validate_shuffle_split(n_samples, test_size,
                                                 train_size)
    except ValueError:
        all_in_train = train_size == 1.0 or n_samples == 1
        if not all_in_train:
            raise ValueError(
                "Choice of train_size={} and validation_size={} not understood"
                .format(train_size, test_size))
        n_train = n_samples
        n_val = 0
    return n_train, n_val
def _iter_indices(self, X, y=None, groups=None):
    """Yield bootstrap ``(train, test)`` splits.

    Training indices are ``n_train`` draws *with replacement* from all rows
    of ``X``; the test indices are the rows that were never drawn.  Only the
    training size returned by ``_validate_shuffle_split`` is used.
    """
    n_samples = X.shape[0]
    n_train, _ = _validate_shuffle_split(n_samples, self.test_size,
                                         self.train_size)
    rng = check_random_state(self.random_state)

    for _split in range(self.n_splits):
        # Sample rows with replacement.
        ind_train = rng.randint(0, high=X.shape[0], size=n_train)

        # Rows never drawn form the test set.
        every_row = set(np.arange(0, X.shape[0]))
        drawn_rows = set(np.unique(ind_train))
        ind_test = list(every_row.difference(drawn_rows))

        yield ind_train, ind_test
def __init__(self, gene_dataset, train_size=0.1, test_size=None, seed=0,
             num_samples=None, **kwargs):
    """Create label-weighted ``'train'``, ``'test'`` and ``'all'`` loaders.

    :param gene_dataset: dataset; ``gene_dataset.labels[:, 0]`` is read
    :param train_size: float, int, or None (default is 0.1)
    :param test_size: float, int, or None (default is None)
    :param seed: value passed to ``np.random.seed`` before shuffling
    :param num_samples: sample count for the ``'all'`` loader; a falsy value
        means ``len(gene_dataset)``
    :param kwargs: forwarded to the parent constructor
    """
    super(SupervisedTrainTestDataLoaders, self).__init__(gene_dataset,
                                                         **kwargs)

    n_cells = len(self.gene_dataset)
    n_train, n_test = _validate_shuffle_split(n_cells, test_size, train_size)

    # NOTE: this reseeds NumPy's *global* generator.
    np.random.seed(seed=seed)
    shuffled = np.random.permutation(n_cells)
    indices_test = shuffled[:n_test]
    indices_train = shuffled[n_test:(n_test + n_train)]

    # Per-label weight: inverse label frequency divided by the label count.
    unique_labels, label_counts = np.unique(gene_dataset.labels[:, 0],
                                            return_counts=True)
    self.weight_lookup = 1.0 / label_counts * 1.0 / len(unique_labels)

    # Number of samples drawn by the 'all' iterator.
    self.num_samples = num_samples or len(gene_dataset)

    def weights_for(selected):
        # Zero everywhere except the selected cells, which get the weight
        # looked up by their label value.
        weights = np.zeros(len(gene_dataset))
        for idx in selected:
            weights[idx] = self.weight_lookup[gene_dataset.labels[idx, 0]]
        return weights

    weights_train = weights_for(indices_train)
    weights_test = weights_for(indices_test)
    weights_all = weights_for(range(len(gene_dataset)))

    data_loader_train = self(weights=weights_train, num_samples=n_train)
    data_loader_test = self(weights=weights_test, num_samples=n_test)
    data_loader_all = self(weights=weights_all,
                           num_samples=self.num_samples)

    self.dict.update({
        'train': data_loader_train,
        'test': data_loader_test,
        'all': data_loader_all
    })
def test_stratified_shuffle_split_even():
    # StratifiedShuffleSplit should draw every index with an equal chance.
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # The number of times each index was drawn must be plausible under
        # a Binomial(n_splits, p) distribution.
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            likelihood = bf.pmf(count)
            assert_true(
                likelihood > threshold,
                "An index is not drawn with chance corresponding "
                "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = StratifiedShuffleSplit(n_iter=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits += 1
            for sample_idx in train:
                train_counts[sample_idx] += 1
            for sample_idx in test:
                test_counts[sample_idx] += 1
        assert_equal(n_splits, n_iter)

        n_train, n_test = _validate_shuffle_split(
            n_samples,
            test_size=1. / n_folds,
            train_size=1. - (1. / n_folds))

        # Sizes and disjointness are checked on the last split produced.
        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)

        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples
        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
def _iter_indices(self, X, y=None, groups=None):
    """Yield ``(train, test)`` index lists by shuffling ``self.mapping``.

    The split is made over the entries of ``self.mapping``; each selected
    entry ``self.mapping[k]`` is expanded into the yielded lists.  ``X``,
    ``y`` and ``groups`` are not read.
    """
    n_samples = _num_samples(self.mapping)
    n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                              self.train_size)
    rng = check_random_state(self.random_state)

    for _ in range(self.n_splits):
        # Random partition of the mapping keys.
        order = rng.permutation(n_samples)
        test_keys = order[:n_test]
        train_keys = order[n_test:(n_test + n_train)]

        ind_test = [
            member for key in test_keys for member in self.mapping[key]
        ]
        ind_train = [
            member for key in train_keys for member in self.mapping[key]
        ]
        yield ind_train, ind_test
def test_stratified_shuffle_split_even():
    # Verify that StratifiedShuffleSplit picks indices with equal chance.
    n_folds = 5
    n_iter = 1000
    test_fraction = 1. / n_folds

    def assert_counts_are_ok(idx_counts, p):
        # Every per-index draw count has to be likely enough under the
        # binomial distribution with n_splits trials and success rate p.
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for observed in idx_counts:
            mass = bf.pmf(observed)
            assert_true(mass > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = StratifiedShuffleSplit(n_iter=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)
        train_counts = [0] * n_samples
        test_counts = [0] * n_samples

        n_splits = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits += 1
            for tally, members in ((train_counts, train),
                                   (test_counts, test)):
                for member in members:
                    tally[member] += 1
        assert_equal(n_splits, n_iter)

        n_train, n_test = _validate_shuffle_split(
            n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds))

        # These checks look at the final (train, test) pair only.
        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)

        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples
        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
def train_test_validation(
    self,
    model=None,
    gene_dataset=None,
    train_size=0.7,
    test_size=None,
    trainidxs=None,
    testidxs=None,
    type_class=Posterior,
):
    """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.

    The train and test posteriors are built from the externally supplied
    ``trainidxs`` / ``testidxs`` (train/test split taken from the
    BoltzmannMachines package for comparability); the validation posterior
    is always built from an empty index array.  ``train_size`` and
    ``test_size`` are only validated, not used to draw indices.

    :param model: model to use; falls back to ``self.model`` when ``None``
    :param gene_dataset: dataset; falls back to ``self.gene_dataset`` when
        ``None``
    :param train_size: float, int, or None (default is 0.7)
    :param test_size: float, int, or None (default is None)
    :param trainidxs: indices used for the train posterior
    :param testidxs: indices used for the test posterior
    :param type_class: forwarded to ``self.create_posterior``
    :return: tuple ``(train, test, validation)`` of posteriors
    """
    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    # Kept for its validation side effect: inconsistent sizes still raise.
    _validate_shuffle_split(len(gene_dataset), test_size, train_size)

    indices_train = trainidxs
    indices_test = testidxs
    indices_validation = np.array([])

    return (
        self.create_posterior(
            model, gene_dataset, indices=indices_train, type_class=type_class
        ),
        self.create_posterior(
            model, gene_dataset, indices=indices_test, type_class=type_class
        ),
        self.create_posterior(
            model, gene_dataset, indices=indices_validation,
            type_class=type_class
        ),
    )
def _iter_indices(self, X, y=None, groups=None):
    """Yield ``(train, test)`` splits cut from a ``_KennardStone`` ordering.

    The ordering returned by ``_KennardStone()._get_indexes(X)`` is computed
    once; its first ``n_test`` entries form the test set and the following
    ``n_train`` entries the training set.  The same split is yielded
    ``self.n_splits`` times.
    """
    ordering = _KennardStone()._get_indexes(X)

    n_samples = _num_samples(X)
    n_train, n_test = _validate_shuffle_split(
        n_samples,
        self.test_size,
        self.train_size,
        default_test_size=self._default_test_size)
    train_stop = n_test + n_train

    for _ in range(self.n_splits):
        ind_test = ordering[:n_test]
        ind_train = ordering[n_test:train_stop]
        yield ind_train, ind_test
def train_test_validation(
    self,
    model=None,
    gene_dataset=None,
    train_size=0.1,
    test_size=None,
    type_class=Posterior,
):
    """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.

    If ``train_size + test_size < 1`` then ``validation_set`` is non-empty.

    :param model: model to use; falls back to ``self.model`` when ``None``
    :param gene_dataset: dataset; falls back to ``self.gene_dataset`` when
        ``None``
    :param train_size: float, int, or None (default is 0.1)
    :param test_size: float, int, or None (default is None)
    :param type_class: forwarded to ``self.create_posterior``
    :return: tuple ``(train, test, validation)`` of posteriors
    :raises ValueError: if the sizes are rejected and ``train_size != 1.0``
    """
    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    n = len(gene_dataset)
    try:
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
    except ValueError as err:
        # train_size == 1.0 is tolerated and means "train on everything".
        if train_size != 1.0:
            raise ValueError(
                "Choice of train_size={} and test_size={} not understood".
                format(train_size, test_size)) from err
        n_train, n_test = n, 0

    random_state = np.random.RandomState(seed=self.seed)
    permutation = random_state.permutation(n)
    indices_test = permutation[:n_test]
    indices_train = permutation[n_test:(n_test + n_train)]
    indices_validation = permutation[(n_test + n_train):]

    return (
        self.create_posterior(model, gene_dataset, indices=indices_train,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset, indices=indices_test,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset,
                              indices=indices_validation,
                              type_class=type_class),
    )
def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
             random_state=None):
    """Store the split configuration and precompute class/size information.

    Attributes set: ``y`` (as an array), ``classes`` and ``y_indices`` (from
    ``np.unique(y, return_inverse=True)``), ``n``, ``n_iter``,
    ``test_size``, ``train_size``, ``random_state``, ``n_train`` and
    ``n_test``.
    """
    n = len(y)
    self.y = np.array(y)

    unique_result = np.unique(y, return_inverse=True)
    self.classes, self.y_indices = unique_result

    self.random_state = random_state
    self.train_size = train_size
    self.test_size = test_size
    self.n_iter = n_iter
    self.n = n

    split_sizes = _validate_shuffle_split(n, test_size, train_size)
    self.n_train, self.n_test = split_sizes
def mc_split(R, n_splits=1, test_size='default', train_size=None,
             random_state=None):
    """Train-test splitting.

    Generator yielding ``n_splits`` pairs ``(ind_train, ind_test)`` of
    indices in ``range(np.prod(R.shape))``, i.e. one index per entry of
    ``R``.  Each pair comes from a fresh permutation drawn from
    ``random_state``.
    """
    n_entries = np.prod(R.shape)
    n_train, n_test = _validate_shuffle_split(n_entries, test_size,
                                              train_size)
    rng = check_random_state(random_state)
    train_stop = n_test + n_train

    for _ in range(n_splits):
        shuffled = rng.permutation(n_entries)
        ind_test = shuffled[:n_test]
        ind_train = shuffled[n_test:train_stop]
        yield ind_train, ind_test
def train_test(self, model=None, gene_dataset=None, train_size=0.1,
               test_size=None, seed=0, test_indices=None,
               type_class=Posterior):
    """Create a train posterior and a test posterior.

    When ``test_indices`` is ``None`` the split is random; otherwise the
    given indices form the test set and every other position of the dataset
    forms the training set.

    :param model: model to use; falls back to ``self.model`` when ``None``
    :param gene_dataset: dataset; falls back to ``self.gene_dataset`` when
        ``None``
    :param train_size: float, int, or None (default is 0.1); ignored when
        ``test_indices`` is given
    :param test_size: float, int, or None (default is None); ignored when
        ``test_indices`` is given
    :param seed: value passed to ``np.random.seed`` for the random split
    :param test_indices: optional explicit test indices
    :param type_class: forwarded to ``self.create_posterior``
    :return: tuple ``(train_posterior, test_posterior)``
    """
    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    n = len(gene_dataset)
    if test_indices is None:
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        # NOTE: this reseeds NumPy's *global* generator.
        np.random.seed(seed=seed)
        permutation = np.random.permutation(n)
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]
    else:
        indices_test = np.array(test_indices)
        all_indices = np.arange(n)
        # Training set is the complement of the requested test indices.
        indices_train = all_indices[~np.isin(all_indices, indices_test)]
        # Internal sanity check: the two sets must be disjoint.
        assert len(np.intersect1d(indices_train, indices_test)) == 0

    return (
        self.create_posterior(model, gene_dataset, indices=indices_train,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset, indices=indices_test,
                              type_class=type_class),
    )
def _iter_indices(self, X, y, groups=None):
    """Yield per-class bootstrap ``(train, test)`` index lists.

    For every distinct value of ``y`` its member indices are resampled with
    replacement via ``np.random.choice`` (the global NumPy generator); the
    first ``n_train`` draws go to the training list and the remainder to the
    test list.  ``X`` and ``groups`` are not read.
    """
    class_members = [np.where(y == label)[0] for label in np.unique(y)]
    class_sizes = [
        _validate_shuffle_split(len(members),
                                self.test_size,
                                self.train_size,
                                default_test_size=self._default_test_size)
        for members in class_members
    ]

    for _ in range(self.n_splits):
        train, test = [], []
        for members, (n_train, _n_test) in zip(class_members, class_sizes):
            resampled = np.random.choice(members, len(members), replace=True)
            train.extend(resampled[:n_train])
            test.extend(resampled[n_train:])
        yield train, test
def _iter_indices(self, X, y, groups=None):
    """Yield balanced ``(train, test)`` index arrays.

    The train/test sizes are derived from the *smallest* group count in
    ``np.bincount(y)``, and that many samples are taken from every group.
    When ``self.test_size`` is ``None`` all remaining samples of a group go
    to the test set; otherwise exactly ``test_size`` per group.

    Shuffling uses ``np.random.permutation`` (the global NumPy generator).
    ``X`` and ``groups`` are not read.
    """
    groupcount = np.bincount(y)
    mingroup = int(np.nanmin(groupcount))
    train_size, test_size = _validate_shuffle_split(mingroup, self.test_size,
                                                    self.train_size)

    # Loop-invariant: positions 0..len(y)-1, filtered per group below.
    positions = np.arange(len(y))
    take_rest = self.test_size is None

    for _ in range(self.n_splits):
        train_idx = np.empty((0, ), dtype='int')
        test_idx = np.empty((0, ), dtype='int')
        for g in range(len(groupcount)):
            random_indices = np.random.permutation(positions[y == g])
            train_idx = np.concatenate(
                (train_idx, random_indices[0:train_size]))
            if take_rest:
                test_part = random_indices[train_size:]
            else:
                test_part = random_indices[train_size:train_size + test_size]
            test_idx = np.concatenate((test_idx, test_part))
        yield train_idx, test_idx
def train_test_split(*arrays, test_size=None, train_size=None, **kwargs):
    """Split arrays into train and test subsets using ``KSSplit``.

    The split indices are computed from the first array and applied to every
    array.  Extra keyword arguments are accepted but not used.

    Returns
    -------
    list
        ``[a0_train, a0_test, a1_train, a1_test, ...]``.

    Raises
    ------
    ValueError
        If no array is given.
    """
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)

    splitter = KSSplit(test_size=n_test, train_size=n_train)
    train, test = next(splitter.split(X=arrays[0]))

    pieces = []
    for array in arrays:
        pieces.append(_safe_indexing(array, train))
        pieces.append(_safe_indexing(array, test))
    return pieces
def gen_folds(args, dataset, test_size):
    """Generate random train/test folds and save them to ``args.fold_path``.

    ``args.n_folds`` permutations are drawn from a generator seeded with
    ``args.seed``.  The stacked index arrays are written with ``np.savez``
    under the keys ``train`` and ``test``.  The default test fraction is 0.1
    when ``test_size`` is ``None``.
    """
    from sklearn.utils.validation import check_random_state
    from sklearn.model_selection._split import _validate_shuffle_split

    n_samples = len(dataset)
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              None,
                                              default_test_size=0.1)
    rng = check_random_state(args.seed)

    # One random partition per fold, drawn in order.
    permutations = [rng.permutation(n_samples) for _ in range(args.n_folds)]
    test_folds = [perm[:n_test] for perm in permutations]
    train_folds = [perm[n_test:(n_test + n_train)] for perm in permutations]

    stacked_train = np.stack(train_folds)
    stacked_test = np.stack(test_folds)
    np.savez(args.fold_path, train=stacked_train, test=stacked_test)
def train_test_validation(
    self,
    model=None,
    gene_dataset=None,
    train_size=0.1,
    test_size=None,
    type_class=Posterior,
):
    """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.

    ``train_size == 1.0`` bypasses size validation and puts every cell in
    the training set.  Otherwise cells left over after the train and test
    sets are filled form the validation set.

    :param model: model to use; falls back to ``self.model`` when ``None``
    :param gene_dataset: dataset; falls back to ``self.gene_dataset`` when
        ``None``
    :param train_size: float, int, or None (default is 0.1)
    :param test_size: float, int, or None (default is None)
    :param type_class: forwarded to ``self.create_posterior``
    :return: tuple ``(train, test, validation)`` of posteriors
    """
    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    n = len(gene_dataset)
    if train_size == 1.0:
        n_train = n
        n_test = 0
    else:
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)

    random_state = np.random.RandomState(seed=self.seed)
    permutation = random_state.permutation(n)
    indices_test = permutation[:n_test]
    indices_train = permutation[n_test:(n_test + n_train)]
    indices_validation = permutation[(n_test + n_train):]

    return (
        self.create_posterior(model, gene_dataset, indices=indices_train,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset, indices=indices_test,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset,
                              indices=indices_validation,
                              type_class=type_class),
    )
def getDataLoaders(self, batch_size, shuffle, device, *args):
    """Build train and test ``DataLoader`` objects over a synthetic dataset.

    A ``SyntheticDataset`` is created from ``self.data_dim`` and ``args`` and
    split randomly with a training fraction of 0.7.  Both loaders drop the
    last incomplete batch; only the training loader honours ``shuffle``.
    When ``device == "cuda"`` the loaders use one worker and pinned memory.

    Returns
    -------
    (train_loader, test_loader)
    """
    if device == "cuda":
        kwargs = {'num_workers': 1, 'pin_memory': True}
    else:
        kwargs = {}

    print('Load training data...')
    dataset = SyntheticDataset(*self.data_dim, *args)

    n_train, n_test = _validate_shuffle_split(len(dataset),
                                              test_size=None,
                                              train_size=0.7)
    subsets = torch.utils.data.random_split(dataset, [n_train, n_test])
    train_dataset, test_dataset = subsets

    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              drop_last=True,
                              shuffle=shuffle,
                              **kwargs)
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             drop_last=True,
                             shuffle=False,
                             **kwargs)
    return train_loader, test_loader
def _indices(self, rng, x_grouped):
    """Split every group separately and return global train/test indices.

    Groups are visited in iteration order and assumed to occupy consecutive
    positions (a running offset is added to each group's local indices).
    Groups with fewer than ``self.min_samples`` members go entirely to the
    training set.

    Returns
    -------
    (ind_train, ind_test) : tuple of np.ndarray
    """
    offset = 0
    train_acc = []
    test_acc = []

    for _, members in x_grouped:
        group_len = len(members)
        if group_len >= self.min_samples:
            n_train, n_test = _validate_shuffle_split(group_len,
                                                      self.test_size,
                                                      self.train_size)
            order = rng.permutation(group_len)
            test_acc.extend(order[:n_test] + offset)
            train_acc.extend(order[n_test:(n_test + n_train)] + offset)
        else:
            # Too small to split: keep the whole group for training.
            train_acc.extend(np.arange(group_len) + offset)
        offset += group_len

    return np.array(train_acc), np.array(test_acc)
def train_test_validation(
    self,
    model=None,
    gene_dataset=None,
    train_size=0.9,
    test_size=None,
    type_class=Posterior,
):
    """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.

    If ``train_size + test_size < 1`` then ``validation_set`` is non-empty.

    Parameters
    ----------
    train_size :
        float, or None (default is 0.9)
    test_size :
        float, or None (default is None)
    model :
        Model to use; falls back to ``self.model`` when None.
    gene_dataset :
        Dataset to split; falls back to ``self.gene_dataset`` when None.
    type_class :
        Forwarded to ``self.create_posterior`` (Default value = Posterior)

    Returns
    -------
    tuple
        ``(train, test, validation)`` posteriors.

    Raises
    ------
    ValueError
        If ``train_size`` is outside ``(0, 1]``, or if the sizes are
        rejected by the validator and ``train_size != 1.0``.
    """
    train_size = float(train_size)
    if train_size > 1.0 or train_size <= 0.0:
        raise ValueError(
            "train_size needs to be greater than 0 and less than or equal to 1"
        )

    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    n = len(gene_dataset)
    try:
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
    except ValueError as err:
        # train_size == 1.0 is tolerated and means "train on everything".
        if train_size != 1.0:
            raise ValueError(
                "Choice of train_size={} and test_size={} not understood".
                format(train_size, test_size)) from err
        n_train, n_test = n, 0

    random_state = np.random.RandomState(seed=self.seed)
    permutation = random_state.permutation(n)
    indices_test = permutation[:n_test]
    indices_train = permutation[n_test:(n_test + n_train)]
    indices_validation = permutation[(n_test + n_train):]

    return (
        self.create_posterior(model, gene_dataset, indices=indices_train,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset, indices=indices_test,
                              type_class=type_class),
        self.create_posterior(model, gene_dataset,
                              indices=indices_validation,
                              type_class=type_class),
    )
def train_test_validation(
    self,
    model: GaussianTreeVAE = None,
    gene_dataset: TreeDataset = None,
    train_size: float = 0.8,
    test_size: int = None,
    type_class=GaussianTreePosterior,
):
    """Split the cells of every tree leaf and build the training posterior.

    This works a bit differently for a TreeTrainer - in order to respect the
    tree prior we need to draw our observations from within sets of cells
    related to one another (i.e. in a clade). One can think of this
    analogously to identifying clusters from the hierarchical ordering
    described by the tree, and splitting each cluster into
    train/test/validation.

    For every leaf of ``model.tree`` the dataset positions of its cells are
    stored on the leaf (``leaf.indices``) and appended to ``self.clades``.
    Leaves with a single cell go entirely to the training set; all others
    are split into train/test/validation index lists.

    NOTE: test and validation index lists are computed and printed, but only
    the training posterior is returned (the other two are disabled).

    :param model: A ``TreeVAE`` model; falls back to ``self.model``.
    :param gene_dataset: A ``TreeDataset`` instance; falls back to
        ``self.gene_dataset``.
    :param train_size: float, int, or None (default is 0.8)
    :param test_size: float, int, or None (default is None)
    :param type_class: Type of Posterior object to create (here,
        TreePosterior)
    :return: the posterior created from the per-leaf training indices
    """

    def get_indices_in_dataset(_subset, _subset_indices, master_list):
        # Positions in master_list whose value is one of the selected cells.
        _cells = np.array(_subset)[np.array(_subset_indices)]
        return list(np.where(np.isin(master_list, _cells))[0])

    if model is None and hasattr(self, "model"):
        model = self.model
    # The fallback is guarded by the attribute that is actually read
    # (the original tested for "model" here).
    if gene_dataset is None and hasattr(self, "gene_dataset"):
        gene_dataset = self.gene_dataset

    barcodes = gene_dataset.barcodes
    leaves = [
        node for node in model.tree.traverse('levelorder') if node.is_leaf()
    ]

    # One list of dataset indices per leaf for each of the three sets.
    train_indices, test_indices, validate_indices = [], [], []

    # Introduce an index array for each leaf in the tree.
    for leaf in leaves:
        cells = leaf.cells
        indices = get_indices_in_dataset(cells, list(range(len(cells))),
                                         barcodes)
        leaf.indices = np.array(indices)
        self.clades.append(indices)

    # Randomly split the cells of each leaf into test/train/validation.
    for leaf in leaves:
        leaf_bunch = leaf.indices

        if len(leaf_bunch) == 1:
            # A single-cell leaf always goes to the training set.
            train_indices.append([leaf_bunch[0]])
        else:
            n_train, n_test = _validate_shuffle_split(
                len(leaf_bunch), test_size, train_size)

            # NOTE(review): the generator is re-created from self.seed for
            # every leaf, so leaves of equal size are shuffled with the same
            # pattern. Kept as-is to preserve existing splits.
            random_state = np.random.RandomState(seed=self.seed)
            permutation = random_state.permutation(leaf_bunch)
            test_indices.append(list(permutation[:n_test]))
            train_indices.append(
                list(permutation[n_test:(n_test + n_train)]))
            # Whatever is left after test and train is the validation part.
            validate_indices.append(list(permutation[(n_test + n_train):]))

    # Print the three sets so the split can be checked by eye.
    print("train_leaves: ", train_indices)
    print("test_leaves: ", test_indices)
    print("validation leaves: ", validate_indices)

    return (
        self.create_posterior(model, gene_dataset, train_indices,
                              type_class=type_class)
    )
def _daal_train_test_split(*arrays, **options):
    """Split arrays into train and test subsets, using daal4py where possible.

    Recognised options (all popped from ``options``): ``test_size``,
    ``train_size``, ``random_state``, ``stratify``, ``shuffle`` (default
    True) and ``rng`` (default ``'OPTIMIZED_MT19937'``).  Any other option
    raises ``TypeError``.

    The function first computes ``train``/``test`` index arrays, then
    extracts the rows of every input array.  Arrays that are not a NumPy
    array / pandas DataFrame / pandas Series, have more than two dimensions,
    or contain a dtype whose name has neither ``'float'`` nor ``'int'`` are
    split with ``safe_indexing``; all others go through
    ``d4p.daal_train_test_split``.

    Returns
    -------
    list
        ``[a0_train, a0_test, a1_train, a1_test, ...]``.

    Raises
    ------
    ValueError
        No input arrays, unknown ``rng``, ``stratify`` combined with
        ``shuffle=False``, or data that cannot be converted.
    TypeError
        Unknown keyword options.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    # Everything recognised has been popped; leftovers are invalid.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)

    # ---- Step 1: compute the train / test index arrays -------------------
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        # No shuffling: first n_train rows train, next n_test rows test.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test,
                                        train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            # Three ways to shuffle, tried in this order:
            # mkl_random, daal4py's shuffled-index generator, ShuffleSplit.
            if mkl_random_is_imported and rng not in [
                    'default', 'OPTIMIZED_MT19937'
            ] and (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                indexes = random_state.permutation(n_train + n_test)
                test, train = indexes[:n_test], indexes[n_test:]
            elif rng == 'OPTIMIZED_MT19937' and daal_check_version(((2020,'P', 3), (2021,'B',9))) \
                and (isinstance(random_state, int) or random_state is None) \
                    and platform.system() != 'Windows':
                # int32 indices unless the count does not fit.
                indexes = np.empty(shape=(n_train + n_test, ),
                                   dtype=np.int64
                                   if n_train + n_test > 2**31 - 1 else np.int32)
                # Element 1 of the NumPy RandomState state tuple is handed to
                # daal4py, which fills ``indexes`` in place.
                random_state = np.random.RandomState(random_state)
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:]
            else:
                cv = ShuffleSplit(test_size=n_test,
                                  train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    # ---- Step 2: extract the rows of every input array -------------------
    res = []
    for arr in arrays:
        fallback = False

        # input format check: only ndarray / DataFrame / Series are handled
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr,
                                  pd.core.frame.DataFrame) and not isinstance(
                                      arr, pd.core.series.Series):
                    fallback = True
            else:
                fallback = True

        # dimensions check: at most 2-D
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check: every dtype name must contain 'float' or 'int'
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        else:
            for i, dtype in enumerate(dtypes):
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    fallback = True
                    break

        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:
            # 1-D input is processed as a single column and flattened again
            # afterwards.
            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols),
                                            order='A')
            if isinstance(arr_copy, np.ndarray):
                # Output buffers keep the memory order of the input.
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(shape=(n_train, n_cols),
                                     dtype=arr_copy.dtype,
                                     order=order)
                test_arr = np.empty(shape=(n_test, n_cols),
                                    dtype=arr_copy.dtype,
                                    order=order)
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                # List form: one 1-D output buffer per element, later keyed
                # by the names in ``arr.columns``.
                train_arr = [
                    np.empty(shape=(n_train, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            # Re-wrap the results in the pandas type of the input.
            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = pd.DataFrame(
                        train_arr), pd.DataFrame(test_arr)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = train_arr.reshape(
                        n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(train_arr), pd.Series(
                        test_arr)

            # The outputs' index is set to the selected row positions.
            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res
def _iter_indices(self, expr, omic=None, groups=None):
    """Generate train/test index splits for stratified shuffle splitting.

    The layout of the inputs selects one of four strategies:

    * ``expr`` and ``omic`` both have a ``shape``: ``omic`` is reduced to a
      single stratification label (rows OR-ed together when it has several
      columns, values split at their median when it has more than 10
      unique values) and the parent splitter is used.
    * only ``omic`` has a ``shape``: ``expr`` is treated as a mapping and
      its first value is passed to the parent splitter; an ``omic`` with
      more than two unique values is binarized at its median (row means
      with missing values set to zero are used when it is not 1-D).
    * only ``expr`` has a ``shape``: ``omic`` is iterated as a collection
      of label vectors whose class indices are combined into one merged
      class per sample; merged classes with fewer than two members are
      folded into a neighbouring class before stratified sampling.
    * neither has a ``shape``: both are treated as mappings sharing the
      same keys, and every key is stratified independently.

    Args:
        expr: Expression data, either one array-like or a mapping of them.
        omic: Phenotype data to stratify on, laid out as described above.
        groups: Passed through unchanged to the parent splitter.

    Yields:
        ``(train, test)`` pairs of sample indices. In the third case these
        are lists; in the fourth case they are dicts mapping each key to
        an array of indices; otherwise they are whatever the parent
        splitter yields.

    Raises:
        ValueError: If the requested train or test size is smaller than
            the number of classes, or (fourth case) if a class has fewer
            than two members.
        TypeError: If a multi-dimensional ``omic`` in the second case is
            neither a pandas DataFrame nor a numpy array.
    """
    # with one domain and one variant to predict proceed with stratified
    # sampling, binning mutation values if they are continuous
    if hasattr(expr, 'shape') and hasattr(omic, 'shape'):
        if len(omic.shape) > 1 and omic.shape[1] > 1:
            omic_use = np.apply_along_axis(
                lambda x: reduce(or_, x), 1, omic)
        elif len(np.unique(omic)) > 10:
            omic_use = omic > np.percentile(omic, 50)
        else:
            omic_use = omic.copy()

        yield from super()._iter_indices(
            X=expr, y=omic_use, groups=groups)

    # several expression datasets sharing a single phenotype array
    elif hasattr(omic, 'shape'):
        if len(np.unique(omic)) > 2:
            if len(omic.shape) == 1:
                omic = omic > np.percentile(omic, 50)
            else:
                if isinstance(omic, pd.DataFrame):
                    samp_mean = np.mean(omic.fillna(0.0), axis=1)
                elif isinstance(omic, np.ndarray):
                    samp_mean = np.mean(np.nan_to_num(omic), axis=1)
                else:
                    # previously fell through to an unbound `samp_mean`
                    raise TypeError(
                        "Multi-dimensional phenotype data must be a "
                        "pandas DataFrame or a numpy array, got "
                        "{}".format(type(omic).__name__))
                omic = samp_mean > np.percentile(samp_mean, 50)

        yield from super()._iter_indices(
            X=list(expr.values())[0], y=omic, groups=groups)

    # one expression dataset with several phenotypes to predict
    elif hasattr(expr, 'shape'):
        # gets info about input
        n_samples = _num_samples(expr)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size)

        # encode each sample's combination of per-phenotype class indices
        # as one integer: phenotype i contributes its class index * 2**i
        class_info = [np.unique(y, return_inverse=True) for y in omic]
        merged_classes = reduce(
            lambda x, y: x + y,
            [y_ind * 2**i for i, (_, y_ind) in enumerate(class_info)])
        merged_counts = np.bincount(merged_classes)

        # Fold every merged class with fewer than two members into a
        # neighbouring class. `new_counts` is updated together with the
        # relabelling so that it always equals the bincount of
        # `merged_classes`; otherwise draws would be allocated to classes
        # that no longer have any samples and the splits would come up
        # short.
        new_counts = merged_counts.copy()
        present = np.flatnonzero(merged_counts).tolist()
        kept_classes = []
        for pos, label in enumerate(present):
            if new_counts[label] >= 2:
                kept_classes.append(label)
                continue

            if kept_classes:
                # nearest surviving class with a smaller label
                target = kept_classes[-1]
            elif pos + 1 < len(present):
                # nothing survives below: merge forward; the target is
                # re-checked with its enlarged count on its own turn
                target = present[pos + 1]
            else:
                # the only class present: nothing to merge it into
                kept_classes.append(label)
                continue

            merged_classes[merged_classes == label] = target
            new_counts[target] += new_counts[label]
            new_counts[label] = 0

        n_class = len(kept_classes)
        if n_train < n_class:
            raise ValueError('The train_size = %d should be greater or '
                             'equal to the number of classes = %d'
                             % (n_train, n_class))
        if n_test < n_class:
            raise ValueError('The test_size = %d should be greater or '
                             'equal to the number of classes = %d'
                             % (n_test, n_class))

        # generates random training and testing cohorts
        rng = check_random_state(self.random_state)
        for _ in range(self.n_splits):
            # per-class number of training draws, then of testing draws
            # taken from what is left over in each class
            n_is = _approximate_mode(new_counts, n_train, rng)
            class_counts_remaining = new_counts - n_is
            t_is = _approximate_mode(class_counts_remaining, n_test, rng)

            train = []
            test = []
            for class_i in kept_classes:
                permutation = rng.permutation(new_counts[class_i])
                perm_indices_class = np.where(
                    merged_classes == class_i)[0][permutation]

                train.extend(perm_indices_class[:n_is[class_i]])
                test.extend(
                    perm_indices_class[n_is[class_i]:(n_is[class_i]
                                                      + t_is[class_i])])

            # shuffle so that indices are not grouped by class
            train = rng.permutation(train).tolist()
            test = rng.permutation(test).tolist()

            yield train, test

    # otherwise, perform stratified sampling on each cohort separately
    else:
        # gets info about input
        n_samples = {lbl: _num_samples(X) for lbl, X in expr.items()}
        n_train_test = {
            lbl: _validate_shuffle_split(
                n_samps, self.test_size, self.train_size)
            for lbl, n_samps in n_samples.items()
            }

        class_info = {
            lbl: np.unique(y, return_inverse=True)
            for lbl, y in omic.items()
            }
        n_classes = {
            lbl: classes.shape[0]
            for lbl, (classes, _) in class_info.items()
            }
        classes_counts = {
            lbl: np.bincount(y_indices)
            for lbl, (_, y_indices) in class_info.items()
            }

        # ensure we have enough samples in each class for stratification
        for lbl, (n_train, n_test) in n_train_test.items():
            if np.min(classes_counts[lbl]) < 2:
                raise ValueError(
                    "The least populated phenotype class in {} has only "
                    "one member, which is too few. The minimum number of "
                    "groups for any phenotypic feature to predict cannot "
                    "be less than two.".format(lbl))

            if n_train < n_classes[lbl]:
                raise ValueError(
                    "The number of training samples ({}) should be "
                    "greater or equal to the number of "
                    "phenotypes ({})".format(n_train, n_classes[lbl]))

            if n_test < n_classes[lbl]:
                raise ValueError(
                    "The number of testing samples ({}) should be "
                    "greater or equal to the number of "
                    "phenotypes ({})".format(n_test, n_classes[lbl]))

        # generates random training and testing cohorts
        rng = check_random_state(self.random_state)
        for _ in range(self.n_splits):
            n_is = {
                lbl: _approximate_mode(
                    classes_counts[lbl], n_train_test[lbl][0], rng)
                for lbl in expr
                }
            classes_counts_left = {
                lbl: classes_counts[lbl] - n_is[lbl] for lbl in expr}
            t_is = {
                lbl: _approximate_mode(
                    classes_counts_left[lbl], n_train_test[lbl][1], rng)
                for lbl in expr
                }

            train = {lbl: [] for lbl in expr}
            test = {lbl: [] for lbl in expr}

            for lbl, (classes, _) in class_info.items():
                for i, class_i in enumerate(classes):
                    permutation = rng.permutation(classes_counts[lbl][i])
                    perm_indices_class_i = np.where(
                        (omic[lbl] == class_i))[0][permutation]

                    train[lbl].extend(perm_indices_class_i[:n_is[lbl][i]])
                    test[lbl].extend(
                        perm_indices_class_i[n_is[lbl][i]:n_is[lbl][i]
                                             + t_is[lbl][i]])

                # shuffle so that indices are not grouped by class
                train[lbl] = rng.permutation(train[lbl])
                test[lbl] = rng.permutation(test[lbl])

            yield train, test