Example #1
    def _split_blockwise(self, X):
        chunks = X.chunks[0]
        rng = check_random_state(self.random_state)
        seeds = rng.randint(0, 2**32 - 1, size=len(chunks))

        train_pct, test_pct = _maybe_normalize_split_sizes(
            self.train_size, self.test_size)
        sizes = [
            _validate_shuffle_split(c, test_pct, train_pct) for c in chunks
        ]

        objs = [
            dask.delayed(_generate_idx, nout=2)(chunksize, seed, n_train, n_test)
            for chunksize, seed, (n_train, n_test) in zip(chunks, seeds, sizes)
        ]

        train_objs, test_objs = zip(*objs)
        offsets = np.hstack([0, np.cumsum(chunks)])
        train_idx = da.concatenate([
            da.from_delayed(x + offset, (train_size,), 'i8')
            for x, chunksize, (train_size, _), offset in zip(
                train_objs, chunks, sizes, offsets)
        ])
        test_idx = da.concatenate([
            da.from_delayed(x + offset, (test_size,), 'i8')
            for x, chunksize, (_, test_size), offset in zip(
                test_objs, chunks, sizes, offsets)
        ])

        return train_idx, test_idx
Example #2
    def _split_blockwise(self, X, seeds):
        chunks = X.chunks[0]

        train_pct, test_pct = _maybe_normalize_split_sizes(
            self.train_size, self.test_size)
        sizes = [
            _validate_shuffle_split(c, test_pct, train_pct) for c in chunks
        ]

        objs = [
            dask.delayed(_generate_idx, nout=2)(chunksize, seed, n_train, n_test)
            for chunksize, seed, (n_train, n_test) in zip(chunks, seeds, sizes)
        ]

        train_objs, test_objs = zip(*objs)
        offsets = np.hstack([0, np.cumsum(chunks)])
        train_idx = da.concatenate([
            da.from_delayed(x + offset, (train_size,), np.dtype("int"))
            for x, chunksize, (train_size, _), offset in zip(
                train_objs, chunks, sizes, offsets)
        ])
        test_idx = da.concatenate([
            da.from_delayed(x + offset, (test_size,), np.dtype("int"))
            for x, chunksize, (_, test_size), offset in zip(
                test_objs, chunks, sizes, offsets)
        ])

        return train_idx, test_idx
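
Both blockwise splitters above lean on scikit-learn's private helper _validate_shuffle_split, which converts a (test_size, train_size) request into integer counts. A minimal sketch of that call, assuming a scikit-learn version that still exposes the helper under this private path (being private, its location and signature can change between releases):

from sklearn.model_selection._split import _validate_shuffle_split

# 100 samples with 20% requested for the test side: the helper returns
# integer counts (n_train, n_test), not indices.
n_train, n_test = _validate_shuffle_split(100, 0.2, None)
print(n_train, n_test)  # 80 20
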
Example #3
    def __init__(self,
                 gene_dataset,
                 train_size=0.1,
                 test_size=None,
                 seed=0,
                 **data_loaders_kwargs):
        """
        :param train_size: float, int, or None (default is 0.1)
        :param test_size: float, int, or None (default is None)
        """
        super(TrainTestDataLoaders, self).__init__(gene_dataset,
                                                   **data_loaders_kwargs)

        n = len(self.gene_dataset)
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        np.random.seed(seed=seed)
        permutation = np.random.permutation(n)
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]

        data_loader_train = self(indices=indices_train)
        data_loader_test = self(indices=indices_test)

        self.data_loaders_dict.update({
            'train': data_loader_train,
            'test': data_loader_test
        })
Example #4
    def _iter_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != "multilabel-indicator":
            raise ValueError(
                "Supported target type is: multilabel-indicator. Got {!r} instead.".format(
                    type_of_target_y
                )
            )

        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size
        )

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        r = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = _iterative_stratification(labels=y, r=r, random_state=rng)

            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test
Example #5
    def _iter_indices(self, X, y, groups=None):  # type: ignore
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size,
            default_test_size=self._default_test_size)

        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)
        # print(class_counts)

        if n_train < n_classes:
            raise ValueError('The train_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_train, n_classes))
        if n_test < n_classes:
            raise ValueError('The test_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_test, n_classes))

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        rng = check_random_state(self.random_state)

        for _ in range(self.n_splits):
            # if there are ties in the class-counts, we want
            # to make sure to break them anew in each iteration
            n_i = _approximate_mode(class_counts, n_train, rng)
            class_counts_remaining = class_counts - n_i
            t_i = _approximate_mode(class_counts_remaining, n_test, rng)
            train = []
            test = []

            for i in range(n_classes):
                # print("Before", i, class_counts[i], n_i[i], t_i[i])
                permutation = rng.permutation(class_counts[i])
                perm_indices_class_i = class_indices[i].take(permutation,
                                                             mode='clip')
                if n_i[i] == 0:
                    n_i[i] = 1
                    t_i[i] = t_i[i] - 1

                # print("After", i, class_counts[i], n_i[i], t_i[i])
                train.extend(perm_indices_class_i[:n_i[i]])
                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test
Example #6
    def train_test(self,
                   model=None,
                   gene_dataset=None,
                   train_size=0.1,
                   test_size=None,
                   seed=0,
                   type_class=Posterior):
        """
        :param train_size: float, int, or None (default is 0.1)
        :param test_size: float, int, or None (default is None)
        """
        model = self.model if model is None and hasattr(self,
                                                        "model") else model
        gene_dataset = self.gene_dataset if gene_dataset is None and hasattr(
            self, "model") else gene_dataset
        n = len(gene_dataset)
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        np.random.seed(seed=seed)
        permutation = np.random.permutation(n)
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]

        return (self.create_posterior(model,
                                      gene_dataset,
                                      indices=indices_train,
                                      type_class=type_class),
                self.create_posterior(model,
                                      gene_dataset,
                                      indices=indices_test,
                                      type_class=type_class))
Example #7
    def __init__(
        self,
        n_splits=10,
        *,
        test_size=None,
        train_size=None,
        random_state=None,
        buffer_width,
        n_blocks,
    ):
        if not isinstance(buffer_width, int) or buffer_width < 0:
            raise ValueError(
                f"'buffer_width' must be a non-negative integer; it is used for indexing. Given {buffer_width}"
            )

        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
        )

        self._default_test_size = 0.2
        self._width = buffer_width
        self._n_blocks = n_blocks

        self._n_train, self._n_test = _validate_shuffle_split(
            self._n_blocks,
            self.test_size,
            self.train_size,
            default_test_size=self._default_test_size,
        )
Example #8
    def _train_test_val_split(
        self,
        adata: AnnData,
        train_size: float = 0.9,
        validation_size: Optional[float] = None,
        **kwargs,
    ):
        """
        Creates data loaders ``train_set``, ``validation_set``, ``test_set``.

        If ``train_size + validation_size < 1`` then ``test_set`` is non-empty.

        Parameters
        ----------
        adata
            Setup AnnData to be split into train, test, validation sets
        train_size
            float, or None (default is 0.9)
        validation_size
            float, or None (default is None)
        **kwargs
            Keyword args for `_make_scvi_dl()`
        """
        train_size = float(train_size)
        if train_size > 1.0 or train_size <= 0.0:
            raise ValueError(
                "train_size needs to be greater than 0 and less than or equal to 1"
            )

        n = len(adata)
        try:
            n_train, n_val = _validate_shuffle_split(n, validation_size,
                                                     train_size)
        except ValueError:
            if train_size != 1.0:
                raise ValueError(
                    "Choice of train_size={} and validation_size={} not understood"
                    .format(train_size, validation_size))
            n_train, n_val = n, 0
        random_state = np.random.RandomState(seed=settings.seed)
        permutation = random_state.permutation(n)
        indices_validation = permutation[:n_val]
        indices_train = permutation[n_val:(n_val + n_train)]
        indices_test = permutation[(n_val + n_train):]

        return (
            self._make_scvi_dl(adata,
                               indices=indices_train,
                               shuffle=True,
                               **kwargs),
            self._make_scvi_dl(adata,
                               indices=indices_validation,
                               shuffle=True,
                               **kwargs),
            self._make_scvi_dl(adata,
                               indices=indices_test,
                               shuffle=True,
                               **kwargs),
        )
Example #9
 def get_train_val_split(n_samples, test_size, train_size):
     try:
         n_train, n_val = _validate_shuffle_split(
             n_samples, test_size, train_size)
     except ValueError:
         if train_size != 1.0 and n_samples != 1:
             raise ValueError(
                 "Choice of train_size={} and validation_size={} not understood"
                 .format(train_size, test_size))
         n_train, n_val = n_samples, 0
     return n_train, n_val
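
A short usage sketch for the helper above, with made-up numbers: an explicit 80/20 request is forwarded to _validate_shuffle_split and comes back as integer counts, while train_size=1.0 trips the except branch and falls back to (n_samples, 0).

# hypothetical call, not part of the original snippet
n_train, n_val = get_train_val_split(100, test_size=0.2, train_size=0.8)
print(n_train, n_val)  # 80 20
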
Example #10
 def _iter_indices(self, X, y=None, groups=None):
     n_samples = X.shape[0]
     n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                               self.train_size)
     rng = check_random_state(self.random_state)
     for i in range(self.n_splits):
         # draw train indices with replacement; test = indices never drawn
         ind_train = rng.randint(0, high=X.shape[0], size=n_train)
         ind_test = list(
             set(np.arange(0, X.shape[0])) - set(np.unique(ind_train)))
         yield ind_train, ind_test
Example #11
    def __init__(self,
                 gene_dataset,
                 train_size=0.1,
                 test_size=None,
                 seed=0,
                 num_samples=None,
                 **kwargs):
        """
        :param train_size: float, int, or None (default is 0.1)
        :param test_size: float, int, or None (default is None)
        """
        super(SupervisedTrainTestDataLoaders,
              self).__init__(gene_dataset, **kwargs)

        n = len(self.gene_dataset)
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        np.random.seed(seed=seed)
        permutation = np.random.permutation(n)

        # Get indices
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]

        # Get weights for each label
        unique_labels, label_counts = np.unique(gene_dataset.labels[:, 0],
                                                return_counts=True)
        self.weight_lookup = 1.0 / label_counts * 1.0 / len(unique_labels)

        # Set the number of samples in the iterator
        self.num_samples = num_samples if num_samples else len(gene_dataset)

        # Create weights
        weights_all = np.zeros(len(gene_dataset))
        weights_train = np.zeros(len(gene_dataset))
        weights_test = np.zeros(len(gene_dataset))
        for idx in indices_train:
            weights_train[idx] = self.weight_lookup[gene_dataset.labels[idx,
                                                                        0]]
        for idx in indices_test:
            weights_test[idx] = self.weight_lookup[gene_dataset.labels[idx, 0]]
        for idx in range(len(gene_dataset)):
            weights_all[idx] = self.weight_lookup[gene_dataset.labels[idx, 0]]

        data_loader_train = self(weights=weights_train, num_samples=n_train)
        data_loader_test = self(weights=weights_test, num_samples=n_test)
        data_loader_all = self(weights=weights_all,
                               num_samples=self.num_samples)

        self.dict.update({
            'train': data_loader_train,
            'test': data_loader_test,
            'all': data_loader_all
        })
Example #12
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(
                p > threshold,
                "An index is not drawn with chance corresponding "
                "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = StratifiedShuffleSplit(n_iter=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  test_size=1. / n_folds,
                                                  train_size=1. - 1. / n_folds)

        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
Example #13
 def _iter_indices(self, X, y=None, groups=None):
     n_samples = _num_samples(self.mapping)
      n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size)
     rng = check_random_state(self.random_state)
     for _ in range(self.n_splits):
         # random partition
         permutation = rng.permutation(n_samples)
         ind_test = []
         for it in permutation[:n_test]:
             ind_test.extend(self.mapping[it])
         ind_train = []
         for it in permutation[n_test:(n_test + n_train)]:
             ind_train.extend(self.mapping[it])
         yield ind_train, ind_test
Example #15
    def train_test_validation(
        self,
        model=None,
        gene_dataset=None,
        train_size=0.7,
        test_size=None,
        trainidxs=None,
        testidxs=None,
        type_class=Posterior,
    ):
        """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.
            If ``train_size + test_size < 1`` then ``validation_set`` is non-empty.

            :param train_size: float, int, or None (default is 0.1)
            :param test_size: float, int, or None (default is None)
            """
        model = self.model if model is None and hasattr(self, "model") else model
        gene_dataset = (
            self.gene_dataset
            if gene_dataset is None and hasattr(self, "model")
            else gene_dataset
        )
        #self.trainidxs = trainidxs
        #self.testidxs = testidxs
        n = len(gene_dataset)
        n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        random_state = np.random.RandomState(seed=self.seed)
        permutation = random_state.permutation(n)
        #indices_test = permutation[:n_test]
        #indices_train = permutation[n_test : (n_test + n_train)]
        indices_validation = permutation[(n_test + n_train) :]

        ### Attention ###
        # Use train/test split from BoltzmannMachines Package for comparability 
        indices_test = testidxs
        indices_train = trainidxs
        
        indices_validation = np.array([])
        return (
            self.create_posterior(
                model, gene_dataset, indices=indices_train, type_class=type_class
            ),
            self.create_posterior(
                model, gene_dataset, indices=indices_test, type_class=type_class
            ),
            self.create_posterior(
                model, gene_dataset, indices=indices_validation, type_class=type_class
            ),
        )
Example #16
    def _iter_indices(self, X, y=None, groups=None):
        _ks = _KennardStone()
        inds = _ks._get_indexes(X)

        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples,
            self.test_size,
            self.train_size,
            default_test_size=self._default_test_size)

        for _ in range(self.n_splits):
            ind_test = inds[:n_test]
            ind_train = inds[n_test:(n_test + n_train)]
            yield ind_train, ind_test
Example #17
    def train_test_validation(
        self,
        model=None,
        gene_dataset=None,
        train_size=0.1,
        test_size=None,
        type_class=Posterior,
    ):
        """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.

        If ``train_size + test_size < 1`` then ``validation_set`` is non-empty.

        :param train_size: float, int, or None (default is 0.1)
        :param test_size: float, int, or None (default is None)
        """
        model = self.model if model is None and hasattr(self,
                                                        "model") else model
        gene_dataset = (self.gene_dataset if gene_dataset is None
                        and hasattr(self, "model") else gene_dataset)
        n = len(gene_dataset)
        try:
            n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        except ValueError:
            if train_size != 1.0:
                raise ValueError(
                    "Choice of train_size={} and test_size={} not understood".
                    format(train_size, test_size))
            n_train, n_test = n, 0
        random_state = np.random.RandomState(seed=self.seed)
        permutation = random_state.permutation(n)
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]
        indices_validation = permutation[(n_test + n_train):]

        return (
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_train,
                                  type_class=type_class),
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_test,
                                  type_class=type_class),
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_validation,
                                  type_class=type_class),
        )
Example #18
 def __init__(self,
              y,
              n_iter=10,
              test_size=0.1,
              train_size=None,
              random_state=None):
     n = len(y)
     self.y = np.array(y)
     self.classes, self.y_indices = np.unique(y, return_inverse=True)
     self.random_state = random_state
     self.train_size = train_size
     self.test_size = test_size
     self.n_iter = n_iter
     self.n = n
     self.n_train, self.n_test = _validate_shuffle_split(
         n, test_size, train_size)
Example #19
def mc_split(R,
             n_splits=1,
             test_size='default',
             train_size=None,
             random_state=None):
    """Train-test splitting."""
    n_samples = np.prod(R.shape)
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size)

    rng = check_random_state(random_state)
    for i in range(n_splits):
        permutation = rng.permutation(n_samples)
        ind_test = permutation[:n_test]
        ind_train = permutation[n_test:(n_test + n_train)]

        yield ind_train, ind_test
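
mc_split above draws splits over all entries of the matrix R and yields flat indices. A hypothetical follow-up (the 3x4 matrix is made up) maps those flat indices back to (row, column) coordinates with np.unravel_index:

import numpy as np

R = np.arange(12, dtype=float).reshape(3, 4)
train_idx, test_idx = next(mc_split(R, test_size=0.25, random_state=0))
train_rows, train_cols = np.unravel_index(train_idx, R.shape)
test_rows, test_cols = np.unravel_index(test_idx, R.shape)
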
Example #20
    def train_test(self,
                   model=None,
                   gene_dataset=None,
                   train_size=0.1,
                   test_size=None,
                   seed=0,
                   test_indices=None,
                   type_class=Posterior):
        """
        :param train_size: float, int, or None (default is 0.1)
        :param test_size: float, int, or None (default is None)
        :param model:
        :param gene_dataset:
        :param seed:
        :param test_indices:
        :param type_class:
        """
        model = self.model if model is None and hasattr(self,
                                                        "model") else model
        gene_dataset = self.gene_dataset if gene_dataset is None and hasattr(
            self, "model") else gene_dataset

        n = len(gene_dataset)
        if test_indices is None:
            n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
            np.random.seed(seed=seed)
            permutation = np.random.permutation(n)
            indices_test = permutation[:n_test]
            indices_train = permutation[n_test:(n_test + n_train)]
        else:
            indices_test = np.array(test_indices)
            all_indices = np.arange(len(gene_dataset))
            indices_train = ~np.isin(all_indices, indices_test)
            indices_train = all_indices[indices_train]
            assert len(np.intersect1d(indices_train, indices_test)) == 0

        return (self.create_posterior(model,
                                      gene_dataset,
                                      indices=indices_train,
                                      type_class=type_class),
                self.create_posterior(model,
                                      gene_dataset,
                                      indices=indices_test,
                                      type_class=type_class))
Example #21
    def _iter_indices(self, X, y, groups=None):
        y_labels = np.unique(y)
        y_inds = [np.where(y == t_y)[0] for t_y in y_labels]
        n_samples = [
            _validate_shuffle_split(len(t_inds),
                                    self.test_size,
                                    self.train_size,
                                    default_test_size=self._default_test_size)
            for t_inds in y_inds
        ]
        for _ in range(self.n_splits):
            train = []
            test = []
            for t_inds, (n_train, _) in zip(y_inds, n_samples):
                bs_inds = np.random.choice(t_inds, len(t_inds), replace=True)
                train.extend(bs_inds[:n_train])
                test.extend(bs_inds[n_train:])

            yield train, test
Example #22
 def _iter_indices(self, X, y, groups=None):
     groupcount = np.bincount(y)
     mingroup = int(np.nanmin(groupcount))
     train_size, test_size = _validate_shuffle_split(
         mingroup, self.test_size, self.train_size)
     for i in range(self.n_splits):
         train_idx = np.empty((0, ), dtype='int')
         test_idx = np.empty((0, ), dtype='int')
         for g in range(0, len(groupcount)):
             random_indices = np.random.permutation(
                 np.arange(len(y))[y == g])
             train_idx = np.concatenate(
                 (train_idx, random_indices[0:train_size]))
             if self.test_size is None:
                 test_idx = np.concatenate(
                     (test_idx, random_indices[train_size:]))
             else:
                 test_idx = np.concatenate(
                     (test_idx,
                      random_indices[train_size:train_size + test_size]))
         yield train_idx, test_idx
Example #23
def train_test_split(*arrays, test_size=None, train_size=None, **kwargs):
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)

    CVClass = KSSplit
    cv = CVClass(test_size=n_test, train_size=n_train)

    train, test = next(cv.split(X=arrays[0]))

    return list(
        chain.from_iterable((_safe_indexing(a, train), _safe_indexing(a, test))
                            for a in arrays))
Example #24
def gen_folds(args, dataset, test_size):
    from sklearn.utils.validation import check_random_state
    from sklearn.model_selection._split import _validate_shuffle_split

    n_samples = len(dataset)
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, None, default_test_size=0.1)
    rng = check_random_state(args.seed)

    train_folds = []
    test_folds = []

    for i in range(args.n_folds):
        # random partition
        permutation = rng.permutation(n_samples)
        ind_test = permutation[:n_test]
        ind_train = permutation[n_test:(n_test + n_train)]
        train_folds.append(ind_train)
        test_folds.append(ind_test)

    train_folds, test_folds = np.stack(train_folds), np.stack(test_folds)
    np.savez(args.fold_path, train=train_folds, test=test_folds)
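
gen_folds persists the folds with np.savez under the keys "train" and "test". A hypothetical reader on the consuming side (the file name folds.npz stands in for whatever args.fold_path was) looks like:

import numpy as np

folds = np.load("folds.npz")
train_folds, test_folds = folds["train"], folds["test"]  # shapes (n_folds, n_train) / (n_folds, n_test)
train_idx, test_idx = train_folds[0], test_folds[0]      # indices for fold 0
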
Example #25
    def train_test_validation(
        self,
        model=None,
        gene_dataset=None,
        train_size=0.1,
        test_size=None,
        type_class=Posterior,
    ):
        model = self.model if model is None and hasattr(self,
                                                        "model") else model
        gene_dataset = (self.gene_dataset if gene_dataset is None
                        and hasattr(self, "model") else gene_dataset)
        n = len(gene_dataset)
        if train_size == 1.0:
            n_train = n
            n_test = 0
        else:
            n_train, n_test = _validate_shuffle_split(n, test_size, train_size)

        random_state = np.random.RandomState(seed=self.seed)
        permutation = random_state.permutation(n)
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]
        indices_validation = permutation[(n_test + n_train):]

        return (
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_train,
                                  type_class=type_class),
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_test,
                                  type_class=type_class),
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_validation,
                                  type_class=type_class),
        )
Example #26
 def getDataLoaders(self, batch_size, shuffle, device, *args):
     kwargs = {
         'num_workers': 1,
         'pin_memory': True
     } if device == "cuda" else {}
     print('Load training data...')
     dataset = SyntheticDataset(*self.data_dim, *args)
     n_train, n_test = _validate_shuffle_split(len(dataset),
                                               test_size=None,
                                               train_size=0.7)
     train_dataset, test_dataset = torch.utils.data.random_split(
         dataset, [n_train, n_test])
     train_loader = DataLoader(train_dataset,
                               batch_size=batch_size,
                               drop_last=True,
                               shuffle=shuffle,
                               **kwargs)
     test_loader = DataLoader(test_dataset,
                              batch_size=batch_size,
                              drop_last=True,
                              shuffle=False,
                              **kwargs)
     return train_loader, test_loader
Example #27
    def _indices(self, rng, x_grouped):
        shift = 0
        ind_test = []
        ind_train = []

        for _, group in x_grouped:
            n_samples = len(group)

            if n_samples < self.min_samples:
                ind_train.extend(np.arange(n_samples) + shift)
                shift += n_samples
                continue

            n_train, n_test = _validate_shuffle_split(n_samples,
                                                      self.test_size,
                                                      self.train_size)
            permutation = rng.permutation(n_samples)

            ind_test.extend(permutation[:n_test] + shift)
            ind_train.extend(permutation[n_test:(n_test + n_train)] + shift)

            shift += n_samples

        return np.array(ind_train), np.array(ind_test)
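
The x_grouped argument of _indices above is not shown in the snippet; it is assumed to be an iterable of (key, group) pairs whose groups are contiguous in the original sample order, for example a pandas GroupBy over a frame already sorted by the grouping column. A sketch of such an input:

import numpy as np
import pandas as pd

# two groups of 5 and 7 rows, laid out contiguously so the running `shift`
# offset in _indices recovers positions in the original frame
df = pd.DataFrame({"g": ["a"] * 5 + ["b"] * 7, "x": np.arange(12.0)})
x_grouped = df.groupby("g", sort=False)  # iterates as ("a", sub_df), ("b", sub_df)
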
Example #28
    def train_test_validation(
        self,
        model=None,
        gene_dataset=None,
        train_size=0.9,
        test_size=None,
        type_class=Posterior,
    ):
        """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.

        If ``train_size + test_size < 1`` then ``validation_set`` is non-empty.

        Parameters
        ----------
        train_size :
            float, or None (default is 0.9)
        test_size :
            float, or None (default is None)
        model :
             (Default value = None)
        gene_dataset :
             (Default value = None)
        type_class :
             (Default value = Posterior)

        Returns
        -------

        """
        train_size = float(train_size)
        if train_size > 1.0 or train_size <= 0.0:
            raise ValueError(
                "train_size needs to be greater than 0 and less than or equal to 1"
            )

        model = self.model if model is None and hasattr(self,
                                                        "model") else model
        gene_dataset = (self.gene_dataset if gene_dataset is None
                        and hasattr(self, "model") else gene_dataset)
        n = len(gene_dataset)
        try:
            n_train, n_test = _validate_shuffle_split(n, test_size, train_size)
        except ValueError:
            if train_size != 1.0:
                raise ValueError(
                    "Choice of train_size={} and test_size={} not understood".
                    format(train_size, test_size))
            n_train, n_test = n, 0
        random_state = np.random.RandomState(seed=self.seed)
        permutation = random_state.permutation(n)
        indices_test = permutation[:n_test]
        indices_train = permutation[n_test:(n_test + n_train)]
        indices_validation = permutation[(n_test + n_train):]

        return (
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_train,
                                  type_class=type_class),
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_test,
                                  type_class=type_class),
            self.create_posterior(model,
                                  gene_dataset,
                                  indices=indices_validation,
                                  type_class=type_class),
        )
Example #29
    def train_test_validation(
        self,
        model: GaussianTreeVAE = None,
        gene_dataset: TreeDataset = None,
        train_size: float = 0.8,
        test_size: int = None,
        type_class=GaussianTreePosterior,
    ):
        """Creates posteriors ``train_set``, ``test_set``, ``validation_set``.
		If ``train_size + test_size < 1`` then ``validation_set`` is non-empty.

        This works a bit differently for a TreeTrainer - in order to respect the
        tree prior we need to draw our observations from within sets of cells related
        to one another (i.e in a clade).  One can think of this analagously to
        identifying clusters from the hierarchical ordering described by the tree, and splitting
        each cluster into train/test/validation.

        The procedure of actually clustering the tree into clades that contain several
        iid observations is done in the constructor function for TreeVAE (scvi.models.treevae).
        This procedure below will simply split the clades previously identified into
        train/test/validation sets according to the train_size specified.

        :param model: A ``TreeVAE` model.
        :param gene_dataset: A ``TreeDataset`` instance.
		:param train_size: float, int, or None (default is 0.1)
		:param test_size: float, int, or None (default is None)
        :param type_class: Type of Posterior object to create (here, TreePosterior)
		"""
        def get_indices_in_dataset(_subset, _subset_indices, master_list):

            _cells = np.array(_subset)[np.array(_subset_indices)]
            filt = np.array(list(map(lambda x: x in _cells, master_list)))

            return list(np.where(filt)[0])

        model = self.model if model is None and hasattr(self,
                                                        "model") else model
        gene_dataset = (self.gene_dataset if gene_dataset is None
                        and hasattr(self, "model") else gene_dataset)

        barcodes = gene_dataset.barcodes
        leaves = [n for n in model.tree.traverse('levelorder') if n.is_leaf()]

        # this is where we need to shuffle within the tree structure
        train_indices, test_indices, validate_indices = [], [], []

        # for each clade induced by an internal node at a given depth split into
        # train, test, and validation and append these indices to the master list
        # introduce an index for each leaf in the tree
        for l in leaves:
            c = l.cells
            indices = get_indices_in_dataset(c, list(range(len(c))), barcodes)
            l.indices = np.array(indices)
            self.clades.append(indices)

        # randomly split leaves into test, train, and validation sets
        for l in leaves:
            leaf_bunch = l.indices

            if len(leaf_bunch) == 1:
                #x = random.random()
                #if x < train_size:
                #train_indices.append([leaf_bunch[0]])
                #else:
                #test_indices.append([leaf_bunch[0]])
                train_indices.append([leaf_bunch[0]])

            else:
                n_train, n_test = _validate_shuffle_split(
                    len(leaf_bunch), test_size, train_size)

                random_state = np.random.RandomState(seed=self.seed)
                permutation = random_state.permutation(leaf_bunch)
                test_indices.append(list(permutation[:n_test]))
                train_indices.append(
                    list(permutation[n_test:(n_test + n_train)]))
                # split test set in two
                validate_indices.append(list(permutation[(n_test + n_train):]))

        # some print statement to ensure test/train/validation sets created correctly
        print("train_leaves: ", train_indices)
        print("test_leaves: ", test_indices)
        print("validation leaves: ", validate_indices)
        return (
            self.create_posterior(model,
                                  gene_dataset,
                                  train_indices,
                                  type_class=type_class)
            #self.create_posterior(
            #model, gene_dataset, test_indices, type_class=type_class
            #),
            #self.create_posterior(
            #model, gene_dataset, validate_indices, type_class=type_class
            #),
        )
Example #30
def _daal_train_test_split(*arrays, **options):
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)
    rng = options.pop('rng', 'OPTIMIZED_MT19937')

    available_rngs = [
        'default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', 'MCG31',
        'MCG59', 'MRG32K3A', 'PHILOX4X32X10', 'NONDETERM', 'OPTIMIZED_MT19937'
    ]
    if rng not in available_rngs:
        raise ValueError("Wrong random numbers generator is chosen. "
                         "Available generators: %s" %
                         str(available_rngs)[1:-1])

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              test_size,
                                              train_size,
                                              default_test_size=0.25)
    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        if stratify is not None:
            cv = StratifiedShuffleSplit(test_size=n_test,
                                        train_size=n_train,
                                        random_state=random_state)
            train, test = next(cv.split(X=arrays[0], y=stratify))
        else:
            if mkl_random_is_imported and rng not in [
                    'default', 'OPTIMIZED_MT19937'
            ] and (isinstance(random_state, int) or random_state is None):
                random_state = mkl_random.RandomState(random_state, rng)
                indexes = random_state.permutation(n_train + n_test)
                test, train = indexes[:n_test], indexes[n_test:]
            elif rng == 'OPTIMIZED_MT19937' and daal_check_version(((2020,'P', 3), (2021,'B',9))) \
            and (isinstance(random_state, int) or random_state is None) \
            and platform.system() != 'Windows':
                indexes = np.empty(shape=(n_train + n_test, ),
                                   dtype=np.int64 if
                                   n_train + n_test > 2**31 - 1 else np.int32)
                random_state = np.random.RandomState(random_state)
                random_state = random_state.get_state()[1]
                d4p.daal_generate_shuffled_indices([indexes], [random_state])
                test, train = indexes[:n_test], indexes[n_test:]
            else:
                cv = ShuffleSplit(test_size=n_test,
                                  train_size=n_train,
                                  random_state=random_state)
                train, test = next(cv.split(X=arrays[0], y=stratify))

    res = []
    for arr in arrays:
        fallback = False

        # input format check
        if not isinstance(arr, np.ndarray):
            if pandas_is_imported:
                if not isinstance(arr,
                                  pd.core.frame.DataFrame) and not isinstance(
                                      arr, pd.core.series.Series):
                    fallback = True
            else:
                fallback = True

        # dimensions check
        if hasattr(arr, 'ndim'):
            if arr.ndim > 2:
                fallback = True
        else:
            fallback = True

        # data types check
        dtypes = get_dtypes(arr)
        if dtypes is None:
            fallback = True
        else:
            for i, dtype in enumerate(dtypes):
                if 'float' not in str(dtype) and 'int' not in str(dtype):
                    fallback = True
                    break

        if fallback:
            res.append(safe_indexing(arr, train))
            res.append(safe_indexing(arr, test))
        else:

            if len(arr.shape) == 2:
                n_cols = arr.shape[1]
                reshape_later = False
            else:
                n_cols = 1
                reshape_later = True

            arr_copy = d4p.get_data(arr)
            if not isinstance(arr_copy, list):
                arr_copy = arr_copy.reshape((arr_copy.shape[0], n_cols),
                                            order='A')
            if isinstance(arr_copy, np.ndarray):
                order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F'
                train_arr = np.empty(shape=(n_train, n_cols),
                                     dtype=arr_copy.dtype,
                                     order=order)
                test_arr = np.empty(shape=(n_test, n_cols),
                                    dtype=arr_copy.dtype,
                                    order=order)
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                if reshape_later:
                    train_arr, test_arr = train_arr.reshape(
                        (n_train, )), test_arr.reshape((n_test, ))
            elif isinstance(arr_copy, list):
                train_arr = [
                    np.empty(shape=(n_train, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                test_arr = [
                    np.empty(shape=(n_test, ),
                             dtype=el.dtype,
                             order='C' if el.flags['C_CONTIGUOUS'] else 'F')
                    for el in arr_copy
                ]
                d4p.daal_train_test_split(arr_copy, train_arr, test_arr,
                                          [train], [test])
                train_arr = {
                    col: train_arr[i]
                    for i, col in enumerate(arr.columns)
                }
                test_arr = {
                    col: test_arr[i]
                    for i, col in enumerate(arr.columns)
                }
            else:
                raise ValueError('Array can\'t be converted to needed format')

            if pandas_is_imported:
                if isinstance(arr, pd.core.frame.DataFrame):
                    train_arr, test_arr = pd.DataFrame(
                        train_arr), pd.DataFrame(test_arr)
                if isinstance(arr, pd.core.series.Series):
                    train_arr, test_arr = train_arr.reshape(
                        n_train), test_arr.reshape(n_test)
                    train_arr, test_arr = pd.Series(train_arr), pd.Series(
                        test_arr)

            if hasattr(arr, 'index'):
                train_arr.index = train
                test_arr.index = test

            res.append(train_arr)
            res.append(test_arr)

    return res
Example #31
    def _iter_indices(self, expr, omic=None, groups=None):
        """Generates indices of training/testing splits for use in
           stratified shuffle splitting of cohort data.
        """

        # with one domain and one variant to predict proceed with stratified
        # sampling, binning mutation values if they are continuous
        if hasattr(expr, 'shape') and hasattr(omic, 'shape'):

            if len(omic.shape) > 1 and omic.shape[1] > 1:
                omic_use = np.apply_along_axis(lambda x: reduce(or_, x), 1,
                                               omic)

            elif len(np.unique(omic)) > 10:
                omic_use = omic > np.percentile(omic, 50)

            else:
                omic_use = omic.copy()

            for train, test in super()._iter_indices(X=expr,
                                                     y=omic_use,
                                                     groups=groups):

                yield train, test

        elif hasattr(omic, 'shape'):

            if len(np.unique(omic)) > 2:
                if len(omic.shape) == 1:
                    omic = omic > np.percentile(omic, 50)
                else:
                    if isinstance(omic, pd.DataFrame):
                        samp_mean = np.mean(omic.fillna(0.0), axis=1)
                    elif isinstance(omic, np.ndarray):
                        samp_mean = np.mean(np.nan_to_num(omic), axis=1)

                    omic = samp_mean > np.percentile(samp_mean, 50)

            for train, test in super()._iter_indices(X=list(expr.values())[0],
                                                     y=omic,
                                                     groups=groups):

                yield train, test

        elif hasattr(expr, 'shape'):

            # gets info about input
            n_samples = _num_samples(expr)
            n_train, n_test = _validate_shuffle_split(n_samples,
                                                      self.test_size,
                                                      self.train_size)

            class_info = [np.unique(y, return_inverse=True) for y in omic]
            merged_classes = reduce(
                lambda x, y: x + y,
                [y_ind * 2**i for i, (_, y_ind) in enumerate(class_info)])
            merged_counts = np.bincount(merged_classes)
            class_info = np.unique(merged_classes, return_inverse=True)

            new_counts = merged_counts.tolist()
            new_info = list(class_info)
            new_info[0] = new_info[0].tolist()

            remove_indx = []
            for i, count in enumerate(merged_counts):
                if count < 2 and i in new_info[0]:

                    remove_indx += [i]
                    cur_ind = merged_classes == i

                    if i > 0:
                        new_counts[i - 1] += new_counts[i]
                        rep_indx = new_info[0].index(i) - 1

                    else:
                        new_counts[i + 1] += new_counts[i]
                        rep_indx = new_info[0].index(i) + 1

                    merged_classes[cur_ind] = new_info[0][rep_indx]

            for i in remove_indx:
                new_info[0].remove(i)
            new_counts = np.array(new_counts)

            n_class = len(new_info[0])
            if n_train < n_class:
                raise ValueError('The train_size = %d should be greater or '
                                 'equal to the number of classes = %d' %
                                 (n_train, n_class))
            if n_test < n_class:
                raise ValueError('The test_size = %d should be greater or '
                                 'equal to the number of classes = %d' %
                                 (n_test, n_class))

            # generates random training and testing cohorts
            rng = check_random_state(self.random_state)
            for _ in range(self.n_splits):
                n_is = _approximate_mode(new_counts, n_train, rng)
                class_counts_remaining = new_counts - n_is
                t_is = _approximate_mode(class_counts_remaining, n_test, rng)

                train = []
                test = []

                for class_i in new_info[0]:
                    permutation = rng.permutation(new_counts[class_i])
                    perm_indices_class = np.where(
                        merged_classes == class_i)[0][permutation]

                    train.extend(perm_indices_class[:n_is[class_i]])
                    test.extend(
                        perm_indices_class[n_is[class_i]:(n_is[class_i] +
                                                          t_is[class_i])])

                    train = rng.permutation(train).tolist()
                    test = rng.permutation(test).tolist()

                yield train, test

        # otherwise, perform stratified sampling on each cohort separately
        else:

            # gets info about input
            n_samples = {lbl: _num_samples(X) for lbl, X in expr.items()}
            n_train_test = {
                lbl: _validate_shuffle_split(n_samps, self.test_size,
                                             self.train_size)
                for lbl, n_samps in n_samples.items()
            }

            class_info = {
                lbl: np.unique(y, return_inverse=True)
                for lbl, y in omic.items()
            }
            n_classes = {
                lbl: classes.shape[0]
                for lbl, (classes, _) in class_info.items()
            }
            classes_counts = {
                lbl: np.bincount(y_indices)
                for lbl, (_, y_indices) in class_info.items()
            }

            # ensure we have enough samples in each class for stratification
            for lbl, (n_train, n_test) in n_train_test.items():
                if np.min(classes_counts[lbl]) < 2:
                    raise ValueError(
                        "The least populated phenotype class in {} has only "
                        "one member, which is too few. The minimum number of "
                        "groups for any phenotypic feature to predict cannot "
                        "be less than two.".format(lbl))

                if n_train < n_classes[lbl]:
                    raise ValueError(
                        "The number of training samples ({}) should be "
                        "greater or equal to the number of "
                        "phenotypes ({})".format(n_train, n_classes[lbl]))

                if n_test < n_classes[lbl]:
                    raise ValueError(
                        "The number of testing samples ({}) should be "
                        "greater or equal to the number of "
                        "phenotypes ({})".format(n_test, n_classes[lbl]))

            # generates random training and testing cohorts
            rng = check_random_state(self.random_state)
            for _ in range(self.n_splits):
                n_is = {
                    lbl: _approximate_mode(classes_counts[lbl],
                                           n_train_test[lbl][0], rng)
                    for lbl in expr
                }

                classes_counts_left = {
                    lbl: classes_counts[lbl] - n_is[lbl]
                    for lbl in expr
                }
                t_is = {
                    lbl: _approximate_mode(classes_counts_left[lbl],
                                           n_train_test[lbl][1], rng)
                    for lbl in expr
                }

                train = {lbl: [] for lbl in expr}
                test = {lbl: [] for lbl in expr}

                for lbl, (classes, _) in class_info.items():
                    for i, class_i in enumerate(classes):
                        permutation = rng.permutation(classes_counts[lbl][i])

                        perm_indices_class_i = np.where(
                            (omic[lbl] == class_i))[0][permutation]
                        train[lbl].extend(perm_indices_class_i[:n_is[lbl][i]])

                        test[lbl].extend(
                            perm_indices_class_i[n_is[lbl][i]:n_is[lbl][i] +
                                                 t_is[lbl][i]])

                    train[lbl] = rng.permutation(train[lbl])
                    test[lbl] = rng.permutation(test[lbl])

                yield train, test