def test_different_seeds():
    """Check that ``different_seeds`` yields n distinct, sorted seeds and
    treats an int seed and an equivalently-seeded RandomState the same."""
    seed = 37
    state = np.random.RandomState(seed)
    n = 100000

    # Seeding with a plain integer: all n seeds must be distinct.
    from_int = set(different_seeds(n, seed))
    assert len(from_int) == n

    # Seeding with a RandomState built from the same integer must
    # reproduce exactly the same collection of seeds.
    from_state = set(different_seeds(n, state))
    assert from_int == from_state

    # The seeds should come back already in ascending order.
    few = different_seeds(10, 1234)
    assert few == sorted(few)
def split(self, X, y=None):
    """Iterate tuples of data split into training and test sets.

    Parameters
    ----------
    X : dask object
        Training data. May be a ``da.Array``, ``db.Bag``, or
        ``dklearn.Matrix``.
    y : dask object, optional
        The target variable for supervised learning problems.

    Yields
    -------
    X_train, y_train, X_test, y_test : dask objects
        The split training and testing data, returned as the same type as
        the input. If y is not provided, ``y_train`` and ``y_test`` will be
        ``None``.
    """
    X, y = check_X_y(X, y)
    # Draw one independent seed per iteration up front; reusing the same
    # seed for both X and y keeps their rows aligned within an iteration.
    for current_seed in different_seeds(self.n_iter,
                                        random_state=self.random_state):
        X_train, X_test = random_split(X, self.test_size, current_seed)
        if y is not None:
            y_train, y_test = random_split(y, self.test_size, current_seed)
        else:
            y_train = y_test = None
        yield X_train, y_train, X_test, y_test
def train_test_split(*arrays, **options):
    """Split dask collections into random train and test subsets.

    Quick utility that wraps input validation and calls to train/test
    splitting with ``RandomSplit`` into a single call for splitting data in a
    oneliner.

    Parameters
    ----------
    *arrays : sequence of dask collections with same length and partitions
        Allowed inputs are ``db.Bag``, ``da.Array``, or ``dm.Matrix``. All
        inputs must share the same length and partitions.
    test_size : float, optional
        Should be between 0.0 and 1.0 and represent the proportion of the
        dataset to include in the test split. Default is 0.25.
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.

    Returns
    -------
    splitting : list, length = 2 * len(arrays),
        List containing train-test split of inputs.

    Examples
    --------
    >>> X_train, X_test, y_train, y_test = train_test_split(  # doctest: +SKIP
    ...     X, y, test_size=0.20, random_state=42)
    """
    if not arrays:
        raise ValueError("At least one array required as input")
    check_aligned_partitions(*arrays)

    # Pull out the recognized keyword options; anything left over is an
    # error (mirrors scikit-learn's train_test_split behavior).
    test_size = options.pop('test_size', 0.25)
    random_state = options.pop('random_state', None)
    if options:
        raise ValueError("Invalid parameters passed: %s" % str(options))

    # A single shared seed keeps every array's split row-aligned.
    seed = different_seeds(1, random_state=random_state)[0]
    out = []
    for collection in arrays:
        out.extend(random_split(collection, test_size, seed))
    return out
def random_split(x, p_test=0.1, random_state=None):
    """Approximately split a dask collection into train/test data.

    Parameters
    ----------
    X : da.Array, db.Bag, or dm.Matrix
        The dask collection to split
    p_test : float, optional
        The fraction of samples to use in the test set. Default is 0.1.
    random_state : int or RandomState, optional
        The ``RandomState`` or seed to use when performing the random split.

    Returns
    -------
    train, test : same type as ``x``
        The split is approximate: each row lands in the test set with
        probability ``p_test``, so the realized test fraction varies.
    """
    if not 0 < p_test < 1:
        raise ValueError("p_test must be in (0, 1)")
    random_state = check_random_state(random_state)
    # Deterministic key token derived from the input, the split fraction,
    # and the RNG state, so identical inputs produce identical graphs.
    token = tokenize(x, p_test, random_state.get_state())
    names = ['random-split-test-' + token, 'random-split-train-' + token]
    if isinstance(x, da.Array):
        # NOTE(review): presumably rechunks so only axis 0 is partitioned
        # and returns the corresponding block keys — confirm against the
        # helper's definition.
        x, x_keys = _as_tall_skinny_and_keys(x)
        chunks = np.array(x.chunks[0])
        # One seed for drawing per-chunk test counts, plus one seed per
        # chunk for the actual row selection.
        seeds = different_seeds(len(chunks) + 1, random_state)
        # Number of test rows in each chunk is binomial(chunk_size, p_test),
        # which is what makes the split approximate rather than exact.
        n_test = np.random.RandomState(seeds[0]).binomial(chunks, p_test)
        n_train = chunks - n_test
        # Build two graphs over the same input keys: the boolean flag b
        # selects the test (True) or train (False) portion in arr_split.
        dsks = [dict(((name, ) + k[1:], (arr_split, k, n, b, s))
                     for k, n, s in zip(x_keys, n_test, seeds[1:]))
                for name, b in zip(names, [True, False])]
        test = da.Array(merge(dsks[0], x.dask), names[0],
                        (tuple(n_test), ) + x.chunks[1:], x.dtype)
        train = da.Array(merge(dsks[1], x.dask), names[1],
                         (tuple(n_train), ) + x.chunks[1:], x.dtype)
    elif isinstance(x, (db.Bag, dm.Matrix)):
        # One seed per partition; each partition is split independently.
        seeds = different_seeds(x.npartitions, random_state)
        split = bag_split if isinstance(x, db.Bag) else mat_split
        # As above, b=True builds the test graph and b=False the train graph
        # over the same partition keys.
        dsks = [dict(((name, k[1]), (split, k, p_test, b, s))
                     for k, s in zip(x._keys(), seeds))
                for name, b in zip(names, [True, False])]
        if isinstance(x, dm.Matrix):
            if x.ndim is not None:
                # Row count is unknown after splitting; only the column
                # count (if 2-D) carries over.
                shape = (None, ) if x.ndim == 1 else (None, x.shape[1])
            else:
                shape = None
            test = dm.Matrix(merge(dsks[0], x.dask), names[0],
                             x.npartitions, dtype=x.dtype, shape=shape)
            train = dm.Matrix(merge(dsks[1], x.dask), names[1],
                              x.npartitions, dtype=x.dtype, shape=shape)
        else:
            test = db.Bag(merge(dsks[0], x.dask), names[0], x.npartitions)
            train = db.Bag(merge(dsks[1], x.dask), names[1], x.npartitions)
    else:
        raise TypeError("Expected an instance of ``da.Array``, ``db.Bag``, or "
                        "``dm.Matrix`` - got {0}".format(type(x).__name__))
    return train, test