Example #1
0
def test_different_seeds():
    """Check that ``different_seeds`` is unique, reproducible, and sorted."""
    base_seed = 37
    count = 100000
    rng = np.random.RandomState(base_seed)

    # Seeding with a plain integer must give ``count`` distinct values.
    from_int = set(different_seeds(count, base_seed))
    assert len(from_int) == count

    # A RandomState built from the same integer must give the same seeds.
    from_state = set(different_seeds(count, rng))
    assert from_int == from_state

    # The returned sequence is expected to already be in ascending order.
    few = different_seeds(10, 1234)
    assert few == sorted(few)
Example #2
0
def test_different_seeds():
    """Check that ``different_seeds`` is unique, reproducible, and sorted."""
    base_seed = 37
    count = 100000
    rng = np.random.RandomState(base_seed)

    # Seeding with a plain integer must give ``count`` distinct values.
    from_int = set(different_seeds(count, base_seed))
    assert len(from_int) == count

    # A RandomState built from the same integer must give the same seeds.
    from_state = set(different_seeds(count, rng))
    assert from_int == from_state

    # The returned sequence is expected to already be in ascending order.
    few = different_seeds(10, 1234)
    assert few == sorted(few)
Example #3
0
    def split(self, X, y=None):
        """Generate successive random train/test splits of the data.

        One split is produced for every seed returned by
        ``different_seeds(self.n_iter, random_state=self.random_state)``.
        Within a single split, ``X`` and ``y`` are divided using the same
        seed and the same ``self.test_size``.

        Parameters
        ----------
        X : dask object
            Training data. May be a ``da.Array``, ``db.Bag``, or
            ``dklearn.Matrix``.

        y : dask object, optional
            The target variable for supervised learning problems.

        Yields
        ------
        X_train, y_train, X_test, y_test : dask objects
            The training and testing pieces for one split. When ``y`` is
            not given, both ``y_train`` and ``y_test`` are ``None``.
        """
        X, y = check_X_y(X, y)
        all_seeds = different_seeds(self.n_iter,
                                    random_state=self.random_state)
        for seed in all_seeds:
            X_train, X_test = random_split(X, self.test_size, seed)
            # Reuse the seed for the targets; without targets there is
            # nothing to split.
            y_train, y_test = (
                (None, None) if y is None
                else random_split(y, self.test_size, seed))
            yield X_train, y_train, X_test, y_test
Example #4
0
def train_test_split(*arrays, **options):
    """Randomly split dask collections into train and test subsets.

    Convenience wrapper that validates its inputs, draws a single seed, and
    applies ``random_split`` with that seed to every input collection.

    Parameters
    ----------
    *arrays : sequence of dask collections with same length and partitions

        Allowed inputs are ``db.Bag``, ``da.Array``, or ``dm.Matrix``. All
        inputs must share the same length and partitions.

    test_size : float, optional
        Should be between 0.0 and 1.0 and represent the proportion of the
        dataset to include in the test split. Default is 0.25.

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.

    Returns
    -------
    splitting : list, length = 2 * len(arrays),
        The pieces produced by ``random_split`` for each input, concatenated
        in input order.

    Raises
    ------
    ValueError
        If no arrays are given, or if any keyword other than ``test_size``
        and ``random_state`` is passed.

    Examples
    --------
    >>> X_train, X_test, y_train, y_test = train_test_split(  # doctest: +SKIP
    ...     X, y, test_size=0.20, random_state=42)
    """
    if not arrays:
        raise ValueError("At least one array required as input")
    check_aligned_partitions(*arrays)

    test_size = options.pop('test_size', 0.25)
    random_state = options.pop('random_state', None)

    # Anything still left in ``options`` is an unrecognised keyword.
    if options:
        raise ValueError("Invalid parameters passed: %s" % str(options))

    # A single shared seed is used for every input collection.
    seed = different_seeds(1, random_state=random_state)[0]
    pieces = []
    for collection in arrays:
        pieces.extend(random_split(collection, test_size, seed))
    return pieces
Example #5
0
def random_split(x, p_test=0.1, random_state=None):
    """Approximately split a dask collection into train and test parts.

    Parameters
    ----------
    x : da.Array, db.Bag, or dm.Matrix
        The dask collection to split.
    p_test : float, optional
        The fraction of samples to use in the test set; must lie strictly
        between 0 and 1. Default is 0.1.
    random_state : int or RandomState, optional
        The ``RandomState`` or seed to use when performing the random split.

    Returns
    -------
    train, test : da.Array, db.Bag, or dm.Matrix
        The training and testing collections, in that order, built with the
        same collection class as the matched input type.

    Raises
    ------
    ValueError
        If ``p_test`` is not strictly between 0 and 1.
    TypeError
        If ``x`` is not a ``da.Array``, ``db.Bag``, or ``dm.Matrix``.
    """
    if not (0 < p_test < 1):
        raise ValueError("p_test must be in (0, 1)")

    random_state = check_random_state(random_state)
    token = tokenize(x, p_test, random_state.get_state())
    test_name = 'random-split-test-' + token
    train_name = 'random-split-train-' + token
    # Each output name is paired with the boolean flag handed to the
    # per-partition split function (True for test, False for train).
    targets = ((test_name, True), (train_name, False))

    if isinstance(x, da.Array):
        x, x_keys = _as_tall_skinny_and_keys(x)
        chunks = np.array(x.chunks[0])
        seeds = different_seeds(len(chunks) + 1, random_state)
        # The first seed draws the test count for every chunk along the
        # first axis; the remaining seeds go one per chunk to ``arr_split``.
        n_test = np.random.RandomState(seeds[0]).binomial(chunks, p_test)
        n_train = chunks - n_test

        graphs = []
        for name, is_test in targets:
            graphs.append({
                (name, ) + key[1:]: (arr_split, key, count, is_test, seed)
                for key, count, seed in zip(x_keys, n_test, seeds[1:])
            })
        test_dsk, train_dsk = graphs

        trailing_chunks = x.chunks[1:]
        test = da.Array(merge(test_dsk, x.dask), test_name,
                        (tuple(n_test), ) + trailing_chunks, x.dtype)
        train = da.Array(merge(train_dsk, x.dask), train_name,
                         (tuple(n_train), ) + trailing_chunks, x.dtype)
        return train, test

    if isinstance(x, (db.Bag, dm.Matrix)):
        seeds = different_seeds(x.npartitions, random_state)
        if isinstance(x, db.Bag):
            split = bag_split
        else:
            split = mat_split

        graphs = []
        for name, is_test in targets:
            graphs.append({
                (name, key[1]): (split, key, p_test, is_test, seed)
                for key, seed in zip(x._keys(), seeds)
            })
        test_dsk, train_dsk = graphs

        if isinstance(x, dm.Matrix):
            # The first dimension of the outputs is recorded as ``None``.
            if x.ndim is None:
                shape = None
            elif x.ndim == 1:
                shape = (None, )
            else:
                shape = (None, x.shape[1])
            test = dm.Matrix(merge(test_dsk, x.dask), test_name,
                             x.npartitions, dtype=x.dtype, shape=shape)
            train = dm.Matrix(merge(train_dsk, x.dask), train_name,
                              x.npartitions, dtype=x.dtype, shape=shape)
        else:
            test = db.Bag(merge(test_dsk, x.dask), test_name, x.npartitions)
            train = db.Bag(merge(train_dsk, x.dask), train_name,
                           x.npartitions)
        return train, test

    raise TypeError("Expected an instance of ``da.Array``, ``db.Bag``, or "
                    "``dm.Matrix`` - got {0}".format(type(x).__name__))