Esempio n. 1
0
def test_matching_blocks_raises(arrays):
    """Check that ``check_matching_blocks`` rejects ``arrays`` with ValueError.

    ``arrays`` is supplied from outside this function (presumably a
    parametrized fixture of inputs with mismatched blocks -- confirm against
    the fixture definition).
    """
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        check_matching_blocks(*arrays)
Esempio n. 2
0
def test_matching_blocks_ok(arrays):
    """Check that ``check_matching_blocks`` accepts ``arrays`` without raising.

    The test passes as long as the call returns; nothing is asserted about
    the return value.
    """
    inputs = tuple(arrays)
    check_matching_blocks(*inputs)
Esempio n. 3
0
def train_test_split(*arrays, **options):
    """Split arrays into random train and test matrices.

    Parameters
    ----------
    *arrays : Sequence of Dask Arrays, DataFrames, or Series
        All inputs must be Dask DataFrames/Series, or all must be Dask
        Arrays. Any other combination is forwarded to scikit-learn's
        ``train_test_split`` after logging a warning.
    test_size : float or int, default 0.1
        The default is only applied when `train_size` is also None.
        Dask DataFrames and Series only accept fractions (floats).
    train_size : float or int, optional
        Dask DataFrames and Series only accept fractions (floats).
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : bool, default True
        Whether to shuffle the data before splitting. Only ``True`` is
        supported.
    blockwise : bool, optional.
        Whether to shuffle data only within blocks (True), or allow data to
        be shuffled between blocks (False). Shuffling between blocks can
        be much more expensive, especially in distributed environments.

        For Dask Arrays the default is True (data are not shuffled between
        blocks). For Dask DataFrames and Series this option is not consulted;
        the split is delegated to ``random_split``.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs

    Raises
    ------
    TypeError
        If an unknown keyword option is passed.
    NotImplementedError
        If ``shuffle`` is falsy.
    ValueError
        If an integer `train_size` or `test_size` is used with Dask
        DataFrames or Series.

    Examples
    --------
    >>> import dask.array as da
    >>> from dask_ml.datasets import make_regression

    >>> X, y = make_regression(n_samples=125, n_features=4, chunks=50,
    ...                    random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> X_train
    dask.array<concatenate, shape=(113, 4), dtype=float64, chunksize=(45, 4)>
    >>> X_train.compute()[:2]
    array([[ 0.12372191,  0.58222459,  0.92950511, -2.09460307],
           [ 0.99439439, -0.70972797, -0.27567053,  1.73887268]])
    """
    import numbers

    test_size = options.pop("test_size", None)
    train_size = options.pop("train_size", None)
    random_state = options.pop("random_state", None)
    shuffle = options.pop("shuffle", True)
    blockwise = options.pop("blockwise", None)

    if train_size is None and test_size is None:
        # all other validation is done elsewhere.
        test_size = 0.1

    # Anything still left in ``options`` was not a recognised keyword.
    if options:
        raise TypeError("Unexpected options {}".format(options))

    if not shuffle:
        raise NotImplementedError(
            "'shuffle=False' is not currently supported.")

    if all(isinstance(arr, (dd.Series, dd.DataFrame)) for arr in arrays):
        check_matching_blocks(*arrays)

        # ``random_split`` takes a list of fractions, so absolute sample
        # counts cannot be honoured here; fail loudly instead of passing
        # nonsensical fractions down.
        if any(isinstance(size, numbers.Integral)
               for size in (train_size, test_size)):
            raise ValueError(
                "Dask DataFrames do not support absolute (integer) "
                "'train_size' or 'test_size'; pass fractions between 0 and 1 "
                "instead.")

        # At least one size is set at this point; derive the other so that
        # ``random_split`` never receives ``None`` as a fraction.
        if train_size is None:
            train_size = 1 - test_size
        elif test_size is None:
            test_size = 1 - train_size

        # Draw one integer seed and reuse it for every input so that all
        # inputs are split with the same seed.
        rng = check_random_state(random_state)
        rng = draw_seed(rng, 0, 2**32 - 1, dtype="uint")
        return list(
            itertools.chain.from_iterable(
                arr.random_split([train_size, test_size], random_state=rng)
                for arr in arrays))

    elif all(isinstance(arr, da.Array) for arr in arrays):
        if blockwise is None:
            blockwise = True

        splitter = ShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=train_size,
            blockwise=blockwise,
            random_state=random_state,
        )
        # One pair of index arrays is computed and applied to every input.
        train_idx, test_idx = next(splitter.split(*arrays))

        train_test_pairs = ((_blockwise_slice(arr, train_idx),
                             _blockwise_slice(arr, test_idx))
                            for arr in arrays)

        return list(itertools.chain.from_iterable(train_test_pairs))
    else:
        logger.warning(
            "Mixture of types in 'arrays'. Falling back to scikit-learn.")
        return ms.train_test_split(*arrays,
                                   test_size=test_size,
                                   train_size=train_size,
                                   random_state=random_state,
                                   shuffle=shuffle)
Esempio n. 4
0
def train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=None,
    blockwise=None,
    convert_mixed_types=False,
    **options,
):
    """Split arrays into random train and test matrices.

    Parameters
    ----------
    *arrays : Sequence of Dask Arrays, DataFrames, or Series
        Non-dask objects will be passed through to
        :func:`sklearn.model_selection.train_test_split`.
    test_size : float or int, default 0.1
        The default is only applied when `train_size` is also None.
        Dask DataFrames and Series only accept fractions (floats).
    train_size : float or int, optional
        When only one of `train_size` / `test_size` is given as a fraction,
        the other is set to its complement. Absolute (integer) sizes are
        forwarded unchanged.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : bool, default None
        Whether to shuffle the data before splitting. For Dask Arrays,
        None means True and False is not supported. For Dask DataFrames the
        handling of None depends on the installed dask version (see the
        FutureWarning emitted in that case).
    blockwise : bool, optional.
        Whether to shuffle data only within blocks (True), or allow data to
        be shuffled between blocks (False). Shuffling between blocks can
        be much more expensive, especially in distributed environments.

        The default behavior depends on the types in arrays. For Dask Arrays,
        the default is True (data are not shuffled between blocks). For Dask
        DataFrames, ``blockwise=False`` is not supported and raises
        ``NotImplementedError``.

    convert_mixed_types : bool, default False
        Whether to convert dask DataFrames and Series to dask Arrays when
        arrays contains a mixture of types. This results in some computation
        to determine the length of each block.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs

    Raises
    ------
    TypeError
        If unknown keyword options are passed, or if Dask Arrays are mixed
        with Dask DataFrames/Series and `convert_mixed_types` is False.
    NotImplementedError
        For unsupported `shuffle` / `blockwise` combinations.
    ValueError
        If an integer `train_size` or `test_size` is used with Dask
        DataFrames or Series.

    Examples
    --------
    >>> import dask.array as da
    >>> from dask_ml.datasets import make_regression

    >>> X, y = make_regression(n_samples=125, n_features=4, chunks=50,
    ...                    random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> X_train
    dask.array<concatenate, shape=(113, 4), dtype=float64, chunksize=(45, 4)>
    >>> X_train.compute()[:2]
    array([[ 0.12372191,  0.58222459,  0.92950511, -2.09460307],
           [ 0.99439439, -0.70972797, -0.27567053,  1.73887268]])
    """
    import numbers

    if train_size is None and test_size is None:
        # all other validation is done elsewhere.
        test_size = 0.1

    # Derive the missing fraction. An integer size is an absolute sample
    # count, for which ``1 - size`` would be a meaningless negative number,
    # so integers are left for the downstream splitter to interpret.
    if train_size is None and not isinstance(test_size, numbers.Integral):
        train_size = 1 - test_size
    elif test_size is None and not isinstance(train_size, numbers.Integral):
        test_size = 1 - train_size

    if options:
        raise TypeError("Unexpected options {}".format(options))

    types = {type(arr) for arr in arrays}

    # Dask Arrays mixed with Dask DataFrames/Series: convert or refuse.
    if da.Array in types and types & {dd.Series, dd.DataFrame}:
        if convert_mixed_types:
            arrays = tuple(
                x.to_dask_array(
                    lengths=True) if isinstance(x, (dd.Series,
                                                    dd.DataFrame)) else x
                for x in arrays)
        else:
            raise TypeError(
                "Got mixture of dask DataFrames and Arrays. Specify "
                "'convert_mixed_types=True'")

    if all(isinstance(arr, (dd.Series, dd.DataFrame)) for arr in arrays):
        check_matching_blocks(*arrays)
        if blockwise is False:
            raise NotImplementedError(
                "'blockwise=False' is not currently supported for dask DataFrames."
            )

        # ``random_split`` takes fractions only; reject absolute counts with
        # a clear message rather than passing them down as fractions.
        if any(isinstance(size, numbers.Integral)
               for size in (train_size, test_size)):
            raise ValueError(
                "Dask DataFrames do not support absolute (integer) "
                "'train_size' or 'test_size'; pass fractions between 0 and 1 "
                "instead.")

        # Draw one integer seed and reuse it for every input so that all
        # inputs are split with the same seed.
        rng = check_random_state(random_state)
        rng = draw_seed(rng, 0, _I4MAX, dtype="uint")
        if DASK_2130:
            if shuffle is None:
                shuffle = False
                warnings.warn(
                    message="The default value for 'shuffle' must be specified"
                    " when splitting DataFrames. In the future"
                    " DataFrames will automatically be shuffled within"
                    " blocks prior to splitting. Specify 'shuffle=True'"
                    " to adopt the future behavior now, or 'shuffle=False'"
                    " to retain the previous behavior.",
                    category=FutureWarning,
                )
            kwargs = {"shuffle": shuffle}
        else:
            # ``shuffle`` is not forwarded to ``random_split`` on this path,
            # so only the shuffled behaviour is available.
            if shuffle is None:
                shuffle = True
            if not shuffle:
                raise NotImplementedError(
                    "'shuffle=False' is not supported for DataFrames in"
                    f" dask versions<2.13.0. Current version is {DASK_VERSION}."
                )
            kwargs = {}
        return list(
            itertools.chain.from_iterable(
                arr.random_split(
                    [train_size, test_size], random_state=rng, **kwargs)
                for arr in arrays))

    elif all(isinstance(arr, da.Array) for arr in arrays):
        if shuffle is None:
            shuffle = True
        if not shuffle:
            raise NotImplementedError(
                "'shuffle=False' is not currently supported for dask Arrays.")
        if blockwise is None:
            blockwise = True

        splitter = ShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=train_size,
            blockwise=blockwise,
            random_state=random_state,
        )
        # One pair of index arrays is computed and applied to every input.
        train_idx, test_idx = next(splitter.split(*arrays))

        train_test_pairs = ((_blockwise_slice(arr, train_idx),
                             _blockwise_slice(arr, test_idx))
                            for arr in arrays)

        return list(itertools.chain.from_iterable(train_test_pairs))
    else:
        return ms.train_test_split(
            *arrays,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
            shuffle=shuffle,
        )
Esempio n. 5
0
def train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    stratify=None,
    classes=None,
    random_state=None,
    shuffle=None,
    blockwise=None,
    convert_mixed_types=False,
    **options,
):
    """Split arrays into random train and test matrices.

    Parameters
    ----------
    *arrays : Sequence of Dask Arrays, DataFrames, or Series
        Non-dask objects will be passed through to
        :func:`sklearn.model_selection.train_test_split`.
    test_size : float or int, default 0.1
        The default is only applied when `train_size` is also None.
        Dask DataFrames and Series only accept fractions (floats).
    train_size : float or int, optional
        When only one of `train_size` / `test_size` is given as a fraction,
        the other is set to its complement. Absolute (integer) sizes are
        forwarded unchanged.
    stratify : Dask Array or Series, optional (default=None)
        If all *arrays are non-dask objects, stratify will be passed through
        to
        :func:`sklearn.model_selection.train_test_split`.
        If not None, data is split in a stratified fashion, using this as
        the class labels.
    classes: non-dask array-like object, optional (default=None)
        If stratify is not None and any of *arrays is a dask object, this is
        required. This contains the unique class labels in `stratify`
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : bool, default None
        Whether to shuffle the data before splitting. For Dask Arrays,
        None means True and False is not supported. For Dask DataFrames the
        handling of None depends on the installed dask version (see the
        FutureWarning emitted in that case).
    blockwise : bool, optional.
        Whether to shuffle data only within blocks (True), or allow data to
        be shuffled between blocks (False). Shuffling between blocks can
        be much more expensive, especially in distributed environments.

        The default behavior depends on the types in arrays. For Dask Arrays,
        the default is True (data are not shuffled between blocks). For Dask
        DataFrames, ``blockwise=False`` is not supported and raises
        ``NotImplementedError``.

    convert_mixed_types : bool, default False
        Whether to convert dask DataFrames and Series to dask Arrays when
        arrays contains a mixture of types. This results in some computation
        to determine the length of each block.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs

    Raises
    ------
    TypeError
        If unknown keyword options are passed, if Dask Arrays are mixed with
        Dask DataFrames/Series and `convert_mixed_types` is False, or if
        `stratify` is not a dask object while some of `arrays` are.
    ValueError
        If `stratify` is given for dask inputs without `classes`, or if an
        integer `train_size` / `test_size` is used with Dask DataFrames or
        Series.
    NotImplementedError
        For unsupported `shuffle` / `blockwise` combinations.

    Examples
    --------
    >>> import dask.array as da
    >>> from dask_ml.datasets import make_regression

    >>> X, y = make_regression(n_samples=125, n_features=4, chunks=50,
    ...                    random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> X_train
    dask.array<concatenate, shape=(113, 4), dtype=float64, chunksize=(45, 4)>
    >>> X_train.compute()[:2]
    array([[ 0.12372191,  0.58222459,  0.92950511, -2.09460307],
           [ 0.99439439, -0.70972797, -0.27567053,  1.73887268]])
    """
    import numbers

    if train_size is None and test_size is None:
        # all other validation is done elsewhere.
        test_size = 0.1

    # Derive the missing fraction. An integer size is an absolute sample
    # count, for which ``1 - size`` would be a meaningless negative number,
    # so integers are left for the downstream splitter to interpret.
    if train_size is None and not isinstance(test_size, numbers.Integral):
        train_size = 1 - test_size
    elif test_size is None and not isinstance(train_size, numbers.Integral):
        test_size = 1 - train_size

    if options:
        raise TypeError("Unexpected options {}".format(options))

    types = {type(arr) for arr in arrays}
    dask_types = (da.Array, dd.Series, dd.DataFrame)

    # The stratify/classes requirements only apply when a dask object is
    # involved; pure non-dask input is handed to scikit-learn untouched.
    if stratify is not None and any(
            isinstance(arr, dask_types) for arr in arrays):
        if not isinstance(stratify, dask_types):
            raise TypeError(
                "If 'stratify' is not None, it must be an instance of either"
                " one of dask Array, Series, or DataFrame. "
                "Got type {} instead".format(type(stratify)))

        if classes is None:
            raise ValueError(
                "If 'stratify' is not None, 'classes' must be specified")

    # Dask Arrays mixed with Dask DataFrames/Series: convert or refuse.
    if da.Array in types and types & {dd.Series, dd.DataFrame}:
        if convert_mixed_types:
            arrays = tuple(
                x.to_dask_array(
                    lengths=True) if isinstance(x, (dd.Series,
                                                    dd.DataFrame)) else x
                for x in arrays)
        else:
            raise TypeError(
                "Got mixture of dask DataFrames and Arrays. Specify "
                "'convert_mixed_types=True'")

    if all(isinstance(arr, (dd.Series, dd.DataFrame)) for arr in arrays):
        if stratify is not None:
            # convert to dd.Series
            if isinstance(stratify, da.Array):
                stratify = dd.from_dask_array(stratify)
            # Unpack so that ``stratify`` is checked together with the
            # inputs; passing a single list would hand the checker only one
            # argument.
            check_matching_blocks(*arrays, stratify)
        else:
            check_matching_blocks(*arrays)
        if blockwise is False:
            raise NotImplementedError(
                "'blockwise=False' is not currently supported for dask DataFrames."
            )

        # ``random_split`` takes fractions only; reject absolute counts with
        # a clear message rather than passing them down as fractions.
        if any(isinstance(size, numbers.Integral)
               for size in (train_size, test_size)):
            raise ValueError(
                "Dask DataFrames do not support absolute (integer) "
                "'train_size' or 'test_size'; pass fractions between 0 and 1 "
                "instead.")

        # Draw one integer seed and reuse it for every input (and class) so
        # that all inputs are split with the same seed.
        rng = check_random_state(random_state)
        rng = draw_seed(rng, 0, _I4MAX, dtype="uint")
        if DASK_2130:
            if shuffle is None:
                shuffle = False
                warnings.warn(
                    message="The default value for 'shuffle' must be specified"
                    " when splitting DataFrames. In the future"
                    " DataFrames will automatically be shuffled within"
                    " blocks prior to splitting. Specify 'shuffle=True'"
                    " to adopt the future behavior now, or 'shuffle=False'"
                    " to retain the previous behavior.",
                    category=FutureWarning,
                )
            kwargs = {"shuffle": shuffle}
        else:
            # ``shuffle`` is not forwarded to ``random_split`` on this path,
            # so only the shuffled behaviour is available.
            if shuffle is None:
                shuffle = True
            if not shuffle:
                raise NotImplementedError(
                    "'shuffle=False' is not supported for DataFrames in"
                    f" dask versions<2.13.0. Current version is {DASK_VERSION}."
                )
            kwargs = {}

        if stratify is not None:
            train_test_pairs = []
            for arr in arrays:
                # list of class-wise train/test split dataframe slices
                arr_train_slices = []
                arr_test_slices = []
                for ci in classes:
                    # get subdf of data from this class
                    ci_arr = arr[stratify == ci]

                    # split subdf
                    arr_train, arr_test = ci_arr.random_split(
                        [train_size, test_size],
                        random_state=rng,
                        **kwargs,
                    )

                    # add subdf's from this class to list of all subdf's
                    arr_train_slices.append(arr_train)
                    arr_test_slices.append(arr_test)

                # concat all train subdfs as 1 train df, same for test
                train_test_pairs.append(
                    [dd.concat(arr_train_slices),
                     dd.concat(arr_test_slices)])
        else:
            train_test_pairs = [
                arr.random_split([train_size, test_size],
                                 random_state=rng,
                                 **kwargs) for arr in arrays
            ]
        return list(itertools.chain.from_iterable(train_test_pairs))

    elif all(isinstance(arr, da.Array) for arr in arrays):
        if shuffle is None:
            shuffle = True
        if not shuffle:
            raise NotImplementedError(
                "'shuffle=False' is not currently supported for dask Arrays.")
        if blockwise is None:
            blockwise = True

        splitter = ShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=train_size,
            blockwise=blockwise,
            random_state=random_state,
        )

        if stratify is not None:
            # convert to da.Array
            if not isinstance(stratify, da.Array):
                stratify = stratify.to_dask_array(lengths=True)

            # must be 1-d for indexing
            stratify = stratify.ravel()

            # Per-input lists of class-wise train/test slices.
            train_slices = [[] for _ in arrays]
            test_slices = [[] for _ in arrays]
            for ci in classes:
                class_mask = stratify == ci
                # Draw the indices once per class and reuse them for every
                # input, so corresponding rows of all inputs end up on the
                # same side of the split whatever ``random_state`` is.
                train_idx = test_idx = None
                for pos, arr in enumerate(arrays):
                    # get subarray of data from this class
                    ci_arr = arr[class_mask]
                    # FIXME: can chunks be determined lazily?
                    ci_arr.compute_chunk_sizes()

                    if train_idx is None:
                        train_idx, test_idx = next(splitter.split(ci_arr))

                    train_slices[pos].append(
                        _blockwise_slice(ci_arr, train_idx))
                    test_slices[pos].append(
                        _blockwise_slice(ci_arr, test_idx))

            # concat all train subarrays as 1 train arr, same for test
            train_test_pairs = [
                [da.concatenate(train_parts), da.concatenate(test_parts)]
                for train_parts, test_parts in zip(train_slices, test_slices)
            ]
        else:
            # One pair of index arrays is computed and applied to every
            # input.
            train_idx, test_idx = next(splitter.split(*arrays))

            train_test_pairs = ((_blockwise_slice(arr, train_idx),
                                 _blockwise_slice(arr, test_idx))
                                for arr in arrays)

        return list(itertools.chain.from_iterable(train_test_pairs))
    else:
        return ms.train_test_split(
            *arrays,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
            stratify=stratify,
            shuffle=shuffle,
        )