# Imports assumed by the split implementations collected below (a sketch of
# the module-level setup; `_reset_index` is a project helper defined
# elsewhere in the codebase and is not shown here):
import logging
from typing import Dict, List

import numpy as np
from sklearn.model_selection import _split
from sklearn.utils import indexable, validation
from sklearn.utils.validation import _num_samples

log = logging.getLogger(__name__)


def split(self, X, y=None, groups=None, window_length=4):
    """Yield (train, test) index pairs where the test fold is the group
    located ``window_length`` positions after the training group."""
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    # np.unique returns the sorted group ("era") labels
    group_lst = np.unique(groups)
    n_groups = len(group_lst)
    indices = np.arange(n_samples)
    # slide a window over the ordered groups: train on group i,
    # test on group i + window_length
    for i in range(n_groups - window_length):
        yield (indices[groups == group_lst[i]],
               indices[groups == group_lst[i + window_length]])

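def _demo_sliding_window_split(split_fn=split):
    """Usage sketch for the sliding-window splitter above. ``self`` is not
    used by the body, so None is passed in its place purely for demonstration;
    in the source this method presumably lives on a splitter class whose name
    is not shown here."""
    X = np.random.randn(30, 2)
    eras = np.repeat(np.arange(6), 5)  # 6 ordered eras of 5 rows each
    for train_idx, test_idx in split_fn(None, X, groups=eras, window_length=2):
        # train on era i, test on era i + 2
        print(eras[train_idx][0], "->", eras[test_idx][0])
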
def _split_cv(*arrays, y=None, groups=None, cv=3, random_state=None):
    '''Supervised splitting of one or more arrays.

    y - class labels; if None, no stratification is done
    groups - split by groups
    cv - number of splits

    return
    ----
    generator of lists containing the split arrays, shape = [m*n*k], for one fold:
    [(0train, 0test), (1train, 1test), ...]
    m - index of fold [0 : cv-1]
    n - index of variable/array [0 : n_arrays-1]
    k - index of the train(0)/test(1) set [0:1]
    '''
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    validation.check_consistent_length(*arrays, y, groups)
    arrays = list(arrays)
    if cv == 1:
        # degenerate case: a single "fold" where train and test are the full data
        if y is not None:
            arrays.append(y)
        return [[(i, i) for i in arrays]]
    # get cross validator
    if y is not None:
        arrays.append(y)
        cv = _split.check_cv(cv, y=y, classifier=True)
    else:
        cv = _split.check_cv(cv, classifier=False)
    # set random state
    if hasattr(cv, 'random_state'):
        cv.random_state = random_state
    # reset_index on pandas DataFrames or Series
    arrays = _reset_index(*arrays)
    arrays = _split.indexable(*arrays)
    # get indexing method (assumes an sklearn version where
    # model_selection._split still re-exports safe_indexing)
    safe_index = _split.safe_indexing
    train_test = ([(safe_index(i, train_index), safe_index(i, test_index))
                   for i in arrays]
                  for train_index, test_index in cv.split(arrays[0], y, groups))
    return train_test

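def _demo_split_cv(split_cv=_split_cv):
    """Usage sketch for `_split_cv` (assumes the project's `_reset_index`
    helper is available and that `sklearn.model_selection._split` exposes
    `safe_indexing`, which is only true for older sklearn releases)."""
    X = np.random.randn(20, 3)
    y = np.repeat([0, 1], 10)
    # one entry per fold; each entry holds a (train, test) pair per array
    for fold in split_cv(X, y=y, cv=4, random_state=0):
        (X_train, X_test), (y_train, y_test) = fold
        print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
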
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    group_lst = np.unique(groups)
    n_groups = len(group_lst)
    indices = np.arange(n_samples)
    # number of groups (eras) assigned to each fold
    cutoff_eras = n_groups // self.n_splits
    # shuffle the group order in place before slicing the folds
    np.random.shuffle(group_lst)
    for i in range(self.n_splits):
        # note: train and test cover the same slice of shuffled groups;
        # `groups.isin` assumes `groups` is a pandas Series
        fold_groups = group_lst[i * cutoff_eras:i * cutoff_eras + cutoff_eras]
        yield (indices[groups.isin(fold_groups)],
               indices[groups.isin(fold_groups)])

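def _demo_shuffled_era_split(split_fn=split):
    """Usage sketch for the shuffled per-era splitter above. It reads
    ``n_splits`` from ``self`` and calls ``groups.isin``, so a minimal
    stand-in object and a pandas Series of era labels are used here (the
    real class name is not shown in the source). Each yielded pair covers
    the same rows for train and test."""
    import pandas as pd

    class _StandIn:
        n_splits = 3

    X = np.random.randn(30, 2)
    eras = pd.Series(np.repeat(np.arange(6), 5))  # 6 eras of 5 rows each
    for train_idx, test_idx in split_fn(_StandIn(), X, groups=eras):
        print(sorted(set(eras[train_idx])), sorted(set(eras[test_idx])))
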
def split(self, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    group_list = np.unique(groups)
    n_groups = len(group_list)
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater"
             " than the number of groups: {1}.").format(n_folds, n_groups))
    indices = np.arange(n_samples)
    test_size = (n_groups // n_folds)
    # test blocks start after the remainder groups and advance by test_size;
    # iterate from the latest block back to the earliest
    test_starts = range(test_size + n_groups % n_folds, n_groups, test_size)
    test_starts = list(test_starts)[::-1]
    for test_start in test_starts:
        # expanding window: train on every group before the test block
        # (`groups.isin` assumes `groups` is a pandas Series)
        yield (indices[groups.isin(group_list[:test_start])],
               indices[groups.isin(group_list[test_start:test_start + test_size])])

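def _demo_group_time_series_split(split_fn=split):
    """Usage sketch for the expanding-window splitter above. The method reads
    ``n_splits`` from ``self`` and calls ``groups.isin``, so a minimal
    stand-in object and a pandas Series of group labels are used here (the
    real class name is not shown in the source)."""
    import pandas as pd

    class _StandIn:
        n_splits = 3

    X = np.random.randn(40, 2)
    groups = pd.Series(np.repeat(np.arange(8), 5))  # 8 ordered groups
    for train_idx, test_idx in split_fn(_StandIn(), X, groups=groups):
        print("train groups:", np.unique(groups[train_idx]),
              "test groups:", np.unique(groups[test_idx]))
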
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
        Note that providing ``y`` is sufficient to generate the splits and
        hence ``np.zeros(n_samples)`` may be used as a placeholder for
        ``X`` instead of actual training data.
    y : array-like, shape (n_samples,)
        The target variable for supervised learning problems.
        Stratification is done based on the y labels.
    groups : object
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.

    Notes
    -----
    Randomized CV splitters may return different results for each call of
    split. You can make the results identical by setting ``random_state``
    to an integer.
    """
    X, y, groups = indexable(X, y, groups)
    # the parent splitter's folds are consumed with their roles swapped:
    # its first index array becomes the test set, its second the training set
    for test, train in super().split(X, y, groups):
        if self.shuffle:
            # reseed on every fold so shuffling is reproducible per split
            np.random.seed(self.random_state)
            np.random.shuffle(train)
        yield train, test

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None")
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    group_gap = self.group_gap
    max_test_group_size = self.max_test_group_size
    max_train_group_size = self.max_train_group_size
    n_folds = n_splits + 1
    group_dict = {}
    # np.unique returns sorted labels; re-sort them by first occurrence
    u, ind = np.unique(groups, return_index=True)
    unique_groups = u[np.argsort(ind)]
    n_groups = _num_samples(unique_groups)
    # map each group label to the row indices belonging to it
    for idx in np.arange(n_samples):
        if groups[idx] in group_dict:
            group_dict[groups[idx]].append(idx)
        else:
            group_dict[groups[idx]] = [idx]
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater than"
             " the number of groups={1}").format(n_folds, n_groups))
    group_test_size = min(n_groups // n_folds, max_test_group_size)
    group_test_starts = range(n_groups - n_splits * group_test_size,
                              n_groups, group_test_size)
    for group_test_start in group_test_starts:
        train_array = []
        test_array = []
        # the training window ends `group_gap` groups before the test block
        # and reaches back at most `max_train_group_size` groups
        group_st = max(0, group_test_start - group_gap - max_train_group_size)
        for train_group_idx in unique_groups[group_st:(group_test_start -
                                                       group_gap)]:
            train_array_tmp = group_dict[train_group_idx]
            train_array = np.sort(
                np.unique(np.concatenate((train_array, train_array_tmp)),
                          axis=None),
                axis=None)
        # number of training samples gathered for this fold
        train_end = train_array.size
        for test_group_idx in unique_groups[group_test_start:group_test_start +
                                            group_test_size]:
            test_array_tmp = group_dict[test_group_idx]
            test_array = np.sort(
                np.unique(np.concatenate((test_array, test_array_tmp)),
                          axis=None),
                axis=None)
        # purge the first `group_gap` samples of the test block
        test_array = test_array[group_gap:]
        if self.verbose > 0:
            # placeholder for optional verbose output
            pass
        yield [int(i) for i in train_array], [int(i) for i in test_array]

def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None")
    X, y, groups = indexable(X, y, groups)
    n_folds = self.n_splits + 1
    # np.unique returns sorted groups
    u, ind = np.unique(groups, return_index=True)
    # re-sort unique groups in order of first occurrence
    unique_groups = u[np.argsort(ind)]
    log.debug(f"u={u}, unique_groups={unique_groups}")
    n_samples = _num_samples(X)
    n_groups = _num_samples(unique_groups)
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater than"
             " the number of groups={1}").format(n_folds, n_groups))
    # map each group label to the row indices belonging to it
    group_dict: Dict[int, List[int]] = {}
    for idx in np.arange(n_samples):
        if groups[idx] in group_dict:
            group_dict[groups[idx]].append(idx)
        else:
            group_dict[groups[idx]] = [idx]
    group_test_size = min(n_groups // n_folds, self.max_test_group_size)
    group_test_starts = range(n_groups - self.n_splits * group_test_size,
                              n_groups, group_test_size)
    for group_test_start in group_test_starts:
        train_array = []
        test_array = []
        # the training window ends `group_gap` groups before the test block
        # and reaches back at most `max_train_group_size` groups
        group_st = max(
            0, group_test_start - self.group_gap - self.max_train_group_size)
        log.debug(
            f"group_st={group_st}, group_test_size={group_test_size}, "
            f"group_test_starts={group_test_starts}")
        for train_group_idx in unique_groups[group_st:(group_test_start -
                                                       self.group_gap)]:
            tmp = group_dict[train_group_idx]
            train_array = np.sort(
                np.unique(np.concatenate((train_array, tmp)), axis=None),
                axis=None)
        for test_group_idx in unique_groups[group_test_start:group_test_start +
                                            group_test_size]:
            tmp = group_dict[test_group_idx]
            test_array = np.sort(
                np.unique(np.concatenate((test_array, tmp)), axis=None),
                axis=None)
        yield [int(i) for i in train_array], [int(i) for i in test_array]

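def _demo_purged_group_time_series_split(split_fn=split):
    """Usage sketch for the gapped group time-series splitter above. The
    class name and constructor are not shown in the source, so a minimal
    stand-in carrying the attributes the method reads from ``self`` is used
    here purely for illustration."""
    class _StandIn:
        n_splits = 3
        group_gap = 1
        max_test_group_size = 2
        max_train_group_size = 4

    X = np.random.randn(60, 2)
    groups = np.repeat(np.arange(12), 5)  # 12 ordered groups of 5 rows each
    for train_idx, test_idx in split_fn(_StandIn(), X, groups=groups):
        print("train groups:", sorted(set(groups[train_idx])),
              "test groups:", sorted(set(groups[test_idx])))
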