Example #1
0
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Lazily yield ``(i, chunk)`` pairs, one chunk of ``data`` per group.

    ``data`` is reordered along ``axis`` with the indexer returned by
    ``lib.groupsort_indexer`` and then cut into consecutive slices whose
    bounds come from ``lib.generate_slices``.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split.
    group_index : array-like
        Group id of every element along ``axis``.  Must support ``astype``
        and ``take`` (presumably a NumPy integer array); it is cast to
        int32 before being handed to the ``lib`` routines.
    ngroups : int
        Number of groups, forwarded unchanged to the ``lib`` routines.
    axis : int, default 0
        Axis along which BlockManager / DataFrame input is reordered and
        sliced.  Not used for Series input.
    factory : callable, default identity
        Applied to every BlockManager slice before it is yielded.  Not
        used for Series or DataFrame input.

    Returns
    -------
    generator
        Yields ``(i, chunk)`` where ``i`` is the position of the slice in
        the output of ``lib.generate_slices``.

    Raises
    ------
    TypeError
        If ``data`` is not a BlockManager, Series or DataFrame.  Because
        this is a generator, the error surfaces on first iteration.
    """
    # Cast once up front: ``take`` keeps the dtype, so the reordered ids
    # below are already int32 when they reach ``lib.generate_slices``.
    group_index = group_index.astype("i4")
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    # Reorder ``data`` with the same indexer that reordered the group ids.
    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``sorted_data``.
        raise TypeError(
            "generate_groups expects a BlockManager, Series or DataFrame, "
            "got %s" % type(data).__name__
        )

    # Pick the slicing strategy from the type of the *reordered* object.
    if isinstance(sorted_data, DataFrame):

        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]

    elif isinstance(sorted_data, BlockManager):

        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))

    elif isinstance(sorted_data, Series):

        def _get_slice(slob):
            return sorted_data._get_values(slob)

    else:  # pragma: no cover

        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since I'm now compressing the group ids, it's now not "possible" to
        # produce empty slices because such groups would not be observed in the
        # data
        assert start < end
        yield i, _get_slice(slice(start, end))
Example #2
0
def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
    """
    Lazily yield ``(i, chunk)`` pairs, one per slice of grouped ``data``.

    A flat group id is built from ``label_list`` and ``shape`` via
    ``get_group_index``; positions where any label equals -1 get the id -1.
    ``data`` is then reordered along ``axis`` with the indexer returned by
    ``lib.groupsort_indexer`` and cut into consecutive slices whose bounds
    come from ``lib.generate_slices``.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split.
    label_list : sequence of ndarray
        One label array per grouping key, all of the same length.  A value
        of -1 in any of them marks that position as missing.
    shape : sequence of int
        Forwarded to ``get_group_index``; its product is used as the
        number of groups passed to the ``lib`` routines.
    axis : int, default 0
        Axis along which BlockManager / DataFrame input is reordered and
        sliced.  Not used for Series input.
    factory : callable, default identity
        Applied to every BlockManager slice before it is yielded.  Not
        used for Series or DataFrame input.

    Returns
    -------
    generator
        Yields ``(i, chunk)`` where ``i`` is the position of the slice in
        the output of ``lib.generate_slices``; ``chunk`` is ``None`` when
        the slice is empty (``start == end``).

    Raises
    ------
    TypeError
        If ``data`` is not a BlockManager, Series or DataFrame.  Because
        this is a generator, the error surfaces on first iteration.
    """
    group_index = get_group_index(label_list, shape)

    # A position is missing as soon as any one of its labels is -1.
    na_mask = np.zeros(len(label_list[0]), dtype=bool)
    for arr in label_list:
        na_mask |= arr == -1
    group_index[na_mask] = -1

    # Compute the group count and the int32 view of the ids once; ``take``
    # keeps the dtype, so the reordered ids stay int32.
    ngroups = np.prod(shape)
    group_index = group_index.astype('i4')
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    # Reorder ``data`` with the same indexer that reordered the group ids.
    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``sorted_data``.
        raise TypeError(
            "generate_groups expects a BlockManager, Series or DataFrame, "
            "got %s" % type(data).__name__
        )

    # Pick the slicing strategy from the type of the *reordered* object.
    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        if start == end:
            # Empty slice: report the group but hand back no data.
            yield i, None
        else:
            yield i, _get_slice(slice(start, end))
Example #3
0
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Lazily yield ``(i, chunk)`` pairs, one chunk of ``data`` per group.

    ``data`` is reordered along ``axis`` with the indexer returned by
    ``lib.groupsort_indexer`` and then cut into consecutive slices whose
    bounds come from ``lib.generate_slices``.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split.
    group_index : array-like
        Group id of every element along ``axis``; passed through
        ``com._ensure_int32`` before use.
    ngroups : int
        Number of groups, forwarded unchanged to the ``lib`` routines.
    axis : int, default 0
        Axis along which BlockManager / DataFrame input is reordered and
        sliced.  Not used for Series input.
    factory : callable, default identity
        Applied to every BlockManager slice before it is yielded.  Not
        used for Series or DataFrame input.

    Returns
    -------
    generator
        Yields ``(i, chunk)`` where ``i`` is the position of the slice in
        the output of ``lib.generate_slices``.

    Raises
    ------
    TypeError
        If ``data`` is not a BlockManager, Series or DataFrame.  Because
        this is a generator, the error surfaces on first iteration.
    """
    group_index = com._ensure_int32(group_index)

    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    # Reorder ``data`` with the same indexer that reordered the group ids.
    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``sorted_data``.
        raise TypeError(
            "generate_groups expects a BlockManager, Series or DataFrame, "
            "got %s" % type(data).__name__
        )

    # Pick the slicing strategy from the type of the *reordered* object.
    if isinstance(sorted_data, DataFrame):

        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):

        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):

        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover

        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since I'm now compressing the group ids, it's now not "possible" to
        # produce empty slices because such groups would not be observed in the
        # data
        assert start < end
        yield i, _get_slice(slice(start, end))