Example 1
def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype(np.int64)
    b = np.random.randint(0, 1000, 100).astype(np.int64)

    result = lib.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    expected = np.argsort(a, kind='mergesort')
    assert np.array_equal(result, expected)

    # compare with lexsort
    key = a * 1000 + b
    result = lib.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    assert np.array_equal(result, expected)
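
The test above relies on lib.groupsort_indexer behaving like a stable counting sort that returns an (indexer, counts) pair, with counts[0] reserved for the -1 (missing) label. As a reference, here is a minimal pure-NumPy sketch of that behavior; groupsort_indexer_ref is a hypothetical name, not part of pandas:

import numpy as np

def groupsort_indexer_ref(labels, ngroups):
    # Stable counting sort over group ids in [-1, ngroups); -1 marks
    # missing values and is tallied in counts[0].
    counts = np.zeros(ngroups + 1, dtype=np.int64)
    for lab in labels:
        counts[lab + 1] += 1
    # where[k] holds the next output slot for label k - 1
    where = np.zeros(ngroups + 1, dtype=np.int64)
    where[1:] = np.cumsum(counts[:-1])
    indexer = np.empty(len(labels), dtype=np.int64)
    for i, lab in enumerate(labels):  # equal labels keep input order
        indexer[where[lab + 1]] = i
        where[lab + 1] += 1
    return indexer, counts

Because counting sort is stable, the sketch satisfies the same property the test asserts: groupsort_indexer_ref(a, 1000)[0] equals np.argsort(a, kind='mergesort').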
Example 2
def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype('i4')
    b = np.random.randint(0, 1000, 100).astype('i4')

    result = lib.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    expected = np.argsort(a, kind='mergesort')
    assert np.array_equal(result, expected)

    # compare with lexsort
    key = a * 1000 + b
    result = lib.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    assert np.array_equal(result, expected)
Example 3
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Parameters
    ----------
    data : BlockManager, Series, or DataFrame

    Returns
    -------
    generator
    """
    indexer = lib.groupsort_indexer(group_index.astype("i4"), ngroups)[0]
    group_index = group_index.take(indexer)

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)

    if isinstance(sorted_data, DataFrame):

        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]

    elif isinstance(sorted_data, BlockManager):

        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))

    elif isinstance(sorted_data, Series):

        def _get_slice(slob):
            return sorted_data._get_values(slob)

    else:  # pragma: no cover

        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index.astype("i4"), ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since the group ids are compressed, empty slices cannot occur:
        # every id corresponds to a group that is observed in the data
        assert start < end
        yield i, _get_slice(slice(start, end))
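
generate_groups walks the sorted data in contiguous [start, end) runs produced by lib.generate_slices. A sketch of that helper under the same conventions (generate_slices_ref is a hypothetical name):

import numpy as np

def generate_slices_ref(sorted_ids, ngroups):
    # Given group ids already gathered by groupsort_indexer, return the
    # [start, end) bounds of each group; ids of -1 (missing) are skipped
    # and unobserved groups keep (0, 0).
    starts = np.zeros(ngroups, dtype=np.int64)
    ends = np.zeros(ngroups, dtype=np.int64)
    n = len(sorted_ids)
    start = 0
    for i in range(1, n + 1):
        if i == n or sorted_ids[i] != sorted_ids[i - 1]:
            g = sorted_ids[i - 1]
            if g >= 0:
                starts[g], ends[g] = start, i
            start = i
    return starts, ends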
Example 4
def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
    """
    Parameters
    ----------
    data : BlockManager, Series, or DataFrame

    Returns
    -------
    generator
    """
    group_index = get_group_index(label_list, shape)
    na_mask = np.zeros(len(label_list[0]), dtype=bool)
    for arr in label_list:
        na_mask |= arr == -1
    group_index[na_mask] = -1
    indexer = lib.groupsort_indexer(group_index.astype('i4'),
                                    np.prod(shape))[0]
    group_index = group_index.take(indexer)

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)

    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index.astype('i4'),
                                       np.prod(shape))

    for i, (start, end) in enumerate(zip(starts, ends)):
        if start == end:
            yield i, None
        else:
            yield i, _get_slice(slice(start, end))
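
This version first flattens several label arrays into one group_index via get_group_index. The arithmetic is row-major mixed-radix encoding, the same trick as key = a * 1000 + b in the tests above; a sketch under that assumption (hypothetical name):

import numpy as np

def get_group_index_ref(label_list, shape):
    # Fold per-level labels into a single flat group id using row-major
    # (mixed-radix) arithmetic, like np.ravel_multi_index.
    group_index = np.zeros(len(label_list[0]), dtype=np.int64)
    for labels, size in zip(label_list, shape):
        group_index = group_index * size + labels
    return group_index

Note that -1 (missing) labels corrupt the encoding, which is why the example masks such rows back to -1 after the fact.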
Example 5
def _aggregate_series_fast(obj, func, group_index, ngroups):
    if obj.index._has_complex_internals:
        raise TypeError('Incompatible index for Cython grouper')

    # avoids object / Series creation overhead
    dummy = obj[:0]
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    obj = obj.take(indexer)
    group_index = group_index.take(indexer)
    grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, dummy)
    result, counts = grouper.get_result()
    return result, counts
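
SeriesGrouper's core idea is sort-once-then-scan: after taking both obj and group_index through the indexer, each group occupies one contiguous run. A sketch of that aggregation loop, built on the groupsort_indexer_ref sketch shown after Example 1 (all names hypothetical):

import numpy as np

def aggregate_series_ref(values, func, group_index, ngroups):
    # Sort values by group id once, then apply func to each contiguous run.
    indexer, counts = groupsort_indexer_ref(group_index, ngroups)
    values = np.asarray(values).take(indexer)
    result = np.empty(ngroups, dtype=object)
    start = counts[0]  # rows labelled -1 sit at the front; skip them
    for g in range(ngroups):
        end = start + counts[g + 1]
        if end > start:
            result[g] = func(values[start:end])
        start = end
    return result, counts[1:]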
Example 6
    def _aggregate_series_fast(self, obj, func, group_index, ngroups):
        if obj.index._has_complex_internals:
            raise TypeError("Incompatible index for Cython grouper")

        # avoids object / Series creation overhead
        dummy = obj[:0].copy()
        indexer = lib.groupsort_indexer(group_index, ngroups)[0]
        obj = obj.take(indexer)
        group_index = group_index.take(indexer)
        grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, dummy)
        result, counts = grouper.get_result()
        return result, counts
Example 7
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Parameters
    ----------
    data : BlockManager, Series, or DataFrame

    Returns
    -------
    generator
    """
    group_index = com._ensure_int32(group_index)

    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)

    if isinstance(sorted_data, DataFrame):

        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):

        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):

        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover

        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since the group ids are compressed, empty slices cannot occur:
        # every id corresponds to a group that is observed in the data
        assert start < end
        yield i, _get_slice(slice(start, end))
Example 8
def _indexer_from_factorized(labels, shape, compress=True):
    if _int64_overflow_possible(shape):
        indexer = np.lexsort(np.array(labels[::-1]))
        return indexer

    group_index = get_group_index(labels, shape)

    if compress:
        comp_ids, obs_ids = _compress_group_index(group_index)
        max_group = len(obs_ids)
    else:
        comp_ids = group_index
        max_group = np.prod(shape)

    indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)

    return indexer
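
When prod(shape) could overflow int64, the code above falls back to np.lexsort; otherwise it may compress the sparse flat ids onto a dense range so the counting sort stays linear in the number of observed groups rather than prod(shape). A sketch of the compression step (hypothetical name; the real helper uses a hash table and can preserve order of appearance rather than sorted order):

import numpy as np

def compress_group_index_ref(group_index):
    # Map sparse flat ids onto dense ids 0..len(obs_ids)-1 so that
    # groupsort_indexer only needs len(obs_ids) counting buckets.
    obs_ids, comp_ids = np.unique(group_index, return_inverse=True)
    return comp_ids.astype(np.int64), obs_ids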
Example 9
def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
    """
    Parameters
    ----------
    data : BlockManager, Series, or DataFrame

    Returns
    -------
    generator
    """
    # indexer = np.lexsort(label_list[::-1])
    group_index = get_group_index(label_list, shape)
    na_mask = np.zeros(len(label_list[0]), dtype=bool)
    for arr in label_list:
        na_mask |= arr == -1
    group_index[na_mask] = -1
    indexer = lib.groupsort_indexer(group_index.astype('i4'),
                                    np.prod(shape))[0]

    sorted_labels = [labels.take(indexer) for labels in label_list]

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)

    gen = _generate_groups(sorted_data, sorted_labels, shape,
                           0, len(label_list[0]), axis=axis, which=0,
                           factory=factory)
    for key, group in gen:
        yield key, group
Example 10
    def _read_panel_table(self, group, where=None):
        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'],
                               table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor(index)
        minor = Factor(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(key, J * K)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block], [block.items,
                                         major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print('Duplicate entries in table, taking most recently '
                      'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
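
The fast path sorts rows by key = major.labels * K + minor.labels and hands them to block2d_to_block3d. In the special case where all J * K (major, minor) pairs are present exactly once, that step is just a reshape; a sketch under that density assumption (hypothetical name; the real block2d_to_block3d also fills missing pairs with NaN):

import numpy as np

def long_to_panel_ref(sorted_values, J, K):
    # With rows sorted by key = major * K + minor and every pair present,
    # the (J * K, n_fields) table reshapes directly to one (J, K) slab
    # per field.
    n_fields = sorted_values.shape[1]
    return sorted_values.reshape(J, K, n_fields).transpose(2, 0, 1)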
Example 11
    def _read_panel_table(self, group, where=None):
        from pandas.core.index import unique_int64, Factor
        from pandas.core.common import _asarray_tuplesafe
        from pandas.core.internals import BlockManager
        from pandas.core.reshape import block2d_to_block3d

        table = getattr(group, "table")

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values["column"], table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values["index"], table._v_attrs.index_kind)
        values = sel.values["values"]

        major = Factor(index)
        minor = Factor(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique_int64(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(key, J * K)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K), major_labels, minor_labels)

            mgr = BlockManager([block], [block.items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print ("Duplicate entries in table, taking most recently " "appended")

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Example 12
    def _read_panel_table(self, group, where=None):
        from pandas.core.index import unique_int64, Factor
        from pandas.core.common import _asarray_tuplesafe
        from pandas.core.internals import BlockManager
        from pandas.core.reshape import block2d_to_block3d

        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor(index)
        minor = Factor(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique_int64(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(key, J * K)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block],
                               [block.items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Example 13

import numpy as np

import pandas._tseries as lib
from pandas.core.groupby import get_group_index

# df is assumed to be an existing DataFrame with columns 'A', 'B' and 'C'
grouped = df.groupby(['A', 'B'])

label_list = [ping.labels for ping in grouped.groupings]
shape = [len(ping.ids) for ping in grouped.groupings]

group_index = get_group_index(label_list, shape).astype('i4')
ngroups = np.prod(shape)

# groupsort_indexer returns (indexer, counts); only the indexer is needed
indexer = lib.groupsort_indexer(group_index, ngroups)[0]

values = df['C'].values.take(indexer)
group_index = group_index.take(indexer)

# note: np.ndarray.std defaults to ddof=0, while grouped.std() uses ddof=1
f = lambda x: x.std(ddof=1)

grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups)
result = grouper.get_result()

expected = grouped.std()
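
For a self-contained check of the same pipeline without the Cython lib, the reference sketches above compose end to end, using get_group_index_ref (after Example 4) and aggregate_series_ref (after Example 5); the data and ddof choice here are illustrative:

import numpy as np

a = np.random.randint(0, 4, 100)     # labels for level 'A'
b = np.random.randint(0, 3, 100)     # labels for level 'B'
vals = np.random.randn(100)

group_index = get_group_index_ref([a, b], (4, 3))
result, counts = aggregate_series_ref(
    vals, lambda x: x.std(ddof=1), group_index, 4 * 3)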