def test_groupsort_indexer():
    """groupsort_indexer must agree with numpy's stable sort and lexsort."""
    a = np.random.randint(0, 1000, 100).astype(np.int64)
    b = np.random.randint(0, 1000, 100).astype(np.int64)

    # a stable sort (mergesort) is the reference behavior
    indexer, _ = lib.groupsort_indexer(a, 1000)
    assert np.array_equal(indexer, np.argsort(a, kind='mergesort'))

    # sorting on the combined key is equivalent to lexsort on (b, a)
    combined = a * 1000 + b
    indexer, _ = lib.groupsort_indexer(combined, 1000000)
    assert np.array_equal(indexer, np.lexsort((b, a)))
def test_groupsort_indexer():
    """Check groupsort_indexer against a stable argsort and a lexsort."""
    a = np.random.randint(0, 1000, 100).astype('i4')
    b = np.random.randint(0, 1000, 100).astype('i4')

    sorter, _ = lib.groupsort_indexer(a, 1000)
    # need to use a stable sort for the comparison to be meaningful
    expected_order = np.argsort(a, kind='mergesort')
    assert np.array_equal(sorter, expected_order)

    # compare with lexsort on a combined key
    key = a * 1000 + b
    sorter, _ = lib.groupsort_indexer(key, 1000000)
    assert np.array_equal(sorter, np.lexsort((b, a)))
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Yield (group number, group data) pairs in group-sorted order.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split into groups.
    group_index : ndarray
        Integer group id per observation along ``axis``; ids are assumed
        compressed (every id in range(ngroups) is observed).
    ngroups : int
        Number of distinct groups.
    axis : int, default 0
        Axis along which ``data`` is split.
    factory : callable, default identity
        Applied to each BlockManager slice before yielding.

    Returns
    -------
    generator of (int, sliced data)
    """
    # sort observations by group id, then carve out contiguous slices
    indexer = lib.groupsort_indexer(group_index.astype("i4"), ngroups)[0]
    group_index = group_index.take(indexer)

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:  # pragma: no cover
        # previously an unsupported type fell through the (unchained)
        # isinstance checks and raised a confusing NameError below;
        # fail fast with a clear message instead
        raise TypeError('Cannot generate groups from %s' % type(data))

    # pick a slicer matching the sorted container type
    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index.astype("i4"), ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since the group ids are compressed, it's not "possible" to
        # produce empty slices: such groups would not be observed in the
        # data
        assert start < end
        yield i, _get_slice(slice(start, end))
def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
    """
    Yield (group number, group data or None) pairs for a multi-level grouping.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split into groups.
    label_list : list of ndarray
        Integer codes per grouping level; -1 marks a missing (NA) entry.
    shape : sequence of int
        Number of distinct values per level.
    axis : int, default 0
        Axis along which ``data`` is split.
    factory : callable, default identity
        Applied to each BlockManager slice before yielding.

    Returns
    -------
    generator of (int, sliced data) — ``None`` is yielded for groups with
    no observations.
    """
    group_index = get_group_index(label_list, shape)

    # any row that is NA (-1) in *any* level gets combined id -1 so it is
    # excluded from real groups (NOTE(review): relies on groupsort_indexer
    # handling -1 ids — confirm placement of -1 entries)
    na_mask = np.zeros(len(label_list[0]), dtype=bool)
    for arr in label_list:
        na_mask |= arr == -1
    group_index[na_mask] = -1

    indexer = lib.groupsort_indexer(group_index.astype('i4'),
                                    np.prod(shape))[0]
    group_index = group_index.take(indexer)

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:  # pragma: no cover
        # previously an unsupported type fell through the (unchained)
        # isinstance checks and raised a confusing NameError below
        raise TypeError('Cannot generate groups from %s' % type(data))

    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index.astype('i4'),
                                       np.prod(shape))

    for i, (start, end) in enumerate(zip(starts, ends)):
        if start == end:
            # group id i was never observed
            yield i, None
        else:
            yield i, _get_slice(slice(start, end))
def _aggregate_series_fast(obj, func, group_index, ngroups):
    """
    Aggregate a Series group-wise using the Cython SeriesGrouper.

    Parameters
    ----------
    obj : Series
    func : callable
        Aggregation function applied to each group.
    group_index : ndarray
        Integer group id per observation.
    ngroups : int
        Number of distinct groups.

    Returns
    -------
    (result, counts)

    Raises
    ------
    TypeError
        If the index has internals the Cython grouper cannot handle.
    """
    if obj.index._has_complex_internals:
        raise TypeError('Incompatible index for Cython grouper')

    # avoids object / Series creation overhead.  Copy the empty slice so
    # the grouper works on its own buffer rather than a view into obj
    # (matches the sibling method implementation of this routine).
    dummy = obj[:0].copy()

    # sort the data by group so each group occupies a contiguous slice
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    obj = obj.take(indexer)
    group_index = group_index.take(indexer)

    grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
                                dummy)
    result, counts = grouper.get_result()
    return result, counts
def _aggregate_series_fast(self, obj, func, group_index, ngroups):
    """Aggregate ``obj`` per group via the Cython SeriesGrouper."""
    if obj.index._has_complex_internals:
        raise TypeError("Incompatible index for Cython grouper")

    # empty-slice copy up front: avoids object / Series creation overhead
    dummy = obj[:0].copy()

    # group-sort so every group is a contiguous run
    sort_idx = lib.groupsort_indexer(group_index, ngroups)[0]
    obj = obj.take(sort_idx)
    group_index = group_index.take(sort_idx)

    grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, dummy)
    return grouper.get_result()
def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x):
    """
    Yield (group number, group data) pairs in group-sorted order.

    Parameters
    ----------
    data : BlockManager
        May also be a Series or DataFrame (see dispatch below).
    group_index : ndarray
        Integer group id per observation; ids are compressed, so every id
        in range(ngroups) is observed at least once.
    ngroups : int
        Number of distinct groups.
    axis : int, default 0
        Axis along which ``data`` is split.
    factory : callable, default identity
        Applied to each BlockManager slice before yielding.

    Returns
    -------
    generator
    """
    # the Cython routines below expect int32 group ids
    group_index = com._ensure_int32(group_index)

    # group-sort: element [0] is the indexer, [1] the per-group counts
    indexer = lib.groupsort_indexer(group_index, ngroups)[0]
    group_index = group_index.take(indexer)

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    # NOTE(review): not chained (elif) to the branch above; an unsupported
    # data type leaves sorted_data unbound and raises NameError below
    if isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)

    # choose a slicer appropriate to the sorted container type
    if isinstance(sorted_data, DataFrame):
        def _get_slice(slob):
            if axis == 0:
                return sorted_data[slob]
            else:
                return sorted_data.ix[:, slob]
    elif isinstance(sorted_data, BlockManager):
        def _get_slice(slob):
            return factory(sorted_data.get_slice(slob, axis=axis))
    elif isinstance(sorted_data, Series):
        def _get_slice(slob):
            return sorted_data._get_values(slob)
    else:  # pragma: no cover
        def _get_slice(slob):
            return sorted_data[slob]

    starts, ends = lib.generate_slices(group_index, ngroups)

    for i, (start, end) in enumerate(zip(starts, ends)):
        # Since I'm now compressing the group ids, it's now not "possible" to
        # produce empty slices because such groups would not be observed in the
        # data
        assert (start < end)
        yield i, _get_slice(slice(start, end))
def _indexer_from_factorized(labels, shape, compress=True):
    """Return the indexer that sorts observations by their factorized labels."""
    if _int64_overflow_possible(shape):
        # a single combined int key would overflow; lexsort level-by-level
        return np.lexsort(np.array(labels[::-1]))

    group_index = get_group_index(labels, shape)

    if compress:
        # shrink the id space to only the observed groups
        comp_ids, obs_ids = _compress_group_index(group_index)
        max_group = len(obs_ids)
    else:
        comp_ids = group_index
        max_group = np.prod(shape)

    sorter, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
    return sorter
def generate_groups(data, label_list, shape, axis=0, factory=lambda x: x):
    """
    Yield (key, group data) pairs for a multi-level grouping.

    Parameters
    ----------
    data : BlockManager, Series or DataFrame
        Object to split into groups.
    label_list : list of ndarray
        Integer codes per grouping level; -1 marks a missing (NA) entry.
    shape : sequence of int
        Number of distinct values per level.
    axis : int, default 0
        Axis along which ``data`` is split.
    factory : callable, default identity
        Applied to each BlockManager slice before yielding.

    Returns
    -------
    generator
    """
    group_index = get_group_index(label_list, shape)

    # rows NA (-1) in *any* level get combined id -1 and are thereby
    # excluded from real groups
    na_mask = np.zeros(len(label_list[0]), dtype=bool)
    for arr in label_list:
        na_mask |= arr == -1
    group_index[na_mask] = -1

    # groupsort_indexer returns (indexer, counts); only the indexer is
    # wanted here -- passing the whole tuple to .take() is incorrect
    indexer = lib.groupsort_indexer(group_index.astype('i4'),
                                    np.prod(shape))[0]

    sorted_labels = [labels.take(indexer) for labels in label_list]

    if isinstance(data, BlockManager):
        # this is sort of wasteful but...
        sorted_axis = data.axes[axis].take(indexer)
        sorted_data = data.reindex_axis(sorted_axis, axis=axis)
    elif isinstance(data, Series):
        sorted_axis = data.index.take(indexer)
        sorted_data = data.reindex(sorted_axis)
    elif isinstance(data, DataFrame):
        sorted_data = data.take(indexer, axis=axis)
    else:  # pragma: no cover
        # previously an unsupported type fell through the (unchained)
        # isinstance checks and raised a confusing NameError below
        raise TypeError('Cannot generate groups from %s' % type(data))

    # recurse through the levels, starting with the full row range
    gen = _generate_groups(sorted_data, sorted_labels, shape,
                           0, len(label_list[0]), axis=axis, which=0,
                           factory=factory)
    for key, group in gen:
        yield key, group
def _read_panel_table(self, group, where=None):
    """
    Read a long-format HDF5 table from ``group`` and rebuild a Panel.

    If every (index, column) pair occurs at most once, the values are
    group-sorted and reshaped directly into a 3-D block; otherwise the
    table is deduplicated first (the printed message says the most
    recently appended entry wins -- see the else branch).

    Parameters
    ----------
    group : PyTables group holding a ``table`` node
    where : selection criterion passed to Selection, optional

    Returns
    -------
    Panel
    """
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()
    fields = table._v_attrs.fields  # re-read after select (redundant)

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    # factorize both axes to integer labels
    major = Factor(index)
    minor = Factor(columns)

    J, K = len(major.levels), len(minor.levels)
    # combined (major, minor) key; unique iff no duplicate entries
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(key, J * K)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.items,
                                     major.levels, minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print ('Duplicate entries in table, taking most recently '
                   'appended')

        # reconstruct a long-format frame and deduplicate by tuple key
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # one row kept per unique (index, column) tuple
        # (presumably the most recent one, per the message above — verify
        # against match/fast_unique semantics)
        indexer = match(unique_tuples, tuple_index)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict the minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def _read_panel_table(self, group, where=None):
    """
    Read a long-format HDF5 table from ``group`` and rebuild a Panel.

    Fast path: when every (index, column) pair is unique, values are
    group-sorted and reshaped straight into a 3-D block.  Slow path:
    duplicates are removed via a unique-tuple merge (the printed message
    says the most recently appended entry wins).

    Parameters
    ----------
    group : PyTables group holding a ``table`` node
    where : selection criterion passed to Selection, optional

    Returns
    -------
    Panel
    """
    from pandas.core.index import unique_int64, Factor
    from pandas.core.common import _asarray_tuplesafe
    from pandas.core.internals import BlockManager
    from pandas.core.reshape import block2d_to_block3d

    table = getattr(group, "table")

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values["column"],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values["index"],
                           table._v_attrs.index_kind)
    values = sel.values["values"]

    # factorize both axes to integer labels
    major = Factor(index)
    minor = Factor(columns)

    J, K = len(major.levels), len(minor.levels)
    # combined (major, minor) key; unique iff no duplicate entries
    key = major.labels * K + minor.labels

    if len(unique_int64(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(key, J * K)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.items,
                                     major.levels, minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print ("Duplicate entries in table, taking most recently "
                   "appended")

        # reconstruct a long-format frame and deduplicate by tuple key
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # one row kept per unique (index, column) tuple
        # (presumably the most recent one, per the message above — verify
        # against map_indices_object/merge_indexer_object semantics)
        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict the minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def _read_panel_table(self, group, where=None):
    """
    Read a long-format HDF5 table from ``group`` and rebuild a Panel.

    Fast path: when every (index, column) pair is unique, values are
    group-sorted and reshaped straight into a 3-D block.  Slow path:
    duplicates are removed via a unique-tuple merge (the printed message
    says the most recently appended entry wins).

    Parameters
    ----------
    group : PyTables group holding a ``table`` node
    where : selection criterion passed to Selection, optional

    Returns
    -------
    Panel
    """
    from pandas.core.index import unique_int64, Factor
    from pandas.core.common import _asarray_tuplesafe
    from pandas.core.internals import BlockManager
    from pandas.core.reshape import block2d_to_block3d

    table = getattr(group, 'table')

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    # factorize both axes to integer labels
    major = Factor(index)
    minor = Factor(columns)

    J, K = len(major.levels), len(minor.levels)
    # combined (major, minor) key; unique iff no duplicate entries
    key = major.labels * K + minor.labels

    if len(unique_int64(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(key, J * K)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.items,
                                     major.levels, minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print(
                'Duplicate entries in table, taking most recently '
                'appended')

        # reconstruct a long-format frame and deduplicate by tuple key
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # one row kept per unique (index, column) tuple
        # (presumably the most recent one, per the message above — verify
        # against map_indices_object/merge_indexer_object semantics)
        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict the minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
# Scratch script: compare a hand-rolled Cython Grouper std() against
# DataFrame.groupby(...).std().  Assumes `df` and `np` are already in scope.
import pandas._tseries as lib

grouped = df.groupby(['A', 'B'])

label_list = [ping.labels for ping in grouped.groupings]
shape = [len(ping.ids) for ping in grouped.groupings]

from pandas.core.groupby import get_group_index

group_index = get_group_index(label_list, shape).astype('i4')
ngroups = np.prod(shape)

# groupsort_indexer returns (indexer, counts); only the indexer can be
# fed to take() -- passing the whole tuple is incorrect
indexer = lib.groupsort_indexer(group_index, ngroups)[0]

values = df['C'].values.take(indexer)
group_index = group_index.take(indexer)

# NOTE(review): Grouper is handed the *unsorted* df['C'] while
# group_index has been sorted -- confirm Grouper sorts internally or
# pass the sorted `values` computed above instead
grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups)
result = grouper.get_result()
expected = grouped.std()