Esempio n. 1
0
def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype(np.int64)
    b = np.random.randint(0, 1000, 100).astype(np.int64)

    result = algos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    expected = np.argsort(a, kind='mergesort')
    assert(np.array_equal(result, expected))

    # compare with lexsort
    key = a * 1000 + b
    result = algos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    assert(np.array_equal(result, expected))
Esempio n. 2
0
def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype(np.int64)
    b = np.random.randint(0, 1000, 100).astype(np.int64)

    result = algos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    expected = np.argsort(a, kind='mergesort')
    assert (np.array_equal(result, expected))

    # compare with lexsort
    key = a * 1000 + b
    result = algos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    assert (np.array_equal(result, expected))
Esempio n. 3
0
def get_group_index_sorter(group_index, ngroups):
    """
    _algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
                                   (count * np.log(count))))
    if do_groupsort:
        sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
                                             ngroups)
        return _ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind='mergesort')
Esempio n. 4
0
def get_group_index_sorter(group_index, ngroups):
    """
    _algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
                                   (count * np.log(count))))
    if do_groupsort:
        sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
                                             ngroups)
        return _ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind='mergesort')
Esempio n. 5
0
    def _make_sorted_values_labels(self):
        v = self.level

        labs = list(self.index.labels)
        levs = list(self.index.levels)
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Esempio n. 6
0
    def _make_sorted_values_labels(self):
        v = self.level

        labs = list(self.index.labels)
        levs = list(self.index.levels)
        to_sort = labs[:v] + labs[v + 1 :] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Esempio n. 7
0
    def _make_sorted_values_labels(self):
        v = self.level

        labs = self.index.labels
        levs = self.index.levels
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        group_index = get_group_index(to_sort, sizes)
        comp_index, obs_ids = _compress_group_index(group_index)
        ngroups = len(obs_ids)

        indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = com.take_2d(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Esempio n. 8
0
    def _make_sorted_values_labels(self):
        v = self.level

        labs = self.index.labels
        levs = self.index.levels
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        group_index = get_group_index(to_sort, sizes)
        comp_index, obs_ids = _compress_group_index(group_index)
        ngroups = len(obs_ids)

        indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = com.take_2d(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]