Esempio n. 1
0
    def _make_labels(self):
        """Factorize ``self.grouper`` and cache the result on the instance.

        Stores three attributes:

        * ``_labels`` -- integer label for each element of the grouper,
        * ``_group_index`` -- ``Index`` of the unique keys, named ``self.name``,
        * ``_counts`` -- the counts array returned by the factorizer.

        When ``self.sort`` is set and at least one group exists, the unique
        keys are put in sorted order and the labels/counts are remapped to
        match.

        Raises
        ------
        Exception
            If ``self._was_factor`` is set; this method is not meant to be
            used in that case.
        """
        if self._was_factor:  # pragma: no cover
            raise Exception('Should not call this method grouping by level')

        keys = self.grouper
        # The factorizer is fed object arrays, so cast anything else.
        if keys.dtype != np.object_:
            keys = keys.astype('O')

        # khash-backed factorization, uniques kept in factorizer order
        factorizer = lib.Factorizer(len(keys))
        labels, counts = factorizer.factorize(keys, sort=False)
        uniques = Index(factorizer.uniques, name=self.name)

        if self.sort and len(counts) > 0:
            order = uniques.argsort()
            n_uniques = len(order)

            # Inverse permutation: old label -> position in sorted uniques.
            old_to_new = np.empty(n_uniques, dtype=np.int32)
            old_to_new.put(order, np.arange(n_uniques))

            # Negative labels must survive the remap unchanged as -1
            # (presumably the factorizer's missing-value marker -- confirm).
            negative = labels < 0
            labels = old_to_new.take(labels)
            np.putmask(labels, negative, -1)

            uniques = uniques.take(order)
            counts = counts.take(order)

        self._labels = labels
        self._group_index = uniques
        self._counts = counts
Esempio n. 2
0
def khash_unique(values, expected_K, size_hint=False, sort=False,
                 memory=False):
    """Compute the unique elements of ``values`` with ``lib.Factorizer``.

    Parameters
    ----------
    values : sequence passed to ``Factorizer.unique``
    expected_K : int
        Number of unique elements the result must contain.
    size_hint : bool, default False
        If true, size the factorizer with ``len(values)``; otherwise a fixed
        size of 100 is used.
    sort : bool, default False
        If true, sort the uniques in place before the length check.
    memory : bool, default False
        If true, run a garbage collection first and return the change in the
        process RSS across the computation instead of checking the result.

    Returns
    -------
    The RSS difference when ``memory`` is true, otherwise ``None``.
    """
    if memory:
        gc.collect()
        before_mem = proc.get_memory_info().rss

    table_size = len(values) if size_hint else 100
    # Keep the factorizer bound to a local so it is still alive when the
    # memory delta is taken below.
    rizer = lib.Factorizer(table_size)
    uniques = rizer.unique(values)

    if memory:
        return proc.get_memory_info().rss - before_mem

    if sort:
        uniques.sort()
    assert len(uniques) == expected_K
Esempio n. 3
0
File: merge.py Progetto: lahi/pandas
def _factorize_objects(left_index, right_index, sort=True):
    """Factorize two indexes against one shared ``lib.Factorizer``.

    Both inputs are cast to object dtype and pushed through the same
    factorizer, left first and then right.

    Parameters
    ----------
    left_index, right_index : objects supporting ``len`` and ``astype``
    sort : bool, default True
        If true, both label arrays are passed through ``_sort_labels``
        together with the factorizer's uniques.

    Returns
    -------
    (left_labels, right_labels, count) where ``count`` is the value of
    ``Factorizer.get_count()`` after both sides have been factorized.
    """
    capacity = max(len(left_index), len(right_index))
    factorizer = lib.Factorizer(capacity)

    left_labels, _ = factorizer.factorize(left_index.astype('O'))
    right_labels, _ = factorizer.factorize(right_index.astype('O'))

    n_uniques = factorizer.get_count()

    if not sort:
        return left_labels, right_labels, n_uniques

    # TODO: na handling
    left_labels, right_labels = _sort_labels(factorizer.uniques,
                                             left_labels, right_labels)
    return left_labels, right_labels, n_uniques