Beispiel #1
0
    def get_result(self):
        """Assemble the unstacked DataFrame from the new values, columns
        and index, dropping column levels that were never observed and
        realigning rows when the new index contains null entries."""
        # TODO: find a better way than this masking business
        new_values, value_mask = self.get_new_values()
        new_columns = self.get_new_columns()
        new_index = self.get_new_index()

        # Drop column levels that were never observed (rare case).
        if new_values.shape[1] > 0:
            _, obs_ids = _compress_group_index(self.sorted_labels[-1])
            if len(obs_ids) < self.full_shape[1]:
                observed = (value_mask.sum(0) > 0).nonzero()[0]
                new_values = com.take_nd(new_values, observed, axis=1)
                new_columns = new_columns[observed]

        # The new index may be longer than the computed rows.
        if len(new_index) != new_values.shape[0]:
            null_mask = isnull(new_index)
            if null_mask.any():
                # spread the computed rows into a NaN-filled frame of
                # the full index length
                positions = np.arange(len(new_index))
                expanded = np.empty((len(new_index), new_values.shape[1]))
                expanded.fill(np.nan)
                target_rows = com._ensure_int64(positions[~null_mask])
                for src, dest in enumerate(target_rows):
                    expanded[dest] = new_values[src]
                new_values = expanded
            else:
                new_index = new_index.take(self.unique_groups)

        return DataFrame(new_values, index=new_index, columns=new_columns)
Beispiel #2
0
    def get_result(self):
        """Build the final unstacked DataFrame.

        Filters out column levels with no observations and, when the new
        index has null entries, scatters the computed rows into a
        NaN-filled array spanning the full index.
        """
        # TODO: find a better way than this masking business
        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1]:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case: some level values were never observed
            if len(obs_ids) < self.full_shape[1]:
                keep = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, keep, axis=1)
                columns = columns[keep]

        # we might have a missing index
        if len(index) != values.shape[0]:
            mask = isnull(index)
            if not mask.any():
                index = index.take(self.unique_groups)
            else:
                row_ids = np.arange(len(index))
                filled = np.empty((len(index), values.shape[1]))
                filled.fill(np.nan)
                indexer = com._ensure_int64(row_ids[~mask])
                for src, dst in enumerate(indexer):
                    filled[dst] = values[src]
                values = filled

        return DataFrame(values, index=index, columns=columns)
Beispiel #3
0
    def _make_selectors(self):
        """Precompute the group index, validity mask, unique groups and
        compressor used when spreading values into the unstacked layout."""
        new_levels = self.new_index_levels

        # one integer id per row, built from all levels except the
        # level being unstacked
        level_sizes = [len(lev) for lev in new_levels]
        group_index = get_group_index(self.sorted_labels[:-1], level_sizes)

        comp_index, obs_ids = _compress_group_index(group_index)
        ngroups = len(obs_ids)
        comp_index = _ensure_platform_int(comp_index)

        stride = self.index.levshape[self.level]
        self.full_shape = ngroups, stride

        # flat position of each row in the (ngroups x stride) result
        selector = self.sorted_labels[-1] + stride * comp_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        # duplicates map two rows onto the same slot, leaving fewer
        # True bits than input rows
        if mask.sum() < len(self.index):
            raise ReshapeError('Index contains duplicate entries, '
                               'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
Beispiel #4
0
    def _make_selectors(self):
        """Compute and cache the selectors (group index, mask, unique
        groups and compressor) needed by the reshape operation."""
        sizes = [len(x) for x in self.new_index_levels]

        # make the mask
        group_index = get_group_index(self.sorted_labels[:-1], sizes)
        comp_index, obs_ids = _compress_group_index(group_index)
        ngroups = len(obs_ids)

        comp_index = _ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level]
        self.full_shape = ngroups, stride

        # each input row lands at (row group) * stride + (column label)
        selector = self.sorted_labels[-1] + stride * comp_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ReshapeError('Index contains duplicate entries, '
                               'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
Beispiel #5
0
    def get_result(self):
        """Assemble the unstacked DataFrame.

        Drops column levels that were never observed in the data and,
        when the original values were categorical, re-wraps each result
        column as a Categorical with the original categories.
        """
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            # BUG FIX: preserve the source Categorical's ordered flag
            # instead of forcing ordered=True, which incorrectly marked
            # unordered categoricals as ordered after unstacking.
            values = [
                Categorical.from_array(
                    values[:, i],
                    categories=self.is_categorical.categories,
                    ordered=self.is_categorical.ordered)
                for i in range(values.shape[-1])
            ]

        return DataFrame(values, index=index, columns=columns)
Beispiel #6
0
    def get_result(self):
        """Return the unstacked DataFrame, filtering out unobserved
        column levels and re-coercing categorical columns."""
        # TODO: find a better way than this masking business
        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            if len(obs_ids) < self.full_shape[1]:
                # rare case: some level values were never observed
                observed = (value_mask.sum(0) > 0).nonzero()[0]
                values = algos.take_nd(values, observed, axis=1)
                columns = columns[observed]

        # rebuild each column as a Categorical when the input was one
        if self.is_categorical is not None:
            cat = self.is_categorical
            values = [Categorical(values[:, i],
                                  categories=cat.categories,
                                  ordered=cat.ordered)
                      for i in range(values.shape[-1])]

        return DataFrame(values, index=index, columns=columns)
Beispiel #7
0
    def _make_sorted_values_labels(self):
        """Sort values and index labels so that all rows belonging to the
        same output group are contiguous; the level being unstacked is
        moved to the end of the sort key."""
        lev = self.level
        labs = self.index.labels
        levs = self.index.levels

        # move the unstacked level's labels/levels to the end
        to_sort = labs[:lev] + labs[lev + 1:] + [labs[lev]]
        sizes = [len(x) for x in levs[:lev] + levs[lev + 1:] + [levs[lev]]]

        group_index = get_group_index(to_sort, sizes)
        comp_index, obs_ids = _compress_group_index(group_index)

        indexer = lib.groupsort_indexer(comp_index, len(obs_ids))[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = com.take_2d(self.values, indexer, axis=0)
        self.sorted_labels = [lab.take(indexer) for lab in to_sort]
Beispiel #8
0
    def get_result(self):
        """Build the unstacked DataFrame, dropping column levels that
        were never observed in the data."""
        # TODO: find a better way than this masking business
        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        if values.shape[1] > 0:
            # filter out missing levels
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            if len(obs_ids) < self.full_shape[1]:
                # rare case: some level values were never observed
                keep = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_2d(values, keep, axis=1)
                columns = columns[keep]

        return DataFrame(values, index=index, columns=columns)
Beispiel #9
0
    def get_result(self):
        """Assemble and return the unstacked DataFrame."""
        # TODO: find a better way than this masking business
        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # drop column levels with no observations (rare)
        if values.shape[1] > 0:
            _, obs_ids = _compress_group_index(self.sorted_labels[-1])
            if len(obs_ids) < self.full_shape[1]:
                observed_cols = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, observed_cols, axis=1)
                columns = columns[observed_cols]

        return DataFrame(values, index=index, columns=columns)
Beispiel #10
0
    def _make_sorted_values_labels(self):
        """Reorder values and labels so rows of the same group are
        adjacent; the level being unstacked is moved to the end."""
        v = self.level

        labs = self.index.labels
        levs = self.index.levels
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(lev) for lev in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = _compress_group_index(
            get_group_index(to_sort, sizes))
        ngroups = len(obs_ids)

        indexer = _ensure_platform_int(
            algos.groupsort_indexer(comp_index, ngroups)[0])

        self.sorted_values = com.take_2d(self.values, indexer, axis=0)
        self.sorted_labels = [lab.take(indexer) for lab in to_sort]
Beispiel #11
0
    def _make_sorted_values_labels(self):
        """Sort values/labels by group, with the unstacked level last.

        Skips the group-index compression step when the number of
        possible groups is small enough to sort on directly.
        """
        v = self.level
        labs = self.index.labels
        levs = self.index.levels

        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        group_index = get_group_index(to_sort, sizes)
        max_groups = np.prod(sizes)
        if max_groups <= 1000000:
            # cheap enough to sort on the raw group index directly
            comp_index, ngroups = group_index, max_groups
        else:
            comp_index, obs_ids = _compress_group_index(group_index)
            ngroups = len(obs_ids)

        indexer = lib.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = self.values.take(indexer, axis=0)
        self.sorted_labels = [lab.take(indexer) for lab in to_sort]
Beispiel #12
0
def get_compressed_ids(labels, sizes):
    """Map per-level label arrays to compressed group ids.

    Parameters
    ----------
    labels : list of integer arrays
        Level codes, one array per level.
    sizes : list of int
        Number of distinct values in each level.

    Returns
    -------
    comp_index : array of compressed group ids
    obs_ids : array of observed (uncompressed) group ids

    Notes
    -----
    When the product of ``sizes`` would overflow int64, levels are
    compressed a prefix at a time until the remaining product fits.
    The original also built an unused null ``mask`` in the overflow
    branch; that dead computation has been removed.
    """
    # fast path: the flat group index fits in int64
    if com._long_prod(sizes) < 2 ** 63:
        group_index = get_group_index(labels, sizes)
        return _compress_group_index(group_index)

    # overflow: repeatedly compress the longest prefix whose size
    # product still fits, replacing it with a single compressed level
    while com._long_prod(sizes) >= 2 ** 63:
        i = len(sizes)
        while com._long_prod(sizes[:i]) >= 2 ** 63:
            i -= 1

        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
Beispiel #13
0
def get_compressed_ids(labels, sizes):
    """Compress per-level label arrays into group ids.

    ``labels`` is a list of integer code arrays (one per level) and
    ``sizes`` the cardinality of each level.  Returns the pair
    ``(comp_index, obs_ids)`` from ``_compress_group_index``.

    When the product of ``sizes`` would overflow int64, the leading
    levels are recursively compressed until the product fits.  The dead
    ``n``/``mask`` computation present in the original overflow branch
    (computed but never read) has been removed.
    """
    if com._long_prod(sizes) < 2**63:
        # no overflow: compress the flat group index directly
        group_index = get_group_index(labels, sizes)
        return _compress_group_index(group_index)

    # shrink the level list until the size product fits in int64
    while com._long_prod(sizes) >= 2**63:
        i = len(sizes)
        while com._long_prod(sizes[:i]) >= 2**63:
            i -= 1

        # collapse the first i levels into one compressed level
        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
Beispiel #14
0
def _unstack_multiple(data, clocs):
    """Unstack several index levels at once.

    Parameters
    ----------
    data : Series or DataFrame with a MultiIndex
    clocs : sequence of level names/numbers to move into the columns

    Returns
    -------
    Series or DataFrame with the requested levels unstacked.
    """
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # normalize level names to level numbers
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    # split the index into the column part (c*) and the row part (r*)
    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    # collapse the levels being unstacked into one placeholder level
    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            # Unstack one level at a time; after each step every level
            # numbered above the removed one shifts down by one.
            # BUG FIX: the original renumbered by comparing against the
            # loop counter (the comprehension variable shadowed ``val``),
            # which picked the wrong levels whenever ``clocs`` was not
            # sorted ascending.
            result = data
            while clocs:
                val = clocs.pop(0)
                result = result.unstack(val)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        # reconstruct the original level codes for the new column index
        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
Beispiel #15
0
def get_compressed_ids(labels, sizes):
    """Return compressed group ids for the given level label arrays."""
    from pandas.core.groupby import get_flat_ids

    flat_ids = get_flat_ids(labels, sizes, True)
    return _compress_group_index(flat_ids, sort=True)
Beispiel #16
0
def _unstack_multiple(data, clocs):
    """Unstack multiple MultiIndex levels in one operation.

    Parameters
    ----------
    data : Series or DataFrame whose index is a MultiIndex
    clocs : sequence of level names/numbers to unstack

    Returns
    -------
    The unstacked Series or DataFrame.
    """
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # convert level names to level numbers
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    # partition the index levels: c* go to columns, r* stay as rows
    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    # replace the unstacked levels with a single placeholder level
    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            # Hierarchical columns: unstack one level at a time and
            # renumber the remaining levels, since removing a level
            # shifts every higher-numbered level down by one.
            # BUG FIX: the original compared each remaining level to the
            # loop counter (the comprehension variable shadowed ``val``)
            # rather than the just-unstacked level number, breaking
            # non-ascending ``clocs``.
            result = data
            while clocs:
                val = clocs.pop(0)
                result = result.unstack(val)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index, columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        # rebuild the original level codes for the new column index
        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels,
                             labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
Beispiel #17
0
def get_compressed_ids(labels, sizes):
    """Compress the flat group ids derived from ``labels``/``sizes``."""
    from pandas.core.groupby import get_flat_ids

    return _compress_group_index(get_flat_ids(labels, sizes, True),
                                 sort=True)
Beispiel #18
0
def get_compressed_ids(labels, sizes):
    """Build the flat group index for the labels and compress it."""
    from pandas.core.groupby import get_group_index

    group_ids = get_group_index(labels, sizes, sort=True, xnull=False)
    return _compress_group_index(group_ids, sort=True)
Beispiel #19
0
def get_compressed_ids(labels, sizes):
    """Return (comp_ids, obs_group_ids) for the given level labels."""
    from pandas.core.groupby import get_group_index

    return _compress_group_index(
        get_group_index(labels, sizes, sort=True, xnull=False), sort=True)