def get_result(self):
    """Assemble the unstacked DataFrame from the new values/columns/index.

    TODO: find a better way than this masking business.
    """
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # Filter out level values that were never observed (rare case).
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        if len(obs_ids) < self.full_shape[1]:
            observed = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, observed, axis=1)
            columns = columns[observed]

    # We might have a missing index: if some index entries are null,
    # re-expand values to the full index length with NaN rows in the
    # null slots; otherwise just realign the index to the groups seen.
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            positions = np.arange(len(index))
            old_values = values
            values = np.empty((len(index), old_values.shape[1]))
            values.fill(np.nan)
            dest_rows = com._ensure_int64(positions[~mask])
            # Scatter the existing rows into the non-null positions.
            for src_row, dest_row in enumerate(dest_rows):
                values[dest_row] = old_values[src_row]
        else:
            index = index.take(self.unique_groups)

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            keep = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, keep, axis=1)
            columns = columns[keep]

    # we might have a missing index
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            # Allocate a NaN-filled array spanning the full index and
            # copy each existing row into its non-null destination slot.
            row_ids = np.arange(len(index))
            prior = values
            values = np.empty((len(index), prior.shape[1]))
            values.fill(np.nan)
            targets = com._ensure_int64(row_ids[~mask])
            for src, dst in enumerate(targets):
                values[dst] = prior[src]
        else:
            index = index.take(self.unique_groups)

    return DataFrame(values, index=index, columns=columns)
def _make_selectors(self):
    """Build the mask and selector arrays used to place each value into
    the reshaped (ngroups x stride) result grid."""
    new_levels = self.new_index_levels

    # Group key over every level except the one being unstacked.
    group_index = get_group_index(self.sorted_labels[:-1],
                                  [len(lev) for lev in new_levels])
    comp_index, obs_ids = _compress_group_index(group_index)
    ngroups = len(obs_ids)
    comp_index = _ensure_platform_int(comp_index)

    stride = self.index.levshape[self.level]
    self.full_shape = ngroups, stride

    # Flat position of each observation in the result grid.
    selector = self.sorted_labels[-1] + stride * comp_index
    mask = np.zeros(np.prod(self.full_shape), dtype=bool)
    mask.put(selector, True)

    # Two rows mapping to the same slot means the reshape is ambiguous.
    if mask.sum() < len(self.index):
        raise ReshapeError('Index contains duplicate entries, '
                           'cannot reshape')

    self.group_index = comp_index
    self.mask = mask
    self.unique_groups = obs_ids
    self.compressor = comp_index.searchsorted(np.arange(ngroups))
def get_result(self):
    """Assemble the unstacked DataFrame, restoring categorical dtype.

    TODO: find a better way than this masking business.
    """
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        # BUG FIX: propagate the original `ordered` flag instead of
        # unconditionally forcing ordered=True, which silently converted
        # unordered categoricals into ordered ones on unstack.
        values = [Categorical.from_array(
            values[:, i],
            categories=self.is_categorical.categories,
            ordered=self.is_categorical.ordered)
            for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    """Assemble the unstacked DataFrame, re-wrapping categorical columns.

    TODO: find a better way than this masking business.
    """
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # Drop columns for level values never observed in the data (rare).
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        if len(obs_ids) < self.full_shape[1]:
            observed = (value_mask.sum(0) > 0).nonzero()[0]
            values = algos.take_nd(values, observed, axis=1)
            columns = columns[observed]

    # Coerce each column back to Categorical, preserving the original
    # categories and ordered flag.
    if self.is_categorical is not None:
        cats = self.is_categorical.categories
        is_ordered = self.is_categorical.ordered
        values = [Categorical(values[:, col],
                              categories=cats,
                              ordered=is_ordered)
                  for col in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def _make_sorted_values_labels(self):
    """Sort values and labels so the level being unstacked varies fastest."""
    v = self.level
    labs = self.index.labels
    levs = self.index.levels

    # Rotate the target level to the last position.
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(lev) for lev in levs[:v] + levs[v + 1:] + [levs[v]]]

    group_index = get_group_index(to_sort, sizes)
    comp_index, obs_ids = _compress_group_index(group_index)
    indexer = lib.groupsort_indexer(comp_index, len(obs_ids))[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = com.take_2d(self.values, indexer, axis=0)
    self.sorted_labels = [lab.take(indexer) for lab in to_sort]
def get_result(self):
    """Build the unstacked DataFrame.

    TODO: find a better way than this masking business.
    """
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # Drop columns for level values that never actually occur (rare).
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        if len(obs_ids) < self.full_shape[1]:
            observed = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_2d(values, observed, axis=1)
            columns = columns[observed]

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            keep = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, keep, axis=1)
            columns = columns[keep]

    return DataFrame(values, index=index, columns=columns)
def _make_sorted_values_labels(self):
    """Sort values/labels so that the unstacked level is the fastest-varying."""
    lev_no = self.level
    labs = self.index.labels
    levs = self.index.levels

    # Move the level being unstacked to the end.
    to_sort = labs[:lev_no] + labs[lev_no + 1:] + [labs[lev_no]]
    sizes = [len(x)
             for x in levs[:lev_no] + levs[lev_no + 1:] + [levs[lev_no]]]

    group_index = get_group_index(to_sort, sizes)
    comp_index, obs_ids = _compress_group_index(group_index)
    ngroups = len(obs_ids)

    indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = com.take_2d(self.values, indexer, axis=0)
    self.sorted_labels = [lab.take(indexer) for lab in to_sort]
def _make_sorted_values_labels(self):
    """Sort values/labels so that the unstacked level varies fastest.

    Skips group-index compression when the number of possible groups is
    small enough that direct indexing is cheaper.
    """
    v = self.level
    labs = self.index.labels
    levs = self.index.levels
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    group_index = get_group_index(to_sort, sizes)

    # BUG FIX: compute the product with Python ints.  np.prod(sizes)
    # silently overflows int64 for very wide indexes, wrapping negative,
    # which made the `> 1000000` guard fail and ran the uncompressed
    # path with a garbage ngroups.
    max_groups = 1
    for size in sizes:
        max_groups *= int(size)

    if max_groups > 1000000:
        comp_index, obs_ids = _compress_group_index(group_index)
        ngroups = len(obs_ids)
    else:
        comp_index, ngroups = group_index, max_groups

    indexer = lib.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = self.values.take(indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
def get_compressed_ids(labels, sizes):
    """Compress the group index built from ``labels``/``sizes`` into
    dense observed ids.

    When the full cartesian size of the levels would overflow int64, a
    prefix of the levels is recursively pre-compressed into one level
    until the remaining product fits, then the function recurses.

    Returns
    -------
    (comp_index, obs_ids)
    """
    if com._long_prod(sizes) < 2 ** 63:
        # No overflow: compress directly.
        group_index = get_group_index(labels, sizes)
        return _compress_group_index(group_index)

    # Overflow path.  (The original also built a null-mask over the
    # labels here, but it was never read — dead code removed.)
    while com._long_prod(sizes) >= 2 ** 63:
        # Find the longest prefix whose product still overflows, and
        # compress everything before it into a single dense level.
        i = len(sizes)
        while com._long_prod(sizes[:i]) >= 2 ** 63:
            i -= 1

        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
def get_compressed_ids(labels, sizes):
    """Compress the group index for ``labels``/``sizes`` into dense ids.

    If the cartesian product of the level sizes would overflow int64,
    leading levels are recursively folded into a single compressed level
    until the product fits, then the function recurses on the reduced
    problem.

    Returns
    -------
    (comp_index, obs_ids)
    """
    if com._long_prod(sizes) < 2**63:
        # No overflow: build and compress the flat group index directly.
        group_index = get_group_index(labels, sizes)
        return _compress_group_index(group_index)

    # Overflow path.  NOTE: the original computed a null-mask over the
    # labels here that was never used afterwards — removed as dead code.
    while com._long_prod(sizes) >= 2**63:
        # Shrink the prefix until its product fits in int64.
        i = len(sizes)
        while com._long_prod(sizes[:i]) >= 2**63:
            i -= 1

        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
def _unstack_multiple(data, clocs):
    """Unstack several index levels at once.

    Parameters
    ----------
    data : Series or DataFrame with a MultiIndex
    clocs : sequence of level names/numbers to move to the columns

    Returns
    -------
    Series or DataFrame with the requested levels unstacked.
    """
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]
    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    # Collapse the unstacked levels into a single placeholder level.
    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                # BUG FIX: after unstacking level `val`, every remaining
                # level number greater than `val` shifts down by one.
                # The original comprehension shadowed `val` and compared
                # against the loop counter `i`, renumbering the wrong
                # levels.
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        # Reconstruct the labels of the original levels from the
        # placeholder positions.
        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
def get_compressed_ids(labels, sizes):
    """Return (comp_ids, obs_group_ids) for the given label arrays."""
    from pandas.core.groupby import get_flat_ids

    flat_ids = get_flat_ids(labels, sizes, True)
    return _compress_group_index(flat_ids, sort=True)
def get_compressed_ids(labels, sizes):
    """Compress the flat group index for ``labels`` into dense observed ids."""
    from pandas.core.groupby import get_group_index

    # xnull=False: null (-1) labels are not given special treatment here
    # — TODO(review): confirm against get_group_index's contract.
    flat_ids = get_group_index(labels, sizes, sort=True, xnull=False)
    return _compress_group_index(flat_ids, sort=True)