def result_ilocs(self) -> npt.NDArray[np.intp]: """ Get the original integer locations of result_index in the input. """ # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. group_index = get_group_index( self.codes, self.shape, sort=self._sort, xnull=True ) group_index, _ = compress_group_index(group_index, sort=self._sort) if self.has_dropped_na: mask = np.where(group_index >= 0) # Count how many gaps are caused by previous null values for each position null_gaps = np.cumsum(group_index == -1)[mask] group_index = group_index[mask] result = get_group_index_sorter(group_index, self.ngroups) if self.has_dropped_na: # Shift by the number of prior null gaps result += np.take(null_gaps, result) return result
def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index))
def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.intp]]: # The first returned ndarray may have any signed integer dtype if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
def _get_compressed_labels(self): all_labels = [ping.labels for ping in self.groupings] if len(all_labels) > 1: group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] return ping.labels, np.arange(len(ping.group_index))
def _get_compressed_labels(self): all_labels = [ping.labels for ping in self.groupings] if len(all_labels) > 1: group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] return ping.labels, np.arange(len(ping.group_index))
def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: all_codes = self.codes print("All codes:") print(all_codes) if len(all_codes) > 1: group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index))
def reindex_duplicates(df, subset=None): """ Replace index for all duplicate rows with the first row's index. Args: df (pandas.DataFrame): The frame to reindex subset: Only these columns are used to determine duplicate entries. Returns: pandas.DataFrame """ from pandas._libs.hashtable import SIZE_HINT_LIMIT from pandas.core.sorting import get_group_index if df.empty: return df._constructor_sliced(dtype=bool) def f(vals): labels, shape = _algorithms.factorize(vals, size_hint=min( len(df), SIZE_HINT_LIMIT)) return labels.astype("i8", copy=False), len(shape) if subset is None: subset = df.columns elif (not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) and subset in df.columns): subset = (subset, ) # needed for mypy since can't narrow types using np.iterable subset = cast(Iterable, subset) # Verify all columns in subset exist in the queried dataframe # Otherwise, raise a KeyError, same as if you try to __getitem__ with a # key that doesn't exist. diff = Index(subset).difference(df.columns) if not diff.empty: raise KeyError(diff) vals = (col.values for name, col in df.items() if name in subset) labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) uids = np.unique(ids, return_index=True, return_inverse=True) first_ids = df.index[uids[1][uids[2]]] return df._constructor(data=df.to_numpy(), index=first_ids, columns=df.columns)
def factorize_2d(a): # column compressor size_hint = min(a.shape[0], SIZE_HINT_LIMIT) def f(vals): codes, labels = pd.factorize(vals, sort=True, size_hint=size_hint) return codes.astype("i8", copy=False), labels, len(labels) # get grouped ids vals = (col for col in a.T) codes, labels, shape = map(list, zip(*map(f, vals))) gids = get_group_index(codes, shape, sort=False, xnull=False) # get unique ids gcode, glabs = pd.factorize(gids, sort=True) coords = np.unravel_index(glabs, shape) ulabs = np.vstack([lab[crd] for crd, lab in zip(coords, labels)]).T return gcode, ulabs
def _unstack_multiple(data, clocs, fill_value=None): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index # GH 19966 Make sure if MultiIndexed index has tuple name, they will be # recognised as a whole if clocs in index.names: clocs = [clocs] clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = tuple(len(x) for x in clevels) group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if not rlocs: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: dummy_index = MultiIndex( levels=rlevels + [obs_ids], codes=rcodes + [comp_ids], names=rnames + ["__placeholder__"], verify_integrity=False, ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val, fill_value=fill_value) clocs = [v if v < val else v - 1 for v in clocs] return result # GH#42579 deep=False to avoid consolidating dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns assert isinstance(unstcols, MultiIndex) # for mypy new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_codes = [unstcols.codes[0]] for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) new_columns = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def _unstack_multiple(data, clocs, fill_value=None): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: dummy_index = MultiIndex( levels=rlevels + [obs_ids], codes=rcodes + [comp_ids], names=rnames + ["__placeholder__"], verify_integrity=False, ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val) clocs = [v if i > v else v - 1 for v in clocs] return result dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_codes = [unstcols.codes[0]] for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) new_columns = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def testit(codes_list, shape): group_index = get_group_index(codes_list, shape, sort=True, xnull=True) codes_list2 = decons_group_index(group_index, shape) for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b)
def get_compressed_ids(labels, sizes): ids = get_group_index(labels, sizes, sort=True, xnull=False) return compress_group_index(ids, sort=True)
def _unstack_multiple(data, clocs, fill_value=None): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name='__placeholder__') else: dummy_index = MultiIndex(levels=rlevels + [obs_ids], codes=rcodes + [comp_ids], names=rnames + ['__placeholder__'], verify_integrity=False) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val) clocs = [v if i > v else v - 1 for v in clocs] return result dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_codes = [unstcols.codes[0]] for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) new_columns = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def testit(label_list, shape): group_index = get_group_index(label_list, shape, sort=True, xnull=True) label_list2 = decons_group_index(group_index, shape) for a, b in zip(label_list, label_list2): assert (np.array_equal(a, b))
def testit(label_list, shape): group_index = get_group_index(label_list, shape, sort=True, xnull=True) label_list2 = decons_group_index(group_index, shape) for a, b in zip(label_list, label_list2): tm.assert_numpy_array_equal(a, b)
def get_compressed_ids(labels, sizes): ids = get_group_index(labels, sizes, sort=True, xnull=False) return compress_group_index(ids, sort=True)