def validate_categories(categories, fastpath: bool = False) -> Index: """ Validates that we have good categories Parameters ---------- categories : array-like fastpath : bool Whether to skip nan and uniqueness checks Returns ------- categories : Index """ from pandas.core.indexes.base import Index if not fastpath and not is_list_like(categories): raise TypeError( f"Parameter 'categories' must be list-like, was {repr(categories)}" ) elif not isinstance(categories, ABCIndex): categories = Index._with_infer(categories, tupleize_cols=False) if not fastpath: if categories.hasnans: raise ValueError("Categorical categories cannot be null") if not categories.is_unique: raise ValueError("Categorical categories must be unique") if isinstance(categories, ABCCategoricalIndex): categories = categories.categories return categories
def _hash_ndarray( vals: np.ndarray, encoding: str = "utf8", hash_key: str = _default_hash_key, categorize: bool = True, ) -> np.ndarray: """ See hash_array.__doc__. """ dtype = vals.dtype # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(dtype, np.complex128): return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals)) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. elif dtype == bool: vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view("i8").astype("u8", copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8") else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: from pandas import ( Categorical, Index, factorize, ) codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index._with_infer(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = hash_object_array( vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xBF58476D1CE4E5B9) vals ^= vals >> 27 vals *= np.uint64(0x94D049BB133111EB) vals ^= vals >> 31 return vals
def box_expected(expected, box_cls, transpose=True): """ Helper function to wrap the expected output of a test in a given box_class. Parameters ---------- expected : np.ndarray, Index, Series box_cls : {Index, Series, DataFrame} Returns ------- subclass of box_cls """ if box_cls is pd.array: if isinstance(expected, RangeIndex): # pd.array would return an IntegerArray expected = PandasArray(np.asarray(expected._values)) else: expected = pd.array(expected) elif box_cls is Index: expected = Index._with_infer(expected) elif box_cls is Series: expected = Series(expected) elif box_cls is DataFrame: expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. But convert to two rows to avoid # single-row special cases in datetime arithmetic expected = expected.T expected = pd.concat([expected] * 2, ignore_index=True) elif box_cls is PeriodArray: # the PeriodArray constructor is not as flexible as period_array expected = period_array(expected) elif box_cls is DatetimeArray: expected = DatetimeArray(expected) elif box_cls is TimedeltaArray: expected = TimedeltaArray(expected) elif box_cls is np.ndarray: expected = np.array(expected) elif box_cls is to_array: expected = to_array(expected) else: raise NotImplementedError(box_cls) return expected