Example #1
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> np.ndarray:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash key used when hashing string values (encoded with the same encoding).
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        vals = cast("Categorical", vals)
        return _hash_categorical(vals, encoding, hash_key)

    elif isinstance(vals, ABCExtensionArray):
        vals, _ = vals._values_for_factorize()

    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
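
A minimal usage sketch of this variant, assuming it is reachable through the public pandas.util.hash_array / pandas.util.hash_pandas_object wrappers and a pandas version matching the code above (inputs and assertions are illustrative):

import numpy as np
import pandas as pd

# A 1d object ndarray: each element is hashed to a deterministic uint64.
arr = np.array(["a", "b", "a"], dtype=object)
hashed = pd.util.hash_array(arr)
assert hashed.dtype == np.uint64 and len(hashed) == len(arr)
assert hashed[0] == hashed[2]  # equal values hash equally

# A Categorical takes the dedicated branch above: hash the categories,
# then remap the codes onto those hash values.
cat = pd.Categorical(["a", "b", "a"])
hashed_cat = pd.util.hash_array(cat)

# Plain lists have no dtype and raise the "must pass a ndarray-like" TypeError;
# a Series has a dtype but is not an ndarray/ExtensionArray, so it hits the
# GH#42003 branch, which points to hash_pandas_object instead.
try:
    pd.util.hash_array(pd.Series([1, 2, 3]))
except TypeError:
    hashed_series = pd.util.hash_pandas_object(pd.Series([1, 2, 3]), index=False)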
Example #2
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> np.ndarray:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash key used when hashing string values (encoded with the same encoding).
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        # error: Incompatible types in assignment (expression has type "Categorical",
        # variable has type "ndarray")
        vals = cast("Categorical", vals)  # type: ignore[assignment]
        # error: Argument 1 to "_hash_categorical" has incompatible type "ndarray";
        # expected "Categorical"
        return _hash_categorical(vals, encoding, hash_key)  # type: ignore[arg-type]
    elif is_extension_array_dtype(dtype):
        # error: Incompatible types in assignment (expression has type "ndarray",
        # variable has type "ExtensionArray")
        # error: "ndarray" has no attribute "_values_for_factorize"
        vals, _ = vals._values_for_factorize()  # type: ignore[assignment,attr-defined]

    # error: Argument 1 to "_hash_ndarray" has incompatible type "ExtensionArray";
    # expected "ndarray"
    return _hash_ndarray(vals, encoding, hash_key, categorize)  # type: ignore[arg-type]
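
This older variant differs mainly in dispatching on is_extension_array_dtype and in the inline mypy suppressions. A small sketch of the remaining parameters, again assuming the public pandas.util.hash_array wrapper; the alternative key below is purely illustrative:

import numpy as np
import pandas as pd

# categorize only changes how duplicated object values are hashed internally
# (factorize first, hash the unique categories, remap the codes); the resulting
# hash values are the same either way.
vals = np.array(["x", "y"] * 1000, dtype=object)
h_categorized = pd.util.hash_array(vals, categorize=True)
h_direct = pd.util.hash_array(vals, categorize=False)
assert (h_categorized == h_direct).all()

# The default _default_hash_key is a 16-character string; a different key of the
# same length yields different, but still deterministic, hashes for string data
# (independent of PYTHONHASHSEED).
h_other_key = pd.util.hash_array(vals, hash_key="abcdefghijklmnop")
assert not (h_other_key == h_categorized).all()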