def _get_table_for_categories(values, categories):
        # ripped out of _get_codes_for_values() in pandas Categorical module
        if not pd_cat_module.is_dtype_equal(values.dtype, categories.dtype):
            values = ensure_object(values)
            categories = ensure_object(categories)

        (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
        (_, _), cats = _get_data_algo(categories, _hashtables)
        t = hash_klass(len(cats))
        t.map_locations(cats)
        return t
Ejemplo n.º 2
0
def _get_codes_for_values(values, levels):
    from pandas.core.algorithms import _get_data_algo, _hashtables
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)
    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(levels))
    t.map_locations(levels)
    return com._ensure_platform_int(t.lookup(values))
Ejemplo n.º 3
0
def _get_codes_for_values(values, levels):
    from pandas.core.algorithms import _get_data_algo, _hashtables
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)
    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(levels))
    t.map_locations(levels)
    return com._ensure_platform_int(t.lookup(values))
Ejemplo n.º 4
0
def _get_codes_for_values(values, categories):
    """"
    utility routine to turn values into codes given the specified categories
    """

    from pandas.core.algorithms import _get_data_algo, _hashtables
    if values.dtype != categories.dtype:
        values = com._ensure_object(values)
        categories = com._ensure_object(categories)
    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(categories))
    t.map_locations(com._values_from_object(categories))
    return com._ensure_platform_int(t.lookup(values))
Ejemplo n.º 5
0
def _get_codes_for_values(values, levels):
    """"
    utility routine to turn values into codes given the specified levels
    """

    from pandas.core.algorithms import _get_data_algo, _hashtables
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)
    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(levels))
    t.map_locations(com._values_from_object(levels))
    return com._ensure_platform_int(t.lookup(values))
Ejemplo n.º 6
0
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to" "safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError(
            "Only list-like objects or None are allowed to be"
            "passed to safe_sort as labels"
        )
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables
        )
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
        if verify:
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode="wrap")

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
Ejemplo n.º 7
0
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError("Only list-like objects are allowed to be passed to"
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):

        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be"
                        "passed to safe_sort as labels")
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)