Exemple #1
0
def mask_missing(arr, values_to_mask):
    """
    Return a masking array of same size/shape as arr
    with entries equaling any member of values_to_mask set to True
    """
    dtype, values_to_mask = infer_dtype_from_array(values_to_mask)

    try:
        values_to_mask = np.array(values_to_mask, dtype=dtype)

    except Exception:
        values_to_mask = np.array(values_to_mask, dtype=object)

    na_mask = isna(values_to_mask)
    nonna = values_to_mask[~na_mask]

    mask = None
    for x in nonna:
        if mask is None:

            # numpy elementwise comparison warning
            if is_numeric_v_string_like(arr, x):
                mask = False
            else:
                mask = arr == x

            # if x is a string and arr is not, then we get False and we must
            # expand the mask to size arr.shape
            if is_scalar(mask):
                mask = np.zeros(arr.shape, dtype=bool)
        else:

            # numpy elementwise comparison warning
            if is_numeric_v_string_like(arr, x):
                mask |= False
            else:
                mask |= arr == x

    if na_mask.any():
        if mask is None:
            mask = isna(arr)
        else:
            mask |= isna(arr)

    # GH 21977
    if mask is None:
        mask = np.zeros(arr.shape, dtype=bool)

    return mask
Exemple #2
0
def mask_missing(arr, values_to_mask):
    """
    Return a masking array of same size/shape as arr
    with entries equaling any member of values_to_mask set to True
    """
    dtype, values_to_mask = infer_dtype_from_array(values_to_mask)

    try:
        values_to_mask = np.array(values_to_mask, dtype=dtype)

    except Exception:
        values_to_mask = np.array(values_to_mask, dtype=object)

    na_mask = isna(values_to_mask)
    nonna = values_to_mask[~na_mask]

    mask = None
    for x in nonna:
        if mask is None:

            # numpy elementwise comparison warning
            if is_numeric_v_string_like(arr, x):
                mask = False
            else:
                mask = arr == x

            # if x is a string and arr is not, then we get False and we must
            # expand the mask to size arr.shape
            if is_scalar(mask):
                mask = np.zeros(arr.shape, dtype=bool)
        else:

            # numpy elementwise comparison warning
            if is_numeric_v_string_like(arr, x):
                mask |= False
            else:
                mask |= arr == x

    if na_mask.any():
        if mask is None:
            mask = isna(arr)
        else:
            mask |= isna(arr)

    # GH 21977
    if mask is None:
        mask = np.zeros(arr.shape, dtype=bool)

    return mask
Exemple #3
0
def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray:
    """
    Return a masking array of same size/shape as arr
    with entries equaling any member of values_to_mask set to True

    Parameters
    ----------
    arr : ArrayLike
    values_to_mask: list, tuple, or scalar

    Returns
    -------
    np.ndarray[bool]
    """
    # When called from Block.replace/replace_list, values_to_mask is a scalar
    #  known to be holdable by arr.
    # When called from Series._single_replace, values_to_mask is tuple or list
    dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
    values_to_mask = np.array(values_to_mask, dtype=dtype)

    na_mask = isna(values_to_mask)
    nonna = values_to_mask[~na_mask]

    # GH 21977
    mask = np.zeros(arr.shape, dtype=bool)
    for x in nonna:
        if is_numeric_v_string_like(arr, x):
            # GH#29553 prevent numpy deprecation warnings
            pass
        else:
            mask |= arr == x

    if na_mask.any():
        mask |= isna(arr)

    return mask
Exemple #4
0
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to" "safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError(
            "Only list-like objects or None are allowed to be"
            "passed to safe_sort as labels"
        )
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables
        )
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
        if verify:
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode="wrap")

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
Exemple #5
0
    def test_infer_dtype_from_array(self, arr, expected, pandas_dtype):

        dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype)
        assert is_dtype_equal(dtype, expected)
Exemple #6
0
def test_infer_dtype_from_array(arr, expected, pandas_dtype):
    dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype)
    assert is_dtype_equal(dtype, expected)
Exemple #7
0
    def test_infer_dtype_from_array(self, arr, expected):

        # these infer specifically to numpy dtypes
        dtype, _ = infer_dtype_from_array(arr)
        assert dtype == expected
Exemple #8
0
def test_infer_dtype_from_scalar(value, expected, pandas_dtype):
    dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=pandas_dtype)
    assert is_dtype_equal(dtype, expected)

    with pytest.raises(TypeError, match="must be list-like"):
        infer_dtype_from_array(value, pandas_dtype=pandas_dtype)
Exemple #9
0
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError("Only list-like objects are allowed to be passed to"
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):

        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be"
                        "passed to safe_sort as labels")
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
Exemple #10
0
    def test_infer_dtype_from_array(self, arr, expected):

        # these infer specifically to numpy dtypes
        dtype, _ = infer_dtype_from_array(arr)
        assert dtype == expected