Ejemplo n.º 1
0
def _hashtable_algo(f, values, return_dtype=None):
    """
    Call ``f`` with the hash table class and caster matching ``values``.

    Parameters
    ----------
    f : callable
        Invoked as ``f(HashTable, type_caster)``; its result is returned.
    values : array-like
        Object exposing a ``dtype`` attribute; used to choose the table.
    return_dtype : optional
        Only consulted for datetime64 / timedelta64 input, where the result
        of ``f`` is viewed as this dtype (``'M8[ns]'`` / ``'m8[ns]'`` when
        not given).

    Returns
    -------
    The result of ``f`` (viewed as ``return_dtype`` for datetime-likes).
    """
    values_dtype = values.dtype

    # plain numeric dtypes each have a dedicated hash table
    if is_float_dtype(values_dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    if is_signed_integer_dtype(values_dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    if is_unsigned_integer_dtype(values_dtype):
        return f(htable.UInt64HashTable, _ensure_uint64)

    # datetime-likes go through the int64 table and are viewed back after
    if is_datetime64_dtype(values_dtype):
        view_as = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_as)
    if is_timedelta64_dtype(values_dtype):
        view_as = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_as)

    # it's cheaper to use a String Hash Table than Object
    if lib.infer_dtype(values) == 'string':
        return f(htable.StringHashTable, _ensure_object)

    # everything else falls back to the generic object table
    return f(htable.PyObjectHashTable, _ensure_object)
Ejemplo n.º 2
0
def _get_data_algo(values, func_map):
    """
    Pick the entry of ``func_map`` matching ``values`` and coerce ``values``.

    Parameters
    ----------
    values : array-like
        Data whose dtype drives the selection.
    func_map : mapping
        Looked up with the keys ``'float64'``, ``'int64'``, ``'uint64'``,
        ``'string'`` (optional) and ``'object'``.

    Returns
    -------
    (func, values) : tuple
        The selected function and the coerced values.
    """
    func = None

    if is_float_dtype(values):
        func, values = func_map['float64'], _ensure_float64(values)
    elif needs_i8_conversion(values):
        # these dtypes are reinterpreted as int64 rather than cast
        func, values = func_map['int64'], values.view('i8')
    elif is_signed_integer_dtype(values):
        func, values = func_map['int64'], _ensure_int64(values)
    elif is_unsigned_integer_dtype(values):
        func, values = func_map['uint64'], _ensure_uint64(values)
    else:
        values = _ensure_object(values)

        # it's cheaper to use a String Hash Table than Object, but a
        # 'string' entry is optional in func_map
        if lib.infer_dtype(values) == 'string':
            try:
                func = func_map['string']
            except KeyError:
                pass

    # nothing selected so far -> generic object implementation
    return (func_map['object'] if func is None else func), values
Ejemplo n.º 3
0
def _hashtable_algo(f, values, return_dtype=None):
    """
    Run ``f(HashTable, type_caster)`` with the pair suited to ``values``.

    The hash table class and the caster are chosen from ``values.dtype``.
    For datetime64 / timedelta64 input the int64 table is used and the
    result of ``f`` is viewed as ``return_dtype`` (defaulting to
    ``'M8[ns]'`` / ``'m8[ns]'``); for every other input ``return_dtype`` is
    not used and the result of ``f`` is returned as-is.
    """
    dtype = values.dtype
    view_as = None  # only set for datetime-like input

    if is_float_dtype(dtype):
        table, caster = htable.Float64HashTable, _ensure_float64
    elif is_signed_integer_dtype(dtype):
        table, caster = htable.Int64HashTable, _ensure_int64
    elif is_unsigned_integer_dtype(dtype):
        table, caster = htable.UInt64HashTable, _ensure_uint64
    elif is_datetime64_dtype(dtype):
        view_as = return_dtype or 'M8[ns]'
        table, caster = htable.Int64HashTable, _ensure_int64
    elif is_timedelta64_dtype(dtype):
        view_as = return_dtype or 'm8[ns]'
        table, caster = htable.Int64HashTable, _ensure_int64
    elif lib.infer_dtype(values) in ['string']:
        # it's cheaper to use a String Hash Table than Object
        table, caster = htable.StringHashTable, _ensure_object
    else:
        # use Object
        table, caster = htable.PyObjectHashTable, _ensure_object

    result = f(table, caster)
    if view_as is not None:
        return result.view(view_as)
    return result
Ejemplo n.º 4
0
def _value_counts_arraylike(values, dropna=True):
    """
    Compute value-count keys and counts for array-like ``values``.

    Parameters
    ----------
    values : array-like
        Data to count; it is wrapped in a ``Series`` and its ``.values``
        are used for the dtype-based dispatch.
    dropna : bool, default True
        Forwarded to the ``htable.value_count_*`` routines. For
        datetime-like / period input, keys equal to ``iNaT`` are also
        filtered out when True. For object input, a leading ``NaN`` key
        with the number of nulls is inserted when False and nulls exist.

    Returns
    -------
    (keys, counts) : tuple
        As produced by the matching ``htable.value_count_*`` routine, with
        the adjustments described above. Datetime-like keys are cast back
        to the incoming dtype; tz-aware and period input are re-wrapped in
        ``DatetimeIndex`` / ``PeriodIndex``.
    """
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))

    # keep the original around: the tz lives on its dtype
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:

        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        # count on the int64 representation
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        # the null mask is only needed when nulls must be reported, so
        # skip the scan entirely when dropna is requested
        if not dropna:
            mask = isnull(values)
            if mask.any():
                # np.nan is the canonical spelling; the np.NaN alias was
                # removed in NumPy 2.0
                keys = np.insert(keys, 0, np.nan)
                counts = np.insert(counts, 0, mask.sum())

    return keys, counts
Ejemplo n.º 5
0
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    dtype = values.dtype

    # no need to revert to original type
    if needs_i8_conversion(dtype):
        values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    # Dispatch on the dtype of the *converted* data. Using the dtype captured
    # before the conversion above would send the integer representation of
    # datetime-like / period / categorical input to the object hashtable.
    dtype = values.dtype

    if is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        result = htable.duplicated_int64(values, keep=keep)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        result = htable.duplicated_uint64(values, keep=keep)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        result = htable.duplicated_float64(values, keep=keep)
    else:
        values = _ensure_object(values)
        result = htable.duplicated_object(values, keep=keep)

    return result
Ejemplo n.º 6
0
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    dtype = values.dtype

    # no need to revert to original type
    if needs_i8_conversion(dtype):
        values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    # Re-read the dtype: the branches above may have replaced ``values`` with
    # an integer representation, and the hashtable must be chosen for what
    # is actually being hashed rather than for the incoming dtype (which
    # would route those integers through the object hashtable).
    dtype = values.dtype

    if is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        result = htable.duplicated_int64(values, keep=keep)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        result = htable.duplicated_uint64(values, keep=keep)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        result = htable.duplicated_float64(values, keep=keep)
    else:
        values = _ensure_object(values)
        result = htable.duplicated_object(values, keep=keep)

    return result
Ejemplo n.º 7
0
def mode(values):
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array whose mode(s) are computed. A ``Series`` is unwrapped to its
        ``.values``; anything else goes through ``np.asanyarray``.

    Returns
    -------
    mode : Series
        Built with the input's ``_constructor`` when a ``Series`` was
        passed, otherwise with ``Series``.
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        constructor = Series
        values = np.asanyarray(values)

    orig_dtype = values.dtype

    # must sort because hash order isn't necessarily defined.
    if is_signed_integer_dtype(values):
        modes = np.sort(htable.mode_int64(_ensure_int64(values)))
        return constructor(modes, dtype=orig_dtype)

    if is_unsigned_integer_dtype(values):
        modes = np.sort(htable.mode_uint64(_ensure_uint64(values)))
        return constructor(modes, dtype=orig_dtype)

    if issubclass(orig_dtype.type, (np.datetime64, np.timedelta64)):
        # computed on the int64 view, restored through ``dtype=``
        modes = np.sort(htable.mode_int64(values.view(np.int64)))
        return constructor(modes, dtype=orig_dtype)

    if is_categorical_dtype(values):
        # delegate to the values' own ``mode`` method
        return constructor(values.mode())

    modes = htable.mode_object(_ensure_object(values))
    try:
        modes = np.sort(modes)
    except TypeError as e:
        # sorting failed: keep the unsorted modes and warn
        warn("Unable to sort modes: %s" % e)
    return constructor(modes, dtype=orig_dtype)
Ejemplo n.º 8
0
def mode(values):
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Data for which the mode(s) are computed.

    Returns
    -------
    mode : Series
    """
    from pandas.core.series import Series

    is_series = isinstance(values, Series)
    constructor = values._constructor if is_series else Series
    values = values.values if is_series else np.asanyarray(values)

    dtype = values.dtype

    def _sorted_result(raw):
        # must sort because hash order isn't necessarily defined.
        return constructor(np.sort(raw), dtype=dtype)

    if is_signed_integer_dtype(values):
        result = _sorted_result(htable.mode_int64(_ensure_int64(values)))
    elif is_unsigned_integer_dtype(values):
        result = _sorted_result(htable.mode_uint64(_ensure_uint64(values)))
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # hashed as int64; the original dtype is reapplied on construction
        result = _sorted_result(htable.mode_int64(values.view(np.int64)))
    elif is_categorical_dtype(values):
        result = constructor(values.mode())
    else:
        found = htable.mode_object(_ensure_object(values))
        try:
            found = np.sort(found)
        except TypeError as e:
            # fall back to the unsorted modes when they cannot be sorted
            warn("Unable to sort modes: %s" % e)
        result = constructor(found, dtype=dtype)

    return result