def _hashtable_algo(f, values, return_dtype=None):
    """
    Run ``f(HashTable, type_caster)`` with the hashtable class and caster
    matching the dtype of ``values``.

    Parameters
    ----------
    f : callable
        Called as ``f(hashtable_class, ensure_func)`` and expected to
        return an ndarray.
    values : ndarray
    return_dtype : dtype, optional
        For datetime64/timedelta64 input, the dtype to view the int64
        result as (defaults to ``'M8[ns]'`` / ``'m8[ns]'``).

    Returns
    -------
    result of ``f``
    """
    dtype = values.dtype

    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    if is_signed_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    if is_unsigned_integer_dtype(dtype):
        return f(htable.UInt64HashTable, _ensure_uint64)

    # datetime-likes are hashed via their int64 representation and the
    # result is viewed back as the requested datetime/timedelta dtype
    if is_datetime64_dtype(dtype):
        view_dtype = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_dtype)
    if is_timedelta64_dtype(dtype):
        view_dtype = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_dtype)

    # its cheaper to use a String Hash Table than Object
    if lib.infer_dtype(values) in ['string']:
        return f(htable.StringHashTable, _ensure_object)

    # use Object
    return f(htable.PyObjectHashTable, _ensure_object)
def _get_data_algo(values, func_map):
    """
    Select the routine from ``func_map`` matching the dtype of ``values``
    and coerce ``values`` to the array type that routine expects.

    Parameters
    ----------
    values : array-like
    func_map : dict
        Maps dtype-kind names ('float64', 'int64', 'uint64', 'string',
        'object') to callables; 'object' is the required fallback.

    Returns
    -------
    (func, coerced_values)
    """
    f = None

    if is_float_dtype(values):
        f = func_map['float64']
        values = _ensure_float64(values)
    elif needs_i8_conversion(values):
        # datetime-like: work on the underlying int64 representation
        f = func_map['int64']
        values = values.view('i8')
    elif is_signed_integer_dtype(values):
        f = func_map['int64']
        values = _ensure_int64(values)
    elif is_unsigned_integer_dtype(values):
        f = func_map['uint64']
        values = _ensure_uint64(values)
    else:
        values = _ensure_object(values)

        # its cheaper to use a String Hash Table than Object
        if lib.infer_dtype(values) in ['string']:
            # f is still None here, so a missing 'string' entry simply
            # falls through to the object routine below
            f = func_map.get('string', None)

    if f is None:
        f = func_map['object']

    return f, values
def _value_counts_arraylike(values, dropna=True):
    """
    Compute unique values and their counts for an array-like.

    Parameters
    ----------
    values : array-like
        May be datetime-like (incl. tz-aware), period, integer, float, or
        object; dispatched to the matching hashtable routine.
    dropna : bool, default True
        If True, exclude null entries (NaT / NaN) from the result.

    Returns
    -------
    keys, counts : ndarray (or DatetimeIndex / PeriodIndex for tz-aware /
        period input), ndarray of int64 counts
    """
    # record the original container/dtype flavor before coercion below
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        # hash on the int64 representation; nulls appear as iNaT
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        # NOTE(review): presumably value_count_object does not report
        # nulls itself, so they are counted here and prepended when
        # dropna is False — confirm against the hashtable implementation
        mask = isnull(values)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """
    dtype = values.dtype

    # coerce to a hashable ndarray; no need to revert to original type
    if needs_i8_conversion(dtype):
        values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    # dispatch on the *original* dtype to the typed hashtable routine
    if is_signed_integer_dtype(dtype):
        return htable.duplicated_int64(_ensure_int64(values), keep=keep)
    if is_unsigned_integer_dtype(dtype):
        return htable.duplicated_uint64(_ensure_uint64(values), keep=keep)
    if is_float_dtype(dtype):
        return htable.duplicated_float64(_ensure_float64(values), keep=keep)
    return htable.duplicated_object(_ensure_object(values), keep=keep)
def mode(values):
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.

    Returns
    -------
    mode : Series
    """
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype

    if is_signed_integer_dtype(values):
        modes = htable.mode_int64(_ensure_int64(values))
        return constructor(np.sort(modes), dtype=dtype)

    if is_unsigned_integer_dtype(values):
        modes = htable.mode_uint64(_ensure_uint64(values))
        return constructor(np.sort(modes), dtype=dtype)

    if issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # hash on the int64 view, then restore the datetime-like dtype
        modes = htable.mode_int64(values.view(np.int64))
        return constructor(np.sort(modes), dtype=dtype)

    if is_categorical_dtype(values):
        return constructor(values.mode())

    modes = htable.mode_object(_ensure_object(values))
    try:
        modes = np.sort(modes)
    except TypeError as e:
        # mixed/unorderable objects: return the modes unsorted
        warn("Unable to sort modes: %s" % e)
    return constructor(modes, dtype=dtype)