def test_dst(self):
    """Series built on either side of a DST change share one tz dtype."""
    # January range and August range, both localized to US/Eastern.
    winter = Series(date_range('2013-01-01', periods=3, tz='US/Eastern'),
                    name='A')
    self.assertTrue(is_datetimetz(winter))

    summer = Series(date_range('2013-08-01', periods=3, tz='US/Eastern'),
                    name='A')
    self.assertTrue(is_datetimetz(summer))

    self.assertEqual(winter.dtype, summer.dtype)
def test_basic(self):
    """Check the tz-aware dtype predicates on positive and negative inputs."""
    self.assertTrue(is_datetime64tz_dtype(self.dtype))

    ser = Series(date_range('20130101', periods=3, tz='US/Eastern'),
                 name='A')

    # dtypes: both predicates must accept the tz-aware Series and its
    # dtype, and reject a plain float dtype and a float scalar.
    for predicate in (is_datetime64tz_dtype, is_datetimetz):
        for tz_aware in (ser.dtype, ser):
            self.assertTrue(predicate(tz_aware))
        for not_tz_aware in (np.dtype('float64'), 1.0):
            self.assertFalse(predicate(not_tz_aware))
def _value_counts_arraylike(values, dropna=True):
    """
    Count the occurrences of each distinct value of an array-like.

    Parameters
    ----------
    values : array-like
        Input accepted by the ``Series`` constructor. tz-aware and period
        inputs are detected on the object as passed in, before it is
        unwrapped to its ``.values``.
    dropna : boolean, default True
        Forwarded to the hashtable counting routine. For i8-backed data
        the ``iNaT`` key is additionally filtered out when True; for
        object data a NaN key holding the null count is prepended when
        False and nulls are present.

    Returns
    -------
    keys, counts : tuple
        ``keys`` is a ``DatetimeIndex`` for tz-aware input, a
        ``PeriodIndex`` for period input, and otherwise the keys produced
        by the hashtable routine. ``counts`` is aligned with ``keys``.
    """
    # Inspect the original object: the metadata (tz / freq) is lost once
    # we go through ``Series(values).values`` below.
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            not_nat = keys != iNaT
            keys, counts = keys[not_nat], counts[not_nat]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling: re-attach the metadata taken from the input
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        mask = isnull(values)
        if not dropna and mask.any():
            # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0
            # removed.
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def test_value_counts_unique_nunique(self):
    """value_counts / unique / nunique on inputs with repeated values."""
    for orig in self.objs:
        o = orig.copy()
        klass = type(o)
        values = o._values

        if isinstance(values, Index):
            # reset name not to affect latter process
            values.name = None

        is_index = isinstance(o, Index)
        # skip boolean, because it only has 2 values at most
        if is_index and o.is_boolean():
            continue

        # create repeated values, 'n'th element is repeated by n+1 times
        repeats = range(1, len(o) + 1)
        if is_index:
            expected_index = pd.Index(o[::-1])
            expected_index.name = None
            o = o.repeat(repeats)
            o.name = 'a'
        else:
            expected_index = pd.Index(values[::-1])
            idx = o.index.repeat(repeats)
            rep = np.repeat(values, repeats)
            o = klass(rep, index=idx, name='a')

        # check values has the same dtype as the original
        self.assertEqual(o.dtype, orig.dtype)

        expected_s = Series(range(10, 0, -1), index=expected_index,
                            dtype='int64', name='a')

        counted = o.value_counts()
        tm.assert_series_equal(counted, expected_s)
        self.assertTrue(counted.index.name is None)
        self.assertEqual(counted.name, 'a')

        uniques = o.unique()
        if isinstance(o, Index):
            self.assertTrue(isinstance(uniques, o.__class__))
            self.assert_index_equal(uniques, orig)
        elif is_datetimetz(o):
            # datetimetz Series returns array of Timestamp
            self.assertEqual(uniques[0], orig[0])
            for element in uniques:
                self.assertIsInstance(element, pd.Timestamp)
            tm.assert_numpy_array_equal(uniques,
                                        orig._values.asobject.values)
        else:
            tm.assert_numpy_array_equal(uniques, orig.values)

        self.assertEqual(o.nunique(), len(np.unique(o.values)))
def _value_counts_arraylike(values, dropna=True):
    """
    Count the occurrences of each distinct value of an array-like.

    Parameters
    ----------
    values : array-like
        Input accepted by the ``Series`` constructor. tz-aware and period
        inputs are detected on the object as passed in, before it is
        unwrapped to its ``.values``.
    dropna : boolean, default True
        Forwarded to the 64-bit scalar counting routine. For
        datetime-like data the ``iNaT`` key is additionally filtered out
        when True; for object data a NaN key holding the null count is
        prepended when False and nulls are present.

    Returns
    -------
    keys, counts : tuple
        ``keys`` is a ``DatetimeIndex`` for tz-aware input, a
        ``PeriodIndex`` for period input, and otherwise the keys produced
        by the hashtable routine. ``counts`` is aligned with ``keys``.
    """
    # Inspect the original object: the metadata (tz / freq) is lost once
    # we go through ``Series(values).values`` below.
    is_datetimetz_type = is_datetimetz(values)
    is_period = (isinstance(values, ABCPeriodIndex) or
                 is_period_arraylike(values))
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            not_nat = keys != iNaT
            keys, counts = keys[not_nat], counts[not_nat]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling: re-attach the metadata taken from the input
        if is_datetimetz_type:
            # an index exposes the timezone directly, a Series through
            # its ``.dt`` accessor
            if isinstance(orig, ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = _ensure_object(values)
        mask = isnull(values)
        # the object routine receives the null mask, not the dropna flag
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0
            # removed.
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : array-like
        Object exposing a ``dtype``; Series and Index inputs are unwrapped
        to their underlying values before hashing.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """
    dtype = values.dtype
    is_boxed = isinstance(values, (ABCSeries, ABCIndex))

    # Reduce the input to a plain array. The result is only a boolean
    # mask, so there is no need to revert to the original type later.
    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
        raw = values.values if is_boxed else values
        values = raw.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif is_boxed:
        values = values.values

    # The hashing routine is chosen from the dtype captured on entry.
    if is_integer_dtype(dtype):
        flags = htable.duplicated_int64(_ensure_int64(values), keep=keep)
    elif is_float_dtype(dtype):
        flags = htable.duplicated_float64(_ensure_float64(values),
                                          keep=keep)
    else:
        flags = htable.duplicated_object(_ensure_object(values), keep=keep)

    return flags
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # Datetime-likes are hashed through their i8 representation:
    # - a numpy datetimelike is viewed as i8 and cast back afterwards
    # - an extension datetimelike is boxed into its Index so the result
    #   can be rebuilt from it, carrying the metadata along
    restore_dtype = None
    if not needs_i8_conversion(values):
        vals = np.asarray(values)
    elif is_period_dtype(values):
        values = PeriodIndex(values)
        vals = values.asi8
    elif is_datetimetz(values):
        values = DatetimeIndex(values)
        vals = values.asi8
    else:
        # numpy dtype
        restore_dtype = values.dtype
        vals = values.view(np.int64)

    klasses, vals = _get_data_algo(vals, _hashtables)
    hash_klass, vec_klass = klasses

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = _ensure_platform_int(
        table.get_labels(vals, uniques, 0, na_sentinel, True))
    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    if restore_dtype is not None:
        uniques = uniques.astype(restore_dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)

    return labels, uniques
def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
            mask_info=None, allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass

    Parameters
    ----------
    arr : ndarray
        Input array
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        _maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    mask_info : tuple of (ndarray, boolean)
        If provided, value should correspond to:
            (indexer != -1, (indexer != -1).any())
        If not provided, it will be computed internally if necessary
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done.  This short-circuits computation of a mask.  Result is
        undefined if allow_fill == False and -1 is present in indexer.
    """
    # dispatch to internal type takes
    if is_categorical(arr):
        return arr.take_nd(indexer, fill_value=fill_value,
                           allow_fill=allow_fill)
    if is_datetimetz(arr):
        return arr.take(indexer, fill_value=fill_value,
                        allow_fill=allow_fill)

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = _ensure_int64(indexer)
        if allow_fill:
            # check for promotion based on types only (do this first
            # because it's faster than computing a mask)
            dtype, fill_value = _maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is None:
                    mask = indexer == -1
                    needs_masking = mask.any()
                    mask_info = mask, needs_masking
                else:
                    mask, needs_masking = mask_info

                if not needs_masking:
                    # no promotion needed: depromote and set fill_value to
                    # a dummy (it won't be used but we don't want the
                    # cython code to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()
                elif out is not None and out.dtype != dtype:
                    raise TypeError('Incompatible type for fill_value')
        else:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False

    # a 2-d f-contiguous array is processed through its transpose
    flip_order = arr.ndim == 2 and arr.flags.f_contiguous
    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        dims = list(arr.shape)
        dims[axis] = len(indexer)
        out_shape = tuple(dims)
        # minor tweak that can make an order-of-magnitude difference
        # for dataframes initialized directly from 2-d ndarrays
        # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
        # f-contiguous transpose)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            out = np.empty(out_shape, dtype=dtype, order='F')
        else:
            out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
                                 mask_info=mask_info)
    indexer = _ensure_int64(indexer)
    func(arr, indexer, out, fill_value)

    return out.T if flip_order else out
def test_value_counts_unique_nunique_null(self):
    """value_counts / unique / nunique when the first two values are null."""
    for null_obj in [np.nan, None]:
        for o in self.objs:
            klass = type(o)
            values = o.values

            if not self._allow_na_ops(o):
                continue

            # special assign to the numpy array
            nat = pd.tslib.iNaT
            if is_datetimetz(o):
                if isinstance(o, DatetimeIndex):
                    v = o.asi8
                    v[0:2] = nat
                    values = o._shallow_copy(v)
                else:
                    o = o.copy()
                    o[0:2] = nat
                    values = o._values
            elif is_datetime64_dtype(o) or isinstance(o, PeriodIndex):
                values[0:2] = nat
            else:
                values[0:2] = null_obj

            # check values has the same dtype as the original
            self.assertEqual(values.dtype, o.dtype)

            # create repeated values, 'n'th element is repeated by n+1
            # times
            repeated = np.repeat(values, range(1, len(o) + 1))
            if isinstance(o, PeriodIndex):
                # freq must be specified because repeat makes freq
                # ambiguous

                # resets name from Index
                expected_index = pd.Index(o, name=None)
                # attach name to klass
                o = klass(repeated, freq=o.freq, name='a')
            elif isinstance(o, Index):
                expected_index = pd.Index(values, name=None)
                o = klass(repeated, name='a')
            else:
                expected_index = pd.Index(values, name=None)
                idx = np.repeat(o.index.values, range(1, len(o) + 1))
                o = klass(repeated, index=idx, name='a')

            non_null_counts = list(range(10, 2, -1))
            expected_s_na = Series(non_null_counts + [3],
                                   index=expected_index[9:0:-1],
                                   dtype='int64', name='a')
            expected_s = Series(non_null_counts,
                                index=expected_index[9:1:-1],
                                dtype='int64', name='a')

            result_s_na = o.value_counts(dropna=False)
            tm.assert_series_equal(result_s_na, expected_s_na)
            self.assertTrue(result_s_na.index.name is None)
            self.assertEqual(result_s_na.name, 'a')

            result_s = o.value_counts()
            tm.assert_series_equal(o.value_counts(), expected_s)
            self.assertTrue(result_s.index.name is None)
            self.assertEqual(result_s.name, 'a')

            uniques = o.unique()
            if isinstance(o, Index):
                tm.assert_index_equal(uniques,
                                      Index(values[1:], name='a'))
            elif is_datetimetz(o):
                # unable to compare NaT / nan
                tm.assert_numpy_array_equal(uniques[1:],
                                            values[2:].asobject.values)
                self.assertIs(uniques[0], pd.NaT)
            else:
                tm.assert_numpy_array_equal(uniques[1:], values[2:])
                self.assertTrue(pd.isnull(uniques[0]))

            self.assertEqual(o.nunique(), 8)
            self.assertEqual(o.nunique(dropna=False), 9)
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Value to mark "not found". It is preserved in ``labels`` whether
        or not ``sort`` is requested.
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # tz-aware input: hash the tz-stripped values and re-attach the
    # timezone to the uniques at the end.
    # NOTE(review): stripping and re-localizing may not round-trip for
    # wall-clock times around a DST transition -- confirm.
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.tz_localize(None)

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(_ensure_object(uniques))

            # order ints before strings
            non_strings = np.sort(np.array(
                [e for e in uniques if not isinstance(e, string_types)],
                dtype=object))
            strings = np.sort(np.array(
                [e for e in uniques if isinstance(e, string_types)],
                dtype=object))
            ordered = np.concatenate([non_strings, strings])
            sorter = _ensure_platform_int(t.lookup(
                _ensure_object(ordered)))

        # map every old label to its position in the sorted uniques
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Missing entries carry ``na_sentinel``, which need not be -1 and
        # may fall outside the valid index range: take with wrapping so
        # the lookup cannot fail, then restore the caller's sentinel.
        mask = labels == na_sentinel
        labels = reverse_indexer.take(labels, mode='wrap')
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    if is_datetimetz_type:
        # reset tz
        uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
            values.tz)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
def test_value_counts_unique_nunique_null(self):
    """value_counts / unique / nunique when the first two values are null.

    Each fixture in ``self.objs`` gets its first two elements replaced by
    a null marker, then element ``n`` is repeated ``n + 1`` times. The
    nulls therefore occupy the first three positions, and the expected
    counts are 10..3 for the non-null values plus 3 for the null.
    """
    for null_obj in [np.nan, None]:
        for orig in self.objs:
            o = orig.copy()
            klass = type(o)
            values = o._values

            if not self._allow_na_ops(o):
                continue

            # special assign to the numpy array
            if is_datetimetz(o):
                if isinstance(o, DatetimeIndex):
                    v = o.asi8
                    v[0:2] = iNaT
                    values = o._shallow_copy(v)
                else:
                    o = o.copy()
                    o[0:2] = iNaT
                    values = o._values
            elif needs_i8_conversion(o):
                values[0:2] = iNaT
                values = o._shallow_copy(values)
            else:
                values[0:2] = null_obj

            # check values has the same dtype as the original
            self.assertEqual(values.dtype, o.dtype)

            # create repeated values, 'n'th element is repeated by n+1
            # times
            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                expected_index = o.copy()
                expected_index.name = None

                # attach name to klass
                o = klass(values.repeat(range(1, len(o) + 1)))
                o.name = 'a'
            else:
                if is_datetimetz(o):
                    expected_index = orig._values._shallow_copy(values)
                else:
                    expected_index = pd.Index(values)
                expected_index.name = None
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            # check values has the same dtype as the original
            self.assertEqual(o.dtype, orig.dtype)

            # check values correctly have NaN
            # (builtin ``bool``: the ``np.bool`` alias was removed in
            # NumPy 1.24)
            nanloc = np.zeros(len(o), dtype=bool)
            nanloc[:3] = True
            if isinstance(o, Index):
                self.assert_numpy_array_equal(pd.isnull(o), nanloc)
            else:
                exp = pd.Series(nanloc, o.index, name='a')
                self.assert_series_equal(pd.isnull(o), exp)

            expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                   index=expected_index[9:0:-1],
                                   dtype='int64', name='a')
            expected_s = Series(list(range(10, 2, -1)),
                                index=expected_index[9:1:-1],
                                dtype='int64', name='a')

            result_s_na = o.value_counts(dropna=False)
            tm.assert_series_equal(result_s_na, expected_s_na)
            self.assertTrue(result_s_na.index.name is None)
            self.assertEqual(result_s_na.name, 'a')

            # compare the same result object whose names are checked below
            result_s = o.value_counts()
            tm.assert_series_equal(result_s, expected_s)
            self.assertTrue(result_s.index.name is None)
            self.assertEqual(result_s.name, 'a')

            result = o.unique()
            if isinstance(o, Index):
                tm.assert_index_equal(result,
                                      Index(values[1:], name='a'))
            elif is_datetimetz(o):
                # unable to compare NaT / nan
                tm.assert_numpy_array_equal(result[1:],
                                            values[2:].asobject.values)
                self.assertIs(result[0], pd.NaT)
            else:
                tm.assert_numpy_array_equal(result[1:], values[2:])

                self.assertTrue(pd.isnull(result[0]))
                self.assertEqual(result.dtype, orig.dtype)

            self.assertEqual(o.nunique(), 8)
            self.assertEqual(o.nunique(dropna=False), 9)
def _convert_to_array(self, values, name=None, other=None):
    """Convert ``values`` to an array-like for a datetime/timedelta op.

    Parameters
    ----------
    values : scalar or list-like
        Operand to convert; a non list-like is wrapped in a 1-element
        ndarray.
    name : str, optional
        Name of the operator method (e.g. ``'__add__'``); consulted for
        integer and all-null floating operands.
    other : optional
        The opposite operand; only its ``dtype`` is read.

    Returns
    -------
    The converted values.

    Raises
    ------
    TypeError
        If the operand type cannot take part in the operation.
    """
    from pandas.tseries.timedeltas import to_timedelta

    original = values
    supplied_dtype = None
    if not is_list_like(values):
        values = np.array([values])
    elif (isinstance(values, pd.Series) and
          (is_timedelta64_dtype(values) or is_datetime64_dtype(values))):
        # if this is a Series that contains relevant dtype info, then use
        # this instead of the inferred type; this avoids coercing
        # Series([NaT], dtype='datetime64[ns]') to
        # Series([NaT], dtype='timedelta64[ns]')
        supplied_dtype = values.dtype

    inferred_type = supplied_dtype or lib.infer_dtype(values)

    datetime_kinds = ('datetime64', 'datetime', 'date', 'time')
    if inferred_type in datetime_kinds or is_datetimetz(inferred_type):
        if (supplied_dtype is None and other is not None and
                (other.dtype in ('timedelta64[ns]', 'datetime64[ns]')) and
                isnull(values).all()):
            # if we have a other of timedelta, but use pd.NaT here we
            # we are in the wrong path
            values = np.empty(values.shape, dtype='timedelta64[ns]')
            values[:] = iNaT
        elif isinstance(values, pd.DatetimeIndex):
            # a datelike
            values = values.to_series()
        elif (isinstance(original, datetime.datetime) and
              hasattr(original, 'tz')):
            # datetime with tz
            values = pd.DatetimeIndex(values)
        elif is_datetimetz(values):
            # datetime array with tz
            if isinstance(values, ABCSeries):
                values = values._values
        elif not (isinstance(values, (np.ndarray, ABCSeries)) and
                  is_datetime64_dtype(values)):
            values = tslib.array_to_datetime(values)
    elif inferred_type in ('timedelta', 'timedelta64'):
        # have a timedelta, convert to to ns here
        values = to_timedelta(values, errors='coerce', box=False)
    elif inferred_type == 'integer':
        if values.dtype.kind == 'm':
            # py3 compat where dtype is 'm' but is an integer
            values = values.astype('timedelta64[ns]')
        elif isinstance(values, pd.PeriodIndex):
            values = values.to_timestamp().to_series()
        elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'):
            raise TypeError("incompatible type for a datetime/timedelta "
                            "operation [{0}]".format(name))
    elif inferred_type == 'floating':
        additive = ('__add__', '__radd__', '__sub__', '__rsub__')
        if isnull(values).all() and name in additive:
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = iNaT
    elif not self._is_offset(values):
        raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                        " operation".format(np.array(values).dtype))

    return values
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    name = getattr(values, 'name', None)

    binned = bins is not None
    if binned:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    use_extension_path = (is_extension_type(values) and
                          not is_datetimetz(values))
    if use_extension_path:
        # handle Categorical and sparse,
        # datetime tz can be handled in ndarray path
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = name
        counts = result.values
    else:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index
        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

    if binned:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)),
                                fill_value=0)
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        total = float(counts.sum())
        result = result / total

    return result
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # Datetime-likes are hashed through their i8 representation:
    # - a numpy datetimelike is viewed as i8 and cast back afterwards
    # - an extension datetimelike is boxed into its Index so the result
    #   can be rebuilt from it, carrying the metadata along
    original_dtype = None
    if not needs_i8_conversion(values):
        vals = np.asarray(values)
    elif is_period_dtype(values):
        values = PeriodIndex(values)
        vals = values.asi8
    elif is_datetimetz(values):
        values = DatetimeIndex(values)
        vals = values.asi8
    else:
        # numpy dtype
        original_dtype = values.dtype
        vals = values.view(np.int64)

    klasses, vals = _get_data_algo(vals, _hashtables)
    hash_klass, vec_klass = klasses

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    # the null check is requested for everything but integer input
    check_nulls = not is_integer_dtype(values)
    raw_labels = table.get_labels(vals, uniques, 0, na_sentinel,
                                  check_nulls)
    labels = _ensure_platform_int(raw_labels)
    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    if original_dtype is not None:
        uniques = uniques.astype(original_dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)

    return labels, uniques
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # tz-aware input is hashed through its i8 values and rebuilt from the
    # boxed index afterwards
    tz_aware = is_datetimetz(values)
    if tz_aware:
        values = DatetimeIndex(values)
        vals = values.asi8

    # remember which numpy dtype, if any, has to be restored at the end
    if is_datetime64_dtype(vals):
        restore_dtype = 'M8[ns]'
    elif is_timedelta64_dtype(vals):
        restore_dtype = 'm8[ns]'
    else:
        restore_dtype = None

    klasses, vals = _get_data_algo(vals, _hashtables)
    hash_klass, vec_klass = klasses

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = _ensure_platform_int(
        table.get_labels(vals, uniques, 0, na_sentinel, True))
    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    if tz_aware:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif restore_dtype is not None:
        uniques = uniques.astype(restore_dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)

    return labels, uniques
def test_value_counts_unique_nunique(self):
    """value_counts / unique / nunique, without and then with nulls."""
    # Part 1: repeated values, no nulls.
    for orig in self.objs:
        o = orig.copy()
        klass = type(o)
        values = o.values

        # create repeated values, 'n'th element is repeated by n+1 times
        repeats = range(1, len(o) + 1)
        if isinstance(o, (PeriodIndex, DatetimeIndex)):
            # resets name from Index
            expected_index = pd.Index(o[::-1])
            expected_index.name = None

            # attach name to klass
            o = o.repeat(repeats)
            o.name = 'a'
        elif isinstance(o, Index) and o.is_boolean():
            # don't test boolean
            continue
        elif isinstance(o, Index):
            expected_index = pd.Index(values[::-1])
            expected_index.name = None
            o = o.repeat(repeats)
            o.name = 'a'
        else:
            expected_index = pd.Index(values[::-1])
            idx = o.index.repeat(repeats)
            o = klass(np.repeat(values, repeats), index=idx, name='a')

        expected_s = Series(range(10, 0, -1), index=expected_index,
                            dtype='int64', name='a')

        counted = o.value_counts()
        tm.assert_series_equal(counted, expected_s)
        self.assertTrue(counted.index.name is None)
        self.assertEqual(counted.name, 'a')

        uniques = o.unique()
        if isinstance(o, (DatetimeIndex, PeriodIndex)):
            self.assertTrue(isinstance(uniques, o.__class__))
            self.assertEqual(uniques.freq, o.freq)
            self.assert_index_equal(uniques, orig)
        else:
            self.assert_numpy_array_equal(uniques, values)

        self.assertEqual(o.nunique(), len(np.unique(o.values)))

    # Part 2: the first two values are replaced by a null marker.
    for null_obj in [np.nan, None]:
        for o in self.objs:
            klass = type(o)
            values = o.values

            if not self._allow_na_ops(o):
                continue

            # special assign to the numpy array
            nat = pd.tslib.iNaT
            if is_datetimetz(o):
                if isinstance(o, DatetimeIndex):
                    v = o.asi8
                    v[0:2] = nat
                    values = o._shallow_copy(v)
                else:
                    o = o.copy()
                    o[0:2] = nat
                    values = o.values
            elif o.values.dtype == 'datetime64[ns]' or isinstance(
                    o, PeriodIndex):
                values[0:2] = nat
            else:
                values[0:2] = null_obj

            # create repeated values, 'n'th element is repeated by n+1
            # times
            repeated = np.repeat(values, range(1, len(o) + 1))
            if isinstance(o, PeriodIndex):
                # freq must be specified because repeat makes freq
                # ambiguous

                # resets name from Index
                expected_index = pd.Index(o, name=None)
                # attach name to klass
                o = klass(repeated, freq=o.freq, name='a')
            elif isinstance(o, Index):
                expected_index = pd.Index(values, name=None)
                o = klass(repeated, name='a')
            else:
                expected_index = pd.Index(values, name=None)
                idx = np.repeat(o.index.values, range(1, len(o) + 1))
                o = klass(repeated, index=idx, name='a')

            non_null_counts = list(range(10, 2, -1))
            expected_s_na = Series(non_null_counts + [3],
                                   index=expected_index[9:0:-1],
                                   dtype='int64', name='a')
            expected_s = Series(non_null_counts,
                                index=expected_index[9:1:-1],
                                dtype='int64', name='a')

            result_s_na = o.value_counts(dropna=False)
            tm.assert_series_equal(result_s_na, expected_s_na)
            self.assertTrue(result_s_na.index.name is None)
            self.assertEqual(result_s_na.name, 'a')

            result_s = o.value_counts()
            tm.assert_series_equal(o.value_counts(), expected_s)
            self.assertTrue(result_s.index.name is None)
            self.assertEqual(result_s.name, 'a')

            # numpy_array_equal cannot compare arrays includes nan
            uniques = o.unique()
            self.assert_numpy_array_equal(uniques[1:], values[2:])

            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                self.assertTrue(uniques.asi8[0] == pd.tslib.iNaT)
            else:
                self.assertTrue(pd.isnull(uniques[0]))

            self.assertEqual(o.nunique(), 8)
            self.assertEqual(o.nunique(dropna=False), 9)