def reindex(self, target, method=None, level=None, limit=None,
            tolerance=None):
    """
    Create index with target's values (move/add/delete values as necessary)

    Parameters
    ----------
    target : array-like
        Values to conform to; converted with ``ibase._ensure_index``.
    method, level, limit : None
        Not implemented for this index type; any non-None value raises.
    tolerance : optional
        Accepted in the signature but never read by this method.

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index

    Raises
    ------
    NotImplementedError
        If `method`, `level` or `limit` is not None.
    ValueError
        If `target` is not categorical and is not unique.
    """

    if method is not None:
        raise NotImplementedError("argument method is not implemented for "
                                  "CategoricalIndex.reindex")
    if level is not None:
        raise NotImplementedError("argument level is not implemented for "
                                  "CategoricalIndex.reindex")
    if limit is not None:
        raise NotImplementedError("argument limit is not implemented for "
                                  "CategoricalIndex.reindex")

    target = ibase._ensure_index(target)

    # duplicates are only tolerated when the target is itself categorical
    if not is_categorical_dtype(target) and not target.is_unique:
        raise ValueError("cannot reindex with a non-unique indexer")

    indexer, missing = self.get_indexer_non_unique(np.array(target))
    new_target = self.take(indexer)

    # filling in missing if needed
    if len(missing):
        # position of every target value within our categories (-1: absent)
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(
                np.array(target))

        else:

            # every target value is a known category: patch the codes of
            # the slots that had no match in self
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            new_target = self._create_from_codes(codes)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if is_categorical_dtype(target):
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
def reindex(self, target, method=None, level=None, limit=None,
            tolerance=None):
    """
    Create index with target's values (move/add/delete values as necessary)

    Parameters
    ----------
    target : array-like
        Values to conform to; converted with ``ibase._ensure_index``.
    method, level, limit : None
        Not implemented for this index type; any non-None value raises.
    tolerance : optional
        Accepted in the signature but never read by this method.

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index

    Raises
    ------
    NotImplementedError
        If `method`, `level` or `limit` is not None.
    ValueError
        If `target` is not categorical and is not unique.
    """

    if method is not None:
        raise NotImplementedError("argument method is not implemented for "
                                  "CategoricalIndex.reindex")
    if level is not None:
        raise NotImplementedError("argument level is not implemented for "
                                  "CategoricalIndex.reindex")
    if limit is not None:
        raise NotImplementedError("argument limit is not implemented for "
                                  "CategoricalIndex.reindex")

    target = ibase._ensure_index(target)

    # duplicates are only tolerated when the target is itself categorical
    if not is_categorical_dtype(target) and not target.is_unique:
        raise ValueError("cannot reindex with a non-unique indexer")

    indexer, missing = self.get_indexer_non_unique(np.array(target))
    new_target = self.take(indexer)

    # filling in missing if needed
    if len(missing):
        # position of every target value within our categories (-1: absent)
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(
                np.array(target))

        else:

            # every target value is a known category: patch the codes of
            # the slots that had no match in self
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            new_target = self._create_from_codes(codes)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if is_categorical_dtype(target):
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
def wrapper(self, other, axis=None):
    """
    Compare this Series with `other` using the enclosing scope's ``op`` /
    ``na_op``.

    Parameters
    ----------
    other : Series, ndarray, Index, Categorical, list or scalar
        Right-hand operand of the comparison.
    axis : optional
        Only validated (via ``self._get_axis_number``); otherwise unused.

    Returns
    -------
    Series, or ``NotImplemented`` when `other` is a DataFrame.

    Raises
    ------
    ValueError
        If `other` is a Series, ndarray or Index of a different length.
    TypeError
        If `other` is a Categorical and self is not categorical, or if the
        comparison collapses to a scalar.
    """
    # Validate the axis parameter
    if axis is not None:
        self._get_axis_number(axis)

    if isinstance(other, ABCSeries):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (np.ndarray, pd.Index)):
        # do not check length of zerodim array
        # as it will broadcast
        if (not lib.isscalar(lib.item_from_zerodim(other)) and
                len(self) != len(other)):
            raise ValueError('Lengths must match to compare')

        if isinstance(other, ABCPeriodIndex):
            # temp workaround until fixing GH 13637
            # tested in test_nat_comparisons
            # (pandas.tests.series.test_operators.TestSeriesOperators)
            return self._constructor(na_op(self.values,
                                           other.asobject.values),
                                     index=self.index)

        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        # a Categorical may only be compared with a categorical Series
        if not is_categorical_dtype(self):
            msg = ("Cannot compare a Categorical for op {op} with Series "
                   "of dtype {typ}.\nIf you want to compare values, use "
                   "'series <op> np.asarray(other)'.")
            raise TypeError(msg.format(op=op, typ=self.dtype))

    # reached for every `other` that did not return above (Categorical,
    # list, scalar, ...)
    if is_categorical_dtype(self):
        # cats are a special case as get_values() would return an ndarray,
        # which would then not take categories ordering into account
        # we can go directly to op, as the na_op would just test again and
        # dispatch to it.
        res = op(self.values, other)
    else:
        values = self.get_values()
        if isinstance(other, (list, np.ndarray)):
            other = np.asarray(other)

        res = na_op(values, other)
        if isscalar(res):
            raise TypeError('Could not compare %s type with Series' %
                            type(other))

        # always return a full value series here
        res = _values_from_object(res)

    res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
    return res
def na_op(x, y):
    """
    Apply the enclosing scope's comparison (``op`` / ``name``) to `x` and
    `y`, writing ``masker`` into null positions of datetime-like results.

    Parameters
    ----------
    x : array-like
        Left operand; must expose ``dtype`` unless it is categorical.
    y : array-like or scalar
        Right operand.

    Returns
    -------
    The comparison result: whatever ``op`` returns for categorical operands,
    otherwise an array of comparison results.

    Raises
    ------
    TypeError
        For datetime-like vs numeric comparisons, or when the operand's own
        comparison method returns ``NotImplemented``.
    """

    # dispatch to the categorical if we have a categorical
    # in either operand
    if is_categorical_dtype(x):
        return op(x, y)
    elif is_categorical_dtype(y) and not isscalar(y):
        # NOTE(review): the operands are swapped here; confirm this is
        # intended for non-symmetric comparisons.
        return op(y, x)

    if is_object_dtype(x.dtype):
        result = _comp_method_OBJECT_ARRAY(op, x, y)
    else:

        # we want to compare like types
        # we only want to convert to integer like if
        # we are not NotImplemented, otherwise
        # we would allow datetime64 (but viewed as i8) against
        # integer comparisons
        if is_datetimelike_v_numeric(x, y):
            raise TypeError("invalid type comparison")

        # numpy does not like comparisons vs None
        if isscalar(y) and isnull(y):
            # only "not equal" holds against a null scalar
            if name == '__ne__':
                return np.ones(len(x), dtype=bool)
            else:
                return np.zeros(len(x), dtype=bool)

        # we have a datetime/timedelta and may need to convert
        mask = None
        if (needs_i8_conversion(x) or
                (not isscalar(y) and needs_i8_conversion(y))):

            # remember the null positions before viewing as i8
            if isscalar(y):
                mask = isnull(x)
                y = _index.convert_scalar(x, _values_from_object(y))
            else:
                mask = isnull(x) | isnull(y)
                y = y.view('i8')
            x = x.view('i8')

        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            # x has no method called `name`; fall back to the operator
            result = op(x, y)

        if mask is not None and mask.any():
            result[mask] = masker

    return result
def mode(values):
    """
    Return the mode(s) of the passed Series or ndarray, sorted.

    Parameters
    ----------
    values : Series or array-like

    Returns
    -------
    A Series, or the result of ``values._constructor`` when a Series was
    passed, holding the modes.
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor, values = values._constructor, values.values
    else:
        constructor, values = Series, np.asanyarray(values)

    dtype = values.dtype

    # the modes are sorted because hash order isn't necessarily defined
    if is_integer_dtype(values):
        modes = sorted(htable.mode_int64(_ensure_int64(values)))
        return constructor(modes, dtype=dtype)

    if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        # hash on the i8 view, then restore the original dtype
        as_i8 = values.view(np.int64)
        return constructor(sorted(htable.mode_int64(as_i8)), dtype=dtype)

    if is_categorical_dtype(values):
        return constructor(values.mode())

    null_mask = isnull(values)
    modes = htable.mode_object(_ensure_object(values), null_mask)
    try:
        modes = sorted(modes)
    except TypeError as err:
        # unorderable values: warn and keep the unsorted modes
        warn("Unable to sort modes: %s" % err)
    return constructor(modes, dtype=dtype)
def _is_dtype_compat(self, other):
    """
    *this is an internal non-public method*

    Check that `other` is compatible with the dtype of self, coercing it
    when it is not already categorical.

    Returns
    -------
    `other`, unwrapped to its ``_values`` when it was a CategoricalIndex,
    or coerced to a CategoricalIndex when it was not categorical.

    Raises
    ------
    TypeError if the dtypes are not compatible
    """
    if not is_categorical_dtype(other):
        # plain values: coerce onto our categories, then make sure nothing
        # fell outside of them
        values = other if is_list_like(other) else [other]
        coerced = self._create_categorical(
            self, other, categories=self.categories, ordered=self.ordered)
        other = CategoricalIndex(coerced)
        if not other.isin(values).all():
            raise TypeError("cannot append a non-category item to a "
                            "CategoricalIndex")
        return other

    if isinstance(other, CategoricalIndex):
        other = other._values
    if not other.is_dtype_equal(self):
        raise TypeError("categories must match existing categories "
                        "when appending")
    return other
def test_constructor_categorical(self):
    """Series built from a Categorical, or with dtype='category'."""
    expected = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                              fastpath=True)
    result = Series(expected)
    tm.assert_categorical_equal(result.values, expected)

    # GH12574
    with self.assertRaises(ValueError):
        Series(pd.Categorical([1, 2, 3]), dtype='int64')

    from_categorical = Series(pd.Categorical([1, 2, 3]), dtype='category')
    self.assertTrue(is_categorical_dtype(from_categorical))
    self.assertTrue(is_categorical_dtype(from_categorical.dtype))

    from_list = Series([1, 2, 3], dtype='category')
    self.assertTrue(is_categorical_dtype(from_list))
    self.assertTrue(is_categorical_dtype(from_list.dtype))
def mode(values):
    """
    Return the mode(s) of the passed Series or ndarray, sorted.

    Parameters
    ----------
    values : Series or array-like

    Returns
    -------
    A Series, or the result of ``values._constructor`` when a Series was
    passed, holding the modes.
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor, values = values._constructor, values.values
    else:
        constructor, values = Series, np.asanyarray(values)

    dtype = values.dtype

    # the modes are sorted because hash order isn't necessarily defined
    if is_integer_dtype(values):
        modes = sorted(htable.mode_int64(_ensure_int64(values)))
        return constructor(modes, dtype=dtype)

    if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        # hash on the i8 view, then restore the original dtype
        as_i8 = values.view(np.int64)
        return constructor(sorted(htable.mode_int64(as_i8)), dtype=dtype)

    if is_categorical_dtype(values):
        return constructor(values.mode())

    null_mask = isnull(values)
    modes = htable.mode_object(_ensure_object(values), null_mask)
    try:
        modes = sorted(modes)
    except TypeError as err:
        # unorderable values: warn and keep the unsorted modes
        warn("Unable to sort modes: %s" % err)
    return constructor(modes, dtype=dtype)
def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
    """
    This is intended to be a drop-in replacement for np.argsort which
    handles NaNs. It adds ascending and na_position parameters.
    GH #6399, #5231

    Parameters
    ----------
    items : array-like or categorical
        Values to sort.
    kind : str, default 'quicksort'
        Sorting algorithm handed to ``ndarray.argsort``. Not used for
        categorical input, which sorts through its own ``argsort``.
    ascending : bool, default True
        Sort direction of the non-missing values.
    na_position : {'last', 'first'}, default 'last'
        Where the positions of missing values go in the result.

    Returns
    -------
    indexer : np.ndarray
        Positions that sort `items`.

    Raises
    ------
    ValueError
        If `na_position` is neither 'last' nor 'first'.
    """
    # validate first so that every input type rejects a bad value
    if na_position not in ('last', 'first'):
        raise ValueError('invalid na_position: {!r}'.format(na_position))

    # specially handle Categorical
    if is_categorical_dtype(items):
        # The categorical's own argsort is only told the direction, so it
        # cannot honour na_position. Keep its ordering of the non-missing
        # values and re-attach the missing positions below.
        mask = np.asarray(isnull(items))
        sorted_idx = np.asarray(items.argsort(ascending=ascending))
        indexer = sorted_idx[~mask[sorted_idx]]
        nan_idx = np.nonzero(mask)[0]
    else:
        items = np.asanyarray(items)
        idx = np.arange(len(items))
        mask = isnull(items)
        non_nans = items[~mask]
        non_nan_idx = idx[~mask]
        nan_idx = np.nonzero(mask)[0]
        # reverse before and after sorting so that ties come out in their
        # original relative order for descending sorts as well
        if not ascending:
            non_nans = non_nans[::-1]
            non_nan_idx = non_nan_idx[::-1]
        indexer = non_nan_idx[non_nans.argsort(kind=kind)]
        if not ascending:
            indexer = indexer[::-1]

    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == 'last':
        return np.concatenate([indexer, nan_idx])
    return np.concatenate([nan_idx, indexer])
def test_constructor_categorical(self):
    """Series built from a Categorical, or with dtype='category'."""
    expected = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                              fastpath=True)
    result = Series(expected)
    tm.assert_categorical_equal(result.values, expected)

    # GH12574
    with self.assertRaises(ValueError):
        Series(pd.Categorical([1, 2, 3]), dtype='int64')

    from_categorical = Series(pd.Categorical([1, 2, 3]), dtype='category')
    self.assertTrue(is_categorical_dtype(from_categorical))
    self.assertTrue(is_categorical_dtype(from_categorical.dtype))

    from_list = Series([1, 2, 3], dtype='category')
    self.assertTrue(is_categorical_dtype(from_list))
    self.assertTrue(is_categorical_dtype(from_list.dtype))
def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
    """
    This is intended to be a drop-in replacement for np.argsort which
    handles NaNs. It adds ascending and na_position parameters.
    GH #6399, #5231

    Parameters
    ----------
    items : array-like or categorical
        Values to sort.
    kind : str, default 'quicksort'
        Sorting algorithm handed to ``ndarray.argsort``. Not used for
        categorical input, which sorts through its own ``argsort``.
    ascending : bool, default True
        Sort direction of the non-missing values.
    na_position : {'last', 'first'}, default 'last'
        Where the positions of missing values go in the result.

    Returns
    -------
    indexer : np.ndarray
        Positions that sort `items`.

    Raises
    ------
    ValueError
        If `na_position` is neither 'last' nor 'first'.
    """
    # validate first so that every input type rejects a bad value
    if na_position not in ('last', 'first'):
        raise ValueError('invalid na_position: {!r}'.format(na_position))

    # specially handle Categorical
    if is_categorical_dtype(items):
        # The categorical's own argsort is only told the direction, so it
        # cannot honour na_position. Keep its ordering of the non-missing
        # values and re-attach the missing positions below.
        mask = np.asarray(isnull(items))
        sorted_idx = np.asarray(items.argsort(ascending=ascending))
        indexer = sorted_idx[~mask[sorted_idx]]
        nan_idx = np.nonzero(mask)[0]
    else:
        items = np.asanyarray(items)
        idx = np.arange(len(items))
        mask = isnull(items)
        non_nans = items[~mask]
        non_nan_idx = idx[~mask]
        nan_idx = np.nonzero(mask)[0]
        # reverse before and after sorting so that ties come out in their
        # original relative order for descending sorts as well
        if not ascending:
            non_nans = non_nans[::-1]
            non_nan_idx = non_nan_idx[::-1]
        indexer = non_nan_idx[non_nans.argsort(kind=kind)]
        if not ascending:
            indexer = indexer[::-1]

    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == 'last':
        return np.concatenate([indexer, nan_idx])
    return np.concatenate([nan_idx, indexer])
def _is_dtype_compat(self, other):
    """
    *this is an internal non-public method*

    Check that `other` is compatible with the dtype of self, coercing it
    when it is not already categorical.

    Returns
    -------
    `other`, unwrapped to its ``_values`` when it was a CategoricalIndex,
    or coerced to a CategoricalIndex when it was not categorical.

    Raises
    ------
    TypeError if the dtypes are not compatible
    """
    if not is_categorical_dtype(other):
        # plain values: coerce onto our categories, then make sure nothing
        # fell outside of them
        values = other if is_list_like(other) else [other]
        coerced = self._create_categorical(
            self, other, categories=self.categories, ordered=self.ordered)
        other = CategoricalIndex(coerced)
        if not other.isin(values).all():
            raise TypeError("cannot append a non-category item to a "
                            "CategoricalIndex")
        return other

    if isinstance(other, CategoricalIndex):
        other = other._values
    if not other.is_dtype_equal(self):
        raise TypeError("categories must match existing categories "
                        "when appending")
    return other
def test_basic(self):
    """is_categorical_dtype / is_categorical on dtypes, Series, scalars."""
    self.assertTrue(is_categorical_dtype(self.dtype))

    cat = Categorical(list('abbaaccc'))
    ser = Series(cat, name='A')
    float_dtype = np.dtype('float64')

    # dtypes
    for check in (is_categorical_dtype, is_categorical):
        self.assertTrue(check(ser.dtype))
        self.assertTrue(check(ser))
        self.assertFalse(check(float_dtype))

    self.assertFalse(is_categorical(1.0))
def test_basic(self):
    """is_categorical_dtype / is_categorical on dtypes, Series, scalars."""
    self.assertTrue(is_categorical_dtype(self.dtype))

    cat = Categorical(list('abbaaccc'))
    ser = Series(cat, name='A')
    float_dtype = np.dtype('float64')

    # dtypes
    for check in (is_categorical_dtype, is_categorical):
        self.assertTrue(check(ser.dtype))
        self.assertTrue(check(ser))
        self.assertFalse(check(float_dtype))

    self.assertFalse(is_categorical(1.0))
def astype(self, dtype, copy=True):
    """
    Cast to an interval, object or categorical dtype.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype.
    copy : bool, default True
        Only consulted for an interval `dtype`: return a copy rather than
        self.

    Returns
    -------
    self or a copy of it (interval), an object-dtype Index (object), or an
    ordered Categorical (categorical).

    Raises
    ------
    ValueError
        For any other `dtype`.
    """
    if is_interval_dtype(dtype):
        return self.copy() if copy else self
    if is_object_dtype(dtype):
        return Index(self.values, dtype=object)
    if is_categorical_dtype(dtype):
        from pandas import Categorical
        return Categorical(self, ordered=True)
    raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
def unconvert(values, dtype, compress=None):
    """
    Turn serialized `values` back into an array of `dtype`.

    Parameters
    ----------
    values : ExtType or str
        Serialized data. An ``ExtType`` with code 0 contributes its
        ``data`` as-is; anything else is encoded to latin1 before decoding.
    dtype : dtype-like
        Target dtype. Categorical dtypes return `values` untouched; object
        dtypes wrap them in an object array.
    compress : {None, 'zlib', 'blosc'}
        Compression that was applied to `values`, if any.

    Returns
    -------
    np.ndarray, or `values` itself for a categorical `dtype`.

    Raises
    ------
    ValueError
        If `compress` is truthy but neither 'zlib' nor 'blosc'.
    """
    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to the copying path below

    # Copy the bytes into a new array. Binary-mode ``np.fromstring`` is
    # deprecated in NumPy; ``np.frombuffer`` on its own would hand back a
    # read-only view onto ``values``, hence the explicit copy.
    return np.frombuffer(values, dtype=dtype).copy()
def unconvert(values, dtype, compress=None):
    """
    Turn serialized `values` back into an array of `dtype`.

    Parameters
    ----------
    values : ExtType or str
        Serialized data. An ``ExtType`` with code 0 contributes its
        ``data`` as-is; anything else is encoded to latin1 before decoding.
    dtype : dtype-like
        Target dtype. Categorical dtypes return `values` untouched; object
        dtypes wrap them in an object array.
    compress : {None, 'zlib', 'blosc'}
        Compression that was applied to `values`, if any.

    Returns
    -------
    np.ndarray, or `values` itself for a categorical `dtype`.

    Raises
    ------
    ValueError
        If `compress` is truthy but neither 'zlib' nor 'blosc'.
    """
    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to the copying path below

    # Copy the bytes into a new array. Binary-mode ``np.fromstring`` is
    # deprecated in NumPy; ``np.frombuffer`` on its own would hand back a
    # read-only view onto ``values``, hence the explicit copy.
    return np.frombuffer(values, dtype=dtype).copy()
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
      (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass

    Raises
    ------
    TypeError
        If `data` is not a Series, or its values (its categories, for a
        categorical Series) are not datetimelike.
    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        # for a categorical Series the dtype checks below look at the
        # categories; the Series itself travels along as `orig`
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        # use the saved `name`: `data` may have been rebound to the
        # categories above, whose name is not the Series' name
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer',
                                                ambiguous='infer'),
                                  index, name=name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'),
                                   index, name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy),
                                    index, name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'),
                                      index, name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
      (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass

    Raises
    ------
    TypeError
        If `data` is not a Series, or its values (its categories, for a
        categorical Series) are not datetimelike.
    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        # for a categorical Series the dtype checks below look at the
        # categories; the Series itself travels along as `orig`
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        # use the saved `name`: `data` may have been rebound to the
        # categories above, whose name is not the Series' name
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer',
                                                ambiguous='infer'),
                                  index, name=name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'),
                                   index, name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy),
                                    index, name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'),
                                      index, name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
def test_setitem(self):
    """Assigning pd.cut output to DataFrame columns in four forms."""
    df = DataFrame({'A': range(10)})
    s = pd.cut(df.A, 5)
    self.assertIsInstance(s.cat.categories, IntervalIndex)

    # B & D end up as Categoricals
    # the remainder are converted to in-line objects
    # containing an IntervalIndex.values
    df['B'] = s
    df['C'] = np.array(s)
    df['D'] = s.values
    df['E'] = np.array(s.values)

    assert is_categorical_dtype(df['B'])
    assert is_interval_dtype(df['B'].cat.categories)
    assert is_categorical_dtype(df['D'])
    assert is_interval_dtype(df['D'].cat.categories)

    assert is_object_dtype(df['C'])
    assert is_object_dtype(df['E'])

    # they compare equal as Index
    # when converted to numpy objects
    c = lambda x: Index(np.array(x))
    tm.assert_index_equal(c(df.B), c(df.B), check_names=False)
    tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
    tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
    # column E was never compared; the B-vs-D check was repeated instead
    tm.assert_index_equal(c(df.B), c(df.E), check_names=False)

    # B & D are the same Series
    tm.assert_series_equal(df['B'], df['B'], check_names=False)
    tm.assert_series_equal(df['B'], df['D'], check_names=False)

    # C & E are the same Series
    tm.assert_series_equal(df['C'], df['C'], check_names=False)
    tm.assert_series_equal(df['C'], df['E'], check_names=False)
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """
    # captured before `values` is reduced; the hashtable choice below is
    # driven by this original dtype
    dtype = values.dtype

    # no need to revert to original type
    if needs_i8_conversion(dtype):
        values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    if is_signed_integer_dtype(dtype):
        return htable.duplicated_int64(_ensure_int64(values), keep=keep)
    if is_unsigned_integer_dtype(dtype):
        return htable.duplicated_uint64(_ensure_uint64(values), keep=keep)
    if is_float_dtype(dtype):
        return htable.duplicated_float64(_ensure_float64(values), keep=keep)
    return htable.duplicated_object(_ensure_object(values), keep=keep)
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """
    # captured before `values` is reduced; the hashtable choice below is
    # driven by this original dtype
    dtype = values.dtype

    # no need to revert to original type
    if needs_i8_conversion(dtype):
        values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    if is_signed_integer_dtype(dtype):
        return htable.duplicated_int64(_ensure_int64(values), keep=keep)
    if is_unsigned_integer_dtype(dtype):
        return htable.duplicated_uint64(_ensure_uint64(values), keep=keep)
    if is_float_dtype(dtype):
        return htable.duplicated_float64(_ensure_float64(values), keep=keep)
    return htable.duplicated_object(_ensure_object(values), keep=keep)
def test_categorical_ordering(self):
    """order_categoricals controls whether read_stata categoricals are ordered."""
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)
    parsed_115_unordered = read_stata(self.dta19_115,
                                      order_categoricals=False)
    parsed_117_unordered = read_stata(self.dta19_117,
                                      order_categoricals=False)

    # (frame, expected value of .cat.ordered) in the order they are checked
    expectations = (
        (parsed_115, True),
        (parsed_117, True),
        (parsed_115_unordered, False),
        (parsed_117_unordered, False),
    )

    for col in parsed_115:
        if is_categorical_dtype(parsed_115[col]):
            for frame, expected in expectations:
                self.assertEqual(expected, frame[col].cat.ordered)
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    The checks run in the order listed and the first match wins; anything
    unmatched maps to 'any'.

    ================ =================
    Pandas type      Table Schema type
    ================ =================
    integer          integer
    bool             boolean
    other numeric    number
    datetime64[ns]   datetime
    datetime64 tz    datetime
    period           datetime
    timedelta64[ns]  duration
    categorical      any
    string / object  string
    ================ =================
    """
    rules = (
        (is_integer_dtype, 'integer'),
        (is_bool_dtype, 'boolean'),
        (is_numeric_dtype, 'number'),
        (is_datetime64_dtype, 'datetime'),
        (is_datetime64tz_dtype, 'datetime'),
        (is_period_dtype, 'datetime'),
        (is_timedelta64_dtype, 'duration'),
        (is_categorical_dtype, 'any'),
        (is_string_dtype, 'string'),
    )
    for matches, table_type in rules:
        if matches(x):
            return table_type
    return 'any'
def mode(values):
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array whose most frequent value(s) are returned.

    Returns
    -------
    mode : Series
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor, values = values._constructor, values.values
    else:
        constructor, values = Series, np.asanyarray(values)

    dtype = values.dtype

    # the modes are sorted because hash order isn't necessarily defined
    if is_signed_integer_dtype(values):
        modes = np.sort(htable.mode_int64(_ensure_int64(values)))
        return constructor(modes, dtype=dtype)

    if is_unsigned_integer_dtype(values):
        modes = np.sort(htable.mode_uint64(_ensure_uint64(values)))
        return constructor(modes, dtype=dtype)

    if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        # hash on the i8 view, then restore the original dtype
        as_i8 = values.view(np.int64)
        return constructor(np.sort(htable.mode_int64(as_i8)), dtype=dtype)

    if is_categorical_dtype(values):
        return constructor(values.mode())

    modes = htable.mode_object(_ensure_object(values))
    try:
        modes = np.sort(modes)
    except TypeError as err:
        # unorderable values: warn and keep the unsorted modes
        warn("Unable to sort modes: %s" % err)
    return constructor(modes, dtype=dtype)
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    The checks run in the order listed and the first match wins; anything
    unmatched maps to 'any'.

    ================ =================
    Pandas type      Table Schema type
    ================ =================
    integer          integer
    bool             boolean
    other numeric    number
    datetime64[ns]   datetime
    datetime64 tz    datetime
    period           datetime
    timedelta64[ns]  duration
    categorical      any
    string / object  string
    ================ =================
    """
    rules = (
        (is_integer_dtype, 'integer'),
        (is_bool_dtype, 'boolean'),
        (is_numeric_dtype, 'number'),
        (is_datetime64_dtype, 'datetime'),
        (is_datetime64tz_dtype, 'datetime'),
        (is_period_dtype, 'datetime'),
        (is_timedelta64_dtype, 'duration'),
        (is_categorical_dtype, 'any'),
        (is_string_dtype, 'string'),
    )
    for matches, table_type in rules:
        if matches(x):
            return table_type
    return 'any'
def mode(values):
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array whose most frequent value(s) are returned.

    Returns
    -------
    mode : Series
    """
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor, values = values._constructor, values.values
    else:
        constructor, values = Series, np.asanyarray(values)

    dtype = values.dtype

    # the modes are sorted because hash order isn't necessarily defined
    if is_signed_integer_dtype(values):
        modes = np.sort(htable.mode_int64(_ensure_int64(values)))
        return constructor(modes, dtype=dtype)

    if is_unsigned_integer_dtype(values):
        modes = np.sort(htable.mode_uint64(_ensure_uint64(values)))
        return constructor(modes, dtype=dtype)

    if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        # hash on the i8 view, then restore the original dtype
        as_i8 = values.view(np.int64)
        return constructor(np.sort(htable.mode_int64(as_i8)), dtype=dtype)

    if is_categorical_dtype(values):
        return constructor(values.mode())

    modes = htable.mode_object(_ensure_object(values))
    try:
        modes = np.sort(modes)
    except TypeError as err:
        # unorderable values: warn and keep the unsorted modes
        warn("Unable to sort modes: %s" % err)
    return constructor(modes, dtype=dtype)
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Hash the elements of `vals`, one hash per element.

    Parameters
    ----------
    vals : array-like exposing ``dtype``
        Values to hash.
    encoding : str, default 'utf8'
        Passed on to the object / categorical hashing helpers.
    hash_key : optional
        Passed on to the same helpers; ``_default_hash_key`` when None.
    categorize : bool, default True
        For values that cannot be viewed as 64-bit integers, factorize
        first and hash the categories instead of every element.

    Returns
    -------
    The per-element hashes.
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't
    # ask numpy if categorical is a subdtype of complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        # forward the caller's options instead of silently falling back to
        # the defaults for the two halves
        real_hash = hash_array(vals.real, encoding, hash_key, categorize)
        imag_hash = hash_array(vals.imag, encoding, hash_key, categorize)
        return real_hash + 23 * imag_hash

    # First, turn whatever array this is into unsigned 64-bit ints, if we
    # can manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or
           is_timedelta64_dtype(vals) or
           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object
        # dtypes, then hash and rename categories. We allow skipping the
        # categorization when the values are known/likely to be unique.
        if categorize:
            codes, categories = pd.factorize(vals, sort=False)
            cat = pd.Categorical(codes, pd.Index(categories),
                                 ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        vals = hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
def lexsort_indexer(keys, orders=None, na_position='last'):
    """
    Build a lexicographic sort indexer over several keys.

    Parameters
    ----------
    keys : sequence of array-likes
        Sort keys, most significant first. Keys that are not already
        categorical are wrapped in an ordered Categorical.
    orders : bool, sequence of bool or None
        Ascending flag per key; a single bool applies to every key and
        None means ascending for all.
    na_position : {'last', 'first'}, default 'last'
        Where missing values (code -1) sort within each key.

    Returns
    -------
    The result of ``indexer_from_factorized`` on the adjusted codes.

    Raises
    ------
    ValueError
        If `na_position` is invalid (checked once per key).
    """
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    labels, shape = [], []
    for key, ascending in zip(keys, orders):
        # reuse an existing Categorical, otherwise create one
        cat = key if is_categorical_dtype(key) else Categorical(key,
                                                                ordered=True)

        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(na_position))

        n_cats = len(cat.categories)
        codes = cat.codes.copy()
        missing = (cat.codes == -1)
        nas_last = na_position == 'last'

        # shift / flip the codes so that missing values land on the
        # requested side
        if ascending and nas_last:
            codes = np.where(missing, n_cats, codes)
        elif ascending:
            codes += 1
        elif nas_last:
            codes = np.where(missing, n_cats, n_cats - codes - 1)
        else:
            codes = np.where(missing, 0, n_cats - codes)

        # missing values take up one extra slot
        shape.append(n_cats + 1 if missing.any() else n_cats)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
def lexsort_indexer(keys, orders=None, na_position='last'):
    """
    Build a lexicographic sort indexer over several keys.

    Parameters
    ----------
    keys : sequence of array-likes
        Sort keys, most significant first. Keys that are not already
        categorical are wrapped in an ordered Categorical.
    orders : bool, sequence of bool or None
        Ascending flag per key; a single bool applies to every key and
        None means ascending for all.
    na_position : {'last', 'first'}, default 'last'
        Where missing values (code -1) sort within each key.

    Returns
    -------
    The result of ``indexer_from_factorized`` on the adjusted codes.

    Raises
    ------
    ValueError
        If `na_position` is invalid (checked once per key).
    """
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    labels, shape = [], []
    for key, ascending in zip(keys, orders):
        # reuse an existing Categorical, otherwise create one
        cat = key if is_categorical_dtype(key) else Categorical(key,
                                                                ordered=True)

        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(na_position))

        n_cats = len(cat.categories)
        codes = cat.codes.copy()
        missing = (cat.codes == -1)
        nas_last = na_position == 'last'

        # shift / flip the codes so that missing values land on the
        # requested side
        if ascending and nas_last:
            codes = np.where(missing, n_cats, codes)
        elif ascending:
            codes += 1
        elif nas_last:
            codes = np.where(missing, n_cats, n_cats - codes - 1)
        else:
            codes = np.where(missing, 0, n_cats - codes)

        # missing values take up one extra slot
        shape.append(n_cats + 1 if missing.any() else n_cats)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Hash the elements of an array-like into 64-bit unsigned integers.

    Parameters
    ----------
    vals : array-like exposing ``dtype``
    encoding : string, default 'utf8'
        Passed on to the object / categorical hashing helpers.
    hash_key : string, optional
        Passed on to the hashing helpers; ``_default_hash_key`` when None.
    categorize : bool, default True
        On the object path, factorize the values first and hash through a
        Categorical instead of hashing every element directly.

    Returns
    -------
    array of hash values (the result of ``_hash_categorical`` on the
    categorical paths)
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are dispatched before the complex test below so that
    # numpy is never asked whether a categorical dtype is a complex
    # subdtype (it would choke).
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # Everything after this point is handled as 64-bit words, so the
    # 128-bit complex case is split into its two halves up front.
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # Bring the input into unsigned 64-bit form where that is possible.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or
           is_timedelta64_dtype(vals) or
           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        width = vals.dtype.itemsize
        vals = vals.view('u{}'.format(width)).astype('u8')
    elif categorize:
        # With repeated values it is much faster to factorize, hash the
        # categories and map the codes onto those hashes.
        codes, categories = pd.factorize(vals, sort=False)
        cat = pd.Categorical(codes, pd.Index(categories),
                             ordered=False, fastpath=True)
        return _hash_categorical(cat, encoding, hash_key)
    else:
        # Caller asked to skip categorization (values known/likely unique).
        vals = hash_object_array(vals, hash_key, encoding)

    # Spread the 64-bit values over the whole 64-bit space.
    for shift, multiplier in ((30, 0xbf58476d1ce4e5b9),
                              (27, 0x94d049bb133111eb)):
        vals ^= vals >> shift
        vals *= np.uint64(multiplier)
    vals ^= vals >> 31
    return vals
def make_field(arr, dtype=None):
    """
    Build the field descriptor dict for ``arr``.

    Parameters
    ----------
    arr : object exposing ``name`` and ``dtype``
        Categorical data must expose ``categories``/``ordered`` either
        directly or through ``.cat``; tz-aware data must expose ``tz``
        either directly or through ``.dt``.
    dtype : dtype, optional
        Dtype used for the 'type' entry. Defaults to ``arr.dtype``.

    Returns
    -------
    dict
        Always has 'name' and 'type'. Categorical data adds 'constraints'
        (an ``enum`` list of the categories) and 'ordered'; period data adds
        'freq'; tz-aware datetime data adds 'tz'.
    """
    # Explicit None test: NumPy dtypes without fields have len() == 0 and
    # are therefore falsy, so ``dtype or arr.dtype`` would silently discard
    # a dtype the caller passed on purpose.
    if dtype is None:
        dtype = arr.dtype

    # Only a missing name gets the placeholder; falsy but legitimate names
    # such as 0 or '' must be kept as they are.
    name = 'values' if arr.name is None else arr.name

    field = {'name': name,
             'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        # the attributes live on the object itself or behind the .cat
        # accessor, depending on what kind of container ``arr`` is
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_period_dtype(arr):
        field['freq'] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        # same split as above: .dt accessor when present, else direct
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
def make_field(arr, dtype=None):
    """
    Build the field descriptor dict for ``arr``.

    Parameters
    ----------
    arr : object exposing ``name`` and ``dtype``
        Categorical data must expose ``categories``/``ordered`` either
        directly or through ``.cat``; tz-aware data must expose ``tz``
        either directly or through ``.dt``.
    dtype : dtype, optional
        Dtype used for the 'type' entry. Defaults to ``arr.dtype``.

    Returns
    -------
    dict
        Always has 'name' and 'type'. Categorical data adds 'constraints'
        (an ``enum`` list of the categories) and 'ordered'; period data adds
        'freq'; tz-aware datetime data adds 'tz'.
    """
    # Explicit None test: NumPy dtypes without fields have len() == 0 and
    # are therefore falsy, so ``dtype or arr.dtype`` would silently discard
    # a dtype the caller passed on purpose.
    if dtype is None:
        dtype = arr.dtype

    # Only a missing name gets the placeholder; falsy but legitimate names
    # such as 0 or '' must be kept as they are.
    name = 'values' if arr.name is None else arr.name

    field = {'name': name,
             'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        # the attributes live on the object itself or behind the .cat
        # accessor, depending on what kind of container ``arr`` is
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_period_dtype(arr):
        field['freq'] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        # same split as above: .dt accessor when present, else direct
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
def convert(values):
    """
    Convert array values into a serializable form.

    Returns
    -------
    * ``values`` unchanged when it is categorical,
    * a flat Python list when the dtype is object,
    * otherwise ``ExtType(0, payload)`` where ``payload`` is the raw bytes
      of the flattened array, compressed with zlib or blosc when the
      module-level ``compressor`` is set to 'zlib' / 'blosc'.
    """
    dtype = values.dtype

    if is_categorical_dtype(values):
        return values
    if is_object_dtype(dtype):
        # object arrays (e.g. strings) are passed on as plain lists; this
        # is the only place object dtype needs handling, so the compression
        # branches below never see it
        return values.ravel().tolist()

    if needs_i8_conversion(dtype):
        values = values.view('i8')

    # tobytes() yields the same bytes as the deprecated tostring()
    raw = values.ravel().tobytes()

    if compressor == 'zlib':
        _check_zlib()
        return ExtType(0, zlib.compress(raw))
    if compressor == 'blosc':
        _check_blosc()
        # typesize comes from the original dtype, not the i8 view
        return ExtType(0, blosc.compress(raw, typesize=dtype.itemsize))

    # uncompressed ndarray bytes
    return ExtType(0, raw)
def convert(values):
    """
    Convert array values into a serializable form.

    Returns
    -------
    * ``values`` unchanged when it is categorical,
    * a flat Python list when the dtype is object,
    * otherwise ``ExtType(0, payload)`` where ``payload`` is the raw bytes
      of the flattened array, compressed with zlib or blosc when the
      module-level ``compressor`` is set to 'zlib' / 'blosc'.
    """
    dtype = values.dtype

    if is_categorical_dtype(values):
        return values
    if is_object_dtype(dtype):
        # object arrays (e.g. strings) are passed on as plain lists; this
        # is the only place object dtype needs handling, so the compression
        # branches below never see it
        return values.ravel().tolist()

    if needs_i8_conversion(dtype):
        values = values.view('i8')

    # tobytes() yields the same bytes as the deprecated tostring()
    raw = values.ravel().tobytes()

    if compressor == 'zlib':
        _check_zlib()
        return ExtType(0, zlib.compress(raw))
    if compressor == 'blosc':
        _check_blosc()
        # typesize comes from the original dtype, not the i8 view
        return ExtType(0, blosc.compress(raw, typesize=dtype.itemsize))

    # uncompressed ndarray bytes
    return ExtType(0, raw)
def _get_data_algo(values, func_map):
    """
    Select the entry of ``func_map`` that matches the dtype of ``values``
    and coerce ``values`` to go with it.

    Parameters
    ----------
    values : array-like
    func_map : mapping
        Looked up with the keys 'float64', 'int64', 'uint64', 'string' and
        'object'. 'string' is optional.

    Returns
    -------
    (func, values) : the selected entry and the coerced values
    """
    if is_categorical_dtype(values):
        values = values._values_for_rank()

    func = None
    if is_float_dtype(values):
        func = func_map['float64']
        values = _ensure_float64(values)
    elif needs_i8_conversion(values):
        func = func_map['int64']
        values = values.view('i8')
    elif is_signed_integer_dtype(values):
        func = func_map['int64']
        values = _ensure_int64(values)
    elif is_unsigned_integer_dtype(values):
        func = func_map['uint64']
        values = _ensure_uint64(values)
    else:
        values = _ensure_object(values)

        # A string hash table is cheaper than an object one, so prefer the
        # 'string' entry when the map offers it.
        if lib.infer_dtype(values) == 'string':
            try:
                func = func_map['string']
            except KeyError:
                pass

    # Anything still unresolved falls back to the generic object entry.
    if func is None:
        func = func_map['object']

    return func, values
def test_categorical_order(self):
    """
    Stata categoricals must keep their category order and codes when read
    from the dta19 fixtures.
    """
    # Each entry: (is categorical, column name, labels in order, data)
    spec = [
        (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
        (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
        (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
         np.array([2, 1, 4, 0, 3])),
        (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
        (True, 'float_missing', ['a', 'd', 'e'],
         np.array([0, 1, 2, -1, -1])),
        (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
        (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5)),
    ]

    # Build the expected frame directly from the codes above.
    cols = [
        (col,
         pd.Categorical.from_codes(codes, labels) if is_cat
         else pd.Series(labels, dtype=np.float32))
        for is_cat, col, labels, codes in spec
    ]
    expected = DataFrame.from_items(cols)

    # Both file versions must produce the same frame.
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)
    for parsed in (parsed_115, parsed_117):
        tm.assert_frame_equal(expected, parsed, check_categorical=False)

    # Codes and categories are compared column by column against the
    # version 115 result.
    for col in expected:
        if not is_categorical_dtype(expected[col]):
            continue
        tm.assert_series_equal(expected[col].cat.codes,
                               parsed_115[col].cat.codes)
        tm.assert_index_equal(expected[col].cat.categories,
                              parsed_115[col].cat.categories)
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):
    """
    Assign every element of ``x`` to one of the intervals given by ``bins``.

    Parameters
    ----------
    x : array-like
    bins : IntervalIndex or array of bin edges
    right : bool, default True
        Selects the ``searchsorted`` side ('left' when True) and is passed
        to the label formatter.
    labels : array-like, None or False
        None formats labels from the bins, False returns integer bin ids,
        anything else must have ``len(bins) - 1`` entries.
    precision, include_lowest, dtype
        Passed to ``_format_labels`` when labels are generated.
        ``include_lowest`` also puts values equal to ``bins[0]`` into the
        first bin.
    duplicates : {'raise', 'drop'}
        What to do when the bin edges are not unique.

    Returns
    -------
    (result, bins)

    Raises
    ------
    ValueError
        For an invalid ``duplicates`` value, non-unique edges with
        ``duplicates='raise'``, or a wrong number of labels.
    """
    if duplicates not in ('raise', 'drop'):
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    # fast path: the intervals are already built
    if isinstance(bins, IntervalIndex):
        positions = bins.get_indexer(x)
        taken = algos.take_nd(bins, positions)
        return Categorical(taken, categories=bins, ordered=True), bins

    unique_bins = algos.unique(bins)
    has_duplicate_edges = len(unique_bins) < len(bins) and len(bins) != 2
    if has_duplicate_edges:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))
        bins = unique_bins

    side = 'left' if right else 'right'
    ids = _ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    # missing values and values outside the outermost edges have no bin
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        # integer bin numbers; float is needed to hold NaN for "no bin"
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)
        return result, bins

    if labels is None:
        labels = _format_labels(bins, precision, right=right,
                                include_lowest=include_lowest,
                                dtype=dtype)
    elif len(labels) != len(bins) - 1:
        raise ValueError('Bin labels must be one fewer than '
                         'the number of bin edges')

    if not is_categorical_dtype(labels):
        labels = Categorical(labels, ordered=True)

    # "no bin" ids are zeroed so that ``ids - 1`` turns them into -1
    np.putmask(ids, na_mask, 0)
    result = algos.take_nd(labels, ids - 1)
    return result, bins
def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are replaced by their integer codes. This happens before
    # the complex test so that numpy is never asked whether a categorical
    # dtype is a complex subdtype (it would choke).
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # Everything after this point is handled as 64-bit words, so the
    # 128-bit complex case is split into its two halves up front.
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    inferred = infer_dtype(vals)

    # Bring the input into unsigned 64-bit form where that is possible.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    dtype = vals.dtype
    numeric_like = any(np.issubdtype(dtype, kind)
                       for kind in (np.datetime64, np.timedelta64,
                                    np.number))
    if numeric_like and dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(dtype.itemsize)).astype('u8')
    else:
        # It is much faster to factorize object data, hash only the
        # distinct values and then map the codes onto those hashes.
        codes, uniques = factorize(vals, sort=False)
        uniques = Index(uniques)
        as_cat = Series(Categorical(codes, uniques,
                                    ordered=False, fastpath=True))
        hashed = _hash.hash_object_array(uniques.values,
                                         hash_key, encoding)

        # swap each category for its hash and pull out the raw values
        renamed = as_cat.cat.rename_categories(Index(hashed))
        vals = renamed.astype(np.uint64).values

    # Spread the 64-bit values over the whole 64-bit space.
    for shift, multiplier in ((30, 0xbf58476d1ce4e5b9),
                              (27, 0x94d049bb133111eb)):
        vals ^= vals >> shift
        vals *= np.uint64(multiplier)
    vals ^= vals >> 31
    return vals
def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are replaced by their integer codes. This happens before
    # the complex test so that numpy is never asked whether a categorical
    # dtype is a complex subdtype (it would choke).
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # Everything after this point is handled as 64-bit words, so the
    # 128-bit complex case is split into its two halves up front.
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    inferred = infer_dtype(vals)

    # Bring the input into unsigned 64-bit form where that is possible.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    dtype = vals.dtype
    numeric_like = any(np.issubdtype(dtype, kind)
                       for kind in (np.datetime64, np.timedelta64,
                                    np.number))
    if numeric_like and dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(dtype.itemsize)).astype('u8')
    else:
        # It is much faster to factorize object data, hash only the
        # distinct values and then map the codes onto those hashes.
        codes, uniques = factorize(vals, sort=False)
        uniques = Index(uniques)
        as_cat = Series(Categorical(codes, uniques,
                                    ordered=False, fastpath=True))
        hashed = _hash.hash_object_array(uniques.values,
                                         hash_key, encoding)

        # swap each category for its hash and pull out the raw values
        renamed = as_cat.cat.rename_categories(Index(hashed))
        vals = renamed.astype(np.uint64).values

    # Spread the 64-bit values over the whole 64-bit space.
    for shift, multiplier in ((30, 0xbf58476d1ce4e5b9),
                              (27, 0x94d049bb133111eb)):
        vals ^= vals >> shift
        vals *= np.uint64(multiplier)
    vals ^= vals >> 31
    return vals
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is
        more efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are dispatched before the complex test below so that
    # numpy is never asked whether a categorical dtype is a complex
    # subdtype (it would choke).
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # Everything after this point is handled as 64-bit words, so the
    # 128-bit complex case is split into its two halves up front.
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # Bring the input into unsigned 64-bit form where that is possible.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or
           is_timedelta64_dtype(vals) or
           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        width = vals.dtype.itemsize
        vals = vals.view('u{}'.format(width)).astype('u8')
    elif categorize:
        # With repeated values it is much faster to factorize, hash the
        # categories and map the codes onto those hashes.
        codes, categories = factorize(vals, sort=False)
        cat = Categorical(codes, Index(categories),
                          ordered=False, fastpath=True)
        return _hash_categorical(cat, encoding, hash_key)
    else:
        # Caller asked to skip categorization (values known/likely unique).
        vals = _hash.hash_object_array(vals, hash_key, encoding)

    # Spread the 64-bit values over the whole 64-bit space.
    for shift, multiplier in ((30, 0xbf58476d1ce4e5b9),
                              (27, 0x94d049bb133111eb)):
        vals ^= vals >> shift
        vals *= np.uint64(multiplier)
    vals ^= vals >> 31
    return vals
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is
        more efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    Raises
    ------
    TypeError
        If ``vals`` has no ``dtype`` attribute.
    """
    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are dispatched before the complex test below so that
    # numpy is never asked whether a categorical dtype is a complex
    # subdtype (it would choke).
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # Everything after this point is handled as 64-bit words, so the
    # 128-bit complex case is split into its two halves up front.
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # Bring the input into unsigned 64-bit form where that is possible.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif is_datetime64_dtype(vals) or is_timedelta64_dtype(vals):
        vals = vals.view('i8').astype('u8', copy=False)
    elif is_numeric_dtype(vals) and vals.dtype.itemsize <= 8:
        width = vals.dtype.itemsize
        vals = vals.view('u{}'.format(width)).astype('u8')
    elif categorize:
        # With repeated values it is much faster to factorize, hash the
        # categories and map the codes onto those hashes.
        codes, categories = factorize(vals, sort=False)
        cat = Categorical(codes, Index(categories),
                          ordered=False, fastpath=True)
        return _hash_categorical(cat, encoding, hash_key)
    else:
        # Caller asked to skip categorization (values known/likely unique).
        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # mixed types: retry on the string form of every element
            as_text = vals.astype(str).astype(object)
            vals = _hash.hash_object_array(as_text, hash_key, encoding)

    # Spread the 64-bit values over the whole 64-bit space.
    for shift, multiplier in ((30, 0xbf58476d1ce4e5b9),
                              (27, 0x94d049bb133111eb)):
        vals ^= vals >> shift
        vals *= np.uint64(multiplier)
    vals ^= vals >> 31
    return vals