def na_op(x, y):
    # dispatch to the categorical if we have a categorical
    # in either operand
    if com.is_categorical_dtype(x):
        return op(x, y)
    elif com.is_categorical_dtype(y) and not lib.isscalar(y):
        return op(y, x)

    if x.dtype == np.object_:
        if isinstance(y, list):
            y = lib.list_to_object_array(y)

        if isinstance(y, (np.ndarray, pd.Series)):
            if y.dtype != np.object_:
                result = lib.vec_compare(x, y.astype(np.object_), op)
            else:
                result = lib.vec_compare(x, y, op)
        else:
            result = lib.scalar_compare(x, y, op)
    else:
        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

    return result
def na_op(x, y):
    if (com.is_categorical_dtype(x) !=
            (not np.isscalar(y) and com.is_categorical_dtype(y))):
        msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
              "compare values, use 'series <op> np.asarray(cat)'."
        raise TypeError(msg.format(op=op, typ=type(y)))

    if x.dtype == np.object_:
        if isinstance(y, list):
            y = lib.list_to_object_array(y)

        if isinstance(y, (pa.Array, pd.Series)):
            if y.dtype != np.object_:
                result = lib.vec_compare(x, y.astype(np.object_), op)
            else:
                result = lib.vec_compare(x, y, op)
        else:
            result = lib.scalar_compare(x, y, op)
    else:
        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

    return result
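# Behavior sketch of the guard above (an illustration, not part of the
# original module; it applies to the pandas era this na_op comes from,
# where a Series-vs-Categorical dtype mismatch raises instead of comparing):
import numpy as np
import pandas as pd

s = pd.Series(['a', 'b'])
cat = pd.Categorical(['a', 'b'])
# s == cat  -> TypeError under this na_op (categorical dtype mismatch)
print(s == np.asarray(cat))  # the value-wise comparison the message suggests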
def na_op(x, y):
    if com.is_categorical_dtype(x) != com.is_categorical_dtype(y):
        msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
              "compare values, use 'series <op> np.asarray(cat)'."
        raise TypeError(msg.format(op=op, typ=type(y)))

    if x.dtype == np.object_:
        if isinstance(y, list):
            y = lib.list_to_object_array(y)

        if isinstance(y, (pa.Array, pd.Series)):
            if y.dtype != np.object_:
                result = lib.vec_compare(x, y.astype(np.object_), op)
            else:
                result = lib.vec_compare(x, y, op)
        else:
            result = lib.scalar_compare(x, y, op)
    else:
        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

    return result
def reindex(self, target, method=None, level=None, limit=None,
            tolerance=None):
    """
    Create index with target's values (move/add/delete values as necessary)

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index
    """
    if method is not None:
        raise NotImplementedError("argument method is not implemented for "
                                  "CategoricalIndex.reindex")
    if level is not None:
        raise NotImplementedError("argument level is not implemented for "
                                  "CategoricalIndex.reindex")
    if limit is not None:
        raise NotImplementedError("argument limit is not implemented for "
                                  "CategoricalIndex.reindex")

    target = ibase._ensure_index(target)

    if not com.is_categorical_dtype(target) and not target.is_unique:
        raise ValueError("cannot reindex with a non-unique indexer")

    indexer, missing = self.get_indexer_non_unique(np.array(target))
    new_target = self.take(indexer)

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(
                np.array(target))
        else:
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            new_target = self._create_from_codes(codes)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if com.is_categorical_dtype(target):
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
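# Usage sketch for CategoricalIndex.reindex (illustrative; the return
# types follow the coercion comments above and vary across pandas versions):
#
#   import pandas as pd
#   ci = pd.CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'])
#   new_index, indexer = ci.reindex(['a', 'c'])
#   # new_index -> Index(['a', 'c'], ...) since the target is a plain list
#   # indexer   -> positions of the matches in ci, e.g. array([0, 2])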
def wrapper(self, other, axis=None):
    # Validate the axis parameter
    if axis is not None:
        self._get_axis_number(axis)

    if isinstance(other, pd.Series):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (np.ndarray, pd.Index)):
        if len(self) != len(other):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        if not com.is_categorical_dtype(self):
            msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                  "If you want to compare values, use 'series <op> np.asarray(other)'."
            raise TypeError(msg.format(op=op, typ=self.dtype))

    mask = isnull(self)

    if com.is_categorical_dtype(self):
        # cats are a special case as get_values() would return an ndarray,
        # which would then not take categories ordering into account
        # we can go directly to op, as the na_op would just test again and
        # dispatch to it.
        res = op(self.values, other)
    else:
        values = self.get_values()
        other = _index.convert_scalar(values, _values_from_object(other))

        if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
            values = values.view('i8')

        # scalars
        res = na_op(values, other)
        if np.isscalar(res):
            raise TypeError('Could not compare %s type with Series' %
                            type(other))

    # always return a full value series here
    res = _values_from_object(res)
    res = pd.Series(res, index=self.index, name=self.name, dtype='bool')

    # mask out the invalids
    if mask.any():
        res[mask] = masker
    return res
def na_op(x, y):
    # dispatch to the categorical if we have a categorical
    # in either operand
    if is_categorical_dtype(x):
        return op(x, y)
    elif is_categorical_dtype(y) and not isscalar(y):
        return op(y, x)

    if is_object_dtype(x.dtype):
        result = _comp_method_OBJECT_ARRAY(op, x, y)
    else:
        # we want to compare like types
        # we only want to convert to integer like if
        # we are not NotImplemented, otherwise
        # we would allow datetime64 (but viewed as i8) against
        # integer comparisons
        if is_datetimelike_v_numeric(x, y):
            raise TypeError("invalid type comparison")

        # numpy does not like comparisons vs None
        if isscalar(y) and isnull(y):
            if name == '__ne__':
                return np.ones(len(x), dtype=bool)
            else:
                return np.zeros(len(x), dtype=bool)

        # we have a datetime/timedelta and may need to convert
        mask = None
        if (needs_i8_conversion(x) or
                (not isscalar(y) and needs_i8_conversion(y))):

            if isscalar(y):
                mask = isnull(x)
                y = _index.convert_scalar(x, _values_from_object(y))
            else:
                mask = isnull(x) | isnull(y)
                y = y.view('i8')
            x = x.view('i8')

        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

        if mask is not None and mask.any():
            result[mask] = masker

    return result
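# Behavior sketch of the "comparisons vs None" short-circuit above: a null
# scalar compares unequal to everything, so only __ne__ is all-True.
import pandas as pd

s = pd.Series([1.0, 2.0])
print((s == None).tolist())  # [False, False]
print((s != None).tolist())  # [True, True]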
def wrapper(self, other, axis=None):
    # Validate the axis parameter
    if axis is not None:
        self._get_axis_number(axis)

    if isinstance(other, ABCSeries):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (np.ndarray, pd.Index)):
        # do not check length of zerodim array
        # as it will broadcast
        if (not lib.isscalar(lib.item_from_zerodim(other)) and
                len(self) != len(other)):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        if not is_categorical_dtype(self):
            msg = ("Cannot compare a Categorical for op {op} with Series "
                   "of dtype {typ}.\nIf you want to compare values, use "
                   "'series <op> np.asarray(other)'.")
            raise TypeError(msg.format(op=op, typ=self.dtype))

    if is_categorical_dtype(self):
        # cats are a special case as get_values() would return an ndarray,
        # which would then not take categories ordering into account
        # we can go directly to op, as the na_op would just test again and
        # dispatch to it.
        res = op(self.values, other)
    else:
        values = self.get_values()
        if isinstance(other, (list, np.ndarray)):
            other = np.asarray(other)

        res = na_op(values, other)
        if isscalar(res):
            raise TypeError('Could not compare %s type with Series' %
                            type(other))

        # always return a full value series here
        res = _values_from_object(res)

    res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
    return res
def mode(values):
    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype
    if com.is_integer_dtype(values):
        values = com._ensure_int64(values)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    elif com.is_categorical_dtype(values):
        result = constructor(values.mode())
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        res = htable.mode_object(values, mask)
        try:
            res = sorted(res)
        except TypeError as e:
            warn("Unable to sort modes: %s" % e)
        result = constructor(res, dtype=dtype)

    return result
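# Usage sketch via the public API this helper backs (the private
# `com`/`htable` imports above exist only inside pandas itself):
import pandas as pd

s = pd.Series([1, 2, 2, 3, 3])
print(s.mode().tolist())  # [2, 3] -- tied modes come back sorted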
def _is_dtype_compat(self, other):
    """
    *this is an internal non-public method*

    provide a comparison between the dtype of self and other (coercing if
    needed)

    Raises
    ------
    TypeError if the dtypes are not compatible
    """
    if com.is_categorical_dtype(other):
        if isinstance(other, CategoricalIndex):
            other = other._values
        if not other.is_dtype_equal(self):
            raise TypeError("categories must match existing categories "
                            "when appending")
    else:
        values = other
        if not com.is_list_like(values):
            values = [values]
        other = CategoricalIndex(
            self._create_categorical(self, other,
                                     categories=self.categories,
                                     ordered=self.ordered))
        if not other.isin(values).all():
            raise TypeError("cannot append a non-category item to a "
                            "CategoricalIndex")

    return other
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """
    typs = set()
    for arr in l:

        dtype = arr.dtype
        if com.is_categorical_dtype(dtype):
            typ = 'category'
        elif com.is_sparse(arr):
            typ = 'sparse'
        elif com.is_datetimetz(arr):
            typ = 'datetimetz'
        elif com.is_datetime64_dtype(dtype):
            typ = 'datetime'
        elif com.is_timedelta64_dtype(dtype):
            typ = 'timedelta'
        elif com.is_object_dtype(dtype):
            typ = 'object'
        elif com.is_bool_dtype(dtype):
            typ = 'bool'
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
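# Usage sketch (illustrative; `get_dtype_kinds` needs the pandas-internal
# `com` helpers above, so the call is shown rather than executed):
#
#   import numpy as np
#   import pandas as pd
#   arrs = [np.array([1, 2]),               # integer -> kind 'i'
#           pd.Categorical(['a', 'b']),     # -> 'category'
#           np.array(['x'], dtype=object)]  # -> 'object'
#   get_dtype_kinds(arrs)  # {'i', 'category', 'object'}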
def _make_str_accessor(self):
    from pandas.core.series import Series
    from pandas.core.index import Index

    if isinstance(self, Series) and not (
            (is_categorical_dtype(self.dtype) and
             is_object_dtype(self.values.categories)) or
            (is_object_dtype(self.dtype))):
        # it's neither a string series nor a categorical series with
        # strings inside the categories.
        # this really should exclude all series with any non-string values
        # (instead of test for object dtype), but that isn't practical for
        # performance reasons until we have a str dtype (GH 9343)
        raise AttributeError("Can only use .str accessor with string "
                             "values, which use np.object_ dtype in "
                             "pandas")
    elif isinstance(self, Index):
        # see src/inference.pyx which can contain string values
        allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
        if self.inferred_type not in allowed_types:
            message = ("Can only use .str accessor with string values "
                       "(i.e. inferred_type is 'string', 'unicode' or "
                       "'mixed')")
            raise AttributeError(message)
        if self.nlevels > 1:
            message = ("Can only use .str accessor with Index, not "
                       "MultiIndex")
            raise AttributeError(message)
    return StringMethods(self)
def test_constructor_categorical(self):
    cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                         fastpath=True)
    res = Series(cat)
    self.assertTrue(res.values.equals(cat))

    # GH12574
    self.assertRaises(
        ValueError, lambda: Series(pd.Categorical([1, 2, 3]),
                                   dtype='int64'))
    cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
    self.assertTrue(com.is_categorical_dtype(cat))
    self.assertTrue(com.is_categorical_dtype(cat.dtype))
    s = Series([1, 2, 3], dtype='category')
    self.assertTrue(com.is_categorical_dtype(s))
    self.assertTrue(com.is_categorical_dtype(s.dtype))
def test_categorical_order(self):
    # Directly construct using expected codes
    # Format is is_cat, col_name, labels (in order), underlying data
    expected = [
        (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
        (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
        (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
         np.array([2, 1, 4, 0, 3])),
        (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
        (True, 'float_missing', ['a', 'd', 'e'],
         np.array([0, 1, 2, -1, -1])),
        (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
        (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))
    ]
    cols = []
    for is_cat, col, labels, codes in expected:
        if is_cat:
            cols.append((col, pd.Categorical.from_codes(codes, labels)))
        else:
            cols.append((col, pd.Series(labels, dtype=np.float32)))
    expected = DataFrame.from_items(cols)

    # Read with and without categoricals, ensure order is identical
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)
    tm.assert_frame_equal(expected, parsed_115, check_categorical=False)
    tm.assert_frame_equal(expected, parsed_117, check_categorical=False)

    # Check identity of codes
    for col in expected:
        if is_categorical_dtype(expected[col]):
            tm.assert_series_equal(expected[col].cat.codes,
                                   parsed_115[col].cat.codes)
            tm.assert_index_equal(expected[col].cat.categories,
                                  parsed_115[col].cat.categories)
def test_categorical_order(self):
    # Directly construct using expected codes
    # Format is is_cat, col_name, labels (in order), underlying data
    expected = [
        (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
        (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
        (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
         np.array([2, 1, 4, 0, 3])),
        (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
        (True, 'float_missing', ['a', 'd', 'e'],
         np.array([0, 1, 2, -1, -1])),
        (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
        (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))
    ]
    cols = []
    for is_cat, col, labels, codes in expected:
        if is_cat:
            cols.append((col, pd.Categorical.from_codes(codes, labels)))
        else:
            cols.append((col, pd.Series(labels, dtype=np.float32)))
    expected = DataFrame.from_items(cols)

    # Read with and without categoricals, ensure order is identical
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)
    tm.assert_frame_equal(expected, parsed_115)
    tm.assert_frame_equal(expected, parsed_117)

    # Check identity of codes
    for col in expected:
        if is_categorical_dtype(expected[col]):
            tm.assert_series_equal(expected[col].cat.codes,
                                   parsed_115[col].cat.codes)
            tm.assert_index_equal(expected[col].cat.categories,
                                  parsed_115[col].cat.categories)
def test_basic(self):
    self.assertTrue(is_categorical_dtype(self.dtype))

    factor = Categorical.from_array(["a", "b", "b", "a", "a", "c",
                                     "c", "c"])
    s = Series(factor, name="A")

    # dtypes
    self.assertTrue(is_categorical_dtype(s.dtype))
    self.assertTrue(is_categorical_dtype(s))
    self.assertFalse(is_categorical_dtype(np.dtype("float64")))

    self.assertTrue(is_categorical(s.dtype))
    self.assertTrue(is_categorical(s))
    self.assertFalse(is_categorical(np.dtype("float64")))
    self.assertFalse(is_categorical(1.0))
def wrapper(self, other, axis=None):
    # Validate the axis parameter
    if axis is not None:
        self._get_axis_number(axis)

    if isinstance(other, ABCSeries):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (np.ndarray, pd.Index)):
        if len(self) != len(other):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        if not is_categorical_dtype(self):
            msg = ("Cannot compare a Categorical for op {op} with Series "
                   "of dtype {typ}.\nIf you want to compare values, use "
                   "'series <op> np.asarray(other)'.")
            raise TypeError(msg.format(op=op, typ=self.dtype))

    if is_categorical_dtype(self):
        # cats are a special case as get_values() would return an ndarray,
        # which would then not take categories ordering into account
        # we can go directly to op, as the na_op would just test again and
        # dispatch to it.
        res = op(self.values, other)
    else:
        values = self.get_values()
        if isinstance(other, (list, np.ndarray)):
            other = np.asarray(other)

        res = na_op(values, other)
        if isscalar(res):
            raise TypeError('Could not compare %s type with Series' %
                            type(other))

        # always return a full value series here
        res = _values_from_object(res)

    res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
    return res
def encode_categorical(table, columns=None, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns
    according to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    columns : list-like, optional, default: None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.

    allow_drop : boolean, optional, default: True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        if not is_categorical_dtype(table.dtype) \
                and not table.dtype.char == "O":
            raise TypeError(
                "series must be of categorical dtype, but was {}".format(
                    table.dtype))
        return _encode_categorical_series(table, **kwargs)

    def _is_categorical_or_object(series):
        return is_categorical_dtype(series.dtype) or series.dtype.char == "O"

    if columns is None:
        # for columns containing categories
        columns_to_encode = {nam for nam, s in table.iteritems()
                             if _is_categorical_or_object(s)}
    else:
        columns_to_encode = set(columns)

    items = []
    for name, series in table.iteritems():
        if name in columns_to_encode:
            series = _encode_categorical_series(series, **kwargs)
            if series is None:
                continue
        items.append(series)

    # concat columns of tables
    new_table = pandas.concat(items, axis=1, copy=False)
    return new_table
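# Usage sketch (illustrative; `_encode_categorical_series` is the
# module-private helper referenced above):
#
#   import pandas
#   df = pandas.DataFrame({'age': [12.0, 61.0, 18.0],
#                          'sex': pandas.Categorical(['m', 'f', 'm'])})
#   encoded = encode_categorical(df)
#   # 'age' passes through unchanged; 'sex' (2 categories) becomes a
#   # single indicator column under the documented M-1 scheme.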
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is
    a single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation in the current implementation this
        is always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """
    from pandas.core.categorical import Categorical

    def convert_categorical(x):
        # coerce to object dtype
        if com.is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    if get_dtype_kinds(to_concat) - set(['object', 'category']):
        # convert to object type and perform a regular concat
        return _concat_compat([np.array(x, copy=False, dtype=object)
                               for x in to_concat], axis=0)

    # we could have object blocks and categoricals here
    # if we only have a single categorical then combine everything
    # else it's a non-compat categorical
    categoricals = [x for x in to_concat
                    if com.is_categorical_dtype(x.dtype)]

    # validate the categories
    categories = categoricals[0]
    rawcats = categories.categories
    for x in categoricals[1:]:
        if not categories.is_dtype_equal(x):
            raise ValueError("incompatible categories in categorical "
                             "concat")

    # we've already checked that all categoricals are the same, so if their
    # length is equal to the input then we have all the same categories
    if len(categoricals) == len(to_concat):
        # concatenating numeric types is much faster than concatenating
        # object types and fastpath takes a shorter path through the
        # constructor
        return Categorical(np.concatenate([x.codes for x in to_concat],
                                          axis=0),
                           rawcats, ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        concatted = np.concatenate(list(map(convert_categorical,
                                            to_concat)), axis=0)
        return Categorical(concatted, rawcats)
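# Behavior sketch: inputs sharing identical categories take the fast
# codes-based path; mixed inputs are coerced to values and re-categorized.
#
#   import pandas as pd
#   a = pd.Categorical(['x', 'y'], categories=['x', 'y'])
#   b = pd.Categorical(['y', 'x'], categories=['x', 'y'])
#   _concat_categorical([a, b])  # -> Categorical ['x', 'y', 'y', 'x']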
def encode_categorical(table, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns
    according to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    allow_drop : boolean, optional, default=True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        if not is_categorical_dtype(table.dtype) \
                and not table.dtype.char == "O":
            raise TypeError(
                "series must be of categorical dtype, but was {}".format(
                    table.dtype))
        return _encode_categorical_series(table, **kwargs)
    else:
        new_table = pandas.DataFrame(index=table.index)
        for j in range(table.shape[1]):
            series = table.iloc[:, j]

            # for columns containing categories
            if is_categorical_dtype(series.dtype) or series.dtype.char == "O":
                series = _encode_categorical_series(series, **kwargs)
                if series is None:
                    continue

            # concat columns of tables
            new_table = pandas.concat((new_table, series), axis=1,
                                      copy=False)
    return new_table
def pandas_col_to_ibis_type(col):
    import pandas.core.common as pdcom
    import ibis.expr.datatypes as dt
    import numpy as np

    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64_dtype(dty):
        if pdcom.is_datetime64_ns_dtype(dty):
            return 'timestamp'
        else:
            raise com.IbisTypeError("Column {0} has dtype {1}, which is "
                                    "datetime64-like but does "
                                    "not use nanosecond units"
                                    .format(col.name, dty))

    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return 'int64'

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return 'boolean'

    # simple numerical types
    if issubclass(dty.type, np.int8):
        return 'int8'
    if issubclass(dty.type, np.int16):
        return 'int16'
    if issubclass(dty.type, np.int32):
        return 'int32'
    if issubclass(dty.type, np.int64):
        return 'int64'
    if issubclass(dty.type, np.float32):
        return 'float'
    if issubclass(dty.type, np.float64):
        return 'double'
    if issubclass(dty.type, np.uint8):
        return 'int16'
    if issubclass(dty.type, np.uint16):
        return 'int32'
    if issubclass(dty.type, np.uint32):
        return 'int64'
    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError("Column {0} is an unsigned int64"
                                .format(col.name))

    if pdcom.is_object_dtype(dty):
        # TODO: overly broad?
        return 'string'

    raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))
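# Mapping sketch (illustrative; the integer column dtype assumes a
# platform where the pandas default is int64):
#
#   import pandas as pd
#   df = pd.DataFrame({'a': [1], 'b': [1.5], 'c': pd.Categorical(['x'])})
#   pandas_col_to_ibis_type(df['a'])  # 'int64'
#   pandas_col_to_ibis_type(df['b'])  # 'double'
#   pandas_col_to_ibis_type(df['c'])  # dt.Category(1)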
def describe_categorical_1d(data):
    # Only run if at least 1 non-missing value
    objcounts = data.value_counts()
    top, freq = objcounts.index[0], objcounts.iloc[0]

    names = []
    result = []

    if data.dtype == object or com.is_categorical_dtype(data.dtype):
        names += ['top', 'freq']
        result += [top, freq]

    return pd.Series(result, index=names, name=data.name)
def unconvert(values, dtype, compress=None):
    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in
                # many string creating functions in the capi. This case
                # should not warn even though we need to make a copy
                # because we are only copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to copying `np.fromstring`

    # Copy the string into a numpy array.
    return np.fromstring(values, dtype=dtype)
def describe_categorical_1d(data):
    # Only run if at least 1 non-missing value
    objcounts = data.value_counts()
    top, freq = objcounts.index[0], objcounts.iloc[0]

    names = []
    result = []

    if data.dtype == object or com.is_categorical_dtype(data.dtype):
        names += ['top', 'freq', 'type']
        result += [top, freq, 'CAT']

    return pd.Series(result, index=names, name=data.name)
def test_categorical_ordering(self):
    parsed_115 = read_stata(self.dta19_115)
    parsed_117 = read_stata(self.dta19_117)

    parsed_115_unordered = read_stata(self.dta19_115,
                                      order_categoricals=False)
    parsed_117_unordered = read_stata(self.dta19_117,
                                      order_categoricals=False)
    for col in parsed_115:
        if not is_categorical_dtype(parsed_115[col]):
            continue
        tm.assert_equal(True, parsed_115[col].cat.ordered)
        tm.assert_equal(True, parsed_117[col].cat.ordered)
        tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
        tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
    (e.g. datetime64[ns], timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
        copy the input data

    Returns
    -------
    DelegatedClass
    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer',
                                                ambiguous='infer'),
                                  index, data.name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'),
                                   index, name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy), index,
                                    name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'),
                                      index, name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
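# Usage sketch: this factory is what backs the public `.dt` accessor.
import pandas as pd

s = pd.Series(pd.date_range('2000-01-01', periods=3))
# maybe_to_datetimelike(s) would return DatetimeProperties; the public
# equivalent:
print(s.dt.year.tolist())  # [2000, 2000, 2000]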
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        entry = s.cat.categories[0]
        data = pd.Categorical([entry, entry],
                              categories=s.cat.categories,
                              ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)
    return pd.Series(data, name=s.name, index=idx)
def is_categorical_astype(self, dtype):
    """
    validate that we have an astype-able categorical dtype;
    return a boolean of whether we are casting to a categorical
    """
    if com.is_categorical_dtype(dtype):
        if dtype == com.CategoricalDtype():
            return True

        # this is a pd.Categorical, but is not
        # a valid type for astype
        raise TypeError("invalid type {0} for astype".format(dtype))
    return False
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        entry = s.cat.categories[0]
        data = pd.Categorical([entry, entry],
                              categories=s.cat.categories,
                              ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = [entry, entry]
    return pd.Series(data, name=s.name, index=idx)
def wrapper(self, other):
    if isinstance(other, pd.Series):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (pa.Array, pd.Index)):
        if len(self) != len(other):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        if not com.is_categorical_dtype(self):
            msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                  "If you want to compare values, use 'series <op> np.asarray(other)'."
            raise TypeError(msg.format(op=op, typ=self.dtype))
    else:

        mask = isnull(self)

        values = self.get_values()
        other = _index.convert_scalar(values, _values_from_object(other))

        if issubclass(values.dtype.type, np.datetime64):
            values = values.view('i8')

        # scalars
        res = na_op(values, other)
        if np.isscalar(res):
            raise TypeError('Could not compare %s type with Series' %
                            type(other))

        # always return a full value series here
        res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name,
                        dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker

        return res
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
    elif is_categorical_dtype(dtype):
        entry = pd.Categorical([s.cat.categories[0]],
                               categories=s.cat.categories,
                               ordered=s.cat.ordered)
    elif dtype.kind in ['i', 'f', 'u']:
        entry = dtype.type(1)
    elif dtype.kind in _simple_fake_mapping:
        entry = _simple_fake_mapping[dtype.kind]
    else:
        raise TypeError("Can't handle dtype: {0}".format(dtype))
    return pd.Series([entry, entry], name=s.name, index=idx)
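# Usage sketch for the `_nonempty_series` variants above (illustrative;
# the intent, per dask-style meta handling, is a two-row stand-in Series
# that preserves dtype information without touching real data):
#
#   import pandas as pd
#   s = pd.Series(pd.Categorical(['a', 'b', 'a']))
#   _nonempty_series(s, idx=pd.RangeIndex(2))
#   # -> a 2-row Series built from s.cat.categories[0]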
def transform(column):
    if is_categorical_dtype(column.dtype):
        return column.cat.codes

    if column.dtype.char == "O":
        try:
            nc = column.astype(int)
        except ValueError:
            classes = column.dropna().unique()
            classes.sort(kind="mergesort")
            nc = column.replace(classes, numpy.arange(classes.shape[0]))
        return nc
    elif column.dtype == bool:
        return column.astype(int)

    return column
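# Behavior sketch: digit-like object columns cast straight to int; other
# object columns get codes from their sorted unique classes.
#
#   import pandas
#   col = pandas.Series(['b', 'a', 'b'])
#   transform(col)  # -> 1, 0, 1  (classes sorted as ['a', 'b'])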
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           str
    categorical      any
    ===============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
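# Mapping sketch matching the table above:
#
#   import numpy as np
#   as_json_table_type(np.dtype('int64'))           # 'integer'
#   as_json_table_type(np.dtype('float64'))         # 'number'
#   as_json_table_type(np.dtype('datetime64[ns]'))  # 'datetime'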
def wrapper(self, other):
    if isinstance(other, pd.Series):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (pa.Array, pd.Index)):
        if len(self) != len(other):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        if not com.is_categorical_dtype(self):
            msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                  "If you want to compare values, use 'series <op> np.asarray(other)'."
            raise TypeError(msg.format(op=op, typ=self.dtype))
    else:

        mask = isnull(self)

        values = self.get_values()
        other = _index.convert_scalar(values, _values_from_object(other))

        if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
            values = values.view('i8')

        # scalars
        res = na_op(values, other)
        if np.isscalar(res):
            raise TypeError('Could not compare %s type with Series' %
                            type(other))

        # always return a full value series here
        res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name,
                        dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker

        return res
def convert(values):
    """ convert the numpy values to a list """

    dtype = values.dtype

    if is_categorical_dtype(values):
        return values

    elif is_object_dtype(dtype):
        return values.ravel().tolist()

    if needs_i8_conversion(dtype):
        values = values.view('i8')
    v = values.ravel()

    if compressor == 'zlib':
        _check_zlib()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, zlib.compress(v))

    elif compressor == 'blosc':
        _check_blosc()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))

    # ndarray (on original dtype)
    return ExtType(0, v.tostring())
def describe_categorical_1d(data):
    # Only run if at least 1 non-missing value
    objcounts = data.value_counts()
    top, freq = objcounts.index[0], objcounts.iloc[0]

    names = []
    result = []

    if data.dtype == object or com.is_categorical_dtype(data.dtype):
        names += ['top', 'freq', 'type']
        result += [top, freq, 'CAT']

        if y is not None:
            try:
                mdld = mdl_1d_cat(data, y)
                result += list(mdld)
            except:
                Tracer()()
        else:
            result += ['No Dep Var', '']
        names += ['AUC', 'cmatrix']

    return pd.Series(result, index=names, name=data.name)
def make_field(arr, dtype=None):
    dtype = dtype or arr.dtype
    if arr.name is None:
        name = 'values'
    else:
        name = arr.name

    field = {'name': name,
             'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
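# Output sketch for a categorical Series (illustrative; the 'any' type
# comes from as_json_table_type above):
#
#   import pandas as pd
#   s = pd.Series(pd.Categorical(['a', 'b'], ordered=True), name='grade')
#   make_field(s)
#   # -> {'name': 'grade', 'type': 'any',
#   #     'constraints': {'enum': ['a', 'b']}, 'ordered': True}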
def encode_categorical(table, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns
    according to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    allow_drop : boolean, optional, default=True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        return _encode_categorical_series(table, **kwargs)
    else:
        new_table = pandas.DataFrame(index=table.index)
        for j in range(table.shape[1]):
            series = table.iloc[:, j]

            # for columns containing categories
            if is_categorical_dtype(series.dtype) or series.dtype.char == "O":
                series = _encode_categorical_series(series, **kwargs)
                if series is None:
                    continue

            # join tables on index
            new_table = new_table.join(series)
    return new_table
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
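# Usage sketch via the public API that wraps this function:
import pandas as pd

s = pd.Series(['a', 'b', 'a', None])
print(s.value_counts().to_dict())  # {'a': 2, 'b': 1} -- NaN dropped
# dropna=False would add a NaN entry with count 1; normalize=True
# rescales the counts to relative frequencies.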
def convert_categorical(x):
    # coerce to object dtype
    if com.is_categorical_dtype(x.dtype):
        return x.get_values()
    return x.ravel()