def test_empty_print(self):
    """repr() of an empty Categorical, with and without a name / levels."""
    factor = Categorical([], ["a", "b", "c"], name="cat")
    expected = ("Categorical([], Name: cat, Levels (3): "
                "Index([a, b, c], dtype=object)")
    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    # raw string: the pattern contains regex escapes like \(
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)
    self.assertEqual(actual, expected)

    factor = Categorical([], ["a", "b", "c"])
    expected = ("Categorical([], Levels (3): "
                "Index([a, b, c], dtype=object)")
    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)
    self.assertEqual(actual, expected)

    factor = Categorical([], [])
    expected = ("Categorical([], Levels (0): "
                "Index([], dtype=object)")
    self.assertEqual(repr(factor), expected)
def _create_categorical(self, data, categories=None, ordered=None):
    """
    *this is an internal non-public method*

    Build the appropriate Categorical from ``data`` plus optional
    property overrides.

    Parameters
    ----------
    data : data for new Categorical
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing

    Returns
    -------
    Categorical
    """
    if isinstance(data, ABCCategorical):
        # already categorical: only adjust the requested properties
        if categories is not None:
            data = data.set_categories(categories)
        if ordered is not None:
            data = data.set_ordered(ordered)
        return data

    # raw data: construct fresh, defaulting to unordered
    if ordered is None:
        ordered = False
    from pandas.core.categorical import Categorical
    return Categorical(data, categories=categories, ordered=ordered)
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Assign each value of ``x`` to a bin.

    Returns a Categorical of interval labels (or raw integer codes when
    ``labels`` is False); also returns ``bins`` when ``retbins`` is True.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    # was: raise Exception(...) -- a bare Exception forces callers to
    # catch everything; ValueError matches the other validation errors
    # in this function.
    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # NaNs and values outside every bin are treated as missing
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump precision until the formatted bin edges are distinct
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac
    return fac, bins
def test_periodindex(self):
    """Categorical.from_array over a PeriodIndex: codes + sorted levels."""
    def check(values, exp_codes, exp_levels):
        cat = Categorical.from_array(PeriodIndex(values, freq='M'))
        self.assert_numpy_array_equal(cat.labels, np.array(exp_codes))
        self.assertTrue(cat.levels.equals(PeriodIndex(exp_levels,
                                                      freq='M')))

    # already sorted input
    check(['2014-01', '2014-01', '2014-02', '2014-02',
           '2014-03', '2014-03'],
          [0, 0, 1, 1, 2, 2],
          ['2014-01', '2014-02', '2014-03'])

    # unsorted input, same levels
    check(['2014-03', '2014-03', '2014-02', '2014-01',
           '2014-03', '2014-01'],
          [2, 2, 1, 0, 2, 0],
          ['2014-01', '2014-02', '2014-03'])

    # descending input with a gap
    check(['2013-12', '2013-11', '2013-10', '2013-09',
           '2013-08', '2013-07', '2013-05'],
          [6, 5, 4, 3, 2, 1, 0],
          ['2013-05', '2013-07', '2013-08', '2013-09',
           '2013-10', '2013-11', '2013-12'])
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):
    # Core binning routine shared by cut/qcut: assign each value of ``x``
    # to a bin via searchsorted and return (Categorical-or-codes, bins).
    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))
        else:
            # duplicates == 'drop': silently collapse repeated edges
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # NaNs and values that fall outside every bin are treated as missing
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # raise the display precision until the formatted bin edges
            # are all distinct (give up after 20 attempts)
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        # missing values get code 0 here, shifted to -1 below
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        # labels=False: return raw zero-based bin codes
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    # Bin the values of ``x``; Series input round-trips to a Series with
    # its original index and (possibly inferred) name preserved.
    x_is_series = isinstance(x, Series)
    series_index = None

    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    side = "left" if right else "right"
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # NaNs and values outside every bin are treated as missing
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump precision until the formatted edges are all distinct
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError("Bin labels must be one fewer than "
                                 "the number of bin edges")
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if x_is_series:
        # restore the Series container with the original index
        fac = Series(fac, index=series_index, name=name)

    if not retbins:
        return fac
    return fac, bins
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Map each value of ``x`` into bins and label bins as intervals."""
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    ids = bins.searchsorted(x, side='left' if right else 'right')
    if include_lowest:
        ids[x == bins[0]] = 1

    # NaNs and out-of-range values count as missing
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
    else:
        if labels is None:
            def fmt(v):
                return _format_label(v, precision=precision)

            edges = list(zip(bins, bins[1:]))
            if right:
                # right-closed intervals: (a, b]
                levels = ['(%s, %s]' % (fmt(a), fmt(b)) for a, b in edges]
                if include_lowest:
                    # first interval also includes its left edge
                    levels[0] = '[' + levels[0][1:]
            else:
                # left-closed intervals: [a, b)
                levels = ['[%s, %s)' % (fmt(a), fmt(b)) for a, b in edges]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)

    if not retbins:
        return fac
    return fac, bins
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation in the current implementation this
        is always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """
    from pandas.core.categorical import Categorical

    def convert_categorical(x):
        # coerce to object dtype
        if com.is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    if get_dtype_kinds(to_concat) - set(['object', 'category']):
        # convert to object type and perform a regular concat
        return _concat_compat([np.array(x, copy=False, dtype=object)
                               for x in to_concat], axis=0)

    # we could have object blocks and categoricals here
    # if we only have a single categoricals then combine everything
    # else its a non-compat categorical
    categoricals = [x for x in to_concat
                    if com.is_categorical_dtype(x.dtype)]

    # validate the categories: all categoricals must agree on dtype
    categories = categoricals[0]
    rawcats = categories.categories
    for x in categoricals[1:]:
        if not categories.is_dtype_equal(x):
            raise ValueError("incompatible categories in categorical "
                             "concat")

    # we've already checked that all categoricals are the same, so if
    # their length is equal to the input then we have all the same
    # categories
    if len(categoricals) == len(to_concat):
        # concating numeric types is much faster than concating object
        # types and fastpath takes a shorter path through the constructor
        return Categorical(np.concatenate([x.codes for x in to_concat],
                                          axis=0),
                           rawcats,
                           ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        # mixed object/categorical input: fall back to object concat,
        # then rebuild with the validated categories
        concatted = np.concatenate(list(map(convert_categorical,
                                            to_concat)), axis=0)
        return Categorical(concatted, rawcats)
def panel_index(time, panels, names=['time', 'panel']):
    """
    Returns a multi-index suitable for a panel-like DataFrame

    Parameters
    ----------
    time : array-like
        Time index, does not have to repeat
    panels : array-like
        Panel index, does not have to repeat
    names : list, optional
        List containing the names of the indices

    Returns
    -------
    multi_index : MultiIndex
        Time index is the first level, the panels are the second level.

    Examples
    --------
    >>> years = range(1960,1963)
    >>> panels = ['A', 'B', 'C']
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'),
                (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'),
                (1962, 'C')], dtype=object)

    or

    >>> import numpy as np
    >>> years = np.repeat(range(1960,1963), 3)
    >>> panels = np.tile(['A', 'B', 'C'], 3)
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'),
                (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'),
                (1962, 'C')], dtype=object)
    """
    # NOTE(review): ``names`` is a mutable default argument; safe only
    # as long as no caller mutates the list.
    time, panels = _ensure_like_indices(time, panels)
    # factorize both axes, then pair their codes/levels into a MultiIndex
    time_factor = Categorical.from_array(time)
    panel_factor = Categorical.from_array(panels)

    labels = [time_factor.labels, panel_factor.labels]
    levels = [time_factor.levels, panel_factor.levels]
    return MultiIndex(levels, labels, sortorder=None, names=names,
                      verify_integrity=False)
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Bin the values of ``x`` and return a labelled Categorical.

    When ``labels`` is False the raw zero-based codes are returned
    instead; missing/out-of-range values become NaN.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    ids = bins.searchsorted(x, side='left' if right else 'right')
    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
    else:
        if labels is None:
            # retry with ever-higher precision until the formatted bin
            # edges are distinct (at most 20 attempts)
            attempts = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    attempts += 1
                    precision += 1
                    if attempts >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)

    if not retbins:
        return fac
    return fac, bins
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None):
    # Assign each value of ``x`` to a bin; returns (Categorical-or-codes,
    # bins).  Assumes ``x`` is already array-like (no Series handling).
    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # NaNs and out-of-range values count as missing
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump precision until formatted bin edges are distinct
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        # missing values get code 0 here, shifted to -1 below
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
def make_axis_dummies(frame, axis="minor", transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression
        you might call:
            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    numbers = {"major": 0, "minor": 1}
    # unknown axis names pass through and are used as the level number
    num = numbers.get(axis, axis)

    items = frame.index.levels[num]
    labels = frame.index.labels[num]
    if transform is not None:
        # re-factorize after mapping so the dummy columns reflect the
        # transformed labels
        mapped_items = items.map(transform)
        cat = Categorical.from_array(mapped_items.take(labels))
        labels = cat.labels
        items = cat.levels

    # identity-matrix rows selected by label -> one-hot rows
    values = np.eye(len(items), dtype=float)
    values = values.take(labels, axis=0)

    return DataFrame(values, columns=items, index=frame.index)
def get_dummies(data, prefix=None, prefix_sep='_'):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use

    Returns
    -------
    dummies : DataFrame
    """
    cat = Categorical.from_array(np.asarray(data))
    # one-hot rows: identity-matrix rows selected by the integer codes
    dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0)

    if prefix is None:
        dummy_cols = cat.levels
    else:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, str(level))
                      for level in cat.levels]

    # Series input keeps its original index
    index = data.index if isinstance(data, Series) else None
    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_result(self):
    # Assemble the unstacked DataFrame from the computed values,
    # columns and index, dropping unobserved levels.
    # TODO: find a better way than this masking business

    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            # keep only columns with at least one observed value
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = algos.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered
        # rebuild each column as a Categorical so the dtype survives
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    # Build the unstacked DataFrame; variant that restores categorical
    # dtype via Categorical.from_array.
    # TODO: find a better way than this masking business

    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            # keep only the columns with at least one observed value
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = algos.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered
        # restore the categorical dtype column by column
        values = [Categorical.from_array(values[:, i],
                                         categories=categories,
                                         ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def test_constructor_unsortable(self):
    # Disabled test: construction from an unsortable mixed-type array.
    raise nose.SkipTest('skipping for now')

    # unreachable while the skip above is in place
    arr = np.array([1, 2, 3, datetime.now()], dtype='O')

    # it works!
    factor = Categorical.from_array(arr)
def _create_from_codes(self, codes, categories=None, ordered=None,
                       name=None):
    """
    *this is an internal non-public method*

    create the correct categorical from codes

    Parameters
    ----------
    codes : new codes
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing
    name : optional name attribute, defaults to existing

    Returns
    -------
    CategoricalIndex
    """
    from pandas.core.categorical import Categorical
    if categories is None:
        categories = self.categories
    if ordered is None:
        ordered = self.ordered
    if name is None:
        name = self.name
    # BUG FIX: previously passed ordered=self.ordered, silently
    # ignoring an explicit ``ordered`` argument; use the resolved
    # local (which already falls back to self.ordered) instead.
    cat = Categorical.from_codes(codes, categories=categories,
                                 ordered=ordered)
    return CategoricalIndex(cat, name=name)
def melt_stub(df, stub, i, j, value_vars, sep):
    # Melt one stub group into long format, then strip the stub prefix
    # so the variable column holds only the remaining suffix.
    long_df = melt(df, id_vars=i, value_vars=value_vars,
                   value_name=stub.rstrip(sep), var_name=j)
    long_df[j] = Categorical(long_df[j])
    long_df[j] = long_df[j].str.replace(re.escape(stub + sep), "")
    return long_df.set_index(i + [j])
def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression
        you might call:
            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    numbers = {'major': 0, 'minor': 1}
    # unknown axis names pass through and are used as the level number
    num = numbers.get(axis, axis)

    items = frame.index.levels[num]
    labels = frame.index.labels[num]
    if transform is not None:
        # re-factorize after mapping so the dummy columns reflect the
        # transformed labels
        mapped_items = items.map(transform)
        cat = Categorical.from_array(mapped_items.take(labels))
        labels = cat.labels
        items = cat.levels

    # identity-matrix rows selected by label -> one-hot rows
    values = np.eye(len(items), dtype=float)
    values = values.take(labels, axis=0)

    return DataFrame(values, columns=items, index=frame.index)
def get_dummies(data, prefix=None, prefix_sep='_'):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use

    Returns
    -------
    dummies : DataFrame
    """
    cat = Categorical.from_array(np.asarray(data))
    # identity-matrix rows selected by the integer codes -> one-hot rows
    dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0)

    if prefix is not None:
        dummy_cols = [
            '%s%s%s' % (prefix, prefix_sep, str(v)) for v in cat.levels
        ]
    else:
        dummy_cols = cat.levels

    if isinstance(data, Series):
        # Series input keeps its original index
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    # One-hot encode a single 1-D column, optionally producing a sparse
    # result and/or an extra NaN indicator column.

    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    codes = cat.codes.copy()
    if dummy_na:
        # missing values (-1 codes) get their own trailing level
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        # per-column lists of the row positions that hold a 1
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)
    else:
        # dense path: identity-matrix rows selected by code
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    # Encode one 1-D column as indicator variables (dense or sparse);
    # with dummy_na=True, NaNs get a dedicated column.

    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    codes = cat.codes.copy()
    if dummy_na:
        # re-code missing values (-1) as a trailing NaN level
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        # collect, per dummy column, which rows hold a 1
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)
    else:
        # dense path: identity-matrix rows selected by code
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    # Assign values of ``x`` to bins and label each bin as an interval
    # string, e.g. '(0, 1]'.  Returns raw codes when labels is False.
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # NaNs and values outside every bin count as missing
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            if right:
                # right-closed intervals: (a, b]
                levels = ['(%s, %s]' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
                if include_lowest:
                    # first interval additionally includes its left edge
                    levels[0] = '[' + levels[0][1:]
            else:
                # left-closed intervals: [a, b)
                levels = ['[%s, %s)' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac
    return fac, bins
def sparse_dummies(df, column):
    """Return a sparse one-hot (CSR) matrix for ``df[column]``.

    Returns
    -------
    (csr_matrix, np.ndarray)
        The N x n_categories indicator matrix and the generated column
        names ("<column>_<category index>").
    """
    categories = Categorical(df[column])
    column_names = np.array(
        [f"{column}_{str(i)}" for i in range(len(categories.categories))])
    N = len(categories)
    # np.int was removed from NumPy (1.24+); use the platform index type.
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # one entry per row at (row, category code)
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
def _create_categorical(self, data, categories=None, ordered=None,
                        dtype=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing
    dtype : CategoricalDtype, defaults to existing

    Returns
    -------
    Categorical
    """
    if (isinstance(data, (ABCSeries, type(self))) and
            is_categorical_dtype(data)):
        # unwrap Series/CategoricalIndex to the underlying Categorical
        data = data.values

    if not isinstance(data, ABCCategorical):
        # default to unordered unless a dtype carries the information
        if ordered is None and dtype is None:
            ordered = False
        from pandas.core.categorical import Categorical
        data = Categorical(data, categories=categories, ordered=ordered,
                           dtype=dtype)
    else:
        from pandas.core.dtypes.dtypes import CategoricalDtype

        if categories is not None:
            data = data.set_categories(categories, ordered=ordered)
        elif ordered is not None and ordered != data.ordered:
            data = data.set_ordered(ordered)
        if isinstance(dtype, CategoricalDtype):
            # we want to silently ignore dtype='category'
            data = data._set_dtype(dtype)
    return data
def where(self, cond, other=None):
    """Return an Index whose entries come from self where ``cond`` is
    True and from ``other`` elsewhere.

    Parameters
    ----------
    cond : boolean array, same length as self
    other : scalar or array-like, defaults to the index's NA value
    """
    if other is None:
        other = self._na_value
    values = np.where(cond, self.values, other)

    from pandas.core.categorical import Categorical
    # rebuild as a Categorical so the categorical dtype is preserved
    cat = Categorical(values,
                      categories=self.categories,
                      ordered=self.ordered)
    return self._shallow_copy(cat, **self._get_attributes_dict())
def melt_stub(df, stub, i, j, value_vars, sep):
    # Melt one stub group into long format; the variable column keeps
    # only the suffix after the stub+separator prefix.
    newdf = melt(df, id_vars=i, value_vars=value_vars,
                 value_name=stub.rstrip(sep), var_name=j)
    newdf[j] = Categorical(newdf[j])
    newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

    # GH17627 Cast numerics suffixes to int/float
    newdf[j] = to_numeric(newdf[j], errors='ignore')

    return newdf.set_index(i + [j])
def _indicator_post_merge(self, result): result['_left_indicator'] = result['_left_indicator'].fillna(0) result['_right_indicator'] = result['_right_indicator'].fillna(0) result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3]) result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both']) result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1) return result
def test_na_flags_int_levels(self):
    # GH #1457: a label of -1 marks a missing value
    levels = range(10)
    labels = np.random.randint(0, 10, 20)
    labels[::5] = -1

    cat = Categorical(labels, levels)
    repr(cat)  # smoke-check repr with NA codes present

    expected_mask = labels == -1
    self.assert_(np.array_equal(com.isnull(cat), expected_mask))
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe'''
    categories = Categorical(df[column])
    # one generated name per category code: "<column>_<code>"
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(len(categories.categories))
    ])
    N = len(categories)
    # np.int alias was removed in NumPy 1.24; use np.intp for indices.
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # one entry per row at (row, category code)
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they
       appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(com.is_dtype_equal(c.categories.dtype,
                                  first.categories.dtype)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    # union of all categories, in first-appearance order
    cats = first.categories
    unique_cats = cats.append([c.categories
                               for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    # remap each categorical's codes onto the combined categories
    new_codes = []
    for c in to_union:
        indexer = categories.get_indexer(c.categories)
        new_codes.append(indexer.take(c.codes))
    codes = np.concatenate(new_codes)
    return Categorical(codes, categories=categories, ordered=False,
                       fastpath=True)
def test_big_print(self):
    """repr() of a long Categorical truncates and shows a summary footer."""
    factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'],
                         name='cat')
    expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
                " a", " b", " c", " a", "...", " c", " a", " b", " c",
                " a", " b", " c", " a", " b", " c", " a", " b", " c",
                "Levels (3): Index([a, b, c], dtype=object)",
                "Name: cat, Length: 600"]
    expected = "\n".join(expected)

    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    # raw string: the pattern contains regex escapes like \(
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)

    self.assertEqual(actual, expected)
def test_describe(self):
    """describe() returns per-level counts and relative frequencies."""
    def expected_frame(counts, freqs, levels):
        return DataFrame.from_dict(
            dict(counts=counts, freqs=freqs,
                 levels=levels)).set_index('levels')

    # string type
    tm.assert_frame_equal(
        self.factor.describe(),
        expected_frame([3, 2, 3], [3 / 8., 2 / 8., 3 / 8.],
                       ['a', 'b', 'c']))

    # check an integer one
    tm.assert_frame_equal(
        Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]).describe(),
        expected_frame([5, 3, 3], [5 / 11., 3 / 11., 3 / 11.],
                       [1, 2, 3]))
def get_result(self):
    # Assemble the reshaped result from the computed values, columns
    # and index (the value mask is unused in this variant).
    values, _ = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered
        # rebuild each column as a Categorical so the dtype survives
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return self.constructor(values, index=index, columns=columns)
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe'''
    # (leftover debug print() calls removed)
    categories = Categorical(df[column])
    # one generated name per category code: "<column>_<code>"
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(len(categories.categories))
    ])
    N = len(categories)
    # np.int was removed in NumPy 1.24; np.intp is the index dtype.
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # categories.codes encodes each value as an integer; place a 1 at
    # (row, code) to build the one-hot matrix.
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
def lexsort_indexer(keys, orders=None, na_position='last'):
    # Compute an indexer that lexsorts by multiple keys, honoring
    # per-key ascending/descending order and NaN placement.
    from pandas.core.categorical import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        # a single bool applies to every key
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            c = key

        # create the Categorical
        else:
            c = Categorical(key, ordered=True)

        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(
                na_position))

        n = len(c.categories)
        codes = c.codes.copy()

        # NaNs carry code -1
        mask = (c.codes == -1)
        if order:  # ascending
            if na_position == 'last':
                # push NaNs past the largest code
                codes = np.where(mask, n, codes)
            elif na_position == 'first':
                # shift everything up so NaNs (-1 -> 0) sort first
                codes += 1
        else:  # not order means descending
            if na_position == 'last':
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == 'first':
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # reserve one extra slot for the NaN bucket
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
def where(self, cond, other=None):
    """
    .. versionadded:: 0.19.0

    Return an Index of same shape as self and whose corresponding
    entries are from self where cond is True and otherwise are from
    other.

    Parameters
    ----------
    cond : boolean same length as self
    other : scalar, or array-like
    """
    if other is None:
        other = self._na_value

    from pandas.core.categorical import Categorical
    # element-wise select, then rebuild the categorical with the
    # original categories/ordering so the dtype survives
    selected = np.where(cond, self.values, other)
    cat = Categorical(selected,
                      categories=self.categories,
                      ordered=self.ordered)
    return self._shallow_copy(cat, **self._get_attributes_dict())
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
    # One-hot encode a single 1-D column, optionally adding a NaN column.

    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    # -1 codes (NaNs) select the last identity row; that row becomes
    # the NaN column when dummy_na, and is zeroed out otherwise
    dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)

    if dummy_na:
        levels = np.append(cat.categories, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.codes == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_result(self):
    # Build the unstacked DataFrame; also handles a result index that
    # contains missing entries.
    # TODO: find a better way than this masking business

    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            # keep only columns with at least one observed value
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # we might have a missing index
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            # scatter the computed rows into a frame sized to the full
            # index, leaving NaN rows where the index is missing
            l = np.arange(len(index))
            values, orig_values = (np.empty((len(index),
                                             values.shape[1])),
                                   values)
            values.fill(np.nan)
            values_indexer = com._ensure_int64(l[~mask])
            for i, j in enumerate(values_indexer):
                values[j] = orig_values[i]
        else:
            index = index.take(self.unique_groups)

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        # restore the categorical dtype column by column
        values = [Categorical.from_array(
            values[:, i], categories=self.is_categorical.categories)
            for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def test_levels_none(self):
    """Constructing without explicit levels infers them from the data."""
    factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
    # assert_ is a deprecated unittest alias; assertTrue is the
    # supported spelling and behaves identically
    self.assertTrue(factor.equals(self.factor))
def setUp(self):
    """Build the shared fixture: a small categorical with three levels."""
    labels = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']
    self.factor = Categorical.from_array(labels)
def setUp(self):
    """Build the shared fixture: a small categorical with three levels."""
    values = ["a", "b", "b", "a", "a", "c", "c", "c"]
    self.factor = Categorical.from_array(values)
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    """
    Build the MultiIndex for a keyed concatenation of several indexes.

    Parameters
    ----------
    indexes : list of Index
        The indexes being concatenated.
    keys : sequence
        One key per index; tuple keys produce multiple outer levels.
    levels : list of sequences, optional
        Explicit level values; inferred from ``keys`` when omitted.
    names : list, optional
        Names for the resulting levels.

    Returns
    -------
    MultiIndex

    Raises
    ------
    ValueError
        If a key (or index value) is not found in the passed levels.
    AssertionError
        If names are incomplete and the indexes differ in nlevels.
    """
    # tuple keys / multiple explicit levels -> several outer levels
    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        zipped = lzip(*keys)
        if names is None:
            names = [None] * len(zipped)
        if levels is None:
            levels = [Categorical.from_array(
                zp, ordered=True).categories for zp in zipped]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]
        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays
        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key %s not in level %s'
                                     % (str(key), str(level)))
                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Categorical.from_array(concat_index, ordered=True)
            levels.append(factor.categories)
            label_list.append(factor.codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len(set([idx.nlevels for idx in indexes])) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")
            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, labels=label_list, names=names,
                          verify_integrity=False)

    # fast path: all indexes are identical, so labels tile/repeat cleanly
    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy
    for hlevel, level in zip(zipped, levels):
        hlevel = _ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: %s'
                             % str(hlevel[mask]))

        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces)
                           for lab in new_index.labels])
    else:
        new_levels.append(new_index)
        new_labels.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, labels=new_labels, names=new_names,
                      verify_integrity=False)
def data(self, convert_dates=True, convert_categoricals=True, index=None):
    """
    Reads observations from Stata file, converting them into a dataframe

    Parameters
    ----------
    convert_dates : boolean, defaults to True
        Convert date variables to DataFrame time values
    convert_categoricals : boolean, defaults to True
        Read value labels and convert columns to Categorical/Factor
        variables
    index : identifier of index column
        identifier of column that should be used as index of the DataFrame

    Returns
    -------
    y : DataFrame instance

    Raises
    ------
    Exception
        If data has already been read from this reader.
    """
    if self._data_read:
        raise Exception("Data has already been read.")
    self._data_read = True

    # format 117 stores long strings separately; load them first
    if self.format_version >= 117:
        self._read_strls()

    stata_dta = self._dataset()

    data = []
    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        for i, val in enumerate(line):
            # NOTE: This will only be scalar types because missing strings
            # are empty not None in Stata
            if val is None:
                line[i] = np.nan
        data.append(tuple(line))

    if convert_categoricals:
        self._read_value_labels()

    data = DataFrame(data, columns=self.varlist, index=index)

    # coerce columns with a known Stata dtype to that dtype
    cols_ = np.where(self.dtyplist)[0]
    for i in cols_:
        if self.dtyplist[i] is not None:
            col = data.columns[i]
            if data[col].dtype is not np.dtype(object):
                data[col] = Series(data[col], data[col].index,
                                   self.dtyplist[i])

    if convert_dates:
        # convert Stata elapsed dates to datetimes, per column format
        cols = np.where(lmap(lambda x: x in _date_formats,
                             self.fmtlist))[0]
        for i in cols:
            col = data.columns[i]
            data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                        args=(self.fmtlist[i],))

    if convert_categoricals:
        # replace coded values with their value labels, then make the
        # column categorical
        cols = np.where(lmap(lambda x: x in compat.iterkeys(
            self.value_label_dict), self.lbllist))[0]
        for i in cols:
            col = data.columns[i]
            labeled_data = np.copy(data[col])
            labeled_data = labeled_data.astype(object)
            for k, v in compat.iteritems(
                    self.value_label_dict[self.lbllist[i]]):
                labeled_data[(data[col] == k).values] = v
            data[col] = Categorical.from_array(labeled_data)

    return data
def test_levels_none(self):
    """Constructing without explicit levels infers them from the data."""
    factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
    # assert_ is a deprecated unittest alias; assertTrue is the
    # supported spelling and behaves identically
    self.assertTrue(factor.equals(self.factor))
def _get_dummies_1d(data, prefix, prefix_sep="_", dummy_na=False,
                    sparse=False, drop_first=False):
    """
    Convert a single 1-d array/Series of categorical values into a
    dummy/indicator (one-hot) frame.

    Parameters
    ----------
    data : array-like or Series
        Values to encode.
    prefix : string or None
        If not None, column names become prefix + prefix_sep + level.
    prefix_sep : string, default "_"
        Separator between prefix and level name.
    dummy_na : bool, default False
        If True, add an extra column indicating NaNs.
    sparse : bool, default False
        If True, return a SparseDataFrame instead of a dense DataFrame.
    drop_first : bool, default False
        If True, drop the first level's column (avoids collinearity).

    Returns
    -------
    DataFrame or SparseDataFrame
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    def get_empty_Frame(data, sparse):
        # result with the right index but no columns
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = cat.codes.copy()
    if dummy_na:
        # map NaN (code -1) onto a fake extra level at the end
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ["%s%s%s" % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # build one SparseSeries per column from the row positions of
        # each code, rather than materializing a dense matrix
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)
        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)
        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)
    else:
        # identity-row lookup: row i of the identity selects column i
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0
        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels

    is_series = isinstance(data, Series)

    # everything NaN and no NaN column requested -> empty frame
    if not dummy_na and len(levels) == 0:
        return DataFrame(index=data.index if is_series
                         else np.arange(len(data)))

    width = len(levels) + (1 if dummy_na else 0)

    # identity-row lookup: label i selects column i; label -1 wraps to
    # the last row, the NaN column when dummy_na is True
    dummy_mat = np.eye(width).take(cat.labels, axis=0)

    if dummy_na:
        levels = np.append(cat.levels, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.labels == -1] = 0

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]

    return DataFrame(dummy_mat, index=data.index if is_series else None,
                     columns=dummy_cols)