def _aggregate_multiple_funcs(self, arg, _level): from pandas.tools.concat import concat if self.axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: obj = self._selected_obj else: obj = self._obj_with_exclusions results = [] keys = [] # degenerate case if obj.ndim == 1: for a in arg: try: colg = self._gotitem(obj.name, ndim=1, subset=obj) results.append(colg.aggregate(a)) # make sure we find a good name name = com._get_callable_name(a) or a keys.append(name) except (TypeError, DataError): pass except SpecificationError: raise # multiples else: for col in obj: try: colg = self._gotitem(col, ndim=1, subset=obj[col]) results.append(colg.aggregate(arg)) keys.append(col) except (TypeError, DataError): pass except SpecificationError: raise return concat(results, keys=keys, axis=1)
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False): """ Convert categorical variable into dummy/indicator variables Parameters ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternativly, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. .. versionadded:: 0.16.1 drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 Returns ------- dummies : DataFrame or SparseDataFrame Examples -------- >>> import pandas as pd >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 See Also -------- Series.str.get_dummies """ from pandas.tools.concat import concat from itertools import cycle if isinstance(data, DataFrame): # determine columns being encoded if columns is None: columns_to_encode = data.select_dtypes( include=['object', 'category']).columns else: columns_to_encode = columns # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError( length_msg.format(name, len(item), len(columns_to_encode))) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in columns_to_encode] if prefix is None: prefix = columns_to_encode # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in columns_to_encode] if set(columns_to_encode) == set(data.columns): with_dummies = [] else: with_dummies = [data.drop(columns_to_encode, axis=1)] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first) return result
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return getattr(self, arg)(*args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) arg = new_arg from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim( self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(compat.iterkeys(arg))): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame result = DataFrame(result) return result, True elif hasattr(arg, '__iter__'): return self._aggregate_multiple_funcs(arg, _level=_level), None else: result = None cy_func = self._is_cython_func(arg) if cy_func and not args and not kwargs: return getattr(self, cy_func)(), None # caller can react return result, True
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return getattr(self, arg)(*args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) arg = new_arg from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg(arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(compat.iterkeys(arg))): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame result = DataFrame(result) return result, True elif hasattr(arg, '__iter__'): return self._aggregate_multiple_funcs(arg, _level=_level), None else: result = None cy_func = self._is_cython_func(arg) if cy_func and not args and not kwargs: return getattr(self, cy_func)(), None # caller can react return result, True
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False): """ Convert categorical variable into dummy/indicator variables Parameters ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternativly, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. .. versionadded:: 0.16.1 drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 Returns ------- dummies : DataFrame or SparseDataFrame Examples -------- >>> import pandas as pd >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 See Also -------- Series.str.get_dummies """ from pandas.tools.concat import concat from itertools import cycle if isinstance(data, DataFrame): # determine columns being encoded if columns is None: columns_to_encode = data.select_dtypes( include=['object', 'category']).columns else: columns_to_encode = columns # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode))) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in columns_to_encode] if prefix is None: prefix = columns_to_encode # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in columns_to_encode] if set(columns_to_encode) == set(data.columns): with_dummies = [] else: with_dummies = [data.drop(columns_to_encode, axis=1)] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first) return result
def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.tools.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: obj = self._selected_obj else: obj = self._obj_with_exclusions results = [] keys = [] # degenerate case if obj.ndim == 1: for a in arg: try: colg = self._gotitem(obj.name, ndim=1, subset=obj) results.append(colg.aggregate(a)) # make sure we find a good name name = com._get_callable_name(a) or a keys.append(name) except (TypeError, DataError): pass except SpecificationError: raise # multiples else: for col in obj: try: colg = self._gotitem(col, ndim=1, subset=obj[col]) results.append(colg.aggregate(arg)) keys.append(col) except (TypeError, DataError): pass except ValueError: # cannot aggregate continue except SpecificationError: raise # if we are empty if not len(results): raise ValueError("no results") try: return concat(results, keys=keys, axis=1) except TypeError: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas.types.cast import is_nested_object from pandas import Series result = Series(results, index=keys, name=self.name) if is_nested_object(result): raise ValueError("cannot combine transform and " "aggregation operations") return result
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 warnings.warn(("using a dict with renaming " "is deprecated and will be removed in a future " "version"), FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) if (isinstance(obj, ABCDataFrame) and len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim( self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCSeries) for r in compat.itervalues(result) ]) def is_any_frame(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCDataFrame) for r in compat.itervalues(result) ]) if isinstance(result, list): return concat(result, keys=keys, axis=1), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._is_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True