def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin, margins_name='All'): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): return (key, margins_name) + ('',) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby(rows).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) # we are going to mutate this, so need to copy! piece = piece.copy() try: piece[all_key] = margin[key] except TypeError: # we cannot reshape, so coerce the axis piece.set_axis(piece._get_axis( cat_axis)._to_safe_for_reshape(), axis=cat_axis, inplace=True) piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: margin = grand_margin cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) if len(rows) == 0: return result else: result = table margin_keys = table.columns if len(cols) > 0: row_margin = data[cols + values].groupby(cols).agg(aggfunc) row_margin = row_margin.stack() # slight hack new_order = [len(cols)] + lrange(len(cols)) row_margin.index = row_margin.index.reorder_levels(new_order) else: row_margin = Series(np.nan, index=result.columns) return result, margin_keys, row_margin
def read(self): """Read the whole JSON input into a pandas object""" if self.lines and self.chunksize: obj = concat(self) elif self.lines: obj = self._get_object_parser( self._combine_lines(self.data.split('\n')) ) else: obj = self._get_object_parser(self.data) self.close() return obj
def _unstack_extension_series(series, level, fill_value): """ Unstack an ExtensionArray-backed Series. The ExtensionDtype is preserved. Parameters ---------- series : Series A Series with an ExtensionArray for values level : Any The level name or number. fill_value : Any The user-level (not physical storage) fill value to use for missing values introduced by the reshape. Passed to ``series.values.take``. Returns ------- DataFrame Each column of the DataFrame will have the same dtype as the input Series. """ # Implementation note: the basic idea is to # 1. Do a regular unstack on a dummy array of integers # 2. Followup with a columnwise take. # We use the dummy take to discover newly-created missing values # introduced by the reshape. from pandas.core.reshape.concat import concat dummy_arr = np.arange(len(series)) # fill_value=-1, since we will do a series.values.take later result = _Unstacker(dummy_arr, series.index, level=level, fill_value=-1).get_result() out = [] values = extract_array(series, extract_numpy=False) for col, indices in result.iteritems(): out.append(Series(values.take(indices.values, allow_fill=True, fill_value=fill_value), name=col, index=result.index)) return concat(out, axis='columns', copy=False, keys=result.columns)
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 warnings.warn( ("using a dict with renaming " "is deprecated and will be removed in a future " "version"), FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: msg = ('cannot perform renaming for {key} with a ' 'nested dictionary').format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() elif isinstance(obj, ABCDataFrame) and \ k not in obj.columns: raise KeyError( "Column '{col}' does not exist!".format(col=k)) arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) if (isinstance(obj, ABCDataFrame) and len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.core.reshape.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg(arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCSeries) for r in compat.itervalues(result)) def is_any_frame(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCDataFrame) for r in compat.itervalues(result)) if isinstance(result, list): return concat(result, keys=keys, axis=1), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._is_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None): """ Convert categorical variable into dummy/indicator variables Parameters ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. .. versionadded:: 0.22.0 Returns ------- dummies : DataFrame or SparseDataFrame Examples -------- >>> import pandas as pd >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 See Also -------- Series.str.get_dummies """ from pandas.core.reshape.concat import concat from itertools import cycle if isinstance(data, DataFrame): # determine columns being encoded if columns is None: columns_to_encode = data.select_dtypes( include=['object', 'category']).columns else: columns_to_encode = columns # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): len_msg = ("Length of '{name}' ({len_item}) did not match the " "length of the columns being encoded ({len_enc}).") if is_list_like(item): if not len(item) == len(columns_to_encode): len_msg = len_msg.format(name=name, len_item=len(item), len_enc=len(columns_to_encode)) raise ValueError(len_msg) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in columns_to_encode] if prefix is None: prefix = columns_to_encode # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in columns_to_encode] if set(columns_to_encode) == set(data.columns): with_dummies = [] else: with_dummies = [data.drop(columns_to_encode, axis=1)] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) return result
def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: id_vars = list(id_vars) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list)): raise ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') else: value_vars = list(value_vars) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, ABCMultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = [ 'variable_{i}'.format(i=i) for i in range(len(frame.columns.names)) ] else: var_name = [ frame.columns.name if frame.columns.name is not None else 'variable' ] if isinstance(var_name, compat.string_types): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: id_data = frame.pop(col) if is_extension_type(id_data): id_data = concat([id_data] * K, ignore_index=True) else: id_data = np.tile(id_data.values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray( frame.columns._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns)
def get_codon_table(transl_table=11): if transl_table == 11: codons = DataFrame( dict(AAs=list( 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' ), Starts=list( '---M------**--*----M------------MMMM---------------M------------' ), Base1=list( 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG' ), Base2=list( 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG' ), Base3=list( 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG' ))) elif transl_table == 1: codons = DataFrame( dict(AAs=list( 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' ), Starts=list( '---M------**--*----M---------------M----------------------------' ), Base1=list( 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG' ), Base2=list( 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG' ), Base3=list( 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG' ))) else: raise ValueError('Table {} not coded'.format(transl_table)) codons['Codon'] = codons[['Base1', 'Base2', 'Base3']].sum(1) codons = codons.set_index('Codon') starts = codons[codons.Starts == 'M'].dropna().index aas = codons['AAs'] def determine_synonymity(codon, col='AAs'): ret = defaultdict(dict) for i, base in enumerate(codon.name): s = 0 for opt in ['T', 'C', 'A', 'G']: if opt == base: continue newcodon = list(codon.name) newcodon[i] = opt newcodon = ''.join(newcodon) if codons.loc[newcodon][col] == codon[col]: s += 1 ret[codon.name][('S', i)] = s ret[codon.name][('NS', i)] = 3 - s return DataFrame(ret).T synonymity = concat( [determine_synonymity(codon) for nm, codon in codons.iterrows()]) startsyn = concat([determine_synonymity(codon, 'Starts') for nm, codon in codons.iterrows() \ if codon.name in starts]) return aas, startsyn, synonymity
def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` if isinstance(frame.columns, ABCMultiIndex): cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(np.ravel(id_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'id_vars' are not present" " in the DataFrame: {missing}" "".format(missing=list(missing))) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list)): raise ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') else: value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(np.ravel(value_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'value_vars' are not present in" " the DataFrame: {missing}" "".format(missing=list(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, ABCMultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = ['variable_{i}'.format(i=i) for i in range(len(frame.columns.names))] else: var_name = [frame.columns.name if frame.columns.name is not None else 'variable'] if isinstance(var_name, str): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: id_data = frame.pop(col) if is_extension_type(id_data): id_data = concat([id_data] * K, ignore_index=True) else: id_data = np.tile(id_data.values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns)
def heatmap_data(self): import matplotlib.pyplot as plt _type = request.args.get('type') platform = request.args.get('platform') genes = split_params(request.args.get('gene')) if _type == 'mouse': datasets = split_params(request.args.get('tissue')) # if len(datasets) < 2: # raise InvalidUsage('Too few data sets', 400) table_model = self._get_table_name(_type) query = db.session.query(table_model).filter( table_model.gene.in_(genes)) df = pd.read_sql(query.statement, db.engine, index_col='gene') df = df.rename(columns=table_model.column_mappings()) df = df[datasets] else: types = platform + '_' + 'tissue' datasets1 = split_params(request.args.get('tissue')) table_model = self._get_table_name(types) query = db.session.query(table_model).filter( table_model.gene.in_(genes)).order_by(asc(table_model.gene)) df1 = pd.read_sql(query.statement, db.engine, index_col='gene') df1 = df1.rename(columns=table_model.column_mappings()) df1 = df1[datasets1] types = platform + '_' + 'disease' datasets2 = split_params(request.args.get('disease')) table_model = self._get_table_name(types) query = db.session.query(table_model).filter( table_model.gene.in_(genes)).order_by(asc(table_model.gene)) df2 = pd.read_sql(query.statement, db.engine, index_col='gene') df2 = df2.rename(columns=table_model.column_mappings()) df2 = df2[datasets2] # if len(datasets1 + datasets2) < 2: # raise InvalidUsage('Too few data sets', 400) df = concat([df1, df2], join="inner", axis=1) # print(df) df.index.name = None if len(df) == 0: empty_df = df.reset_index() return make_dict_response( fig=None, data=empty_df.to_dict('records'), columns=empty_df.columns.tolist(), ) # cellSizePixels = 75 # dpi = matplotlib.rcParams['figure.dpi'] # marginWidth = matplotlib.rcParams['figure.subplot.right']-matplotlib.rcParams['figure.subplot.left'] # marginHeight = matplotlib.rcParams['figure.subplot.top']-matplotlib.rcParams['figure.subplot.bottom'] # Ny,Nx = df.shape # figWidth = (Nx*cellSizePixels/dpi)/0.8/marginWidth # figHeigh = (Ny*cellSizePixels/dpi)/0.8/marginHeight row_cluster = df.shape[0] > 1 col_cluster = df.shape[1] > 1 df = df.fillna(0) sns.set(font_scale=1.2) cluster = sns.clustermap(df, linewidths=1, cmap="mako", vmin=0, vmax=10, row_cluster=row_cluster, col_cluster=col_cluster) # axWidth = (Nx*cellSizePixels)/(figWidth*dpi) # axHeight = (Ny*cellSizePixels)/(figHeigh*dpi) # # # resize heatmap # ax_heatmap_orig_pos = cluster.ax_heatmap.get_position() # cluster.ax_heatmap.set_position([ax_heatmap_orig_pos.x0, ax_heatmap_orig_pos.y0, # axWidth, axHeight]) # # # resize dendrograms to match # ax_row_orig_pos = cluster.ax_row_dendrogram.get_position() # cluster.ax_row_dendrogram.set_position([ax_row_orig_pos.x0, ax_row_orig_pos.y0, # ax_row_orig_pos.width, axHeight]) # ax_col_orig_pos = cluster.ax_col_dendrogram.get_position() # cluster.ax_col_dendrogram.set_position([ax_col_orig_pos.x0, ax_heatmap_orig_pos.y0+axHeight, # axWidth, ax_col_orig_pos.height]) plt.setp(cluster.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, horizontalalignment='right') img = io.BytesIO() cluster.savefig(img, format='png') img.seek(0) fig = base64.b64encode(img.getvalue()).decode('utf-8') # Safari fix: https://stackoverflow.com/questions/27396376/base64-image-tag-in-safari-did-not-showed-up # pad_num = 4 - (len(fig) % 4) # if pad_num < 4: # fig = fig + ('=' * pad_num) cluster_data = cluster.data2d.round(4).reset_index() return make_dict_response( fig='data:image/png;base64,' + fig, data=cluster_data.to_dict('records'), columns=cluster_data.columns.tolist(), )
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) _axis = kwargs.pop("_axis", None) if _axis is None: _axis = getattr(self, "axis", 0) if isinstance(arg, str): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") obj = self._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in arg.values()): new_arg = {} for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): raise SpecificationError( "nested renamer is not supported") elif isinstance(obj, ABCSeries): raise SpecificationError( "nested renamer is not supported") elif isinstance(obj, ABCDataFrame) and k not in obj.columns: raise KeyError(f"Column '{k}' does not exist!") arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys)) != len(keys): cols = sorted( set(keys) - set(obj.columns.intersection(keys))) raise SpecificationError(f"Column(s) {cols} do not exist") from pandas.core.reshape.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError( "nested dictionary is ambiguous in aggregation") return colg.aggregate(how) def _agg_2dim(how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how) def _agg(arg, func): """ run the aggregations over the arg with func return a dict """ result = {} for fname, agg_how in arg.items(): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(arg.keys()) result = {} if self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim( self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series() -> bool: # return a boolean if we have *any* nested series return any(isinstance(r, ABCSeries) for r in result.values()) def is_any_frame() -> bool: # return a boolean if we have *any* nested series return any( isinstance(r, ABCDataFrame) for r in result.values()) if isinstance(result, list): return concat(result, keys=keys, axis=1, sort=True), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError as err: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") from err return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, "name", None)) return result, True elif is_list_like(arg): # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _axis=_axis), None else: result = None f = self._get_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = '{prefix}{prefix_sep}{level}' return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] if isinstance(data, Series): index = data.index else: index = None if sparse: if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def pivot_table( data, values=None, index=None, columns=None, aggfunc="mean", fill_value=None, margins=False, dropna=True, margins_name="All", observed=False, ) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces: List[DataFrame] = [] keys = [] for func in aggfunc: table = pivot_table( data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, dropna=dropna, margins_name=margins_name, observed=observed, ) pieces.append(table) keys.append(getattr(func, "__name__", func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if (v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged # GH17038, this check should only happen if index is defined (not None) if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: if isinstance(table.index, MultiIndex): m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if isinstance(table.columns, MultiIndex): m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: _table = table.fillna(fill_value, downcast="infer") assert _table is not None # needed for mypy table = _table if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins( table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value, ) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) return table
def melt( frame: "DataFrame", id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, ) -> "DataFrame": # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` if isinstance(frame.columns, MultiIndex): cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) if value_name in frame.columns: warnings.warn( "This dataframe has a column name that matches the 'value_name' column " "name of the resultiing Dataframe. " "In the future this will raise an error, please set the 'value_name' " "parameter of DataFrame.melt to a unique name.", FutureWarning, stacklevel=3, ) if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list): raise ValueError( "id_vars must be a list of tuples when columns are a MultiIndex" ) else: # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'id_vars' are not present " f"in the DataFrame: {list(missing)}") else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list): raise ValueError( "value_vars must be a list of tuples when columns are a MultiIndex" ) else: value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'value_vars' are not present in " f"the DataFrame: {list(missing)}") if col_level is not None: idx = frame.columns.get_level_values(col_level).get_indexer( id_vars + value_vars) else: idx = frame.columns.get_indexer(id_vars + value_vars) frame = frame.iloc[:, idx] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, MultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = [ f"variable_{i}" for i in range(len(frame.columns.names)) ] else: var_name = [ frame.columns.name if frame.columns.name is not None else "variable" ] if isinstance(var_name, str): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: id_data = frame.pop(col) if is_extension_array_dtype(id_data): id_data = cast("Series", concat([id_data] * K, ignore_index=True)) else: id_data = np.tile(id_data._values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame._values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray( frame.columns._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns)
def agg_list_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a list-like argument. Returns ------- Result of aggregation. """ from pandas.core.reshape.concat import concat obj = self.obj arg = cast(List[AggFuncTypeBase], self.f) if not isinstance(obj, SelectionMixin): # i.e. obj is Series or DataFrame selected_obj = obj elif obj._selected_obj.ndim == 1: # For SeriesGroupBy this matches _obj_with_exclusions selected_obj = obj._selected_obj else: selected_obj = obj._obj_with_exclusions results = [] keys = [] # degenerate case if selected_obj.ndim == 1: for a in arg: colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) try: new_res = colg.aggregate(a) except TypeError: pass else: results.append(new_res) # make sure we find a good name name = com.get_callable_name(a) or a keys.append(name) # multiples else: indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: new_res = colg.aggregate(arg) except (TypeError, DataError): pass except ValueError as err: # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named pass elif "no results" in str(err): # reached in test_frame_apply.test_nuiscance_columns # where the colg.aggregate(arg) ends up going through # the selected_obj.ndim == 1 branch above with arg == ["sum"] # on a datetime64[ns] column pass else: raise else: results.append(new_res) indices.append(index) keys = selected_obj.columns.take(indices) # if we are empty if not len(results): raise ValueError("no results") try: concatenated = concat(results, keys=keys, axis=1, sort=False) except TypeError as err: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas import Series result = Series(results, index=keys, name=obj.name) if is_nested_object(result): raise ValueError( "cannot combine transform and aggregation operations" ) from err return result else: # Concat uses the first index to determine the final indexing order. # The union of a shorter first index with the other indices causes # the index sorting to be different from the order of the aggregating # functions. Reindex if this is the case. index_size = concatenated.index.size full_ordered_index = next( result.index for result in results if result.index.size == index_size ) return concatenated.reindex(full_ordered_index, copy=False)
def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a dict-like argument. Parameters ---------- _axis : int, 0 or 1 Axis to compute aggregation on. Returns ------- Result of aggregation. """ from pandas.core.reshape.concat import concat obj = self.obj arg = cast(AggFuncTypeDict, self.f) if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") selected_obj = obj._selected_obj arg = self.normalize_dictlike_arg("agg", selected_obj, arg) if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(obj._selection, ndim=1) results = {key: colg.agg(how) for key, how in arg.items()} else: # key used for column selection and output results = { key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() } # set the final keys keys = list(arg.keys()) # Avoid making two isinstance calls in all and any below is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] # combine results if all(is_ndframe): keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys axis = 0 if isinstance(obj, ABCSeries) else 1 result = concat({k: results[k] for k in keys_to_use}, axis=axis) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") else: from pandas import Series # we have a dict of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) name = obj.name else: name = None result = Series(results, name=name) return result
def _normalize(table, normalize, margins, margins_name="All"): if not isinstance(normalize, (bool, str)): axis_subs = {0: "index", 1: "columns"} try: normalize = axis_subs[normalize] except KeyError: raise ValueError("Not a valid normalize argument") if margins is False: # Actual Normalizations normalizers = { "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), } normalizers[True] = normalizers["all"] try: f = normalizers[normalize] except KeyError: raise ValueError("Not a valid normalize argument") table = f(table) table = table.fillna(0) elif margins is True: column_margin = table.loc[:, margins_name].drop(margins_name) index_margin = table.loc[margins_name, :].drop(margins_name) table = table.drop(margins_name, axis=1).drop(margins_name) # to keep index and columns names table_index_names = table.index.names table_columns_names = table.columns.names # Normalize core table = _normalize(table, normalize=normalize, margins=False) # Fix Margins if normalize == "columns": column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table.append(index_margin) table = table.fillna(0) else: raise ValueError("Not a valid normalize argument") table.index.names = table_index_names table.columns.names = table_columns_names else: raise ValueError("Not a valid margins argument") return table
def agg_list_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a list-like argument. Returns ------- Result of aggregation. """ from pandas.core.reshape.concat import concat obj = self.obj arg = cast(List[AggFuncTypeBase], self.f) if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") if not isinstance(obj, SelectionMixin): # i.e. obj is Series or DataFrame selected_obj = obj elif obj._selected_obj.ndim == 1: # For SeriesGroupBy this matches _obj_with_exclusions selected_obj = obj._selected_obj else: selected_obj = obj._obj_with_exclusions results = [] keys = [] failed_names = [] depr_nuisance_columns_msg = ( "{} did not aggregate successfully. If any error is " "raised this will raise in a future version of pandas. " "Drop these columns/ops to avoid this warning.") # degenerate case if selected_obj.ndim == 1: for a in arg: colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) try: new_res = colg.aggregate(a) except TypeError: failed_names.append(com.get_callable_name(a) or a) else: results.append(new_res) # make sure we find a good name name = com.get_callable_name(a) or a keys.append(name) # multiples else: indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: # Capture and suppress any warnings emitted by us in the call # to agg below, but pass through any warnings that were # generated otherwise. # This is necessary because of https://bugs.python.org/issue29672 # See GH #43741 for more details with warnings.catch_warnings(record=True) as record: new_res = colg.aggregate(arg) if len(record) > 0: match = re.compile( depr_nuisance_columns_msg.format(".*")) for warning in record: if re.match(match, str(warning.message)): failed_names.append(col) else: warnings.warn_explicit( message=warning.message, category=warning.category, filename=warning.filename, lineno=warning.lineno, ) except (TypeError, DataError): failed_names.append(col) except ValueError as err: # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named failed_names.append(col) elif "no results" in str(err): # reached in test_frame_apply.test_nuiscance_columns # where the colg.aggregate(arg) ends up going through # the selected_obj.ndim == 1 branch above with arg == ["sum"] # on a datetime64[ns] column failed_names.append(col) else: raise else: results.append(new_res) indices.append(index) keys = selected_obj.columns.take(indices) # if we are empty if not len(results): raise ValueError("no results") if len(failed_names) > 0: warnings.warn( depr_nuisance_columns_msg.format(failed_names), FutureWarning, stacklevel=find_stack_level(), ) try: concatenated = concat(results, keys=keys, axis=1, sort=False) except TypeError as err: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas import Series result = Series(results, index=keys, name=obj.name) if is_nested_object(result): raise ValueError( "cannot combine transform and aggregation operations" ) from err return result else: # Concat uses the first index to determine the final indexing order. # The union of a shorter first index with the other indices causes # the index sorting to be different from the order of the aggregating # functions. Reindex if this is the case. index_size = concatenated.index.size full_ordered_index = next(result.index for result in results if result.index.size == index_size) return concatenated.reindex(full_ordered_index, copy=False)
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(getattr(func, '__name__', func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) # group by the cartesian product of the grouper # if we have a categorical grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how='all') # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in [v for v in values if v in data and v in agged]: if (is_integer_dtype(data[v]) and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: from pandas import MultiIndex if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def melt( frame, id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, ): # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` if isinstance(frame.columns, ABCMultiIndex): cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list): raise ValueError( "id_vars must be a list of tuples when columns are a MultiIndex" ) else: # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(np.ravel(id_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'id_vars' are not present" " in the DataFrame: {missing}" "".format(missing=list(missing))) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list): raise ValueError( "value_vars must be a list of tuples when columns are a MultiIndex" ) else: value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(np.ravel(value_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'value_vars' are not present in" " the DataFrame: {missing}" "".format(missing=list(missing))) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, ABCMultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = [ "variable_{i}".format(i=i) for i in range(len(frame.columns.names)) ] else: var_name = [ frame.columns.name if frame.columns.name is not None else "variable" ] if isinstance(var_name, str): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: id_data = frame.pop(col) if is_extension_type(id_data): id_data = concat([id_data] * K, ignore_index=True) else: id_data = np.tile(id_data.values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray( frame.columns._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns)
def agg_dict_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a dict-like argument. Returns ------- Result of aggregation. """ from pandas import Index from pandas.core.reshape.concat import concat obj = self.obj arg = cast(AggFuncTypeDict, self.f) if not isinstance(obj, SelectionMixin): # i.e. obj is Series or DataFrame selected_obj = obj selection = None else: selected_obj = obj._selected_obj selection = obj._selection arg = self.normalize_dictlike_arg("agg", selected_obj, arg) if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(selection, ndim=1) results = {key: colg.agg(how) for key, how in arg.items()} else: # key used for column selection and output results = { key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() } # set the final keys keys = list(arg.keys()) # Avoid making two isinstance calls in all and any below is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] # combine results if all(is_ndframe): keys_to_use: Iterable[Hashable] keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) ktu._set_names(selected_obj.columns.names) keys_to_use = ktu axis = 0 if isinstance(obj, ABCSeries) else 1 result = concat( {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use ) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError( "cannot perform both aggregation " "and transformation operations " "simultaneously" ) else: from pandas import Series # we have a dict of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) name = obj.name else: name = None result = Series(results, name=name) return result
def transform( obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs, ) -> FrameOrSeries: """ Transform a DataFrame or Series Parameters ---------- obj : DataFrame or Series Object to compute the transform on. func : string, function, list, or dictionary Function(s) to compute the transform with. axis : {0 or 'index', 1 or 'columns'} Axis along which the function is applied: * 0 or 'index': apply function to each column. * 1 or 'columns': apply function to each row. Returns ------- DataFrame or Series Result of applying ``func`` along the given axis of the Series or DataFrame. Raises ------ ValueError If the transform function fails or does not transform. """ from pandas.core.reshape.concat import concat is_series = obj.ndim == 1 if obj._get_axis_number(axis) == 1: assert not is_series return transform(obj.T, func, 0, *args, **kwargs).T if isinstance(func, list): if is_series: func = {com.get_callable_name(v) or v: v for v in func} else: func = {col: func for col in obj} if isinstance(func, dict): if not is_series: cols = sorted(set(func.keys()) - set(obj.columns)) if len(cols) > 0: raise SpecificationError(f"Column(s) {cols} do not exist") if any(isinstance(v, dict) for v in func.values()): # GH 15931 - deprecation of renaming keys raise SpecificationError("nested renamer is not supported") results = {} for name, how in func.items(): colg = obj._gotitem(name, ndim=1) try: results[name] = transform(colg, how, 0, *args, **kwargs) except Exception as e: if str(e) == "Function did not transform": raise e # combine results if len(results) == 0: raise ValueError("Transform function failed") return concat(results, axis=1) # func is either str or callable try: if isinstance(func, str): result = obj._try_aggregate_string_function(func, *args, **kwargs) else: f = obj._get_cython_func(func) if f and not args and not kwargs: result = getattr(obj, f)() else: try: result = obj.apply(func, args=args, **kwargs) except Exception: result = func(obj, *args, **kwargs) except Exception: raise ValueError("Transform function failed") # Functions that transform may return empty Series/DataFrame # when the dtype is not appropriate if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: raise ValueError("Transform function failed") if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( obj.index ): raise ValueError("Function did not transform") return result
def test_join_multi_levels(self): # GH 3662 # merge multi-levels household = DataFrame( dict( household_id=[1, 2, 3], male=[0, 1, 0], wealth=[196087.3, 316478.7, 294750], ), columns=["household_id", "male", "wealth"], ).set_index("household_id") portfolio = DataFrame( dict( household_id=[1, 2, 2, 3, 3, 3, 4], asset_id=[ "nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "nl0000289965", np.nan, ], name=[ "ABN Amro", "Robeco", "Royal Dutch Shell", "Royal Dutch Shell", "AAB Eastern Europe Equity Fund", "Postbank BioTech Fonds", np.nan, ], share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], ), columns=["household_id", "asset_id", "name", "share"], ).set_index(["household_id", "asset_id"]) result = household.join(portfolio, how="inner") expected = (DataFrame( dict( male=[0, 1, 1, 0, 0, 0], wealth=[ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0 ], name=[ "ABN Amro", "Robeco", "Royal Dutch Shell", "Royal Dutch Shell", "AAB Eastern Europe Equity Fund", "Postbank BioTech Fonds", ], share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], household_id=[1, 2, 2, 3, 3, 3], asset_id=[ "nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "nl0000289965", ], )).set_index([ "household_id", "asset_id" ]).reindex(columns=["male", "wealth", "name", "share"])) tm.assert_frame_equal(result, expected) # equivalency result = merge( household.reset_index(), portfolio.reset_index(), on=["household_id"], how="inner", ).set_index(["household_id", "asset_id"]) tm.assert_frame_equal(result, expected) result = household.join(portfolio, how="outer") expected = concat( [ expected, (DataFrame( dict(share=[1.00]), index=MultiIndex.from_tuples( [(4, np.nan)], names=["household_id", "asset_id"]), )), ], axis=0, sort=True, ).reindex(columns=expected.columns) tm.assert_frame_equal(result, expected) # invalid cases household.index.name = "foo" with pytest.raises(ValueError): household.join(portfolio, how="inner") portfolio2 = portfolio.copy() portfolio2.index.set_names(["household_id", "foo"]) with pytest.raises(ValueError): portfolio2.join(portfolio, how="inner")
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None): """ Convert categorical variable into dummy/indicator variables Parameters ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. .. versionadded:: 0.23.0 Returns ------- dummies : DataFrame or SparseDataFrame Examples -------- >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 See Also -------- Series.str.get_dummies """ from pandas.core.reshape.concat import concat from itertools import cycle dtypes_to_encode = ['object', 'category'] if isinstance(data, DataFrame): # determine columns being encoded if columns is None: data_to_encode = data.select_dtypes( include=dtypes_to_encode) else: data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): len_msg = ("Length of '{name}' ({len_item}) did not match the " "length of the columns being encoded ({len_enc}).") if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = \ len_msg.format(name=name, len_item=len(item), len_enc=data_to_encode.shape[1]) raise ValueError(len_msg) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in data_to_encode.columns] if prefix is None: prefix = data_to_encode.columns # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns with_dummies = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. with_dummies = [data.drop(columns, axis=1)] else: # Encoding only object and category dtype columns. Get remaining # columns to prepend to result. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): # col is (column_name, column), use just column data here dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) return result
def _normalize(table, normalize, margins: bool, margins_name="All"): if not isinstance(normalize, (bool, str)): axis_subs = {0: "index", 1: "columns"} try: normalize = axis_subs[normalize] except KeyError as err: raise ValueError("Not a valid normalize argument") from err if margins is False: # Actual Normalizations normalizers: Dict[Union[bool, str], Callable] = { "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), } normalizers[True] = normalizers["all"] try: f = normalizers[normalize] except KeyError as err: raise ValueError("Not a valid normalize argument") from err table = f(table) table = table.fillna(0) elif margins is True: # keep index and column of pivoted table table_index = table.index table_columns = table.columns # check if margin name is in (for MI cases) or equal to last # index/column and save the column and index margin if (margins_name not in table.iloc[-1, :].name) | ( margins_name != table.iloc[:, -1].name): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] # keep the core table table = table.iloc[:-1, :-1] # Normalize core table = _normalize(table, normalize=normalize, margins=False) # Fix Margins if normalize == "columns": column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) table.columns = table_columns elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) table.index = table_index elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table.append(index_margin) table = table.fillna(0) table.index = table_index table.columns = table_columns else: raise ValueError("Not a valid normalize argument") else: raise ValueError("Not a valid margins argument") return table
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False): """ Convert categorical variable into dummy/indicator variables Parameters ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternativly, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. .. versionadded:: 0.16.1 drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 Returns ------- dummies : DataFrame or SparseDataFrame Examples -------- >>> import pandas as pd >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 See Also -------- Series.str.get_dummies """ from pandas.core.reshape.concat import concat from itertools import cycle if isinstance(data, DataFrame): # determine columns being encoded if columns is None: columns_to_encode = data.select_dtypes( include=['object', 'category']).columns else: columns_to_encode = columns # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError( length_msg.format(name, len(item), len(columns_to_encode))) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in columns_to_encode] if prefix is None: prefix = columns_to_encode # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in columns_to_encode] if set(columns_to_encode) == set(data.columns): with_dummies = [] else: with_dummies = [data.drop(columns_to_encode, axis=1)] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first) return result
def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.core.reshape.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: obj = self._selected_obj else: obj = self._obj_with_exclusions results = [] keys = [] # degenerate case if obj.ndim == 1: for a in arg: try: colg = self._gotitem(obj.name, ndim=1, subset=obj) results.append(colg.aggregate(a)) # make sure we find a good name name = com.get_callable_name(a) or a keys.append(name) except (TypeError, DataError): pass except SpecificationError: raise # multiples else: for index, col in enumerate(obj): try: colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) results.append(colg.aggregate(arg)) keys.append(col) except (TypeError, DataError): pass except ValueError: # cannot aggregate continue except SpecificationError: raise # if we are empty if not len(results): raise ValueError("no results") try: return concat(results, keys=keys, axis=1, sort=False) except TypeError: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas.core.dtypes.cast import is_nested_object from pandas import Series result = Series(results, index=keys, name=self.name) if is_nested_object(result): raise ValueError("cannot combine transform and " "aggregation operations") return result
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError): pass values = list(values) grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: from pandas import MultiIndex try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name, fill_value=fill_value) # discard the top level if values_passed and not values_multi and not table.empty and \ (table.columns.nlevels > 1): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def _aggregate_multiple_funcs(self, arg, _axis): from pandas.core.reshape.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: obj = self._selected_obj else: obj = self._obj_with_exclusions results = [] keys = [] # degenerate case if obj.ndim == 1: for a in arg: colg = self._gotitem(obj.name, ndim=1, subset=obj) try: new_res = colg.aggregate(a) except TypeError: pass else: results.append(new_res) # make sure we find a good name name = com.get_callable_name(a) or a keys.append(name) # multiples else: for index, col in enumerate(obj): colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) try: new_res = colg.aggregate(arg) except (TypeError, DataError): pass except ValueError as err: # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named pass elif "no results" in str(err): # raised directly in _aggregate_multiple_funcs pass else: raise else: results.append(new_res) keys.append(col) # if we are empty if not len(results): raise ValueError("no results") try: return concat(results, keys=keys, axis=1, sort=False) except TypeError as err: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas import Series result = Series(results, index=keys, name=self.name) if is_nested_object(result): raise ValueError( "cannot combine transform and aggregation operations" ) from err return result
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. Examples -------- >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", ... "bar", "bar", "bar", "bar"], ... "B": ["one", "one", "one", "two", "two", ... "one", "one", "two", "two"], ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table ... # doctest: +NORMALIZE_WHITESPACE C large small A B bar one 4.0 5.0 two 7.0 6.0 foo one 4.0 1.0 two NaN 6.0 Returns ------- table : DataFrame See also -------- DataFrame.pivot : pivot without aggregation that can handle non-numeric data """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError): pass values = list(values) grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [ agged.index.names[i] or i for i in range(len(index), len(keys)) ] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name) # discard the top level if values_passed and not values_multi and not table.empty and \ (table.columns.nlevels > 1): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Makse sure empty columns are removed if dropna=True if isinstance(table, DataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 warnings.warn(("using a dict with renaming " "is deprecated and will be removed in a future " "version"), FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: msg = ('cannot perform renaming for {key} with a ' 'nested dictionary').format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() elif isinstance(obj, ABCDataFrame) and \ k not in obj.columns: raise KeyError( "Column '{col}' does not exist!".format(col=k)) arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) if (isinstance(obj, ABCDataFrame) and len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.core.reshape.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim( self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any( isinstance(r, ABCSeries) for r in compat.itervalues(result)) def is_any_frame(): # return a boolean if we have *any* nested series return any( isinstance(r, ABCDataFrame) for r in compat.itervalues(result)) if isinstance(result, list): return concat(result, keys=keys, axis=1, sort=True), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._is_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin, margins_name='All'): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): return (key, margins_name) + ('', ) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby(rows).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) # we are going to mutate this, so need to copy! piece = piece.copy() try: piece[all_key] = margin[key] except TypeError: # we cannot reshape, so coerce the axis piece.set_axis( piece._get_axis(cat_axis)._to_safe_for_reshape(), axis=cat_axis, inplace=True) piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: margin = grand_margin cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) if len(rows) == 0: return result else: result = table margin_keys = table.columns if len(cols) > 0: row_margin = data[cols + values].groupby(cols).agg(aggfunc) row_margin = row_margin.stack() # slight hack new_order = [len(cols)] + lrange(len(cols)) row_margin.index = row_margin.index.reorder_levels(new_order) else: row_margin = Series(np.nan, index=result.columns) return result, margin_keys, row_margin
def _normalize(table, normalize, margins, margins_name='All'): if not isinstance(normalize, bool) and not isinstance( normalize, compat.string_types): axis_subs = {0: 'index', 1: 'columns'} try: normalize = axis_subs[normalize] except KeyError: raise ValueError("Not a valid normalize argument") if margins is False: # Actual Normalizations normalizers = { 'all': lambda x: x / x.sum(axis=1).sum(axis=0), 'columns': lambda x: x / x.sum(), 'index': lambda x: x.div(x.sum(axis=1), axis=0) } normalizers[True] = normalizers['all'] try: f = normalizers[normalize] except KeyError: raise ValueError("Not a valid normalize argument") table = f(table) table = table.fillna(0) elif margins is True: column_margin = table.loc[:, margins_name].drop(margins_name) index_margin = table.loc[margins_name, :].drop(margins_name) table = table.drop(margins_name, axis=1).drop(margins_name) # to keep index and columns names table_index_names = table.index.names table_columns_names = table.columns.names # Normalize core table = _normalize(table, normalize=normalize, margins=False) # Fix Margins if normalize == 'columns': column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) elif normalize == 'index': index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table.append(index_margin) table = table.fillna(0) else: raise ValueError("Not a valid normalize argument") table.index.names = table_index_names table.columns.names = table_columns_names else: raise ValueError("Not a valid margins argument") return table
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. Examples -------- >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", ... "bar", "bar", "bar", "bar"], ... "B": ["one", "one", "one", "two", "two", ... "one", "one", "two", "two"], ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table ... # doctest: +NORMALIZE_WHITESPACE C large small A B bar one 4.0 5.0 two 7.0 6.0 foo one 4.0 1.0 two NaN 6.0 Returns ------- table : DataFrame See also -------- DataFrame.pivot : pivot without aggregation that can handle non-numeric data """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError): pass values = list(values) grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name, fill_value=fill_value) # discard the top level if values_passed and not values_multi and not table.empty and \ (table.columns.nlevels > 1): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Makse sure empty columns are removed if dropna=True if isinstance(table, DataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def agg_list_like( obj: AggObjType, arg: List[AggFuncTypeBase], _axis: int, ) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a list-like argument. Parameters ---------- obj : Pandas object to compute aggregation on. arg : list Aggregations to compute. _axis : int, 0 or 1 Axis to compute aggregation on. Returns ------- Result of aggregation. """ from pandas.core.reshape.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if obj._selected_obj.ndim == 1: selected_obj = obj._selected_obj else: selected_obj = obj._obj_with_exclusions results = [] keys = [] # degenerate case if selected_obj.ndim == 1: for a in arg: colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) try: new_res = colg.aggregate(a) except TypeError: pass else: results.append(new_res) # make sure we find a good name name = com.get_callable_name(a) or a keys.append(name) # multiples else: for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: new_res = colg.aggregate(arg) except (TypeError, DataError): pass except ValueError as err: # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named pass elif "no results" in str(err): # raised directly in _aggregate_multiple_funcs pass else: raise else: results.append(new_res) keys.append(col) # if we are empty if not len(results): raise ValueError("no results") try: return concat(results, keys=keys, axis=1, sort=False) except TypeError as err: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas import Series result = Series(results, index=keys, name=obj.name) if is_nested_object(result): raise ValueError( "cannot combine transform and aggregation operations") from err return result
def test_join_multi_levels(self): # GH 3662 # merge multi-levels household = ( DataFrame( dict(household_id=[1, 2, 3], male=[0, 1, 0], wealth=[196087.3, 316478.7, 294750]), columns=['household_id', 'male', 'wealth']) .set_index('household_id')) portfolio = ( DataFrame( dict(household_id=[1, 2, 2, 3, 3, 3, 4], asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "nl0000289965", np.nan], name=["ABN Amro", "Robeco", "Royal Dutch Shell", "Royal Dutch Shell", "AAB Eastern Europe Equity Fund", "Postbank BioTech Fonds", np.nan], share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), columns=['household_id', 'asset_id', 'name', 'share']) .set_index(['household_id', 'asset_id'])) result = household.join(portfolio, how='inner') expected = ( DataFrame( dict(male=[0, 1, 1, 0, 0, 0], wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0], name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', 'Royal Dutch Shell', 'AAB Eastern Europe Equity Fund', 'Postbank BioTech Fonds'], share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], household_id=[1, 2, 2, 3, 3, 3], asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', 'gb00b03mlx29', 'lu0197800237', 'nl0000289965'])) .set_index(['household_id', 'asset_id']) .reindex(columns=['male', 'wealth', 'name', 'share'])) tm.assert_frame_equal(result, expected) # equivalency result = (merge(household.reset_index(), portfolio.reset_index(), on=['household_id'], how='inner') .set_index(['household_id', 'asset_id'])) tm.assert_frame_equal(result, expected) result = household.join(portfolio, how='outer') expected = (concat([ expected, (DataFrame( dict(share=[1.00]), index=MultiIndex.from_tuples( [(4, np.nan)], names=['household_id', 'asset_id']))) ], axis=0, sort=True).reindex(columns=expected.columns)) tm.assert_frame_equal(result, expected) # invalid cases household.index.name = 'foo' with pytest.raises(ValueError): household.join(portfolio, how='inner') portfolio2 = portfolio.copy() portfolio2.index.set_names(['household_id', 'foo']) with pytest.raises(ValueError): portfolio2.join(portfolio, how='inner')
def agg_dict_like( obj: AggObjType, arg: AggFuncTypeDict, _axis: int, ) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a dict-like argument. Parameters ---------- obj : Pandas object to compute aggregation on. arg : dict label-aggregation pairs to compute. _axis : int, 0 or 1 Axis to compute aggregation on. Returns ------- Result of aggregation. """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") selected_obj = obj._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in arg.values()): new_arg: AggFuncTypeDict = {} for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): raise SpecificationError("nested renamer is not supported") elif isinstance(selected_obj, ABCSeries): raise SpecificationError("nested renamer is not supported") elif (isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns): raise KeyError(f"Column '{k}' does not exist!") arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) if isinstance(selected_obj, ABCDataFrame) and len( selected_obj.columns.intersection(keys)) != len(keys): cols = sorted( set(keys) - set(selected_obj.columns.intersection(keys))) raise SpecificationError(f"Column(s) {cols} do not exist") from pandas.core.reshape.concat import concat if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(obj._selection, ndim=1) results = {key: colg.agg(how) for key, how in arg.items()} else: # key used for column selection and output results = { key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() } # set the final keys keys = list(arg.keys()) # Avoid making two isinstance calls in all and any below is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] # combine results if all(is_ndframe): keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys axis = 0 if isinstance(obj, ABCSeries) else 1 result = concat({k: results[k] for k in keys_to_use}, axis=axis) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") else: from pandas import Series # we have a dict of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) name = obj.name else: name = None result = Series(results, name=name) return result
def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.core.reshape.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: obj = self._selected_obj else: obj = self._obj_with_exclusions results = [] keys = [] # degenerate case if obj.ndim == 1: for a in arg: try: colg = self._gotitem(obj.name, ndim=1, subset=obj) results.append(colg.aggregate(a)) # make sure we find a good name name = com._get_callable_name(a) or a keys.append(name) except (TypeError, DataError): pass except SpecificationError: raise # multiples else: for col in obj: try: colg = self._gotitem(col, ndim=1, subset=obj[col]) results.append(colg.aggregate(arg)) keys.append(col) except (TypeError, DataError): pass except ValueError: # cannot aggregate continue except SpecificationError: raise # if we are empty if not len(results): raise ValueError("no results") try: return concat(results, keys=keys, axis=1) except TypeError: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas.core.dtypes.cast import is_nested_object from pandas import Series result = Series(results, index=keys, name=self.name) if is_nested_object(result): raise ValueError("cannot combine transform and " "aggregation operations") return result
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na=False, sparse=False, drop_first=False, dtype=None, ): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level) -> str: fstr = "{prefix}{prefix_sep}{level}" return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [ _make_col_name(prefix, prefix_sep, level) for level in levels ] index: Optional[Index] if isinstance(data, Series): index = data.index else: index = None if sparse: fill_value: Union[bool, float, int] if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices: List[List] = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _normalize(table, normalize, margins, margins_name='All'): if not isinstance(normalize, bool) and not isinstance(normalize, compat.string_types): axis_subs = {0: 'index', 1: 'columns'} try: normalize = axis_subs[normalize] except KeyError: raise ValueError("Not a valid normalize argument") if margins is False: # Actual Normalizations normalizers = { 'all': lambda x: x / x.sum(axis=1).sum(axis=0), 'columns': lambda x: x / x.sum(), 'index': lambda x: x.div(x.sum(axis=1), axis=0) } normalizers[True] = normalizers['all'] try: f = normalizers[normalize] except KeyError: raise ValueError("Not a valid normalize argument") table = f(table) table = table.fillna(0) elif margins is True: column_margin = table.loc[:, margins_name].drop(margins_name) index_margin = table.loc[margins_name, :].drop(margins_name) table = table.drop(margins_name, axis=1).drop(margins_name) # to keep index and columns names table_index_names = table.index.names table_columns_names = table.columns.names # Normalize core table = _normalize(table, normalize=normalize, margins=False) # Fix Margins if normalize == 'columns': column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) elif normalize == 'index': index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table.append(index_margin) table = table.fillna(0) else: raise ValueError("Not a valid normalize argument") table.index.names = table_index_names table.columns.names = table_columns_names else: raise ValueError("Not a valid margins argument") return table
def get_dummies( data, prefix=None, prefix_sep="_", dummy_na: bool = False, columns=None, sparse: bool = False, drop_first: bool = False, dtype: Optional[Dtype] = None, ) -> DataFrame: """ Convert categorical variable into dummy/indicator variables. Parameters ---------- data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, list of str, or dict of str, default None String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : str, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy-encoded columns should be backed by a :class:`SparseArray` (True) or a regular NumPy array (False). drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. Returns ------- DataFrame Dummy-coded data. See Also -------- Series.str.get_dummies : Convert Series to dummy codes. Examples -------- >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ from pandas.core.reshape.concat import concat dtypes_to_encode = ["object", "category"] if isinstance(data, DataFrame): # determine columns being encoded if columns is None: data_to_encode = data.select_dtypes(include=dtypes_to_encode) elif not is_list_like(columns): raise TypeError( "Input must be a list-like for parameter `columns`") else: data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = ( f"Length of '{name}' ({len(item)}) did not match the " "length of the columns being encoded " f"({data_to_encode.shape[1]}).") raise ValueError(len_msg) check_len(prefix, "prefix") check_len(prefix_sep, "prefix_sep") if isinstance(prefix, str): prefix = itertools.cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in data_to_encode.columns] if prefix is None: prefix = data_to_encode.columns # validate separators if isinstance(prefix_sep, str): prefix_sep = itertools.cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] with_dummies: List[DataFrame] if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns with_dummies = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. with_dummies = [data.drop(columns, axis=1)] else: # Encoding only object and category dtype columns. Get remaining # columns to prepend to result. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep): # col is (column_name, column), use just column data here dummy = _get_dummies_1d( col[1], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype, ) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d( data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype, ) return result
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None): """ Convert categorical variable into dummy/indicator variables. Parameters ---------- data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, list of str, or dict of str, default None String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : str, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy-encoded columns should be backed by a :class:`SparseArray` (True) or a regular NumPy array (False). drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. .. versionadded:: 0.23.0 Returns ------- DataFrame Dummy-coded data. See Also -------- Series.str.get_dummies : Convert Series to dummy codes. Examples -------- >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ from pandas.core.reshape.concat import concat from itertools import cycle dtypes_to_encode = ['object', 'category'] if isinstance(data, DataFrame): # determine columns being encoded if columns is None: data_to_encode = data.select_dtypes( include=dtypes_to_encode) else: data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): len_msg = ("Length of '{name}' ({len_item}) did not match the " "length of the columns being encoded ({len_enc}).") if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = len_msg.format(name=name, len_item=len(item), len_enc=data_to_encode.shape[1]) raise ValueError(len_msg) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, str): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in data_to_encode.columns] if prefix is None: prefix = data_to_encode.columns # validate separators if isinstance(prefix_sep, str): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns with_dummies = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. with_dummies = [data.drop(columns, axis=1)] else: # Encoding only object and category dtype columns. Get remaining # columns to prepend to result. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): # col is (column_name, column), use just column data here dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) return result
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na: bool = False, sparse: bool = False, drop_first: bool = False, dtype: Optional[Dtype] = None, ) -> DataFrame: from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" dtype = np.dtype(dtype) # type: ignore[arg-type] if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] index: Optional[Index] if isinstance(data, Series): index = data.index else: index = None if sparse: fill_value: Union[bool, float, int] if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices: List[List] = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) # TODO: overload concat with Literal for axis out = cast(DataFrame, out) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: id_vars = list(id_vars) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif (isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list)): raise ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') else: value_vars = list(value_vars) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, ABCMultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = ['variable_{i}'.format(i=i) for i in range(len(frame.columns.names))] else: var_name = [frame.columns.name if frame.columns.name is not None else 'variable'] if isinstance(var_name, compat.string_types): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: id_data = frame.pop(col) if is_extension_type(id_data): id_data = concat([id_data] * K, ignore_index=True) else: id_data = np.tile(id_data.values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray(frame.columns ._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns)
def _generate_marginal_results( table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", ): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): return (key, margins_name) + ("", ) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby( rows, observed=observed).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! piece = piece.copy() try: piece[all_key] = margin[key] except TypeError: # we cannot reshape, so coerce the axis piece.set_axis( piece._get_axis(cat_axis)._to_safe_for_reshape(), axis=cat_axis, inplace=True, ) piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: from pandas import DataFrame cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): if len(cols) > 1: all_key = _all_key(key) else: all_key = margins_name table_pieces.append(piece) # GH31016 this is to calculate margin for each group, and assign # corresponded key as index transformed_piece = DataFrame(piece.apply(aggfunc)).T transformed_piece.index = Index([all_key], name=piece.index.name) # append piece for margin into table_piece table_pieces.append(transformed_piece) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) if len(rows) == 0: return result else: result = table margin_keys = table.columns if len(cols) > 0: row_margin = data[cols + values].groupby( cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack new_order = [len(cols)] + list(range(len(cols))) row_margin.index = row_margin.index.reorder_levels(new_order) else: row_margin = Series(np.nan, index=result.columns) return result, margin_keys, row_margin