def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification Parameters ---------- obj : Series or DataFrame sort : bool, default False whether the resulting grouper should be sorted """ assert obj is not None if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") # Keep self.grouper value before overriding if self._grouper is None: self._grouper = self.grouper # the key must be a valid info item if self.key is not None: key = self.key # The 'on' is already defined if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): # pandas\core\groupby\grouper.py:348: error: Item "None" of # "Optional[Any]" has no attribute "take" [union-attr] ax = self._grouper.take(obj.index) # type: ignore[union-attr] else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") ax = Index(obj[key], name=key) else: ax = obj._get_axis(self.axis) if self.level is not None: level = self.level # if a level is given it must be a mi level or # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) ax = Index(ax._get_level_values(level), name=ax.names[level]) else: if level not in (0, ax.name): raise ValueError(f"The level {level} is not valid") # possibly sort if (self.sort or sort) and not ax.is_monotonic: # use stable sort to support first, last, nth indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) obj = obj.take(indexer, axis=self.axis) self.obj = obj self.grouper = ax return self.grouper
def get_weight_shares(
    weights: FrameOrSeries,
    axis: Axis = 1,
) -> FrameOrSeries:
    """Calculate weight shares, unless the input already sums to one."""
    axis = _handle_axis(axis)

    # TODO: test precision
    if not weights.sum(axis).round(5).eq(1).all():
        return weights.div(weights.sum(axis, min_count=1), axis=flip(axis))
    else:
        # Already weight shares, so return the input unchanged.
        return weights
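# Hedged usage sketch for get_weight_shares. Because the module helpers
# `_handle_axis` and `flip` are not shown here, this reproduces the axis=1 case
# by hand: row sums are taken and each column is divided by them (axis=0 in div).
# The weights frame is illustrative.
import pandas as pd

weights = pd.DataFrame({"a": [2.0, 1.0], "b": [2.0, 3.0]})
shares = weights.div(weights.sum(axis=1, min_count=1), axis=0)
print(shares)  # rows now sum to 1: [0.5, 0.5] and [0.25, 0.75]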
def table(ax, data: FrameOrSeries, rowLabels=None, colLabels=None, **kwargs) -> "Table":
    if isinstance(data, ABCSeries):
        data = data.to_frame()
    elif isinstance(data, ABCDataFrame):
        pass
    else:
        raise ValueError("Input data must be DataFrame or Series")

    if rowLabels is None:
        rowLabels = data.index

    if colLabels is None:
        colLabels = data.columns

    cellText = data.values

    table = matplotlib.table.table(
        ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs
    )
    return table
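# Hedged usage sketch for the `table` helper above. Assumes matplotlib is
# available; the DataFrame and the `loc` keyword (forwarded via **kwargs to
# matplotlib.table.table) are illustrative.
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({"x": [1, 2], "y": [3, 4]}, index=["r1", "r2"])
fig, ax = plt.subplots()
ax.axis("off")  # hide the axes so only the table is drawn
tbl = table(ax, df, loc="center")
# plt.show()  # uncomment to display the figure interactively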
def transform(
    obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs
) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        return transform_dict_like(obj, func, *args, **kwargs)

    # func is either str or callable
    try:
        result = transform_str_or_callable(obj, func, *args, **kwargs)
    except Exception:
        raise ValueError("Transform function failed")

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
        obj.index
    ):
        raise ValueError("Function did not transform")

    return result
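# Hedged usage sketch for `transform`: the public DataFrame.transform entry point
# with a list of functions, which the code above normalises into a dict and then
# dispatches to transform_dict_like. The frame is illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
result = df.transform([np.sqrt, np.exp])
print(result.shape)  # (3, 4): one output column per (column, function) pair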
def transform_str_or_callable(
    obj: FrameOrSeries, func: AggFuncTypeBase, *args, **kwargs
) -> FrameOrSeriesUnion:
    """
    Compute transform in the case of a string or callable func
    """
    if isinstance(func, str):
        return obj._try_aggregate_string_function(func, *args, **kwargs)

    if not args and not kwargs:
        f = obj._get_cython_func(func)
        if f:
            return getattr(obj, f)()

    # Two possible ways to use a UDF - apply or call directly
    try:
        return obj.apply(func, args=args, **kwargs)
    except Exception:
        return func(obj, *args, **kwargs)
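# Hedged sketch of the two inputs handled above, via the public Series.transform
# API: a string that resolves to a named method and a plain callable.
import pandas as pd

s = pd.Series([-1, 2, -3])
print(s.transform("abs").tolist())             # string func -> [1, 2, 3]
print(s.transform(lambda x: x + 1).tolist())   # callable func -> [0, 3, -2]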
def transform_dict_like( obj: FrameOrSeries, func: AggFuncTypeDict, *args, **kwargs, ): """ Compute transform in the case of a dict-like func """ from pandas.core.reshape.concat import concat if len(func) == 0: raise ValueError("No transform functions were provided") if obj.ndim != 1: # Check for missing columns on a frame cols = sorted(set(func.keys()) - set(obj.columns)) if len(cols) > 0: raise SpecificationError(f"Column(s) {cols} do not exist") # Can't use func.values(); wouldn't work for a Series if any(is_dict_like(v) for _, v in func.items()): # GH 15931 - deprecation of renaming keys raise SpecificationError("nested renamer is not supported") is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes # Cannot use func.values() because arg may be a Series if any(is_aggregator(x) for _, x in func.items()): new_func: AggFuncTypeDict = {} for k, v in func.items(): if not is_aggregator(v): # mypy can't realize v is not a list here new_func[k] = [v] # type:ignore[list-item] else: new_func[k] = v func = new_func results: Dict[Label, FrameOrSeriesUnion] = {} for name, how in func.items(): colg = obj._gotitem(name, ndim=1) try: results[name] = transform(colg, how, 0, *args, **kwargs) except Exception as err: if ( str(err) == "Function did not transform" or str(err) == "No transform functions were provided" ): raise err # combine results if len(results) == 0: raise ValueError("Transform function failed") return concat(results, axis=1)
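# Hedged sketch of a dict-like transform as handled above: one function per
# column, dispatched through the public DataFrame.transform API. The frame is
# illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 4.0, 9.0], "b": [1, 2, 3]})
print(df.transform({"a": np.sqrt, "b": lambda x: x * 10}))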
def filter_pipe(
    data: FrameOrSeries,
    like: List[str] = None,
    regex: List[str] = None,
    axis: int = None,
) -> FrameOrSeries:
    """Subset the DataFrame or Series labels with more than one filter at once.

    Parameters
    ----------
    data : DataFrame or Series
        DataFrame or Series to filter labels on.
    like : list of str
        Keep labels from axis for which "like in label == True".
    regex : list of str
        Keep labels from axis for which re.search(regex, label) == True.
    axis : {0 or 'index', 1 or 'columns', None}, default None
        The axis to filter on, expressed either as an index (int) or axis
        name (str). By default this is the info axis, 'index' for Series,
        'columns' for DataFrame.

    Returns
    -------
    DataFrame or Series
        Subset of `data`.
    """
    if like and regex:
        raise ValueError("Cannot pass both `like` and `regex`")
    elif like:
        if isinstance(like, str):
            like = [like]
        for exp in like:
            data = data.filter(like=exp, axis=axis)
    elif regex:
        if isinstance(regex, str):
            regex = [regex]
        for exp in regex:
            data = data.filter(regex=exp, axis=axis)
    else:
        raise ValueError("Must pass either `like` or `regex`")

    return data
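# Hedged usage sketch for filter_pipe: two successive `like` filters on the
# column axis, so only labels matching every pattern survive. Column names are
# illustrative.
import pandas as pd

df = pd.DataFrame({"price_a": [1], "price_b": [2], "qty_a": [3]})
kept = filter_pipe(df, like=["price", "_a"], axis=1)
print(list(kept.columns))  # ['price_a']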
def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split pandas object into its components as numpy arrays for numba functions.

    Parameters
    ----------
    arg : Series or DataFrame

    Returns
    -------
    (ndarray, ndarray)
        values, index
    """
    return arg.to_numpy(), arg.index.to_numpy()
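# Hedged usage sketch for split_for_numba: a Series is split into the plain
# NumPy (values, index) pair that a numba-jitted kernel can consume.
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0], index=[10, 20, 30])
values, index = split_for_numba(s)
print(values.dtype, index.dtype)  # float64 int64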
def get_index_and_growth_stats(
    index: FrameOrSeries,
    reference_period: str,
    double_link: bool = False,
    prefix: str = '',
) -> Dict[str, FrameOrSeries]:
    """Returns the monthly chained index referenced to the given period
    and the index growth as a dictionary.

    Parameters
    ----------
    index : Series
        The unchained index.
    reference_period : str
        The reference period as a string. Must work with pandas datetime
        indexing.
    double_link : bool
        Boolean switch for annual chainlinking.
    prefix : str
        A prefix to add to the keys of the output dict.

    Returns
    -------
    dict of {str : Series}
        The monthly index and growth stats for publication.
    """
    stats = dict()
    stats['idx'] = index.copy()

    # Chain the index and reference it to the given reference period.
    chained_index = chain(index, double_link=double_link)
    stats[f'idx_r{reference_period}'] = set_reference_period(
        chained_index, reference_period,
    )

    # Get the annual (12-month) growth and drop the first year of NaNs.
    stats['idx_growth'] = chained_index.pct_change(12) * 100
    stats['idx_growth'].dropna(inplace=True)

    # Add the prefix to the keys of the output dict.
    stats = {prefix + k: v for k, v in stats.items()}

    return stats
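# Hedged sketch of the `idx_growth` step above: a 12-period percentage change on
# a monthly series, expressed in per cent, with the leading year of NaNs dropped.
# The helpers `chain` and `set_reference_period` are module functions not shown
# here, so only this step is reproduced; the series is illustrative.
import pandas as pd

idx = pd.Series(
    [100.0 + i for i in range(25)],
    index=pd.period_range("2020-01", periods=25, freq="M"),
)
growth = (idx.pct_change(12) * 100).dropna()
print(growth.head(2))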
def transform_dict_like( obj: FrameOrSeries, func: AggFuncTypeDict, *args, **kwargs, ): """ Compute transform in the case of a dict-like func """ from pandas.core.reshape.concat import concat if len(func) == 0: raise ValueError("No transform functions were provided") if obj.ndim != 1: # Check for missing columns on a frame cols = set(func.keys()) - set(obj.columns) if len(cols) > 0: cols_sorted = list(safe_sort(list(cols))) raise SpecificationError(f"Column(s) {cols_sorted} do not exist") # Can't use func.values(); wouldn't work for a Series if any(is_dict_like(v) for _, v in func.items()): # GH 15931 - deprecation of renaming keys raise SpecificationError("nested renamer is not supported") results: Dict[Hashable, FrameOrSeriesUnion] = {} for name, how in func.items(): colg = obj._gotitem(name, ndim=1) try: results[name] = transform(colg, how, 0, *args, **kwargs) except Exception as err: if ( str(err) == "Function did not transform" or str(err) == "No transform functions were provided" ): raise err # combine results if len(results) == 0: raise ValueError("Transform function failed") return concat(results, axis=1)
def get_iterator(self, data: FrameOrSeries, axis: int = 0):
    """
    Groupby iterator

    Returns
    -------
    Generator yielding sequence of (name, subsetted object)
    for each group
    """
    slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis)
    length = len(data.axes[axis])

    start = 0
    for edge, label in zip(self.bins, self.binlabels):
        if label is not NaT:
            yield label, slicer(start, edge)
        start = edge

    if start < length:
        yield self.binlabels[-1], slicer(start, None)
def get_grouper( obj: FrameOrSeries, key=None, axis: int = 0, level=None, sort: bool = True, observed: bool = False, mutated: bool = False, validate: bool = True, dropna: bool = True, ) -> tuple[ops.BaseGrouper, frozenset[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values. If validate, then check for key/level overlaps. """ group_axis = obj._get_axis(axis) # validate that the passed single level is compatible with the passed # axis of the object if level is not None: # TODO: These if-block and else-block are almost same. # MultiIndex instance check is removable, but it seems that there are # some processes only for non-MultiIndex in else-block, # eg. `obj.index.name != level`. We have to consider carefully whether # these are applicable for MultiIndex. Even if these are applicable, # we need to check if it makes no side effect to subsequent processes # on the outside of this condition. # (GH 17621) if isinstance(group_axis, MultiIndex): if is_list_like(level) and len(level) == 1: level = level[0] if key is None and is_scalar(level): # Get the level values from group_axis key = group_axis.get_level_values(level) level = None else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 if is_list_like(level): nlevels = len(level) if nlevels == 1: level = level[0] elif nlevels == 0: raise ValueError("No group keys passed!") else: raise ValueError( "multiple levels only valid with MultiIndex") if isinstance(level, str): if obj._get_axis(axis).name != level: raise ValueError(f"level name {level} is not the name " f"of the {obj._get_axis_name(axis)}") elif level > 0 or level < -1: raise ValueError( "level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. level = None key = group_axis # a passed-in Grouper, directly convert if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, frozenset(), obj else: return grouper, frozenset({key.key}), obj # already have a BaseGrouper, just return it elif isinstance(key, ops.BaseGrouper): return key, frozenset(), obj if not isinstance(key, list): keys = [key] match_axis_length = False else: keys = key match_axis_length = len(keys) == len(group_axis) # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) # is this an index replacement? 
if (not any_callable and not any_arraylike and not any_groupers and match_axis_length and level is None): if isinstance(obj, DataFrame): all_in_columns_index = all(g in obj.columns or g in obj.index.names for g in keys) else: assert isinstance(obj, Series) all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): if key is None: keys = [None] * len(level) levels = level else: levels = [level] * len(keys) groupings: list[Grouping] = [] exclusions: set[Hashable] = set() # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: if not _is_label_like(key): # items -> .columns for DataFrame, .index for Series items = obj.axes[-1] try: items.get_loc(key) except (KeyError, TypeError, InvalidIndexError): # TypeError shows up here if we pass e.g. Int64Index return False return True # if the grouper is obj[name] def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: return gpr is obj[gpr.name] except (KeyError, IndexError): # IndexError reached in e.g. test_skip_group_keys when we pass # lambda here return False for gpr, level in zip(keys, levels): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.add(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.add(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.add(gpr.key) in_axis, name = False, None else: in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " "must be same length") # create the Grouping # allow us to passing the actual Grouping as the gpr ping = (Grouping( group_axis, gpr, obj=obj, name=name, level=level, sort=sort, observed=observed, in_axis=in_axis, dropna=dropna, ) if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") elif len(groupings) == 0: groupings.append( Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna) return grouper, frozenset(exclusions), obj
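# Hedged sketch of the two key-resolution paths handled by get_grouper above,
# exercised through the public groupby API: a column label and an index level
# name. The frame is illustrative.
import pandas as pd

df = pd.DataFrame(
    {"a": [1, 1, 2], "b": [10, 20, 30]},
    index=pd.Index(["x", "y", "y"], name="idx"),
)
print(df.groupby("a")["b"].sum())           # key resolved as a column
print(df.groupby(level="idx")["b"].sum())   # key resolved as an index level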
def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification Parameters ---------- obj : Series or DataFrame sort : bool, default False whether the resulting grouper should be sorted """ assert obj is not None if self.key is not None and self.level is not None: raise ValueError( "The Grouper cannot specify both a key and a level!") # Keep self.grouper value before overriding if self._grouper is None: self._grouper = self.grouper self._indexer = self.indexer # the key must be a valid info item if self.key is not None: key = self.key # The 'on' is already defined if getattr(self.grouper, "name", None) == key and isinstance( obj, Series): # Sometimes self._grouper will have been resorted while # obj has not. In this case there is a mismatch when we # call self._grouper.take(obj.index) so we need to undo the sorting # before we call _grouper.take. assert self._grouper is not None if self._indexer is not None: reverse_indexer = self._indexer.argsort() unsorted_ax = self._grouper.take(reverse_indexer) ax = unsorted_ax.take(obj.index) else: ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") ax = Index(obj[key], name=key) else: ax = obj._get_axis(self.axis) if self.level is not None: level = self.level # if a level is given it must be a mi level or # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) ax = Index(ax._get_level_values(level), name=ax.names[level]) else: if level not in (0, ax.name): raise ValueError(f"The level {level} is not valid") # possibly sort if (self.sort or sort) and not ax.is_monotonic: # use stable sort to support first, last, nth # TODO: why does putting na_position="first" fix datetimelike cases? indexer = self.indexer = ax.array.argsort(kind="mergesort", na_position="first") ax = ax.take(indexer) obj = obj.take(indexer, axis=self.axis) # error: Incompatible types in assignment (expression has type # "FrameOrSeries", variable has type "None") self.obj = obj # type: ignore[assignment] # error: Incompatible types in assignment (expression has type "Index", # variable has type "None") self.grouper = ax # type: ignore[assignment] return self.grouper
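# Hedged sketch of the `key` branch of _set_grouper above: a pd.Grouper with a
# `key` (and a resampling frequency) passed through the public groupby API.
# Dates and values are illustrative.
import pandas as pd

df = pd.DataFrame(
    {
        "date": pd.date_range("2021-01-01", periods=4, freq="D"),
        "value": [1, 2, 3, 4],
    }
)
print(df.groupby(pd.Grouper(key="date", freq="2D"))["value"].sum())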
def describe_ndframe( *, obj: FrameOrSeries, include: Optional[Union[str, Sequence[str]]], exclude: Optional[Union[str, Sequence[str]]], datetime_is_numeric: bool, percentiles: Optional[Sequence[float]], ) -> FrameOrSeries: """Describe series or dataframe. Called from pandas.core.generic.NDFrame.describe() Parameters ---------- obj: DataFrame or Series Either dataframe or series to be described. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. exclude : list-like of dtypes or None (default), optional, A black list of data types to omit from the result. Ignored for ``Series``. datetime_is_numeric : bool, default False Whether to treat datetime dtypes as numeric. percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and 75th percentiles. Returns ------- Dataframe or series description. """ if obj.ndim == 2 and obj.columns.size == 0: raise ValueError("Cannot describe a DataFrame without columns") percentiles = _refine_percentiles(percentiles) if obj.ndim == 1: series = cast("Series", obj) # Incompatible return value type # (got "Series", expected "FrameOrSeries") [return-value] return describe_1d( series, percentiles, datetime_is_numeric, is_series=True, ) # type:ignore[return-value] elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics default_include = [np.number] if datetime_is_numeric: default_include.append("datetime") data = obj.select_dtypes(include=default_include) if len(data.columns) == 0: data = obj elif include == "all": if exclude is not None: msg = "exclude must be None when include is 'all'" raise ValueError(msg) data = obj else: data = obj.select_dtypes(include=include, exclude=exclude) ldesc = [ describe_1d(s, percentiles, datetime_is_numeric, is_series=False) for _, s in data.items() ] # set a convenient order for rows names: List[Hashable] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: if name not in names: names.append(name) d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) d.columns = data.columns.copy() return d
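# Hedged usage sketch: the public describe calls that funnel into
# describe_ndframe above; the default keeps numeric columns only, while
# include="all" also summarises the object column. The frame is illustrative.
import pandas as pd

df = pd.DataFrame({"num": [1, 2, 3], "cat": ["a", "b", "b"]})
print(df.describe())                # numeric summary only
print(df.describe(include="all"))   # numeric and non-numeric summaries together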
def info( data: FrameOrSeries, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: """ Print a concise summary of a %(klass)s. This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. Parameters ---------- data : %(klass)s %(klass)s to print information about. verbose : bool, optional Whether to print the full summary. By default, the setting in ``pandas.options.display.max_info_columns`` is followed. buf : writable buffer, defaults to sys.stdout Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process the output. %(max_cols_sub)s memory_usage : bool, str, optional Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. True always show memory usage. False never shows memory usage. A value of 'deep' is equivalent to "True with deep introspection". Memory usage is shown in human-readable units (base-2 representation). Without deep introspection a memory estimation is made based in column dtype and number of rows assuming values consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. null_counts : bool, optional Whether to show the non-null counts. By default, this is shown only if the %(klass)s is smaller than ``pandas.options.display.max_info_rows`` and ``pandas.options.display.max_info_columns``. A value of True always shows the counts, and False never shows the counts. Returns ------- None This method prints a summary of a %(klass)s and returns None. 
See Also -------- %(see_also_sub)s Examples -------- %(examples_sub)s """ if buf is None: # pragma: no cover buf = sys.stdout lines = [] lines.append(str(type(data))) lines.append(data.index._summary()) cols = data.columns col_count = len(cols) dtypes = data.dtypes if col_count == 0: lines.append(f"Empty {type(data).__name__}") fmt.buffer_put_lines(buf, lines) return # hack if max_cols is None: max_cols = get_option("display.max_info_columns", col_count + 1) max_rows = get_option("display.max_info_rows", len(data) + 1) if null_counts is None: show_counts = (col_count <= max_cols) and (len(data) < max_rows) else: show_counts = null_counts exceeds_info_cols = col_count > max_cols def _verbose_repr(): lines.append(f"Data columns (total {col_count} columns):") id_head = " # " column_head = "Column" col_space = 2 max_col = max(len(pprint_thing(k)) for k in cols) len_column = len(pprint_thing(column_head)) space = max(max_col, len_column) + col_space max_id = len(pprint_thing(col_count)) len_id = len(pprint_thing(id_head)) space_num = max(max_id, len_id) + col_space header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = data.count() if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" ) count_header = "Non-Null Count" len_count = len(count_header) non_null = " non-null" max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) space_count = max(len_count, max_count) + col_space count_temp = "{count}" + non_null else: count_header = "" space_count = len(count_header) len_count = space_count count_temp = "{count}" dtype_header = "Dtype" len_dtype = len(dtype_header) max_dtypes = max(len(pprint_thing(k)) for k in dtypes) space_dtype = max(len_dtype, max_dtypes) header += _put_str(count_header, space_count) + _put_str( dtype_header, space_dtype) lines.append(header) lines.append( _put_str("-" * len_id, space_num) + _put_str("-" * len_column, space) + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype)) for i, col in enumerate(cols): dtype = dtypes[i] col = pprint_thing(col) line_no = _put_str(f" {i}", space_num) count = "" if show_counts: count = counts[i] lines.append(line_no + _put_str(col, space) + _put_str(count_temp.format( count=count), space_count) + _put_str(dtype, space_dtype)) def _non_verbose_repr(): lines.append(cols._summary(name="Columns")) def _sizeof_fmt(num, size_qualifier): # returns size in human readable format for x in ["bytes", "KB", "MB", "GB", "TB"]: if num < 1024.0: return f"{num:3.1f}{size_qualifier} {x}" num /= 1024.0 return f"{num:3.1f}{size_qualifier} PB" if verbose: _verbose_repr() elif verbose is False: # specifically set to False, not nesc None _non_verbose_repr() else: if exceeds_info_cols: _non_verbose_repr() else: _verbose_repr() # groupby dtype.name to collect e.g. 
    # Categorical columns
    counts = dtypes.value_counts().groupby(lambda x: x.name).sum()
    dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
    lines.append(f"dtypes: {', '.join(dtypes)}")

    if memory_usage is None:
        memory_usage = get_option("display.memory_usage")
    if memory_usage:
        # append memory usage of df to display
        size_qualifier = ""
        if memory_usage == "deep":
            deep = True
        else:
            # size_qualifier is just a best effort; not guaranteed to catch
            # all cases (e.g., it misses categorical data even with object
            # categories)
            deep = False
            if "object" in counts or data.index._is_memory_usage_qualified():
                size_qualifier = "+"
        mem_usage = data.memory_usage(index=True, deep=deep).sum()
        lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
    fmt.buffer_put_lines(buf, lines)
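# Hedged usage sketch for the `info` helper above, via the public DataFrame.info
# API, writing into a buffer as the docstring describes. The frame is illustrative.
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})
buf = io.StringIO()
df.info(buf=buf, memory_usage="deep")
print(buf.getvalue())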
def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() result_values = None if data.ndim == 2 and any( isinstance(x, ExtensionArray) for x in data._iter_column_arrays() ): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray # TODO: can we have a workaround for EAs backed by ndarray? pass elif isinstance(data._mgr, ArrayManager): # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0 # for now -> relies on BlockManager internals pass elif ( com.get_callable_name(f) not in base.plotting_methods and isinstance(splitter, FrameSplitter) and axis == 0 # fast_apply/libreduction doesn't allow non-numpy backed indexes and not data.index._has_complex_internals ): try: sdata = splitter.sorted_data result_values, mutated = splitter.fast_apply(f, sdata, group_keys) except IndexError: # This is a rare case in which re-running in python-space may # make a difference, see test_apply_mutate.test_mutate_groups pass else: # If the fast apply path could be used we can return here. # Otherwise we need to fall back to the slow implementation. if len(result_values) == len(group_keys): return group_keys, result_values, mutated if result_values is None: # result_values is None if fast apply path wasn't taken # or fast apply aborted with an unexpected exception. # In either case, initialize the result list and perform # the slow iteration. result_values = [] skip_first = False else: # If result_values is not None we're in the case that the # fast apply loop was broken prematurely but we have # already the result for the first group which we can reuse. skip_first = True # This calls DataSplitter.__iter__ zipped = zip(group_keys, splitter) if skip_first: # pop the first item from the front of the iterator next(zipped) for key, group in zipped: object.__setattr__(group, "name", key) # group might be modified group_axes = group.axes res = f(group) if not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) return group_keys, result_values, mutated
def get_grouper( obj: FrameOrSeries, key=None, axis: int = 0, level=None, sort: bool = True, observed: bool = False, mutated: bool = False, validate: bool = True, ) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values. If validate, then check for key/level overlaps. """ group_axis = obj._get_axis(axis) # validate that the passed single level is compatible with the passed # axis of the object if level is not None: # TODO: These if-block and else-block are almost same. # MultiIndex instance check is removable, but it seems that there are # some processes only for non-MultiIndex in else-block, # eg. `obj.index.name != level`. We have to consider carefully whether # these are applicable for MultiIndex. Even if these are applicable, # we need to check if it makes no side effect to subsequent processes # on the outside of this condition. # (GH 17621) if isinstance(group_axis, MultiIndex): if is_list_like(level) and len(level) == 1: level = level[0] if key is None and is_scalar(level): # Get the level values from group_axis key = group_axis.get_level_values(level) level = None else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 if is_list_like(level): nlevels = len(level) if nlevels == 1: level = level[0] elif nlevels == 0: raise ValueError("No group keys passed!") else: raise ValueError( "multiple levels only valid with MultiIndex") if isinstance(level, str): if obj.index.name != level: raise ValueError( "level name {level} is not the name of the index". format(level=level)) elif level > 0 or level < -1: raise ValueError( "level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. level = None key = group_axis # a passed-in Grouper, directly convert if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, [], obj else: return grouper, [key.key], obj # already have a BaseGrouper, just return it elif isinstance(key, BaseGrouper): return key, [], obj # In the future, a tuple key will always mean an actual key, # not an iterable of keys. In the meantime, we attempt to provide # a warning. We can assume that the user wanted a list of keys when # the key is not in the index. We just have to be careful with # unhashable elements of `key`. Any unhashable elements implies that # they wanted a list of keys. # https://github.com/pandas-dev/pandas/issues/18314 if isinstance(key, tuple): all_hashable = is_hashable(key) if (all_hashable and key not in obj and set(key).issubset(obj)) or not all_hashable: # column names ('a', 'b') -> ['a', 'b'] # arrays like (a, b) -> [a, b] msg = ("Interpreting tuple 'by' as a list of keys, rather than " "a single key. Use 'by=[...]' instead of 'by=(...)'. 
In " "the future, a tuple will always mean a single key.") warnings.warn(msg, FutureWarning, stacklevel=5) key = list(key) if not isinstance(key, list): keys = [key] match_axis_length = False else: keys = key match_axis_length = len(keys) == len(group_axis) # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) # is this an index replacement? if (not any_callable and not any_arraylike and not any_groupers and match_axis_length and level is None): if isinstance(obj, DataFrame): all_in_columns_index = all(g in obj.columns or g in obj.index.names for g in keys) else: assert isinstance(obj, Series) all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): if key is None: keys = [None] * len(level) levels = level else: levels = [level] * len(keys) groupings = [] # type: List[Grouping] exclusions = [] # type: List[Hashable] # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: if not _is_label_like(key): items = obj._data.items try: items.get_loc(key) except (KeyError, TypeError): # TypeError shows up here if we pass e.g. Int64Index return False return True # if the grouper is obj[name] def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: return gpr is obj[gpr.name] except (KeyError, IndexError): return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.append(gpr.key) in_axis, name = False, None else: in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( ("Length of grouper ({len_gpr}) and axis ({len_axis})" " must be same length".format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr ping = (Grouping( group_axis, gpr, obj=obj, name=name, level=level, sort=sort, observed=observed, in_axis=in_axis, ) if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") elif len(groupings) == 0: groupings.append( Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs) -> FrameOrSeries: """ Transform a DataFrame or Series Parameters ---------- obj : DataFrame or Series Object to compute the transform on. func : string, function, list, or dictionary Function(s) to compute the transform with. axis : {0 or 'index', 1 or 'columns'} Axis along which the function is applied: * 0 or 'index': apply function to each column. * 1 or 'columns': apply function to each row. Returns ------- DataFrame or Series Result of applying ``func`` along the given axis of the Series or DataFrame. Raises ------ ValueError If the transform function fails or does not transform. """ from pandas.core.reshape.concat import concat is_series = obj.ndim == 1 if obj._get_axis_number(axis) == 1: assert not is_series return transform(obj.T, func, 0, *args, **kwargs).T if isinstance(func, list): if is_series: func = {com.get_callable_name(v) or v: v for v in func} else: func = {col: func for col in obj} if isinstance(func, dict): if not is_series: cols = sorted(set(func.keys()) - set(obj.columns)) if len(cols) > 0: raise SpecificationError(f"Column(s) {cols} do not exist") if any(isinstance(v, dict) for v in func.values()): # GH 15931 - deprecation of renaming keys raise SpecificationError("nested renamer is not supported") results = {} for name, how in func.items(): colg = obj._gotitem(name, ndim=1) try: results[name] = transform(colg, how, 0, *args, **kwargs) except Exception as e: if str(e) == "Function did not transform": raise e # combine results if len(results) == 0: raise ValueError("Transform function failed") return concat(results, axis=1) # func is either str or callable try: if isinstance(func, str): result = obj._try_aggregate_string_function(func, *args, **kwargs) else: f = obj._get_cython_func(func) if f and not args and not kwargs: result = getattr(obj, f)() else: try: result = obj.apply(func, args=args, **kwargs) except Exception: result = func(obj, *args, **kwargs) except Exception: raise ValueError("Transform function failed") # Functions that transform may return empty Series/DataFrame # when the dtype is not appropriate if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: raise ValueError("Transform function failed") if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( obj.index): raise ValueError("Function did not transform") return result
def build_table_schema( data: FrameOrSeries, index: bool = True, primary_key: bool | None = None, version: bool = True, ) -> dict[str, JSONSerializable]: """ Create a Table schema from ``data``. Parameters ---------- data : Series, DataFrame index : bool, default True Whether to include ``data.index`` in the schema. primary_key : bool or None, default True Column names to designate as the primary key. The default `None` will set `'primaryKey'` to the index level or levels if the index is unique. version : bool, default True Whether to include a field `pandas_version` with the version of pandas that generated the schema. Returns ------- schema : dict Notes ----- See `Table Schema <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for conversion types. Timedeltas as converted to ISO8601 duration format with 9 decimal places after the seconds field for nanosecond precision. Categoricals are converted to the `any` dtype, and use the `enum` field constraint to list the allowed values. The `ordered` attribute is included in an `ordered` field. Examples -------- >>> df = pd.DataFrame( ... {'A': [1, 2, 3], ... 'B': ['a', 'b', 'c'], ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), ... }, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) {'fields': \ [{'name': 'idx', 'type': 'integer'}, \ {'name': 'A', 'type': 'integer'}, \ {'name': 'B', 'type': 'string'}, \ {'name': 'C', 'type': 'datetime'}], \ 'primaryKey': ['idx'], \ 'pandas_version': '0.20.0'} """ if index is True: data = set_default_names(data) schema: dict[str, Any] = {} fields = [] if index: if data.index.nlevels > 1: data.index = cast("MultiIndex", data.index) for level, name in zip(data.index.levels, data.index.names): new_field = convert_pandas_type_to_json_field(level) new_field["name"] = name fields.append(new_field) else: fields.append(convert_pandas_type_to_json_field(data.index)) if data.ndim > 1: for column, s in data.items(): fields.append(convert_pandas_type_to_json_field(s)) else: fields.append(convert_pandas_type_to_json_field(data)) schema["fields"] = fields if index and data.index.is_unique and primary_key is None: if data.index.nlevels == 1: schema["primaryKey"] = [data.index.name] else: schema["primaryKey"] = data.index.names elif primary_key is not None: schema["primaryKey"] = primary_key if version: schema["pandas_version"] = "0.20.0" return schema
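# Hedged usage sketch for build_table_schema, mirroring the docstring example
# with a smaller frame; the top-level keys are the ones assembled above.
import pandas as pd

df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="idx"))
schema = build_table_schema(df)
print(sorted(schema))  # ['fields', 'pandas_version', 'primaryKey']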
def describe_ndframe( *, obj: FrameOrSeries, include: Optional[Union[str, Sequence[str]]], exclude: Optional[Union[str, Sequence[str]]], datetime_is_numeric: bool, percentiles: Optional[Sequence[float]], ) -> FrameOrSeries: """Describe series or dataframe. Called from pandas.core.generic.NDFrame.describe() Parameters ---------- obj: DataFrame or Series Either dataframe or series to be described. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. exclude : list-like of dtypes or None (default), optional, A black list of data types to omit from the result. Ignored for ``Series``. datetime_is_numeric : bool, default False Whether to treat datetime dtypes as numeric. percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and 75th percentiles. Returns ------- Dataframe or series description. """ if obj.ndim == 2 and obj.columns.size == 0: raise ValueError("Cannot describe a DataFrame without columns") if percentiles is not None: # explicit conversion of `percentiles` to list percentiles = list(percentiles) # get them all to be in [0, 1] validate_percentile(percentiles) # median should always be included if 0.5 not in percentiles: percentiles.append(0.5) percentiles = np.asarray(percentiles) else: percentiles = np.array([0.25, 0.5, 0.75]) # sort and check for duplicates unique_pcts = np.unique(percentiles) assert percentiles is not None if len(unique_pcts) < len(percentiles): raise ValueError("percentiles cannot contain duplicates") percentiles = unique_pcts formatted_percentiles = format_percentiles(percentiles) def describe_numeric_1d(series) -> "Series": from pandas import Series stat_index = ["count", "mean", "std", "min" ] + formatted_percentiles + ["max"] d = ([series.count(), series.mean(), series.std(), series.min()] + series.quantile(percentiles).tolist() + [series.max()]) return Series(d, index=stat_index, name=series.name) def describe_categorical_1d(data) -> "Series": names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) result = [data.count(), count_unique] dtype = None if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] if is_datetime64_any_dtype(data.dtype): if obj.ndim == 1: stacklevel = 5 else: stacklevel = 6 warnings.warn( "Treating datetime data as categorical rather than numeric in " "`.describe` is deprecated and will be removed in a future " "version of pandas. 
Specify `datetime_is_numeric=True` to " "silence this warning and adopt the future behavior now.", FutureWarning, stacklevel=stacklevel, ) tz = data.dt.tz asint = data.dropna().values.view("i8") top = Timestamp(top) if top.tzinfo is not None and tz is not None: # Don't tz_localize(None) if key is already tz-aware top = top.tz_convert(tz) else: top = top.tz_localize(tz) names += ["top", "freq", "first", "last"] result += [ top, freq, Timestamp(asint.min(), tz=tz), Timestamp(asint.max(), tz=tz), ] else: names += ["top", "freq"] result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency else: names += ["top", "freq"] result += [np.nan, np.nan] dtype = "object" from pandas import Series return Series(result, index=names, name=data.name, dtype=dtype) def describe_timestamp_1d(data) -> "Series": # GH-30164 from pandas import Series stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] d = ([data.count(), data.mean(), data.min()] + data.quantile(percentiles).tolist() + [data.max()]) return Series(d, index=stat_index, name=data.name) def describe_1d(data) -> "Series": if is_bool_dtype(data.dtype): return describe_categorical_1d(data) elif is_numeric_dtype(data): return describe_numeric_1d(data) elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: return describe_timestamp_1d(data) elif is_timedelta64_dtype(data.dtype): return describe_numeric_1d(data) else: return describe_categorical_1d(data) if obj.ndim == 1: # Incompatible return value type # (got "Series", expected "FrameOrSeries") [return-value] return describe_1d(obj) # type:ignore[return-value] elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics default_include = [np.number] if datetime_is_numeric: default_include.append("datetime") data = obj.select_dtypes(include=default_include) if len(data.columns) == 0: data = obj elif include == "all": if exclude is not None: msg = "exclude must be None when include is 'all'" raise ValueError(msg) data = obj else: data = obj.select_dtypes(include=include, exclude=exclude) ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows names: List[Hashable] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: if name not in names: names.append(name) d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) d.columns = data.columns.copy() return d