Example #1
    def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self.grouper value before overriding
        if self._grouper is None:
            self._grouper = self.grouper

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(self.grouper, "name", None) == key and isinstance(
                obj, ABCSeries
            ):
                # pandas\core\groupby\grouper.py:348: error: Item "None" of
                # "Optional[Any]" has no attribute "take"  [union-attr]
                ax = self._grouper.take(obj.index)  # type: ignore[union-attr]
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # use stable sort to support first, last, nth
            indexer = self.indexer = ax.argsort(kind="mergesort")
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        self.obj = obj
        self.grouper = ax
        return self.grouper
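
A minimal sketch of how this internal setup is typically reached through the public API: a key-based pd.Grouper resolves that column as the grouping axis (that it routes through _set_grouper is an assumption based on the snippet above).

import pandas as pd

df = pd.DataFrame({"key": ["b", "a", "a"], "val": [1, 2, 3]})
# sort=True asks the Grouper to sort the grouping axis, as in the
# "possibly sort" branch above.
print(df.groupby(pd.Grouper(key="key", sort=True)).sum())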
Example #2
def get_weight_shares(
        weights: FrameOrSeries,
        axis: Axis = 1,
        ) -> FrameOrSeries:
    """If not weight shares already, calculates weight shares."""
    axis = _handle_axis(axis)
    # TODO: test precision
    if not weights.sum(axis).round(5).eq(1).all():
        return weights.div(weights.sum(axis, min_count=1), axis=flip(axis))

    else:   # It is already weight shares so return input
        return weights
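
For illustration, the same normalization with plain pandas; _handle_axis and flip are project-specific helpers not shown here, so this sketch hard-codes row-wise shares.

import pandas as pd

weights = pd.DataFrame({"a": [2.0, 1.0], "b": [2.0, 3.0]})
# Divide each row by its row total so that every row sums to 1.
shares = weights.div(weights.sum(axis=1, min_count=1), axis=0)
print(shares.sum(axis=1))  # 1.0 for every row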
Example #3
def table(ax,
          data: FrameOrSeries,
          rowLabels=None,
          colLabels=None,
          **kwargs) -> "Table":
    if isinstance(data, ABCSeries):
        data = data.to_frame()
    elif isinstance(data, ABCDataFrame):
        pass
    else:
        raise ValueError("Input data must be DataFrame or Series")

    if rowLabels is None:
        rowLabels = data.index

    if colLabels is None:
        colLabels = data.columns

    cellText = data.values

    table = matplotlib.table.table(ax,
                                   cellText=cellText,
                                   rowLabels=rowLabels,
                                   colLabels=colLabels,
                                   **kwargs)
    return table
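
A usage sketch via the public wrapper pandas.plotting.table, which forwards to a helper like the one above.

import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
fig, ax = plt.subplots()
ax.axis("off")  # hide the axes, keep only the table
tab = table(ax, df, loc="center")
fig.savefig("table.png")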
Example #4
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args,
              **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        return transform_dict_like(obj, func, *args, **kwargs)

    # func is either str or callable
    try:
        result = transform_str_or_callable(obj, func, *args, **kwargs)
    except Exception:
        raise ValueError("Transform function failed")

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result,
                      (ABCSeries, ABCDataFrame)) or not result.index.equals(
                          obj.index):
        raise ValueError("Function did not transform")

    return result
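
The same dispatch is reachable through the public DataFrame.transform: a list is normalized into a per-column dict, and a string or callable takes the path at the bottom of the function.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
print(df.transform("cumsum"))           # str func
print(df.transform([np.sqrt, np.exp]))  # list -> per-column dict of funcs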
Example #5
def transform_str_or_callable(obj: FrameOrSeries, func: AggFuncTypeBase, *args,
                              **kwargs) -> FrameOrSeriesUnion:
    """
    Compute transform in the case of a string or callable func
    """
    if isinstance(func, str):
        return obj._try_aggregate_string_function(func, *args, **kwargs)

    if not args and not kwargs:
        f = obj._get_cython_func(func)
        if f:
            return getattr(obj, f)()

    # Two possible ways to use a UDF - apply or call directly
    try:
        return obj.apply(func, args=args, **kwargs)
    except Exception:
        return func(obj, *args, **kwargs)
Example #6
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Compute transform in the case of a dict-like func
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        cols = sorted(set(func.keys()) - set(obj.columns))
        if len(cols) > 0:
            raise SpecificationError(f"Column(s) {cols} do not exist")

    # Can't use func.values(); wouldn't work for a Series
    if any(is_dict_like(v) for _, v in func.items()):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

    # if we have a dict of any non-scalars
    # eg. {'A' : ['mean']}, normalize all to
    # be list-likes
    # Cannot use func.values() because arg may be a Series
    if any(is_aggregator(x) for _, x in func.items()):
        new_func: AggFuncTypeDict = {}
        for k, v in func.items():
            if not is_aggregator(v):
                # mypy can't realize v is not a list here
                new_func[k] = [v]  # type:ignore[list-item]
            else:
                new_func[k] = v
        func = new_func

    results: Dict[Label, FrameOrSeriesUnion] = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            if (
                str(err) == "Function did not transform"
                or str(err) == "No transform functions were provided"
            ):
                raise err

    # combine results
    if len(results) == 0:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
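
Dict-like funcs through the public API; each key selects one column and the per-column results are concatenated along axis=1, as the helper above does.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, -2, 3], "b": [4, 5, 6]})
result = df.transform({"a": np.abs, "b": lambda s: s + 1})
print(result)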
Example #7
def filter_pipe(
    data: FrameOrSeries,
    like: List[str] = None,
    regex: List[str] = None,
    axis: int = None,
) -> FrameOrSeries:
    """Subset the DataFrame or Series labels with more than one filter at once.

    Parameters
    ----------
    data: DataFrame or Series
        DataFrame or Series to filter labels on.
    like : list of str
        Keep labels from axis for which "like in label == True".
    regex : list of str
        Keep labels from axis for which re.search(regex, label) == True.
    axis : {0 or ‘index’, 1 or ‘columns’, None}, default None
        The axis to filter on, expressed either as an index (int)
        or axis name (str). By default this is the info axis,
        'index' for Series, 'columns' for DataFrame.

    Returns
    -------
    Dataframe or Series
        Subset of `data`.
    """
    if like and regex:
        raise ValueError("Cannot pass both `like` and `regex`")
    elif like:
        if isinstance(like, str):
            like = [like]
        for exp in like:
            data = data.filter(like=exp, axis=axis)
    elif regex:
        if isinstance(regex, str):
            regex = [regex]
        for exp in regex:
            data = data.filter(regex=exp, axis=axis)
    else:
        raise ValueError("Must pass either `like` or `regex` but not both")
    return data
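
A small check of the chained filtering above; the expected output is an assumption based on how DataFrame.filter matches substrings.

import pandas as pd

df = pd.DataFrame(0, index=[0], columns=["cpi_food", "cpi_fuel", "rpi_food"])
# Each pattern narrows the columns further: "cpi" then "food".
subset = filter_pipe(df, like=["cpi", "food"], axis=1)
print(list(subset.columns))  # expected: ['cpi_food']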
Example #8
def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split pandas object into its components as numpy arrays for numba functions.

    Parameters
    ----------
    arg : Series or DataFrame

    Returns
    -------
    (ndarray, ndarray)
        values, index
    """
    return arg.to_numpy(), arg.index.to_numpy()
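
What the split produces on a small Series, for reference:

import pandas as pd

s = pd.Series([10, 20, 30], index=[2, 0, 1])
values, index = s.to_numpy(), s.index.to_numpy()
print(values)  # [10 20 30]
print(index)   # [2 0 1]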
Example #9
def get_index_and_growth_stats(
        index: FrameOrSeries,
        reference_period: str,
        double_link: bool = False,
        prefix: str = '',
        ) -> Dict[str, FrameOrSeries]:
    """Returns the monthly chained index referenced to the given period
    and the index growth as a dictionary.

    Parameters
    ----------
    index : Series
        The unchained index.
    reference_period : str
        The reference period as a string. Must work with pandas
        datetime indexing.
    double_link : bool
        Boolean switch for annual chainlinking.
    prefix : str
        A prefix to add to the keys of the output dict.

    Returns
    -------
    dict of {str : Series}
        The monthly index and growth stats for publication.
    """
    stats = dict()

    stats['idx'] = index.copy()

    # Chain the index and reference to given reference period.
    chained_index = chain(index, double_link=double_link)
    stats[f'idx_r{reference_period}'] = set_reference_period(
        chained_index,
        reference_period,
    )

    # Get the year-on-year growth and drop the first year of NaNs
    stats['idx_growth'] = chained_index.pct_change(12) * 100
    stats['idx_growth'].dropna(inplace=True)

    # Adds prefix to the keys of the output dict
    stats = {prefix + k: v for k, v in stats.items()}

    return stats
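
Since chain and set_reference_period are project-specific helpers not shown here, a sketch of just the growth calculation with plain pandas:

import numpy as np
import pandas as pd

# A toy monthly index growing 0.2% per month.
idx = pd.Series(
    100 * np.cumprod(np.full(36, 1.002)),
    index=pd.date_range("2018-01-01", periods=36, freq="MS"),
)
growth = (idx.pct_change(12) * 100).dropna()  # year-on-year growth in percent
print(growth.head())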
Example #10
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Compute transform in the case of a dict-like func
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        cols = set(func.keys()) - set(obj.columns)
        if len(cols) > 0:
            cols_sorted = list(safe_sort(list(cols)))
            raise SpecificationError(f"Column(s) {cols_sorted} do not exist")

    # Can't use func.values(); wouldn't work for a Series
    if any(is_dict_like(v) for _, v in func.items()):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    results: Dict[Hashable, FrameOrSeriesUnion] = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            if (
                str(err) == "Function did not transform"
                or str(err) == "No transform functions were provided"
            ):
                raise err

    # combine results
    if len(results) == 0:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
Example #11
    def get_iterator(self, data: FrameOrSeries, axis: int = 0):
        """
        Groupby iterator

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis)
        length = len(data.axes[axis])

        start = 0
        for edge, label in zip(self.bins, self.binlabels):
            if label is not NaT:
                yield label, slicer(start, edge)
            start = edge

        if start < length:
            yield self.binlabels[-1], slicer(start, None)
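
The same iteration is visible through resample, which builds a BinGrouper internally; each yielded item is a (bin label, subsetted object) pair.

import numpy as np
import pandas as pd

s = pd.Series(np.arange(6), index=pd.date_range("2021-01-01", periods=6, freq="D"))
for label, chunk in s.resample("3D"):
    print(label, list(chunk))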
Example #12
def get_grouper(
    obj: FrameOrSeries,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], FrameOrSeries]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError(
                        "multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(f"level name {level} is not the name "
                                     f"of the {obj._get_axis_name(axis)}")
            elif level > 0 or level < -1:
                raise ValueError(
                    "level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys)

    # is this an index replacement?
    if (not any_callable and not any_arraylike and not any_groupers
            and match_axis_length and level is None):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(g in obj.columns or g in obj.index.names
                                       for g in keys)
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            #  lambda here
            return False

    for gpr, level in zip(keys, levels):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.add(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) "
                "must be same length")

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (Grouping(
            group_axis,
            gpr,
            obj=obj,
            name=name,
            level=level,
            sort=sort,
            observed=observed,
            in_axis=in_axis,
            dropna=dropna,
        ) if not isinstance(gpr, Grouping) else gpr)

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        groupings.append(
            Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis,
                              groupings,
                              sort=sort,
                              mutated=mutated,
                              dropna=dropna)
    return grouper, frozenset(exclusions), obj
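
Through the public API, each entry in a list of keys (column name, array-like, or Grouper) is resolved by get_grouper into one Grouping; a mixed example:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
by_array = np.array(["u", "v", "u"])  # array-like key of matching length
print(df.groupby(["a", by_array]).sum())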
Example #13
    def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError(
                "The Grouper cannot specify both a key and a level!")

        # Keep self.grouper value before overriding
        if self._grouper is None:
            self._grouper = self.grouper
            self._indexer = self.indexer

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(self.grouper, "name", None) == key and isinstance(
                    obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level),
                               name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self.indexer = ax.array.argsort(kind="mergesort",
                                                      na_position="first")
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "FrameOrSeries", variable has type "None")
        self.obj = obj  # type: ignore[assignment]
        # error: Incompatible types in assignment (expression has type "Index",
        # variable has type "None")
        self.grouper = ax  # type: ignore[assignment]
        return self.grouper
Example #14
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    percentiles = _refine_percentiles(percentiles)

    if obj.ndim == 1:
        series = cast("Series", obj)
        # Incompatible return value type
        #  (got "Series", expected "FrameOrSeries")  [return-value]
        return describe_1d(
            series,
            percentiles,
            datetime_is_numeric,
            is_series=True,
        )  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [
        describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
        for _, s in data.items()
    ]
    # set a convenient order for rows
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = concat([x.reindex(names, copy=False) for x in ldesc],
               axis=1,
               sort=False)
    d.columns = data.columns.copy()
    return d
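
Reached through the public DataFrame.describe; include and exclude map directly onto the arguments above.

import pandas as pd

df = pd.DataFrame({"num": [1, 2, 3], "cat": ["a", "a", "b"]})
print(df.describe())               # numeric columns only (the default path)
print(df.describe(include="all"))  # every column, union of the stat rows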
Example #15
def info(
    data: FrameOrSeries,
    verbose: Optional[bool] = None,
    buf: Optional[IO[str]] = None,
    max_cols: Optional[int] = None,
    memory_usage: Optional[Union[bool, str]] = None,
    null_counts: Optional[bool] = None,
) -> None:
    """
    Print a concise summary of a %(klass)s.

    This method prints information about a %(klass)s including
    the index dtype%(type_sub)s, non-null values and memory usage.

    Parameters
    ----------
    data : %(klass)s
        %(klass)s to print information about.
    verbose : bool, optional
        Whether to print the full summary. By default, the setting in
        ``pandas.options.display.max_info_columns`` is followed.
    buf : writable buffer, defaults to sys.stdout
        Where to send the output. By default, the output is printed to
        sys.stdout. Pass a writable buffer if you need to further process
        the output.
    %(max_cols_sub)s
    memory_usage : bool, str, optional
        Specifies whether total memory usage of the %(klass)s
        elements (including the index) should be displayed. By default,
        this follows the ``pandas.options.display.memory_usage`` setting.

        True always show memory usage. False never shows memory usage.
        A value of 'deep' is equivalent to "True with deep introspection".
        Memory usage is shown in human-readable units (base-2
        representation). Without deep introspection a memory estimation is
        made based in column dtype and number of rows assuming values
        consume the same memory amount for corresponding dtypes. With deep
        memory introspection, a real memory usage calculation is performed
        at the cost of computational resources.
    null_counts : bool, optional
        Whether to show the non-null counts. By default, this is shown
        only if the %(klass)s is smaller than
        ``pandas.options.display.max_info_rows`` and
        ``pandas.options.display.max_info_columns``. A value of True always
        shows the counts, and False never shows the counts.

    Returns
    -------
    None
        This method prints a summary of a %(klass)s and returns None.

    See Also
    --------
    %(see_also_sub)s

    Examples
    --------
    %(examples_sub)s
    """
    if buf is None:  # pragma: no cover
        buf = sys.stdout

    lines = []

    lines.append(str(type(data)))
    lines.append(data.index._summary())

    cols = data.columns
    col_count = len(cols)
    dtypes = data.dtypes

    if col_count == 0:
        lines.append(f"Empty {type(data).__name__}")
        fmt.buffer_put_lines(buf, lines)
        return

    # hack
    if max_cols is None:
        max_cols = get_option("display.max_info_columns", col_count + 1)

    max_rows = get_option("display.max_info_rows", len(data) + 1)

    if null_counts is None:
        show_counts = (col_count <= max_cols) and (len(data) < max_rows)
    else:
        show_counts = null_counts
    exceeds_info_cols = col_count > max_cols

    def _verbose_repr():
        lines.append(f"Data columns (total {col_count} columns):")

        id_head = " # "
        column_head = "Column"
        col_space = 2

        max_col = max(len(pprint_thing(k)) for k in cols)
        len_column = len(pprint_thing(column_head))
        space = max(max_col, len_column) + col_space

        max_id = len(pprint_thing(col_count))
        len_id = len(pprint_thing(id_head))
        space_num = max(max_id, len_id) + col_space

        header = _put_str(id_head, space_num) + _put_str(column_head, space)
        if show_counts:
            counts = data.count()
            if col_count != len(counts):  # pragma: no cover
                raise AssertionError(
                    f"Columns must equal counts ({col_count} != {len(counts)})"
                )
            count_header = "Non-Null Count"
            len_count = len(count_header)
            non_null = " non-null"
            max_count = max(len(pprint_thing(k))
                            for k in counts) + len(non_null)
            space_count = max(len_count, max_count) + col_space
            count_temp = "{count}" + non_null
        else:
            count_header = ""
            space_count = len(count_header)
            len_count = space_count
            count_temp = "{count}"

        dtype_header = "Dtype"
        len_dtype = len(dtype_header)
        max_dtypes = max(len(pprint_thing(k)) for k in dtypes)
        space_dtype = max(len_dtype, max_dtypes)
        header += _put_str(count_header, space_count) + _put_str(
            dtype_header, space_dtype)

        lines.append(header)
        lines.append(
            _put_str("-" * len_id, space_num) +
            _put_str("-" * len_column, space) +
            _put_str("-" * len_count, space_count) +
            _put_str("-" * len_dtype, space_dtype))

        for i, col in enumerate(cols):
            dtype = dtypes[i]
            col = pprint_thing(col)

            line_no = _put_str(f" {i}", space_num)
            count = ""
            if show_counts:
                count = counts[i]

            lines.append(line_no + _put_str(col, space) +
                         _put_str(count_temp.format(
                             count=count), space_count) +
                         _put_str(dtype, space_dtype))

    def _non_verbose_repr():
        lines.append(cols._summary(name="Columns"))

    def _sizeof_fmt(num, size_qualifier):
        # returns size in human readable format
        for x in ["bytes", "KB", "MB", "GB", "TB"]:
            if num < 1024.0:
                return f"{num:3.1f}{size_qualifier} {x}"
            num /= 1024.0
        return f"{num:3.1f}{size_qualifier} PB"

    if verbose:
        _verbose_repr()
    elif verbose is False:  # specifically set to False, not necessarily None
        _non_verbose_repr()
    else:
        if exceeds_info_cols:
            _non_verbose_repr()
        else:
            _verbose_repr()

    # groupby dtype.name to collect e.g. Categorical columns
    counts = dtypes.value_counts().groupby(lambda x: x.name).sum()
    dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
    lines.append(f"dtypes: {', '.join(dtypes)}")

    if memory_usage is None:
        memory_usage = get_option("display.memory_usage")
    if memory_usage:
        # append memory usage of df to display
        size_qualifier = ""
        if memory_usage == "deep":
            deep = True
        else:
            # size_qualifier is just a best effort; not guaranteed to catch
            # all cases (e.g., it misses categorical data even with object
            # categories)
            deep = False
            if "object" in counts or data.index._is_memory_usage_qualified():
                size_qualifier = "+"
        mem_usage = data.memory_usage(index=True, deep=deep).sum()
        lines.append(
            f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
    fmt.buffer_put_lines(buf, lines)
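
The public entry point for this helper; memory_usage='deep' switches to the exact per-object measurement described in the docstring.

import pandas as pd

df = pd.DataFrame({"a": range(1000), "b": ["x"] * 1000})
df.info(verbose=True, memory_usage="deep")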
Example #16
    def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
        mutated = self.mutated
        splitter = self._get_splitter(data, axis=axis)
        group_keys = self._get_group_keys()
        result_values = None

        if data.ndim == 2 and any(
            isinstance(x, ExtensionArray) for x in data._iter_column_arrays()
        ):
            # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
            #  if we pass EA instead of ndarray
            #  TODO: can we have a workaround for EAs backed by ndarray?
            pass

        elif isinstance(data._mgr, ArrayManager):
            # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
            # for now -> relies on BlockManager internals
            pass
        elif (
            com.get_callable_name(f) not in base.plotting_methods
            and isinstance(splitter, FrameSplitter)
            and axis == 0
            # fast_apply/libreduction doesn't allow non-numpy backed indexes
            and not data.index._has_complex_internals
        ):
            try:
                sdata = splitter.sorted_data
                result_values, mutated = splitter.fast_apply(f, sdata, group_keys)

            except IndexError:
                # This is a rare case in which re-running in python-space may
                #  make a difference, see  test_apply_mutate.test_mutate_groups
                pass

            else:
                # If the fast apply path could be used we can return here.
                # Otherwise we need to fall back to the slow implementation.
                if len(result_values) == len(group_keys):
                    return group_keys, result_values, mutated

        if result_values is None:
            # result_values is None if fast apply path wasn't taken
            # or fast apply aborted with an unexpected exception.
            # In either case, initialize the result list and perform
            # the slow iteration.
            result_values = []
            skip_first = False
        else:
            # If result_values is not None we're in the case that the
            # fast apply loop was broken prematurely but we have
            # already the result for the first group which we can reuse.
            skip_first = True

        # This calls DataSplitter.__iter__
        zipped = zip(group_keys, splitter)
        if skip_first:
            # pop the first item from the front of the iterator
            next(zipped)

        for key, group in zipped:
            object.__setattr__(group, "name", key)

            # group might be modified
            group_axes = group.axes
            res = f(group)
            if not _is_indexed_like(res, group_axes, axis):
                mutated = True
            result_values.append(res)

        return group_keys, result_values, mutated
Example #17
def get_grouper(
    obj: FrameOrSeries,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError(
                        "multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj.index.name != level:
                    raise ValueError(
                        "level name {level} is not the name of the index".
                        format(level=level))
            elif level > 0 or level < -1:
                raise ValueError(
                    "level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, [key.key], obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    # In the future, a tuple key will always mean an actual key,
    # not an iterable of keys. In the meantime, we attempt to provide
    # a warning. We can assume that the user wanted a list of keys when
    # the key is not in the index. We just have to be careful with
    # unhashable elements of `key`. Any unhashable elements implies that
    # they wanted a list of keys.
    # https://github.com/pandas-dev/pandas/issues/18314
    if isinstance(key, tuple):
        all_hashable = is_hashable(key)
        if (all_hashable and key not in obj
                and set(key).issubset(obj)) or not all_hashable:
            # column names ('a', 'b') -> ['a', 'b']
            # arrays like (a, b) -> [a, b]
            msg = ("Interpreting tuple 'by' as a list of keys, rather than "
                   "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
                   "the future, a tuple will always mean a single key.")
            warnings.warn(msg, FutureWarning, stacklevel=5)
            key = list(key)

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys)

    # is this an index replacement?
    if (not any_callable and not any_arraylike and not any_groupers
            and match_axis_length and level is None):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(g in obj.columns or g in obj.index.names
                                       for g in keys)
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []  # type: List[Grouping]
    exclusions = []  # type: List[Hashable]

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            items = obj._data.items
            try:
                items.get_loc(key)
            except (KeyError, TypeError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError):
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                ("Length of grouper ({len_gpr}) and axis ({len_axis})"
                 " must be same length".format(len_gpr=len(gpr),
                                               len_axis=obj.shape[axis])))

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (Grouping(
            group_axis,
            gpr,
            obj=obj,
            name=name,
            level=level,
            sort=sort,
            observed=observed,
            in_axis=in_axis,
        ) if not isinstance(gpr, Grouping) else gpr)

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        groupings.append(
            Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj
Example #18
def transform(obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args,
              **kwargs) -> FrameOrSeries:
    """
    Transform a DataFrame or Series

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform.
    """
    from pandas.core.reshape.concat import concat

    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if isinstance(func, list):
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if isinstance(func, dict):
        if not is_series:
            cols = sorted(set(func.keys()) - set(obj.columns))
            if len(cols) > 0:
                raise SpecificationError(f"Column(s) {cols} do not exist")

        if any(isinstance(v, dict) for v in func.values()):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        results = {}
        for name, how in func.items():
            colg = obj._gotitem(name, ndim=1)
            try:
                results[name] = transform(colg, how, 0, *args, **kwargs)
            except Exception as e:
                if str(e) == "Function did not transform":
                    raise e

        # combine results
        if len(results) == 0:
            raise ValueError("Transform function failed")
        return concat(results, axis=1)

    # func is either str or callable
    try:
        if isinstance(func, str):
            result = obj._try_aggregate_string_function(func, *args, **kwargs)
        else:
            f = obj._get_cython_func(func)
            if f and not args and not kwargs:
                result = getattr(obj, f)()
            else:
                try:
                    result = obj.apply(func, args=args, **kwargs)
                except Exception:
                    result = func(obj, *args, **kwargs)
    except Exception:
        raise ValueError("Transform function failed")

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate
    if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
        raise ValueError("Transform function failed")
    if not isinstance(result,
                      (ABCSeries, ABCDataFrame)) or not result.index.equals(
                          obj.index):
        raise ValueError("Function did not transform")

    return result
Example #19
def build_table_schema(
    data: FrameOrSeries,
    index: bool = True,
    primary_key: bool | None = None,
    version: bool = True,
) -> dict[str, JSONSerializable]:
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that generated the schema.

    Returns
    -------
    schema : dict

    Notes
    -----
    See `Table Schema
    <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
    conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': \
[{'name': 'idx', 'type': 'integer'}, \
{'name': 'A', 'type': 'integer'}, \
{'name': 'B', 'type': 'string'}, \
{'name': 'C', 'type': 'datetime'}], \
'primaryKey': ['idx'], \
'pandas_version': '0.20.0'}
    """
    if index is True:
        data = set_default_names(data)

    schema: dict[str, Any] = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            data.index = cast("MultiIndex", data.index)
            for level, name in zip(data.index.levels, data.index.names):
                new_field = convert_pandas_type_to_json_field(level)
                new_field["name"] = name
                fields.append(new_field)
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        for column, s in data.items():
            fields.append(convert_pandas_type_to_json_field(s))
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = "0.20.0"
    return schema
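
build_table_schema is importable from pandas.io.json; a quick check of the primary-key inference on a named, unique index:

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="idx"))
schema = build_table_schema(df)
print(schema["primaryKey"])  # ['idx']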
Example #20
def describe_ndframe(
    *,
    obj: FrameOrSeries,
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
    percentiles: Optional[Sequence[float]],
) -> FrameOrSeries:
    """Describe series or dataframe.

    Called from pandas.core.generic.NDFrame.describe()

    Parameters
    ----------
    obj: DataFrame or Series
        Either dataframe or series to be described.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored for ``Series``.
    exclude : list-like of dtypes or None (default), optional,
        A black list of data types to omit from the result. Ignored for ``Series``.
    datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.

    Returns
    -------
    Dataframe or series description.
    """
    if obj.ndim == 2 and obj.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    assert percentiles is not None
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series) -> "Series":
        from pandas import Series

        stat_index = ["count", "mean", "std", "min"
                      ] + formatted_percentiles + ["max"]
        d = ([series.count(),
              series.mean(),
              series.std(),
              series.min()] + series.quantile(percentiles).tolist() +
             [series.max()])
        return Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data) -> "Series":
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_any_dtype(data.dtype):
                if obj.ndim == 1:
                    stacklevel = 5
                else:
                    stacklevel = 6
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                asint = data.dropna().values.view("i8")
                top = Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    Timestamp(asint.min(), tz=tz),
                    Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]

        # If the DataFrame is empty, set 'top' and 'freq' to None
        # to maintain output shape consistency
        else:
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"

        from pandas import Series

        return Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data) -> "Series":
        # GH-30164
        from pandas import Series

        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = ([data.count(), data.mean(), data.min()] +
             data.quantile(percentiles).tolist() + [data.max()])
        return Series(d, index=stat_index, name=data.name)

    def describe_1d(data) -> "Series":
        if is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if obj.ndim == 1:
        # Incompatible return value type
        #  (got "Series", expected "FrameOrSeries")  [return-value]
        return describe_1d(obj)  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = obj.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            data = obj
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = obj
    else:
        data = obj.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # set a convenient order for rows
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = concat([x.reindex(names, copy=False) for x in ldesc],
               axis=1,
               sort=False)
    d.columns = data.columns.copy()
    return d