Example #1
    def _reduce_for_stat_function(self, sfun, only_numeric):
        # Apply the Spark aggregate function `sfun` to every aggregation
        # column, grouped by the group keys. `only_numeric` restricts the
        # aggregation to numeric columns.
        groupkeys = self._groupkeys
        # Alias each group key so it can be restored as an index level later.
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf

        data_columns = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the numeric
                # types. Converting the NaNs is used in a few places, it should be in utils.
                # Special handle floating point types because Spark's count treats nan as a valid
                # value, whereas Pandas count doesn't include nan.
                if isinstance(spark_type, (DoubleType, FloatType)):
                    stat_exprs.append(
                        sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    data_columns.append(ks.name)
                # Other numeric columns, or any column at all when numeric-only
                # aggregation was not requested.
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    data_columns.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            # No aggregation columns: the result is just the distinct group keys.
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        # Rebuild the koalas frame, mapping the aliased group-key columns back
        # to named index levels.
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=data_columns,
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        kdf = DataFrame(internal)
        if not self._as_index:
            # as_index=False: return the group keys as regular columns.
            kdf = kdf.reset_index()
        return kdf
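
The NaN handling above is the subtle part: Spark's count treats NaN as a
valid value, while pandas excludes it. Here is a minimal standalone sketch
(hypothetical data, assuming a local SparkSession) of what
F.nanvl(col, F.lit(None)) changes before the aggregate runs:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0,), (float('nan'),), (None,)], ['x'])

# Spark's count includes NaN but skips null, so this prints 2.
sdf.select(F.count(sdf['x'])).show()

# nanvl(x, null) rewrites NaN to null, so count now prints 1, matching
# pandas' count(), which excludes NaN.
sdf.select(F.count(F.nanvl(sdf['x'], F.lit(None)))).show()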
Example #2
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func_or_funcs : dict
             A dict mapping from column name (string) to
             aggregate functions (string or list of strings).

        Returns
        -------
        Series or DataFrame

            The return can be:

            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]},
        ...                   columns=['A', 'B', 'C'])

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
        >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.589
        2  3  0.705

        >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']})
        >>> aggregated  # doctest: +NORMALIZE_WHITESPACE
             B
           min  max
        A
        1    1    2
        2    3    4

        """
        # Validate the spec up front: keys must be column names (str) and
        # values either a single function name (str) or a list of them.
        if not isinstance(func_or_funcs, dict) or \
                not all(isinstance(key, str) and
                        (isinstance(value, str) or
                         isinstance(value, list) and all(isinstance(v, str) for v in value))
                        for key, value in func_or_funcs.items()):
            raise ValueError(
                "aggs must be a dict mapping from column name (string) to aggregate "
                "functions (string or list of strings).")

        sdf = self._kdf._sdf
        groupkeys = self._groupkeys
        # Alias each group key so it can be restored as an index level later.
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        # True when any column maps to a list of functions; the resulting data
        # columns then carry (column, function) pairs as a column MultiIndex.
        multi_aggs = any(isinstance(v, list) for v in func_or_funcs.values())
        reordered = []
        data_columns = []
        column_index = []
        for key, value in func_or_funcs.items():
            for aggfunc in [value] if isinstance(value, str) else value:
                # With multiple functions per column, encode the
                # (column, function) pair in the data column name so it can be
                # surfaced as a column MultiIndex on the result.
                data_col = "('{0}', '{1}')".format(
                    key, aggfunc) if multi_aggs else key
                data_columns.append(data_col)
                column_index.append((key, aggfunc))
                if aggfunc == "nunique":
                    # Spark has no `nunique` aggregate; count(DISTINCT ...)
                    # is the equivalent.
                    reordered.append(
                        F.expr('count(DISTINCT `{0}`) as `{1}`'.format(
                            key, data_col)))
                else:
                    reordered.append(
                        F.expr('{1}(`{0}`) as `{2}`'.format(
                            key, aggfunc, data_col)))
        sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
        internal = _InternalFrame(
            sdf=sdf,
            data_columns=data_columns,
            column_index=column_index if multi_aggs else None,
            index_map=[('__index_level_{}__'.format(i), s.name)
                       for i, s in enumerate(groupkeys)])
        kdf = DataFrame(internal)
        if not self._as_index:
            kdf = kdf.reset_index()
        return kdf
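
For context, here is a short usage sketch (assuming a working
databricks.koalas installation) that exercises both branches of the loop
above: 'nunique' is rewritten to count(DISTINCT ...) because Spark has no
nunique aggregate, other names go through the generic path, and a list of
functions flips multi_aggs:

import databricks.koalas as ks

df = ks.DataFrame({'A': [1, 1, 2, 2],
                   'B': [1, 2, 3, 3],
                   'C': [0.362, 0.227, 1.267, -0.562]})

# One function per column: plain column labels; 'nunique' becomes
# count(DISTINCT `B`) under the hood.
df.groupby('A').agg({'B': 'nunique', 'C': 'sum'})

# A list of functions sets multi_aggs, so the data columns are named like
# "('B', 'min')" and surfaced as a column MultiIndex on the result.
df.groupby('A').agg({'B': ['min', 'max']})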