def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    data_columns = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places, it should be in utils.
            # Special handling for floating point types because Spark's count treats nan as a
            # valid value, whereas pandas' count doesn't include nan.
            if isinstance(spark_type, (DoubleType, FloatType)):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                data_columns.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                data_columns.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=data_columns,
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    kdf = DataFrame(internal)
    if not self._as_index:
        kdf = kdf.reset_index()
    return kdf
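
# A minimal sketch, not part of the original source, of how public groupby reductions are
# assumed to use the helper above: each one passes the matching Spark aggregate function and
# states whether non-numeric columns should be dropped. The method names and the exact
# `only_numeric` flags shown here are illustrative assumptions, not the verified wiring.
#
#     def sum(self):
#         return self._reduce_for_stat_function(F.sum, only_numeric=True)
#
#     def max(self):
#         return self._reduce_for_stat_function(F.max, only_numeric=False)
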
def aggregate(self, func_or_funcs, *args, **kwargs):
    """Aggregate using one or more operations over the specified axis.

    Parameters
    ----------
    func_or_funcs : dict
        a dict mapping from column name (string) to aggregate functions
        (string or list of strings).

    Returns
    -------
    Series or DataFrame
        The return can be:

        * Series : when DataFrame.agg is called with a single function
        * DataFrame : when DataFrame.agg is called with several functions

        Return Series or DataFrame.

    Notes
    -----
    `agg` is an alias for `aggregate`. Use the alias.

    See Also
    --------
    databricks.koalas.Series.groupby
    databricks.koalas.DataFrame.groupby

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
    ...                    'B': [1, 2, 3, 4],
    ...                    'C': [0.362, 0.227, 1.267, -0.562]},
    ...                   columns=['A', 'B', 'C'])

    >>> df
       A  B      C
    0  1  1  0.362
    1  1  2  0.227
    2  2  3  1.267
    3  2  4 -0.562

    Different aggregations per column

    >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
    >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
       B      C
    A
    1  1  0.589
    2  3  0.705

    >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']})
    >>> aggregated  # doctest: +NORMALIZE_WHITESPACE
         B
       min  max
    A
    1    1    2
    2    3    4
    """
    if not isinstance(func_or_funcs, dict) or \
            not all(isinstance(key, str) and
                    (isinstance(value, str) or
                     isinstance(value, list) and all(isinstance(v, str) for v in value))
                    for key, value in func_or_funcs.items()):
        raise ValueError("aggs must be a dict mapping from column name (string) to aggregate "
                         "functions (string or list of strings).")

    sdf = self._kdf._sdf
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    multi_aggs = any(isinstance(v, list) for v in func_or_funcs.values())
    reordered = []
    data_columns = []
    column_index = []
    for key, value in func_or_funcs.items():
        for aggfunc in [value] if isinstance(value, str) else value:
            data_col = "('{0}', '{1}')".format(key, aggfunc) if multi_aggs else key
            data_columns.append(data_col)
            column_index.append((key, aggfunc))
            if aggfunc == "nunique":
                reordered.append(
                    F.expr('count(DISTINCT `{0}`) as `{1}`'.format(key, data_col)))
            else:
                reordered.append(
                    F.expr('{1}(`{0}`) as `{2}`'.format(key, aggfunc, data_col)))
    sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=data_columns,
                              column_index=column_index if multi_aggs else None,
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    kdf = DataFrame(internal)
    if not self._as_index:
        kdf = kdf.reset_index()
    return kdf
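
# Illustrative note, not in the original source: for the docstring example above,
# ``df.groupby('A').agg({'B': ['min', 'max']})`` sets ``multi_aggs``, so the loop is expected
# to build Spark SQL expressions roughly of the form
#
#     min(`B`) as `('B', 'min')`
#     max(`B`) as `('B', 'max')`
#
# whereas a spec like ``{'B': 'nunique'}`` would instead be rewritten to
# ``count(DISTINCT `B`) as `B```, since Spark has no ``nunique`` aggregate of its own.
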