Ejemplo n.º 1
0
    def _apply_basic_agg(self, agg_type, sort_results=False):
        """Run one aggregation over all of the groupby's value columns.

        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        sort_results : bool, optional
            Forwarded to ``_apply_agg`` as its ``sort_result`` argument.

        Returns
        -------
        result : DataFrame or Series
            When ``as_index`` is set and the value columns are given as a
            single name (``str`` or ``Number``), the aggregated column is
            returned on its own, indexed by the first groupby key.
            Otherwise the aggregated DataFrame is returned; with
            ``as_index`` set, the first groupby key is moved out of the
            columns and into the index.

        Notes
        -----
        ``nvtx_range_pop`` is called exactly once on every return path.
        Previously the single-column path returned without popping.
        """
        ctx = ffi.new('gdf_context*')
        ctx.flag_sorted = 0
        ctx.flag_method = self._method
        ctx.flag_distinct = 0

        val_columns = self._val_columns
        # Output columns keep the names of the input value columns.
        val_columns_out = val_columns
        # Single aggregation pass, so the key columns must be added here.
        add_col_values = True

        result = self._apply_agg(agg_type,
                                 DataFrame(),
                                 add_col_values,
                                 ctx,
                                 val_columns,
                                 val_columns_out,
                                 sort_result=sort_results)

        # TODO: Do MultiIndex here
        if self._as_index:
            by_name = self._by[0]
            idx = index.as_index(result[by_name])
            # With level == 0 the index keeps the name recorded in
            # _original_index_name; otherwise it is named after the key.
            if self.level == 0:
                idx.name = self._original_index_name
            else:
                idx.name = by_name

            if isinstance(val_columns, (str, Number)):
                # One index column and one value column: hand back just
                # the aggregated column instead of a DataFrame.
                result = result[val_columns].set_index(idx)
            else:
                # The key now lives in the index; drop the column copy.
                result.drop_column(by_name)
                result = result.set_index(idx)

        nvtx_range_pop()

        return result
Ejemplo n.º 2
0
    def agg(self, args):
        """ Invoke aggregation functions on the groups.

        Parameters
        ----------
        args : dict, list, str
            - str
                The aggregate function name.
            - list
                List of *str* of the aggregate function.
            - dict
                key-value pairs of source column name and list of
                aggregate functions as *str* (a single *str* is also
                accepted as the value).

            Any other non-sequence, non-mapping value is wrapped in a
            one-element list and handled like the list form.

        Returns
        -------
        result : DataFrame

        Notes
        -----
        Since multi-indexes aren't supported aggregation results are returned
        in columns using the naming scheme of `aggregation_columnname`.
        The prefix is only omitted when a single aggregation is applied to
        a single value column.

        Aggregation names are concatenated with ``'_'`` to build the
        prefixed column names, so they are expected to be *str*.
        """
        is_mapping = isinstance(args, collections.abc.Mapping)
        is_sequence = (not isinstance(args, str)
                       and isinstance(args, collections.abc.Sequence))

        if not (is_mapping or is_sequence):
            # Scalar spec: delegate to the list form and return its result
            # directly.  The delegated call already pops the nvtx range;
            # falling through would pop it a second time.
            return self.agg([args])

        result = DataFrame()
        add_col_values = True

        ctx = ffi.new('gdf_context*')
        ctx.flag_sorted = 0
        ctx.flag_method = self._method
        ctx.flag_distinct = 0

        sort_result = True

        # TODO: Use MultiColumn here instead of use_prefix
        # use_prefix enables old functionality - prefixing column
        # groupby names since we don't support MultiColumn quite yet
        use_prefix = 1 < len(self._val_columns) or 1 < len(args)

        if is_sequence:
            for agg_type in args:
                if use_prefix:
                    val_columns_out = [
                        agg_type + '_' + val for val in self._val_columns
                    ]
                else:
                    val_columns_out = self._val_columns
                result = self._apply_agg(agg_type,
                                         result,
                                         add_col_values,
                                         ctx,
                                         self._val_columns,
                                         val_columns_out,
                                         sort_result=sort_result)
                add_col_values = False  # we only want to add them once
        else:
            # A lone column with a one-element spec skips result sorting.
            if len(args) == 1 and len(next(iter(args.values()))) == 1:
                sort_result = False

            # Normalise every spec to a list of aggregation names.  Specs
            # that are neither str nor a sequence are skipped.
            plan = []
            for val, agg_type in args.items():
                if isinstance(agg_type, str):
                    plan.append((val, [agg_type]))
                elif isinstance(agg_type, collections.abc.Sequence):
                    plan.append((val, list(agg_type)))

            # Several aggregations of one column must not share a single
            # output name, otherwise each result overwrites the previous.
            if 1 < sum(len(agg_names) for _, agg_names in plan):
                use_prefix = True

            for val, agg_names in plan:
                for agg_type in agg_names:
                    if use_prefix:
                        val_columns_out = [agg_type + '_' + val]
                    else:
                        val_columns_out = self._val_columns
                    result = self._apply_agg(agg_type,
                                             result,
                                             add_col_values,
                                             ctx, [val],
                                             val_columns_out,
                                             sort_result=sort_result)
                    add_col_values = False  # we only want to add them once

        # TODO: Do multindex here
        if self._as_index and 1 == len(self._by):
            idx = index.as_index(result[self._by[0]])
            idx.name = self._by[0]
            result = result.set_index(idx)
            # The key is now the index; remove its column copy.
            result.drop_column(idx.name)

        nvtx_range_pop()
        return result