Code example #1 (コード例 #1)
    def _apply_agg(self, agg_type, result, add_col_values,
                   ctx, val_columns, val_columns_out, sort_result=True):
        """
        Run one libgdf groupby aggregation and store its output columns.

        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        result : DataFrame
            The DataFrame to store the result of the aggregation into.
        add_col_values : bool
            Boolean to indicate whether this is the first aggregation being
            run and should add the additional columns' values.
        ctx : gdf_context cffi object
            Context object to pass information such as if the dataframe
            is sorted and/or which method to use for grouping.
        val_columns : list of *str*
            The list of column names that the aggregation should be performed
            on.
        val_columns_out : list of *str*
            The list of columns names that the aggregation results should be
            output into.
        sort_result : bool, default True
            When True, set ``ctx.flag_sort_result`` so libgdf sorts the
            grouped output.

        Returns
        -------
        DataFrame
            ``result`` with the group-key columns (first run only) and one
            aggregated column per entry in ``val_columns`` filled in.
        """
        if sort_result:
            ctx.flag_sort_result = 1

        ncols = len(self._by)
        cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

        first_run = add_col_values
        need_to_index = False

        col_count = 0
        for val_col in val_columns:
            col_agg = self._df[val_col]._column.cffi_view

            # assuming here that if there are multiple aggregations that the
            # aggregated results will be in the same order for GDF_SORT method
            if need_to_index:
                out_col_indices_series = Series(
                    Buffer(rmm.device_array(col_agg.size, dtype=np.int32)))
                out_col_indices = out_col_indices_series._column.cffi_view
            else:
                out_col_indices = ffi.NULL

            # Allocate one output buffer per group-key column, matching each
            # key column's dtype.
            out_col_values_series = [Series(Buffer(rmm.device_array(
                col_agg.size,
                dtype=self._df[self._by[i]]._column.data.dtype)))
                for i in range(0, ncols)]
            out_col_values = [
                out_col_values_series[i]._column.cffi_view
                for i in range(0, ncols)]

            # "count" always produces int64 regardless of the input dtype;
            # other aggregations keep the value column's dtype.
            if agg_type == "count":
                out_col_agg_series = Series(
                    Buffer(rmm.device_array(col_agg.size, dtype=np.int64)))
            else:
                out_col_agg_series = Series(Buffer(rmm.device_array(
                    col_agg.size, dtype=self._df[val_col]._column.data.dtype)))

            out_col_agg = out_col_agg_series._column.cffi_view

            agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
            if agg_func is None:
                raise RuntimeError(
                    "ERROR: this aggregator has not been implemented yet")
            err = agg_func(
                ncols,
                cols,
                col_agg,
                out_col_indices,
                out_col_values,
                out_col_agg,
                ctx)

            if (err is not None):
                raise RuntimeError(err)

            # libgdf reports the number of groups actually produced via the
            # output column's size; the buffers above may be larger.
            num_row_results = out_col_agg.size

            if first_run:
                for i, thisBy in enumerate(self._by):
                    result[thisBy] = out_col_values_series[i][
                        :num_row_results]

                    # Re-wrap raw output as categorical, restoring the
                    # category metadata from the source column.
                    if is_categorical_dtype(self._df[thisBy].dtype):
                        result[thisBy] = CategoricalColumn(
                            data=result[thisBy].data,
                            categories=self._df[thisBy].cat.categories,
                            ordered=self._df[thisBy].cat.ordered)

            # Shrink the aggregation buffer to the real group count before
            # storing it into the result.
            out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index()

            result[val_columns_out[col_count]
                   ] = out_col_agg_series[:num_row_results]

            first_run = False
            col_count = col_count + 1

        return result
Code example #2 (コード例 #2)
 def _get_row_major(self, df, row_tuple):
     """
     Select rows of ``df`` addressed by ``row_tuple`` against its
     MultiIndex, then rebuild and downcast the resulting index.

     Parameters
     ----------
     df : DataFrame
         Frame whose rows are selected; presumably indexed by this
         MultiIndex — TODO confirm against callers.
     row_tuple : tuple
         Indexing key; its first element may be a number, a slice, or a
         per-level label key.

     Returns
     -------
     DataFrame or Series
         The selected rows, with the index reduced by the number of
         levels consumed by ``row_tuple``.
     """
     slice_access = False
     if isinstance(row_tuple[0], numbers.Number):
         valid_indices = row_tuple[0]
     elif isinstance(row_tuple[0], slice):
         # 1. empty slice compute
         if row_tuple[0].stop == 0:
             valid_indices = []
         else:
             slice_access = True
             # Normalize the slice: missing start defaults to 0, missing
             # stop to the full frame length, missing step to 1.
             start = row_tuple[0].start or 0
             stop = row_tuple[0].stop or len(df)
             step = row_tuple[0].step or 1
             valid_indices = cudautils.arange(start, stop, step)
     else:
         # Label-based access: compute the positions of matching rows.
         valid_indices = self._compute_validity_mask(df, row_tuple)
     from cudf import Series
     result = df.take(Series(valid_indices))
     # Build new index - INDEX based MultiIndex
     # ---------------
     from cudf import DataFrame
     out_index = DataFrame()
     # Select the last n-k columns where n is the number of source
     # levels and k is the length of the indexing tuple
     size = 0
     if not isinstance(row_tuple[0], (numbers.Number, slice)):
         size = len(row_tuple)
     for k in range(size, len(df.index.levels)):
         out_index.add_column(df.index.names[k],
                              df.index.codes[df.index.codes.columns[k]])
     # If there's only one column remaining in the output index, convert
     # it into an Index and name the final index values according
     # to the proper codes.
     if len(out_index.columns) == 1:
         # Materialize the last level's values one code at a time on the
         # host (rebinds out_index from DataFrame to a plain list).
         out_index = []
         for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]:  # noqa: E501
             out_index.append(result.index.levels[
                     len(result.index.codes.columns)-1][val])
         out_index = as_index(out_index)
         out_index.name = result.index.names[len(result.index.names)-1]
         result.index = out_index
     else:
         if len(result) == 1 and size == 0 and slice_access is False:
             # If the final result is one row and it was not mapped into
             # directly
             result = result.T
             result = result[result.columns[0]]
             # convert to Series
             series_name = []
             for idx, code in enumerate(result.columns.codes):
                 series_name.append(result.columns.levels[idx][
                         result.columns.codes[code][0]])
             result = Series(list(result._cols.values())[0],
                             name=series_name)
             result.name = tuple(series_name)
         elif(len(out_index.columns)) > 0:
             # Otherwise pop the leftmost levels, names, and codes from the
             # source index until it has the correct number of columns (n-k)
             # NOTE(review): the return value of reset_index is discarded,
             # so this call appears to have no effect — confirm intent.
             result.reset_index(drop=True)
             result.index = result.index._popn(size)
     return result
Code example #3 (コード例 #3)
    def _apply_agg(self, agg_type, result, add_col_values,
                   ctx, val_columns, val_columns_out, sort_result=True):
        """
        Run one libgdf groupby aggregation and store its output columns.

        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        result : DataFrame
            The DataFrame to store the result of the aggregation into.
        add_col_values : bool
            Boolean to indicate whether this is the first aggregation being
            run and should add the additional columns' values.
        ctx : gdf_context cffi object
            Context object to pass information such as if the dataframe
            is sorted and/or which method to use for grouping.
        val_columns : list of *str*
            The list of column names that the aggregation should be performed
            on.
        val_columns_out : list of *str*
            The list of columns names that the aggregation results should be
            output into.
        sort_result : bool, default True
            When True, set ``ctx.flag_sort_result`` so libgdf sorts the
            grouped output.

        Returns
        -------
        DataFrame
            ``result`` with the group-key columns (first run only) and one
            aggregated column per entry in ``val_columns`` filled in.
        """

        if sort_result:
            ctx.flag_sort_result = 1

        ncols = len(self._by)
        cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

        first_run = add_col_values
        # Request row indices from libgdf when the grouped result keeps
        # the keys as an index.
        need_to_index = self._as_index

        col_count = 0
        # Accept a single column name/number as well as a list of them.
        if isinstance(val_columns, (str, Number)):
            val_columns = [val_columns]
        for val_col in val_columns:
            col_agg = self._df[val_col]._column.cffi_view

            # assuming here that if there are multiple aggregations that the
            # aggregated results will be in the same order for GDF_SORT method
            if need_to_index:
                out_col_indices_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int32
                        )
                    )
                )
                out_col_indices = out_col_indices_series._column.cffi_view
            else:
                out_col_indices = ffi.NULL

            # Allocate one output buffer per group-key column.
            out_col_values_series = []
            for i in range(0, ncols):
                if self._df[self._by[i]].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    col = Series([''], dtype='str')[gather_map]\
                        .reset_index(drop=True)
                else:
                    col = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[self._by[i]]._column.data.dtype
                            )
                        )
                    )
                out_col_values_series.append(col)
            out_col_values = [
                out_col_values_series[i]._column.cffi_view
                for i in range(0, ncols)]

            # Output dtype depends on the aggregation: count -> int64,
            # mean -> float64, otherwise the value column's own dtype.
            if agg_type == "count":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int64
                        )
                    )
                )
            elif agg_type == "mean":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.float64
                        )
                    )
                )
            else:
                if self._df[val_col].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    out_col_agg_series = Series(
                        [''],
                        dtype='str'
                    )[gather_map].reset_index(drop=True)
                else:
                    out_col_agg_series = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[val_col]._column.data.dtype
                            )
                        )
                    )

            out_col_agg = out_col_agg_series._column.cffi_view

            agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
            if agg_func is None:
                raise RuntimeError(
                    "ERROR: this aggregator has not been implemented yet")

            err = agg_func(
                ncols,
                cols,
                col_agg,
                out_col_indices,
                out_col_values,
                out_col_agg,
                ctx)

            if (err is not None):
                raise RuntimeError(err)

            # Number of groups actually produced; the buffers above may be
            # larger than this.
            num_row_results = out_col_agg.size

            # NVStrings columns are not the same going in as coming out but we
            # can't create entire CFFI views otherwise multiple objects will
            # try to free the memory
            for i, col in enumerate(out_col_values_series):
                if col.dtype == np.dtype("object") and len(col) > 0:
                    import nvcategory
                    nvcat_ptr = int(
                        ffi.cast(
                            "uintptr_t",
                            out_col_values[i].dtype_info.category
                        )
                    )
                    nvcat_obj = None
                    if nvcat_ptr:
                        # Re-bind the category pointer libgdf wrote into the
                        # view and materialize its strings.
                        nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                        nvstr_obj = nvcat_obj.to_strings()
                    else:
                        import nvstrings
                        nvstr_obj = nvstrings.to_device([])
                    out_col_values_series[i]._column._data = nvstr_obj
                    out_col_values_series[i]._column._nvcategory = nvcat_obj
            # Same re-binding for the aggregated column when it is a string
            # column.
            if out_col_agg_series.dtype == np.dtype("object") and \
                    len(out_col_agg_series) > 0:
                import nvcategory
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_agg.dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_agg_series._column._data = nvstr_obj
                out_col_agg_series._column._nvcategory = nvcat_obj

            if first_run:
                for i, thisBy in enumerate(self._by):
                    result[thisBy] = out_col_values_series[i][
                        :num_row_results]

                    # Re-wrap raw output as categorical, restoring the
                    # category metadata from the source column.
                    if is_categorical_dtype(self._df[thisBy].dtype):
                        result[thisBy] = CategoricalColumn(
                            data=result[thisBy].data,
                            categories=self._df[thisBy].cat.categories,
                            ordered=self._df[thisBy].cat.ordered
                        )

            # Shrink non-string buffers to the real group count (string
            # columns were already rebuilt at the right size above).
            if out_col_agg_series.dtype != np.dtype("object"):
                out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index(drop=True)

            if isinstance(val_columns_out, (str, Number)):
                result[val_columns_out] = out_col_agg_series[:num_row_results]
            else:
                result[val_columns_out[col_count]
                       ] = out_col_agg_series[:num_row_results]

            # NOTE(review): this repeats the truncation/reset done just above
            # and the series is rebuilt on the next iteration — it looks like
            # dead code; confirm before removing.
            if out_col_agg_series.dtype != np.dtype("object"):
                out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index(drop=True)

            first_run = False
            col_count = col_count + 1

        return result
Code example #4 (コード例 #4) — file: multiindex.py, project: zeichuan/cudf
    def _index_and_downcast(self, result, index, index_key):
        """
        Rebuild the index of ``result`` after a MultiIndex selection and
        downcast to a simpler index — or to a Series — when possible.

        Parameters
        ----------
        result : DataFrame
            The rows already selected from the source frame.
        index : MultiIndex
            The MultiIndex the selection was performed against.
        index_key : numbers.Number, slice, tuple or list
            The key used for the selection; its length determines how many
            index levels were consumed.

        Returns
        -------
        DataFrame or Series
            ``result`` re-indexed; a Series when the selection narrows to a
            single row (or to no rows) without slice access.
        """
        from cudf import DataFrame
        from cudf import Series

        # Normalize scalar and slice keys to a one-element list.
        if isinstance(index_key, (numbers.Number, slice)):
            index_key = [index_key]
        # Unwrap a non-tuple container (or a leading slice) to its first
        # element so `size` below counts only the levels actually consumed.
        if (
            len(index_key) > 0 and not isinstance(index_key, tuple)
        ) or isinstance(index_key[0], slice):
            index_key = index_key[0]

        slice_access = False
        if isinstance(index_key, slice):
            slice_access = True
        out_index = DataFrame()
        # Select the last n-k columns where n is the number of _source_data
        # columns and k is the length of the indexing tuple
        size = 0
        if not isinstance(index_key, (numbers.Number, slice)):
            size = len(index_key)
        for k in range(size, len(index._source_data.columns)):
            out_index.add_column(
                index.names[k],
                index._source_data[index._source_data.columns[k]],
            )

        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is one row and it was not mapped into
            # directly, return a Series with a tuple as name.
            result = result.T
            result = result[result.columns[0]]
            # convert to Series
            series_name = []
            # NOTE(review): `idx` is unused and `code` is used as the column
            # key — verify this collects the intended per-level name values.
            for idx, code in enumerate(index._source_data.columns):
                series_name.append(result.columns._source_data[code][0])
            result = Series(list(result._cols.values())[0], index=result.index)
            result.name = tuple(series_name)
        elif len(result) == 0 and slice_access is False:
            # Pandas returns an empty Series with a tuple as name
            # the one expected result column
            series_name = []
            for idx, code in enumerate(index._source_data.columns):
                series_name.append(index._source_data[code][0])
            result = Series([])
            result.name = tuple(series_name)
        elif len(out_index.columns) == 1:
            # If there's only one column remaining in the output index, convert
            # it into an Index and name the final index values according
            # to the _source_data column names
            last_column = index._source_data.columns[-1]
            out_index = index._source_data[last_column]
            out_index = as_index(out_index)
            out_index.name = index.names[len(index.names) - 1]
            index = out_index
        elif len(out_index.columns) > 1:
            # Otherwise pop the leftmost levels, names, and codes from the
            # source index until it has the correct number of columns (n-k)
            # NOTE(review): the return value of reset_index is discarded,
            # so this call appears to have no effect — confirm intent.
            result.reset_index(drop=True)
            index = index._popn(size)
        # Only tuple keys re-attach the rebuilt index to the result.
        if isinstance(index_key, tuple):
            result = result.set_index(index)
        return result