Beispiel #1
0
    def apply_multicolumn(self, result, aggs):
        """Relabel the columns of an aggregation result to match pandas.

        On the multi-level path the columns of ``result`` are looked up as
        ``<agg>_<column>`` (for example ``sum_z``).

        * With a single aggregation, or a single groupby key, the columns
          are flattened to one name component each.
        * Otherwise (several aggregations and several groupby keys) the
          columns are grouped per value column and labelled with a two-level
          ``MultiIndex`` of ``(value column, aggregation)``.

        Parameters
        ----------
        result : DataFrame
            Aggregation output whose column labels should be rewritten.
        aggs : list of str
            Aggregation names, in the order they should appear under each
            value column.

        Returns
        -------
        DataFrame
            ``result`` with its ``columns`` reassigned, or a newly built
            frame when the columns had to be reordered.
        """
        # multicolumn only applies with multiple aggs and multiple groupby keys
        if len(aggs) == 1 or len(self._by) == 1:
            # Loop-invariant: decides which half of the name is kept.
            # NOTE(review): with a single groupby key and a non-empty result
            # the part *before* the first underscore is kept; the column
            # naming this relies on is not visible here -- confirm upstream.
            keep_head = len(self._by) == 1 and len(result) != 0
            result.columns = [
                # maxsplit=1 keeps value-column names that themselves contain
                # underscores intact: 'sum_my_col' -> 'my_col', not 'my'.
                c.split('_')[0] if keep_head else c.split('_', 1)[1]
                for c in result.columns
            ]
            return result

        n_vals = len(self._val_columns)
        n_aggs = len(aggs)

        # Reorder our columns to match pandas: every aggregation of the first
        # value column, then every aggregation of the second, and so on.
        if n_vals > 1:
            reordered = DataFrame(index=result.index)
            for col in self._val_columns:
                for agg in aggs:
                    name = agg + '_' + col
                    reordered[name] = result[name]
            result = reordered

        levels = [self._val_columns, aggs]
        # First level: each value-column code repeated once per aggregation.
        # Second level: the aggregation codes 0..n_aggs-1 cycled once per
        # value column.
        # e.g. levels [['x', 'z'], ['sum', 'min']]
        #   -> codes  [[0, 0, 1, 1], [0, 1, 0, 1]]
        # Deriving the second level from len(aggs) keeps both code lists the
        # same length for any number of aggregations, not just two.
        codes = [
            [val_code for val_code in range(n_vals) for _ in range(n_aggs)],
            list(range(n_aggs)) * n_vals,
        ]
        result.columns = MultiIndex(levels, codes)
        return result
Beispiel #2
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a two-element ``(rows, columns)`` positional lookup.

        ``arg[1]`` selects columns first, then ``arg[0]`` selects rows from
        that column subset. Depending on the shape of the selection and on
        whether the frame's index/columns are a ``MultiIndex``, the result
        is returned as a DataFrame, downcast to a Series, or reduced to a
        single element.

        Parameters
        ----------
        arg : tuple
            ``arg[0]`` is the row selector and ``arg[1]`` the column
            selector. Both are tested against ``slice`` below; other
            accepted selector types depend on helpers not shown here.

        Returns
        -------
        DataFrame, Series, or a single element
            See the inline comments for which branch produces which.
        """
        # Imported locally rather than at module scope.
        # NOTE(review): presumably to avoid circular imports -- confirm.
        from cudf import MultiIndex
        from cudf.dataframe.dataframe import DataFrame
        from cudf.dataframe.dataframe import Series
        from cudf.dataframe.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        # (`columns` is only consumed by the non-MultiIndex branch below.)
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            # MultiIndex columns: the MultiIndex itself does the selection.
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (
                len(columns_df) == 0
                and len(columns_df.columns) == 0
                and not isinstance(arg[0], slice)
            ):
                # Nothing selected and a non-slice row key: return an empty
                # Series named after the row key, indexed by a shallow copy
                # of the (empty) selected columns.
                result = Series([], name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                # Row slice: build a new frame column by column and reuse
                # the source frame's index object.
                columns_df = DataFrame()
                for col in columns:
                    columns_df.add_column(name=col, data=self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows: the MultiIndex does the row selection, and
            # this branch always returns (steps 3 and 4 are skipped).
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            # Plain index: apply the row selector to each column, then
            # restore the column labels from the column-selected frame.
            df = DataFrame()
            for key, col in columns_df._cols.items():
                df[key] = col.iloc[arg[0]]
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                # Label the single row with the source index value at the
                # slice start (an open start is treated as position 0).
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (
                    isinstance(arg[0], slice) or isinstance(arg[1], slice)
                ):
                    # Neither selector is a slice and there is data:
                    # return the first element of the first column.
                    return list(df._cols.values())[0][0]
                elif df.shape[1] > 1:
                    # Several columns: downcast, then index the Series by
                    # the MultiIndex column labels.
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    # At most one column with a non-slice row key: return
                    # the first stored column, re-indexed by the column
                    # labels and named after the row key.
                    result_series = list(df._cols.values())[0]
                    result_series.index = df.columns
                    result_series.name = arg[0]
                    return result_series
                else:
                    # Row slice: return the first stored column as-is.
                    return list(df._cols.values())[0]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.dataframe.index import RangeIndex

            # Completely empty result: give it a RangeIndex derived from
            # the row slice.
            # NOTE(review): this reads `arg[0].stop` and `arg[0].indices`,
            # so it assumes arg[0] is a slice here -- confirm callers cannot
            # reach this branch with a scalar or list row key.
            # NOTE(review): `step` is computed but not passed on to
            # RangeIndex -- confirm that is intended.
            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df