def apply_multicolumn(self, result, aggs):
    """Relabel the columns of an aggregation result.

    ``result`` is expected to carry flat ``'<agg>_<column>'`` labels such as
    ``'sum_z'``.

    * With a single aggregation or a single groupby key, every label is
      split on ``'_'`` and one piece is kept; the labels of ``result`` are
      replaced in place.
    * Otherwise the columns are regrouped so that all aggregations of one
      value column are adjacent, and a two-level ``MultiIndex`` of
      ``(value column, aggregation)`` is installed as the column index.

    Parameters
    ----------
    result : DataFrame
        Aggregation output whose columns are relabelled.
    aggs : list of str
        Names of the aggregations that were applied.

    Returns
    -------
    DataFrame
        ``result`` itself, or a new frame when the columns had to be
        reordered (more than one value column).
    """
    # multicolumn only applies with multiple aggs and multiple groupby keys
    if len(aggs) == 1 or len(self._by) == 1:
        # unprefix columns: 'sum_z' splits into ('sum', 'z').  Which piece is
        # kept does not depend on the individual label, so decide it once.
        # NOTE(review): labels are split on every '_', so a value column
        # whose own name contains '_' is truncated - confirm upstream naming.
        piece = 0 if len(self._by) == 1 and len(result) != 0 else 1
        result.columns = [c.split('_')[piece] for c in result.columns]
        return result

    n_vals = len(self._val_columns)
    n_aggs = len(aggs)

    # reorder our columns to match pandas: all aggs of the first value
    # column, then all aggs of the second one, and so on
    if n_vals > 1:
        reordered = DataFrame(index=result.index)
        for col in self._val_columns:
            for agg in aggs:
                label = agg + '_' + col
                reordered[label] = result[label]
        result = reordered

    levels = [self._val_columns, aggs]
    # First level: each value-column code repeated once per aggregation.
    # Second level: the aggregation codes 0..n_aggs-1 cycled once per value
    # column.  For levels [['x', 'z'], ['sum', 'min']] this gives
    # codes == [[0, 0, 1, 1], [0, 1, 0, 1]].
    # The second level used to be hard-coded as [0, 1], which was only valid
    # for exactly two aggregations.
    codes = [
        [val_code for val_code in range(n_vals) for _ in range(n_aggs)],
        n_vals * list(range(n_aggs)),
    ]
    result.columns = MultiIndex(levels, codes)
    return result
def _getitem_tuple_arg(self, arg):
    """Resolve a ``(rows, columns)`` key against ``self._df``.

    The lookup runs in four stages: select the columns named by ``arg[1]``,
    select the rows named by ``arg[0]``, restore the index of a single-row
    result, and finally downcast to a Series or scalar where
    ``self._can_downcast_to_series`` allows it.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` is the row selector and ``arg[1]`` the column selector;
        either may be a ``slice``.

    Returns
    -------
    DataFrame, Series or scalar
        Depends on the selectors and on whether the rows/columns of
        ``self._df`` are backed by a ``MultiIndex``.
    """
    from cudf import MultiIndex
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.dataframe import Series
    from cudf.dataframe.index import as_index

    rows_are_slice = isinstance(arg[0], slice)
    cols_are_slice = isinstance(arg[1], slice)
    any_slice = rows_are_slice or cols_are_slice

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns = self._get_column_selection(arg[1])
    if isinstance(self._df.columns, MultiIndex):
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
        nothing_selected = (
            len(columns_df) == 0 and len(columns_df.columns) == 0
        )
        if nothing_selected and not rows_are_slice:
            # Empty selection addressed by a non-slice row key: hand back an
            # empty Series named after that key.
            empty = Series([], name=arg[0])
            empty._index = columns_df.columns.copy(deep=False)
            return empty
    elif rows_are_slice:
        columns_df = DataFrame()
        for name in columns:
            columns_df.add_column(name=name, data=self._df[name])
        columns_df._index = self._df._index
    else:
        columns_df = self._df._columns_view(columns)

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        df = columns_df.index._get_row_major(columns_df, arg[0])
        single_hit = len(df) == 1 and len(columns_df) >= 1
        if single_hit and not any_slice:
            # Pandas returns a numpy scalar in this case
            return df[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df

    df = DataFrame()
    for label, column in columns_df._cols.items():
        df[label] = column.iloc[arg[0]]
    df.columns = columns_df.columns

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        if rows_are_slice:
            position = 0 if arg[0].start is None else arg[0].start
        else:
            position = arg[0]
        df.index = as_index(self._df.index[position])

    # Iloc Step 4:
    # Downcast
    if not self._can_downcast_to_series(df, arg):
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.dataframe.index import RangeIndex

            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df

    if not isinstance(df.columns, MultiIndex):
        return self._downcast_to_series(df, arg)

    if len(df) > 0 and not any_slice:
        # Both selectors are non-slices: return the first element of the
        # first column.
        return list(df._cols.values())[0][0]

    if df.shape[1] > 1:
        result = self._downcast_to_series(df, arg)
        result.index = df.columns
        return result

    first_column = list(df._cols.values())[0]
    if rows_are_slice:
        return first_column
    first_column.index = df.columns
    first_column.name = arg[0]
    return first_column