def copy(self, deep=True):
    """Return a copy of this MultiIndex.

    Copies ``_source_data`` (honoring ``deep``) and, when present,
    the cached ``_levels``, ``_codes``, and ``names``.
    """
    duplicate = MultiIndex(source_data=self._source_data.copy(deep))
    if self._levels is not None:
        duplicate._levels = [level.copy(deep) for level in self._levels]
    if self._codes is not None:
        duplicate._codes = self._codes.copy(deep)
    if self.names is not None:
        duplicate.names = self.names.copy()
    return duplicate
def _popn(self, n):
    """
    Returns a copy of this index without the left-most n values.

    Removes n names, labels, and codes in order to build a new index
    for results.
    """
    trimmed = MultiIndex(source_data=self._source_data.iloc[:, n:])
    if self.names is not None:
        trimmed.names = self.names[n:]
    return trimmed
def _popn(self, n):
    """
    Returns a copy of this index without the left-most n values.

    Removes n names, labels, and codes in order to build a new index
    for results.
    """
    from cudf import DataFrame

    # Keep only the code columns to the right of position n.
    remaining_codes = DataFrame()
    for column_name in self.codes.columns[n:]:
        remaining_codes.add_column(column_name, self.codes[column_name])
    popped = MultiIndex(self.levels[n:], remaining_codes)
    popped.names = self.names[n:]
    return popped
def agg(self, agg_types):
    """Run the requested aggregations over the grouped series.

    Builds a temporary DataFrame holding the key column(s) and the
    value column, delegates to DataFrame.groupby(...).agg(...), then
    reshapes the result back into a Series (single agg) or DataFrame
    (multiple aggs) with the appropriate index.
    """
    df = DataFrame()
    by = []
    if self.level is not None:
        if isinstance(self.source_series.index, MultiIndex):
            # Add index columns specified by multiindex into _df
            # Record the index column names for the groupby
            for col in self.source_series.index.codes:
                df[self.group_name + col] = self.source_series.index.codes[
                    col]
                by.append(self.group_name + col)
    else:
        if isinstance(self.group_keys, Series):
            df[self.group_name] = self.group_keys
            by = self.group_name
        else:
            # group_keys is already a frame of key columns
            df = self.group_keys
            by = self._by
    df[self.source_name] = self.source_series
    groupby = df.groupby(by).agg(agg_types)
    idx = groupby.index
    if len(groupby.columns) == 1:
        # single aggregation: unwrap to a Series carrying the source name
        result = groupby[self.source_name]
        result.name = self.source_series.name
        idx.name = None
        result = result.set_index(idx)
    else:
        idx.name = self.group_name
        result = groupby.set_index(idx)
    if len(result) == 0 and self._by is not None:
        # empty result: install an all-empty MultiIndex named after the
        # groupby keys so the shape matches the non-empty case
        empties = [[] for x in range(len(self._by))]
        mi = MultiIndex(empties, empties, names=self._by)
        result = result.set_index(mi)
    return result
def apply_multicolumn(self, result, aggs):
    """Set the column labels of a multi-agg groupby result.

    With a single aggregation the prefixed names ('sum_z') are
    flattened back to the value-column names; otherwise a
    (value-column, agg) MultiIndex is installed.
    """
    code_size = max(len(aggs), len(self._val_columns))
    levels = [self._val_columns, aggs]
    # First code level is all zeros, second counts 0..code_size-1:
    # this pairs the single value column (or single agg) against the
    # enumerated entries of the longer level.
    codes = [
        list(np.zeros(code_size, dtype='int64')),
        list(range(code_size)),
    ]
    if len(aggs) == 1:
        # strip the agg prefix: 'sum_z' -> 'z'
        result.columns = [name.split('_')[1] for name in result.columns]
    else:
        result.columns = MultiIndex(levels, codes)
    return result
def compute_result_column_index(self):
    """Computes the column index of the result."""
    value_names = self.value_names
    aggs_as_list = self.get_aggs_as_list()

    if isinstance(self.obj, cudf.Series):
        # Series input: a single agg keeps the series' name (or None),
        # multiple aggs become the column labels.
        if len(aggs_as_list) != 1:
            return aggs_as_list
        return None if self.obj.name is None else [self.obj.name]

    # DataFrame input: decide between flat names and a MultiIndex.
    if isinstance(self.original_aggs, str):
        use_multi_index = False
    elif isinstance(self.original_aggs, collections.abc.Mapping):
        # a mapping only needs a MultiIndex when some value is non-str
        use_multi_index = any(
            not isinstance(v, str) for v in self.original_aggs.values()
        )
    else:
        use_multi_index = True

    if use_multi_index:
        return MultiIndex.from_tuples(zip(value_names, aggs_as_list))
    return value_names
def apply_multiindex_or_single_index(self, result):
    """Attach the groupby-key index to `result`.

    A single key becomes a flat index; multiple keys become a
    MultiIndex. Empty results get an empty index of matching shape,
    and single-column attribute-access results (`_gotattr`) are
    downcast to a Series.
    """
    if len(result) == 0:
        # Empty result: keep only non-key columns and synthesize an
        # empty index of the right kind.
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            dtype = 'float64' if len(self._by) == 1 else 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            # attribute-access groupby on one column -> return a Series
            final_series = Series([], name=final_result.columns[0])
            final_series.index = final_result.index
            return final_series
        return final_result
    if len(self._by) == 1:
        # Single key: promote that column to a flat index.
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result = result.drop(idx.name)
        if idx.name == self._LEVEL_0_INDEX_NAME:
            # restore the caller's original index name
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        # Multiple keys: build a MultiIndex from the key columns.
        multi_index = MultiIndex(source_data=result[self._by])
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series(final_result[final_result.columns[0]])
            final_series.name = final_result.columns[0]
            final_series.index = multi_index
            return final_series
        return final_result.set_index(multi_index)
def test_multiindex_tuples(testarr):
    """cudf and pandas MultiIndexes built from the same tuples agree on
    uniqueness/monotonicity flags."""
    tuples = list(zip(*testarr[0]))
    gdf_index = MultiIndex.from_tuples(tuples, names=testarr[1])
    pdf_index = pd.MultiIndex.from_tuples(tuples, names=testarr[1])
    for prop in ('is_unique', 'is_monotonic',
                 'is_monotonic_increasing', 'is_monotonic_decreasing'):
        assert getattr(gdf_index, prop) == getattr(pdf_index, prop)
def take(self, indices):
    """Return a new MultiIndex with the rows selected by `indices`."""
    from collections.abc import Sequence
    from numbers import Integral
    from cudf import Series

    # Normalize the selector to a gatherable array of positions.
    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        indices = indices.to_gpu_array()
    elif isinstance(indices, slice):
        start, stop, step, _ = utils.standard_python_slice(
            len(self), indices)
        indices = cudautils.arange(start, stop, step)

    if hasattr(self, '_source_data'):
        taken = MultiIndex(source_data=self._source_data.take(indices))
    else:
        taken = MultiIndex(self.levels, self.codes.take(indices))
    taken.names = self.names
    return taken
def take(self, indices):
    """Gather the rows at the given positions into a new MultiIndex."""
    from collections.abc import Sequence
    from numbers import Integral
    from cudf import Series

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        indices = indices.to_gpu_array()
    elif isinstance(indices, slice):
        bounds = indices.indices(len(self))
        indices = cudautils.arange(*bounds)

    gathered = MultiIndex(source_data=self._source_data.take(indices))
    # Carry over cached codes (gathered) and levels (shared) if present.
    if self._codes is not None:
        gathered._codes = self._codes.take(indices)
    if self._levels is not None:
        gathered._levels = self._levels
    gathered.names = self.names
    return gathered
def deserialize(cls, header, frames):
    """Convert from pickle format into Index

    Rebuilds the underlying source-data frame by delegating to the
    frame type recorded in the header, then restores the level names.
    """
    # header["source_data"]["type"] holds the pickled class of the
    # underlying frame; let that class deserialize its own payload.
    source_data_typ = pickle.loads(header["source_data"]["type"])
    source_data = source_data_typ.deserialize(header["source_data"],
                                              frames)
    # Fix: `names` was previously unpickled twice (duplicate statement).
    names = pickle.loads(header["names"])
    return MultiIndex(names=names, source_data=source_data)
def _concat(cls, objs):
    """Concatenate several MultiIndexes into one.

    Stacks the underlying source-data frames and, for each level,
    keeps the first non-None name found across the inputs.
    """
    from cudf import DataFrame, MultiIndex

    source_data = DataFrame._concat([o._source_data for o in objs])
    # First non-None name per level wins; inputs without names are
    # skipped entirely.
    names = [None] * len(source_data.columns)
    for obj in objs:
        if obj.names is None:
            continue
        for i, name in enumerate(obj.names):
            names[i] = names[i] or name
    return MultiIndex(names=names, source_data=source_data)
def _concat(cls, objs):
    """Concatenate MultiIndexes that carry `_source_data` frames."""
    from cudf import DataFrame
    from cudf import MultiIndex

    # Every input must expose a source-data frame; anything else is a
    # codes/levels-backed index we do not support here.
    if not all(hasattr(o, '_source_data') for o in objs):
        raise NotImplementedError(
            'MultiIndex._concat is only supported '
            'for groupby generated MultiIndexes at this time.')
    combined = DataFrame._concat([o._source_data for o in objs])
    return MultiIndex(source_data=combined)
def take(self, indices):
    """Gather the rows at the given positions into a new MultiIndex.

    Raises ValueError when a Series selector contains nulls.
    """
    from collections.abc import Sequence
    from numbers import Integral
    from cudf import Series

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        # null positions cannot be gathered
        if indices.null_count != 0:
            raise ValueError("Column must have no nulls.")
        indices = indices.data.mem
    elif isinstance(indices, slice):
        bounds = indices.indices(len(self))
        indices = cudautils.arange(*bounds)

    gathered = MultiIndex(source_data=self._source_data.take(indices))
    if self._codes is not None:
        gathered._codes = self._codes.take(indices)
    if self._levels is not None:
        gathered._levels = self._levels
    gathered.names = self.names
    return gathered
def apply_multicolumn_mapped(self, result, aggs):
    """Set result columns from a {column: agg(s)} mapping.

    Scalar (str/Number) agg values keep flat column names; iterable
    agg values produce a (column, agg) MultiIndex.
    """
    # Fix: the old guard `len(set(aggs.keys())) == len(aggs.keys())`
    # was tautologically True (dict keys are always unique), so only
    # the type of the first mapped value decides the branch.
    if isinstance(aggs[list(aggs.keys())[0]], (str, Number)):
        result.columns = aggs.keys()
    else:
        pairs = [(col, agg) for col in aggs for agg in aggs[col]]
        result.columns = MultiIndex.from_tuples(pairs)
    return result
def copy(self, deep=True):
    """Return a copy of this MultiIndex.

    The `_source_data` path copies the cached levels/codes; otherwise
    the levels and codes themselves are copied.
    """
    if hasattr(self, '_source_data'):
        # NOTE(review): `_source_data` is passed through without a
        # .copy() here, so the new index shares the frame with the
        # original — confirm this sharing is intentional (the sibling
        # copy() implementation does copy it).
        mi = MultiIndex(source_data=self._source_data)
        if self._levels is not None:
            mi._levels = self._levels.copy()
        if self._codes is not None:
            mi._codes = self._codes.copy(deep)
    else:
        mi = MultiIndex(self.levels.copy(), self.codes.copy(deep))
    if self.names is not None:
        mi.names = self.names.copy()
    return mi
def get_result():
    """Extract the grouped series from result_df and attach its index."""
    series = result_df[self.source_name]
    # synthetic level-0 data column keeps no name
    if self.source_name != _LEVEL_0_DATA_NAME:
        series.name = self.source_name
    else:
        series.name = None
    if len(result_df) == 0 and self._by is not None:
        # empty result: all-empty MultiIndex named after the keys
        empties = [[] for _ in self._by]
        series = series.set_index(
            MultiIndex(empties, empties, names=self._by))
    else:
        idx = result_df.index
        if self.group_name == _LEVEL_0_INDEX_NAME:
            idx.name = None
        series = series.set_index(idx)
    return series
def compute_result_index(self, key_columns, value_columns):
    """Computes the index of the result."""
    key_names = self.key_names
    # One key column -> a flat index carrying that key's name.
    if len(key_columns) == 1:
        return cudf.core.index.as_index(key_columns[0],
                                        name=key_names[0])
    # Fully-empty aggregation -> a plain empty object index.
    if len(value_columns) == 0 and all(
            len(col) == 0 for col in key_columns):
        return cudf.core.index.GenericIndex(
            cudf.Series([], dtype="object"))
    # Otherwise build a MultiIndex from the key columns.
    return MultiIndex(
        source_data=dataframe_from_columns(key_columns,
                                           columns=key_names),
        names=key_names,
    )
def apply_multicolumn(self, result, aggs):
    """Build the column labels for a multi-agg groupby result.

    Flattens column names when there is a single agg or a single
    groupby key; otherwise reorders columns to match pandas and
    installs a (value-column, agg) MultiIndex.
    """
    # multicolumn only applies with multiple aggs and multiple groupby keys
    if len(aggs) == 1 or len(self._by) == 1:
        # unprefix columns: 'sum_z' -> 'sum' (single key, non-empty)
        # or 'z' (otherwise)
        new_cols = []
        for c in result.columns:
            if len(self._by) == 1 and len(result) != 0:
                new_col = c.split('_')[0]
            else:
                new_col = c.split('_')[1]
            new_cols.append(new_col)
        result.columns = new_cols
    else:
        # reorder our columns to match pandas: group by value column
        # first, then by agg
        if len(self._val_columns) > 1:
            col_dfs = [DataFrame() for _ in self._val_columns]
            for agg in aggs:
                for i, col in enumerate(self._val_columns):
                    col_dfs[i][agg + '_' + col] = result[agg + '_' + col]
            result = DataFrame(index=result.index)
            for i, col in enumerate(self._val_columns):
                for agg in aggs:
                    result[agg + '_' + col] = col_dfs[i][agg + '_' + col]
        levels = [self._val_columns, aggs]
        # The first code level repeats each value-column position
        # len(aggs) times; the second cycles through the agg positions
        # once per value column. e.g. for [['x', 'z'], ['sum', 'min']]:
        # codes == [[0, 0, 1, 1], [0, 1, 0, 1]]
        codes = []
        first_codes = [len(aggs) * [d]
                       for d in range(len(self._val_columns))]
        codes.append([item for sublist in first_codes
                      for item in sublist])
        # Fix: previously hard-coded `len(self._val_columns) * [0, 1]`,
        # which is only correct when exactly two aggs are requested.
        codes.append(len(self._val_columns) * list(range(len(aggs))))
        result.columns = MultiIndex(levels, codes)
    return result
def compute_result_column_index(self):
    """Computes the column index of the result."""
    value_names = self.value_names
    aggs_as_list = self.get_aggs_as_list()

    if isinstance(self.obj, cudf.Series):
        # Series input: single agg keeps the series' name (or None);
        # multiple aggs become the column labels.
        if len(aggs_as_list) != 1:
            return aggs_as_list
        return None if self.obj.name is None else [self.obj.name]

    # DataFrame input: one agg per value column -> flat names,
    # otherwise a (column, agg) MultiIndex.
    if len(aggs_as_list) == len(self.aggs):
        return value_names
    return MultiIndex.from_tuples(zip(value_names, aggs_as_list))
def apply_multicolumn_mapped(self, result, aggs):
    """Set result columns from a {column: agg(s)} mapping.

    When every mapped aggregation has length 1, the mapping's keys
    are used directly as column names; otherwise a (column, agg)
    MultiIndex is built.
    """
    def _as_list(value):
        # a bare string counts as a single aggregation
        return [value] if isinstance(value, str) else list(value)

    if all(len(_as_list(v)) == 1 for v in aggs.values()):
        result.columns = aggs.keys()
    else:
        pairs = [(key, agg)
                 for key in aggs.keys()
                 for agg in _as_list(aggs[key])]
        result.columns = MultiIndex.from_tuples(pairs)
    return result
def _getitem_tuple_arg(self, arg):
    """Resolve a .loc-style (row, column) selection on the wrapped frame.

    Selects the requested columns, gathers the matching rows (handling
    MultiIndex lookups, label slices, boolean masks, and scalar/array
    labels), fixes up the index of single-row results, and finally
    downcasts to a Series when the selection allows it.
    """
    from uuid import uuid4
    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.dataframe import DataFrame
    from cudf.core.index import as_index

    # Step 1: Gather columns
    if isinstance(arg, tuple):
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index
    else:
        columns_df = self._df

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg, (MultiIndex, pd.MultiIndex)):
            # a MultiIndex selector names whole rows; gather by label
            if isinstance(arg, pd.MultiIndex):
                arg = MultiIndex.from_pandas(arg)
            indices = indices_from_labels(columns_df, arg)
            return columns_df.take(indices)
        else:
            # delegate partial-key lookups to the MultiIndex itself
            if isinstance(arg, tuple):
                return columns_df.index._get_row_major(columns_df, arg[0])
            else:
                return columns_df.index._get_row_major(columns_df, arg)
    else:
        if isinstance(arg[0], slice):
            # label slice: may resolve to a positional slice or a mask
            out = get_label_range_or_mask(
                columns_df.index, arg[0].start, arg[0].stop, arg[0].step
            )
            if isinstance(out, slice):
                df = columns_df._slice(out)
            else:
                df = columns_df._apply_boolean_mask(out)
        else:
            tmp_arg = arg
            if is_scalar(arg[0]):
                # If a scalar, there is possibility of having duplicates.
                # Join would get all the duplicates. So, converting it to
                # an array kind.
                tmp_arg = ([tmp_arg[0]], tmp_arg[1])
            if len(tmp_arg[0]) == 0:
                return columns_df._empty_like(keep_index=True)
            tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])
            if pd.api.types.is_bool_dtype(tmp_arg[0]):
                df = columns_df._apply_boolean_mask(tmp_arg[0])
            else:
                # label lookup via an inner join against a helper frame
                # keyed by the requested labels; the uuid column records
                # the caller's requested order
                tmp_col_name = str(uuid4())
                other_df = DataFrame(
                    {tmp_col_name: column.arange(len(tmp_arg[0]))},
                    index=as_index(tmp_arg[0]),
                )
                df = other_df.join(columns_df, how="inner")
                # as join is not assigning any names to index,
                # update it over here
                df.index.name = columns_df.index.name
                # restore the caller's requested row order
                df = df.sort_values(tmp_col_name)
                df.drop(columns=[tmp_col_name], inplace=True)
                # There were no indices found
                if len(df) == 0:
                    raise KeyError(arg)

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)
    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def repeat(self, repeats, axis=None):
    """Repeat elements of this MultiIndex.

    Parameters
    ----------
    repeats : int or array of ints
        The number of repetitions for each element.
    axis : None or 0
        Must be None or 0; kept for pandas API compatibility.

    Raises
    ------
    ValueError
        If axis is anything other than None or 0.
    """
    # Fix: validate with an explicit error rather than `assert`,
    # which is stripped when Python runs with -O.
    if axis not in (None, 0):
        raise ValueError("axis must be None or 0")
    return MultiIndex.from_frame(self._source_data.repeat(repeats),
                                 names=self.names)
def unique(self):
    """Return a MultiIndex with duplicate rows removed."""
    deduplicated = self._source_data.drop_duplicates()
    return MultiIndex.from_frame(deduplicated)
def apply_multiindex_or_single_index(self, result):
    """Attach the groupby-key index to `result`.

    Handles empty results, single-key flat indexes, and multi-key
    MultiIndexes, while stripping the internal 'cudfvalcol+' prefix
    from column names.
    """
    if len(result) == 0:
        # Empty result: keep only non-key columns and synthesize an
        # empty index of the right kind.
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            if len(self._by) == 1:
                # NOTE(review): this assigns a *column* of self._df as
                # the dtype (sibling implementations use 'float64') —
                # verify Series([], dtype=...) accepts this.
                dtype = self._df[self._by[0]]
            else:
                dtype = 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        return final_result
    if len(self._by) == 1:
        # Single key: promote that column to a flat index, un-prefixing
        # 'cudfvalcol+<name>' markers along the way.
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        name = self._by[0]
        if isinstance(name, str):
            name = self._by[0].split('+')
            if name[0] == 'cudfvalcol':
                idx.name = name[1]
            else:
                idx.name = name[0]
        result = result.drop(self._by[0])
        for col in result.columns:
            if isinstance(col, str):
                colnames = col.split('+')
                if colnames[0] == 'cudfvalcol':
                    # rename marked value columns to their bare names
                    result[colnames[1]] = result[col]
                    result = result.drop(col)
        if idx.name == _LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        # Multiple keys: un-prefix marked columns, rewrite self._by to
        # the bare names, then build a MultiIndex from the key columns.
        for col in result.columns:
            if isinstance(col, str):
                colnames = col.split('+')
                if colnames[0] == 'cudfvalcol':
                    result[colnames[1]] = result[col]
                    result = result.drop(col)
        new_by = []
        for by in self._by:
            # NOTE(review): this tests `col` (leftover from the loop
            # above), not `by` — looks like it should be
            # `isinstance(by, str)`; confirm before relying on it.
            if isinstance(col, str):
                splitby = by.split('+')
                if splitby[0] == 'cudfvalcol':
                    new_by.append(splitby[1])
                else:
                    new_by.append(splitby[0])
            else:
                new_by.append(by)
        self._by = new_by
        multi_index = MultiIndex(source_data=result[self._by])
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(final_result.columns) > 0:
            return final_result.set_index(multi_index)
        else:
            return result.set_index(multi_index)
def apply_multiindex_or_single_index(self, result):
    """Attach the groupby-key index to `result`.

    A single key becomes a flat index; multiple keys become a
    MultiIndex whose codes are recomputed from the key columns.
    Single-column attribute-access results (`_gotattr`) are downcast
    to a Series.
    """
    if len(result) == 0:
        # Empty result: keep only non-key columns and synthesize an
        # empty index of the right kind.
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            dtype = 'float64' if len(self._by) == 1 else 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            # all-empty MultiIndex, one empty level/code list per key
            levels = []
            codes = []
            names = []
            for by in self._by:
                levels.append([])
                codes.append([])
                names.append(by)
            mi = MultiIndex(levels, codes)
            mi.names = names
            final_result.index = mi
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            # attribute-access groupby on one column -> return a Series
            final_series = Series([], name=final_result.columns[0])
            final_series.index = final_result.index
            return final_series
        return final_result
    if len(self._by) == 1:
        # Single key: promote that column to a flat index.
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result = result.drop(idx.name)
        if idx.name == self._LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        levels = []
        codes = DataFrame()
        names = []
        # Note: This is an O(N^2) solution using gpu masking
        # to compute new codes for the MultiIndex. There may be
        # a faster solution that could be executed on gpu at the same
        # time the groupby is calculated.
        for by in self._by:
            level = result[by].unique()
            # map each key value to its position within its level
            replaced = result[by].replace(level, range(len(level)))
            levels.append(level)
            codes[by] = Series(replaced, dtype="int32")
            names.append(by)
        multi_index = MultiIndex(levels=levels, codes=codes, names=names)
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
            final_series = Series(final_result[final_result.columns[0]])
            final_series.name = final_result.columns[0]
            final_series.index = multi_index
            return final_series
        return final_result.set_index(multi_index)