def copy(self, deep=True):
    """Return a new MultiIndex that duplicates this one.

    Parameters
    ----------
    deep : bool, default True
        Forwarded unchanged to the ``copy`` call on the source data, on
        every level and on the codes.

    Returns
    -------
    MultiIndex
        Built from a copy of ``self._source_data``. Levels, codes and
        names are carried over only when they are not ``None`` here.
    """
    duplicate = MultiIndex(source_data=self._source_data.copy(deep))

    levels = self._levels
    if levels is not None:
        duplicate._levels = [level.copy(deep) for level in levels]

    codes = self._codes
    if codes is not None:
        duplicate._codes = codes.copy(deep)

    names = self.names
    if names is not None:
        # ``names`` is copied with a plain ``copy()``; ``deep`` is not used.
        duplicate.names = names.copy()

    return duplicate
def _popn(self, n):
    """Return a new index with the left-most ``n`` entries dropped.

    The result is built from the columns of ``self._source_data`` starting
    at position ``n``. When names are set, the first ``n`` names are
    dropped as well so they stay aligned with the remaining columns.
    """
    trimmed = MultiIndex(source_data=self._source_data.iloc[:, n:])
    if self.names is None:
        return trimmed
    trimmed.names = self.names[n:]
    return trimmed
def copy(self, deep=True):
    """Return a new MultiIndex that duplicates this one.

    Two construction paths exist:

    * no ``_source_data`` attribute: the new index is built from
      ``self.levels.copy()`` and ``self.codes.copy(deep)``;
    * ``_source_data`` present: the new index is built around the same
      ``_source_data`` object (it is passed through, not copied), then
      ``_levels`` / ``_codes`` are copied across when they are not ``None``.

    ``deep`` is forwarded to the codes copy only. Names are copied with a
    plain ``copy()`` whenever they are not ``None``.
    """
    if not hasattr(self, '_source_data'):
        duplicate = MultiIndex(self.levels.copy(), self.codes.copy(deep))
    else:
        duplicate = MultiIndex(source_data=self._source_data)
        cached_levels = self._levels
        if cached_levels is not None:
            duplicate._levels = cached_levels.copy()
        cached_codes = self._codes
        if cached_codes is not None:
            duplicate._codes = cached_codes.copy(deep)

    names = self.names
    if names is not None:
        duplicate.names = names.copy()
    return duplicate
def _popn(self, n):
    """Return a new index without the left-most ``n`` levels.

    The first ``n`` levels, code columns and (when set) names are dropped
    to build a new index for results.

    Parameters
    ----------
    n : int
        Number of leading levels to remove.

    Returns
    -------
    MultiIndex
        Built from ``self.levels[n:]`` and the code columns from position
        ``n`` onwards.
    """
    from cudf import DataFrame

    # Rebuild the codes frame column by column from the retained columns.
    codes = DataFrame()
    for idx in self.codes.columns[n:]:
        codes.add_column(idx, self.codes[idx])

    result = MultiIndex(self.levels[n:], codes)
    # Slicing ``None`` raises TypeError, so only forward names that are
    # actually set; otherwise the new index keeps whatever names the
    # constructor gave it.
    if self.names is not None:
        result.names = self.names[n:]
    return result
def apply_multiindex_or_single_index(self, result):
    """Move the group-by key columns of ``result`` into its index.

    ``self._by`` lists the key columns. With one key the result gets a
    single-level index; with several keys it gets a ``MultiIndex`` built
    from the key columns. When exactly one non-key column remains and this
    object has a ``_gotattr`` attribute, a ``Series`` is returned instead
    of a ``DataFrame`` (except in the non-empty single-key path, which
    always returns the re-indexed frame).

    An empty ``result`` is handled separately: the returned object carries
    an empty index (``float64`` for a single key, ``object`` when there are
    no value columns) or a ``MultiIndex`` built from the empty key columns.
    """
    def _without_key_columns(frame):
        # Collect every column of ``frame`` that is not a group-by key.
        values = DataFrame()
        for label in frame.columns:
            if label in self._by:
                continue
            values[label] = frame[label]
        return values

    single_key = len(self._by) == 1

    if len(result) == 0:
        final_result = _without_key_columns(result)
        if single_key or len(final_result.columns) == 0:
            if single_key:
                dtype, name = 'float64', self._by[0]
            else:
                dtype, name = 'object', None
            from cudf.dataframe.index import GenericIndex
            empty_index = GenericIndex(Series([], dtype=dtype))
            empty_index.name = name
            final_result.index = empty_index
        else:
            key_index = MultiIndex(source_data=result[self._by])
            key_index.names = self._by
            final_result.index = key_index

        if len(final_result.columns) != 1 or not hasattr(self, "_gotattr"):
            return final_result
        final_series = Series([], name=final_result.columns[0])
        final_series.index = final_result.index
        return final_series

    if single_key:
        from cudf.dataframe import index
        key = self._by[0]
        idx = index.as_index(result[key])
        idx.name = key
        result = result.drop(idx.name)
        # A key carrying the placeholder level-0 name is given back the
        # original index name.
        if idx.name == self._LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        return result.set_index(idx)

    multi_index = MultiIndex(source_data=result[self._by])
    final_result = _without_key_columns(result)
    if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
        only_column = final_result.columns[0]
        final_series = Series(final_result[only_column])
        final_series.name = only_column
        final_series.index = multi_index
        return final_series
    return final_result.set_index(multi_index)
def take(self, indices):
    """Return a new MultiIndex holding the entries selected by ``indices``.

    ``indices`` is normalised first:

    * an integer or a sequence is wrapped with ``np.array``;
    * a ``Series`` is converted with ``to_gpu_array()``;
    * a ``slice`` is expanded to an explicit range over ``len(self)``;
    * anything else is passed through untouched.

    The result is built from the taken source data when this index has a
    ``_source_data`` attribute, otherwise from the existing levels and the
    taken codes. Names are assigned from this index as-is.
    """
    from collections.abc import Sequence
    from numbers import Integral

    from cudf import Series

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        indices = indices.to_gpu_array()
    elif isinstance(indices, slice):
        # The helper returns four values; the last one is not needed here.
        start, stop, step, _ = utils.standard_python_slice(
            len(self), indices
        )
        indices = cudautils.arange(start, stop, step)

    if not hasattr(self, '_source_data'):
        taken_codes = self.codes.take(indices)
        result = MultiIndex(self.levels, taken_codes)
    else:
        taken_rows = self._source_data.take(indices)
        result = MultiIndex(source_data=taken_rows)

    result.names = self.names
    return result
def take(self, indices):
    """Return a new MultiIndex holding the entries selected by ``indices``.

    ``indices`` is normalised first:

    * an integer or a sequence is wrapped with ``np.array``;
    * a ``Series`` is converted with ``to_gpu_array()``;
    * a ``slice`` is resolved against ``len(self)`` and expanded to an
      explicit range;
    * anything else is passed through untouched.

    The result is built from the taken source data. Cached codes are taken
    with the same indices, cached levels are reused unchanged, and names
    are assigned from this index as-is.
    """
    from collections.abc import Sequence
    from numbers import Integral

    from cudf import Series

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        indices = indices.to_gpu_array()
    elif isinstance(indices, slice):
        # slice.indices() yields the (start, stop, step) triple.
        indices = cudautils.arange(*indices.indices(len(self)))

    taken_rows = self._source_data.take(indices)
    result = MultiIndex(source_data=taken_rows)

    cached_codes = self._codes
    if cached_codes is not None:
        result._codes = cached_codes.take(indices)

    cached_levels = self._levels
    if cached_levels is not None:
        result._levels = cached_levels

    result.names = self.names
    return result
def take(self, indices):
    """Return a new MultiIndex holding the entries selected by ``indices``.

    ``indices`` is normalised first:

    * an integer or a sequence is wrapped with ``np.array``;
    * a ``Series`` must contain no nulls and is replaced by its
      ``data.mem`` buffer;
    * a ``slice`` is resolved against ``len(self)`` and expanded to an
      explicit range;
    * anything else is passed through untouched.

    The result is built from the taken source data. Cached codes are taken
    with the same indices, cached levels are reused unchanged, and names
    are assigned from this index as-is.

    Raises
    ------
    ValueError
        If ``indices`` is a ``Series`` with a non-zero ``null_count``.
    """
    from collections.abc import Sequence
    from numbers import Integral

    from cudf import Series

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        has_nulls = indices.null_count != 0
        if has_nulls:
            raise ValueError("Column must have no nulls.")
        indices = indices.data.mem
    elif isinstance(indices, slice):
        # slice.indices() yields the (start, stop, step) triple.
        bounds = indices.indices(len(self))
        indices = cudautils.arange(*bounds)

    result = MultiIndex(source_data=self._source_data.take(indices))

    codes = self._codes
    if codes is not None:
        result._codes = codes.take(indices)

    levels = self._levels
    if levels is not None:
        result._levels = levels

    result.names = self.names
    return result
def apply_multiindex_or_single_index(self, result):
    """Move the group-by key columns of ``result`` into its index.

    ``self._by`` lists the key columns. String labels of the form
    ``'cudfvalcol+<name>'`` are treated as temporary names: the part after
    the ``'+'`` is restored as the real label, both for value columns and
    for keys.

    Parameters
    ----------
    result : DataFrame
        Aggregation output that still holds the key columns as ordinary
        columns.

    Returns
    -------
    DataFrame
        ``result`` re-indexed by a single index (one key) or by a
        ``MultiIndex`` built from the key columns (several keys).

    Notes
    -----
    In the non-empty multi-key path ``self._by`` is replaced by the list
    of restored key names.
    """
    if len(result) == 0:
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        if len(self._by) == 1 or len(final_result.columns) == 0:
            if len(self._by) == 1:
                # Use the key column's dtype explicitly instead of passing
                # the column object itself as ``dtype``.
                dtype = self._df[self._by[0]].dtype
            else:
                dtype = 'object'
            name = self._by[0] if len(self._by) == 1 else None
            from cudf.dataframe.index import GenericIndex
            index = GenericIndex(Series([], dtype=dtype))
            index.name = name
            final_result.index = index
        else:
            mi = MultiIndex(source_data=result[self._by])
            mi.names = self._by
            final_result.index = mi
        return final_result

    if len(self._by) == 1:
        from cudf.dataframe import index
        idx = index.as_index(result[self._by[0]])
        name = self._by[0]
        if isinstance(name, str):
            # Strip the temporary 'cudfvalcol+' prefix from the key name.
            name = self._by[0].split('+')
            if name[0] == 'cudfvalcol':
                idx.name = name[1]
            else:
                idx.name = name[0]
        result = result.drop(self._by[0])
        # Restore the real label of every prefixed value column.
        for col in result.columns:
            if isinstance(col, str):
                colnames = col.split('+')
                if colnames[0] == 'cudfvalcol':
                    result[colnames[1]] = result[col]
                    result = result.drop(col)
        if idx.name == _LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        result = result.set_index(idx)
        return result
    else:
        # Restore the real label of every prefixed value column.
        for col in result.columns:
            if isinstance(col, str):
                colnames = col.split('+')
                if colnames[0] == 'cudfvalcol':
                    result[colnames[1]] = result[col]
                    result = result.drop(col)
        new_by = []
        for by in self._by:
            # Test the key itself, not the leftover ``col`` from the loop
            # above: only string keys can carry the prefix, and non-string
            # keys have no ``split``.
            if isinstance(by, str):
                splitby = by.split('+')
                if splitby[0] == 'cudfvalcol':
                    new_by.append(splitby[1])
                else:
                    new_by.append(splitby[0])
            else:
                new_by.append(by)
        self._by = new_by
        multi_index = MultiIndex(source_data=result[self._by])
        final_result = DataFrame()
        for col in result.columns:
            if col not in self._by:
                final_result[col] = result[col]
        # With no value columns left, index the full result so the key
        # columns remain visible.
        if len(final_result.columns) > 0:
            return final_result.set_index(multi_index)
        else:
            return result.set_index(multi_index)
def apply_multiindex_or_single_index(self, result):
    """Move the group-by key columns of ``result`` into its index.

    ``self._by`` lists the key columns. With one key the result gets a
    single-level index; with several keys a ``MultiIndex`` is assembled
    from per-key levels (the unique values) and integer codes. When exactly
    one non-key column remains and this object has a ``_gotattr``
    attribute, a ``Series`` is returned instead of a ``DataFrame`` (except
    in the non-empty single-key path, which always returns the re-indexed
    frame).

    An empty ``result`` yields an empty index: ``float64`` for a single
    key, ``object`` when there are no value columns, otherwise a
    ``MultiIndex`` with one empty level and one empty code list per key.
    """
    key_count = len(self._by)

    if len(result) == 0:
        final_result = DataFrame()
        for col in result.columns:
            if col in self._by:
                continue
            final_result[col] = result[col]

        if key_count == 1 or len(final_result.columns) == 0:
            if key_count == 1:
                dtype, name = 'float64', self._by[0]
            else:
                dtype, name = 'object', None
            from cudf.dataframe.index import GenericIndex
            empty_index = GenericIndex(Series([], dtype=dtype))
            empty_index.name = name
            final_result.index = empty_index
        else:
            levels = [[] for _ in self._by]
            codes = [[] for _ in self._by]
            names = list(self._by)
            empty_mi = MultiIndex(levels, codes)
            empty_mi.names = names
            final_result.index = empty_mi

        if len(final_result.columns) != 1 or not hasattr(self, "_gotattr"):
            return final_result
        final_series = Series([], name=final_result.columns[0])
        final_series.index = final_result.index
        return final_series

    if key_count == 1:
        from cudf.dataframe import index
        key = self._by[0]
        idx = index.as_index(result[key])
        idx.name = key
        result = result.drop(idx.name)
        # A key carrying the placeholder level-0 name is given back the
        # original index name.
        if idx.name == self._LEVEL_0_INDEX_NAME:
            idx.name = self._original_index_name
        return result.set_index(idx)

    levels = []
    codes = DataFrame()
    names = []
    # Note: This is an O(N^2) solution using gpu masking to compute new
    # codes for the MultiIndex. There may be a faster solution that could
    # be executed on gpu at the same time the groupby is calculated.
    for by in self._by:
        uniques = result[by].unique()
        positions = result[by].replace(uniques, range(len(uniques)))
        levels.append(uniques)
        codes[by] = Series(positions, dtype="int32")
        names.append(by)
    multi_index = MultiIndex(levels=levels, codes=codes, names=names)

    final_result = DataFrame()
    for col in result.columns:
        if col in self._by:
            continue
        final_result[col] = result[col]

    if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
        only_column = final_result.columns[0]
        final_series = Series(final_result[only_column])
        final_series.name = only_column
        final_series.index = multi_index
        return final_series
    return final_result.set_index(multi_index)