def apply_multiindex_or_single_index(self, result):
    """Move the group-key columns of ``result`` into its index.

    The columns named in ``self._by`` are taken out of ``result`` and
    used as the index: a single index (via ``as_index``) when there is
    one key, a ``MultiIndex`` built from the key columns otherwise.

    When exactly one non-key column remains and ``self`` has a
    ``_gotattr`` attribute, a ``Series`` is returned instead of a
    ``DataFrame``.
    """
    keys = self._by
    single_key = len(keys) == 1

    if len(result) == 0:
        # Empty input: keep only the non-key columns and attach an
        # empty index of the appropriate kind.
        values = DataFrame()
        for column in result.columns:
            if column not in keys:
                values[column] = result[column]

        if single_key or len(values.columns) == 0:
            if single_key:
                index_dtype, index_name = 'float64', keys[0]
            else:
                index_dtype, index_name = 'object', None
            from cudf.dataframe.index import GenericIndex

            empty_index = GenericIndex(Series([], dtype=index_dtype))
            empty_index.name = index_name
            values.index = empty_index
        else:
            key_index = MultiIndex(source_data=result[keys])
            key_index.names = keys
            values.index = key_index

        if len(values.columns) == 1 and hasattr(self, "_gotattr"):
            empty_series = Series([], name=values.columns[0])
            empty_series.index = values.index
            return empty_series
        return values

    if single_key:
        from cudf.dataframe import index as index_module

        key_index = index_module.as_index(result[keys[0]])
        key_index.name = keys[0]
        result = result.drop(key_index.name)
        # The placeholder level-0 name is swapped back for the name the
        # original index carried.
        if key_index.name == self._LEVEL_0_INDEX_NAME:
            key_index.name = self._original_index_name
        return result.set_index(key_index)

    key_index = MultiIndex(source_data=result[keys])
    values = DataFrame()
    for column in result.columns:
        if column not in keys:
            values[column] = result[column]

    if len(values.columns) == 1 and hasattr(self, "_gotattr"):
        only_column = values.columns[0]
        single = Series(values[only_column])
        single.name = only_column
        single.index = key_index
        return single
    return values.set_index(key_index)
def apply_multiindex_or_single_index(self, result):
    """Move the group-key columns of ``result`` into its index.

    The columns named in ``self._by`` are taken out of ``result`` and
    used as the index: a single index (via ``as_index``) when there is
    one key, otherwise a ``MultiIndex`` assembled from per-key levels
    and integer codes.

    When exactly one non-key column remains and ``self`` has a
    ``_gotattr`` attribute, a ``Series`` is returned instead of a
    ``DataFrame``.
    """
    keys = self._by
    single_key = len(keys) == 1

    if len(result) == 0:
        # Empty input: keep only the non-key columns and attach an
        # empty index of the appropriate kind.
        values = DataFrame()
        for column in result.columns:
            if column not in keys:
                values[column] = result[column]

        if single_key or len(values.columns) == 0:
            if single_key:
                index_dtype, index_name = 'float64', keys[0]
            else:
                index_dtype, index_name = 'object', None
            from cudf.dataframe.index import GenericIndex

            empty_index = GenericIndex(Series([], dtype=index_dtype))
            empty_index.name = index_name
            values.index = empty_index
        else:
            # One empty level list and one empty code list per key.
            empty_levels = [[] for _ in keys]
            empty_codes = [[] for _ in keys]
            key_index = MultiIndex(empty_levels, empty_codes)
            key_index.names = list(keys)
            values.index = key_index

        if len(values.columns) == 1 and hasattr(self, "_gotattr"):
            empty_series = Series([], name=values.columns[0])
            empty_series.index = values.index
            return empty_series
        return values

    if single_key:
        from cudf.dataframe import index as index_module

        key_index = index_module.as_index(result[keys[0]])
        key_index.name = keys[0]
        result = result.drop(key_index.name)
        # The placeholder level-0 name is swapped back for the name the
        # original index carried.
        if key_index.name == self._LEVEL_0_INDEX_NAME:
            key_index.name = self._original_index_name
        return result.set_index(key_index)

    level_values = []
    level_names = []
    level_codes = DataFrame()
    # Note carried over from the original author: this is described as
    # an O(N^2) approach that uses gpu masking to compute the new codes
    # for the MultiIndex; a faster approach computed on the gpu
    # alongside the groupby itself may exist.
    for key in keys:
        uniques = result[key].unique()
        # Replace each key value with the position of that value in
        # the key's list of unique values.
        positions = result[key].replace(uniques, range(len(uniques)))
        level_values.append(uniques)
        level_codes[key] = Series(positions, dtype="int32")
        level_names.append(key)
    key_index = MultiIndex(
        levels=level_values, codes=level_codes, names=level_names
    )

    values = DataFrame()
    for column in result.columns:
        if column not in keys:
            values[column] = result[column]

    if len(values.columns) == 1 and hasattr(self, "_gotattr"):
        only_column = values.columns[0]
        single = Series(values[only_column])
        single.name = only_column
        single.index = key_index
        return single
    return values.set_index(key_index)
def _get_row_major(self, df, row_tuple):
    """Take the rows of ``df`` selected by ``row_tuple`` and fix up the index.

    ``row_tuple[0]`` decides how rows are chosen:

    * a number is used directly as the row position;
    * a slice is expanded with ``cudautils.arange`` (a slice whose
      ``stop`` is 0 selects nothing);
    * anything else is resolved by ``self._compute_validity_mask``.

    The index of the returned object is then reduced to the levels of
    ``df.index`` that were not consumed by the key.
    """
    first_key = row_tuple[0]
    slice_access = False
    if isinstance(first_key, numbers.Number):
        valid_indices = first_key
    elif isinstance(first_key, slice):
        if first_key.stop == 0:
            # Empty slice: nothing to take.
            valid_indices = []
        else:
            slice_access = True
            # Missing slice fields fall back to 0 / len(df) / 1.
            valid_indices = cudautils.arange(
                first_key.start or 0,
                first_key.stop or len(df),
                first_key.step or 1,
            )
    else:
        valid_indices = self._compute_validity_mask(df, row_tuple)

    from cudf import Series
    result = df.take(Series(valid_indices))

    # Collect the code columns of the last n-k levels, where n is the
    # number of source levels and k is the length of the indexing tuple
    # (k is 0 for a plain number or slice).
    from cudf import DataFrame
    out_index = DataFrame()
    if isinstance(first_key, (numbers.Number, slice)):
        size = 0
    else:
        size = len(row_tuple)
    for level_pos in range(size, len(df.index.levels)):
        out_index.add_column(
            df.index.names[level_pos],
            df.index.codes[df.index.codes.columns[level_pos]],
        )

    remaining = len(out_index.columns)
    if remaining == 1:
        # A single leftover level becomes a flat index: look up the
        # label behind each code of the last level.
        last_codes = result.index.codes[
            result.index.codes.columns[len(result.index.codes.columns) - 1]
        ]
        out_index = as_index([
            result.index.levels[len(result.index.codes.columns) - 1][val]
            for val in last_codes
        ])
        out_index.name = result.index.names[len(result.index.names) - 1]
        result.index = out_index
    elif len(result) == 1 and size == 0 and not slice_access:
        # Exactly one row selected by a bare number: transpose it into
        # a Series whose name is the tuple of that row's index labels.
        result = result.T
        result = result[result.columns[0]]
        series_name = [
            result.columns.levels[idx][result.columns.codes[code][0]]
            for idx, code in enumerate(result.columns.codes)
        ]
        result = Series(list(result._cols.values())[0], name=series_name)
        result.name = tuple(series_name)
    elif remaining > 0:
        # Several levels left: drop the leftmost ``size`` levels from
        # the index.
        # NOTE(review): the return value of reset_index is discarded;
        # presumably this call has no effect unless reset_index mutates
        # in place -- confirm before removing or assigning it.
        result.reset_index(drop=True)
        result.index = result.index._popn(size)
    return result
def _index_and_downcast(self, result, index, index_key):
    """Attach the appropriate index to ``result`` after a MultiIndex lookup.

    ``index_key`` is the key used for the lookup and ``index`` is the
    MultiIndex it was applied to. Depending on how many index columns
    the key leaves over and how many rows ``result`` has, the result is
    either downcast to a ``Series`` named with a tuple, or given a
    reduced index. ``set_index`` is only applied when the normalised
    key is a tuple.
    """
    from cudf import DataFrame
    from cudf import Series

    # Normalise the key: wrap scalars/slices, then unwrap anything that
    # is not a tuple (or whose first element is a slice).
    if isinstance(index_key, (numbers.Number, slice)):
        index_key = [index_key]
    unwrap = len(index_key) > 0 and not isinstance(index_key, tuple)
    if unwrap or isinstance(index_key[0], slice):
        index_key = index_key[0]
    slice_access = isinstance(index_key, slice)

    # Collect the last n-k columns of the index source data, where n is
    # the number of source columns and k is the length of the indexing
    # tuple (k is 0 for a plain number or slice).
    if isinstance(index_key, (numbers.Number, slice)):
        size = 0
    else:
        size = len(index_key)
    source = index._source_data
    out_index = DataFrame()
    for position in range(size, len(source.columns)):
        out_index.add_column(
            index.names[position],
            source[source.columns[position]],
        )

    if len(result) == 1 and size == 0 and not slice_access:
        # Exactly one row that was not selected through a tuple:
        # transpose it into a Series whose name is a tuple.
        result = result.T
        result = result[result.columns[0]]
        series_name = [
            result.columns._source_data[code][0] for code in source.columns
        ]
        result = Series(list(result._cols.values())[0], index=result.index)
        result.name = tuple(series_name)
    elif len(result) == 0 and not slice_access:
        # No rows matched: return an empty Series named with a tuple
        # (per the original comment, this mirrors what pandas returns).
        series_name = [source[code][0] for code in source.columns]
        result = Series([])
        result.name = tuple(series_name)
    elif len(out_index.columns) == 1:
        # One index column left: turn the last source column into a
        # flat index carrying the last level's name.
        out_index = as_index(source[source.columns[-1]])
        out_index.name = index.names[len(index.names) - 1]
        index = out_index
    elif len(out_index.columns) > 1:
        # Several columns left: drop the leftmost ``size`` levels.
        # NOTE(review): the return value of reset_index is discarded;
        # presumably this call has no effect unless reset_index mutates
        # in place -- confirm before removing or assigning it.
        result.reset_index(drop=True)
        index = index._popn(size)

    if isinstance(index_key, tuple):
        result = result.set_index(index)
    return result