コード例 #1
0
ファイル: groupby.py プロジェクト: jimmytuc/cudf
 def apply_multiindex_or_single_index(self, result):
     """Re-index a raw groupby result by its grouping key column(s).

     The key columns named in ``self._by`` are moved out of the data and
     into the index: a flat index when there is a single key, a
     ``MultiIndex`` built from the key columns otherwise. When exactly one
     value column remains and this object has a ``_gotattr`` attribute, a
     Series is returned instead of a DataFrame.

     Parameters
     ----------
     result : DataFrame
         Frame that still carries the grouping keys as ordinary columns.

     Returns
     -------
     DataFrame or Series
     """
     keys = self._by
     one_key = len(keys) == 1

     def _without_keys():
         # Copy every non-key column of ``result`` into a fresh frame.
         stripped = DataFrame()
         for label in result.columns:
             if label not in keys:
                 stripped[label] = result[label]
         return stripped

     def _wants_series(frame):
         # Collapse to a Series only for a lone value column on an object
         # that carries the ``_gotattr`` marker.
         return len(frame.columns) == 1 and hasattr(self, "_gotattr")

     # --- Empty result: only an (empty) index has to be attached. ---
     if len(result) == 0:
         values = _without_keys()
         if one_key or len(values.columns) == 0:
             from cudf.dataframe.index import GenericIndex
             if one_key:
                 empty_dtype, index_name = 'float64', keys[0]
             else:
                 empty_dtype, index_name = 'object', None
             empty_index = GenericIndex(Series([], dtype=empty_dtype))
             empty_index.name = index_name
             values.index = empty_index
         else:
             empty_multi = MultiIndex(source_data=result[keys])
             empty_multi.names = keys
             values.index = empty_multi
         if not _wants_series(values):
             return values
         empty_series = Series([], name=values.columns[0])
         empty_series.index = values.index
         return empty_series

     # --- Single grouping key: the key column becomes a flat index. ---
     if one_key:
         from cudf.dataframe import index as index_module
         key_index = index_module.as_index(result[keys[0]])
         key_index.name = keys[0]
         trimmed = result.drop(key_index.name)
         # The placeholder level-0 name is swapped back for the name the
         # original index carried.
         if key_index.name == self._LEVEL_0_INDEX_NAME:
             key_index.name = self._original_index_name
         return trimmed.set_index(key_index)

     # --- Several grouping keys: the key columns become a MultiIndex. ---
     grouped_index = MultiIndex(source_data=result[keys])
     values = _without_keys()
     if _wants_series(values):
         only_label = values.columns[0]
         lone_series = Series(values[only_label])
         lone_series.name = only_label
         lone_series.index = grouped_index
         return lone_series
     return values.set_index(grouped_index)
コード例 #2
0
ファイル: indexing.py プロジェクト: zeichuan/cudf
    def _getitem_tuple_arg(self, arg):
        """Resolve a positional ``(rows, columns)`` tuple key on ``self._df``.

        ``arg[1]`` selects the columns and ``arg[0]`` selects the rows.
        Depending on the key types and on the shape of the selection, the
        return value is a DataFrame, a Series (built directly or through
        ``_downcast_to_series``), or a single element.
        """
        from cudf import MultiIndex
        from cudf.dataframe.dataframe import DataFrame
        from cudf.dataframe.dataframe import Series
        from cudf.dataframe.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            # MultiIndex columns: the column selection is delegated to the
            # MultiIndex itself.
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (
                len(columns_df) == 0
                and len(columns_df.columns) == 0
                and not isinstance(arg[0], slice)
            ):
                # Nothing was selected and the row key is not a slice:
                # return an empty Series named after the row key, indexed
                # by a shallow copy of the (empty) column index.
                result = Series([], name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                # Row slice: assemble a new frame column by column and
                # reuse the source frame's index object.
                columns_df = DataFrame()
                for col in columns:
                    columns_df.add_column(name=col, data=self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows: the row selection is delegated to the
            # MultiIndex, and this branch always returns.
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            # Flat row index: apply the row key to every selected column
            # positionally, then restore the column index.
            df = DataFrame()
            for key, col in columns_df._cols.items():
                df[key] = col.iloc[arg[0]]
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                # Use the source index entry at the slice start
                # (0 when the slice has no explicit start).
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (
                    isinstance(arg[0], slice) or isinstance(arg[1], slice)
                ):
                    # Neither key is a slice: return the first element of
                    # the first column.
                    return list(df._cols.values())[0][0]
                elif df.shape[1] > 1:
                    # Several columns: downcast and index the result by
                    # the column MultiIndex.
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    # Non-slice row key: return the first column, indexed
                    # by the column MultiIndex and named after the row key.
                    result_series = list(df._cols.values())[0]
                    result_series.index = df.columns
                    result_series.name = arg[0]
                    return result_series
                else:
                    return list(df._cols.values())[0]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.dataframe.index import RangeIndex

            # Completely empty frame: give it a RangeIndex derived from
            # the row slice bounds.
            # NOTE(review): this reads ``arg[0].stop`` and calls
            # ``arg[0].indices``, so it assumes ``arg[0]`` is a slice here;
            # confirm no non-slice row key can reach this branch.
            # ``step`` is unpacked but not passed to RangeIndex.
            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
コード例 #3
0
 def apply_multiindex_or_single_index(self, result):
     """Re-index a raw groupby result by its grouping key column(s).

     The key columns named in ``self._by`` are moved out of the data and
     into the index: a flat index when there is a single key, otherwise a
     ``MultiIndex`` assembled from explicit levels and codes. When exactly
     one value column remains and this object has a ``_gotattr`` attribute,
     a Series is returned instead of a DataFrame.

     Parameters
     ----------
     result : DataFrame
         Frame that still carries the grouping keys as ordinary columns.

     Returns
     -------
     DataFrame or Series
     """
     keys = self._by
     one_key = len(keys) == 1

     def _without_keys():
         # Copy every non-key column of ``result`` into a fresh frame.
         stripped = DataFrame()
         for label in result.columns:
             if label not in keys:
                 stripped[label] = result[label]
         return stripped

     def _wants_series(frame):
         # Collapse to a Series only for a lone value column on an object
         # that carries the ``_gotattr`` marker.
         return len(frame.columns) == 1 and hasattr(self, "_gotattr")

     # --- Empty result: only an (empty) index has to be attached. ---
     if len(result) == 0:
         values = _without_keys()
         if one_key or len(values.columns) == 0:
             from cudf.dataframe.index import GenericIndex
             if one_key:
                 empty_dtype, index_name = 'float64', keys[0]
             else:
                 empty_dtype, index_name = 'object', None
             empty_index = GenericIndex(Series([], dtype=empty_dtype))
             empty_index.name = index_name
             values.index = empty_index
         else:
             # One empty level and one empty code list per grouping key.
             key_names = list(keys)
             empty_levels = [[] for _ in key_names]
             empty_codes = [[] for _ in key_names]
             empty_multi = MultiIndex(empty_levels, empty_codes)
             empty_multi.names = key_names
             values.index = empty_multi
         if not _wants_series(values):
             return values
         empty_series = Series([], name=values.columns[0])
         empty_series.index = values.index
         return empty_series

     # --- Single grouping key: the key column becomes a flat index. ---
     if one_key:
         from cudf.dataframe import index as index_module
         key_index = index_module.as_index(result[keys[0]])
         key_index.name = keys[0]
         trimmed = result.drop(key_index.name)
         # The placeholder level-0 name is swapped back for the name the
         # original index carried.
         if key_index.name == self._LEVEL_0_INDEX_NAME:
             key_index.name = self._original_index_name
         return trimmed.set_index(key_index)

     # --- Several grouping keys: build levels and codes per key. ---
     # Note: This is an O(N^2) solution using gpu masking
     # to compute new codes for the MultiIndex. There may be
     # a faster solution that could be executed on gpu at the same
     # time the groupby is calculated.
     level_values = []
     level_codes = DataFrame()
     key_names = []
     for key in keys:
         key_column = result[key]
         distinct = key_column.unique()
         level_values.append(distinct)
         # Each key value is replaced by its position among the distinct
         # values; those positions are the codes for this level.
         positions = key_column.replace(distinct, range(len(distinct)))
         level_codes[key] = Series(positions, dtype="int32")
         key_names.append(key)
     grouped_index = MultiIndex(
         levels=level_values, codes=level_codes, names=key_names
     )

     values = _without_keys()
     if _wants_series(values):
         only_label = values.columns[0]
         lone_series = Series(values[only_label])
         lone_series.name = only_label
         lone_series.index = grouped_index
         return lone_series
     return values.set_index(grouped_index)
コード例 #4
0
 def _get_row_major(self, df, row_tuple):
     """Select rows of ``df`` according to ``row_tuple`` and rebuild its index.

     ``row_tuple[0]`` may be a number (a single position), a slice (a range
     of positions), or anything else, in which case the whole tuple is
     passed to ``_compute_validity_mask`` to obtain the rows to keep. The
     selected rows are gathered with ``df.take`` and the result's index is
     then rebuilt from ``df.index`` levels/codes.

     Returns the selected rows; in the single-row, non-slice, ``size == 0``
     case the result is converted to a Series.
     """
     slice_access = False
     if isinstance(row_tuple[0], numbers.Number):
         valid_indices = row_tuple[0]
     elif isinstance(row_tuple[0], slice):
         # 1. empty slice compute
         if row_tuple[0].stop == 0:
             valid_indices = []
         else:
             slice_access = True
             # Missing slice fields fall back to 0 / len(df) / 1.
             start = row_tuple[0].start or 0
             stop = row_tuple[0].stop or len(df)
             step = row_tuple[0].step or 1
             valid_indices = cudautils.arange(start, stop, step)
     else:
         valid_indices = self._compute_validity_mask(df, row_tuple)
     from cudf import Series
     result = df.take(Series(valid_indices))
     # Build new index - INDEX based MultiIndex
     # ---------------
     from cudf import DataFrame
     out_index = DataFrame()
     # Select the last n-k columns where n is the number of source
     # levels and k is the length of the indexing tuple
     # (k stays 0 when the first tuple element is a number or a slice).
     size = 0
     if not isinstance(row_tuple[0], (numbers.Number, slice)):
         size = len(row_tuple)
     for k in range(size, len(df.index.levels)):
         out_index.add_column(df.index.names[k],
                              df.index.codes[df.index.codes.columns[k]])
     # If there's only one column remaining in the output index, convert
     # it into an Index and name the final index values according
     # to the proper codes.
     if len(out_index.columns) == 1:
         # Translate each code of the last level of ``result.index`` into
         # its level value, one element at a time.
         out_index = []
         for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]:  # noqa: E501
             out_index.append(result.index.levels[
                     len(result.index.codes.columns)-1][val])
         out_index = as_index(out_index)
         out_index.name = result.index.names[len(result.index.names)-1]
         result.index = out_index
     else:
         if len(result) == 1 and size == 0 and slice_access is False:
             # If the final result is one row and it was not mapped into
             # directly
             result = result.T
             result = result[result.columns[0]]
             # convert to Series
             # The Series name is the tuple of level values that the
             # first code of each level points at.
             series_name = []
             for idx, code in enumerate(result.columns.codes):
                 series_name.append(result.columns.levels[idx][
                         result.columns.codes[code][0]])
             # NOTE(review): ``name=series_name`` is immediately
             # overwritten by the tuple assignment on the next statement.
             result = Series(list(result._cols.values())[0],
                             name=series_name)
             result.name = tuple(series_name)
         elif(len(out_index.columns)) > 0:
             # Otherwise pop the leftmost levels, names, and codes from the
             # source index until it has the correct number of columns (n-k)
             # NOTE(review): the return value of ``reset_index`` is
             # discarded; if it is not an in-place operation this call has
             # no effect - confirm against the DataFrame API.
             result.reset_index(drop=True)
             result.index = result.index._popn(size)
     return result