Example #1
0
 def _getitem_tuple_arg(self, arg):
     from cudf.dataframe.dataframe import DataFrame
     from cudf.dataframe.index import as_index
     columns = self._get_column_selection(arg[1])
     df = DataFrame()
     for col in columns:
         df.add_column(name=col, data=self._df[col].loc[arg[0]])
     if df.shape[0] == 1:  # we have a single row
         if isinstance(arg[0], slice):
             df.index = as_index(arg[0].start)
         else:
             df.index = as_index(arg[0])
     return df
Example #2
0
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             dtype = 'float64' if len(self._by) == 1 else 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             mi = MultiIndex(source_data=result[self._by])
             mi.names = self._by
             final_result.index = mi
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series([], name=final_result.columns[0])
             final_series.index = final_result.index
             return final_series
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         idx.name = self._by[0]
         result = result.drop(idx.name)
         if idx.name == self._LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         multi_index = MultiIndex(source_data=result[self._by])
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series(final_result[final_result.columns[0]])
             final_series.name = final_result.columns[0]
             final_series.index = multi_index
             return final_series
         return final_result.set_index(multi_index)
Example #3
0
    def _getitem_tuple_arg(self, arg):
        from cudf.dataframe.dataframe import DataFrame
        from cudf.dataframe.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        # Step 1: Gather columns
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
        else:
            columns = self._get_column_selection(arg[1])
            columns_df = DataFrame()
            for col in columns:
                columns_df.add_column(name=col, data=self._df[col])
        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            if isinstance(self._df.columns, MultiIndex):
                if isinstance(arg[0], slice):
                    start, stop, step = arg[0].indices(len(columns_df))
                    indices = arange(start, stop, step)
                    df = columns_df.take(indices)
                else:
                    df = columns_df.take(arg[0])
            else:
                df = DataFrame()
                for col in columns_df.columns:
                    df[col] = columns_df[col].loc[arg[0]]
        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                df.index = as_index(arg[0])
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #4
0
    def _getitem_tuple_arg(self, arg):
        from cudf import MultiIndex
        from cudf.dataframe.dataframe import DataFrame
        from cudf.dataframe.dataframe import Series
        from cudf.dataframe.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (
                len(columns_df) == 0
                and len(columns_df.columns) == 0
                and not isinstance(arg[0], slice)
            ):
                result = Series([], name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                columns_df = DataFrame()
                for col in columns:
                    columns_df.add_column(name=col, data=self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            df = DataFrame()
            for key, col in columns_df._cols.items():
                df[key] = col.iloc[arg[0]]
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (
                    isinstance(arg[0], slice) or isinstance(arg[1], slice)
                ):
                    return list(df._cols.values())[0][0]
                elif df.shape[1] > 1:
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    result_series = list(df._cols.values())[0]
                    result_series.index = df.columns
                    result_series.name = arg[0]
                    return result_series
                else:
                    return list(df._cols.values())[0]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.dataframe.index import RangeIndex

            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Example #5
0
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             if len(self._by) == 1:
                 dtype = self._df[self._by[0]]
             else:
                 dtype = 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             mi = MultiIndex(source_data=result[self._by])
             mi.names = self._by
             final_result.index = mi
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         name = self._by[0]
         if isinstance(name, str):
             name = self._by[0].split('+')
             if name[0] == 'cudfvalcol':
                 idx.name = name[1]
             else:
                 idx.name = name[0]
             result = result.drop(self._by[0])
         for col in result.columns:
             if isinstance(col, str):
                 colnames = col.split('+')
                 if colnames[0] == 'cudfvalcol':
                     result[colnames[1]] = result[col]
                     result = result.drop(col)
         if idx.name == _LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         for col in result.columns:
             if isinstance(col, str):
                 colnames = col.split('+')
                 if colnames[0] == 'cudfvalcol':
                     result[colnames[1]] = result[col]
                     result = result.drop(col)
         new_by = []
         for by in self._by:
             if isinstance(col, str):
                 splitby = by.split('+')
                 if splitby[0] == 'cudfvalcol':
                     new_by.append(splitby[1])
                 else:
                     new_by.append(splitby[0])
             else:
                 new_by.append(by)
         self._by = new_by
         multi_index = MultiIndex(source_data=result[self._by])
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) > 0:
             return final_result.set_index(multi_index)
         else:
             return result.set_index(multi_index)
Example #6
0
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             dtype = 'float64' if len(self._by) == 1 else 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             levels = []
             codes = []
             names = []
             for by in self._by:
                 levels.append([])
                 codes.append([])
                 names.append(by)
             mi = MultiIndex(levels, codes)
             mi.names = names
             final_result.index = mi
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series([], name=final_result.columns[0])
             final_series.index = final_result.index
             return final_series
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         idx.name = self._by[0]
         result = result.drop(idx.name)
         if idx.name == self._LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         levels = []
         codes = DataFrame()
         names = []
         # Note: This is an O(N^2) solution using gpu masking
         # to compute new codes for the MultiIndex. There may be
         # a faster solution that could be executed on gpu at the same
         # time the groupby is calculated.
         for by in self._by:
             level = result[by].unique()
             replaced = result[by].replace(level, range(len(level)))
             levels.append(level)
             codes[by] = Series(replaced, dtype="int32")
             names.append(by)
         multi_index = MultiIndex(levels=levels, codes=codes, names=names)
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series(final_result[final_result.columns[0]])
             final_series.name = final_result.columns[0]
             final_series.index = multi_index
             return final_series
         return final_result.set_index(multi_index)