Example 1
 def copy(self, deep=True):
     mi = MultiIndex(source_data=self._source_data.copy(deep))
     if self._levels is not None:
         mi._levels = [s.copy(deep) for s in self._levels]
     if self._codes is not None:
         mi._codes = self._codes.copy(deep)
     if self.names is not None:
         mi.names = self.names.copy()
     return mi
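
A minimal pandas sketch (assuming pandas >= 0.24, which uses the codes= keyword) of the levels/codes/names layout that the copy above preserves; cuDF's MultiIndex mirrors this structure:

    import pandas as pd

    # Two levels plus per-row integer codes into those levels.
    mi = pd.MultiIndex(
        levels=[["a", "b"], ["x", "y"]],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
        names=["outer", "inner"],
    )
    mi_copy = mi.copy()
    assert list(mi_copy.names) == ["outer", "inner"]
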
Example 2
    def _popn(self, n):
        """ Returns a copy of this index without the left-most n values.

        Removes n names, labels, and codes in order to build a new index
        for results.
        """
        result = MultiIndex(source_data=self._source_data.iloc[:, n:])
        if self.names is not None:
            result.names = self.names[n:]
        return result
Example 3
    def _popn(self, n):
        """ Returns a copy of this index without the left-most n values.

        Removes n names, labels, and codes in order to build a new index
        for results.
        """
        from cudf import DataFrame
        codes = DataFrame()
        for idx in self.codes.columns[n:]:
            codes.add_column(idx, self.codes[idx])
        result = MultiIndex(self.levels[n:], codes)
        result.names = self.names[n:]
        return result
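
For orientation, the "drop the left-most n level columns" effect that both _popn variants implement can be sketched in pandas; popn below is a hypothetical helper, not cuDF code:

    import pandas as pd

    def popn(mi: pd.MultiIndex, n: int) -> pd.Index:
        # Keep only the level columns to the right of position n.
        frame = mi.to_frame(index=False).iloc[:, n:]
        if frame.shape[1] == 1:
            return pd.Index(frame.iloc[:, 0])
        return pd.MultiIndex.from_frame(frame)
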
Example 4
 def agg(self, agg_types):
     df = DataFrame()
     by = []
     if self.level is not None:
         if isinstance(self.source_series.index, MultiIndex):
             # Add the index columns from the MultiIndex into df and
             # record their names for the groupby keys
             for col in self.source_series.index.codes:
                 df[self.group_name + col] = self.source_series.index.codes[
                         col]
                 by.append(self.group_name + col)
     else:
         if isinstance(self.group_keys, Series):
             df[self.group_name] = self.group_keys
             by = self.group_name
         else:
             df = self.group_keys
             by = self._by
     df[self.source_name] = self.source_series
     groupby = df.groupby(by).agg(agg_types)
     idx = groupby.index
     if len(groupby.columns) == 1:
         result = groupby[self.source_name]
         result.name = self.source_series.name
         idx.name = None
         result = result.set_index(idx)
     else:
         idx.name = self.group_name
         result = groupby.set_index(idx)
     if len(result) == 0 and self._by is not None:
         empties = [[] for x in range(len(self._by))]
         mi = MultiIndex(empties, empties, names=self._by)
         result = result.set_index(mi)
     return result
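
A pandas-level sketch (not the cuDF code path above) of the shapes agg produces: a single aggregation keeps flat result columns, while a list of aggregations yields a MultiIndex column index:

    import pandas as pd

    df = pd.DataFrame({"k": ["a", "a", "b"], "vals": [1, 2, 3]})

    flat = df.groupby("k").agg("sum")             # columns: ['vals']
    nested = df.groupby("k").agg(["sum", "min"])  # columns: [('vals', 'sum'), ('vals', 'min')]
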
Example 5
    def apply_multicolumn(self, result, aggs):
        levels = []
        codes = []
        levels.append(self._val_columns)
        levels.append(aggs)

        # if the values columns have length == 1, codes is a nested list of
        # zeros equal to the size of aggs (sum, min, mean, etc.)
        # if the values columns are length>1, codes will monotonically
        # increase by 1 for every n values where n is the number of aggs
        # [['x', 'z'], ['sum', 'min']]
        # codes == [[0, 1], [0, 1]]
        code_size = max(len(aggs), len(self._val_columns))
        codes.append(list(np.zeros(code_size, dtype='int64')))
        codes.append(list(range(code_size)))

        if len(aggs) == 1:
            # unprefix columns
            new_cols = []
            for c in result.columns:
                new_col = c.split('_')[1]  # 'sum_z' -> 'z'
                new_cols.append(new_col)
            result.columns = new_cols
        else:
            result.columns = MultiIndex(levels, codes)
        return result
Example 6
    def compute_result_column_index(self):
        """
        Computes the column index of the result
        """
        value_names = self.value_names
        aggs_as_list = self.get_aggs_as_list()

        if isinstance(self.obj, cudf.Series):
            if len(aggs_as_list) == 1:
                if self.obj.name is None:
                    return self.obj.name
                else:
                    return [self.obj.name]
            else:
                return aggs_as_list
        else:
            return_multi_index = True
            if isinstance(self.original_aggs, str):
                return_multi_index = False
            if isinstance(self.original_aggs, collections.abc.Mapping):
                return_multi_index = False
                for key in self.original_aggs:
                    if not isinstance(self.original_aggs[key], str):
                        return_multi_index = True
                        break
            if return_multi_index:
                return MultiIndex.from_tuples(zip(value_names, aggs_as_list))
            else:
                return value_names
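
The multi-index branch pairs each value column with its aggregation by zipping the two lists; a small pandas sketch with assumed names:

    import pandas as pd

    value_names = ["x", "z"]
    aggs_as_list = ["sum", "min"]  # one aggregation per value column

    cols = pd.MultiIndex.from_tuples(list(zip(value_names, aggs_as_list)))
    # [('x', 'sum'), ('z', 'min')]
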
Example 7
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             dtype = 'float64' if len(self._by) == 1 else 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             mi = MultiIndex(source_data=result[self._by])
             mi.names = self._by
             final_result.index = mi
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series([], name=final_result.columns[0])
             final_series.index = final_result.index
             return final_series
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         idx.name = self._by[0]
         result = result.drop(idx.name)
         if idx.name == self._LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         multi_index = MultiIndex(source_data=result[self._by])
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series(final_result[final_result.columns[0]])
             final_series.name = final_result.columns[0]
             final_series.index = multi_index
             return final_series
         return final_result.set_index(multi_index)
Example 8
def test_multiindex_tuples(testarr):
    tuples = list(zip(*testarr[0]))

    index = MultiIndex.from_tuples(tuples, names=testarr[1])
    index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1])

    assert index.is_unique == index_pd.is_unique
    assert index.is_monotonic == index_pd.is_monotonic
    assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing
    assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing
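
The test zips parallel arrays into row tuples before building the index; a self-contained sketch with assumed data:

    import pandas as pd

    arrays = [["a", "a", "b", "b"], [1, 2, 1, 2]]
    tuples = list(zip(*arrays))  # [('a', 1), ('a', 2), ('b', 1), ('b', 2)]

    mi = pd.MultiIndex.from_tuples(tuples, names=["letters", "numbers"])
    assert mi.is_unique
    assert mi.is_monotonic_increasing
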
Example 9
 def take(self, indices):
     from collections.abc import Sequence
     from cudf import Series
     from numbers import Integral
     if isinstance(indices, (Integral, Sequence)):
         indices = np.array(indices)
     elif isinstance(indices, Series):
         indices = indices.to_gpu_array()
     elif isinstance(indices, slice):
         start, stop, step, sln = utils.standard_python_slice(len(self),
                                                              indices)
         indices = cudautils.arange(start, stop, step)
     if hasattr(self, '_source_data'):
         result = MultiIndex(source_data=self._source_data.take(indices))
     else:
         codes = self.codes.take(indices)
         result = MultiIndex(self.levels, codes)
     result.names = self.names
     return result
Example 10
    def take(self, indices):
        from collections.abc import Sequence
        from cudf import Series
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, Series):
            indices = indices.to_gpu_array()
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = cudautils.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result
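
The slice branch relies on the standard-library slice.indices, which clamps start/stop/step to the container length; a short sketch:

    import numpy as np

    sl = slice(1, None, 2)
    start, stop, step = sl.indices(6)       # (1, 6, 2)
    indices = np.arange(start, stop, step)  # array([1, 3, 5])
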
Example 11
    def deserialize(cls, header, frames):
        """Convert from pickle format into Index
        """
        names = pickle.loads(header["names"])

        source_data_typ = pickle.loads(header["source_data"]["type"])
        source_data = source_data_typ.deserialize(header["source_data"],
                                                  frames)

        return MultiIndex(names=names, source_data=source_data)
Example 12
    def _concat(cls, objs):
        from cudf import DataFrame, MultiIndex

        source_data = [o._source_data for o in objs]
        source_data = DataFrame._concat(source_data)
        names = [None for x in source_data.columns]
        objs = list(filter(lambda o: o.names is not None, objs))
        for o in range(len(objs)):
            for i, name in enumerate(objs[o].names):
                names[i] = names[i] or name
        return MultiIndex(names=names, source_data=source_data)
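
The names loop above keeps the first non-None name seen for each level; a pandas sketch of that coalescing with assumed inputs:

    import pandas as pd

    parts = [
        pd.MultiIndex.from_tuples([("a", 1)], names=["k", None]),
        pd.MultiIndex.from_tuples([("b", 2)], names=[None, "n"]),
    ]

    names = [None] * parts[0].nlevels
    for part in parts:
        for i, name in enumerate(part.names):
            names[i] = names[i] or name  # first non-None name wins
    # names == ['k', 'n']

    combined = pd.MultiIndex.from_tuples(
        [t for part in parts for t in part], names=names
    )
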
Example 13
 def _concat(cls, objs):
     from cudf import DataFrame
     from cudf import MultiIndex
     _need_codes = not all([hasattr(o, '_source_data') for o in objs])
     if _need_codes:
         raise NotImplementedError(
                 'MultiIndex._concat is only supported '
                 'for groupby generated MultiIndexes at this time.')
     else:
         _source_data = DataFrame._concat([o._source_data for o in objs])
         index = MultiIndex(source_data=_source_data)
     return index
Example 14
    def take(self, indices):
        from collections.abc import Sequence
        from cudf import Series
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, Series):
            if indices.null_count != 0:
                raise ValueError("Column must have no nulls.")
            indices = indices.data.mem
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = cudautils.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result
Example 15
 def apply_multicolumn_mapped(self, result, aggs):
     if len(set(aggs.keys())) == len(aggs.keys()) and\
             isinstance(aggs[list(aggs.keys())[0]], (str, Number)):
         result.columns = aggs.keys()
     else:
         tuples = []
         for k in aggs.keys():
             for v in aggs[k]:
                 tuples.append((k, v))
         multiindex = MultiIndex.from_tuples(tuples)
         result.columns = multiindex
     return result
Example 16
 def copy(self, deep=True):
     if hasattr(self, '_source_data'):
         mi = MultiIndex(source_data=self._source_data)
         if self._levels is not None:
             mi._levels = self._levels.copy()
         if self._codes is not None:
             mi._codes = self._codes.copy(deep)
     else:
         mi = MultiIndex(self.levels.copy(), self.codes.copy(deep))
     if self.names is not None:
         mi.names = self.names.copy()
     return mi
Example 17
 def get_result(self):
     result_series = result_df[self.source_name]
     result_series.name = self.source_name if self.source_name !=\
         _LEVEL_0_DATA_NAME else None
     if len(result_df) == 0 and self._by is not None:
         empties = [[] for x in range(len(self._by))]
         mi = MultiIndex(empties, empties, names=self._by)
         result_series = result_series.set_index(mi)
     else:
         idx = result_df.index
         if self.group_name == _LEVEL_0_INDEX_NAME:
             idx.name = None
         result_series = result_series.set_index(idx)
     return result_series
Example 18
 def compute_result_index(self, key_columns, value_columns):
     """
     Computes the index of the result
     """
     key_names = self.key_names
     if len(key_columns) == 1:
         return cudf.core.index.as_index(key_columns[0], name=key_names[0])
     else:
         empty_keys = all([len(x) == 0 for x in key_columns])
         if len(value_columns) == 0 and empty_keys:
             return cudf.core.index.GenericIndex(
                 cudf.Series([], dtype="object"))
         return MultiIndex(
             source_data=dataframe_from_columns(key_columns,
                                                columns=key_names),
             names=key_names,
         )
Example 19
    def apply_multicolumn(self, result, aggs):
        # multicolumn only applies with multiple aggs and multiple groupby keys
        if len(aggs) == 1 or len(self._by) == 1:
            # unprefix columns
            new_cols = []
            for c in result.columns:
                if len(self._by) == 1 and len(result) != 0:
                    new_col = c.split('_')[0]  # 'sum_z' -> 'sum'
                else:
                    new_col = c.split('_')[1]  # 'sum_z' -> 'z'
                new_cols.append(new_col)
            result.columns = new_cols
        else:
            # reorder our columns to match pandas
            if len(self._val_columns) > 1:
                col_dfs = [DataFrame() for col in self._val_columns]
                for agg in aggs:
                    for idx, col in enumerate(self._val_columns):
                        col_dfs[idx][agg + '_' + col] = result[agg + '_' + col]
                idx = result.index
                result = DataFrame(index=idx)
                for idx, col in enumerate(self._val_columns):
                    for agg in aggs:
                        result[agg + '_' + col] = col_dfs[idx][agg + '_' + col]

            levels = []
            codes = []
            levels.append(self._val_columns)
            levels.append(aggs)

            # if the values columns have length == 1, codes is a nested list of
            # zeros equal to the size of aggs (sum, min, mean, etc.)
            # if the values columns are length>1, codes will monotonically
            # increase by 1 for every n values where n is the number of aggs
            # e.g. levels == [['x', 'z'], ['sum', 'min']]
            # codes == [[0, 0, 1, 1], [0, 1, 0, 1]]
            first_codes = [len(aggs) * [d*1] for d in range(len(
                    self._val_columns))]
            codes.append([item for sublist in first_codes for item in sublist])
            codes.append(len(self._val_columns) * [0, 1])
            result.columns = MultiIndex(levels, codes)
        return result
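
The codes construction in the else branch, generalized slightly (the snippet above hard-codes [0, 1] for two aggregations), shown as a pandas sketch with the example from the comment:

    import pandas as pd

    val_columns = ["x", "z"]
    aggs = ["sum", "min"]

    first_codes = [len(aggs) * [d] for d in range(len(val_columns))]  # [[0, 0], [1, 1]]
    level0 = [item for sub in first_codes for item in sub]            # [0, 0, 1, 1]
    level1 = len(val_columns) * list(range(len(aggs)))                # [0, 1, 0, 1]

    columns = pd.MultiIndex(levels=[val_columns, aggs], codes=[level0, level1])
    # [('x', 'sum'), ('x', 'min'), ('z', 'sum'), ('z', 'min')]
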
Example 20
    def compute_result_column_index(self):
        """
        Computes the column index of the result
        """
        value_names = self.value_names
        aggs_as_list = self.get_aggs_as_list()

        if isinstance(self.obj, cudf.Series):
            if len(aggs_as_list) == 1:
                if self.obj.name is None:
                    return self.obj.name
                else:
                    return [self.obj.name]
            else:
                return aggs_as_list
        else:
            if len(aggs_as_list) == len(self.aggs):
                return value_names
            else:
                return MultiIndex.from_tuples(zip(value_names, aggs_as_list))
Example 21
 def apply_multicolumn_mapped(self, result, aggs):
     # if all of the aggregations in the mapping set are only
     # length 1, we can assign the columns directly as keys.
     can_map_directly = True
     for values in aggs.values():
         value = [values] if isinstance(values, str) else list(values)
         if len(value) != 1:
             can_map_directly = False
             break
     if can_map_directly:
         result.columns = aggs.keys()
     else:
         tuples = []
         for k in aggs.keys():
             value = [aggs[k]] if isinstance(aggs[k], str) else list(
                     aggs[k])
             for v in value:
                 tuples.append((k, v))
         multiindex = MultiIndex.from_tuples(tuples)
         result.columns = multiindex
     return result
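
The mapped-aggregations path flattens the dict into (column, aggregation) tuples before building the column index; a short pandas sketch with an assumed aggs mapping:

    import pandas as pd

    aggs = {"x": ["sum", "min"], "z": "max"}

    tuples = []
    for k, v in aggs.items():
        for agg in ([v] if isinstance(v, str) else list(v)):
            tuples.append((k, agg))
    # [('x', 'sum'), ('x', 'min'), ('z', 'max')]

    columns = pd.MultiIndex.from_tuples(tuples)
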
Example 22
    def _getitem_tuple_arg(self, arg):
        from uuid import uuid4

        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.dataframe import DataFrame
        from cudf.core.index import as_index

        # Step 1: Gather columns
        if isinstance(arg, tuple):
            columns_df = self._get_column_selection(arg[1])
            columns_df._index = self._df._index
        else:
            columns_df = self._df

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg, (MultiIndex, pd.MultiIndex)):
                if isinstance(arg, pd.MultiIndex):
                    arg = MultiIndex.from_pandas(arg)

                indices = indices_from_labels(columns_df, arg)
                return columns_df.take(indices)

            else:
                if isinstance(arg, tuple):
                    return columns_df.index._get_row_major(columns_df, arg[0])
                else:
                    return columns_df.index._get_row_major(columns_df, arg)
        else:
            if isinstance(arg[0], slice):
                out = get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(out)
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # If a scalar, there is a possibility of duplicates;
                    # the join would return all of them, so convert it to
                    # an array-like first.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                if len(tmp_arg[0]) == 0:
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

                if pd.api.types.is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(tmp_arg[0])
                else:
                    tmp_col_name = str(uuid4())
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    df = other_df.join(columns_df, how="inner")
                    # as join is not assigning any names to index,
                    # update it over here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(tmp_col_name)
                    df.drop(columns=[tmp_col_name], inplace=True)
                    # There were no indices found
                    if len(df) == 0:
                        raise KeyError(arg)

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
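
Step 2's non-boolean label lookup joins against a temporary arange column so that the requested label order survives the join and an empty result can raise KeyError; a pandas-level sketch of that trick (illustrative only, with assumed data):

    import pandas as pd
    from uuid import uuid4

    df = pd.DataFrame({"v": [10, 20, 30]}, index=["a", "b", "c"])
    labels = ["c", "a"]

    tmp_col = str(uuid4())
    other = pd.DataFrame({tmp_col: range(len(labels))}, index=pd.Index(labels))
    picked = other.join(df, how="inner").sort_values(tmp_col)
    picked = picked.drop(columns=[tmp_col])  # rows for 'c', then 'a'
    if len(picked) == 0:
        raise KeyError(labels)
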
Example 23
 def repeat(self, repeats, axis=None):
     assert axis in (None, 0)
     return MultiIndex.from_frame(self._source_data.repeat(repeats),
                                  names=self.names)
Example 24
 def unique(self):
     return MultiIndex.from_frame(self._source_data.drop_duplicates())
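
Both repeat and unique above defer to the backing source frame; roughly the same round trip in pandas (a sketch, not the cuDF internals):

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 1), ("b", 2)], names=["k", "n"])
    frame = mi.to_frame(index=False)

    repeated = pd.MultiIndex.from_frame(frame.loc[frame.index.repeat(2)])  # every row twice
    deduped = pd.MultiIndex.from_frame(frame.drop_duplicates())            # [('a', 1), ('b', 2)]
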
Example 25
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             if len(self._by) == 1:
                 dtype = self._df[self._by[0]].dtype
             else:
                 dtype = 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             mi = MultiIndex(source_data=result[self._by])
             mi.names = self._by
             final_result.index = mi
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         name = self._by[0]
         if isinstance(name, str):
             name = self._by[0].split('+')
             if name[0] == 'cudfvalcol':
                 idx.name = name[1]
             else:
                 idx.name = name[0]
             result = result.drop(self._by[0])
         for col in result.columns:
             if isinstance(col, str):
                 colnames = col.split('+')
                 if colnames[0] == 'cudfvalcol':
                     result[colnames[1]] = result[col]
                     result = result.drop(col)
         if idx.name == _LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         for col in result.columns:
             if isinstance(col, str):
                 colnames = col.split('+')
                 if colnames[0] == 'cudfvalcol':
                     result[colnames[1]] = result[col]
                     result = result.drop(col)
         new_by = []
         for by in self._by:
             if isinstance(by, str):
                 splitby = by.split('+')
                 if splitby[0] == 'cudfvalcol':
                     new_by.append(splitby[1])
                 else:
                     new_by.append(splitby[0])
             else:
                 new_by.append(by)
         self._by = new_by
         multi_index = MultiIndex(source_data=result[self._by])
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) > 0:
             return final_result.set_index(multi_index)
         else:
             return result.set_index(multi_index)
Example 26
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             dtype = 'float64' if len(self._by) == 1 else 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             levels = []
             codes = []
             names = []
             for by in self._by:
                 levels.append([])
                 codes.append([])
                 names.append(by)
             mi = MultiIndex(levels, codes)
             mi.names = names
             final_result.index = mi
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series([], name=final_result.columns[0])
             final_series.index = final_result.index
             return final_series
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         idx.name = self._by[0]
         result = result.drop(idx.name)
         if idx.name == self._LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         levels = []
         codes = DataFrame()
         names = []
         # Note: This is an O(N^2) solution using gpu masking
         # to compute new codes for the MultiIndex. There may be
         # a faster solution that could be executed on gpu at the same
         # time the groupby is calculated.
         for by in self._by:
             level = result[by].unique()
             replaced = result[by].replace(level, range(len(level)))
             levels.append(level)
             codes[by] = Series(replaced, dtype="int32")
             names.append(by)
         multi_index = MultiIndex(levels=levels, codes=codes, names=names)
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series(final_result[final_result.columns[0]])
             final_series.name = final_result.columns[0]
             final_series.index = multi_index
             return final_series
         return final_result.set_index(multi_index)
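
The O(N^2) replace loop in the last branch rebuilds integer codes for each key column; pd.factorize expresses the same idea directly, shown here as a pandas sketch with assumed data rather than the cuDF implementation:

    import pandas as pd

    result = pd.DataFrame({"a": ["x", "y", "x"], "b": [1, 1, 2], "v": [10, 20, 30]})
    by = ["a", "b"]

    levels, codes, names = [], [], []
    for key in by:
        key_codes, uniques = pd.factorize(result[key])  # per-row integer codes
        levels.append(list(uniques))
        codes.append(list(key_codes))
        names.append(key)

    multi_index = pd.MultiIndex(levels=levels, codes=codes, names=names)
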