Example #1
    def _loc_to_iloc(self, arg):
        from cudf.dataframe.series import Series
        from cudf.dataframe.index import Index

        if isinstance(
            arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
        ):
            if len(arg) == 0:
                arg = Series(np.array([], dtype="int32"))
            else:
                arg = Series(arg)
        if isinstance(arg, Series):
            if arg.dtype in (bool, np.bool_):
                return arg
            else:
                return indices_from_labels(self._sr, arg)
        elif is_scalar(arg):
            found_index = self._sr.index.find_label_range(arg, None)[0]
            return found_index
        elif isinstance(arg, slice):
            start_index, stop_index = self._sr.index.find_label_range(
                arg.start, arg.stop
            )
            return slice(start_index, stop_index, arg.step)
        else:
            raise NotImplementedError(
                ".loc not implemented for label type {}".format(
                    type(arg).__name__
                )
            )
Example #2
 def __getitem__(self, arg):
     from cudf.dataframe.series import Series
     from cudf.dataframe.index import Index
     if isinstance(
             arg,
         (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)):
         if len(arg) == 0:
             arg = Series(np.array([], dtype='int32'))
         else:
             arg = Series(arg)
     if isinstance(arg, Series):
         if arg.dtype in (bool, np.bool_):
             return self._sr.iloc[arg]
         # To do this efficiently we need a solution to
         # https://github.com/rapidsai/cudf/issues/1087
         out = Series([],
                      dtype=self._sr.dtype,
                      index=self._sr.index.__class__([]))
         for s in arg:
             out = out.append(self._sr.loc[s:s], ignore_index=False)
         return out
     elif is_single_value(arg):
         found_index = self._sr.index.find_label_range(arg, None)[0]
         return self._sr.iloc[found_index]
     elif isinstance(arg, slice):
         start_index, stop_index = self._sr.index.find_label_range(
             arg.start, arg.stop)
         return self._sr.iloc[start_index:stop_index:arg.step]
     else:
         raise NotImplementedError(
             ".loc not implemented for label type {}".format(
                 type(arg).__name__))
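
A minimal usage sketch of the label-based lookup implemented above, assuming pandas-style .loc semantics; the values are illustrative, not from the source:

import cudf

sr = cudf.Series([10, 20, 30], index=["a", "b", "c"])

# Scalar label -> scalar value (the find_label_range + iloc branch)
print(sr.loc["b"])        # 20

# Label slices are inclusive of both endpoints, unlike positional slices
print(sr.loc["a":"b"])    # rows "a" and "b"

# A list of labels takes the Series(arg) branch above
print(sr.loc[["a", "c"]])
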
Example #3
    def _compute_validity_mask(self, index, row_tuple, max_length):
        """ Computes the valid set of indices of values in the lookup
        """
        from cudf import DataFrame
        from cudf import Series
        from cudf import concat
        from cudf.utils.cudautils import arange

        lookup = DataFrame()
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            lookup[index._source_data.columns[idx]] = Series(row)
        data_table = concat(
            [
                index._source_data,
                DataFrame({"idx": Series(arange(len(index._source_data)))}),
            ],
            axis=1,
        )
        result = lookup.merge(data_table)["idx"]
        # Avoid computing levels unless the result of the merge is empty,
        # which suggests that a KeyError should be raised.
        if len(result) == 0:
            for idx, row in enumerate(row_tuple):
                if row == slice(None):
                    continue
                if row not in index.levels[idx]:
                    raise KeyError(row)
        return result
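
The merge trick above recovers matching row positions without materializing the levels. A hedged sketch of the same idea in pandas (hypothetical column names, purely illustrative):

import numpy as np
import pandas as pd

# Hypothetical source data for a 2-level index
source = pd.DataFrame({"lvl0": ["a", "a", "b"], "lvl1": [1, 2, 1]})

# Lookup frame built from the indexing tuple ("a", 1); slice(None)
# entries would be skipped, leaving that column unconstrained
lookup = pd.DataFrame({"lvl0": ["a"], "lvl1": [1]})

# Tag each source row with its position, then inner-merge: the surviving
# "idx" values are exactly the row positions matching the tuple
tagged = pd.concat(
    [source, pd.DataFrame({"idx": np.arange(len(source))})], axis=1)
print(lookup.merge(tagged)["idx"])   # -> 0
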
Example #4
 def normalize_chunks(self, size, chunks):
     if isinstance(chunks, six.integer_types):
         # *chunks* is the chunksize
         return cudautils.arange(0, size, chunks)
     else:
         # *chunks* is an array of chunk leading offset
         chunks = Series(chunks)
         return chunks.to_gpu_array()
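
A small illustration of the two cases, with plain Python ranges standing in for cudautils.arange (assumption: the offsets are leading offsets, i.e. each chunk starts at its offset):

size = 10

# Case 1: *chunks* is a chunksize -> evenly spaced leading offsets
print(list(range(0, size, 4)))   # [0, 4, 8]

# Case 2: *chunks* is already an array of leading offsets -> used as-is
print([0, 3, 7])
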
Example #5
 def codes(self):
     from cudf.dataframe.series import Series
     data = self._parent.data
     if self._parent.has_null_mask:
         mask = self._parent.mask
         null_count = self._parent.null_count
         return Series.from_masked_array(data=data.mem, mask=mask.mem,
                                         null_count=null_count)
     else:
         return Series(data)
Example #6
    def _get_column_major(self, df, row_tuple):
        from cudf import Series
        from cudf import DataFrame

        valid_indices = self._get_valid_indices_by_tuple(
            df.columns, row_tuple, len(df._cols)
        )
        result = df._take_columns(valid_indices)

        if isinstance(row_tuple, (numbers.Number, slice)):
            row_tuple = [row_tuple]
        if len(result) == 0 and len(result.columns) == 0:
            result_columns = df.columns.copy(deep=False)
            clear_codes = DataFrame()
            for name in df.columns.names:
                clear_codes[name] = Series([])
            result_columns._codes = clear_codes
            result_columns._source_data = clear_codes
            result.columns = result_columns
        elif len(row_tuple) < len(self.levels) and (
            slice(None) not in row_tuple
            and not isinstance(row_tuple[0], slice)
        ):
            columns = self._popn(len(row_tuple))
            result.columns = columns.take(valid_indices)
        else:
            result.columns = self.take(valid_indices)
        if len(result.columns.levels) == 1:
            columns = []
            for code in result.columns.codes[result.columns.codes.columns[0]]:
                columns.append(result.columns.levels[0][code])
            name = result.columns.names[0]
            result.columns = as_index(columns, name=name)
        return result
Example #7
def concat(objs, ignore_index=False):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    """
    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." % [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
Example #8
 def _get_row_major(self, df, row_tuple):
     valid_indices = self._compute_validity_mask(df, row_tuple)
     from cudf import Series
     result = df.take(Series(valid_indices))
     # Build new index - INDEX based MultiIndex
     # ---------------
     from cudf import DataFrame
     out_index = DataFrame()
     # Select the last n-k columns where n is the number of source
     # levels and k is the length of the indexing tuple
     for k in range(len(row_tuple), len(df.index.levels)):
         out_index.add_column(df.index.names[k],
                              df.index.codes[df.index.codes.columns[k]])
     # If there's only one column remaining in the output index, convert
     # it into a StringIndex and name the final index values according
     # to the proper codes.
     if len(out_index.columns) == 1:
         out_index = []
         last = len(result.index.codes.columns) - 1
         for val in result.index.codes[result.index.codes.columns[last]]:
             out_index.append(result.index.levels[last][val])
         # TODO: Warning! The final index column could be arbitrarily
         # ordered integers, not Strings, so we need to check for that
         # dtype and produce a GenericIndex instead of a StringIndex
         out_index = StringIndex(out_index)
         out_index.name = result.index.names[len(result.index.names)-1]
         result.index = out_index
     else:
         # Otherwise pop the leftmost levels, names, and codes from the
         # source index until it has the correct number of columns (n-k)
         if len(out_index.columns) > 0:
             # reset_index returns a new frame; assign it back
             result = result.reset_index(drop=True)
             result.index = result.index._popn(len(row_tuple))
     return result
Example #9
    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
        from cudf.utils.cudautils import arange
        from cudf import Series

        # Instructions for Slicing
        # if tuple, get first and last elements of tuple
        # if open beginning tuple, get 0 to highest valid_index
        # if open ending tuple, get highest valid_index to len()
        # if not open end or beginning, get range lowest beginning index
        # to highest ending index
        if isinstance(row_tuple, slice):
            if (
                isinstance(row_tuple.start, numbers.Number)
                or isinstance(row_tuple.stop, numbers.Number)
                or row_tuple == slice(None)
            ):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return arange(start, stop, step)
            start_values = self._compute_validity_mask(
                index, row_tuple.start, max_length
            )
            stop_values = self._compute_validity_mask(
                index, row_tuple.stop, max_length
            )
            return Series(arange(start_values.min(), stop_values.max() + 1))
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)
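
The numeric branch leans on Python's slice.indices to normalize open-ended slices against a length. A self-contained illustration:

max_length = 10

s = slice(None)                           # fully open slice
print(s.indices(s.stop or max_length))    # (0, 10, 1)

s = slice(2, None)                        # open ending
print(s.indices(s.stop or max_length))    # (2, 10, 1)

s = slice(None, 4)                        # open beginning
print(s.indices(s.stop or max_length))    # (0, 4, 1)
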
Example #10
 def _apply_op(self, fn, other=None):
     from cudf.dataframe.series import Series
     idx_series = Series(self)
     op = getattr(idx_series, fn)
     if other is not None:
         return as_index(op(other))
     else:
         return as_index(op())
Example #11
    def _sortjoin(self, other, how='left', return_indexers=False):
        """Join with another column.

        When the column is an index, set *return_indexers* to obtain
        the indices for shuffling the remaining columns.
        """
        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        lkey, largsort = self.sort_by_values(True)
        rkey, rargsort = other.sort_by_values(True)
        with _gdf.apply_join(
                [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        lkey.to_gpu_array(),
                        rkey.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = lkey.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    indexers = (
                            gather(Series(largsort), lidx),
                            gather(Series(rargsort), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index
Example #12
 def wrapper(*args, **kwargs):
     ret = getattr(self._parent._data, attr)(*args, **kwargs)
     if isinstance(ret, nvstrings.nvstrings):
         ret = Series(
             columnops.as_column(ret),
             index=self._index,
             name=self._parent.name,
         )
     return ret
Example #13
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    axis : {0, 1}, default 0
        The axis to concatenate along: 0 concatenates rows (along the
        index), 1 concatenates columns.
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    """
    if sort not in (None, False):
        raise NotImplementedError("sort parameter is not yet supported")

    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    allowed_typs = {Series, DataFrame}
    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()
        for idx, o in enumerate(objs):
            if isinstance(o, Series):
                name = o.name
                if o.name is None:
                    # pandas labels unnamed Series by position (0-offset)
                    name = idx
                df[name] = o
            else:
                for col in o.columns:
                    df[col] = o[col]
        return df

    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." % [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs, axis=axis)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
Example #14
    def _get_row_major(self, df, row_tuple):
        from cudf import Series

        valid_indices = self._get_valid_indices_by_tuple(
            df.index, row_tuple, len(df.index)
        )
        indices = Series(valid_indices)
        result = df.take(indices)
        final = self._index_and_downcast(result, result.index, row_tuple)
        return final
Example #15
    def _group_dataframe(self, df, levels):
        """Group dataframe.

        The output dataframe has the same number of rows as the input
        dataframe.  The rows are shuffled so that the groups are moved
        together in ascending order based on the multi-level index.

        Parameters
        ----------
        df : DataFrame
        levels : list[str]
            Column names for the multi-level index.

        Returns
        -------
        (df, segs) : namedtuple
            * df : DataFrame
                The grouped dataframe.
            * segs : Series
                Group starting index.
        """
        if len(df) == 0:
            # Groupby on empty dataframe
            return _dfsegs_pack(df=df, segs=Buffer(np.asarray([])))
        # Prepare dataframe
        orig_df = df.copy()
        df = df.loc[:, levels].reset_index(drop=True)
        df = df.to_frame() if isinstance(df, Series) else df
        rowid_column = '__cudf.groupby.rowid'
        df[rowid_column] = df.index.as_column()

        col_order = list(levels)

        # Perform grouping
        df, segs, markers = self._group_first_level(col_order[0],
                                                    rowid_column, df)
        rowidcol = df[rowid_column]
        sorted_keys = [Series(df.index.as_column())]
        del df

        more_keys, reordering_indices, segs = self._group_inner_levels(
                                            col_order[1:], rowidcol, segs,
                                            markers=markers)
        sorted_keys.extend(more_keys)
        valcols = [k for k in orig_df.columns if k not in levels]
        # Prepare output
        # All key columns are already sorted
        out_df = DataFrame()
        for k, sr in zip(levels, sorted_keys):
            out_df[k] = sr
        # Shuffle the value columns
        self._group_shuffle(orig_df.loc[:, valcols],
                            reordering_indices, out_df)
        return _dfsegs_pack(df=out_df, segs=segs)
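
The (df, segs) contract: rows are shuffled so equal keys become adjacent, and segs holds each group's starting row offset. A numpy illustration of that contract (not the cudf internals):

import numpy as np

keys = np.array([2, 1, 2, 1])
order = np.argsort(keys, kind="stable")
sorted_keys = keys[order]                      # [1, 1, 2, 2]

# A segment starts wherever the key changes (plus position 0)
segs = np.flatnonzero(np.r_[True, sorted_keys[1:] != sorted_keys[:-1]])
print(segs)                                    # [0 2]
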
Example #16
    def __init__(self, levels=None, codes=None, labels=None, names=None,
                 **kwargs):
        self.names = names
        column_names = []
        if labels:
            warnings.warn("the 'labels' keyword is deprecated, use 'codes' "
                          "instead", FutureWarning)
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if 'source_data' in kwargs:
            self._source_data = kwargs['source_data']
            self._codes = codes
            self._levels = levels
            self.names = self._source_data.columns
            return

        # name setup
        if isinstance(names, (Sequence,
                              pd.core.indexes.frozen.FrozenNDArray,
                              pd.core.indexes.frozen.FrozenList)):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError('Must pass non-zero number of levels/codes')

        import cudf
        if (not isinstance(codes, cudf.dataframe.dataframe.DataFrame)
                and not isinstance(
                    codes[0],
                    (Sequence, pd.core.indexes.frozen.FrozenNDArray))):
            raise TypeError('Codes is not a Sequence of sequences')
        if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
            self._codes = cudf.dataframe.dataframe.DataFrame()
            for idx, code in enumerate(codes):
                code = np.array(code)
                self._codes.add_column(column_names[idx],
                                       columnops.as_column(code))
        else:
            self._codes = codes

        # Converting levels to a numpy array will produce a Float64Index
        # (on empty levels), mimicking the behavior of pandas
        self._levels = np.array(
            [Series(level).to_array() for level in levels])
        self._validate_levels_and_codes(self._levels, self._codes)
        self.name = None
        self.names = names
Example #17
    def lower(self):
        """
        Convert strings in the Series/Index to lowercase.

        Returns
        -------
        Series/Index of str dtype
            A copy of the object with all strings converted to lowercase.
        """
        from cudf.dataframe import Series
        return Series(self._parent.data.lower(), index=self._index)
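
Usage sketch (illustrative values; null handling follows the underlying nvstrings behavior):

import cudf

s = cudf.Series(["A", "Bb"])
print(s.str.lower())   # "a", "bb"
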
Example #18
    def searchsorted(self, value, side="left"):
        """Find indices where elements should be inserted to maintain order

        Parameters
        ----------
        value : Column
            Column of values to search for
        side : str {'left', 'right'}, optional
            If 'left', the index of the first suitable location found is
            given. If 'right', return the last such index.

        Returns
        -------
        An index series of insertion points with the same shape as value
        """
        from cudf.dataframe.series import Series

        idx_series = Series(self, name=self.name)
        result = idx_series.searchsorted(value, side)
        return as_index(result)
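
The semantics mirror numpy.searchsorted, which makes a good reference point:

import numpy as np

haystack = np.array([1, 3, 5, 7])
print(np.searchsorted(haystack, [0, 3, 8], side="left"))   # [0 1 4]
print(np.searchsorted(haystack, [0, 3, 8], side="right"))  # [0 2 4]
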
Example #19
    def _hashjoin(self, other, how='left', return_indexers=False):

        from cudf.dataframe.series import Series

        if not self.is_type_equivalent(other):
            raise TypeError('*other* is not compatible')

        with _gdf.apply_join(
                [self], [other], how=how, method='hash') as (lidx, ridx):
            if lidx.size > 0:
                raw_index = cudautils.gather_joined_index(
                        self.to_gpu_array(),
                        other.to_gpu_array(),
                        lidx,
                        ridx,
                        )
                buf_index = Buffer(raw_index)
            else:
                buf_index = Buffer.null(dtype=self.dtype)

            joined_index = self.replace(data=buf_index)

            if return_indexers:
                def gather(idxrange, idx):
                    mask = (Series(idx) != -1).as_mask()
                    return idxrange.take(idx).set_mask(mask).fillna(-1)

                if len(joined_index) > 0:
                    indexers = (
                            gather(Series(range(0, len(self))), lidx),
                            gather(Series(range(0, len(other))), ridx),
                            )
                else:
                    indexers = (
                            Series(Buffer.null(dtype=np.intp)),
                            Series(Buffer.null(dtype=np.intp))
                            )
                return joined_index, indexers
            else:
                return joined_index
Example #20
    def _group_inner_levels(self, columns, rowidcol, segs, markers):
        """Group the second and onwards level.

        Parameters
        ----------
        columns : sequence[str]
            Group keys.  The order is important.
        rowidcol : Series
            The special column holding the original rowids.
            It's internally used to determine the shuffling order.
        segs : Series
            First level group begin offsets.
        markers : device array
            Segment markers carried over from the previous level.

        Returns
        -------
        (sorted_keys, reordering_indices, segments)
            - sorted_keys : list[Series]
                List of sorted key columns.
                Column order is same as arg *columns*.
            - reordering_indices : device array
                The indices to gather on to shuffle the dataframe
                into the grouped sequence.
            - segments : Series
                Group begin offsets.
        """
        dsegs = segs.astype(dtype=np.int32).data.mem
        sorted_keys = []
        plan_cache = {}
        for col in columns:
            # Shuffle the key column according to the previous groups
            srkeys = self._df[col].take(rowidcol.to_gpu_array(),
                                        ignore_index=True)
            # Segmented sort on the key
            shuf = Column(Buffer(cudautils.arange(len(srkeys))))

            cache_key = (len(srkeys), srkeys.dtype, shuf.dtype)
            plan = plan_cache.get(cache_key)
            plan = apply_segsort(srkeys._column, shuf, dsegs, plan=plan)
            plan_cache[cache_key] = plan

            sorted_keys.append(srkeys)  # keep sorted key cols
            # Determine segments
            dsegs, markers = cudautils.find_segments(srkeys.to_gpu_array(),
                                                     dsegs,
                                                     markers=markers)
            # Shuffle
            rowidcol = rowidcol.take(shuf.to_gpu_array(), ignore_index=True)

        reordering_indices = rowidcol.to_gpu_array()
        return sorted_keys, reordering_indices, Series(dsegs)
Example #21
 def _to_frame(self):
     from cudf import DataFrame
     # for each column of codes
     # replace column with mapping from integers to levels
     df = self.codes.copy(deep=False)
     for idx, column in enumerate(df.columns):
         # use merge as a replace fn
         level = DataFrame({
             'idx': Series(cudautils.arange(len(self.levels[idx]),
                                            dtype=df[column].dtype)),
             'level': self.levels[idx],
         })
         code = DataFrame({'idx': df[column]})
         df[column] = code.merge(level).level
     return df
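
The merge-as-replace trick above maps integer codes back to level values by joining against an (idx, level) mapping table. The same idea in pandas, with hypothetical values:

import pandas as pd

level = pd.DataFrame({"idx": [0, 1], "level": ["low", "high"]})
code = pd.DataFrame({"idx": [1, 0, 1]})

# An inner merge preserves left-key order in pandas, so the result
# lines up with the original codes: high, low, high
print(code.merge(level)["level"])
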
Example #22
 def apply(self, method):
     gpu_out = numba.cuda.device_array_like(self.gpu_in)
     kernel = get_ewm_kernel(method)
     kernel[(self.number_of_blocks,),
            (self.number_of_threads,),
            0,
            self.shared_buffer_size * 8](self.gpu_in,
                                         gpu_out,
                                         self.window,
                                         self.span,
                                         self.array_len,
                                         self.thread_tile,
                                         self.min_periods)
     return Series(gpu_out)
Example #23
    def _compute_levels_and_codes(self):
        levels = []
        from cudf import DataFrame
        codes = DataFrame()
        names = []
        # Note: This is an O(N^2) solution using gpu masking
        # to compute new codes for the MultiIndex. There may be
        # a faster solution that could be executed on gpu at the same
        # time the groupby is calculated.
        for by in self._source_data.columns:
            if len(self._source_data[by]) > 0:
                level = self._source_data[by].unique()
                replaced = self._source_data[by].replace(
                        level, Series(range(len(level))))
            else:
                level = np.array([])
                replaced = np.array([])
            levels.append(level)
            codes[by] = Series(replaced, dtype="int32")
            names.append(by)

        self._levels = levels
        self._codes = codes
        self.names = names
Example #24
    def cat(self, others=None, sep=None, na_rep=None):
        """
        Concatenate strings in the Series/Index with given separator.

        If *others* is specified, this function concatenates the Series/Index
        and elements of others element-wise. If others is not passed, then all
        values in the Series/Index are concatenated into a single string with
        a given sep.

        Parameters
        ----------
        others : Series or list of str
            Strings to be appended.
            The number of strings must match the size of this instance.
            This must be either a Series of string dtype or a Python
            list of strings.

        sep : str
            If specified, this separator will be appended to each string
            before appending the others.

        na_rep : str
            This character will take the place of any null strings
            (not empty strings) in either list.

            - If `na_rep` is None, and `others` is None, missing values in
              the Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row
              containing a missing value in any of the columns (before
              concatenation) will have a missing value in the result.

        Returns
        -------
        concat : str or Series/Index of str dtype
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of str dtype is returned.
        """
        from cudf.dataframe import Series, Index
        if isinstance(others, (Series, Index)):
            assert others.dtype == np.dtype('object')
            others = others.data
        out = Series(self._parent.data.cat(others=others,
                                           sep=sep,
                                           na_rep=na_rep),
                     index=self._index)
        if len(out) == 1 and others is None:
            out = out[0]
        return out
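
Usage sketch of the two modes described in the docstring (illustrative values):

import cudf

s = cudf.Series(["a", "b", None])

# No others: collapse the column into a single string, filling nulls
print(s.str.cat(sep=",", na_rep="?"))    # "a,b,?"

# With others: element-wise concatenation, one result row per input row
print(s.str.cat(others=["x", "y", "z"], sep="-", na_rep="_"))
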
Example #25
    def __getattr__(self, attr, *args, **kwargs):
        from cudf.dataframe.series import Series
        if hasattr(self._parent._data, attr):
            passed_attr = getattr(self._parent._data, attr)
            if callable(passed_attr):

                def wrapper(*args, **kwargs):
                    ret = getattr(self._parent._data, attr)(*args, **kwargs)
                    # Wrap raw nvstrings results back into a cudf Series;
                    # the check must happen on the call's result, not on
                    # the wrapper function itself
                    if isinstance(ret, nvstrings.nvstrings):
                        ret = Series(columnops.as_column(ret),
                                     index=self._index)
                    return ret

                return wrapper
            else:
                return passed_attr
        else:
            raise AttributeError(attr)
Example #26
 def apply_multiindex_or_single_index(self, result):
     if len(result) == 0:
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(self._by) == 1 or len(final_result.columns) == 0:
             dtype = 'float64' if len(self._by) == 1 else 'object'
             name = self._by[0] if len(self._by) == 1 else None
             from cudf.dataframe.index import GenericIndex
             index = GenericIndex(Series([], dtype=dtype))
             index.name = name
             final_result.index = index
         else:
             mi = MultiIndex(source_data=result[self._by])
             mi.names = self._by
             final_result.index = mi
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series([], name=final_result.columns[0])
             final_series.index = final_result.index
             return final_series
         return final_result
     if len(self._by) == 1:
         from cudf.dataframe import index
         idx = index.as_index(result[self._by[0]])
         idx.name = self._by[0]
         result = result.drop(idx.name)
         if idx.name == self._LEVEL_0_INDEX_NAME:
             idx.name = self._original_index_name
         result = result.set_index(idx)
         return result
     else:
         multi_index = MultiIndex(source_data=result[self._by])
         final_result = DataFrame()
         for col in result.columns:
             if col not in self._by:
                 final_result[col] = result[col]
         if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
             final_series = Series(final_result[final_result.columns[0]])
             final_series.name = final_result.columns[0]
             final_series.index = multi_index
             return final_series
         return final_result.set_index(multi_index)
Example #27
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series, depending on whether the input DLPack
    tensor is 1D or 2D.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size")
        else:
            raise err
    cols = []
    for idx in range(len(valids)):
        mask = None
        if valids[idx]:
            mask = Buffer(valids[idx])
        cols.append(
            columnops.build_column(Buffer(res[idx]),
                                   dtype=res[idx].dtype,
                                   mask=mask))
    if len(cols) == 1:
        return Series(cols[0])
    else:
        df = DataFrame()
        for idx, col in enumerate(cols):
            df[idx] = col
        return df
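
A hedged usage sketch; it assumes a DLPack producer is available (PyTorch's torch.utils.dlpack.to_dlpack is used here purely for illustration):

import torch
from torch.utils.dlpack import to_dlpack

# Build a PyCapsule from a GPU tensor, then hand it over; the data is
# deep-copied into the resulting cuDF object
capsule = to_dlpack(torch.arange(4, device="cuda"))
sr = from_dlpack(capsule)   # a 1D tensor yields a Series
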
Example #28
    def extract(self, pat, flags=0, expand=True):
        """
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the first
        match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        expand : bool, default True
            If True, return a DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group,
            or a DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series/Index
            A DataFrame with one row for each subject string, and one column
            for each group. If `expand=False` and `pat` has only one capture
            group, then return a Series/Index.

        Notes
        -----
        The `flags` parameter is not yet supported and will raise a
        NotImplementedError if anything other than the default value is passed.
        """
        if flags != 0:
            raise NotImplementedError("`flags` parameter is not yet supported")

        from cudf.dataframe import DataFrame, Series
        out = self._parent.data.extract(pat)
        if len(out) == 1 and expand is False:
            return Series(
                out[0],
                index=self._index
            )
        else:
            out_df = DataFrame(index=self._index)
            for idx, val in enumerate(out):
                out_df[idx] = val
            return out_df
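
Usage sketch (illustrative values; non-matching rows yield nulls):

import cudf

s = cudf.Series(["a1", "b2", "c3"])

# Two capture groups -> a DataFrame with one column per group
print(s.str.extract(r"([ab])(\d)"))

# One group with expand=False -> a Series instead
print(s.str.extract(r"(\d)", expand=False))
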
Example #29
    def len(self):
        """
        Computes the length of each element in the Series/Index.

        Returns
        -------
        Series or Index of int
            A Series or Index of integer values indicating the length of
            each element in the Series or Index.
        """
        from cudf.dataframe.series import Series
        out_dev_arr = rmm.device_array(len(self._parent), dtype='int32')
        ptr = get_ctype_ptr(out_dev_arr)
        self._parent.data.len(ptr)

        mask = None
        if self._parent.null_count > 0:
            mask = self._parent.mask

        column = columnops.build_column(Buffer(out_dev_arr),
                                        np.dtype('int32'),
                                        mask=mask)
        return Series(column, index=self._index)
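
Usage sketch (illustrative; the parent's null mask is reused, so null entries stay null):

import cudf

s = cudf.Series(["ab", "", None])
print(s.str.len())   # 2, 0, <null>
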
Example #30
    def _group_first_level(self, col, rowid_column, df):
        """Group first level *col* of *df*

        Parameters
        ----------
        col : str
            Name of the first group key column.
        rowid_column : str
            Name of the internal column carrying the original rowids.
        df : DataFrame
            The dataframe being grouped.

        Returns
        -------
        (df, segs)
            - df : DataFrame
                Sorted by *col* as the index
            - segs : Series
                Group begin offsets
        """
        df = df.loc[:, [col, rowid_column]]
        df = df.set_index(col).sort_index()
        segs, markers = df.index._find_segments()
        return df, Series(segs), markers