Example #1
    def _compute_validity_mask(self, index, row_tuple, max_length):
        """ Computes the valid set of indices of values in the lookup
        """
        from cudf import DataFrame
        from cudf import Series
        from cudf import concat
        from cudf.utils.cudautils import arange

        lookup = DataFrame()
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            lookup[index._source_data.columns[idx]] = Series(row)
        data_table = concat(
            [
                index._source_data,
                DataFrame({"idx": Series(arange(len(index._source_data)))}),
            ],
            axis=1,
        )
        result = lookup.merge(data_table)["idx"]
        # Avoid computing levels unless the result of the merge is empty,
        # which suggests that a KeyError should be raised.
        if len(result) == 0:
            for idx, row in enumerate(row_tuple):
                if row == slice(None):
                    continue
                if row not in index.levels[idx]._column:
                    raise KeyError(row)
        return result
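The helper above treats label lookup as a join: the requested labels form a tiny lookup frame, and merging it against the source data (augmented with a row-number column) yields the matching positions. A minimal sketch of the same idea using only public cudf API; the frame and column names here are illustrative, not from the source:

import cudf

data = cudf.DataFrame({"a": [1, 2, 2]})
data["idx"] = list(range(len(data)))     # row numbers, like data_table above
lookup = cudf.DataFrame({"a": [2]})      # the labels being looked up
print(lookup.merge(data)["idx"])         # positions where a == 2 -> 1, 2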
Example #2
    def _loc_to_iloc(self, arg):
        from cudf.core.series import Series
        from cudf.core.index import Index

        if isinstance(
                arg,
            (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)):
            if len(arg) == 0:
                arg = Series(np.array([], dtype="int32"))
            else:
                arg = Series(arg)
        if isinstance(arg, Series):
            if arg.dtype in (bool, np.bool_):
                return arg
            else:
                return indices_from_labels(self._sr, arg)
        elif is_scalar(arg):
            found_index = self._sr.index.find_label_range(arg, None)[0]
            return found_index
        elif isinstance(arg, slice):
            start_index, stop_index = self._sr.index.find_label_range(
                arg.start, arg.stop)
            return slice(start_index, stop_index, arg.step)
        else:
            raise NotImplementedError(
                ".loc not implemented for label type {}".format(
                    type(arg).__name__))
Example #3
def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
    """Encode the input values as integer labels

    Parameters
    ----------
    values : Series, Index, or CuPy array
        The data to be factorized.
    sort : bool, default False
        Not yet supported; passing True raises NotImplementedError.
    na_sentinel : number, default -1
        Value to indicate missing category.
    size_hint : int, optional
        Not applicable to cudf; a warning is emitted if passed.

    Returns
    -------
    (labels, cats) : (Series, Series)
        - *labels* contains the encoded values
        - *cats* contains the categories in order, such that the N-th
          item corresponds to code N-1.

    Examples
    --------
    >>> import cudf
    >>> data = cudf.Series(['a', 'c', 'c'])
    >>> codes, uniques = cudf.factorize(data)
    >>> codes
    0    0
    1    1
    2    1
    dtype: int8
    >>> uniques
    0    a
    1    c
    dtype: object

    See Also
    --------
    cudf.core.series.Series.factorize : Encode the input values of Series.

    """
    if sort:
        raise NotImplementedError(
            "Sorting not yet supported during factorization."
        )
    if na_sentinel is None:
        raise NotImplementedError("na_sentinel can not be None.")

    if size_hint:
        warn("size_hint is not applicable for cudf.factorize")

    return_cupy_array = isinstance(values, cp.ndarray)

    values = Series(values)

    cats = values._column.dropna().unique().astype(values.dtype)

    name = values.name  # label_encoding mutates self.name
    labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel).values
    values.name = name

    return labels, cats.values if return_cupy_array else Index(cats)
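Since cats is built with dropna() and label_encoding receives na_sentinel, null rows come back encoded as the sentinel rather than as a category. A hedged sketch of that behavior:

import cudf

s = cudf.Series(["a", None, "c"])
codes, uniques = cudf.factorize(s)
# codes encode "a" -> 0 and "c" -> 1; the null row gets na_sentinel (-1)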
Example #4
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series, depending on whether the input DLPack
    tensor is 1D or 2D.
    """

    res = libdlpack.from_dlpack(pycapsule_obj)

    if res._num_columns == 1:
        return Series(res._data[0])
    else:
        return DataFrame(data=res._data)
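A usage sketch for the function above, assuming CuPy is available and its arrays expose toDlpack():

import cupy as cp
import cudf

capsule = cp.arange(5).toDlpack()   # 1D DLPack tensor
ser = cudf.from_dlpack(capsule)     # one column -> returned as a Series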
Example #5
    def to_series(self, index=None, name=None):
        """
        Create a Series with both index and values equal to the index keys.
        Useful with map for returning an indexer based on an index.

        Parameters
        ----------
        index : Index, optional
            Index of resulting Series. If None, defaults to original index.
        name : str, optional
            Name of resulting Series. If None, defaults to name of original
            index.

        Returns
        -------
        Series
            The dtype will be based on the type of the Index values.
        """

        from cudf.core.series import Series

        return Series(
            self._values,
            index=self.copy(deep=False) if index is None else index,
            name=self.name if name is None else name,
        )
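A short usage sketch, assuming cudf.Index accepts a list of values; the data here is illustrative:

import cudf

idx = cudf.Index([10, 20, 30], name="key")
s = idx.to_series()   # both the values and the index equal the index keys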
Example #6
    def run(self, df, **launch_params):
        # Get input columns
        if isinstance(self.incols, dict):
            inputs = {
                v: df[k]._column.data_array_view
                for (k, v) in self.incols.items()
            }
        else:
            inputs = {k: df[k]._column.data_array_view for k in self.incols}
        # Allocate output columns
        outputs = {}
        for k, dt in self.outcols.items():
            outputs[k] = column.column_empty(len(df), dt,
                                             False).data_array_view
        # Bind argument
        args = {}
        for dct in [inputs, outputs, self.kwargs]:
            args.update(dct)
        bound = self.sig.bind(**args)
        # Launch kernel
        self.launch_kernel(df, bound.args, **launch_params)
        # Prepare pessimistic nullmask
        if self.pessimistic_nulls:
            out_mask = make_aggregate_nullmask(df, columns=self.incols)
        else:
            out_mask = None
        # Prepare output frame
        outdf = df.copy()
        for k in sorted(self.outcols):
            outdf[k] = Series(outputs[k], index=outdf.index, nan_as_null=False)
            if out_mask is not None:
                outdf[k] = outdf[k].set_mask(out_mask.data_array_view)

        return outdf
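This run method is the machinery behind cudf's apply_rows interface; a hedged usage sketch of how such a kernel is typically driven (the kernel body and column names are illustrative):

import numpy as np
import cudf

def double(x, out):
    # one pass over the rows of the incol; out is a preallocated outcol
    for i, v in enumerate(x):
        out[i] = v * 2.0

df = cudf.DataFrame({"x": [1.0, 2.0, 3.0]})
res = df.apply_rows(double, incols=["x"], outcols={"out": np.float64},
                    kwargs={})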
Example #7
    def _get_column_major(self, df, row_tuple):
        from cudf import Series
        from cudf import DataFrame

        valid_indices = self._get_valid_indices_by_tuple(
            df.columns, row_tuple, len(df._cols))
        result = df._take_columns(valid_indices)
        if isinstance(row_tuple, (numbers.Number, slice)):
            row_tuple = [row_tuple]
        if len(result) == 0 and len(result.columns) == 0:
            result_columns = df.columns.copy(deep=False)
            clear_codes = DataFrame()
            for name in df.columns.names:
                clear_codes[name] = Series([])
            result_columns._codes = clear_codes
            result_columns._source_data = clear_codes
            result.columns = result_columns
        elif len(row_tuple) < len(
                self.levels) and (not slice(None) in row_tuple
                                  and not isinstance(row_tuple[0], slice)):
            columns = self._popn(len(row_tuple))
            result.columns = columns.take(valid_indices)
        else:
            result.columns = self.take(valid_indices)
        if len(result.columns.levels) == 1:
            columns = []
            for code in result.columns.codes[result.columns.codes.columns[0]]:
                columns.append(result.columns.levels[0][code])
            name = result.columns.names[0]
            result.columns = as_index(columns, name=name)
        if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
            result = list(result._cols.values())[0]
        return result
Example #8
    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
        from cudf.utils.cudautils import arange
        from cudf import Series

        # Instructions for Slicing
        # if tuple, get first and last elements of tuple
        # if open beginning tuple, get 0 to highest valid_index
        # if open ending tuple, get highest valid_index to len()
        # if not open end or beginning, get range lowest beginning index
        # to highest ending index
        if isinstance(row_tuple, slice):
            if (isinstance(row_tuple.start, numbers.Number)
                    or isinstance(row_tuple.stop, numbers.Number)
                    or row_tuple == slice(None)):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return arange(start, stop, step)
            start_values = self._compute_validity_mask(index, row_tuple.start,
                                                       max_length)
            stop_values = self._compute_validity_mask(index, row_tuple.stop,
                                                      max_length)
            return Series(arange(start_values.min(), stop_values.max() + 1))
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)
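The slice branch above relies on Python's slice.indices to resolve open-ended bounds against a concrete length; a tiny sketch of that standard-library behavior:

s = slice(None, 3)                    # open beginning
print(s.indices(10))                  # (0, 3, 1)
print(slice(2, None).indices(10))     # (2, 10, 1): open ending runs to len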
Example #9
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series, depending on whether the input DLPack
    tensor is 1D or 2D.

    Notes
    -----
    cuDF from_dlpack() assumes column-major (Fortran order) input. If the input
    tensor is row-major, transpose it before passing it to this function.
    """

    data, _ = libdlpack.from_dlpack(pycapsule_obj)

    if len(data) == 1:
        return Series._from_data(data)
    else:
        return DataFrame._from_data(data)
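Given the Fortran-order requirement in the Notes, a row-major 2D tensor should be converted first. A hedged sketch, assuming CuPy provides asfortranarray() and toDlpack():

import cupy as cp
import cudf

arr = cp.arange(6).reshape(2, 3)                  # row-major (C order)
df = cudf.from_dlpack(cp.asfortranarray(arr).toDlpack())  # 2D -> DataFrame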
Example #10
    def _loc_to_iloc(self, arg):
        from cudf.core.column import column
        from cudf.core.series import Series

        if is_scalar(arg):
            try:
                found_index = self._sr.index._values.find_first_value(
                    arg, closest=False
                )
                return found_index
            except (TypeError, KeyError, IndexError, ValueError):
                raise KeyError("label scalar is out of bound")

        elif isinstance(arg, slice):
            return get_label_range_or_mask(
                self._sr.index, arg.start, arg.stop, arg.step
            )
        elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)):
            if isinstance(arg, pd.MultiIndex):
                arg = cudf.MultiIndex.from_pandas(arg)

            return indices_from_labels(self._sr, arg)

        else:
            arg = Series(column.as_column(arg))
            if arg.dtype in (bool, np.bool_):
                return arg
            else:
                indices = indices_from_labels(self._sr, arg)
                if indices.null_count > 0:
                    raise KeyError("label scalar is out of bound")
                return indices
Example #11
    def _group_dataframe(self, df, levels):
        """Group dataframe.

        The output dataframe has the same number of rows as the input
        dataframe.  The rows are shuffled so that the groups are moved
        together in ascending order based on the multi-level index.

        Parameters
        ----------
        df : DataFrame
        levels : list[str]
            Column names for the multi-level index.

        Returns
        -------
        (df, segs) : namedtuple
            * df : DataFrame
                The grouped dataframe.
            * segs : Series.
                 Group starting index.
        """
        sorted_cols, offsets = libcudf.groupby.groupby_without_aggregations(
            df._columns, df[levels]._columns
        )
        outdf = cudf.DataFrame._from_columns(sorted_cols)
        segs = Series(offsets)
        outdf.columns = df.columns
        return _dfsegs_pack(df=outdf, segs=segs)
Example #12
    def _get_row_major(self, df, row_tuple):
        from cudf import Series

        valid_indices = self._get_valid_indices_by_tuple(
            df.index, row_tuple, len(df.index))
        indices = Series(valid_indices)
        result = df.take(indices)
        final = self._index_and_downcast(result, result.index, row_tuple)
        return final
Example #13
    def _apply_op(self, fn, other=None):
        from cudf.core.series import Series

        idx_series = Series(self, name=self.name)
        op = getattr(idx_series, fn)
        if other is not None:
            return as_index(op(other))
        else:
            return as_index(op())
Example #14
 def wrapper(*args, **kwargs):
     ret = passed_attr(*args, **kwargs)
     if isinstance(ret, nvstrings.nvstrings):
         ret = Series(
             column.as_column(ret),
             index=self._index,
             name=self._name,
         )
     return ret
Example #15
    def searchsorted(self, value, side="left"):
        """Find indices where elements should be inserted to maintain order

        Parameters
        ----------
        value : Column
            Column of values to search for
        side : str {'left', 'right'}, optional
            If 'left', the index of the first suitable location found is
            given. If 'right', the last such index is given.

        Returns
        -------
        An index series of insertion points with the same shape as value
        """
        from cudf.core.series import Series

        idx_series = Series(self, name=self.name)
        result = idx_series.searchsorted(value, side)
        return as_index(result)
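A minimal usage sketch, assuming a sorted numeric index and that this build accepts a list of search values:

import cudf

idx = cudf.Index([1, 3, 5, 7])
pos = idx.searchsorted([2, 6])   # insertion points -> 1 and 3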
Example #16
    def _get_row_major(self, df, row_tuple):
        from cudf import Series

        if pd.api.types.is_bool_dtype(row_tuple):
            return df[row_tuple]

        valid_indices = self._get_valid_indices_by_tuple(
            df.index, row_tuple, len(df.index))
        indices = Series(valid_indices)
        result = df.take(indices)
        final = self._index_and_downcast(result, result.index, row_tuple)
        return final
Example #17
    def _getitem_tuple_arg(self, arg):
        from cudf.core.dataframe import Series, DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        # Step 1: Gather columns
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if isinstance(columns_df, Series):
                return columns_df
        else:
            columns = self._get_column_selection(arg[1])
            columns_df = DataFrame(index=self._df.index)
            for i, col in enumerate(columns):
                columns_df.insert(i, col, self._df[col])

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            if isinstance(self._df.columns, MultiIndex):
                if isinstance(arg[0], slice):
                    start, stop, step = arg[0].indices(len(columns_df))
                    indices = arange(start, stop, step)
                    df = columns_df.take(indices)
                else:
                    df = columns_df.take(arg[0])
            else:
                df = DataFrame()
                for col in columns_df.columns:
                    # need Series() in case a scalar is returned
                    df[col] = Series(columns_df[col].loc[arg[0]])
                df.columns = columns_df.columns

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #18
    def _index_and_downcast(self, result, index, index_key):
        from cudf import DataFrame
        from cudf import Series

        if isinstance(index_key, (numbers.Number, slice)):
            index_key = [index_key]
        if (
            len(index_key) > 0 and not isinstance(index_key, tuple)
        ) or isinstance(index_key[0], slice):
            index_key = index_key[0]

        slice_access = False
        if isinstance(index_key, slice):
            slice_access = True
        out_index = DataFrame()
        # Select the last n-k columns where n is the number of _source_data
        # columns and k is the length of the indexing tuple
        size = 0
        if not isinstance(index_key, (numbers.Number, slice)):
            size = len(index_key)
        for k in range(size, len(index._source_data.columns)):
            if index.names is None:
                name = k
            else:
                name = index.names[k]
            out_index.add_column(
                name, index._source_data[index._source_data.columns[k]]
            )

        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is a single row that was not reached by
            # direct mapping, return a Series with a tuple as its name.
            result = result.T
            result = result[result.columns[0]]
        elif len(result) == 0 and slice_access is False:
            # Pandas returns an empty Series with a tuple as its name:
            # the one expected result column.
            series_name = []
            for idx, code in enumerate(index._source_data.columns):
                series_name.append(index._source_data[code][0])
            result = Series([])
            result.name = tuple(series_name)
        elif len(out_index.columns) == 1:
            # If there's only one column remaining in the output index, convert
            # it into an Index and name the final index values according
            # to the _source_data column names
            last_column = index._source_data.columns[-1]
            out_index = index._source_data[last_column]
            out_index = as_index(out_index)
            out_index.name = index.names[len(index.names) - 1]
            index = out_index
        elif len(out_index.columns) > 1:
            # Otherwise pop the leftmost levels, names, and codes from the
            # source index until it has the correct number of columns (n-k)
            result.reset_index(drop=True)
            index = index._popn(size)
        if isinstance(index_key, tuple):
            result = result.set_index(index)
        return result
Example #19
    def lower(self):
        """
        Convert strings in the Series/Index to lowercase.

        Returns
        -------
        Series/Index of str dtype
            A copy of the object with all strings converted to lowercase.
        """
        from cudf.core import Series

        return Series(
            self._parent.nvstrings.lower(), index=self._index, name=self._name
        )
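A short usage sketch of the accessor; the values are illustrative:

import cudf

s = cudf.Series(["A", "Bc", None])
print(s.str.lower())   # "a", "bc", and the null is preserved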
Example #20
    def _getitem_tuple_arg(self, arg):
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg[0], slice):
                df = columns_df[arg[0]]
            else:
                df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                    arg[0], slice) or isinstance(arg[1], slice)):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            df = DataFrame()
            for i, col in enumerate(columns_df._columns):
                # need Series() in case a scalar is returned
                df[i] = Series(col[arg[0]])

            df.index = as_index(columns_df.index[arg[0]])
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)

        if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
            from cudf.core.index import RangeIndex

            slice_len = len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Example #21
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series, depending on whether the input DLPack
    tensor is 1D or 2D.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size"
            )
        else:
            raise err
    cols = []
    for idx in range(len(valids)):
        mask = None
        if valids[idx]:
            mask = Buffer(valids[idx])
        cols.append(
            column.build_column(
                Buffer(res[idx]), dtype=res[idx].dtype, mask=mask
            )
        )
    if len(cols) == 1:
        return Series(cols[0])
    else:
        df = DataFrame()
        for idx, col in enumerate(cols):
            df[idx] = col
        return df
Example #22
    def _to_frame(self):
        from cudf import DataFrame, Series

        # for each column of codes
        # replace column with mapping from integers to levels
        df = self.codes.copy(deep=False)
        for idx, col in enumerate(df.columns):
            # use merge as a replace fn
            level = DataFrame({
                "idx":
                Series(cupy.arange(len(self.levels[idx]),
                                   dtype=df[col].dtype)),
                "level":
                self.levels[idx],
            })
            code = DataFrame({"idx": df[col]})
            df[col] = code.merge(level).level
        return df
Example #23
    def extract(self, pat, flags=0, expand=True):
        """
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the first
        match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        expand : bool, default True
            If True, return DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group or
            DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series/Index
            A DataFrame with one row for each subject string, and one column
            for each group. If `expand=False` and `pat` has only one capture
            group, then return a Series/Index.

        Notes
        -----
        The `flags` parameter is not yet supported and will raise a
        NotImplementedError if anything other than the default value is passed.
        """
        if flags != 0:
            raise NotImplementedError("`flags` parameter is not yet supported")

        from cudf.core import DataFrame, Series

        out = self._parent.nvstrings.extract(pat)
        if len(out) == 1 and expand is False:
            return Series(out[0], index=self._index, name=self._name)
        else:
            out_df = DataFrame(index=self._index)
            for idx, val in enumerate(out):
                out_df[idx] = val
            return out_df
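A usage sketch for extract; the pattern and data are illustrative:

import cudf

s = cudf.Series(["a1", "b2", "c3"])
df = s.str.extract(r"([ab])(\d)")   # two capture groups -> two columns
# rows that do not match the pattern ("c3") come back as nulls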
Example #24
    def len(self):
        """
        Computes the length of each element in the Series/Index.

        Returns
        -------
        Series or Index of int
            A Series or Index of integer values indicating the length of
            each element in the Series or Index.
        """
        from cudf.core.series import Series

        out_dev_arr = rmm.device_array(len(self._parent), dtype="int32")
        ptr = libcudf.cudf.get_ctype_ptr(out_dev_arr)
        self._parent.nvstrings.len(ptr)

        mask = None
        if self._parent.has_nulls:
            mask = self._parent.mask

        col = column.build_column(
            Buffer(out_dev_arr), np.dtype("int32"), mask=mask
        )
        return Series(col, index=self._index, name=self._name)
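A short usage sketch; the values are illustrative:

import cudf

s = cudf.Series(["a", "abc", None])
print(s.str.len())   # 1, 3, and null for the missing entry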
Example #25
    def cat(self, others=None, sep=None, na_rep=None):
        """
        Concatenate strings in the Series/Index with given separator.

        If *others* is specified, this function concatenates the Series/Index
        and elements of others element-wise. If others is not passed, then all
        values in the Series/Index are concatenated into a single string with
        a given sep.

        Parameters
        ----------
        others : Series or list of str
            Strings to be appended.
            The number of strings must match size() of this instance.
            This must be either a Series of string dtype or a Python
            list of strings.

        sep : str
            If specified, this separator will be appended to each string
            before appending the others.

        na_rep : str
            This character will take the place of any null strings
            (not empty strings) in either list.

            - If `na_rep` is None, and `others` is None, missing values in
              the Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row
              containing a missing value in any of the columns (before
              concatenation) will have a missing value in the result.

        Returns
        -------
        concat : str or Series/Index of str dtype
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of str dtype is returned.
        """
        from cudf.core import Series, Index

        if isinstance(others, Series):
            assert others.dtype == np.dtype("object")
            others = others._column.nvstrings
        elif isinstance(others, Index):
            assert others.dtype == np.dtype("object")
            others = others.as_column().nvstrings
        elif isinstance(others, StringMethods):
            """
            If others is a StringMethods then
            raise an exception
            """
            msg = "series.str is an accessor, not an array-like of strings."
            raise ValueError(msg)
        elif is_list_like(others) and others:
            """
            If others is a list-like object (in our case lists & tuples)
            just another Series/Index, great go ahead with concatenation.
            """

            """
            Picking first element and checking if it really adheres to
            list like conditions, if not we switch to next case

            Note: We have made a call not to iterate over the entire list as
            it could be more expensive if it was of very large size.
            Thus only doing a sanity check on just the first element of list.
            """
            first = others[0]

            if is_list_like(first) or isinstance(
                first, (Series, Index, pd.Series, pd.Index)
            ):
                """
                Internal elements in others list should also be
                list-like and not a regular string/byte
                """
                first = None
                for frame in others:
                    if not isinstance(frame, Series):
                        """
                        Make sure all inputs to .cat function call
                        are of type nvstrings so creating a Series object.
                        """
                        frame = Series(frame, dtype="str")

                    if first is None:
                        """
                        extracting nvstrings pointer since
                        `frame` is of type Series/Index and
                        first isn't yet initialized.
                        """
                        first = frame._column.nvstrings
                    else:
                        assert frame.dtype == np.dtype("object")
                        frame = frame._column.nvstrings
                        first = first.cat(frame, sep=sep, na_rep=na_rep)

                others = first
            elif not is_list_like(first):
                """
                Picking first element and checking if it really adheres to
                non-list like conditions.

                Note: We have made a call not to iterate over the entire
                list as it could be more expensive if it was of very
                large size. Thus only doing a sanity check on just the
                first element of list.
                """
                others = Series(others)
                others = others._column.nvstrings
        elif isinstance(others, (pd.Series, pd.Index)):
            others = Series(others)
            others = others._column.nvstrings

        data = self._parent.nvstrings.cat(
            others=others, sep=sep, na_rep=na_rep
        )
        out = Series(data, index=self._index, name=self._name)
        if len(out) == 1 and others is None:
            out = out[0]
        return out
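A usage sketch covering both modes described in the docstring; the values are illustrative:

import cudf

s = cudf.Series(["a", "b"])
print(s.str.cat(["x", "y"], sep="-"))   # element-wise: "a-x", "b-y"
print(s.str.cat(sep="|"))               # no others: single string "a|b"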
Example #26
    def _getitem_tuple_arg(self, arg):
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.column import column_empty
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (len(columns_df) == 0 and len(columns_df.columns) == 0
                    and not isinstance(arg[0], slice)):
                result = Series(column_empty(0, dtype="float64"), name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                columns_df = DataFrame()
                for i, col in enumerate(columns):
                    columns_df.insert(i, col, self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                    arg[0], slice) or isinstance(arg[1], slice)):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            df = DataFrame()
            for i, col in enumerate(columns_df._columns):
                # need Series() in case a scalar is returned
                df[i] = Series(col[arg[0]])

            df.index = as_index(columns_df.index[arg[0]])
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (isinstance(arg[0], slice)
                                        or isinstance(arg[1], slice)):
                    return list(df._data.values())[0][0]
                elif df.shape[1] > 1:
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    if len(df._data) == 0:
                        return Series(
                            column_empty(0, dtype="float64"),
                            index=df.columns,
                            name=arg[0],
                        )
                    else:
                        result_series = df[df.columns[0]]
                        result_series.index = df.columns
                        result_series.name = arg[0]
                        return result_series
                else:
                    return df[df.columns[0]]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.core.index import RangeIndex

            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Example #27
    def to_series(self):
        from cudf.core.series import Series

        return Series(self._values)
Example #28
    def __init__(self,
                 levels=None,
                 codes=None,
                 labels=None,
                 names=None,
                 **kwargs):
        from cudf.core.series import Series
        from cudf import DataFrame

        super().__init__()

        self._name = None

        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' "
                "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            source_data = kwargs["source_data"].copy(deep=False)
            source_data.reset_index(drop=True, inplace=True)

            if isinstance(source_data, pd.DataFrame):
                nan_as_null = kwargs.get("nan_as_null", None)
                source_data = DataFrame.from_pandas(source_data,
                                                    nan_as_null=nan_as_null)
            names = names if names is not None else source_data._data.names
            # if names are unique
            # try using those as the source_data column names:
            if len(dict.fromkeys(names)) == len(names):
                source_data.columns = names
            self._data = source_data._data
            self.names = names
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
                names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        if not isinstance(codes, DataFrame) and not isinstance(
                codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            for i, codes in enumerate(codes):
                name = column_names[i] or i
                codes = column.as_column(codes)
                self._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError("MultiIndex has unequal number of levels and "
                             "codes and is inconsistent!")

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            if -1 in self._codes[name].values:
                # Must account for null(s) in _source_data column
                level = DataFrame(
                    {name: [None] + list(self._levels[i])},
                    index=range(-1, len(self._levels[i])),
                )
            else:
                level = DataFrame({name: self._levels[i]})

            import cudf._lib as libcudf

            source_data[name] = libcudf.copying.gather(
                level, codes._data.columns[0])._data[name]

        self._data = source_data._data
        self.names = names
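A usage sketch of the levels/codes form this constructor handles; the level and code values are illustrative:

import cudf

mi = cudf.MultiIndex(
    levels=[["a", "b"], [1, 2]],
    codes=[[0, 0, 1], [0, 1, 1]],
    names=["letter", "number"],
)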
Example #29
    def __init__(self,
                 levels=None,
                 codes=None,
                 labels=None,
                 names=None,
                 **kwargs):
        from cudf.core.series import Series

        self.name = None
        self.names = names
        self._source_data = None
        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' "
                "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            self._source_data = kwargs["source_data"].reset_index(drop=True)
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
                names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        from cudf import DataFrame

        if not isinstance(codes, DataFrame) and not isinstance(
                codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            for i, codes in enumerate(codes):
                name = column_names[i] or i
                codes = column.as_column(codes)
                self._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError("MultiIndex has unequal number of levels and "
                             "codes and is inconsistent!")

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        self._source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            if -1 in self._codes[name].values:
                # Must account for null(s) in _source_data column
                level = DataFrame(
                    {name: [None] + list(self._levels[i])},
                    index=range(-1, len(self._levels[i])),
                )
            else:
                level = DataFrame({name: self._levels[i]})
            level = DataFrame(index=codes).join(level)
            self._source_data[name] = level[name].reset_index(drop=True)

        self.names = [None] * len(self._levels) if names is None else names
Example #30
    def _concat(cls, objs, dtype=None):
        from cudf.core.series import Series
        from cudf.core.column import (
            StringColumn,
            CategoricalColumn,
            NumericalColumn,
        )

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # If all columns are `NumericalColumn` with different dtypes,
        # cast them to a common dtype. Note that pure-null columns can
        # always be cast.
        not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
        if len(not_null_cols) > 0 and (
            len(
                [
                    o
                    for o in not_null_cols
                    if not isinstance(o, NumericalColumn)
                    or np.issubdtype(o.dtype, np.datetime64)
                ]
            )
            == 0
        ):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            for i in range(len(objs)):
                objs[i] = objs[i].astype(common_dtype)

        # Find the first non-null column:
        head = objs[0]
        for i, obj in enumerate(objs):
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    from cudf.core.column import column_empty_like

                    objs[i] = column_empty_like(
                        head, dtype=head.dtype, masked=True, newsize=len(obj)
                    )

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (
                Series(ColumnBase._concat([o.categories for o in objs]))
                .drop_duplicates()
                ._column
            )
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.dtype == head.dtype):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError(
                "Result of concat cannot have "
                "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
            )

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            result_nbytes = sum(o._nbytes for o in objs)
            if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
                raise MemoryError(
                    "Result of concat cannot have > {}  bytes".format(
                        libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                    )
                )
            objs = [o.nvstrings for o in objs]
            return as_column(nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = any(col.nullable for col in objs)

        if is_categorical_dtype(head):
            data_dtype = head.codes.dtype
            data = None
            children = (column_empty(newsize, dtype=head.codes.dtype),)
        else:
            data_dtype = head.dtype
            data = Buffer.empty(size=newsize * data_dtype.itemsize)
            children = ()

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = build_column(
            data=data, dtype=head.dtype, mask=mask, children=children
        )

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudf.concat._column_concat(objs, col)

        return col
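The common-dtype step above defers to NumPy; a tiny sketch of the resolution rule it applies (np.find_common_type is deprecated in newer NumPy releases but matches the call used here):

import numpy as np

# int32 and float64 resolve to float64, the dtype all columns are cast to
print(np.find_common_type([np.dtype("int32"), np.dtype("float64")], []))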