Ejemplo n.º 1
0
def test_dataframe_setitem_scaler_bool_inconsistency():
    """Assign a two-row frame through a boolean list key in pandas and cuDF.

    The same assignment is performed on a pandas frame and on a cuDF frame,
    and the two results are compared with ``assert_eq``.
    """
    expected = pd.DataFrame({"a": [1, 2, 3]})
    expected[[True, False, True]] = pd.DataFrame({"a": [-1, -2]})

    actual = DataFrame({"a": [1, 2, 3]})
    actual[[True, False, True]] = DataFrame({"a": [-1, -2]})

    assert_eq(expected, actual)
Ejemplo n.º 2
0
def test_dataframe_setitem_new_columns(df, arg, value):
    """Set ``df[arg] = value`` on pandas and cuDF and compare, dtypes included.

    A pandas ``DataFrame`` value is converted with ``DataFrame.from_pandas``
    before it is assigned to the cuDF frame; any other value is reused as-is.
    """
    gdf = DataFrame.from_pandas(df)

    if isinstance(value, pd.DataFrame):
        gpu_value = DataFrame.from_pandas(value)
    else:
        gpu_value = value

    df[arg] = value
    gdf[arg] = gpu_value
    assert_eq(df, gdf, check_dtype=True)
Ejemplo n.º 3
0
def from_dlpack(pycapsule_obj):
    """Build a cuDF object from a DLPack tensor.

    DLPack is an open-source in-memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    The PyCapsule passed in wraps a pointer to a DLPack tensor. The tensor
    data is deep-copied into the returned cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        PyCapsule object encapsulating the input DLPack tensor pointer.

    Returns
    -------
    A cuDF Series when the converted data has exactly one column, otherwise
    a cuDF DataFrame (i.e. Series for 1D input, DataFrame for 2D input).

    Notes
    -----
    cuDF from_dlpack() assumes column-major (Fortran order) input. A
    row-major tensor must be transposed before it is passed in.
    """

    # The second element of the returned pair is not used here.
    columns, _ = libdlpack.from_dlpack(pycapsule_obj)

    result_type = Series if len(columns) == 1 else DataFrame
    return result_type._from_data(columns)
Ejemplo n.º 4
0
def from_dlpack(pycapsule_obj):
    """Build a cuDF object from a DLPack tensor.

    DLPack is an open-source in-memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    The PyCapsule passed in wraps a pointer to a DLPack tensor. The tensor
    data is deep-copied into the returned cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        PyCapsule object encapsulating the input DLPack tensor pointer.

    Returns
    -------
    A cuDF Series when the converted result has exactly one column, otherwise
    a cuDF DataFrame (i.e. Series for 1D input, DataFrame for 2D input).
    """

    table = libdlpack.from_dlpack(pycapsule_obj)

    if table._num_columns != 1:
        return DataFrame(data=table._data)
    return Series(table._data[0])
Ejemplo n.º 5
0
def read_feather(path, *args, **kwargs):
    """{docstring}"""
    # NOTE(review): the docstring above is a literal placeholder kept
    # verbatim; presumably it is filled in outside this block -- confirm.

    message = (
        "Using CPU via PyArrow to read feather dataset, this may "
        "be GPU accelerated in the future"
    )
    warnings.warn(message)

    # Read on the host with PyArrow, then convert the Arrow table.
    return DataFrame.from_arrow(feather.read_table(path, *args, **kwargs))
Ejemplo n.º 6
0
def test_kernel_shallow_copy():
    """Run ``add_one`` on column "a" and compare the frame to its shallow copy.

    The kernel is launched on the array returned by ``to_gpu_array()`` of
    column "a"; afterwards the frame and the copy made with ``deep=False``
    are asserted equal.
    """
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    gdf = DataFrame.from_pandas(pd.DataFrame(rows, columns=["a", "b", "c"]))
    shallow = gdf.copy(deep=False)

    col_a = gdf["a"]
    # Launch configuration: one block, one thread per element.
    kernel = add_one[1, len(col_a)]
    kernel(col_a.to_gpu_array())

    assert_eq(gdf, shallow)
Ejemplo n.º 7
0
def test_cudf_dataframe_copy(copy_fn, ncols, data_type):
    """Check that ``copy_fn`` applied to a cuDF frame yields an equal frame.

    The frame has ``ncols`` columns named "a", "b", ... each holding 20
    random integers in [0, 1000) cast to ``data_type``.
    """
    pdf = pd.DataFrame()
    for offset in range(ncols):
        values = pd.Series(np.random.randint(0, 1000, 20))
        pdf[chr(ord("a") + offset)] = values.astype(data_type)

    gdf = DataFrame.from_pandas(pdf)
    duplicate = copy_fn(gdf)
    assert_eq(gdf, duplicate)
Ejemplo n.º 8
0
def test_kernel_deep_copy():
    """Run ``add_one`` on column "b" and check a deep copy now prints differently.

    The kernel is launched on the column's ``data_array_view``; afterwards
    the whitespace-split string renderings of the frame and of the copy made
    with ``deep=True`` must differ.
    """
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    gdf = DataFrame.from_pandas(pd.DataFrame(rows, columns=["a", "b", "c"]))
    deep = gdf.copy(deep=True)

    col_b = gdf["b"]
    kernel = add_one[1, len(col_b)]
    kernel(col_b._column.data_array_view)

    source_tokens = gdf.to_string().split()
    copy_tokens = deep.to_string().split()
    assert source_tokens != copy_tokens
Ejemplo n.º 9
0
def test_series_setitem_index():
    """Assign a Series whose index is in reverse order to an existing column.

    The same assignment is done in pandas and in cuDF and the resulting
    frames are compared without checking dtypes.
    """
    expected = pd.DataFrame(
        index=[1, 2, 3], data={"b": [-1, -2, -3], "c": [1, 2, 3]}
    )
    expected["b"] = pd.Series(index=[3, 2, 1], data=[12, 11, 10])

    actual = DataFrame(
        index=[1, 2, 3], data={"b": [-1, -2, -3], "c": [1, 2, 3]}
    )
    actual["b"] = Series(index=[3, 2, 1], data=[12, 11, 10])

    assert_eq(expected, actual, check_dtype=False)
Ejemplo n.º 10
0
def test_dataframe_copy_shallow():
    """Overwrite column "b" on shallow copies and compare against the sources.

    For both pandas and cuDF, column "b" of the ``deep=False`` copy is
    replaced with zeros and then asserted equal to column "b" of the frame
    it was copied from.
    """
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    pdf = pd.DataFrame(rows, columns=["a", "b", "c"])
    gdf = DataFrame.from_pandas(pdf)

    copy_pdf = pdf.copy(deep=False)
    copy_gdf = gdf.copy(deep=False)

    for shallow in (copy_pdf, copy_gdf):
        shallow["b"] = [0, 0, 0]

    for source, shallow in ((pdf, copy_pdf), (gdf, copy_gdf)):
        assert_eq(source["b"], shallow["b"])
Ejemplo n.º 11
0
def test_kernel_deep_copy():
    """Run ``add_one`` on column "b" and check a deep copy now prints differently.

    After the kernel launch, the whitespace-split string renderings of the
    frame and of the copy made with ``deep=True`` must differ.
    """
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    gdf = DataFrame.from_pandas(pd.DataFrame(rows, columns=["a", "b", "c"]))
    deep = gdf.copy(deep=True)

    col_b = gdf["b"]
    # column.to_gpu_array goes through to_dense_buffer, which returns a
    # copy, so the underlying buffer is accessed directly before asking for
    # the gpu array.
    kernel = add_one[1, len(col_b)]
    kernel(col_b.data.to_gpu_array())

    source_tokens = gdf.to_string().split()
    copy_tokens = deep.to_string().split()
    assert source_tokens != copy_tokens
Ejemplo n.º 12
0
def test_dataframe_deep_copy_and_insert(copy_parameters):
    """Overwrite column "b" on a copy and compare it with the source column.

    ``copy_parameters["fn"]`` produces the copy for both pandas and cuDF;
    whether column "b" of source and copy still match must equal
    ``copy_parameters["expected_equality"]`` in both libraries.
    """
    rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    pdf = pd.DataFrame(rows, columns=["a", "b", "c"])
    gdf = DataFrame.from_pandas(pdf)

    make_copy = copy_parameters["fn"]
    copy_pdf = make_copy(pdf)
    copy_gdf = make_copy(gdf)

    copy_pdf["b"] = [0, 0, 0]
    copy_gdf["b"] = [0, 0, 0]

    pandas_match = np.array_equal(pdf["b"].values, copy_pdf["b"].values)
    cudf_match = np.array_equal(
        gdf["b"].to_array(), copy_gdf["b"].to_array()
    )

    assert pandas_match == copy_parameters["expected_equality"]
    assert cudf_match == copy_parameters["expected_equality"]
Ejemplo n.º 13
0
def test_cudf_dataframe_copy_then_insert(copy_fn, ncols, data_type):
    """Insert a new column into a copy and check the source prints differently.

    Both the pandas frame and the cuDF frame are copied with ``copy_fn``; a
    column "aa" is added to each copy, after which the whitespace-split
    string rendering of each copy must differ from that of its source.
    """

    def _random_column():
        # 20 random integers in [0, 1000), cast to the requested dtype.
        return pd.Series(np.random.randint(0, 1000, 20)).astype(data_type)

    pdf = pd.DataFrame()
    for offset in range(ncols):
        pdf[chr(offset + ord("a"))] = _random_column()

    df = DataFrame.from_pandas(pdf)
    copy_df = copy_fn(df)
    copy_pdf = copy_fn(pdf)

    copy_df["aa"] = _random_column()
    copy_pdf["aa"] = _random_column()

    assert copy_pdf.to_string().split() != pdf.to_string().split()
    assert copy_df.to_string().split() != df.to_string().split()
Ejemplo n.º 14
0
def test_setitem_dataframe_series_inplace(df):
    """Compare pandas and cuDF frames after in-place ``replace`` on column "a".

    The replacement is first applied directly on ``frame["a"]`` and then on
    a column object fetched into a local variable; the parent frames are
    compared after each step.
    """
    pdf = df
    gdf = DataFrame.from_pandas(pdf)

    # In-place replace on the column expression itself.
    for frame in (pdf, gdf):
        frame["a"].replace(1, 500, inplace=True)
    assert_eq(pdf, gdf)

    # In-place replace through a held reference to the column.
    psr_a, gsr_a = pdf["a"], gdf["a"]
    for held_column in (psr_a, gsr_a):
        held_column.replace(500, 501, inplace=True)
    assert_eq(pdf, gdf)
Ejemplo n.º 15
0
def from_dlpack(pycapsule_obj):
    """Build a cuDF object from a DLPack tensor.

    DLPack is an open-source in-memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    The PyCapsule passed in wraps a pointer to a DLPack tensor. The tensor
    data is deep-copied into the returned cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        PyCapsule object encapsulating the input DLPack tensor pointer.

    Returns
    -------
    A cuDF Series when exactly one column was produced, otherwise a cuDF
    DataFrame whose columns are labelled by position (i.e. Series for 1D
    input, DataFrame for 2D input).

    Raises
    ------
    ValueError
        If the conversion reports ``GDF_DATASET_EMPTY`` (0-size tensor).
    GDFError
        Any other conversion error is re-raised unchanged.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        # Only the empty-dataset error is translated; everything else is
        # propagated as-is.
        if str(err) != "b'GDF_DATASET_EMPTY'":
            raise err
        raise ValueError(
            "Cannot create a cuDF Object from a DLPack tensor of 0 size"
        )

    cols = []
    for position, valid in enumerate(valids):
        # A falsy validity entry means the column gets no mask.
        mask = Buffer(valid) if valid else None
        values = res[position]
        cols.append(
            column.build_column(Buffer(values), dtype=values.dtype, mask=mask)
        )

    if len(cols) == 1:
        return Series(cols[0])

    df = DataFrame()
    for position, col in enumerate(cols):
        df[position] = col
    return df
Ejemplo n.º 16
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a ``(rows, columns)`` key against ``self._df``.

        ``arg[1]`` is handed to ``self._get_column_selection`` to pick the
        columns; ``arg[0]`` is then applied to every selected column to pick
        the rows (the step comments label this as the iloc path).

        Parameters
        ----------
        arg : tuple
            Two-element key; ``arg[0]`` selects rows, ``arg[1]`` columns.

        Returns
        -------
        A DataFrame, or whatever ``self._downcast_to_series`` returns when
        ``self._can_downcast_to_series`` approves, or ``df[0]`` for the
        single-row MultiIndex case with no slice in the key.
        """
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns_df = self._get_column_selection(arg[1])
        # The column selection is given the source frame's index object.
        columns_df._index = self._df._index

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows: slices use frame slicing, anything else is
            # delegated to the index's row-major lookup. This branch always
            # returns and never reaches steps 3 and 4 below.
            if isinstance(arg[0], slice):
                df = columns_df[arg[0]]
            else:
                df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                    arg[0], slice) or isinstance(arg[1], slice)):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            # Rebuild the frame column by column under positional labels;
            # the real column labels are restored just below.
            df = DataFrame()
            for i, col in enumerate(columns_df._columns):
                # need Series() in case a scalar is returned
                df[i] = Series(col[arg[0]])

            df.index = as_index(columns_df.index[arg[0]])
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)

        # A completely empty result from a slice key gets a RangeIndex
        # derived from the slice bounds normalised against the frame length.
        if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
            from cudf.core.index import RangeIndex

            slice_len = len(self._df)
            # NOTE(review): `step` is computed but not passed to RangeIndex.
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Ejemplo n.º 17
0
def test_cummin(dtype, nelem):
    """Compare cuDF ``cummin`` with pandas on random data.

    The check is done once on a standalone Series and once on a Series
    stored as column "a" of a frame, with a tolerance of 4 decimals for
    float32 and 6 decimals otherwise.
    """
    # int8 data is drawn from a narrow range to keep data in range
    range_kwargs = {"low": -2, "high": 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **range_kwargs)

    decimal = 6
    if dtype == np.float32:
        decimal = 4

    # standalone series
    gs = Series(data)
    ps = pd.Series(data)
    np.testing.assert_array_almost_equal(
        gs.cummin().to_array(), ps.cummin(), decimal=decimal
    )

    # series accessed as a named column of a dataframe
    gdf = DataFrame()
    gdf["a"] = Series(data)
    pdf = pd.DataFrame()
    pdf["a"] = pd.Series(data)
    np.testing.assert_array_almost_equal(
        gdf.a.cummin().to_array(), pdf.a.cummin(), decimal=decimal
    )
Ejemplo n.º 18
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a ``(rows, columns)`` key against ``self._df``.

        ``arg[1]`` is handed to ``self._get_column_selection`` to pick the
        columns; ``arg[0]`` is then applied to each selected column through
        its ``.loc`` accessor to pick the rows.

        Parameters
        ----------
        arg : tuple
            Two-element key; ``arg[0]`` selects rows, ``arg[1]`` columns.

        Returns
        -------
        The result of the index's ``_get_row_major`` when the row index is a
        MultiIndex; otherwise a DataFrame, or whatever
        ``self._downcast_to_series`` returns when
        ``self._can_downcast_to_series`` approves.
        """
        from cudf.core.dataframe import Series, DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf import MultiIndex

        # Step 1: Gather columns
        columns_df = self._get_column_selection(arg[1])
        # The column selection is given the source frame's index object.
        columns_df._index = self._df._index

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows are fully delegated; steps 3 and 4 are skipped.
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            df = DataFrame()
            for col in columns_df.columns:
                # need Series() in case a scalar is returned
                df[col] = Series(columns_df[col].loc[arg[0]])
            df.columns = columns_df.columns

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                # Use the slice start as the label, falling back to the
                # first label of the source index for an open-ended slice.
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    # Boolean key: index comes from `take` on the source
                    # index with the mask column.
                    df.index = self._df.index.take(row_selection)
                else:
                    # Otherwise the key values themselves become the index.
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Ejemplo n.º 19
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a ``(rows, columns)`` key against ``self._df``.

        ``arg[1]`` selects the columns (through the MultiIndex column-major
        lookup when the frame has MultiIndex columns, otherwise through
        ``self._get_column_selection``); ``arg[0]`` is then applied to every
        selected column to pick the rows (the step comments label this as
        the iloc path).

        Parameters
        ----------
        arg : tuple
            Two-element key; ``arg[0]`` selects rows, ``arg[1]`` columns.

        Returns
        -------
        A DataFrame, a Series (downcast or specially built for MultiIndex
        columns), or a single element when a one-row result is requested
        without any slice in the key.
        """
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.column import column_empty
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (len(columns_df) == 0 and len(columns_df.columns) == 0
                    and not isinstance(arg[0], slice)):
                # Nothing selected and a non-slice row key: return an empty
                # float64 Series named after the row key.
                result = Series(column_empty(0, dtype="float64"), name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                # Slice row key: assemble a new frame from the selected
                # columns and give it the source frame's index object.
                columns_df = DataFrame()
                for i, col in enumerate(columns):
                    columns_df.insert(i, col, self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows: this branch always returns and never reaches
            # steps 3 and 4 below.
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                    arg[0], slice) or isinstance(arg[1], slice)):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            # Rebuild the frame column by column under positional labels;
            # the real column labels are restored just below.
            df = DataFrame()
            for i, col in enumerate(columns_df._columns):
                # need Series() in case a scalar is returned
                df[i] = Series(col[arg[0]])

            df.index = as_index(columns_df.index[arg[0]])
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (isinstance(arg[0], slice)
                                        or isinstance(arg[1], slice)):
                    # Single element requested: first row of first column.
                    return list(df._data.values())[0][0]
                elif df.shape[1] > 1:
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    if len(df._data) == 0:
                        return Series(
                            column_empty(0, dtype="float64"),
                            index=df.columns,
                            name=arg[0],
                        )
                    else:
                        result_series = df[df.columns[0]]
                        result_series.index = df.columns
                        result_series.name = arg[0]
                        return result_series
                else:
                    return df[df.columns[0]]
            return self._downcast_to_series(df, arg)

        # A completely empty result gets a RangeIndex derived from the slice
        # bounds. The slice check is required: `arg[0]` may be a non-slice
        # key (e.g. a list), which has neither `.stop` nor `.indices` and
        # would otherwise raise AttributeError here.
        if (df.shape[0] == 0 and df.shape[1] == 0
                and isinstance(arg[0], slice)):
            from cudf.core.index import RangeIndex

            slice_len = arg[0].stop or len(self._df)
            # The normalised step is not used by the RangeIndex below.
            start, stop, _ = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Ejemplo n.º 20
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a ``(rows, columns)`` key against ``self._df``.

        ``arg[1]`` selects the columns (through the MultiIndex column-major
        lookup when the frame has MultiIndex columns, otherwise through
        ``self._get_column_selection``); ``arg[0]`` then selects the rows.

        Parameters
        ----------
        arg : tuple
            Two-element key; ``arg[0]`` selects rows, ``arg[1]`` columns.

        Returns
        -------
        The result of the index's ``_get_row_major`` when the row index is a
        MultiIndex; otherwise a DataFrame, or whatever
        ``self._downcast_to_series`` returns when
        ``self._can_downcast_to_series`` approves.
        """
        from cudf.core.dataframe import DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        # Step 1: Gather columns
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
        else:
            columns = self._get_column_selection(arg[1])
            columns_df = DataFrame()
            for col in columns:
                columns_df.add_column(name=col, data=self._df[col])
        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows are fully delegated; steps 3 and 4 are skipped.
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            if isinstance(self._df.columns, MultiIndex):
                # MultiIndex columns: rows are gathered positionally with
                # `take`; a slice is first expanded into explicit positions.
                if isinstance(arg[0], slice):
                    start, stop, step = arg[0].indices(len(columns_df))
                    indices = arange(start, stop, step)
                    df = columns_df.take(indices)
                else:
                    df = columns_df.take(arg[0])
            else:
                # Plain columns: rows are looked up per column via `.loc`.
                df = DataFrame()
                for col in columns_df.columns:
                    df[col] = columns_df[col].loc[arg[0]]
        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                # Use the slice start as the label, falling back to the
                # first label of the source index for an open-ended slice.
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    # Boolean key: index comes from `take` on the source
                    # index with the mask column.
                    df.index = self._df.index.take(row_selection)
                else:
                    # Otherwise the key values themselves become the index.
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Ejemplo n.º 21
0
def test_dataframe_setitem_scaler_keyerror():
    """Assigning a scalar through a list key naming an absent column raises."""
    gdf = DataFrame({"a": [1, 2, 3]})
    absent_columns = ["x"]
    with pytest.raises(KeyError):
        gdf[absent_columns] = 0
Ejemplo n.º 22
0
def where(
    frame: Union[Series, Index, DataFrame],
    cond: Any,
    other: Any = None,
    inplace: bool = False,
) -> Optional[Union[Frame]]:
    """
    Replace values where the condition is False.

    Parameters
    ----------
    frame : Series, Index or DataFrame
        The object whose values are conditionally replaced.
    cond : bool Series/DataFrame, array-like
        Where cond is True, keep the original value.
        Where False, replace with corresponding value from other.
        Callables are not supported.
    other: scalar, list of scalars, Series/DataFrame
        Entries where cond is False are replaced with
        corresponding value from other. Callables are not
        supported. Default is None.

        DataFrame expects only Scalar or array like with scalars or
        dataframe with same dimension as frame.

        Series expects only scalar or series like with same length
    inplace : bool, default False
        Whether to perform the operation in place on the data.

    Returns
    -------
    Same type as caller
        The value returned by ``frame._mimic_inplace``; presumably ``None``
        when ``inplace=True`` (per the ``Optional`` return annotation).

    Raises
    ------
    ValueError
        If ``cond`` does not match the shape/length of ``frame``, or if the
        number of replacement values differs from the number of columns of
        a DataFrame ``frame``.
    NotImplementedError
        If ``frame`` is not a DataFrame but ``other`` is a DataFrame.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]})
    >>> df.where(df % 2 == 0, [-1, -1])
       A  B
    0 -1 -1
    1  4 -1
    2 -1  8

    >>> ser = cudf.Series([4, 3, 2, 1, 0])
    >>> ser.where(ser > 2, 10)
    0     4
    1     3
    2    10
    3    10
    4    10
    dtype: int64
    >>> ser.where(ser > 2)
    0       4
    1       3
    2    <NA>
    3    <NA>
    4    <NA>
    dtype: int64
    """

    if isinstance(frame, DataFrame):
        # --- DataFrame path: normalise `cond` into a DataFrame first. ---
        if hasattr(cond, "__cuda_array_interface__"):
            cond = DataFrame(cond,
                             columns=frame._column_names,
                             index=frame.index)
        elif (hasattr(cond, "__array_interface__")
              and cond.__array_interface__["shape"] != frame.shape):
            raise ValueError("conditional must be same shape as self")
        elif not isinstance(cond, DataFrame):
            cond = frame.from_pandas(pd.DataFrame(cond))

        common_cols = set(frame._column_names).intersection(
            set(cond._column_names))
        if len(common_cols) > 0:
            # If `frame` and `cond` are having unequal index,
            # then re-index `cond`.
            if not frame.index.equals(cond.index):
                cond = cond.reindex(frame.index)
        else:
            if cond.shape != frame.shape:
                raise ValueError(
                    """Array conditional must be same shape as self""")
            # Setting `frame` column names to `cond`
            # as `cond` has no column names.
            cond.columns = frame.columns

        # NOTE(review): `inplace` is not forwarded here, unlike the call in
        # the non-DataFrame path below -- confirm this is intentional.
        (
            source_df,
            others,
        ) = _normalize_columns_and_scalars_type(frame, other)
        if isinstance(other, Frame):
            # Work with the replacement frame's columns, matched to
            # `frame`'s columns by position below.
            others = others._data.columns

        out_df = DataFrame(index=frame.index)
        if len(frame._columns) != len(others):
            raise ValueError(
                """Replacement list length or number of dataframe columns
                should be equal to Number of columns of dataframe""")
        for i, column_name in enumerate(frame._column_names):
            input_col = source_df._data[column_name]
            other_column = others[i]
            if column_name in cond._data:
                if isinstance(input_col, cudf.core.column.CategoricalColumn):
                    # Categorical columns are processed through their codes;
                    # the replacement is translated to a code as well.
                    if cudf.utils.dtypes.is_scalar(other_column):
                        try:
                            other_column = input_col._encode(other_column)
                        except ValueError:
                            # When other is not present in categories,
                            # fill with Null.
                            other_column = None
                        other_column = cudf.Scalar(other_column,
                                                   dtype=input_col.codes.dtype)
                    elif isinstance(other_column,
                                    cudf.core.column.CategoricalColumn):
                        other_column = other_column.codes
                    input_col = input_col.codes

                result = cudf._lib.copying.copy_if_else(
                    input_col, other_column, cond._data[column_name])

                if isinstance(
                        frame._data[column_name],
                        cudf.core.column.CategoricalColumn,
                ):
                    # Rebuild a categorical column from the resulting codes
                    # using the original categories and ordering.
                    result = cudf.core.column.build_categorical_column(
                        categories=frame._data[column_name].categories,
                        codes=cudf.core.column.as_column(result.base_data,
                                                         dtype=result.dtype),
                        mask=result.base_mask,
                        size=result.size,
                        offset=result.offset,
                        ordered=frame._data[column_name].ordered,
                    )
            else:
                # Column has no counterpart in `cond`: the whole column is
                # given an all-null mask.
                out_mask = cudf._lib.null_mask.create_null_mask(
                    len(input_col),
                    state=cudf._lib.null_mask.MaskState.ALL_NULL,
                )
                result = input_col.set_mask(out_mask)
            out_df[column_name] = frame[column_name].__class__(result)

        return frame._mimic_inplace(out_df, inplace=inplace)

    else:
        # --- Non-DataFrame path (Series / Index). ---
        if isinstance(other, DataFrame):
            raise NotImplementedError(
                "cannot align with a higher dimensional Frame")
        input_col = frame._data[frame.name]
        cond = cudf.core.column.as_column(cond)
        if len(cond) != len(frame):
            raise ValueError(
                """Array conditional must be same shape as self""")

        (
            input_col,
            other,
        ) = _normalize_columns_and_scalars_type(frame, other, inplace)

        if isinstance(input_col, cudf.core.column.CategoricalColumn):
            # Categorical data is processed through its codes; the
            # replacement is translated to a code as well.
            if cudf.utils.dtypes.is_scalar(other):
                try:
                    other = input_col._encode(other)
                except ValueError:
                    # When other is not present in categories,
                    # fill with Null.
                    other = None
                other = cudf.Scalar(other, dtype=input_col.codes.dtype)
            elif isinstance(other, cudf.core.column.CategoricalColumn):
                other = other.codes

            input_col = input_col.codes

        result = cudf._lib.copying.copy_if_else(input_col, other, cond)

        if isinstance(frame._data[frame.name],
                      cudf.core.column.CategoricalColumn):
            # Rebuild a categorical column from the resulting codes using
            # the original categories and ordering.
            result = cudf.core.column.build_categorical_column(
                categories=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).categories,
                codes=cudf.core.column.as_column(result.base_data,
                                                 dtype=result.dtype),
                mask=result.base_mask,
                size=result.size,
                offset=result.offset,
                ordered=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).ordered,
            )

        # Wrap the resulting column back into the caller's container type.
        if isinstance(frame, Index):
            result = Index(result, name=frame.name)
        else:
            result = frame._copy_construct(data=result)

        return frame._mimic_inplace(result, inplace=inplace)
Ejemplo n.º 23
0
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a cuDF DataFrame

    The Arrow schema (and any dictionary batches) is read from the buffer
    returned by ``load_buffer``, while the column data is read from the GPU
    IPC buffer identified by ``tdf.df_handle``.

    Parameters
    ----------
    tdf : TDataFrame
        Provides ``df_handle`` (GPU IPC handle) and ``sm_handle`` /
        ``sm_size`` (passed to ``load_buffer``).

    Returns
    -------
    df : cudf DataFrame
        Has ``set_tdf`` / ``get_tdf`` bound as instance methods, and
        ``set_tdf(tdf)`` has already been called on it.
    """

    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    # Everything that runs while the shared memory is attached is wrapped in
    # try/finally so that the segment is released even when parsing fails.
    try:
        buffer = pa.BufferReader(schema_buffer)
        schema = pa.read_schema(buffer)

        # Dictionary Memo functionality used to
        # deserialize on the C++ side is not
        # exposed on the pyarrow side, so we need to
        # handle this on our own.
        dict_memo = {}

        try:
            dict_batch_reader = pa.RecordBatchStreamReader(buffer)
            updated_fields = []

            for f in schema:
                if pa.types.is_dictionary(f.type):
                    # One batch is consumed per dictionary-typed field; its
                    # first column is kept as that field's dictionary and
                    # the field is re-declared with its index type.
                    msg = dict_batch_reader.read_next_batch()
                    dict_memo[f.name] = msg.column(0)
                    updated_fields.append(
                        pa.field(f.name, f.type.index_type)
                    )
                else:
                    updated_fields.append(pa.field(f.name, f.type))

            schema = pa.schema(updated_fields)
        except pa.ArrowInvalid:
            # This message does not have any dictionary encoded
            # columns
            pass

        # View the GPU IPC buffer as a flat device array of bytes.
        dtype = np.dtype(np.byte)
        darr = cuda.devicearray.DeviceNDArray(
            shape=ipc_buf.size,
            strides=dtype.itemsize,
            dtype=dtype,
            gpu_data=ipc_buf.to_numba(),
        )

        reader = GpuArrowReader(schema, darr)
        df = DataFrame()
        df.set_tdf = MethodType(set_tdf, df)
        df.get_tdf = MethodType(get_tdf, df)

        for k, v in reader.to_dict().items():
            if k in dict_memo:
                # Re-attach the dictionary to the index values.
                df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
            else:
                df[k] = v

        df.set_tdf(tdf)
    finally:
        # free shared memory from Python
        # https://github.com/omnisci/pymapd/issues/46
        # https://github.com/omnisci/pymapd/issues/31
        shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))

    return df
Ejemplo n.º 24
0
def test_dataframe_setitem_bool_mask_scaler(df, arg, value):
    """Set ``frame[arg] = value`` on pandas and cuDF frames and compare them."""
    gdf = DataFrame.from_pandas(df)

    # Same key and value for both libraries, pandas first.
    for frame in (df, gdf):
        frame[arg] = value

    assert_eq(df, gdf)
Ejemplo n.º 25
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a label-based key against ``self._df``.

        When ``arg`` is a tuple, ``arg[1]`` is handed to
        ``self._get_column_selection`` to pick columns and ``arg[0]`` selects
        rows by label. A non-tuple ``arg`` is only handled explicitly on the
        MultiIndex path, where all columns are kept.

        Parameters
        ----------
        arg : tuple, MultiIndex or pandas.MultiIndex, or other row key
            The key to resolve.

        Returns
        -------
        A DataFrame, or whatever ``self._downcast_to_series`` returns when
        ``self._can_downcast_to_series`` approves.

        Raises
        ------
        KeyError
            If the label join on the non-MultiIndex path finds no rows.
        """
        from uuid import uuid4

        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.dataframe import DataFrame
        from cudf.core.index import as_index

        # Step 1: Gather columns
        if isinstance(arg, tuple):
            columns_df = self._get_column_selection(arg[1])
            columns_df._index = self._df._index
        else:
            columns_df = self._df

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            # Every branch of the MultiIndex path returns directly; steps 3
            # and 4 are skipped.
            if isinstance(arg, (MultiIndex, pd.MultiIndex)):
                if isinstance(arg, pd.MultiIndex):
                    arg = MultiIndex.from_pandas(arg)

                indices = indices_from_labels(columns_df, arg)
                return columns_df.take(indices)

            else:
                if isinstance(arg, tuple):
                    return columns_df.index._get_row_major(columns_df, arg[0])
                else:
                    return columns_df.index._get_row_major(columns_df, arg)
        else:
            # NOTE(review): from here on `arg[0]` is used unconditionally,
            # so this path presumably expects `arg` to be a tuple -- confirm.
            if isinstance(arg[0], slice):
                # The helper returns either a positional slice or a boolean
                # mask, and each is applied with the matching method.
                out = get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(out)
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # If a scalar, there is possibility of having duplicates.
                    # Join would get all the duplicates. So, converting it to
                    # an array kind.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                if len(tmp_arg[0]) == 0:
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

                if pd.api.types.is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(tmp_arg[0])
                else:
                    # Label lookup via an inner join: a helper frame indexed
                    # by the requested labels carries their request order in
                    # a uniquely named temporary column, which is used to
                    # sort the joined rows and is then dropped.
                    tmp_col_name = str(uuid4())
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    df = other_df.join(columns_df, how="inner")
                    # as join is not assigning any names to index,
                    # update it over here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(tmp_col_name)
                    df.drop(columns=[tmp_col_name], inplace=True)
                    # There were no indices found
                    if len(df) == 0:
                        raise KeyError(arg)

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                # Use the slice start as the label, falling back to the
                # first label of the source index for an open-ended slice.
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    # Boolean key: index comes from `take` on the source
                    # index with the mask column.
                    df.index = self._df.index.take(row_selection)
                else:
                    # Otherwise the key values themselves become the index.
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df