Ejemplo n.º 1
0
def _match_categorical_dtypes_both(lcol: CategoricalColumn,
                                   rcol: CategoricalColumn,
                                   how: str) -> Tuple[ColumnBase, ColumnBase]:
    # The commontype depends on both `how` and the specifics of the
    # categorical variables to be merged.

    ltype, rtype = lcol.dtype, rcol.dtype

    # when both are ordered and both have the same categories,
    # no casting required:
    if ltype == rtype:
        return lcol, rcol

    # Merging categorical variables when only one side is ordered is
    # ambiguous and not allowed.
    if ltype.ordered != rtype.ordered:
        raise TypeError("Merging on categorical variables with mismatched"
                        " ordering is ambiguous")

    if ltype.ordered and rtype.ordered:
        # if we get to here, categories must be what causes the
        # dtype equality check to fail. And we can never merge
        # two ordered categoricals with different categories
        raise TypeError(f"{how} merge between categoricals with "
                        "different categories is only valid when "
                        "neither side is ordered")

    # the following should now always hold
    assert not ltype.ordered and not rtype.ordered

    if how == "inner":
        # cast to category types -- we must cast them back later
        return _match_join_keys(
            lcol.cat()._decategorize(),
            rcol.cat()._decategorize(),
            how,
        )
    elif how in {"left", "leftanti", "leftsemi"}:
        # always cast to left type
        return lcol, rcol.astype(ltype)
    else:
        # merge categories
        merged_categories = cudf.concat([ltype.categories,
                                         rtype.categories]).unique()
        common_type = cudf.CategoricalDtype(categories=merged_categories,
                                            ordered=False)
        return lcol.astype(common_type), rcol.astype(common_type)
Ejemplo n.º 2
0
    def read_partition(
        fs, piece, columns, index, categories=(), partitions=(), **kwargs
    ):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            # `piece` is a file-path string
            piece = pq.ParquetDatasetPiece(
                piece, open_file_func=partial(fs.open, mode="rb")
            )
        else:
            # `piece` contains (path, row_group, partition_keys)
            piece = pq.ParquetDatasetPiece(
                piece[0],
                row_group=piece[1],
                partition_keys=piece[2],
                open_file_func=partial(fs.open, mode="rb"),
            )

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                piece.path,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(piece.path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_group=piece.row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index is not None and index[0] in df.columns:
            df = df.set_index(index[0])

        if len(piece.partition_keys) > 0:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(piece.partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]
                sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
                df[name] = CategoricalColumn(
                    data=sr._column.data, categories=categories, ordered=False
                )

        return df
Ejemplo n.º 3
0
 def __init__(self, values, **kwargs):
     kwargs = _setdefault_name(values, kwargs)
     if isinstance(values, CategoricalColumn):
         values = values
     elif isinstance(values, pd.Series) and (is_categorical_dtype(
             values.dtype)):
         values = CategoricalColumn(
             data=Buffer(values.cat.codes.values),
             categories=values.cat.categories,
             ordered=values.cat.ordered,
         )
     elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
         values = CategoricalColumn(
             data=Buffer(values.codes),
             categories=values.categories,
             ordered=values.ordered,
         )
     elif isinstance(values, (list, tuple)):
         values = column.as_column(pd.Categorical(values,
                                                  categories=values))
     super(CategoricalIndex, self).__init__(values, **kwargs)
Ejemplo n.º 4
0
def array_to_series(array):

    if isinstance(array, pa.ChunkedArray):
        return Series._concat(
            [array_to_series(chunk) for chunk in array.chunks]
        )
    if isinstance(array, pa.Column):
        return Series._concat(
            [array_to_series(chunk) for chunk in array.data.chunks]
        )

    array_len = len(array)
    null_count = array.null_count
    buffers = make_device_arrays(array)
    mask, data = buffers[0], buffers[1]
    dtype = arrow_to_pandas_dtype(array.type)

    if pa.types.is_dictionary(array.type):
        from cudf.core.column import CategoricalColumn

        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        data = CategoricalColumn(
            data=codes.data,
            mask=mask,
            null_count=null_count,
            categories=categories,
            ordered=array.type.ordered,
        )
    elif pa.types.is_string(array.type):
        import nvstrings

        offs, data = buffers[1], buffers[2]
        offs = offs[array.offset : array.offset + array_len + 1]
        data = None if data is None else data.device_ctypes_pointer.value
        mask = None if mask is None else mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            data,
            offs.device_ctypes_pointer.value,
            array_len,
            mask,
            null_count,
            True,
        )
    elif data is not None:
        data = data[array.offset : array.offset + len(array)]

    series = Series(data, dtype=dtype)

    if null_count > 0 and mask is not None and not series.has_null_mask:
        return series.set_mask(mask, null_count)

    return series
Ejemplo n.º 5
0
def build_column(data,
                 dtype,
                 mask=None,
                 offset=0,
                 children=(),
                 categories=None):
    """
    Build a Column of the appropriate type from the given parameters

    Parameters
    ----------
    data : Buffer
        The data buffer (can be None if constructin certain Column
        types like StringColumn or CategoricalColumn)
    dtype
        The dtype associated with the Column to construct
    mask : Buffer, optionapl
        The mask buffer
    offset : int, optional
    children : tuple, optional
    categories : Column, optional
        If constructing a CategoricalColumn, a Column containing
        the categories
    """
    from cudf.core.column.numerical import NumericalColumn
    from cudf.core.column.datetime import DatetimeColumn
    from cudf.core.column.categorical import CategoricalColumn
    from cudf.core.column.string import StringColumn

    dtype = pd.api.types.pandas_dtype(dtype)

    if is_categorical_dtype(dtype):
        if not len(children) == 1:
            raise ValueError(
                "Must specify exactly one child column for CategoricalColumn")
        if not isinstance(children[0], ColumnBase):
            raise TypeError("children must be a tuple of Columns")
        return CategoricalColumn(dtype=dtype,
                                 mask=mask,
                                 offset=offset,
                                 children=children)
    elif dtype.type is np.datetime64:
        return DatetimeColumn(data=data, dtype=dtype, mask=mask, offset=offset)
    elif dtype.type in (np.object_, np.str_):
        return StringColumn(mask=mask, offset=offset, children=children)
    else:
        return NumericalColumn(data=data,
                               dtype=dtype,
                               mask=mask,
                               offset=offset)
Ejemplo n.º 6
0
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        default: None
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: frame.columns.name or 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
         A    B variable value
    0    1    1        C   1.0
    1    1    3        C
    2    5    6        C   4.0
    3    1    1        D   2.0
    4    1    3        D   5.0
    5    5    6        D   6.0
    """
    assert col_level in (None, )

    # Arg cleaning
    import collections

    # id_vars
    if id_vars is not None:
        if not isinstance(id_vars, collections.abc.Sequence):
            id_vars = [id_vars]
        id_vars = list(id_vars)
        missing = set(id_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError("The following 'id_vars' are not present"
                           " in the DataFrame: {missing}"
                           "".format(missing=list(missing)))
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        if not isinstance(value_vars, collections.abc.Sequence):
            value_vars = [value_vars]
        value_vars = list(value_vars)
        missing = set(value_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError("The following 'value_vars' are not present"
                           " in the DataFrame: {missing}"
                           "".format(missing=list(missing)))
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError("Categorical columns are not yet "
                                  "supported for function")

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError("'value_vars' and 'id_vars' cannot have overlap."
                       " The following 'value_vars' are ALSO present"
                       " in 'id_vars': {overlap}"
                       "".format(overlap=list(overlap)))

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series(Buffer.null(dtype=A.dtype))

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(
            Series(Buffer(cudautils.full(size=N, value=i, dtype=np.int8))))
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        CategoricalColumn(categories=value_vars,
                          data=temp._column.data,
                          ordered=False))

    # Step 3: add values
    mdata[value_name] = Series._concat(objs=[frame[val] for val in value_vars],
                                       index=None)

    return DataFrame(mdata)