Ejemplo n.º 1
0
def get_dummies(data,
                prefix=None,
                prefix_sep='_',
                dummy_na=False,
                columns=None,
                sparse=False,
                drop_first=False,
                dtype=None):
    """
    Convert categorical variable into dummy/indicator variables, also
    known as one hot encoding.

    Parameters
    ----------
    data : Series or DataFrame
    prefix : string, list of strings, or dict of strings, default None
        Prefix placed in front of each generated column name.
        For a Series, any non-None value is converted with ``str`` and used
        as the single prefix. For a DataFrame, pass a list with length equal
        to the number of columns being encoded, or a dictionary mapping
        column names to prefixes; a plain string is rejected. When omitted
        for a DataFrame, the name of each encoded column is used.
    prefix_sep : string, default '_'
        Separator inserted between the prefix and the value. The same
        separator is used for every encoded column.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : string or list-like, default None
        Column names in the DataFrame to be encoded. A single string is
        treated as a one-element list.
        If `columns` is None then every column whose Spark data type is an
        instance of `_get_dummies_default_accept_types` is converted.
        Ignored when `data` is a Series.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
        In Koalas, this value must be "False".
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level (levels are ordered with ``sorted``).
    dtype : dtype, default 'byte'
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    dummies : DataFrame
        The columns that were not encoded, followed by the indicator
        columns. If a DataFrame has no column to encode, a copy of it is
        returned unchanged.

    Raises
    ------
    NotImplementedError
        If `sparse` is anything other than ``False``.
    ValueError
        If `prefix` is a string while `data` is not a Series, if a column to
        encode has a type outside `_get_dummies_acceptable_types`, or if the
        length of `prefix` differs from the number of encoded columns.
    KeyError
        If `prefix` is a dictionary that lacks an entry for an encoded column.

    See Also
    --------
    Series.str.get_dummies

    Examples
    --------
    >>> s = ks.Series(list('abca'))

    >>> ks.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> df = ks.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]},
    ...                   columns=['A', 'B', 'C'])

    >>> ks.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> ks.get_dummies(ks.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> ks.get_dummies(ks.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> ks.get_dummies(ks.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    if sparse is not False:
        raise NotImplementedError(
            "get_dummies currently does not support sparse")

    if isinstance(columns, str):
        columns = [columns]
    if dtype is None:
        dtype = 'byte'

    if isinstance(data, Series):
        if prefix is not None:
            prefix = [str(prefix)]
        # A Series has exactly one column to encode: itself.
        columns = [data.name]
        kdf = data.to_dataframe()
        remaining_columns = []
    else:
        if isinstance(prefix, str):
            raise ValueError(
                "get_dummies currently does not support prefix as string types"
            )
        kdf = data.copy()
        if columns is None:
            columns = [
                column for column in kdf.columns
                if isinstance(data._sdf.schema[column].dataType,
                              _get_dummies_default_accept_types)
            ]
        # `len` rather than truthiness: `columns` may be any list-like.
        if len(columns) == 0:
            return kdf

        if prefix is None:
            prefix = columns

        column_set = set(columns)
        # Columns that are not encoded are passed through, ahead of the dummies.
        remaining_columns = [
            kdf[column] for column in kdf.columns if column not in column_set
        ]

    if any(not isinstance(kdf._sdf.schema[column].dataType,
                          _get_dummies_acceptable_types)
           for column in columns):
        raise ValueError("get_dummies currently only accept {} values".format(
            ', '.join([t.typeName() for t in _get_dummies_acceptable_types])))

    if prefix is not None and len(columns) != len(prefix):
        raise ValueError(
            "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})."
            .format(len(prefix), len(columns)))

    if isinstance(prefix, dict):
        # The loop below looks prefixes up by position, so align the mapping
        # with the order of the encoded columns.
        prefix = [prefix[column] for column in columns]

    def dummy_name(column_prefix, value):
        # Closure-free on purpose: nothing here depends on the loop variables.
        if column_prefix is None:
            return str(value)
        return '{}{}{}'.format(column_prefix, prefix_sep, value)

    # One pass over the data collects the distinct values of every column.
    all_values = _reduce_spark_multi(
        kdf._sdf,
        [F.collect_set(F.col(column)).alias(column) for column in columns])
    for i, column in enumerate(columns):
        values = sorted(all_values[i])
        if drop_first:
            values = values[1:]

        column_prefix = None if prefix is None else prefix[i]

        for value in values:
            remaining_columns.append(
                (kdf[column].notnull() &
                 (kdf[column] == value)).astype(dtype).rename(
                     dummy_name(column_prefix, value)))
        if dummy_na:
            remaining_columns.append(kdf[column].isnull().astype(dtype).rename(
                dummy_name(column_prefix, 'nan')))

    return kdf[remaining_columns]
Ejemplo n.º 2
0
def get_dummies(data,
                prefix=None,
                prefix_sep='_',
                dummy_na=False,
                columns=None,
                sparse=False,
                drop_first=False,
                dtype=None):
    """
    Convert categorical variable into dummy/indicator variables, also
    known as one hot encoding.

    Parameters
    ----------
    data : Series or DataFrame
    prefix : string, list of strings, or dict of strings, default None
        Prefix placed in front of each generated column name.
        For a Series, any non-None value is converted with ``str`` and used
        as the single prefix. For a DataFrame, pass a list with length equal
        to the number of columns being encoded, or a dictionary mapping
        column names to prefixes; a plain string is rejected. When omitted
        for a DataFrame, the name of each encoded column is used.
    prefix_sep : string, default '_'
        Separator inserted between the prefix and the value. The same
        separator is used for every encoded column.
    dummy_na : bool, default False
        Add a column flagging null entries of each encoded column.
    columns : string or list-like, default None
        Column names in the DataFrame to be encoded. A single string is
        treated as a one-element list. If None, every column whose Spark
        data type is an instance of `_get_dummies_default_accept_types` is
        encoded. Ignored when `data` is a Series.
    sparse : bool, default False
        Must be ``False``; sparse output is not supported.
    drop_first : bool, default False
        Drop the first level (in ``sorted`` order) of each encoded column,
        producing k-1 dummies out of k levels.
    dtype : dtype, default 'byte'
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    dummies : DataFrame
        The columns that were not encoded, followed by the indicator
        columns. If a DataFrame has no column to encode, a copy of it is
        returned unchanged.

    Raises
    ------
    NotImplementedError
        If `sparse` is anything other than ``False``.
    ValueError
        If `prefix` is a string while `data` is not a Series, if a column to
        encode has a type outside `_get_dummies_acceptable_types`, or if the
        length of `prefix` differs from the number of encoded columns.
    KeyError
        If `prefix` is a dictionary that lacks an entry for an encoded column.
    """
    if sparse is not False:
        raise NotImplementedError(
            "get_dummies currently does not support sparse")

    # NOTE(review): `string_types` presumably comes from `six` - confirm
    # against the file's imports.
    if isinstance(columns, string_types):
        columns = [columns]
    if dtype is None:
        dtype = 'byte'

    if isinstance(data, Series):
        if prefix is not None:
            prefix = [str(prefix)]
        # A Series has exactly one column to encode: itself.
        columns = [data.name]
        kdf = data.to_dataframe()
        remaining_columns = []
    else:
        if isinstance(prefix, string_types):
            raise ValueError(
                "get_dummies currently does not support prefix as string types"
            )
        kdf = data.copy()
        if columns is None:
            columns = [
                column for column in kdf.columns
                if isinstance(data._sdf.schema[column].dataType,
                              _get_dummies_default_accept_types)
            ]
        # `len` rather than truthiness: `columns` may be any list-like.
        if len(columns) == 0:
            return kdf

        if prefix is None:
            prefix = columns

        column_set = set(columns)
        # Columns that are not encoded are passed through, ahead of the dummies.
        remaining_columns = [
            kdf[column] for column in kdf.columns if column not in column_set
        ]

    if any(not isinstance(kdf._sdf.schema[column].dataType,
                          _get_dummies_acceptable_types)
           for column in columns):
        raise ValueError("get_dummies currently only accept {} values".format(
            ', '.join([t.typeName() for t in _get_dummies_acceptable_types])))

    if prefix is not None and len(columns) != len(prefix):
        raise ValueError(
            "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})."
            .format(len(prefix), len(columns)))

    if isinstance(prefix, dict):
        # The loop below looks prefixes up by position, so align the mapping
        # with the order of the encoded columns.
        prefix = [prefix[column] for column in columns]

    def dummy_name(column_prefix, value):
        # Closure-free on purpose: nothing here depends on the loop variables.
        if column_prefix is None:
            return str(value)
        return '{}{}{}'.format(column_prefix, prefix_sep, value)

    # One pass over the data collects the distinct values of every column.
    all_values = _reduce_spark_multi(
        kdf._sdf,
        [F.collect_set(F.col(column)).alias(column) for column in columns])
    for i, column in enumerate(columns):
        values = sorted(all_values[i])
        if drop_first:
            values = values[1:]

        column_prefix = None if prefix is None else prefix[i]

        for value in values:
            remaining_columns.append(
                (kdf[column].notnull() &
                 (kdf[column] == value)).astype(dtype).rename(
                     dummy_name(column_prefix, value)))
        if dummy_na:
            remaining_columns.append(kdf[column].isnull().astype(dtype).rename(
                dummy_name(column_prefix, 'nan')))

    return kdf[remaining_columns]