Example #1
0
    def test_empty_print(self):
        """repr of an empty Categorical: with a name, without, and with
        no levels at all."""
        factor = Categorical([], ["a", "b", "c"], name="cat")
        expected = ("Categorical([], Name: cat, Levels (3): "
                    "Index([a, b, c], dtype=object)")
        # hack because array_repr changed in numpy > 1.6.x
        actual = repr(factor)
        # raw strings: "\(" in a plain literal is an invalid escape
        # sequence (SyntaxWarning today, SyntaxError in future Pythons)
        pat = r"Index\(\['a', 'b', 'c']"
        sub = "Index([a, b, c]"
        actual = re.sub(pat, sub, actual)

        self.assertEqual(actual, expected)

        factor = Categorical([], ["a", "b", "c"])
        expected = ("Categorical([], Levels (3): "
                    "Index([a, b, c], dtype=object)")
        # hack because array_repr changed in numpy > 1.6.x
        actual = repr(factor)
        pat = r"Index\(\['a', 'b', 'c']"
        sub = "Index([a, b, c]"
        actual = re.sub(pat, sub, actual)

        self.assertEqual(actual, expected)

        factor = Categorical([], [])
        expected = ("Categorical([], Levels (0): " "Index([], dtype=object)")
        self.assertEqual(repr(factor), expected)
Example #2
0
    def _create_categorical(self, data, categories=None, ordered=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing

        Returns
        -------
        Categorical
        """
        if isinstance(data, ABCCategorical):
            # already categorical: apply only the explicit overrides
            if categories is not None:
                data = data.set_categories(categories)
            if ordered is not None:
                data = data.set_ordered(ordered)
            return data

        # non-categorical input: construct one, defaulting ordered to False
        from pandas.core.categorical import Categorical
        if ordered is None:
            ordered = False
        return Categorical(data, categories=categories, ordered=ordered)
Example #3
0
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  retbins=False,
                  precision=3,
                  name=None,
                  include_lowest=False):
    """
    Assign each value of `x` to one of the intervals bounded by `bins`.

    Parameters
    ----------
    x : array-like or Series
        Values to bin.
    bins : ndarray
        Bin edges; must be unique.
    right : bool, default True
        Whether intervals include the right edge.
    labels : array-like, False or None
        Labels for the bins; ``False`` returns raw integer codes.
    retbins : bool, default False
        Whether to also return `bins`.
    precision : int, default 3
        Starting display precision for auto-generated interval labels.
    name : str, optional
        Name for the resulting Categorical; taken from `x` if a Series.
    include_lowest : bool, default False
        Whether the first interval is closed on the left.

    Returns
    -------
    Categorical or ndarray, optionally followed by `bins`.

    Raises
    ------
    ValueError
        If bin edges are not unique, or `labels` has the wrong length.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    # raise ValueError (was a bare Exception) so callers can catch it
    # specifically; consistent with the label-length check below
    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    # NaNs and values outside the outermost edges get no bin
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump the precision until the formatted edges are distinct
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins,
                                            precision,
                                            right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
Example #4
0
    def _create_categorical(self, data, categories=None, ordered=None):
        """
        *this is an internal non-public method*

        Build a Categorical from ``data``, applying the requested
        ``categories``/``ordered`` overrides.

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing

        Returns
        -------
        Categorical
        """
        if not isinstance(data, ABCCategorical):
            from pandas.core.categorical import Categorical
            # plain data: construct fresh, ordered defaults to False
            return Categorical(
                data,
                categories=categories,
                ordered=ordered if ordered is not None else False)

        # existing categorical: mutate only what was explicitly requested
        if categories is not None:
            data = data.set_categories(categories)
        if ordered is not None:
            data = data.set_ordered(ordered)
        return data
Example #5
0
    def test_periodindex(self):
        """Categorical.from_array factorizes PeriodIndex values into
        labels plus sorted period levels."""
        # sorted input
        idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
                            '2014-03', '2014-03'], freq='M')
        cat1 = Categorical.from_array(idx1)
        exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
        self.assert_numpy_array_equal(cat1.labels,
                                      np.array([0, 0, 1, 1, 2, 2]))
        self.assertTrue(cat1.levels.equals(exp_idx))

        # unsorted input: levels are still the sorted uniques
        idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
                            '2014-03', '2014-01'], freq='M')
        cat2 = Categorical.from_array(idx2)
        self.assert_numpy_array_equal(cat2.labels,
                                      np.array([2, 2, 1, 0, 2, 0]))
        self.assertTrue(cat2.levels.equals(exp_idx))

        # descending input with a gap in the months
        idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
                            '2013-08', '2013-07', '2013-05'], freq='M')
        cat3 = Categorical.from_array(idx3)
        exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
                               '2013-10', '2013-11', '2013-12'], freq='M')
        self.assert_numpy_array_equal(cat3.labels,
                                      np.array([6, 5, 4, 3, 2, 1, 0]))
        self.assertTrue(cat3.levels.equals(exp_idx))
Example #6
0
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):
    """Assign each value of ``x`` to an interval bounded by ``bins``.

    Returns the ``(result, bins)`` pair; ``labels=False`` yields raw
    integer codes instead of a Categorical.
    """
    if duplicates not in ('raise', 'drop'):
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'drop':
            bins = unique_bins
        else:
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))

    which_side = 'left' if right else 'right'
    bin_ids = bins.searchsorted(x, side=which_side)

    if include_lowest:
        bin_ids[x == bins[0]] = 1

    # id 0 or len(bins) means the value fell outside every interval
    missing = isnull(x) | (bin_ids == len(bins)) | (bin_ids == 0)
    any_missing = missing.any()

    if labels is False:
        codes = bin_ids - 1
        if any_missing:
            codes = codes.astype(np.float64)
            np.putmask(codes, missing, np.nan)
        return codes, bins

    if labels is None:
        # retry with more precision until the edges format distinctly
        attempts = 0
        while True:
            try:
                levels = _format_levels(bins, precision, right=right,
                                        include_lowest=include_lowest,
                                        dtype=dtype)
                break
            except ValueError:
                attempts += 1
                precision += 1
                if attempts >= 20:
                    raise
    else:
        if len(labels) != len(bins) - 1:
            raise ValueError('Bin labels must be one fewer than '
                             'the number of bin edges')
        levels = labels

    levels = np.asarray(levels, dtype=object)
    np.putmask(bin_ids, missing, 0)
    return Categorical(bin_ids - 1, levels, ordered=True,
                       fastpath=True), bins
Example #7
0
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, precision=3, name=None, include_lowest=False):
    """Map each element of ``x`` onto a bin defined by ``bins`` edges,
    re-wrapping Series input as a Series of the same index/name."""
    is_series = isinstance(x, Series)
    orig_index = x.index if is_series else None
    if is_series and name is None:
        name = x.name

    x = np.asarray(x)

    bin_ids = bins.searchsorted(x, side="left" if right else "right")

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        bin_ids[x == bins[0]] = 1

    # out-of-range or NaN values get no bin
    missing = isnull(x) | (bin_ids == len(bins)) | (bin_ids == 0)
    any_missing = missing.any()

    if labels is not False:
        if labels is None:
            # widen the display precision until edges format uniquely
            attempts = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right, include_lowest=include_lowest)
                    break
                except ValueError:
                    attempts += 1
                    precision += 1
                    if attempts >= 20:
                        raise
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError("Bin labels must be one fewer than the number of bin edges")
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(bin_ids, missing, 0)
        fac = Categorical(bin_ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = bin_ids - 1
        if any_missing:
            fac = fac.astype(np.float64)
            np.putmask(fac, missing, np.nan)

    if is_series:
        fac = Series(fac, index=orig_index, name=name)

    return (fac, bins) if retbins else fac
Example #8
0
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  retbins=False,
                  precision=3,
                  name=None,
                  include_lowest=False):
    """Bin ``x`` by ``bins`` edges, building interval-string labels
    inline when ``labels`` is None."""
    if name is None and isinstance(x, Series):
        name = x.name
    arr = np.asarray(x)

    ids = bins.searchsorted(arr, side='left' if right else 'right')

    if include_lowest:
        ids[arr == bins[0]] = 1

    # NaNs and out-of-range values have no bin
    na_mask = com.isnull(arr) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        out = ids - 1
        if has_nas:
            out = out.astype(np.float64)
            np.putmask(out, na_mask, np.nan)
    else:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            pairs = zip(bins, bins[1:])
            if right:
                levels = ['(%s, %s]' % (fmt(lo), fmt(hi))
                          for lo, hi in pairs]
                if include_lowest:
                    # first interval is closed on the left as well
                    levels[0] = '[' + levels[0][1:]
            else:
                levels = ['[%s, %s)' % (fmt(lo), fmt(hi))
                          for lo, hi in pairs]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        out = Categorical(ids - 1, levels, name=name)

    if not retbins:
        return out

    return out, bins
Example #9
0
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation in the current implementation this is
        always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """

    from pandas.core.categorical import Categorical

    def convert_categorical(x):
        # coerce to object dtype
        if com.is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    # any dtype kind beyond object/category forces an object-dtype concat
    if get_dtype_kinds(to_concat) - set(['object', 'category']):
        # convert to object type and perform a regular concat
        return _concat_compat(
            [np.array(x, copy=False, dtype=object) for x in to_concat], axis=0)

    # we could have object blocks and categoricals here
    # if we only have a single categoricals then combine everything
    # else its a non-compat categorical
    categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)]

    # validate the categories
    categories = categoricals[0]
    rawcats = categories.categories
    for x in categoricals[1:]:
        if not categories.is_dtype_equal(x):
            raise ValueError("incompatible categories in categorical concat")

    # we've already checked that all categoricals are the same, so if their
    # length is equal to the input then we have all the same categories
    if len(categoricals) == len(to_concat):
        # concating numeric types is much faster than concating object types
        # and fastpath takes a shorter path through the constructor
        return Categorical(np.concatenate([x.codes for x in to_concat],
                                          axis=0),
                           rawcats,
                           ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        # mixed object + categorical: concat as object values, then rebuild
        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
                                   axis=0)
        return Categorical(concatted, rawcats)
Example #10
0
def panel_index(time, panels, names=None):
    """
    Returns a multi-index suitable for a panel-like DataFrame

    Parameters
    ----------
    time : array-like
        Time index, does not have to repeat
    panels : array-like
        Panel index, does not have to repeat
    names : list, optional
        List containing the names of the indices; defaults to
        ``['time', 'panel']``

    Returns
    -------
    multi_index : MultiIndex
        Time index is the first level, the panels are the second level.

    Examples
    --------
    >>> years = range(1960,1963)
    >>> panels = ['A', 'B', 'C']
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'),
                (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'),
                (1962, 'C')], dtype=object)

    or

    >>> import numpy as np
    >>> years = np.repeat(range(1960,1963), 3)
    >>> panels = np.tile(['A', 'B', 'C'], 3)
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'),
                (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'),
                (1962, 'C')], dtype=object)
    """
    # None sentinel instead of a mutable-list default argument
    if names is None:
        names = ['time', 'panel']
    time, panels = _ensure_like_indices(time, panels)
    time_factor = Categorical.from_array(time)
    panel_factor = Categorical.from_array(panels)

    labels = [time_factor.labels, panel_factor.labels]
    levels = [time_factor.levels, panel_factor.levels]
    return MultiIndex(levels,
                      labels,
                      sortorder=None,
                      names=names,
                      verify_integrity=False)
Example #11
0
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Label each element of ``x`` with the interval of ``bins`` it
    falls into."""
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    codes = bins.searchsorted(x, side='left' if right else 'right')

    if include_lowest:
        codes[x == bins[0]] = 1

    # code 0 / len(bins) marks a NaN or out-of-range value
    nan_mask = com.isnull(x) | (codes == len(bins)) | (codes == 0)

    if labels is False:
        result = codes - 1
        if nan_mask.any():
            result = result.astype(np.float64)
            np.putmask(result, nan_mask, np.nan)
    else:
        if labels is not None:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels
        else:
            # raise the precision until the edges format distinctly,
            # giving up after 20 failed attempts
            failures = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                    break
                except ValueError:
                    failures += 1
                    if failures >= 20:
                        raise
                    precision += 1

        levels = np.asarray(levels, dtype=object)
        np.putmask(codes, nan_mask, 0)
        result = Categorical(codes - 1, levels, name=name)

    return (result, bins) if retbins else result
Example #12
0
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None):
    """Assign ``x`` to intervals bounded by ``bins``; always returns the
    ``(result, bins)`` pair."""
    ids = bins.searchsorted(x, side='left' if right else 'right')

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    # searchsorted id 0 or len(bins) marks an out-of-range value
    outside = isnull(x) | (ids == len(bins)) | (ids == 0)

    if labels is False:
        result = ids - 1
        if outside.any():
            result = result.astype(np.float64)
            np.putmask(result, outside, np.nan)
        return result, bins

    if labels is None:
        # keep widening the precision until the edges format distinctly
        tries = 0
        while True:
            try:
                levels = _format_levels(bins, precision, right=right,
                                        include_lowest=include_lowest,
                                        dtype=dtype)
                break
            except ValueError:
                tries += 1
                precision += 1
                if tries >= 20:
                    raise
    elif len(labels) != len(bins) - 1:
        raise ValueError('Bin labels must be one fewer than '
                         'the number of bin edges')
    else:
        levels = labels

    levels = np.asarray(levels, dtype=object)
    np.putmask(ids, outside, 0)
    return Categorical(ids - 1, levels, ordered=True, fastpath=True), bins
Example #13
0
def make_axis_dummies(frame, axis="minor", transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression you might
        call:
            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())
    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    level = {"major": 0, "minor": 1}.get(axis, axis)

    items = frame.index.levels[level]
    labels = frame.index.labels[level]
    if transform is not None:
        # re-factorize after mapping so columns reflect transformed labels
        cat = Categorical.from_array(items.map(transform).take(labels))
        labels, items = cat.labels, cat.levels

    identity = np.eye(len(items), dtype=float)
    return DataFrame(identity.take(labels, axis=0),
                     columns=items, index=frame.index)
Example #14
0
def get_dummies(data, prefix=None, prefix_sep='_'):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use

    Returns
    -------
    dummies : DataFrame
    """
    cat = Categorical.from_array(np.asarray(data))
    # one identity-matrix row per observation
    dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0)

    if prefix is None:
        columns = cat.levels
    else:
        columns = ['%s%s%s' % (prefix, prefix_sep, str(level))
                   for level in cat.levels]

    index = data.index if isinstance(data, Series) else None

    return DataFrame(dummy_mat, index=index, columns=columns)
Example #15
0
    def get_result(self):
        # TODO: find a better way than this masking business

        # pivoted values plus a mask marking which cells were observed
        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                # keep only columns holding at least one observed value
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = algos.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            categories = self.is_categorical.categories
            ordered = self.is_categorical.ordered
            values = [
                Categorical(values[:, i],
                            categories=categories,
                            ordered=ordered) for i in range(values.shape[-1])
            ]

        return DataFrame(values, index=index, columns=columns)
Example #16
0
    def get_result(self):
        # TODO: find a better way than this masking business

        # pivoted values plus a mask marking which cells were observed
        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                # keep only columns holding at least one observed value
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = algos.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            categories = self.is_categorical.categories
            ordered = self.is_categorical.ordered
            values = [Categorical.from_array(values[:, i],
                                             categories=categories,
                                             ordered=ordered)
                      for i in range(values.shape[-1])]

        return DataFrame(values, index=index, columns=columns)
Example #17
0
    def test_constructor_unsortable(self):
        # skipped unconditionally; the body below is kept but unreachable
        raise nose.SkipTest('skipping for now')

        arr = np.array([1, 2, 3, datetime.now()], dtype='O')

        # it works!
        factor = Categorical.from_array(arr)
Example #18
0
    def _create_from_codes(self, codes, categories=None, ordered=None,
                           name=None):
        """
        *this is an internal non-public method*

        create the correct categorical from codes

        Parameters
        ----------
        codes : new codes
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        name : optional name attribute, defaults to existing

        Returns
        -------
        CategoricalIndex
        """

        from pandas.core.categorical import Categorical
        if categories is None:
            categories = self.categories
        if ordered is None:
            ordered = self.ordered
        if name is None:
            name = self.name
        # pass the resolved local `ordered` (previously self.ordered was
        # passed, silently ignoring an explicit `ordered` argument)
        cat = Categorical.from_codes(codes, categories=categories,
                                     ordered=ordered)
        return CategoricalIndex(cat, name=name)
Example #19
0
    def _create_from_codes(self,
                           codes,
                           categories=None,
                           ordered=None,
                           name=None):
        """
        *this is an internal non-public method*

        create the correct categorical from codes

        Parameters
        ----------
        codes : new codes
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        name : optional name attribute, defaults to existing

        Returns
        -------
        CategoricalIndex
        """

        from pandas.core.categorical import Categorical
        if categories is None:
            categories = self.categories
        if ordered is None:
            ordered = self.ordered
        if name is None:
            name = self.name
        # pass the resolved local `ordered` (previously self.ordered was
        # passed, silently ignoring an explicit `ordered` argument)
        cat = Categorical.from_codes(codes,
                                     categories=categories,
                                     ordered=ordered)
        return CategoricalIndex(cat, name=name)
Example #20
0
    def melt_stub(df, stub, i, j, value_vars, sep):
        # long-format frame for this stub's columns
        long_df = melt(df, id_vars=i, value_vars=value_vars,
                       value_name=stub.rstrip(sep), var_name=j)
        long_df[j] = Categorical(long_df[j])
        # strip the "<stub><sep>" prefix from the variable labels
        long_df[j] = long_df[j].str.replace(re.escape(stub + sep), "")
        return long_df.set_index(i + [j])
Example #21
0
def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression you might
        call:
            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())
    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    num = {'major': 0, 'minor': 1}.get(axis, axis)

    level_values = frame.index.levels[num]
    level_codes = frame.index.labels[num]
    if transform is not None:
        # map the labels first, then factorize the transformed values
        transformed = level_values.map(transform)
        cat = Categorical.from_array(transformed.take(level_codes))
        level_codes = cat.labels
        level_values = cat.levels

    dummy_values = np.eye(len(level_values), dtype=float)
    dummy_values = dummy_values.take(level_codes, axis=0)

    return DataFrame(dummy_values, columns=level_values, index=frame.index)
Example #22
0
    def test_constructor_unsortable(self):
        # skipped unconditionally; the body below is kept but unreachable
        raise nose.SkipTest('skipping for now')

        arr = np.array([1, 2, 3, datetime.now()], dtype='O')

        # it works!
        factor = Categorical.from_array(arr)
Example #23
0
def get_dummies(data, prefix=None, prefix_sep='_'):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use

    Returns
    -------
    dummies : DataFrame
    """
    cat = Categorical.from_array(np.asarray(data))
    n_levels = len(cat.levels)
    # select one identity-matrix row per observation
    dummy_mat = np.eye(n_levels).take(cat.labels, axis=0)

    dummy_cols = cat.levels
    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, str(v))
                      for v in cat.levels]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Example #24
0
def panel_index(time, panels, names=None):
    """
    Returns a multi-index suitable for a panel-like DataFrame

    Parameters
    ----------
    time : array-like
        Time index, does not have to repeat
    panels : array-like
        Panel index, does not have to repeat
    names : list, optional
        List containing the names of the indices; defaults to
        ``['time', 'panel']``

    Returns
    -------
    multi_index : MultiIndex
        Time index is the first level, the panels are the second level.

    Examples
    --------
    >>> years = range(1960,1963)
    >>> panels = ['A', 'B', 'C']
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'),
                (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'),
                (1962, 'C')], dtype=object)

    or

    >>> import numpy as np
    >>> years = np.repeat(range(1960,1963), 3)
    >>> panels = np.tile(['A', 'B', 'C'], 3)
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'),
                (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'),
                (1962, 'C')], dtype=object)
    """
    # None sentinel instead of a mutable-list default argument
    if names is None:
        names = ['time', 'panel']
    time, panels = _ensure_like_indices(time, panels)
    time_factor = Categorical.from_array(time)
    panel_factor = Categorical.from_array(panels)

    labels = [time_factor.labels, panel_factor.labels]
    levels = [time_factor.levels, panel_factor.levels]
    return MultiIndex(levels, labels, sortorder=None, names=names,
                      verify_integrity=False)
Example #25
0
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    """Build a (Sparse)DataFrame of indicator columns for 1d ``data``."""
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    # all-NaN input with dummy_na off: nothing to encode
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            idx = data.index
        else:
            idx = np.arange(len(data))
        return DataFrame(index=idx) if not sparse else SparseDataFrame(index=idx)

    codes = cat.codes.copy()
    if dummy_na:
        # give NaN its own trailing category
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]

    index = data.index if isinstance(data, Series) else None

    if not sparse:
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

    # sparse path: collect row positions per column, then build SparseSeries
    sparse_series = {}
    N = len(data)
    sp_indices = [[] for _ in dummy_cols]
    for row, code in enumerate(codes):
        if code == -1:
            # Blank entries if not dummy_na and code == -1, #GH4446
            continue
        sp_indices[code].append(row)

    for col, ixs in zip(dummy_cols, sp_indices):
        sarr = SparseArray(np.ones(len(ixs)),
                           sparse_index=IntIndex(N, ixs), fill_value=0)
        sparse_series[col] = SparseSeries(data=sarr, index=index)

    return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
Example #26
0
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    """One-hot encode a single 1-d column.

    ``prefix``/``prefix_sep`` build the dummy column names, ``dummy_na``
    adds a trailing NaN-indicator column, and ``sparse`` selects a
    SparseDataFrame result.  NaN rows are all-zero unless ``dummy_na``.
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    codes = cat.codes.copy()
    if dummy_na:
        # Remap the -1 NaN code onto a new trailing level.
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Bucket row numbers by category code, one bucket per column.
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)

    else:
        # Identity-matrix row i is the one-hot vector for code i.
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Example #27
0
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Assign each value of ``x`` to an interval defined by ``bins``.

    Parameters
    ----------
    x : array-like or Series
    bins : sorted array of bin edges
    right : bool, default True
        When True intervals are right-closed, '(a, b]'.
    labels : array-like, False, or None
        Custom labels (must be ``len(bins) - 1`` of them), None to
        generate interval-string labels, or False to return raw codes.
    retbins : bool, default False
        Also return ``bins``.
    precision : int, default 3
        Digits used when formatting generated labels.
    name : str or None
        Name for the resulting Categorical; taken from ``x`` if a Series.
    include_lowest : bool, default False
        Treat the first interval as closed on the left.

    Returns
    -------
    Categorical (or ndarray of codes when ``labels is False``),
    optionally paired with ``bins``.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        # A value equal to the lowest edge belongs to the first bin.
        ids[x == bins[0]] = 1

    # Out-of-range values (id 0 or len(bins)) and NaNs are all missing.
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            if right:
                levels = ['(%s, %s]' % (fmt(a), fmt(b))
                           for a, b in zip(bins, bins[1:])]
                if include_lowest:
                    levels[0] = '[' + levels[0][1:]
            else:
                levels = ['[%s, %s)' % (fmt(a), fmt(b))
                           for a, b in zip(bins, bins[1:])]

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        # Force missing entries to id 0 so that ``ids - 1`` makes them
        # the -1 (missing) code of the Categorical.
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            # Integer codes cannot hold NaN; upcast to float first.
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
def sparse_dummies(df, column):
    """Return a sparse one-hot matrix for ``df[column]`` plus column names.

    Parameters
    ----------
    df : DataFrame
    column : str
        Name of the column to encode.

    Returns
    -------
    (scipy.sparse.csr_matrix, numpy.ndarray)
        An (n_rows, n_categories) 0/1 matrix and the generated
        "<column>_<i>" names, one per category code.
    """
    categories = Categorical(df[column])
    column_names = np.array(
        [f"{column}_{str(i)}" for i in range(len(categories.categories))])
    N = len(categories)
    # np.int was removed in NumPy 1.24; use the platform index type.
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # One 1 per row at (row, category code).
    # NOTE(review): NaN entries get code -1, which scipy rejects as a
    # column index -- callers presumably pass NaN-free columns.
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
Example #29
0
    def _create_categorical(self,
                            data,
                            categories=None,
                            ordered=None,
                            dtype=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        dtype : CategoricalDtype, defaults to existing

        Returns
        -------
        Categorical
        """
        # Unwrap a categorical-dtype Series/index down to its Categorical.
        if (isinstance(data, (ABCSeries, type(self)))
                and is_categorical_dtype(data)):
            data = data.values

        if not isinstance(data, ABCCategorical):
            # Only default ordered=False when dtype does not carry it.
            if ordered is None and dtype is None:
                ordered = False
            from pandas.core.categorical import Categorical
            data = Categorical(data,
                               categories=categories,
                               ordered=ordered,
                               dtype=dtype)
        else:
            from pandas.core.dtypes.dtypes import CategoricalDtype

            # Already a Categorical: adjust in place, preferring the
            # explicit categories/ordered over any passed dtype.
            if categories is not None:
                data = data.set_categories(categories, ordered=ordered)
            elif ordered is not None and ordered != data.ordered:
                data = data.set_ordered(ordered)
            if isinstance(dtype, CategoricalDtype):
                # we want to silently ignore dtype='category'
                data = data._set_dtype(dtype)
        return data
Example #30
0
    def where(self, cond, other=None):
        """Replace entries where ``cond`` is False with ``other``.

        ``other`` defaults to this index's NA value; the result keeps
        the same categories and orderedness as ``self``.
        """
        from pandas.core.categorical import Categorical

        fill = self._na_value if other is None else other
        selected = np.where(cond, self.values, fill)
        recat = Categorical(selected,
                            categories=self.categories,
                            ordered=self.ordered)
        return self._shallow_copy(recat, **self._get_attributes_dict())
Example #31
0
    def melt_stub(df, stub, i, j, value_vars, sep):
        """Melt one stub's columns into long form indexed by ``i + [j]``.

        The ``j`` column holds each melted variable's suffix (the stub
        and separator stripped off), cast to numeric when possible.
        """
        newdf = melt(df, id_vars=i, value_vars=value_vars,
                     value_name=stub.rstrip(sep), var_name=j)
        # Categorical conversion before .str.replace -- presumably so the
        # replace runs per category rather than per row; TODO confirm.
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

        # GH17627 Cast numerics suffixes to int/float
        newdf[j] = to_numeric(newdf[j], errors='ignore')

        return newdf.set_index(i + [j])
Example #32
0
    def _indicator_post_merge(self, result):
        """Convert the temporary per-side indicator columns into the final
        categorical merge-indicator column, then drop the temporaries."""

        # A side that contributed no row leaves NaN; treat that as 0.
        result['_left_indicator'] = result['_left_indicator'].fillna(0)
        result['_right_indicator'] = result['_right_indicator'].fillna(0)

        # Summing the two flags yields 1 (left only), 2 (right only) or
        # 3 (both), per the rename mapping below.
        result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3])
        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])

        result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1)

        return result
Example #33
0
    def test_na_flags_int_levels(self):
        # #1457
        # -1 codes must surface as nulls even when levels are integers.

        levels = range(10)
        labels = np.random.randint(0, 10, 20)
        # Plant NaN codes at every fifth position.
        labels[::5] = -1

        cat = Categorical(labels, levels)
        # smoke test: repr must not raise on NaN codes
        repr(cat)

        self.assert_(np.array_equal(com.isnull(cat), labels == -1))
Example #34
0
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe.

    Parameters
    ----------
    df : DataFrame
    column : str
        Name of the column to encode.

    Returns
    -------
    (scipy.sparse.csr_matrix, numpy.ndarray)
        An (n_rows, n_categories) 0/1 matrix and the generated
        "<column>_<i>" names, one per category code.
    '''
    categories = Categorical(df[column])
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(len(categories.categories))
    ])
    N = len(categories)
    # np.int was removed in NumPy 1.24; use the platform index type.
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # One 1 per row at (row, category code).
    # NOTE(review): NaN entries get code -1, which scipy rejects as a
    # column index -- callers presumably pass NaN-free columns.
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
Example #35
0
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they
       appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(
            com.is_dtype_equal(c.categories.dtype, first.categories.dtype)
            for c in to_union):
        raise TypeError("dtype of categories must be the same")

    # Union of all categories, keeping first-appearance order.
    cats = first.categories
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    # Remap each input's codes onto the combined category positions.
    # NOTE(review): a -1 (NaN) code indexes ``indexer`` from the end via
    # take's wrap-around -- confirm NaN codes round-trip as intended.
    new_codes = []
    for c in to_union:
        indexer = categories.get_indexer(c.categories)
        new_codes.append(indexer.take(c.codes))
    codes = np.concatenate(new_codes)
    return Categorical(codes,
                       categories=categories,
                       ordered=False,
                       fastpath=True)
Example #36
0
    def _create_categorical(self, data, categories=None, ordered=None,
                            dtype=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        dtype : CategoricalDtype, defaults to existing

        Returns
        -------
        Categorical
        """
        # Unwrap a categorical-dtype Series/index down to its Categorical.
        if (isinstance(data, (ABCSeries, type(self))) and
                is_categorical_dtype(data)):
            data = data.values

        if not isinstance(data, ABCCategorical):
            # Only default ordered=False when dtype does not carry it.
            if ordered is None and dtype is None:
                ordered = False
            from pandas.core.categorical import Categorical
            data = Categorical(data, categories=categories, ordered=ordered,
                               dtype=dtype)
        else:
            from pandas.core.dtypes.dtypes import CategoricalDtype

            # Already a Categorical: adjust in place, preferring explicit
            # categories/ordered over any passed dtype.
            if categories is not None:
                data = data.set_categories(categories, ordered=ordered)
            elif ordered is not None and ordered != data.ordered:
                data = data.set_ordered(ordered)
            if isinstance(dtype, CategoricalDtype):
                # we want to silently ignore dtype='category'
                data = data._set_dtype(dtype)
        return data
Example #37
0
    def test_big_print(self):
        # With 600 elements the repr must truncate the body with "..."
        # and still report levels, name and length.
        factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat')
        expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
                    " a", " b", " c", " a", "...", " c", " a", " b", " c",
                    " a", " b", " c", " a", " b", " c", " a", " b", " c",
                    "Levels (3): Index([a, b, c], dtype=object)",
                    "Name: cat, Length: 600" ]
        expected = "\n".join(expected)

        # hack because array_repr changed in numpy > 1.6.x
        actual = repr(factor)
        pat = "Index\(\['a', 'b', 'c']"
        sub = "Index([a, b, c]"
        actual = re.sub(pat, sub, actual)

        self.assertEqual(actual, expected)
Example #38
0
    def test_describe(self):
        # string type
        # counts/freqs over the 8-element string fixture in self.factor
        desc = self.factor.describe()
        expected = DataFrame.from_dict(
            dict(counts=[3, 2, 3],
                 freqs=[3 / 8., 2 / 8., 3 / 8.],
                 levels=['a', 'b', 'c'])).set_index('levels')
        tm.assert_frame_equal(desc, expected)

        # check an integer one
        desc = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]).describe()
        expected = DataFrame.from_dict(
            dict(counts=[5, 3, 3],
                 freqs=[5 / 11., 3 / 11., 3 / 11.],
                 levels=[1, 2, 3])).set_index('levels')
        tm.assert_frame_equal(desc, expected)
Example #39
0
    def get_result(self):
        """Assemble the reshaped frame, restoring categorical dtype
        column-by-column when the original values were categorical."""
        values, _ = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            # Rebuild each column with the original categories/orderedness.
            categories = self.is_categorical.categories
            ordered = self.is_categorical.ordered
            values = [
                Categorical(values[:, i],
                            categories=categories,
                            ordered=ordered) for i in range(values.shape[-1])
            ]

        return self.constructor(values, index=index, columns=columns)
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe.

    Parameters
    ----------
    df : DataFrame
    column : str
        Name of the column to encode; its distinct values become
        category codes.

    Returns
    -------
    (scipy.sparse.csr_matrix, numpy.ndarray)
        The (n_rows, n_categories) indicator matrix and the generated
        "<column>_<i>" column names, one per category code.
    '''
    categories = Categorical(df[column])
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(len(categories.categories))
    ])
    N = len(categories)
    # np.int was removed in NumPy 1.24; use the platform index type.
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # categories.codes encodes each value as a number; the matrix has a
    # single 1 per row at (row, code).
    # NOTE(review): NaN entries get code -1, which scipy rejects as a
    # column index -- callers presumably pass NaN-free columns.
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
Example #41
0
def lexsort_indexer(keys, orders=None, na_position='last'):
    """Return an indexer that lexsorts by ``keys`` (first key primary).

    Parameters
    ----------
    keys : sequence of array-likes
        Each key is factorized through an ordered Categorical.
    orders : bool, sequence of bool, or None
        Ascending flag per key; a single bool applies to every key and
        None means all ascending.
    na_position : {'last', 'first'}
        Where missing values sort within each key.

    Returns
    -------
    ndarray of positions that lexicographically sort the keys.
    """
    from pandas.core.categorical import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            c = key

        # create the Categorical
        else:
            c = Categorical(key, ordered=True)

        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(na_position))

        n = len(c.categories)
        codes = c.codes.copy()

        # -1 codes mark missing values.
        mask = (c.codes == -1)
        if order:  # ascending
            if na_position == 'last':
                # missing values get the largest code, n
                codes = np.where(mask, n, codes)
            elif na_position == 'first':
                # shift every code up so 0 is free for missing values
                codes += 1
        else:  # not order means descending
            if na_position == 'last':
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == 'first':
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # reserve one extra slot for the NA bucket
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
Example #42
0
    def where(self, cond, other=None):
        """
        .. versionadded:: 0.19.0

        Return an Index of same shape as self and whose corresponding
        entries are from self where cond is True and otherwise are from
        other.

        Parameters
        ----------
        cond : boolean same length as self
        other : scalar, or array-like
            Defaults to this index's NA value.
        """
        if other is None:
            other = self._na_value
        values = np.where(cond, self.values, other)

        from pandas.core.categorical import Categorical
        # Rebuild as a Categorical so the result keeps this index's
        # categories and orderedness.
        cat = Categorical(values,
                          categories=self.categories,
                          ordered=self.ordered)
        return self._shallow_copy(cat, **self._get_attributes_dict())
Example #43
0
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
    """Dense dummy/indicator encoding of a single 1-d column.

    Returns a DataFrame with one 0/1 column per category; NaN rows are
    all-zero unless ``dummy_na`` adds a dedicated NaN column.
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    # Identity-matrix row i is the one-hot vector for code i; the -1
    # NaN code wraps to the last row via take, which is either the NaN
    # column (dummy_na) or zeroed out below.
    dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)

    if dummy_na:
        levels = np.append(cat.categories, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.codes == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Example #44
0
    def get_result(self):
        """Assemble the reshaped DataFrame.

        Drops unobserved levels, re-expands rows when the index has
        missing entries, and restores categorical dtype per column when
        the original values were categorical.
        """
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                # keep only columns that received at least one value
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # we might have a missing index
        if len(index) != values.shape[0]:
            mask = isnull(index)
            if mask.any():
                # scatter the existing rows into a NaN-filled array the
                # size of the full index
                l = np.arange(len(index))
                values, orig_values = (np.empty((len(index), values.shape[1])),
                                       values)
                values.fill(np.nan)
                values_indexer = com._ensure_int64(l[~mask])
                for i, j in enumerate(values_indexer):
                    values[j] = orig_values[i]
            else:
                index = index.take(self.unique_groups)

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            values = [ Categorical.from_array(values[:,i],
                                              categories=self.is_categorical.categories)
                       for i in range(values.shape[-1]) ]

        return DataFrame(values, index=index, columns=columns)
Example #45
0
 def test_levels_none(self):
     # Constructing without explicit levels infers them from the data.
     observed = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
     self.assert_(observed.equals(self.factor))
Example #46
0
 def setUp(self):
     # Shared fixture: eight values over the inferred levels a, b, c.
     values = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']
     self.factor = Categorical.from_array(values)
Example #47
0
 def setUp(self):
     # Shared fixture: eight values spanning the levels a, b and c.
     self.factor = Categorical.from_array(
         ["a", "b", "b", "a", "a", "c", "c", "c"])
Example #48
0
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    """Build the MultiIndex for a keyed concatenation of ``indexes``.

    ``keys`` labels each input index (tuples when keys span multiple
    levels); ``levels`` and ``names`` optionally pin the level values
    and names.  Two strategies are used: a per-key label computation
    when the input indexes differ, and a faster repeat/tile path when
    they are all identical.
    """
    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        # Multi-level keys: transpose the key tuples into per-level lists.
        zipped = lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            levels = [Categorical.from_array(
                zp, ordered=True).categories for zp in zipped]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key %s not in level %s'
                                     % (str(key), str(level)))

                # every row of this input shares the same key label
                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Categorical.from_array(concat_index, ordered=True)
            levels.append(factor.categories)
            label_list.append(factor.codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len(set([idx.nlevels for idx in indexes])) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, labels=label_list, names=names,
                          verify_integrity=False)

    # Fast path: all indexes identical, so labels are pure repeats/tiles.
    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = _ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        # -1 marks a key missing from its declared level
        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: %s'
                             % str(hlevel[mask]))

        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels])
    else:
        new_levels.append(new_index)
        new_labels.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, labels=new_labels, names=new_names,
                      verify_integrity=False)
Example #49
0
    def data(self, convert_dates=True, convert_categoricals=True, index=None):
        """
        Reads observations from Stata file, converting them into a dataframe

        Parameters
        ----------
        convert_dates : boolean, defaults to True
            Convert date variables to DataFrame time values
        convert_categoricals : boolean, defaults to True
            Read value labels and convert columns to Categorical/Factor variables
        index : identifier of index column
            identifier of column that should be used as index of the DataFrame

        Returns
        -------
        y : DataFrame instance

        Raises
        ------
        Exception
            If the data has already been read (single-shot reader).
        """
        if self._data_read:
            raise Exception("Data has already been read.")
        self._data_read = True

        # format 117+ stores long strings in a separate strls block
        if self.format_version >= 117:
            self._read_strls()

        stata_dta = self._dataset()

        data = []
        for rownum, line in enumerate(stata_dta):
            # doesn't handle missing value objects, just casts
            # None will only work without missing value object.
            for i, val in enumerate(line):
                # NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = np.nan
            data.append(tuple(line))

        if convert_categoricals:
            self._read_value_labels()

        data = DataFrame(data, columns=self.varlist, index=index)

        # Apply the declared dtype to every non-object column.
        cols_ = np.where(self.dtyplist)[0]
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                if data[col].dtype is not np.dtype(object):
                    data[col] = Series(data[col], data[col].index, self.dtyplist[i])

        if convert_dates:
            # Columns whose display format marks them as Stata dates.
            cols = np.where(lmap(lambda x: x in _date_formats, self.fmtlist))[0]
            for i in cols:
                col = data.columns[i]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime, args=(self.fmtlist[i],))

        if convert_categoricals:
            # Replace coded values with their labels, then categorize.
            cols = np.where(lmap(lambda x: x in compat.iterkeys(self.value_label_dict), self.lbllist))[0]
            for i in cols:
                col = data.columns[i]
                labeled_data = np.copy(data[col])
                labeled_data = labeled_data.astype(object)
                for k, v in compat.iteritems(self.value_label_dict[self.lbllist[i]]):
                    labeled_data[(data[col] == k).values] = v
                data[col] = Categorical.from_array(labeled_data)

        return data
Example #50
0
 def test_levels_none(self):
     # Levels omitted -> inferred from the data; must equal the fixture.
     inferred = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
     self.assert_(inferred.equals(self.factor))
Example #51
0
def _get_dummies_1d(data, prefix, prefix_sep="_", dummy_na=False, sparse=False, drop_first=False):
    """One-hot encode a single 1-d column.

    ``prefix``/``prefix_sep`` build the dummy column names, ``dummy_na``
    adds a trailing NaN-indicator column, ``sparse`` selects a
    SparseDataFrame result, and ``drop_first`` removes the first level's
    column to avoid perfect collinearity (GH12042).
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    def get_empty_Frame(data, sparse):
        # Zero-column frame preserving the input's index.
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = cat.codes.copy()
    if dummy_na:
        # Remap the -1 NaN code onto a new trailing level.
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ["%s%s%s" % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Bucket row numbers by category code, one bucket per column.
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)

    else:
        # Identity-matrix row i is the one-hot vector for code i.
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Example #52
0
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.

    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    # Identity-matrix row i is the one-hot vector for label i; the -1
    # NaN label wraps to the last row via take, which is either the NaN
    # column (dummy_na) or zeroed out below.
    dummy_mat = np.eye(number_of_cols).take(cat.labels, axis=0)

    if dummy_na:
        levels = np.append(cat.levels, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.labels == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)