Ejemplo n.º 1
0
    def test_empty_print(self):
        """Check repr() of an empty Categorical: named, unnamed, and with
        no levels at all."""
        # hack because array_repr changed in numpy > 1.6.x: the levels may be
        # rendered with quotes, so the quoted form is rewritten to the
        # unquoted one before comparing.  The pattern is a raw string so the
        # backslashes reach the regex engine instead of being treated as
        # (invalid) string escapes.
        pat = r"Index\(\['a', 'b', 'c']"
        sub = "Index([a, b, c]"

        # named categorical
        factor = Categorical([], ["a", "b", "c"], name="cat")
        expected = ("Categorical([], Name: cat, Levels (3): "
                    "Index([a, b, c], dtype=object)")
        actual = re.sub(pat, sub, repr(factor))
        self.assertEqual(actual, expected)

        # unnamed categorical
        factor = Categorical([], ["a", "b", "c"])
        expected = ("Categorical([], Levels (3): "
                    "Index([a, b, c], dtype=object)")
        actual = re.sub(pat, sub, repr(factor))
        self.assertEqual(actual, expected)

        # no levels: there is nothing to normalise in the repr
        factor = Categorical([], [])
        expected = "Categorical([], Levels (0): Index([], dtype=object)"
        self.assertEqual(repr(factor), expected)
Ejemplo n.º 2
0
def _concat_categorical(to_concat, axis=0):
    """Concatenate a sequence of object / categorical arrays.

    Parameters
    ----------
    to_concat : array of arrays
        Each element has a single dtype.
    axis : int
        Accepted for signature compatibility; every concatenation below is
        performed along axis 0 (only 1D categoricals are handled).

    Returns
    -------
    Categorical
        When only object / category dtypes are present.  If any other dtype
        is present, the result of a plain object-dtype concatenation is
        returned instead.

    Raises
    ------
    ValueError
        If the categoricals do not all share the dtype of the first one.
    """

    from pandas.core.categorical import Categorical

    # anything besides object / category present: coerce everything to
    # object dtype and perform a regular concat
    if get_dtype_kinds(to_concat) - {'object', 'category'}:
        as_objects = [np.array(arr, copy=False, dtype=object)
                      for arr in to_concat]
        return _concat_compat(as_objects, axis=0)

    # from here on there are only object arrays and categoricals
    cat_arrays = [arr for arr in to_concat
                  if com.is_categorical_dtype(arr.dtype)]

    # every categorical has to agree with the first one
    reference = cat_arrays[0]
    rawcats = reference.categories
    if not all(reference.is_dtype_equal(other) for other in cat_arrays[1:]):
        raise ValueError("incompatible categories in categorical concat")

    if len(cat_arrays) != len(to_concat):
        # object arrays are mixed in: go through the values rather than the
        # codes and let the constructor re-encode them
        def as_values(arr):
            if com.is_categorical_dtype(arr.dtype):
                return arr.get_values()
            return arr.ravel()

        merged = np.concatenate([as_values(arr) for arr in to_concat],
                                axis=0)
        return Categorical(merged, rawcats)

    # all inputs are categoricals with identical dtype: concatenating the
    # integer codes is much faster than concatenating object values, and
    # fastpath takes a shorter path through the constructor
    merged_codes = np.concatenate([arr.codes for arr in to_concat], axis=0)
    return Categorical(merged_codes,
                       rawcats,
                       ordered=reference.ordered,
                       fastpath=True)
Ejemplo n.º 3
0
    def melt_stub(df, stub, i, j, value_vars, sep):
        """Melt the ``value_vars`` columns of one stub into long format.

        The melted variable column ``j`` has ``stub + sep`` stripped from its
        entries, and the result is indexed by ``i + [j]``.
        """
        long_df = melt(df,
                       id_vars=i,
                       value_vars=value_vars,
                       value_name=stub.rstrip(sep),
                       var_name=j)

        # the variable column goes through a Categorical before the string
        # replacement is applied to it
        as_categorical = Categorical(long_df[j])
        long_df[j] = as_categorical
        stub_pattern = re.escape(stub + sep)
        long_df[j] = long_df[j].str.replace(stub_pattern, "")

        index_keys = i + [j]
        return long_df.set_index(index_keys)
Ejemplo n.º 4
0
    def get_result(self):
        """Assemble the reshaped values, index and columns into a DataFrame.

        Columns for level values that were never observed are dropped.  When
        ``self.is_categorical`` is set, every column is rebuilt as a
        Categorical with that object's categories and ordered flag.

        Returns
        -------
        DataFrame
        """
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            _, obs_ids = compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                # keep only the columns that received at least one value
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = algos.take_nd(values, inds, axis=1)
                columns = columns[inds]

        if self.is_categorical is None:
            return DataFrame(values, index=index, columns=columns)

        # may need to coerce categoricals here
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered

        # Build the frame column by column, keyed by position.  A bare list
        # of Categoricals is not passed to the constructor because newer
        # pandas releases interpret such a list as rows rather than columns.
        by_position = {
            pos: Categorical(values[:, pos],
                             categories=categories,
                             ordered=ordered)
            for pos in range(values.shape[-1])
        }
        result = DataFrame(by_position, index=index)
        # labels are assigned afterwards so duplicated or hierarchical
        # column labels never have to serve as dict keys
        result.columns = columns
        return result
Ejemplo n.º 5
0
    def _create_categorical(self, data, categories=None, ordered=None):
        """
        *this is an internal non-public method*

        Return ``data`` as a Categorical with the requested properties.

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories; ``None`` leaves an existing
            Categorical's categories untouched
        ordered : optional ordered attribute; ``None`` leaves an existing
            Categorical's flag untouched and means ``False`` when a new
            Categorical has to be built

        Returns
        -------
        Categorical
        """
        if isinstance(data, ABCCategorical):
            # already categorical: apply only the overrides that were given
            if categories is not None:
                data = data.set_categories(categories)
            if ordered is not None:
                data = data.set_ordered(ordered)
            return data

        # anything else is wrapped in a brand-new Categorical
        if ordered is None:
            ordered = False
        from pandas.core.categorical import Categorical
        return Categorical(data, categories=categories, ordered=ordered)
Ejemplo n.º 6
0
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  retbins=False,
                  precision=3,
                  name=None,
                  include_lowest=False):
    """Assign each value of ``x`` to the bin delimited by ``bins``.

    Parameters
    ----------
    x : array-like or Series
        Values to bin; converted with ``np.asarray``.
    bins : array
        Bin edges; must be unique and provide ``searchsorted``.
    right : bool, default True
        Selects the ``searchsorted`` side: 'left' when True, 'right'
        otherwise.
    labels : None, False or sequence
        ``None`` generates the levels with ``_format_levels``; ``False``
        returns plain bin numbers instead of a Categorical; a sequence is
        used as the levels and needs ``len(bins) - 1`` entries.
    retbins : bool, default False
        Also return ``bins``.
    precision : int, default 3
        Passed to ``_format_levels``; raised by one after every failed
        attempt.
    name : object, optional
        Passed to ``Categorical``; defaults to ``x.name`` for a Series.
    include_lowest : bool, default False
        Put values equal to ``bins[0]`` into the first bin.

    Returns
    -------
    fac, or ``(fac, bins)`` when ``retbins`` is true.

    Raises
    ------
    ValueError
        If the bin edges are not unique, if ``labels`` has the wrong length,
        or if ``_format_levels`` keeps failing after 20 attempts.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    # Reject duplicated edges before doing any work.  ValueError is still
    # caught by callers handling the bare Exception raised previously.
    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        ids[x == bins[0]] = 1

    # nulls and values falling outside the outermost edges have no bin
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # retry with increasing precision, giving up after 20 failures
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins,
                                            precision,
                                            right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        # unbinned entries become id 0, i.e. code -1 after the shift below
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            # NaN cannot be stored in an integer array
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
Ejemplo n.º 7
0
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):
    """Assign each value of ``x`` to the bin delimited by ``bins``.

    ``duplicates`` decides what happens to repeated bin edges: 'raise'
    rejects them, 'drop' continues with the unique edges.  ``labels`` may be
    ``None`` (levels come from ``_format_levels``, retried with growing
    ``precision``), ``False`` (plain bin numbers are returned) or a sequence
    with ``len(bins) - 1`` entries.

    Returns the tuple ``(fac, bins)``.
    """
    if duplicates not in ('raise', 'drop'):
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    deduped = algos.unique(bins)
    if len(deduped) < len(bins):
        if duplicates != 'raise':
            bins = deduped
        else:
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))

    ids = bins.searchsorted(x, side='left' if right else 'right')

    if include_lowest:
        ids[x == bins[0]] = 1

    # nulls and values outside the outermost edges have no bin
    n_edges = len(bins)
    na_mask = isnull(x) | (ids == n_edges) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        # caller asked for bare bin numbers
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
        return fac, bins

    if labels is not None:
        if len(labels) != len(bins) - 1:
            raise ValueError('Bin labels must be one fewer than '
                             'the number of bin edges')
        levels = labels
    else:
        # up to 20 attempts, each one with one more digit of precision
        for attempt in range(20):
            try:
                levels = _format_levels(bins, precision, right=right,
                                        include_lowest=include_lowest,
                                        dtype=dtype)
            except ValueError:
                precision += 1
                if attempt == 19:
                    raise
            else:
                break

    levels = np.asarray(levels, dtype=object)
    np.putmask(ids, na_mask, 0)
    fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    return fac, bins
def sparse_dummies(df, column):
    """Returns sparse OHE matrix for the column of the dataframe

    Parameters
    ----------
    df : DataFrame
    column : label of the column to encode

    Returns
    -------
    (csr_matrix, np.ndarray)
        A matrix with one row per row of ``df`` and one column per category,
        holding 1.0 at (row, category code of that row), and the array of
        generated column names ``"<column>_<code>"``.  Rows whose value is
        missing (category code -1) are left all-zero.
    """
    categories = Categorical(df[column])
    n_rows = len(categories)
    n_cats = len(categories.categories)
    column_names = np.array([f"{column}_{str(i)}" for i in range(n_cats)])

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases
    row_numbers = np.arange(n_rows, dtype=int)
    ones = np.ones((n_rows, ))
    codes = categories.codes

    # a negative column index is rejected by the sparse constructor, so
    # missing values are skipped
    present = codes >= 0

    # An explicit shape keeps the matrix width equal to len(column_names)
    # even when trailing categories are unobserved, and lets empty input
    # through (the shape cannot be inferred from empty index arrays).
    matrix = csr_matrix(
        (ones[present], (row_numbers[present], codes[present])),
        shape=(n_rows, n_cats))
    return matrix, column_names
Ejemplo n.º 9
0
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  retbins=False,
                  precision=3,
                  name=None,
                  include_lowest=False):
    """Assign each value of ``x`` to the bin delimited by ``bins``.

    With ``labels=None`` the levels are interval strings built from
    consecutive bin edges formatted by ``_format_label``: ``(a, b]`` when
    ``right`` is true (the first one opened with ``[`` if ``include_lowest``)
    and ``[a, b)`` otherwise.  ``labels=False`` returns bare bin numbers and
    any other value is used as the levels (``len(bins) - 1`` entries needed).

    Returns ``fac``, or ``(fac, bins)`` when ``retbins`` is true.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    ids = bins.searchsorted(x, side='left' if right else 'right')

    if include_lowest:
        ids[x == bins[0]] = 1

    # nulls and values outside the outermost edges have no bin
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
    else:
        if labels is not None:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels
        else:
            # the closed end of each interval follows ``right``
            template = '(%s, %s]' if right else '[%s, %s)'
            levels = [
                template % (_format_label(lo, precision=precision),
                            _format_label(hi, precision=precision))
                for lo, hi in zip(bins, bins[1:])
            ]
            if right and include_lowest:
                levels[0] = '[' + levels[0][1:]

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)

    if retbins:
        return fac, bins

    return fac
Ejemplo n.º 10
0
    def melt_stub(df, stub, i, j, value_vars, sep):
        """Melt the ``value_vars`` columns of one stub into long format.

        The melted variable column ``j`` has ``stub + sep`` stripped from its
        entries and is converted to a numeric dtype when every remaining
        suffix parses as a number.  The result is indexed by ``i + [j]``.
        """
        newdf = melt(df, id_vars=i, value_vars=value_vars,
                     value_name=stub.rstrip(sep), var_name=j)
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

        # GH17627 Cast numerics suffixes to int/float
        # ``errors='ignore'`` is deprecated in newer pandas; catching the
        # conversion failure keeps the same "leave it unchanged" behaviour
        # on every version.
        try:
            newdf[j] = to_numeric(newdf[j])
        except (TypeError, ValueError):
            pass

        return newdf.set_index(i + [j])
Ejemplo n.º 11
0
    def where(self, cond, other=None):
        """Keep the entries of ``self`` where ``cond`` holds and take
        ``other`` elsewhere.

        ``other`` defaults to ``self._na_value``.  The selected values are
        wrapped in a Categorical carrying this object's categories and
        ordered flag before being handed to ``_shallow_copy``.
        """
        from pandas.core.categorical import Categorical

        replacement = self._na_value if other is None else other
        selected = np.where(cond, self.values, replacement)

        return self._shallow_copy(
            Categorical(selected,
                        categories=self.categories,
                        ordered=self.ordered),
            **self._get_attributes_dict())
Ejemplo n.º 12
0
    def test_na_flags_int_levels(self):
        """Labels of -1 must be reported as null when the levels are ints."""
        # #1457

        levels = range(10)
        labels = np.random.randint(0, 10, 20)
        # every fifth label becomes the -1 sentinel
        labels[::5] = -1

        cat = Categorical(labels, levels)
        # smoke check: building the repr must not raise
        repr(cat)

        # assertTrue replaces the deprecated ``assert_`` alias, which newer
        # Python versions no longer provide
        self.assertTrue(np.array_equal(com.isnull(cat), labels == -1))
Ejemplo n.º 13
0
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe

    Parameters
    ----------
    df : DataFrame
    column : label of the column to encode

    Returns
    -------
    (csr_matrix, np.ndarray)
        A matrix with one row per row of ``df`` and one column per category,
        holding 1.0 at (row, category code of that row), and the array of
        generated column names ``"<column>_<code>"``.  Rows whose value is
        missing (category code -1) are left all-zero.
    '''
    categories = Categorical(df[column])
    n_rows = len(categories)
    n_cats = len(categories.categories)
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(n_cats)
    ])

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases
    row_numbers = np.arange(n_rows, dtype=int)
    ones = np.ones((n_rows, ))
    codes = categories.codes

    # a negative column index is rejected by the sparse constructor, so
    # missing values are skipped
    present = codes >= 0

    # An explicit shape keeps the matrix width equal to len(column_names)
    # even when trailing categories are unobserved, and lets empty input
    # through (the shape cannot be inferred from empty index arrays).
    matrix = csr_matrix(
        (ones[present], (row_numbers[present], codes[present])),
        shape=(n_rows, n_cats))
    return matrix, column_names
Ejemplo n.º 14
0
    def _indicator_post_merge(self, result):

        result['_left_indicator'] = result['_left_indicator'].fillna(0)
        result['_right_indicator'] = result['_right_indicator'].fillna(0)

        result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3])
        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])

        result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1)

        return result
Ejemplo n.º 15
0
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they
       appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(
            com.is_dtype_equal(c.categories.dtype, first.categories.dtype)
            for c in to_union):
        raise TypeError("dtype of categories must be the same")

    cats = first.categories
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    new_codes = []
    for c in to_union:
        # position of each of c's categories inside the unioned categories
        indexer = categories.get_indexer(c.categories)
        old_codes = np.asarray(c.codes)

        # A code of -1 marks a missing value.  It must stay -1: looking it
        # up in ``indexer`` would pick the last category (negative
        # indexing), or fail when ``c`` has no categories at all.
        recoded = np.empty(old_codes.shape, dtype=indexer.dtype)
        recoded.fill(-1)
        present = old_codes != -1
        recoded[present] = indexer[old_codes[present]]
        new_codes.append(recoded)

    codes = np.concatenate(new_codes)
    return Categorical(codes,
                       categories=categories,
                       ordered=False,
                       fastpath=True)
Ejemplo n.º 16
0
    def test_big_print(self):
        """repr() of a 600-element Categorical is truncated with '...' and
        ends with the levels, name and length."""
        factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat')
        expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
                    " a", " b", " c", " a", "...", " c", " a", " b", " c",
                    " a", " b", " c", " a", " b", " c", " a", " b", " c",
                    "Levels (3): Index([a, b, c], dtype=object)",
                    "Name: cat, Length: 600" ]
        expected = "\n".join(expected)

        # hack because array_repr changed in numpy > 1.6.x: the levels may be
        # rendered with quotes, so the quoted form is rewritten to the
        # unquoted one.  The pattern is a raw string so the backslashes reach
        # the regex engine instead of being (invalid) string escapes.
        actual = repr(factor)
        pat = r"Index\(\['a', 'b', 'c']"
        sub = "Index([a, b, c]"
        actual = re.sub(pat, sub, actual)

        self.assertEqual(actual, expected)
Ejemplo n.º 17
0
    def test_describe(self):
        """describe() yields per-level counts and frequencies, indexed by
        level."""
        # string type
        described = self.factor.describe()
        string_expected = DataFrame.from_dict({
            'counts': [3, 2, 3],
            'freqs': [3 / 8., 2 / 8., 3 / 8.],
            'levels': ['a', 'b', 'c'],
        }).set_index('levels')
        tm.assert_frame_equal(described, string_expected)

        # check an integer one
        int_factor = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        described = int_factor.describe()
        int_expected = DataFrame.from_dict({
            'counts': [5, 3, 3],
            'freqs': [5 / 11., 3 / 11., 3 / 11.],
            'levels': [1, 2, 3],
        }).set_index('levels')
        tm.assert_frame_equal(described, int_expected)
Ejemplo n.º 18
0
    def get_result(self):
        """Assemble the reshaped values, index and columns with
        ``self.constructor``.

        When ``self.is_categorical`` is set, every column is rebuilt as a
        Categorical with that object's categories and ordered flag.

        Returns
        -------
        Whatever ``self.constructor`` builds.
        """
        # the mask returned alongside the values is not needed here
        values, _ = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        if self.is_categorical is None:
            return self.constructor(values, index=index, columns=columns)

        # may need to coerce categoricals here
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered

        # Build the frame column by column, keyed by position.  A bare list
        # of Categoricals is not passed to the constructor because newer
        # pandas releases interpret such a list as rows rather than columns.
        by_position = {
            pos: Categorical(values[:, pos],
                             categories=categories,
                             ordered=ordered)
            for pos in range(values.shape[-1])
        }
        result = self.constructor(by_position, index=index)
        # labels are assigned afterwards so duplicated or hierarchical
        # column labels never have to serve as dict keys
        result.columns = columns
        return result
Ejemplo n.º 19
0
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe

    Parameters
    ----------
    df : DataFrame
    column : label of the column to encode

    Returns
    -------
    (csr_matrix, np.ndarray)
        A matrix with one row per row of ``df`` and one column per category,
        holding 1.0 at (row, category code of that row), and the array of
        generated column names ``"<column>_<code>"``.  Rows whose value is
        missing (category code -1) are left all-zero.
    '''
    # NOTE(review): the print calls look like leftover debugging output;
    # they are kept so the function's stdout stays as it was.
    print(column)
    # a Categorical array: ``.categories`` holds the distinct values and
    # ``.codes`` the integer code of every row
    categories = Categorical(df[column])
    print(categories)
    n_rows = len(categories)
    n_cats = len(categories.categories)
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(n_cats)
    ])
    print(column_names)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases
    row_numbers = np.arange(n_rows, dtype=int)
    ones = np.ones((n_rows, ))
    codes = categories.codes

    # a negative column index is rejected by the sparse constructor, so
    # missing values are skipped
    present = codes >= 0

    # create a matrix with 1's only at (i, i's category).  An explicit shape
    # keeps the matrix width equal to len(column_names) even when trailing
    # categories are unobserved, and lets empty input through (the shape
    # cannot be inferred from empty index arrays).
    matrix = csr_matrix(
        (ones[present], (row_numbers[present], codes[present])),
        shape=(n_rows, n_cats))
    return matrix, column_names
Ejemplo n.º 20
0
def lexsort_indexer(keys, orders=None, na_position='last'):
    """Build the indexer for a lexicographic sort over several keys.

    Parameters
    ----------
    keys : sequence of arrays
        Keys that are not already categorical are converted to an ordered
        Categorical.
    orders : bool, list of bool or None
        Truthy means ascending for the matching key.  A single bool applies
        to every key; ``None`` means ascending for all.
    na_position : {'last', 'first'}
        Where missing values (code -1) are placed.

    Returns
    -------
    The result of ``indexer_from_factorized`` on the per-key codes.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'last' nor 'first'.
    """
    # Validate once, up front: the check does not depend on the key, and
    # doing it here also rejects a bad value when ``keys`` is empty.
    if na_position not in ('last', 'first'):
        raise ValueError('invalid na_position: {!r}'.format(na_position))

    from pandas.core.categorical import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            c = key

        # create the Categorical
        else:
            c = Categorical(key, ordered=True)

        n = len(c.categories)
        codes = c.codes.copy()

        mask = (c.codes == -1)
        if order:  # ascending
            if na_position == 'last':
                # missing values get the code after the last category
                codes = np.where(mask, n, codes)
            elif na_position == 'first':
                # shift everything up so the missing code -1 becomes 0
                codes += 1
        else:  # not order means descending
            if na_position == 'last':
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == 'first':
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # one extra slot for the missing values
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
Ejemplo n.º 21
0
    def _create_categorical(self,
                            data,
                            categories=None,
                            ordered=None,
                            dtype=None):
        """
        *this is an internal non-public method*

        Return ``data`` as a Categorical with the requested properties.

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories; ``None`` leaves an existing
            Categorical's categories untouched
        ordered : optional ordered attribute; ``None`` leaves an existing
            Categorical's flag untouched
        dtype : CategoricalDtype, optional; anything that is not a
            CategoricalDtype instance is ignored for an existing Categorical

        Returns
        -------
        Categorical
        """
        # unwrap a categorical Series / index of our own type to its values
        wraps_categorical = (isinstance(data, (ABCSeries, type(self)))
                             and is_categorical_dtype(data))
        if wraps_categorical:
            data = data.values

        if isinstance(data, ABCCategorical):
            from pandas.core.dtypes.dtypes import CategoricalDtype

            if categories is not None:
                data = data.set_categories(categories, ordered=ordered)
            elif ordered is not None and ordered != data.ordered:
                data = data.set_ordered(ordered)
            if isinstance(dtype, CategoricalDtype):
                # we want to silently ignore dtype='category'
                data = data._set_dtype(dtype)
            return data

        # not categorical yet: build one, unordered unless told otherwise
        if ordered is None and dtype is None:
            ordered = False
        from pandas.core.categorical import Categorical
        return Categorical(data,
                           categories=categories,
                           ordered=ordered,
                           dtype=dtype)
Ejemplo n.º 22
0
    def where(self, cond, other=None):
        """
        .. versionadded:: 0.19.0

        Return an Index of same shape as self and whose corresponding
        entries are from self where cond is True and otherwise are from
        other.

        Parameters
        ----------
        cond : boolean same length as self
        other : scalar, or array-like
            Defaults to ``self._na_value`` when None.
        """
        from pandas.core.categorical import Categorical

        fill = self._na_value if other is None else other
        picked = np.where(cond, self.values, fill)

        # re-wrap with this index's categories and ordered flag
        wrapped = Categorical(picked,
                              categories=self.categories,
                              ordered=self.ordered)
        attributes = self._get_attributes_dict()
        return self._shallow_copy(wrapped, **attributes)
Ejemplo n.º 23
0
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array. When the inputs do not all share one dtype, the
       categories are ordered as they appear in the list and the result
       is unordered.

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    same_category_dtype = all(
        is_dtype_equal(c.categories.dtype, first.categories.dtype)
        for c in to_union)
    if not same_category_dtype:
        raise TypeError("dtype of categories must be the same")

    # identical dtypes: the codes can simply be glued together
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        return Categorical(np.concatenate([c.codes for c in to_union]),
                           categories=first.categories,
                           ordered=first.ordered,
                           fastpath=True)

    # differing categories can only be unioned when nothing is ordered
    if any(c.ordered for c in to_union):
        # to show a proper error message
        if all(c.ordered for c in to_union):
            raise TypeError("to union ordered Categoricals, "
                            "all categories must be the same")
        raise TypeError('Categorical.ordered must be the same')

    combined = first.categories.append(
        [c.categories for c in to_union[1:]]).unique()
    categories = Index(combined)

    # re-express every input's codes against the unioned categories; an
    # input without categories must be all NaN, so its codes pass through
    recoded = [
        take_1d(categories.get_indexer(c.categories), c.codes, fill_value=-1)
        if len(c.categories) > 0 else c.codes
        for c in to_union
    ]

    return Categorical(np.concatenate(recoded),
                       categories=categories,
                       ordered=False,
                       fastpath=True)
Ejemplo n.º 24
0
def union_categoricals(to_union, sort_categories=False):
    """
    Combine list-like of Categoricals, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    category_dtypes_match = all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:])
    if not category_dtypes_match:
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories:
            if ordered:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                # sort, then remap the concatenated codes accordingly
                categories = categories.sort_values()
                remap = categories.get_indexer(first.categories)
                new_codes = take_1d(remap, new_codes, fill_value=-1)
    elif all(not c.ordered for c in to_union):
        # different categories - union and recode
        pooled = first.categories.append(
            [c.categories for c in to_union[1:]])
        categories = Index(pooled.unique())
        if sort_categories:
            categories = categories.sort_values()

        # an input without categories must be all NaN, so its codes are
        # taken over unchanged
        recoded = [
            take_1d(categories.get_indexer(c.categories), c.codes,
                    fill_value=-1)
            if len(c.categories) > 0 else c.codes
            for c in to_union
        ]
        new_codes = np.concatenate(recoded)
    elif all(c.ordered for c in to_union):
        # ordered - to show a proper error message
        raise TypeError("to union ordered Categoricals, "
                        "all categories must be the same")
    else:
        raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes,
                       categories=categories,
                       ordered=ordered,
                       fastpath=True)
Ejemplo n.º 25
0
def _empty_categorical(cats, key):
    """Build a zero-length Categorical whose categories come from ``cats[key]``."""
    if cats is None or key not in cats:
        # nothing specified: default to a range of 2**14 placeholder labels
        categories = RangeIndex(0, 2**14)
    elif isinstance(cats[key], int):
        # only the number of labels is known: use a range as placeholders
        categories = RangeIndex(0, cats[key])
    else:  # explicit labels list
        categories = cats[key]
    return Categorical([], categories=categories, fastpath=True)


def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
    """
    Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        One entry per column; the entry 'category' marks a categorical
        column.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, a range of 2**14 temporary labels is used.
    cols: list of labels
        assigned column names, including categorical ones. Defaults to the
        positions ``0 .. len(types) - 1``.
    index_type: optional
        dtype of the index ('category' for a categorical index). ``None`` or
        ``False`` gives a plain ``RangeIndex(size)``.
    index_name: optional
        Name of the index; required when ``index_type`` is given.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of views keyed by column (or index) name. Assign to these.
      Categorical columns additionally get a ``'<name>-catdef'`` entry
      holding the categorical itself.

    Raises
    ------
    ValueError
        If ``index_type`` is given without ``index_name``.
    """
    df = DataFrame()
    views = {}

    if isinstance(types, str):
        types = types.split(',')
    if cols is None:
        # default to positional column names (the former ``range(cols)``
        # could never work with ``cols=None``)
        cols = range(len(types))

    # first pass: a zero-length frame that only fixes the dtypes
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[str(col)] = _empty_categorical(cats, col)
        else:
            df[str(col)] = np.empty(0, dtype=t)

    if index_type is not None and index_type is not False:
        if index_name is None:
            raise ValueError('If using an index, must give an index name')
        if str(index_type) == 'category':
            c = _empty_categorical(cats, index_name)
            vals = np.empty(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            # swap in the full-size codes buffer; the caller fills it
            # through the view
            index._data._codes = vals
            views[index_name] = vals
        else:
            index = np.empty(size, dtype=index_type)
            views[index_name] = index

        axes = [df._data.axes[0], index]
    else:
        axes = [df._data.axes[0], RangeIndex(size)]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)

        new_block = block.make_block_same_class(values=values)
        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if str(dtype) == 'category':
                views[col] = block.values._codes
                views[col+'-catdef'] = block.values
            else:
                views[col] = block.values[i]

    if index_name is not None and index_name is not False:
        df.index.name = index_name
    if str(index_type) == 'category':
        views[index_name+'-catdef'] = df._data.axes[1].values
    return df, views
Ejemplo n.º 26
0
def _bins_to_cuts_new(
    x,
    bins,
    right=True,
    labels=None,
    retbins=False,
    precision=3,
    name=None,
    include_lowest=False,
):
    """Assign each value of ``x`` to the bin delimited by ``bins``.

    The bin edges are first reduced to their sorted distinct values.
    ``labels`` may be ``None`` (levels come from ``_format_levels``, retried
    with growing ``precision``), ``False`` (bare bin numbers) or a sequence
    with ``len(bins) - 1`` entries.  A Series input yields a Series result
    carrying the input's index and ``name`` (``x.name`` by default).

    Returns ``fac``, or ``(fac, bins)`` when ``retbins`` is true.
    """
    # Added this line to the original code
    bins = np.array(sorted(set(bins)))

    x_is_series = isinstance(x, Series)
    series_index = x.index if x_is_series else None
    if x_is_series and name is None:
        name = x.name

    x = np.asarray(x)

    ids = bins.searchsorted(x, side="left" if right else "right")

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    # nulls and values outside the outermost edges have no bin
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is False:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)
    else:
        if labels is not None:
            if len(labels) != len(bins) - 1:
                raise ValueError("Bin labels must be one fewer than "
                                 "the number of bin edges")
            levels = labels
        else:
            # up to 20 attempts, each one with one more digit of precision
            for attempt in range(20):
                try:
                    levels = _format_levels(bins,
                                            precision,
                                            right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    precision += 1
                    if attempt == 19:
                        raise
                else:
                    break

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)

    if x_is_series:
        fac = Series(fac, index=series_index, name=name)

    if retbins:
        return fac, bins

    return fac
Ejemplo n.º 27
0
 def factorize(index):
     """Return ``(uniques, codes)`` for ``index``.

     A unique index is returned as is, paired with its positions.  Otherwise
     the categories and codes of an ordered Categorical built from the index
     are returned.
     """
     if not index.is_unique:
         as_categorical = Categorical(index, ordered=True)
         return as_categorical.categories, as_categorical.codes
     positions = np.arange(len(index))
     return index, positions
Ejemplo n.º 28
0
 def test_levels_none(self):
     factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
     self.assert_(factor.equals(self.factor))