Example #1
    def na_op(x, y):

        # dispatch to the categorical if we have a categorical
        # in either operand
        if com.is_categorical_dtype(x):
            return op(x, y)
        elif com.is_categorical_dtype(y) and not lib.isscalar(y):
            return op(y, x)

        if x.dtype == np.object_:
            if isinstance(y, list):
                y = lib.list_to_object_array(y)

            if isinstance(y, (np.ndarray, pd.Series)):
                if y.dtype != np.object_:
                    result = lib.vec_compare(x, y.astype(np.object_), op)
                else:
                    result = lib.vec_compare(x, y, op)
            else:
                result = lib.scalar_compare(x, y, op)
        else:

            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

        return result
Example #2
    def na_op(x, y):

        if com.is_categorical_dtype(x) != (not np.isscalar(y) and com.is_categorical_dtype(y)):
            msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
                  "compare values, use 'series <op> np.asarray(cat)'."
        raise TypeError(msg.format(op=op, typ=type(y)))
        if x.dtype == np.object_:
            if isinstance(y, list):
                y = lib.list_to_object_array(y)

            if isinstance(y, (pa.Array, pd.Series)):
                if y.dtype != np.object_:
                    result = lib.vec_compare(x, y.astype(np.object_), op)
                else:
                    result = lib.vec_compare(x, y, op)
            else:
                result = lib.scalar_compare(x, y, op)
        else:

            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

        return result
Example #3
    def na_op(x, y):
        if com.is_categorical_dtype(x) != com.is_categorical_dtype(y):
            msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
                  "compare values, use 'series <op> np.asarray(cat)'."
            raise TypeError(msg.format(op=op, typ=type(y)))
        if x.dtype == np.object_:
            if isinstance(y, list):
                y = lib.list_to_object_array(y)

            if isinstance(y, (pa.Array, pd.Series)):
                if y.dtype != np.object_:
                    result = lib.vec_compare(x, y.astype(np.object_), op)
                else:
                    result = lib.vec_compare(x, y, op)
            else:
                result = lib.scalar_compare(x, y, op)
        else:

            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

        return result
Example #4
    def na_op(x, y):

        # dispatch to the categorical if we have a categorical
        # in either operand
        if com.is_categorical_dtype(x):
            return op(x, y)
        elif com.is_categorical_dtype(y) and not lib.isscalar(y):
            return op(y, x)

        if x.dtype == np.object_:
            if isinstance(y, list):
                y = lib.list_to_object_array(y)

            if isinstance(y, (np.ndarray, pd.Series)):
                if y.dtype != np.object_:
                    result = lib.vec_compare(x, y.astype(np.object_), op)
                else:
                    result = lib.vec_compare(x, y, op)
            else:
                result = lib.scalar_compare(x, y, op)
        else:

            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

        return result
Example #5
    def reindex(self, target, method=None, level=None, limit=None,
                tolerance=None):
        """
        Create index with target's values (move/add/delete values as necessary)

        Returns
        -------
        new_index : pd.Index
            Resulting index
        indexer : np.ndarray or None
            Indices of output values in original index

        """

        if method is not None:
            raise NotImplementedError("argument method is not implemented for "
                                      "CategoricalIndex.reindex")
        if level is not None:
            raise NotImplementedError("argument level is not implemented for "
                                      "CategoricalIndex.reindex")
        if limit is not None:
            raise NotImplementedError("argument limit is not implemented for "
                                      "CategoricalIndex.reindex")

        target = ibase._ensure_index(target)

        if not com.is_categorical_dtype(target) and not target.is_unique:
            raise ValueError("cannot reindex with a non-unique indexer")

        indexer, missing = self.get_indexer_non_unique(np.array(target))
        new_target = self.take(indexer)

        # filling in missing if needed
        if len(missing):
            cats = self.categories.get_indexer(target)

            if (cats == -1).any():
                # coerce to a regular index here!
                result = Index(np.array(self), name=self.name)
                new_target, indexer, _ = result._reindex_non_unique(
                    np.array(target))

            else:

                codes = new_target.codes.copy()
                codes[indexer == -1] = cats[missing]
                new_target = self._create_from_codes(codes)

        # we always want to return an Index type here
        # to be consistent with .reindex for other index types (e.g. they don't
        # coerce based on the actual values, only on the dtype)
        # unless we had an initial Categorical to begin with
        # in which case we are going to conform to the passed Categorical
        new_target = np.asarray(new_target)
        if com.is_categorical_dtype(target):
            new_target = target._shallow_copy(new_target, name=self.name)
        else:
            new_target = Index(new_target, name=self.name)

        return new_target, indexer
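
A hedged usage sketch tracing the logic above (not a claim about what a live pandas build returns): with a plain list target the method hands back a regular Index, and only a Categorical target yields a CategoricalIndex, per the comment near the end.

import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'])
new_index, indexer = ci.reindex(['b', 'c', 'a'])
# new_index: Index(['b', 'c', 'a'], dtype='object')  -- a plain Index
# indexer:   array([1, 2, 0])  -- positions of the targets in `ci`
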
Example #7
    def wrapper(self, other, axis=None):
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        if isinstance(other, pd.Series):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (np.ndarray, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            if not com.is_categorical_dtype(self):
                msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                      "If you want to compare values, use 'series <op> np.asarray(other)'."
                raise TypeError(msg.format(op=op, typ=self.dtype))


        mask = isnull(self)

        if com.is_categorical_dtype(self):
            # cats are a special case as get_values() would return an ndarray, which would then
            # not take categories ordering into account
            # we can go directly to op, as the na_op would just test again and dispatch to it.
            res = op(self.values, other)
        else:
            values = self.get_values()
            other = _index.convert_scalar(values, _values_from_object(other))

            if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
                values = values.view('i8')

            # scalars
            res = na_op(values, other)
            if np.isscalar(res):
                raise TypeError('Could not compare %s type with Series'
                                % type(other))

            # always return a full value series here
            res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name,
                        dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker

        return res
Example #9
    def na_op(x, y):

        # dispatch to the categorical if we have a categorical
        # in either operand
        if is_categorical_dtype(x):
            return op(x, y)
        elif is_categorical_dtype(y) and not isscalar(y):
            return op(y, x)

        if is_object_dtype(x.dtype):
            result = _comp_method_OBJECT_ARRAY(op, x, y)
        else:

            # we want to compare like types
            # we only want to convert to integer like if
            # we are not NotImplemented, otherwise
            # we would allow datetime64 (but viewed as i8) against
            # integer comparisons
            if is_datetimelike_v_numeric(x, y):
                raise TypeError("invalid type comparison")

            # numpy does not like comparisons vs None
            if isscalar(y) and isnull(y):
                if name == '__ne__':
                    return np.ones(len(x), dtype=bool)
                else:
                    return np.zeros(len(x), dtype=bool)

            # we have a datetime/timedelta and may need to convert
            mask = None
            if (needs_i8_conversion(x) or
                    (not isscalar(y) and needs_i8_conversion(y))):

                if isscalar(y):
                    mask = isnull(x)
                    y = _index.convert_scalar(x, _values_from_object(y))
                else:
                    mask = isnull(x) | isnull(y)
                    y = y.view('i8')
                x = x.view('i8')

            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

            if mask is not None and mask.any():
                result[mask] = masker

        return result
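
All of these `na_op`/`wrapper` variants are closures: `op`, `name` and `masker` are free variables supplied by an enclosing comparison-method factory. A minimal self-contained sketch of that wiring, assuming nothing about pandas' real `_comp_method_SERIES` beyond what the snippets show:

import operator

import numpy as np


def make_comp_method(op, name, masker=False):
    # na_op and wrapper close over op/name/masker, exactly as in the
    # snippets above (masker is what NA slots get filled with)
    def na_op(x, y):
        return op(x, y)  # real pandas adds dtype dispatch and NA handling

    def wrapper(self, other):
        result = na_op(np.asarray(self), np.asarray(other))
        return result

    wrapper.__name__ = name
    return wrapper


eq_method = make_comp_method(operator.eq, '__eq__')
eq_method(np.array([1, 2]), np.array([1, 3]))  # -> array([ True, False])
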
Example #10
    def wrapper(self, other, axis=None):
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        if isinstance(other, ABCSeries):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (np.ndarray, pd.Index)):
            # do not check length of zerodim array
            # as it will broadcast
            if (not lib.isscalar(lib.item_from_zerodim(other)) and
                    len(self) != len(other)):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            if not is_categorical_dtype(self):
                msg = ("Cannot compare a Categorical for op {op} with Series "
                       "of dtype {typ}.\nIf you want to compare values, use "
                       "'series <op> np.asarray(other)'.")
                raise TypeError(msg.format(op=op, typ=self.dtype))

        if is_categorical_dtype(self):
            # cats are a special case as get_values() would return an ndarray,
            # which would then not take categories ordering into account
            # we can go directly to op, as the na_op would just test again and
            # dispatch to it.
            res = op(self.values, other)
        else:
            values = self.get_values()
            if isinstance(other, (list, np.ndarray)):
                other = np.asarray(other)

            res = na_op(values, other)
            if isscalar(res):
                raise TypeError('Could not compare %s type with Series' %
                                type(other))

            # always return a full value series here
            res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
        return res
Example #11
def mode(values):
    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype
    if com.is_integer_dtype(values):
        values = com._ensure_int64(values)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

    elif com.is_categorical_dtype(values):
        result = constructor(values.mode())
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        res = htable.mode_object(values, mask)
        try:
            res = sorted(res)
        except TypeError as e:
            warn("Unable to sort modes: %s" % e)
        result = constructor(res, dtype=dtype)

    return result
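
A hedged usage sketch for `mode`, assuming the old `pandas.core.common` and hashtable internals it references as `com` and `htable` still resolve (they did in pandas of this era):

import pandas as pd

mode(pd.Series([1, 1, 2, 2, 3]))   # -> Series([1, 2]); ties are all returned
mode(['a', 'a', 'b'])              # list input goes through np.asanyarray
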
Example #12
    def _is_dtype_compat(self, other):
        """
        *this is an internal non-public method*

        provide a comparison between the dtype of self and other (coercing if
        needed)

        Raises
        ------
        TypeError if the dtypes are not compatible
        """
        if com.is_categorical_dtype(other):
            if isinstance(other, CategoricalIndex):
                other = other._values
            if not other.is_dtype_equal(self):
                raise TypeError("categories must match existing categories "
                                "when appending")
        else:
            values = other
            if not com.is_list_like(values):
                values = [values]
            other = CategoricalIndex(
                self._create_categorical(self,
                                         other,
                                         categories=self.categories,
                                         ordered=self.ordered))
            if not other.isin(values).all():
                raise TypeError("cannot append a non-category item to a "
                                "CategoricalIndex")

        return other
Example #13
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    typs = set()
    for arr in l:

        dtype = arr.dtype
        if com.is_categorical_dtype(dtype):
            typ = 'category'
        elif com.is_sparse(arr):
            typ = 'sparse'
        elif com.is_datetimetz(arr):
            typ = 'datetimetz'
        elif com.is_datetime64_dtype(dtype):
            typ = 'datetime'
        elif com.is_timedelta64_dtype(dtype):
            typ = 'timedelta'
        elif com.is_object_dtype(dtype):
            typ = 'object'
        elif com.is_bool_dtype(dtype):
            typ = 'bool'
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
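
A usage sketch, assuming the `com` helpers resolve as in the snippet; `pd.Categorical` exposes a `.dtype`, so it lands in the 'category' branch:

import numpy as np
import pandas as pd

arrs = [np.array([1.5, 2.5]),                # float64 -> dtype.kind 'f'
        np.array(['x', 'y'], dtype=object),  # -> 'object'
        pd.Categorical(['a', 'b'])]          # -> 'category'
get_dtype_kinds(arrs)                        # -> {'f', 'object', 'category'}
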
Example #14
    def _make_str_accessor(self):
        from pandas.core.series import Series
        from pandas.core.index import Index
        if isinstance(self, Series) and not (
                (is_categorical_dtype(self.dtype) and
                 is_object_dtype(self.values.categories)) or
                (is_object_dtype(self.dtype))):
            # it's neither a string series nor a categorical series with
            # strings inside the categories.
            # this really should exclude all series with any non-string values
            # (instead of testing for object dtype), but that isn't practical
            # for performance reasons until we have a str dtype (GH 9343)
            raise AttributeError("Can only use .str accessor with string "
                                 "values, which use np.object_ dtype in "
                                 "pandas")
        elif isinstance(self, Index):
            # see src/inference.pyx which can contain string values
            allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
            if self.inferred_type not in allowed_types:
                message = ("Can only use .str accessor with string values "
                           "(i.e. inferred_type is 'string', 'unicode' or "
                           "'mixed')")
                raise AttributeError(message)
            if self.nlevels > 1:
                message = ("Can only use .str accessor with Index, "
                           "not MultiIndex")
                raise AttributeError(message)
        return StringMethods(self)
Example #15
    def test_constructor_categorical(self):
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                             fastpath=True)
        res = Series(cat)
        self.assertTrue(res.values.equals(cat))

        # GH12574
        self.assertRaises(
            ValueError,
            lambda: Series(pd.Categorical([1, 2, 3]), dtype='int64'))
        cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
        self.assertTrue(com.is_categorical_dtype(cat))
        self.assertTrue(com.is_categorical_dtype(cat.dtype))
        s = Series([1, 2, 3], dtype='category')
        self.assertTrue(com.is_categorical_dtype(s))
        self.assertTrue(com.is_categorical_dtype(s.dtype))
Example #16
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    typs = set()
    for arr in l:

        dtype = arr.dtype
        if com.is_categorical_dtype(dtype):
            typ = "category"
        elif com.is_sparse(arr):
            typ = "sparse"
        elif com.is_datetimetz(arr):
            typ = "datetimetz"
        elif com.is_datetime64_dtype(dtype):
            typ = "datetime"
        elif com.is_timedelta64_dtype(dtype):
            typ = "timedelta"
        elif com.is_object_dtype(dtype):
            typ = "object"
        elif com.is_bool_dtype(dtype):
            typ = "bool"
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
Example #17
    def test_categorical_order(self):
        # Directly construct using expected codes
        # Format is is_cat, col_name, labels (in order), underlying data
        expected = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd',
                               'e'], np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd',
                                     'e'], np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))
        ]
        cols = []
        for is_cat, col, labels, codes in expected:
            if is_cat:
                cols.append((col, pd.Categorical.from_codes(codes, labels)))
            else:
                cols.append((col, pd.Series(labels, dtype=np.float32)))
        expected = DataFrame.from_items(cols)

        # Read with and without categoricals, ensure order is identical
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)
        tm.assert_frame_equal(expected, parsed_115, check_categorical=False)
        tm.assert_frame_equal(expected, parsed_117, check_categorical=False)

        # Check identity of codes
        for col in expected:
            if is_categorical_dtype(expected[col]):
                tm.assert_series_equal(expected[col].cat.codes,
                                       parsed_115[col].cat.codes)
                tm.assert_index_equal(expected[col].cat.categories,
                                      parsed_115[col].cat.categories)
Example #18
    def test_constructor_categorical(self):
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                             fastpath=True)
        res = Series(cat)
        self.assertTrue(res.values.equals(cat))

        # GH12574
        self.assertRaises(
            ValueError, lambda: Series(pd.Categorical([1, 2, 3]),
                                       dtype='int64'))
        cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
        self.assertTrue(com.is_categorical_dtype(cat))
        self.assertTrue(com.is_categorical_dtype(cat.dtype))
        s = Series([1, 2, 3], dtype='category')
        self.assertTrue(com.is_categorical_dtype(s))
        self.assertTrue(com.is_categorical_dtype(s.dtype))
Example #20
    def _make_str_accessor(self):
        from pandas.core.series import Series
        from pandas.core.index import Index
        if isinstance(self, Series) and not ((is_categorical_dtype(
                self.dtype) and is_object_dtype(self.values.categories)) or
                                             (is_object_dtype(self.dtype))):
            # it's neither a string series nor a categorical series with
            # strings inside the categories.
            # this really should exclude all series with any non-string values
            # (instead of testing for object dtype), but that isn't practical
            # for performance reasons until we have a str dtype (GH 9343)
            raise AttributeError("Can only use .str accessor with string "
                                 "values, which use np.object_ dtype in "
                                 "pandas")
        elif isinstance(self, Index):
            # see src/inference.pyx which can contain string values
            allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
            if self.inferred_type not in allowed_types:
                message = (
                    "Can only use .str accessor with string values "
                    "(i.e. inferred_type is 'string', 'unicode' or 'mixed')")
                raise AttributeError(message)
            if self.nlevels > 1:
                message = ("Can only use .str accessor with Index, "
                           "not MultiIndex")
                raise AttributeError(message)
        return StringMethods(self)
Example #21
    def test_categorical_order(self):
        # Directly construct using expected codes
        # Format is is_cat, col_name, labels (in order), underlying data
        expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
                    (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
                    (True, 'noorder', ['a', 'b', 'c', 'd', 'e'], np.array([2, 1, 4, 0, 3])),
                    (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
                    (True, 'float_missing', ['a', 'd', 'e'], np.array([0, 1, 2, -1, -1])),
                    (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
                    (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
        cols = []
        for is_cat, col, labels, codes in expected:
            if is_cat:
                cols.append((col, pd.Categorical.from_codes(codes, labels)))
            else:
                cols.append((col, pd.Series(labels, dtype=np.float32)))
        expected = DataFrame.from_items(cols)

        # Read with and without categoricals, ensure order is identical
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)
        tm.assert_frame_equal(expected, parsed_115)
        tm.assert_frame_equal(expected, parsed_117)

        # Check identity of codes
        for col in expected:
            if is_categorical_dtype(expected[col]):
                tm.assert_series_equal(expected[col].cat.codes,
                                       parsed_115[col].cat.codes)
                tm.assert_index_equal(expected[col].cat.categories,
                                      parsed_115[col].cat.categories)
Example #23
    def _is_dtype_compat(self, other):
        """
        *this is an internal non-public method*

        provide a comparison between the dtype of self and other (coercing if
        needed)

        Raises
        ------
        TypeError if the dtypes are not compatible
        """
        if com.is_categorical_dtype(other):
            if isinstance(other, CategoricalIndex):
                other = other._values
            if not other.is_dtype_equal(self):
                raise TypeError("categories must match existing categories "
                                "when appending")
        else:
            values = other
            if not com.is_list_like(values):
                values = [values]
            other = CategoricalIndex(self._create_categorical(
                self, other, categories=self.categories, ordered=self.ordered))
            if not other.isin(values).all():
                raise TypeError("cannot append a non-category item to a "
                                "CategoricalIndex")

        return other
Example #24
    def test_basic(self):

        self.assertTrue(is_categorical_dtype(self.dtype))

        factor = Categorical.from_array(["a", "b", "b", "a", "a", "c", "c", "c"])

        s = Series(factor, name="A")

        # dtypes
        self.assertTrue(is_categorical_dtype(s.dtype))
        self.assertTrue(is_categorical_dtype(s))
        self.assertFalse(is_categorical_dtype(np.dtype("float64")))

        self.assertTrue(is_categorical(s.dtype))
        self.assertTrue(is_categorical(s))
        self.assertFalse(is_categorical(np.dtype("float64")))
        self.assertFalse(is_categorical(1.0))
Example #25
    def wrapper(self, other, axis=None):
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        if isinstance(other, ABCSeries):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (np.ndarray, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            if not is_categorical_dtype(self):
                msg = ("Cannot compare a Categorical for op {op} with Series "
                       "of dtype {typ}.\nIf you want to compare values, use "
                       "'series <op> np.asarray(other)'.")
                raise TypeError(msg.format(op=op, typ=self.dtype))

        if is_categorical_dtype(self):
            # cats are a special case as get_values() would return an ndarray,
            # which would then not take categories ordering into account
            # we can go directly to op, as the na_op would just test again and
            # dispatch to it.
            res = op(self.values, other)
        else:
            values = self.get_values()
            if isinstance(other, (list, np.ndarray)):
                other = np.asarray(other)

            res = na_op(values, other)
            if isscalar(res):
                raise TypeError('Could not compare %s type with Series' %
                                type(other))

            # always return a full value series here
            res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
        return res
Example #26
    def test_basic(self):

        self.assertTrue(is_categorical_dtype(self.dtype))

        factor = Categorical.from_array(['a', 'b', 'b', 'a',
                                         'a', 'c', 'c', 'c'])

        s = Series(factor, name='A')

        # dtypes
        self.assertTrue(is_categorical_dtype(s.dtype))
        self.assertTrue(is_categorical_dtype(s))
        self.assertFalse(is_categorical_dtype(np.dtype('float64')))

        self.assertTrue(is_categorical(s.dtype))
        self.assertTrue(is_categorical(s))
        self.assertFalse(is_categorical(np.dtype('float64')))
        self.assertFalse(is_categorical(1.0))
Example #27
def encode_categorical(table, columns=None, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    columns : list-like, optional, default: None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.

    allow_drop : boolean, optional, default: True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        if not is_categorical_dtype(
                table.dtype) and not table.dtype.char == "O":
            raise TypeError(
                "series must be of categorical dtype, but was {}".format(
                    table.dtype))
        return _encode_categorical_series(table, **kwargs)

    def _is_categorical_or_object(series):
        return is_categorical_dtype(series.dtype) or series.dtype.char == "O"

    if columns is None:
        # for columns containing categories
        columns_to_encode = {
            nam
            for nam, s in table.iteritems() if _is_categorical_or_object(s)
        }
    else:
        columns_to_encode = set(columns)

    items = []
    for name, series in table.iteritems():
        if name in columns_to_encode:
            series = _encode_categorical_series(series, **kwargs)
            if series is None:
                continue
        items.append(series)

    # concat columns of tables
    new_table = pandas.concat(items, axis=1, copy=False)
    return new_table
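
A hedged usage sketch for the DataFrame path above. `_encode_categorical_series` is assumed to implement the M-categories-to-(M-1)-columns dummy coding the docstring describes; this variant resembles scikit-survival's `encode_categorical`:

import pandas

df = pandas.DataFrame({
    'size': pandas.Categorical(['S', 'M', 'L', 'M']),  # 3 categories
    'price': [1.0, 2.0, 3.0, 2.5],                     # numeric, untouched
})
encoded = encode_categorical(df)
# 'size' is replaced by 3 - 1 = 2 indicator columns; 'price' passes through
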
Example #28
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation; in the current implementation this is
        always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """

    from pandas.core.categorical import Categorical

    def convert_categorical(x):
        # coerce to object dtype
        if com.is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    if get_dtype_kinds(to_concat) - set(['object', 'category']):
        # convert to object type and perform a regular concat
        return _concat_compat(
            [np.array(x, copy=False, dtype=object) for x in to_concat], axis=0)

    # we could have object blocks and categoricals here
    # if we only have categoricals then combine everything
    # else it's a non-compat categorical
    categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)]

    # validate the categories
    categories = categoricals[0]
    rawcats = categories.categories
    for x in categoricals[1:]:
        if not categories.is_dtype_equal(x):
            raise ValueError("incompatible categories in categorical concat")

    # we've already checked that all categoricals are the same, so if their
    # length is equal to the input then we have all the same categories
    if len(categoricals) == len(to_concat):
        # concatenating numeric types is much faster than object types
        # and fastpath takes a shorter path through the constructor
        return Categorical(np.concatenate([x.codes for x in to_concat],
                                          axis=0),
                           rawcats,
                           ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
                                   axis=0)
        return Categorical(concatted, rawcats)
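
A usage sketch tracing the branches above: identical categories take the codes fastpath, while a mix of categorical and object arrays falls back to object concatenation (values outside the first categorical's categories become NaN):

import numpy as np
import pandas as pd

a = pd.Categorical(['x', 'y'], categories=['x', 'y', 'z'])
b = pd.Categorical(['z'], categories=['x', 'y', 'z'])
_concat_categorical([a, b])  # fastpath: np.concatenate on the integer codes

c = np.array(['q'], dtype=object)
_concat_categorical([a, c])  # object fallback; 'q' is not a category -> NaN
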
Example #29
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation; in the current implementation this is
        always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """

    from pandas.core.categorical import Categorical

    def convert_categorical(x):
        # coerce to object dtype
        if com.is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    if get_dtype_kinds(to_concat) - set(['object', 'category']):
        # convert to object type and perform a regular concat
        return _concat_compat([np.array(x, copy=False, dtype=object)
                               for x in to_concat], axis=0)

    # we could have object blocks and categoricals here
    # if we only have categoricals then combine everything
    # else it's a non-compat categorical
    categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)]

    # validate the categories
    categories = categoricals[0]
    rawcats = categories.categories
    for x in categoricals[1:]:
        if not categories.is_dtype_equal(x):
            raise ValueError("incompatible categories in categorical concat")

    # we've already checked that all categoricals are the same, so if their
    # length is equal to the input then we have all the same categories
    if len(categoricals) == len(to_concat):
        # concatenating numeric types is much faster than object types
        # and fastpath takes a shorter path through the constructor
        return Categorical(np.concatenate([x.codes for x in to_concat],
                                          axis=0),
                           rawcats, ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
                                   axis=0)
        return Categorical(concatted, rawcats)
Example #30
def encode_categorical(table, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    allow_drop : boolean, optional, default=True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        if not is_categorical_dtype(
                table.dtype) and not table.dtype.char == "O":
            raise TypeError(
                "series must be of categorical dtype, but was {}".format(
                    table.dtype))
        return _encode_categorical_series(table, **kwargs)
    else:
        new_table = pandas.DataFrame(index=table.index)

        for j in range(table.shape[1]):
            series = table.iloc[:, j]

            # for columns containing categories
            if is_categorical_dtype(series.dtype) or series.dtype.char == "O":
                series = _encode_categorical_series(series, **kwargs)
                if series is None:
                    continue

            # concat columns of tables
            new_table = pandas.concat((new_table, series), axis=1, copy=False)
        return new_table
Example #31
def pandas_col_to_ibis_type(col):
    import pandas.core.common as pdcom
    import ibis.expr.datatypes as dt
    import numpy as np
    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64_dtype(dty):
        if pdcom.is_datetime64_ns_dtype(dty):
            return 'timestamp'
        else:
            raise com.IbisTypeError("Column {0} has dtype {1}, which is "
                                    "datetime64-like but does "
                                    "not use nanosecond units"
                                    .format(col.name, dty))
    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return 'int64'

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return 'boolean'

    # simple numerical types
    if issubclass(dty.type, np.int8):
        return 'int8'
    if issubclass(dty.type, np.int16):
        return 'int16'
    if issubclass(dty.type, np.int32):
        return 'int32'
    if issubclass(dty.type, np.int64):
        return 'int64'
    if issubclass(dty.type, np.float32):
        return 'float'
    if issubclass(dty.type, np.float64):
        return 'double'
    if issubclass(dty.type, np.uint8):
        return 'int16'
    if issubclass(dty.type, np.uint16):
        return 'int32'
    if issubclass(dty.type, np.uint32):
        return 'int64'
    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError("Column {0} is an unsigned int64"
                                .format(col.name))

    if pdcom.is_object_dtype(dty):
        # TODO: overly broad?
        return 'string'

    raise com.IbisTypeError("Column {0} is dtype {1}"
                            .format(col.name, dty))
Example #32
    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq']
            result += [top, freq]

        return pd.Series(result, index=names, name=data.name)
Example #33
def unconvert(values, dtype, compress=None):

    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to copying `np.fromstring`

    # Copy the string into a numpy array.
    return np.fromstring(values, dtype=dtype)
Example #34
def pandas_col_to_ibis_type(col):
    import pandas.core.common as pdcom
    import ibis.expr.datatypes as dt
    import numpy as np
    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64_dtype(dty):
        if pdcom.is_datetime64_ns_dtype(dty):
            return 'timestamp'
        else:
            raise com.IbisTypeError("Column {0} has dtype {1}, which is "
                                    "datetime64-like but does "
                                    "not use nanosecond units".format(
                                        col.name, dty))
    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return 'int64'

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return 'boolean'

    # simple numerical types
    if issubclass(dty.type, np.int8):
        return 'int8'
    if issubclass(dty.type, np.int16):
        return 'int16'
    if issubclass(dty.type, np.int32):
        return 'int32'
    if issubclass(dty.type, np.int64):
        return 'int64'
    if issubclass(dty.type, np.float32):
        return 'float'
    if issubclass(dty.type, np.float64):
        return 'double'
    if issubclass(dty.type, np.uint8):
        return 'int16'
    if issubclass(dty.type, np.uint16):
        return 'int32'
    if issubclass(dty.type, np.uint32):
        return 'int64'
    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError("Column {0} is an unsigned int64".format(
            col.name))

    if pdcom.is_object_dtype(dty):
        # TODO: overly broad?
        return 'string'

    raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))
Example #35
    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq', 'type']
            result += [top, freq, 'CAT']

        return pd.Series(result, index=names, name=data.name)
Example #36
def encode_categorical(table, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    allow_drop : boolean, optional, default=True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        if not is_categorical_dtype(table.dtype) and not table.dtype.char == "O":
            raise TypeError("series must be of categorical dtype, but was {}".format(table.dtype))
        return _encode_categorical_series(table, **kwargs)
    else:
        new_table = pandas.DataFrame(index=table.index)

        for j in range(table.shape[1]):
            series = table.iloc[:, j]

            # for columns containing categories
            if is_categorical_dtype(series.dtype) or series.dtype.char == "O":
                series = _encode_categorical_series(series, **kwargs)
                if series is None:
                    continue

            # concat columns of tables
            new_table = pandas.concat((new_table, series), axis=1, copy=False)
        return new_table
Example #37
    def test_categorical_ordering(self):
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)

        parsed_115_unordered = read_stata(self.dta19_115, order_categoricals=False)
        parsed_117_unordered = read_stata(self.dta19_117, order_categoricals=False)
        for col in parsed_115:
            if not is_categorical_dtype(parsed_115[col]):
                continue
            tm.assert_equal(True, parsed_115[col].cat.ordered)
            tm.assert_equal(True, parsed_117[col].cat.ordered)
            tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
            tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
Example #38
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
      (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass

    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer',
                                                ambiguous='infer'),
                                  index, data.name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'), index,
                                   name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy), index,
                                    name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'), index,
                                      name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
Example #40
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        entry = s.cat.categories[0]
        data = pd.Categorical([entry, entry],
                               categories=s.cat.categories,
                               ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)
    return pd.Series(data, name=s.name, index=idx)
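
A usage sketch: the helper fabricates a two-row stand-in Series with the original dtype and name (the pattern dask uses to build non-empty metadata); _scalar_from_dtype is assumed to return a placeholder scalar for plain NumPy dtypes:

import pandas as pd

s = pd.Series(pd.Categorical(['a', 'b', 'a']), name='col')
meta = _nonempty_series(s, idx=pd.RangeIndex(2))
meta.dtype           # category, preserving the original categories
meta.cat.categories  # Index(['a', 'b'], dtype='object')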
Example #41
    def is_categorical_astype(self, dtype):
        """
        validate that the dtype is astype-able to categorical;
        return True if the target dtype is categorical
        """
        if com.is_categorical_dtype(dtype):
            if dtype == com.CategoricalDtype():
                return True

            # this is a pd.Categorical instance, not a dtype,
            # and so is not a valid target for astype
            raise TypeError("invalid type {0} for astype".format(dtype))

        return False
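
The distinction being enforced: 'category' (or CategoricalDtype()) is a valid astype target, while a concrete pd.Categorical instance is not. A sketch of both sides, assuming the surrounding block-level astype plumbing:

import pandas as pd

s = pd.Series(['a', 'b'])
s.astype('category')                    # OK: dtype-like, the check returns True
# s.astype(pd.Categorical(['a', 'b']))  # raises: an instance, not a dtype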
Example #42
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        entry = s.cat.categories[0]
        data = pd.Categorical([entry, entry],
                               categories=s.cat.categories,
                               ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = [entry, entry]
    return pd.Series(data, name=s.name, index=idx)
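
Note the one difference from the otherwise identical Example #40: the fallback branch here builds a plain list rather than np.array([entry, entry], dtype=dtype), so the stand-in's dtype is inferred from the placeholder scalar instead of being forced.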
Example #43
    def wrapper(self, other):
        if isinstance(other, pd.Series):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index,
                                     name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (pa.Array, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            if not com.is_categorical_dtype(self):
                msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                      "If you want to compare values, use 'series <op> np.asarray(other)'."
                raise TypeError(msg.format(op=op, typ=self.dtype))
        else:

            mask = isnull(self)

            values = self.get_values()
            other = _index.convert_scalar(values, _values_from_object(other))

            if issubclass(values.dtype.type, np.datetime64):
                values = values.view('i8')

            # scalars
            res = na_op(values, other)
            if np.isscalar(res):
                raise TypeError('Could not compare %s type with Series' %
                                type(other))

            # always return a full value series here
            res = _values_from_object(res)

            res = pd.Series(res,
                            index=self.index,
                            name=self.name,
                            dtype='bool')

            # mask out the invalids
            if mask.any():
                res[mask] = masker

            return res
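
After the scalar comparison, null positions are overwritten with masker (a flag defined alongside this wrapper; True for !=, False for the other comparisons). The resulting semantics, shown with the public API:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])
(s == 1.0).tolist()  # [True, False, False] -- NaN never compares equal
(s != 1.0).tolist()  # [False, True, True]  -- the NaN row is masked to True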
Example #44
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
    elif is_categorical_dtype(dtype):
        entry = pd.Categorical([s.cat.categories[0]],
                               categories=s.cat.categories,
                               ordered=s.cat.ordered)
    elif dtype.kind in ['i', 'f', 'u']:
        entry = dtype.type(1)
    elif dtype.kind in _simple_fake_mapping:
        entry = _simple_fake_mapping[dtype.kind]
    else:
        raise TypeError("Can't handle dtype: {0}".format(dtype))
    return pd.Series([entry, entry], name=s.name, index=idx)
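
_simple_fake_mapping is not shown in this snippet; in dask it maps a dtype kind character to a placeholder scalar, along the lines of the sketch below (exact values illustrative):

import numpy as np

_simple_fake_mapping = {
    'b': np.bool_(True),
    'M': np.datetime64('1970-01-01'),
    'm': np.timedelta64(1),
    'O': 'foo',
}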
Example #45
    def test_categorical_ordering(self):
        parsed_115 = read_stata(self.dta19_115)
        parsed_117 = read_stata(self.dta19_117)

        parsed_115_unordered = read_stata(self.dta19_115,
                                          order_categoricals=False)
        parsed_117_unordered = read_stata(self.dta19_117,
                                          order_categoricals=False)
        for col in parsed_115:
            if not is_categorical_dtype(parsed_115[col]):
                continue
            tm.assert_equal(True, parsed_115[col].cat.ordered)
            tm.assert_equal(True, parsed_117[col].cat.ordered)
            tm.assert_equal(False, parsed_115_unordered[col].cat.ordered)
            tm.assert_equal(False, parsed_117_unordered[col].cat.ordered)
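
order_categoricals is a public read_stata keyword, so the behavior under test is easy to reproduce; the file name and column below are hypothetical:

import pandas as pd

df = pd.read_stata('labeled.dta')                             # ordered categoricals
raw = pd.read_stata('labeled.dta', order_categoricals=False)  # unordered
assert df['grade'].cat.ordered and not raw['grade'].cat.ordered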
Example #46
    def transform(column):
        if is_categorical_dtype(column.dtype):
            return column.cat.codes
        if column.dtype.char == "O":
            try:
                nc = column.astype(int)
            except ValueError:
                classes = column.dropna().unique()
                classes.sort(kind="mergesort")
                nc = column.replace(classes, numpy.arange(classes.shape[0]))
            return nc
        elif column.dtype == bool:
            return column.astype(int)

        return column
Example #47
    def transform(column):
        if is_categorical_dtype(column.dtype):
            return column.cat.codes
        if column.dtype.char == "O":
            try:
                nc = column.astype(int)
            except ValueError:
                classes = column.dropna().unique()
                classes.sort(kind="mergesort")
                nc = column.replace(classes, numpy.arange(classes.shape[0]))
            return nc
        elif column.dtype == bool:
            return column.astype(int)

        return column
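
Applied column-wise, transform yields an all-numeric frame: categoricals become their codes, object columns are parsed as int where possible and otherwise rank-encoded, and booleans become 0/1. A sketch:

import pandas as pd

df = pd.DataFrame({'cat': pd.Categorical(['a', 'b', 'a']),
                   'raw': ['x', 'y', 'x'],
                   'flag': [True, False, True]})
df.apply(transform)  # every column comes back integer-coded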
Example #48
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    ==============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
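
Spot-checking the mapping (these calls mirror pandas' Table Schema behavior):

import numpy as np
import pandas as pd

as_json_table_type(np.dtype('int64'))            # 'integer'
as_json_table_type(pd.Series([1.5]).dtype)       # 'number'
as_json_table_type(pd.Categorical(['a']).dtype)  # 'any'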
Example #49
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    ==============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
Example #50
    def wrapper(self, other):
        if isinstance(other, pd.Series):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (pa.Array, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            if not com.is_categorical_dtype(self):
                msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                      "If you want to compare values, use 'series <op> np.asarray(other)'."
                raise TypeError(msg.format(op=op, typ=self.dtype))
        else:

            mask = isnull(self)

            values = self.get_values()
            other = _index.convert_scalar(values, _values_from_object(other))

            if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
                values = values.view('i8')

            # scalars
            res = na_op(values, other)
            if np.isscalar(res):
                raise TypeError('Could not compare %s type with Series'
                                % type(other))

            # always return a full value series here
            res = _values_from_object(res)

            res = pd.Series(res, index=self.index, name=self.name,
                            dtype='bool')

            # mask out the invalids
            if mask.any():
                res[mask] = masker

            return res
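
This revision differs from Example #43 in a single line: the i8 view is taken for np.timedelta64 as well as np.datetime64, so scalar comparisons against timedeltas also go through the integer fast path.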
Example #51
def convert(values):
    """ convert the numpy values to a list """

    dtype = values.dtype

    if is_categorical_dtype(values):
        return values

    elif is_object_dtype(dtype):
        return values.ravel().tolist()

    if needs_i8_conversion(dtype):
        values = values.view('i8')
    v = values.ravel()

    if compressor == 'zlib':
        _check_zlib()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, zlib.compress(v))

    elif compressor == 'blosc':
        _check_blosc()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))

    # ndarray (on original dtype)
    return ExtType(0, v.tostring())
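
convert leans on compressor, ExtType and the codec modules from its enclosing msgpack-based serializer. A self-contained sketch of the zlib path, with tobytes() standing in for the long-deprecated tostring():

import zlib
import numpy as np
from msgpack import ExtType

arr = np.arange(4, dtype='int64')
payload = ExtType(0, zlib.compress(arr.ravel().tobytes()))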
Example #52
    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq', 'type']
            result += [top, freq, 'CAT']

        if y is not None:
            try:
                mdld = mdl_1d_cat(data, y)
                result += list(mdld)
            except:
                Tracer()()  # drop into the IPython debugger for inspection
        else:
            result += ['No Dep Var', '']

        names += ['AUC', 'cmatrix']

        return pd.Series(result, index=names, name=data.name)
Example #53
def make_field(arr, dtype=None):
    dtype = dtype or arr.dtype
    if arr.name is None:
        name = 'values'
    else:
        name = arr.name
    field = {'name': name, 'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
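
For a categorical Series the branch goes through the .cat accessor, and the categories land in an enum constraint:

import pandas as pd

make_field(pd.Series(pd.Categorical(['a', 'b']), name='col'))
# {'name': 'col', 'type': 'any',
#  'constraints': {'enum': ['a', 'b']}, 'ordered': False}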
Example #54
    def describe_categorical_1d(data):
        # Only run if at least 1 non-missing value
        objcounts = data.value_counts()
        top, freq = objcounts.index[0], objcounts.iloc[0]
        names = []
        result = []

        if data.dtype == object or com.is_categorical_dtype(data.dtype):
            names += ['top', 'freq', 'type']
            result += [top, freq, 'CAT']

        if y is not None:
            try:
                mdld = mdl_1d_cat(data, y)
                result += list(mdld)
            except:
                Tracer()()  # drop into the IPython debugger for inspection
        else:
            result += ['No Dep Var', '']

        names += ['AUC', 'cmatrix']

        return pd.Series(result, index=names, name=data.name)
Example #55
def encode_categorical(table, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with categorical columns to encode.

    allow_drop : boolean, optional, default=True
        Whether to allow dropping categorical columns that consist of
        only a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """
    if isinstance(table, pandas.Series):
        return _encode_categorical_series(table, **kwargs)
    else:
        new_table = pandas.DataFrame(index=table.index)

        for j in range(table.shape[1]):
            series = table.iloc[:, j]

            # for columns containing categories
            if is_categorical_dtype(series.dtype) or series.dtype.char == "O":
                series = _encode_categorical_series(series, **kwargs)
                if series is None:
                    continue

            # join tables on index
            new_table = new_table.join(series)
        return new_table
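
A hedged sketch of the intended use (this helper appears to come from scikit-survival's column utilities; _encode_categorical_series, not shown, does the per-column dummy coding):

import pandas

df = pandas.DataFrame({'size': pandas.Categorical(['S', 'M', 'L', 'M']),
                       'price': [1.0, 2.0, 3.0, 2.0]})
encoded = encode_categorical(df)
# 'price' passes through unchanged; 'size' expands to M-1 = 2 indicator columns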
Example #56
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
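
The public pd.value_counts wraps this logic; a quick look at the dropna and normalize knobs:

import numpy as np
import pandas as pd

x = np.array([1.0, 1.0, 2.0, np.nan])
pd.value_counts(x)                  # 1.0 -> 2, 2.0 -> 1 (NaN dropped)
pd.value_counts(x, dropna=False)    # includes a NaN bucket
pd.value_counts(x, normalize=True)  # relative frequencies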
Example #57
def convert_categorical(x):
    # coerce to object dtype
    if com.is_categorical_dtype(x.dtype):
        return x.get_values()
    return x.ravel()
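
get_values() densifies a categorical into an object ndarray; anything else is simply flattened:

import numpy as np
import pandas as pd

convert_categorical(pd.Series(pd.Categorical(['a', 'b'])))  # object array ['a', 'b']
convert_categorical(np.ones((2, 2)))                        # flat float array, length 4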