Python is_datetimetz Exemples, pandas.types.common.is_datetimetz Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_dtypes.py Projet : Alias4bb/pandas

    def test_dst(self):
        """Tz-aware Series starting in January and in August share a dtype."""
        # range starting in January
        winter = Series(date_range('2013-01-01', periods=3, tz='US/Eastern'),
                        name='A')
        self.assertTrue(is_datetimetz(winter))

        # range starting in August, same zone
        summer = Series(date_range('2013-08-01', periods=3, tz='US/Eastern'),
                        name='A')
        self.assertTrue(is_datetimetz(summer))

        # both Series must report the same dtype
        self.assertEqual(winter.dtype, summer.dtype)

Exemple #2

0

Afficher le fichier

Fichier : test_dtypes.py Projet : sunyuzheng/pandas

    def test_dst(self):
        """Tz-aware Series starting in January and in August share a dtype."""
        # range starting in January
        winter = Series(date_range('2013-01-01', periods=3, tz='US/Eastern'),
                        name='A')
        self.assertTrue(is_datetimetz(winter))

        # range starting in August, same zone
        summer = Series(date_range('2013-08-01', periods=3, tz='US/Eastern'),
                        name='A')
        self.assertTrue(is_datetimetz(summer))

        # both Series must report the same dtype
        self.assertEqual(winter.dtype, summer.dtype)

Exemple #3

0

Afficher le fichier

Fichier : test_dtypes.py Projet : Alias4bb/pandas

    def test_basic(self):
        """Both tz predicates accept tz-aware input and reject floats."""
        # the fixture dtype itself must be recognised
        self.assertTrue(is_datetime64tz_dtype(self.dtype))

        tz_series = Series(date_range('20130101', periods=3, tz='US/Eastern'),
                           name='A')

        # dtypes
        for accepted in (tz_series.dtype, tz_series):
            self.assertTrue(is_datetime64tz_dtype(accepted))
        for rejected in (np.dtype('float64'), 1.0):
            self.assertFalse(is_datetime64tz_dtype(rejected))

        # same expectations for is_datetimetz
        for accepted in (tz_series, tz_series.dtype):
            self.assertTrue(is_datetimetz(accepted))
        for rejected in (np.dtype('float64'), 1.0):
            self.assertFalse(is_datetimetz(rejected))

Exemple #4

0

Afficher le fichier

Fichier : test_dtypes.py Projet : sunyuzheng/pandas

    def test_basic(self):
        """Both tz predicates accept tz-aware input and reject floats."""
        # the fixture dtype itself must be recognised
        self.assertTrue(is_datetime64tz_dtype(self.dtype))

        tz_series = Series(date_range('20130101', periods=3, tz='US/Eastern'),
                           name='A')

        # dtypes
        for accepted in (tz_series.dtype, tz_series):
            self.assertTrue(is_datetime64tz_dtype(accepted))
        for rejected in (np.dtype('float64'), 1.0):
            self.assertFalse(is_datetime64tz_dtype(rejected))

        # same expectations for is_datetimetz
        for accepted in (tz_series, tz_series.dtype):
            self.assertTrue(is_datetimetz(accepted))
        for rejected in (np.dtype('float64'), 1.0):
            self.assertFalse(is_datetimetz(rejected))

Exemple #5

0

Afficher le fichier

Fichier : algorithms.py Projet : wuthmonehnin/pandas

def _value_counts_arraylike(values, dropna=True):
    """
    Count the occurrences of each distinct value of an array-like.

    Parameters
    ----------
    values : array-like
        Data to count. It is wrapped in a ``Series`` and the resulting
        ``.values`` are what get hashed.
    dropna : boolean, default True
        Forwarded to the hashtable counters. In the datetime-like branch
        ``iNaT`` keys are additionally filtered out when True. In the object
        branch, when False and nulls are present, a NaN key carrying the
        null count is inserted at position 0.

    Returns
    -------
    keys, counts : tuple
        The distinct values and their counts. In the datetime-like branch
        ``keys`` is rebuilt as a ``DatetimeIndex`` (tz-aware input) or a
        ``PeriodIndex`` (period input).
    """
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))

    # keep the caller's object: the tz is read from its dtype further down
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:

        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        # count on the int64 view
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        # the null mask is only needed when nulls must be reported
        if not dropna:
            mask = isnull(values)
            if mask.any():
                # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
                keys = np.insert(keys, 0, np.nan)
                counts = np.insert(counts, 0, mask.sum())

    return keys, counts

Exemple #6

0

Afficher le fichier

Fichier : test_base.py Projet : saharaheart/pandas

    def test_value_counts_unique_nunique(self):
        """
        Check value_counts / unique / nunique on data with repeated values.

        For every fixture in ``self.objs`` (boolean Index objects are
        skipped) the n-th element is repeated n + 1 times. The expected
        counts ``10, 9, ..., 1`` imply that each fixture holds 10 elements.
        """
        for orig in self.objs:

            o = orig.copy()
            klass = type(o)
            values = o._values

            if isinstance(values, Index):
                # reset name not to affect latter process
                values.name = None

            # skip boolean, because it only has 2 values at most
            if isinstance(o, Index) and o.is_boolean():
                continue

            # create repeated values, 'n'th element is repeated by n+1 times
            repeats = range(1, len(o) + 1)
            if isinstance(o, Index):
                expected_index = pd.Index(o[::-1])
                expected_index.name = None
                o = o.repeat(repeats)
                o.name = 'a'
            else:
                expected_index = pd.Index(values[::-1])
                idx = o.index.repeat(repeats)
                rep = np.repeat(values, repeats)
                o = klass(rep, index=idx, name='a')

            # check values has the same dtype as the original
            self.assertEqual(o.dtype, orig.dtype)

            expected_s = Series(range(10, 0, -1),
                                index=expected_index,
                                dtype='int64',
                                name='a')

            result = o.value_counts()
            tm.assert_series_equal(result, expected_s)
            # dedicated assertions report the offending value on failure
            self.assertIsNone(result.index.name)
            self.assertEqual(result.name, 'a')

            result = o.unique()
            if isinstance(o, Index):
                self.assertIsInstance(result, o.__class__)
                self.assert_index_equal(result, orig)
            elif is_datetimetz(o):
                # datetimetz Series returns array of Timestamp
                self.assertEqual(result[0], orig[0])
                for r in result:
                    self.assertIsInstance(r, pd.Timestamp)
                tm.assert_numpy_array_equal(result,
                                            orig._values.asobject.values)
            else:
                tm.assert_numpy_array_equal(result, orig.values)

            self.assertEqual(o.nunique(), len(np.unique(o.values)))

Exemple #7

0

Afficher le fichier

Fichier : algorithms.py Projet : ChristopherShort/pandas

def _value_counts_arraylike(values, dropna=True):
    """
    Count the occurrences of each distinct value of an array-like.

    Parameters
    ----------
    values : array-like
        Data to count. It is wrapped in a ``Series`` and the resulting
        ``.values`` are what get hashed.
    dropna : boolean, default True
        Forwarded to the scalar hashtable counter. In the datetime-like
        branch ``iNaT`` keys are additionally filtered out when True. In the
        object branch, when False and nulls are present, a NaN key carrying
        the null count is inserted at position 0.

    Returns
    -------
    keys, counts : tuple
        The distinct values and their counts. In the datetime-like branch
        ``keys`` is rebuilt as a ``DatetimeIndex`` (tz-aware input) or a
        ``PeriodIndex`` (period input).
    """
    is_datetimetz_type = is_datetimetz(values)
    is_period = (isinstance(values, ABCPeriodIndex) or
                 is_period_arraylike(values))

    # keep the caller's object: the tz is read from it further down
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        # count on the int64 view
        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            # an Index exposes tz directly, anything else through .dt
            if isinstance(orig, ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = _ensure_object(values)
        mask = isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts

Exemple #8

0

Afficher le fichier

Fichier : test_base.py Projet : RogerThomas/pandas

    def test_value_counts_unique_nunique(self):
        """
        Check value_counts / unique / nunique on data with repeated values.

        For every fixture in ``self.objs`` (boolean Index objects are
        skipped) the n-th element is repeated n + 1 times. The expected
        counts ``10, 9, ..., 1`` imply that each fixture holds 10 elements.
        """
        for orig in self.objs:
            o = orig.copy()
            klass = type(o)
            values = o._values

            if isinstance(values, Index):
                # reset name not to affect latter process
                values.name = None

            # skip boolean, because it only has 2 values at most
            if isinstance(o, Index) and o.is_boolean():
                continue

            # create repeated values, 'n'th element is repeated by n+1 times
            repeats = range(1, len(o) + 1)
            if isinstance(o, Index):
                expected_index = pd.Index(o[::-1])
                expected_index.name = None
                o = o.repeat(repeats)
                o.name = 'a'
            else:
                expected_index = pd.Index(values[::-1])
                idx = o.index.repeat(repeats)
                rep = np.repeat(values, repeats)
                o = klass(rep, index=idx, name='a')

            # check values has the same dtype as the original
            self.assertEqual(o.dtype, orig.dtype)

            expected_s = Series(range(10, 0, -1), index=expected_index,
                                dtype='int64', name='a')

            result = o.value_counts()
            tm.assert_series_equal(result, expected_s)
            # dedicated assertions report the offending value on failure
            self.assertIsNone(result.index.name)
            self.assertEqual(result.name, 'a')

            result = o.unique()
            if isinstance(o, Index):
                self.assertIsInstance(result, o.__class__)
                self.assert_index_equal(result, orig)
            elif is_datetimetz(o):
                # datetimetz Series returns array of Timestamp
                self.assertEqual(result[0], orig[0])
                for r in result:
                    self.assertIsInstance(r, pd.Timestamp)
                tm.assert_numpy_array_equal(result,
                                            orig._values.asobject.values)
            else:
                tm.assert_numpy_array_equal(result, orig.values)

            self.assertEqual(o.nunique(), len(np.unique(o.values)))

Exemple #9

0

Afficher le fichier

Fichier : algorithms.py Projet : ChristopherShort/pandas

def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : array-like
        Data to inspect; it must expose a ``dtype`` attribute. Series and
        Index inputs are unwrapped to their underlying values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    dtype = values.dtype

    # no need to revert to original type
    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
        if isinstance(values, (ABCSeries, ABCIndex)):
            values = values.values.view(np.int64)
        else:
            values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    # Dispatch on the dtype of the converted data. The original dtype would
    # send the int64 views / integer codes produced above to the object
    # hashtable even though they are plain integers.
    dtype = values.dtype

    if is_integer_dtype(dtype):
        values = _ensure_int64(values)
        duplicated = htable.duplicated_int64(values, keep=keep)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        duplicated = htable.duplicated_float64(values, keep=keep)
    else:
        values = _ensure_object(values)
        duplicated = htable.duplicated_object(values, keep=keep)

    return duplicated

Exemple #10

0

Afficher le fichier

Fichier : algorithms.py Projet : yizhiyong/pandas

def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : array-like
        Data to inspect; it must expose a ``dtype`` attribute. Series and
        Index inputs are unwrapped to their underlying values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    dtype = values.dtype

    # no need to revert to original type
    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
        if isinstance(values, (ABCSeries, ABCIndex)):
            values = values.values.view(np.int64)
        else:
            values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    # Dispatch on the dtype of the converted data. The original dtype would
    # send the int64 views / integer codes produced above to the object
    # hashtable even though they are plain integers.
    dtype = values.dtype

    if is_integer_dtype(dtype):
        values = _ensure_int64(values)
        duplicated = htable.duplicated_int64(values, keep=keep)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        duplicated = htable.duplicated_float64(values, keep=keep)
    else:
        values = _ensure_object(values)
        duplicated = htable.duplicated_object(values, keep=keep)

    return duplicated

Exemple #11

0

Afficher le fichier

Fichier : algorithms.py Projet : neer201/catboost_SE

def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : not referenced by this function body
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # Datetime-like data is hashed through its int64 representation:
    # - extension datetimelikes (period / tz-aware) are boxed into their
    #   Index so the metadata can be restored from it afterwards
    # - plain numpy datetimelikes are viewed as i8 and cast back at the end
    numpy_dtype = None
    if not needs_i8_conversion(values):
        hashable = np.asarray(values)
    elif is_period_dtype(values):
        values = PeriodIndex(values)
        hashable = values.asi8
    elif is_datetimetz(values):
        values = DatetimeIndex(values)
        hashable = values.asi8
    else:
        # numpy dtype
        numpy_dtype = values.dtype
        hashable = values.view(np.int64)

    (hash_klass, vec_klass), hashable = _get_data_algo(hashable, _hashtables)

    table = hash_klass(size_hint or len(hashable))
    collector = vec_klass()
    labels = _ensure_platform_int(
        table.get_labels(hashable, collector, 0, na_sentinel, True))
    uniques = collector.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    # restore the numpy datetimelike dtype recorded above, if any
    if numpy_dtype is not None:
        uniques = uniques.astype(numpy_dtype)

    if isinstance(values, Index):
        return labels, values._shallow_copy(uniques, name=None)
    if isinstance(values, Series):
        return labels, Index(uniques)
    return labels, uniques

Exemple #12

0

Afficher le fichier

Fichier : algorithms.py Projet : neer201/catboost_SE

def take_nd(arr,
            indexer,
            axis=0,
            out=None,
            fill_value=np.nan,
            mask_info=None,
            allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass

    Parameters
    ----------
    arr : ndarray
        Input array
    indexer : ndarray or None
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value. If None, every position along
        ``axis`` is taken.
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        _maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    mask_info : tuple of (ndarray, boolean)
        If provided, value should correspond to:
            (indexer != -1, (indexer != -1).any())
        If not provided, it will be computed internally if necessary
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done.  This short-circuits computation of a mask.  Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    out
        The array the take function wrote into: the supplied ``out``, or a
        newly allocated one when ``out`` is None. Categorical and
        datetimetz inputs instead return the result of their own
        ``take_nd`` / ``take`` method.

    Raises
    ------
    TypeError
        If filling is required and the supplied ``out`` has a dtype other
        than the promoted one.
    """

    # dispatch to internal type takes
    if is_categorical(arr):
        return arr.take_nd(indexer,
                           fill_value=fill_value,
                           allow_fill=allow_fill)
    elif is_datetimetz(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if indexer is None:
        # no indexer: take every position along the axis, nothing to fill
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = _ensure_int64(indexer)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = _maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is not None:
                    mask, needs_masking = mask_info
                else:
                    mask = indexer == -1
                    needs_masking = mask.any()
                    mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError('Incompatible type for fill_value')
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    # 2-d F-contiguous input is processed through its transpose and the
    # result is transposed back before returning
    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape = list(arr.shape)
        out_shape[axis] = len(indexer)
        out_shape = tuple(out_shape)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order='F')
        else:
            out = np.empty(out_shape, dtype=dtype)

    # pick the take implementation for this ndim / dtype pair and let it
    # fill ``out``
    func = _get_take_nd_function(arr.ndim,
                                 arr.dtype,
                                 out.dtype,
                                 axis=axis,
                                 mask_info=mask_info)
    indexer = _ensure_int64(indexer)
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out

Exemple #13

0

Afficher le fichier

Fichier : test_base.py Projet : saharaheart/pandas

    def test_value_counts_unique_nunique_null(self):
        """
        Check value_counts / unique / nunique when nulls are present.

        For every fixture in ``self.objs`` that allows NA operations, the
        first two elements are replaced by a null marker and the n-th
        element is then repeated n + 1 times. The expectations (counts
        ``10..3``, 8 / 9 distinct values) imply 10-element fixtures.
        """

        for null_obj in [np.nan, None]:
            for o in self.objs:
                klass = type(o)
                values = o.values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = pd.tslib.iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = pd.tslib.iNaT
                        values = o._values

                elif is_datetime64_dtype(o) or isinstance(o, PeriodIndex):
                    values[0:2] = pd.tslib.iNaT
                else:
                    values[0:2] = null_obj
                # check values has the same dtype as the original
                self.assertEqual(values.dtype, o.dtype)

                # create repeated values, 'n'th element is repeated by n+1
                # times
                repeats = range(1, len(o) + 1)
                if isinstance(o, PeriodIndex):
                    # freq must be specified because repeat makes freq
                    # ambiguous

                    # resets name from Index
                    expected_index = pd.Index(o, name=None)
                    # attach name to klass
                    o = klass(np.repeat(values, repeats),
                              freq=o.freq,
                              name='a')
                elif isinstance(o, Index):
                    expected_index = pd.Index(values, name=None)
                    o = klass(np.repeat(values, repeats), name='a')
                else:
                    expected_index = pd.Index(values, name=None)
                    idx = np.repeat(o.index.values, repeats)
                    o = klass(np.repeat(values, repeats),
                              index=idx,
                              name='a')

                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64',
                                       name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64',
                                    name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertIsNone(result_s_na.index.name)
                self.assertEqual(result_s_na.name, 'a')
                result_s = o.value_counts()
                # assert on the stored result, the same object whose name
                # attributes are checked below, instead of recomputing it
                tm.assert_series_equal(result_s, expected_s)
                self.assertIsNone(result_s.index.name)
                self.assertEqual(result_s.name, 'a')

                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result, Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    tm.assert_numpy_array_equal(result[1:],
                                                values[2:].asobject.values)
                    self.assertIs(result[0], pd.NaT)
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])
                    self.assertTrue(pd.isnull(result[0]))

                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)

Exemple #14

0

Afficher le fichier

Fichier : algorithms.py Projet : craigyoung2016/pandas

def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : not referenced by this function body
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # strip the tz for hashing; it is re-applied to the uniques at the end
    # NOTE(review): the previous comment read "localize to UTC", but the call
    # is tz_localize(None) -- confirm which representation is intended
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.tz_localize(None)

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # unorderable in py3 if mixed str/int
            # (only TypeError is handled, so unrelated failures and
            # KeyboardInterrupt are no longer swallowed)
            t = hash_klass(len(uniques))
            t.map_locations(_ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for e in uniques if f(e)],
                                 dtype=object)) for f in
                [lambda x: not isinstance(x, string_types),
                 lambda x: isinstance(x, string_types)]])
            sorter = _ensure_platform_int(t.lookup(
                _ensure_object(ordered)))

        # map each old position to its rank in the sorted order
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # negative labels are reset to -1 after the remap
        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetimetz_type:

        # reset tz
        uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
            values.tz)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques

Exemple #15

0

Afficher le fichier

    def test_value_counts_unique_nunique_null(self):
        """
        Check value_counts / unique / nunique when nulls are present.

        For every fixture in ``self.objs`` that allows NA operations, the
        first two elements of a copy are replaced by a null marker and the
        n-th element is then repeated n + 1 times. The expectations (counts
        ``10..3``, 8 / 9 distinct values) imply 10-element fixtures.
        """

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)
                values = o._values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = iNaT
                        values = o._values

                elif needs_i8_conversion(o):
                    values[0:2] = iNaT
                    values = o._shallow_copy(values)
                else:
                    values[0:2] = null_obj
                # check values has the same dtype as the original

                self.assertEqual(values.dtype, o.dtype)

                # create repeated values, 'n'th element is repeated by n+1
                # times
                repeats = range(1, len(o) + 1)
                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    expected_index = o.copy()
                    expected_index.name = None

                    # attach name to klass
                    o = klass(values.repeat(repeats))
                    o.name = 'a'
                else:
                    if is_datetimetz(o):
                        expected_index = orig._values._shallow_copy(values)
                    else:
                        expected_index = pd.Index(values)
                    expected_index.name = None
                    o = o.repeat(repeats)
                    o.name = 'a'

                # check values has the same dtype as the original
                self.assertEqual(o.dtype, orig.dtype)
                # check values correctly have NaN
                # builtin bool: the np.bool alias was removed in NumPy 1.24
                nanloc = np.zeros(len(o), dtype=bool)
                nanloc[:3] = True
                if isinstance(o, Index):
                    self.assert_numpy_array_equal(pd.isnull(o), nanloc)
                else:
                    exp = pd.Series(nanloc, o.index, name='a')
                    self.assert_series_equal(pd.isnull(o), exp)

                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64',
                                       name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64',
                                    name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertIsNone(result_s_na.index.name)
                self.assertEqual(result_s_na.name, 'a')
                result_s = o.value_counts()
                # assert on the stored result, the same object whose name
                # attributes are checked below, instead of recomputing it
                tm.assert_series_equal(result_s, expected_s)
                self.assertIsNone(result_s.index.name)
                self.assertEqual(result_s.name, 'a')

                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result, Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    tm.assert_numpy_array_equal(result[1:],
                                                values[2:].asobject.values)
                    self.assertIs(result[0], pd.NaT)
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])

                    self.assertTrue(pd.isnull(result[0]))
                    self.assertEqual(result.dtype, orig.dtype)

                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)

Exemple #16

0

Afficher le fichier

Fichier : ops.py Projet : craigyoung2016/pandas

    def _convert_to_array(self, values, name=None, other=None):
        """
        Convert one operand of a datetime/timedelta operation to an array.

        Parameters
        ----------
        values : scalar or list-like
            The operand; a non-list-like is wrapped in a 1-element ndarray.
        name : str, optional
            Name of the operator method (compared below against names such
            as ``'__add__'`` or ``'__mul__'``); used to decide whether
            integer / all-null float operands are acceptable.
        other : optional
            The opposite operand; only its ``dtype`` is read.

        Returns
        -------
        The converted values. Depending on the branch taken this is an
        ndarray, a Series, a DatetimeIndex or the input unchanged.

        Raises
        ------
        TypeError
            For integer operands with an unsupported operator, or for an
            inferred type that none of the branches handles.
        """
        from pandas.tseries.timedeltas import to_timedelta

        # keep the untouched input: the scalar-with-tz check below needs it
        ovalues = values
        supplied_dtype = None
        if not is_list_like(values):
            values = np.array([values])
        # if this is a Series that contains relevant dtype info, then use this
        # instead of the inferred type; this avoids coercing Series([NaT],
        # dtype='datetime64[ns]') to Series([NaT], dtype='timedelta64[ns]')
        elif (isinstance(values, pd.Series) and
              (is_timedelta64_dtype(values) or is_datetime64_dtype(values))):
            supplied_dtype = values.dtype
        inferred_type = supplied_dtype or lib.infer_dtype(values)
        if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or
                is_datetimetz(inferred_type)):
            # if the other operand is datetime/timedelta typed and every
            # value here is null (e.g. pd.NaT), the datetime path is the
            # wrong one: emit an all-NaT timedelta64 array instead
            if (supplied_dtype is None and other is not None and
                (other.dtype in ('timedelta64[ns]', 'datetime64[ns]')) and
                    isnull(values).all()):
                values = np.empty(values.shape, dtype='timedelta64[ns]')
                values[:] = iNaT

            # a datelike
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
            # datetime with tz
            elif (isinstance(ovalues, datetime.datetime) and
                  hasattr(ovalues, 'tz')):
                values = pd.DatetimeIndex(values)
            # datetime array with tz
            elif is_datetimetz(values):
                if isinstance(values, ABCSeries):
                    values = values._values
            elif not (isinstance(values, (np.ndarray, ABCSeries)) and
                      is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to ns here
            values = to_timedelta(values, errors='coerce', box=False)
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif inferred_type == 'floating':
            # all-null floats in an add/sub become NaT of the other
            # operand's dtype; any other float input is returned unchanged
            if (isnull(values).all() and
                    name in ('__add__', '__radd__', '__sub__', '__rsub__')):
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            return values
        elif self._is_offset(values):
            return values
        else:
            raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                            " operation".format(np.array(values).dtype))

        return values

Exemple #17

0

Afficher le fichier

Fichier : algorithms.py Projet : andrewkittredge/pandas

def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Tally how often each distinct non-null value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Order the result by its counts
    ascending : boolean, default False
        When sorting, use ascending rather than descending order
    normalize : boolean, default False
        Return relative frequencies (each count divided by the sum of
        counts) instead of raw counts
    bins : integer, optional
        Bucket the values into half-open bins with pd.cut before counting;
        only works with numeric data
    dropna : boolean, default True
        Leave NaN out of the counts

    Returns
    -------
    value_counts : Series
        Named after the ``name`` attribute of the input, when it has one

    Raises
    ------
    TypeError
        If ``bins`` is given and binning the values raises TypeError
    """
    from pandas.core.series import Series

    result_name = getattr(values, 'name', None)
    # ``bins`` is rebound to the bin edges below, so remember up front
    # whether binning was requested at all
    binned = bins is not None

    if binned:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on we count the integer bin codes, not the raw values
        values = cat.codes

    # Categorical and sparse count themselves; datetime tz can be handled
    # in the ndarray path
    use_extension_path = (is_extension_type(values) and
                          not is_datetimetz(values))

    if not use_extension_path:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index
        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=result_name)
    else:
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = result_name
        counts = result.values

    if binned:
        # TODO: This next line should be more efficient
        every_code = np.arange(len(cat.categories))
        result = result.reindex(every_code, fill_value=0)
        # label each bin by its left edge
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        total = float(counts.sum())
        result = result / total

    return result

Exemple #18

0

Afficher le fichier

Fichier : algorithms.py Projet : andrewkittredge/pandas

def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Map each input value to an integer label and collect the distinct values

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort the uniques (and relabel accordingly)
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Label used to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # Datetime-likes are hashed through their i8 representation:
    # - a numpy datetimelike is viewed as i8 and cast back afterwards
    # - an extension datetimelike is boxed into its Index first, so the
    #   metadata can be transferred when the uniques are rebuilt
    numpy_dtype = None
    if not needs_i8_conversion(values):
        raw = np.asarray(values)
    elif is_period_dtype(values):
        values = PeriodIndex(values)
        raw = values.asi8
    elif is_datetimetz(values):
        values = DatetimeIndex(values)
        raw = values.asi8
    else:
        # numpy dtype
        numpy_dtype = values.dtype
        raw = values.view(np.int64)

    (table_cls, vector_cls), raw = _get_data_algo(raw, _hashtables)

    table = table_cls(size_hint or len(raw))
    uniques = vector_cls()
    check_nulls = not is_integer_dtype(values)
    labels = _ensure_platform_int(
        table.get_labels(raw, uniques, 0, na_sentinel, check_nulls))

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    # undo the i8 view taken for plain numpy datetimelikes
    if numpy_dtype is not None:
        uniques = uniques.astype(numpy_dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)

    return labels, uniques

Exemple #19

0

Afficher le fichier

Fichier : algorithms.py Projet : andrewkittredge/pandas

def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
            allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass

    Parameters
    ----------
    arr : ndarray
        Input array. Categorical and datetime-with-tz inputs are not handled
        here; they are dispatched to their own ``take_nd`` / ``take`` method
        (which receives only indexer, fill_value and allow_fill).
    indexer : ndarray or None
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value. If None, every position along
        ``axis`` is taken in order.
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        _maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    mask_info : tuple of (ndarray, boolean)
        If provided, value should correspond to:
            (indexer != -1, (indexer != -1).any())
        If not provided, it will be computed internally if necessary
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done.  This short-circuits computation of a mask.  Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    out : ndarray
        Array holding the taken values. For categorical / datetime-with-tz
        input, whatever the dispatched method returns.

    Raises
    ------
    TypeError
        If filling requires promoting the dtype, the indexer contains -1
        entries, and ``out`` was supplied with a dtype other than the
        promoted one.
    """

    # dispatch to internal type takes
    if is_categorical(arr):
        return arr.take_nd(indexer, fill_value=fill_value,
                           allow_fill=allow_fill)
    elif is_datetimetz(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if indexer is None:
        # take everything along ``axis``: no -1 entries exist, so keep the
        # input dtype and use a dummy fill_value of that dtype
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = _ensure_int64(indexer)
        if not allow_fill:
            # caller promises no -1 entries: keep the dtype, dummy
            # fill_value, and record "no masking needed" without a mask
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = _maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is not None:
                    mask, needs_masking = mask_info
                else:
                    mask = indexer == -1
                    needs_masking = mask.any()
                    # keep the computed mask so it is passed on to
                    # _get_take_nd_function below
                    mask_info = mask, needs_masking
                if needs_masking:
                    # a supplied ``out`` that cannot hold the promoted
                    # dtype is an error
                    if out is not None and out.dtype != dtype:
                        raise TypeError('Incompatible type for fill_value')
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    # A 2-D F-contiguous array is processed through its transpose, with
    # ``axis`` (and ``out``) adjusted to match; the result is transposed
    # back before returning.
    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        # same shape as arr except along ``axis``, which has one entry per
        # element of the indexer
        out_shape = list(arr.shape)
        out_shape[axis] = len(indexer)
        out_shape = tuple(out_shape)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order='F')
        else:
            out = np.empty(out_shape, dtype=dtype)

    # pick the take routine for this ndim / dtype combination and let it
    # fill ``out``
    func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
                                 mask_info=mask_info)
    indexer = _ensure_int64(indexer)
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out

Exemple #20

0

Afficher le fichier

Fichier : algorithms.py Projet : neer201/catboost_SE

def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Build a Series of occurrence counts for the non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Order the result by its counts
    ascending : boolean, default False
        When sorting, go from smallest to largest count
    normalize : boolean, default False
        Divide every count by the sum of counts, giving a relative
        histogram
    bins : integer, optional
        Group the values into half-open bins (via pd.cut) and count per
        bin instead of per value; only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
        Takes its name from the ``name`` attribute of the input, if any

    Raises
    ------
    TypeError
        If ``bins`` is given and binning the values raises TypeError
    """
    from pandas.core.series import Series

    series_name = getattr(values, 'name', None)
    # decided once: ``bins`` is later overwritten with the bin edges
    use_bins = bins is not None

    if use_bins:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count bin codes instead of the original values
        values = cat.codes

    # handle Categorical and sparse through their own value_counts;
    # datetime tz can be handled in the ndarray path
    self_counting = is_extension_type(values) and not is_datetimetz(values)

    if self_counting:
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = series_name
        counts = result.values
    else:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index
        index = keys if isinstance(keys, Index) else Index(keys)
        result = Series(counts, index=index, name=series_name)

    if use_bins:
        # TODO: This next line should be more efficient
        n_bins = len(cat.categories)
        result = result.reindex(np.arange(n_bins), fill_value=0)
        # each bin is labelled with its left edge
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        denominator = float(counts.sum())
        result = result / denominator

    return result

Exemple #21

0

Afficher le fichier

Fichier : algorithms.py Projet : ChristopherShort/pandas

def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Map each input value to an integer label and collect the distinct values

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort the uniques (and relabel accordingly)
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Label used to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    tz_aware = is_datetimetz(values)
    raw = np.asarray(values)

    # tz-aware data is boxed into a DatetimeIndex and hashed through its
    # i8 values; the index is kept so the tz can be put back afterwards
    if tz_aware:
        values = DatetimeIndex(values)
        raw = values.asi8

    # remember which numpy datetimelike dtype (if any) must be restored
    # on the uniques once hashing is done
    if is_datetime64_dtype(raw):
        restore_dtype = 'M8[ns]'
    elif is_timedelta64_dtype(raw):
        restore_dtype = 'm8[ns]'
    else:
        restore_dtype = None

    (table_cls, vector_cls), raw = _get_data_algo(raw, _hashtables)

    table = table_cls(size_hint or len(raw))
    uniques = vector_cls()
    labels = _ensure_platform_int(
        table.get_labels(raw, uniques, 0, na_sentinel, True))

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if tz_aware:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif restore_dtype is not None:
        uniques = uniques.astype(restore_dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)

    return labels, uniques

Exemple #22

0

Afficher le fichier

Fichier : test_base.py Projet : RogerThomas/pandas

    def test_value_counts_unique_nunique_null(self):
        # Check value_counts / unique / nunique on every fixture object
        # after its first two entries have been replaced by a null marker
        # (np.nan or None; iNaT for datetime-like data).
        # NOTE(review): the expected counts below presumably rely on each
        # fixture in self.objs holding 10 distinct values -- confirm.

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)
                values = o._values

                # skip objects for which NA operations are not allowed
                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        # rebuild the index from its i8 values with the
                        # first two set to iNaT
                        v = o.asi8
                        v[0:2] = iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = iNaT
                        values = o._values

                elif needs_i8_conversion(o):
                    values[0:2] = iNaT
                    values = o._shallow_copy(values)
                else:
                    values[0:2] = null_obj
                # check values has the same dtype as the original

                self.assertEqual(values.dtype, o.dtype)

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    expected_index = o.copy()
                    expected_index.name = None

                    # attach name to klass
                    o = klass(values.repeat(range(1, len(o) + 1)))
                    o.name = 'a'
                else:
                    if is_datetimetz(o):
                        expected_index = orig._values._shallow_copy(values)
                    else:
                        expected_index = pd.Index(values)
                    expected_index.name = None
                    o = o.repeat(range(1, len(o) + 1))
                    o.name = 'a'

                # check values has the same dtype as the original
                self.assertEqual(o.dtype, orig.dtype)
                # check values correctly have NaN
                # (element 0 appears once and element 1 twice after the
                # repeat, so the first three positions are null)
                nanloc = np.zeros(len(o), dtype=np.bool)
                nanloc[:3] = True
                if isinstance(o, Index):
                    self.assert_numpy_array_equal(pd.isnull(o), nanloc)
                else:
                    exp = pd.Series(nanloc, o.index, name='a')
                    self.assert_series_equal(pd.isnull(o), exp)

                # with dropna=False the null bucket is expected last with a
                # count of 3; with the default dropna it is absent
                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64', name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64', name='a')

                # the result keeps the object's name but its index is
                # unnamed
                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertTrue(result_s_na.index.name is None)
                self.assertEqual(result_s_na.name, 'a')
                result_s = o.value_counts()
                tm.assert_series_equal(o.value_counts(), expected_s)
                self.assertTrue(result_s.index.name is None)
                self.assertEqual(result_s.name, 'a')

                # unique() is expected to yield the null first, followed by
                # the remaining values in their original order
                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result,
                                          Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    tm.assert_numpy_array_equal(result[1:],
                                                values[2:].asobject.values)
                    self.assertIs(result[0], pd.NaT)
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])

                    self.assertTrue(pd.isnull(result[0]))
                    self.assertEqual(result.dtype, orig.dtype)

                # the null counts as one extra distinct value only when
                # dropna=False
                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)

Exemple #23

0

Afficher le fichier

Fichier : test_base.py Projet : smoofra/pandas

    def test_value_counts_unique_nunique(self):
        # Two passes over the fixtures in self.objs:
        #   1. no nulls: the n-th element is repeated n+1 times, then
        #      value_counts / unique / nunique are checked
        #   2. the same checks after the first two entries are replaced by
        #      a null marker (np.nan or None; iNaT for datetime-like data)
        # NOTE(review): the expected counts presumably rely on each fixture
        # holding 10 distinct values -- confirm.
        for orig in self.objs:

            o = orig.copy()
            klass = type(o)
            values = o.values

            # create repeated values, 'n'th element is repeated by n+1 times
            if isinstance(o, PeriodIndex):
                # freq must be specified because repeat makes freq ambiguous

                # resets name from Index
                expected_index = pd.Index(o[::-1])
                expected_index.name = None

                # attach name to klass
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            elif isinstance(o, DatetimeIndex):

                # resets name from Index
                expected_index = pd.Index(o[::-1])
                expected_index.name = None

                # attach name to klass
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            # don't test boolean
            elif isinstance(o, Index) and o.is_boolean():
                continue
            elif isinstance(o, Index):
                expected_index = pd.Index(values[::-1])
                expected_index.name = None
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'
            else:
                # non-Index object: rebuild it from the repeated values and
                # a correspondingly repeated index
                expected_index = pd.Index(values[::-1])
                idx = o.index.repeat(range(1, len(o) + 1))
                o = klass(np.repeat(values, range(1, len(o) + 1)),
                          index=idx, name='a')

            # the last element was repeated most often, so the expected
            # counts run 10..1 over the reversed values
            expected_s = Series(range(10, 0, -1), index=expected_index,
                                dtype='int64', name='a')

            # the result keeps the object's name but its index is unnamed
            result = o.value_counts()
            tm.assert_series_equal(result, expected_s)
            self.assertTrue(result.index.name is None)
            self.assertEqual(result.name, 'a')

            result = o.unique()
            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                # datetime-like indexes must come back as the same class,
                # with the same freq, equal to the un-repeated original
                self.assertTrue(isinstance(result, o.__class__))
                self.assertEqual(result.freq, o.freq)
                self.assert_index_equal(result, orig)
            else:
                self.assert_numpy_array_equal(result, values)

            self.assertEqual(o.nunique(), len(np.unique(o.values)))

        # second pass: same checks with nulls in the first two positions
        for null_obj in [np.nan, None]:
            for o in self.objs:
                klass = type(o)
                values = o.values

                # skip objects for which NA operations are not allowed
                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                # NOTE(review): ``o`` is not copied in this loop, so if
                # ``values`` is a view of its data the assignments below
                # may modify the shared fixture -- confirm.
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = pd.tslib.iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = pd.tslib.iNaT
                        values = o.values
                elif o.values.dtype == 'datetime64[ns]' or isinstance(
                        o, PeriodIndex):
                    values[0:2] = pd.tslib.iNaT
                else:
                    values[0:2] = null_obj

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, PeriodIndex):
                    # freq must be specified because repeat makes freq
                    # ambiguous

                    # resets name from Index
                    expected_index = pd.Index(o, name=None)
                    # attach name to klass
                    o = klass(np.repeat(values, range(1, len(o) + 1)),
                              freq=o.freq, name='a')
                elif isinstance(o, Index):
                    expected_index = pd.Index(values, name=None)
                    o = klass(
                        np.repeat(values, range(1, len(o) + 1)), name='a')
                else:
                    expected_index = pd.Index(values, name=None)
                    idx = np.repeat(o.index.values, range(1, len(o) + 1))
                    o = klass(
                        np.repeat(values, range(
                            1, len(o) + 1)), index=idx, name='a')

                # with dropna=False the null bucket is expected last with a
                # count of 3; with the default dropna it is absent
                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64', name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64', name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertTrue(result_s_na.index.name is None)
                self.assertEqual(result_s_na.name, 'a')
                result_s = o.value_counts()
                tm.assert_series_equal(o.value_counts(), expected_s)
                self.assertTrue(result_s.index.name is None)
                self.assertEqual(result_s.name, 'a')

                # numpy_array_equal cannot compare arrays includes nan
                # (so the leading null entry is checked separately below)
                result = o.unique()
                self.assert_numpy_array_equal(result[1:], values[2:])

                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    self.assertTrue(result.asi8[0] == pd.tslib.iNaT)
                else:
                    self.assertTrue(pd.isnull(result[0]))

                # the null counts as one extra distinct value only when
                # dropna=False
                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)

Exemple #24

0

Afficher le fichier

Fichier : algorithms.py Projet : yizhiyong/pandas

def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Assign an integer code to every value and return the codes together
    with the distinct values they refer to

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort the uniques (and relabel accordingly)
    order : not referenced by this implementation
    na_sentinel : int, default -1
        Code used to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    has_tz = is_datetimetz(values)
    data = np.asarray(values)

    if has_tz:
        # hash tz-aware data through the i8 values of a DatetimeIndex and
        # keep that index around to re-attach the tz to the uniques
        values = DatetimeIndex(values)
        data = values.asi8

    # these must be evaluated before _get_data_algo replaces ``data``
    was_datetime = is_datetime64_dtype(data)
    was_timedelta = is_timedelta64_dtype(data)

    (table_cls, vector_cls), data = _get_data_algo(data, _hashtables)

    hash_table = table_cls(size_hint or len(data))
    uniques = vector_cls()
    labels = hash_table.get_labels(data, uniques, 0, na_sentinel, True)
    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    needs_sorting = sort and len(uniques) > 0
    if needs_sorting:
        uniques, labels = safe_sort(uniques, labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    if has_tz:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif was_datetime:
        uniques = uniques.astype('M8[ns]')
    elif was_timedelta:
        uniques = uniques.astype('m8[ns]')

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)

    return labels, uniques