Exemple #1
0
    def test_dst(self):
        """Series built from winter and summer dates share one tz dtype."""

        # January range in US/Eastern
        winter = Series(date_range('2013-01-01', periods=3, tz='US/Eastern'),
                        name='A')
        self.assertTrue(is_datetimetz(winter))

        # August range in the same timezone
        summer = Series(date_range('2013-08-01', periods=3, tz='US/Eastern'),
                        name='A')
        self.assertTrue(is_datetimetz(summer))

        # both series must report the same dtype
        self.assertEqual(winter.dtype, summer.dtype)
Exemple #2
0
    def test_dst(self):
        """Check that the tz-aware dtype is the same for January and August."""
        zone = "US/Eastern"

        jan_index = date_range("2013-01-01", periods=3, tz=zone)
        jan_series = Series(jan_index, name="A")
        self.assertTrue(is_datetimetz(jan_series))

        aug_index = date_range("2013-08-01", periods=3, tz=zone)
        aug_series = Series(aug_index, name="A")
        self.assertTrue(is_datetimetz(aug_series))

        # the two series were built from different months but one timezone
        self.assertEqual(jan_series.dtype, aug_series.dtype)
Exemple #3
0
def get_dtype_kinds(l):
    """
    Collect the kind label of every array in ``l``.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set holding one label per distinct kind found in the list
    """

    def _classify(arr):
        # the first matching check wins, so the order below is significant
        dtype = arr.dtype
        if com.is_categorical_dtype(dtype):
            return 'category'
        if com.is_sparse(arr):
            return 'sparse'
        if com.is_datetimetz(arr):
            return 'datetimetz'
        if com.is_datetime64_dtype(dtype):
            return 'datetime'
        if com.is_timedelta64_dtype(dtype):
            return 'timedelta'
        if com.is_object_dtype(dtype):
            return 'object'
        if com.is_bool_dtype(dtype):
            return 'bool'
        # nothing special matched: use the dtype's own ``kind`` attribute
        return dtype.kind

    return set(_classify(arr) for arr in l)
Exemple #4
0
def get_dtype_kinds(l):
    """
    Summarise which kinds of arrays are present in ``l``.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of the kind labels that occur in this list of arrays
    """

    # (predicate, label) pairs, tried top to bottom; some predicates look at
    # the array itself, the others only at its dtype
    checks = (
        (lambda arr, dtype: com.is_categorical_dtype(dtype), 'category'),
        (lambda arr, dtype: com.is_sparse(arr), 'sparse'),
        (lambda arr, dtype: com.is_datetimetz(arr), 'datetimetz'),
        (lambda arr, dtype: com.is_datetime64_dtype(dtype), 'datetime'),
        (lambda arr, dtype: com.is_timedelta64_dtype(dtype), 'timedelta'),
        (lambda arr, dtype: com.is_object_dtype(dtype), 'object'),
        (lambda arr, dtype: com.is_bool_dtype(dtype), 'bool'),
    )

    kinds = set()
    for arr in l:
        dtype = arr.dtype
        for matches, label in checks:
            if matches(arr, dtype):
                kinds.add(label)
                break
        else:
            # no predicate matched: fall back to ``dtype.kind``
            kinds.add(dtype.kind)
    return kinds
Exemple #5
0
def get_dtype_kinds(l):
    """
    Report the distinct kinds found among the arrays in ``l``.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set with the kind label of each array (duplicates collapse)
    """

    found = set()
    for arr in l:
        dtype = arr.dtype

        # checks run in a fixed order; the first hit decides the label
        if com.is_categorical_dtype(dtype):
            found.add("category")
            continue
        if com.is_sparse(arr):
            found.add("sparse")
            continue
        if com.is_datetimetz(arr):
            found.add("datetimetz")
            continue
        if com.is_datetime64_dtype(dtype):
            found.add("datetime")
            continue
        if com.is_timedelta64_dtype(dtype):
            found.add("timedelta")
            continue
        if com.is_object_dtype(dtype):
            found.add("object")
            continue
        if com.is_bool_dtype(dtype):
            found.add("bool")
            continue

        # no special case applied: record ``dtype.kind`` as-is
        found.add(dtype.kind)
    return found
Exemple #6
0
    def _convert_to_array(self, values, name=None, other=None):
        """
        Convert ``values`` into a form usable in a datetime/timedelta
        operation.

        Parameters
        ----------
        values : scalar or list-like
            Operand to convert. A value that is not list-like is first
            wrapped in a one-element ndarray.
        name : str, optional
            Name of the operation (e.g. ``'__mul__'``). Integer operands are
            only accepted for ``'__truediv__'``, ``'__div__'`` and
            ``'__mul__'``; the name is also embedded in that error message.
        other : optional
            The other operand. Its ``dtype`` is used to build an all-NaT
            array when ``values`` contains only nulls.

        Returns
        -------
        The converted values. Depending on the branch this is an ndarray, a
        Series, a DatetimeIndex, the result of ``to_timedelta``, or
        ``values`` itself when ``self._is_offset(values)`` is true.

        Raises
        ------
        TypeError
            If the inferred type of ``values`` cannot take part in a
            datetime/timedelta operation.
        """
        from pandas.tseries.timedeltas import to_timedelta

        # keep the untouched input: the scalar-with-tz check below needs it
        ovalues = values
        if not is_list_like(values):
            values = np.array([values])

        inferred_type = lib.infer_dtype(values)

        if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
            # if we have a other of timedelta, but use pd.NaT here we
            # we are in the wrong path
            if (other is not None and other.dtype == 'timedelta64[ns]' and
                    all(isnull(v) for v in values)):
                # all-null input paired with a timedelta operand: rebuild it
                # as an all-NaT array of the other operand's dtype
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT

            # a datelike
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
            # datetime with tz
            # NOTE(review): this tests for a ``tz`` attribute, not ``tzinfo``;
            # presumably it targets Timestamp-like scalars -- confirm
            elif isinstance(ovalues, datetime.datetime) and hasattr(ovalues,'tz'):
                values = pd.DatetimeIndex(values)
            # datetime array with tz
            elif com.is_datetimetz(values):
                if isinstance(values, pd.Series):
                    values = values._values
            # anything that is not already a datetime64 ndarray/Series is
            # converted element-wise
            elif not (isinstance(values, (np.ndarray, pd.Series)) and
                      is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to to ns here
            values = to_timedelta(values, errors='coerce')
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            # plain integers are only allowed for division/multiplication
            elif name not in ('__truediv__', '__div__', '__mul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif inferred_type == 'floating':
            # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
            if isnull(values).all():
                # NOTE(review): ``other`` defaults to None and is not checked
                # here, so ``other.dtype`` would raise AttributeError in that
                # case -- confirm callers always pass ``other``
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            else:
                raise TypeError(
                    'incompatible type [{0}] for a datetime/timedelta '
                    'operation'.format(np.array(values).dtype))
        elif self._is_offset(values):
            # offsets are passed through unchanged
            return values
        else:
            raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                            " operation".format(np.array(values).dtype))

        return values
Exemple #7
0
    def test_basic(self):
        """Exercise the tz-aware dtype predicates on dtypes, Series, scalars."""

        self.assertTrue(is_datetime64tz_dtype(self.dtype))

        tz_index = date_range('20130101', periods=3, tz='US/Eastern')
        tz_series = Series(tz_index, name='A')

        # is_datetime64tz_dtype: accepts the dtype and the Series,
        # rejects a float dtype and a plain float
        self.assertTrue(is_datetime64tz_dtype(tz_series.dtype))
        self.assertTrue(is_datetime64tz_dtype(tz_series))
        self.assertFalse(is_datetime64tz_dtype(np.dtype('float64')))
        self.assertFalse(is_datetime64tz_dtype(1.0))

        # is_datetimetz: same expectations
        self.assertTrue(is_datetimetz(tz_series))
        self.assertTrue(is_datetimetz(tz_series.dtype))
        self.assertFalse(is_datetimetz(np.dtype('float64')))
        self.assertFalse(is_datetimetz(1.0))
Exemple #8
0
    def test_basic(self):
        """Both tz predicates accept tz-aware inputs and reject floats."""

        self.assertTrue(is_datetime64tz_dtype(self.dtype))

        s = Series(date_range("20130101", periods=3, tz="US/Eastern"),
                   name="A")

        # dtype-level predicate
        for accepted in (s.dtype, s):
            self.assertTrue(is_datetime64tz_dtype(accepted))
        for rejected in (np.dtype("float64"), 1.0):
            self.assertFalse(is_datetime64tz_dtype(rejected))

        # array-level predicate
        for accepted in (s, s.dtype):
            self.assertTrue(is_datetimetz(accepted))
        for rejected in (np.dtype("float64"), 1.0):
            self.assertFalse(is_datetimetz(rejected))
Exemple #9
0
def _value_counts_arraylike(values, dropna=True):
    """
    Count occurrences of each distinct value in an array-like.

    Parameters
    ----------
    values : array-like
        Input to count; it is wrapped in a Series and its ``.values`` are
        what actually gets counted.
    dropna : boolean, default True
        Forwarded to the hashtable counters; for datetime-like data entries
        equal to ``iNaT`` are additionally removed when True.

    Returns
    -------
    (keys, counts) : the distinct values and how often each occurs
    """
    tz_aware = com.is_datetimetz(values)
    period_like = (isinstance(values, gt.ABCPeriodIndex) or
                   com.is_period_arraylike(values))

    # the untouched input is needed later to recover the timezone
    original = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if com.is_datetime_or_timedelta_dtype(dtype) or period_like:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if period_like:
            values = PeriodIndex(values)
            freq = values.freq

        # count on the int64 view of the data
        keys, counts = htable.value_count_scalar64(values.view(np.int64),
                                                   dropna)

        if dropna:
            keep = keys != iNaT
            keys = keys[keep]
            counts = counts[keep]

        # restore the dtype the keys had on the way in
        keys = keys.astype(dtype)

        if tz_aware:
            if isinstance(original, gt.ABCDatetimeIndex):
                tz = original.tz
            else:
                tz = original.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if period_like:
            keys = PeriodIndex._simple_new(keys, freq=freq)
        return keys, counts

    if com.is_integer_dtype(dtype):
        keys, counts = htable.value_count_scalar64(
            com._ensure_int64(values), dropna)
        return keys, counts

    if com.is_float_dtype(dtype):
        keys, counts = htable.value_count_scalar64(
            com._ensure_float64(values), dropna)
        return keys, counts

    # everything else is counted as objects
    values = com._ensure_object(values)
    null_mask = com.isnull(values)
    keys, counts = htable.value_count_object(values, null_mask)
    if not dropna and null_mask.any():
        # put a single NaN entry, carrying the null count, in front
        keys = np.insert(keys, 0, np.NaN)
        counts = np.insert(counts, 0, null_mask.sum())

    return keys, counts
Exemple #10
0
def _value_counts_arraylike(values, dropna=True):
    """
    Tally the distinct values of an array-like input.

    Parameters
    ----------
    values : array-like
        Data to tally; counting happens on ``Series(values).values``.
    dropna : boolean, default True
        Passed on to the hashtable counters. For datetime-like data, keys
        equal to ``iNaT`` are also filtered out when True.

    Returns
    -------
    (keys, counts) : distinct values and the number of times each was seen
    """
    tz_input = com.is_datetimetz(values)
    period_input = (isinstance(values, gt.ABCPeriodIndex)
                    or com.is_period_arraylike(values))

    # remember the caller's object; the tz is read from it further down
    source = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    datetimelike = com.is_datetime_or_timedelta_dtype(dtype) or period_input

    if not datetimelike:
        if com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        else:
            values = com._ensure_object(values)
            nulls = com.isnull(values)
            keys, counts = htable.value_count_object(values, nulls)
            if not dropna and nulls.any():
                # prepend one NaN key whose count is the number of nulls
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, nulls.sum())
        return keys, counts

    from pandas.tseries.index import DatetimeIndex
    from pandas.tseries.period import PeriodIndex

    if period_input:
        values = PeriodIndex(values)
        freq = values.freq

    # datetime-like data is counted through its int64 view
    as_int = values.view(np.int64)
    keys, counts = htable.value_count_scalar64(as_int, dropna)

    if dropna:
        not_nat = keys != iNaT
        keys, counts = keys[not_nat], counts[not_nat]

    # cast the keys back to the dtype seen before counting
    keys = keys.astype(dtype)

    if tz_input:
        # an index exposes ``tz`` directly, anything else via ``.dt``
        tz = (source.tz if isinstance(source, gt.ABCDatetimeIndex)
              else source.dt.tz)
        keys = DatetimeIndex._simple_new(keys, tz=tz)
    if period_input:
        keys = PeriodIndex._simple_new(keys, freq=freq)

    return keys, counts
Exemple #11
0
    def test_value_counts_unique_nunique(self):
        """
        Check ``value_counts``, ``unique`` and ``nunique`` on every object in
        ``self.objs``, first on the data as-is and then with the first two
        entries replaced by a null value.
        """
        # NOTE(review): the expected counts (range(10, 0, -1)) assume every
        # object in self.objs holds 10 distinct values -- confirm against the
        # fixture that builds self.objs
        for o in self.objs:
            klass = type(o)
            values = o.values

            # create repeated values, 'n'th element is repeated by n+1 times
            if isinstance(o, PeriodIndex):
                # freq must be specified because repeat makes freq ambiguous

                # resets name from Index
                expected_index = pd.Index(o[::-1])
                expected_index.name = None

                # attach name to klass
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            elif isinstance(o, DatetimeIndex):

                # resets name from Index
                expected_index = pd.Index(o[::-1])
                expected_index.name = None

                # attach name to klass
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            # don't test boolean
            elif isinstance(o, Index) and o.is_boolean():
                continue
            elif isinstance(o, Index):
                expected_index = pd.Index(values[::-1])
                expected_index.name = None
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'
            else:
                # non-Index object: rebuild it with repeated values and a
                # matching repeated index
                expected_index = pd.Index(values[::-1])
                idx = o.index.repeat(range(1, len(o) + 1))
                o = klass(
                    np.repeat(values, range(1,
                                            len(o) + 1)), index=idx, name='a')

            # counts are expected in descending order: 10, 9, ..., 1
            expected_s = Series(
                range(10, 0, -
                      1), index=expected_index, dtype='int64', name='a')

            result = o.value_counts()
            tm.assert_series_equal(result, expected_s)
            self.assertTrue(result.index.name is None)
            self.assertEqual(result.name, 'a')

            result = o.unique()
            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                self.assertTrue(isinstance(result, o.__class__))
                self.assertEqual(result.name, o.name)
                self.assertEqual(result.freq, o.freq)

            self.assert_numpy_array_equal(result, values)

            self.assertEqual(o.nunique(), len(np.unique(o.values)))

        # second pass: same checks with nulls injected at positions 0 and 1
        for null_obj in [np.nan, None]:
            for o in self.objs:
                klass = type(o)
                values = o.values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if com.is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = pd.tslib.iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = pd.tslib.iNaT
                        values = o.values
                elif o.values.dtype == 'datetime64[ns]' or isinstance(
                        o, PeriodIndex):
                    values[0:2] = pd.tslib.iNaT
                else:
                    values[0:2] = null_obj

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, PeriodIndex):
                    # freq must be specified because repeat makes freq
                    # ambiguous

                    # resets name from Index
                    expected_index = pd.Index(o, name=None)
                    # attach name to klass
                    o = klass(
                        np.repeat(values, range(
                            1, len(o) + 1)), freq=o.freq, name='a')
                elif isinstance(o, Index):
                    expected_index = pd.Index(values, name=None)
                    o = klass(
                        np.repeat(values, range(1, len(o) + 1)), name='a')
                else:
                    expected_index = pd.Index(values, name=None)
                    idx = np.repeat(o.index.values, range(1, len(o) + 1))
                    o = klass(
                        np.repeat(values, range(
                            1, len(o) + 1)), index=idx, name='a')

                # with dropna=False the two null slots (repeated 1 + 2 times)
                # are expected as one trailing entry with count 3
                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64', name='a')
                # with the default dropna the null entry is absent
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64', name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertTrue(result_s_na.index.name is None)
                self.assertEqual(result_s_na.name, 'a')
                result_s = o.value_counts()
                # NOTE(review): value_counts() is recomputed here instead of
                # reusing result_s
                tm.assert_series_equal(o.value_counts(), expected_s)
                self.assertTrue(result_s.index.name is None)
                self.assertEqual(result_s.name, 'a')

                # numpy_array_equal cannot compare arrays includes nan
                result = o.unique()
                self.assert_numpy_array_equal(result[1:], values[2:])

                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    self.assertTrue(result.asi8[0] == pd.tslib.iNaT)
                else:
                    self.assertTrue(pd.isnull(result[0]))

                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)
Exemple #12
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Count how often each non-null value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Order the result by its counts
    ascending : boolean, default False
        Direction used when ``sort`` is True
    normalize: boolean, default False
        If True, divide the result by the sum of the counts
    bins : integer, optional
        Group the values into half-open bins with pd.cut before counting;
        only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and the binning raises TypeError.
    """
    from pandas.core.series import Series
    series_name = getattr(values, 'name', None)

    # decided up front: ``bins`` is rebound to the bin edges further down
    binned = bins is not None

    if binned:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on the bin codes are what gets counted
        values = cat.codes

    if not com.is_extension_type(values) or com.is_datetimetz(values):
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index, Series
        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=series_name)
    else:
        # Categorical and sparse count themselves; tz-aware datetimes go
        # through the ndarray path above instead
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = series_name
        counts = result.values

    if binned:
        # TODO: This next line should be more efficient
        every_code = np.arange(len(cat.categories))
        result = result.reindex(every_code, fill_value=0)
        # label each bin with its left edge
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        total = float(counts.sum())
        result = result / total

    return result
Exemple #13
0
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Build a Series mapping each non-null value to its number of occurrences.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by count
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True the counts are divided by their total
    bins : integer, optional
        Rather than count values, group them into half-open bins via pd.cut;
        only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and pd.cut raises TypeError on the data.
    """
    from pandas.core.series import Series
    label = getattr(values, 'name', None)

    if bins is not None:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count the bin codes instead of the raw values
        values = cat.codes

    if com.is_extension_type(values) and not com.is_datetimetz(values):
        # Categorical and sparse provide their own value_counts;
        # tz-aware datetimes are handled by the ndarray path below
        own_values = Series(values).values
        result = own_values.value_counts(dropna=dropna)
        result.name = label
        counts = result.values
    else:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index, Series
        index = keys if isinstance(keys, Index) else Index(keys)
        result = Series(counts, index=index, name=label)

    if bins is not None:
        # TODO: This next line should be more efficient
        n_bins = len(cat.categories)
        result = result.reindex(np.arange(n_bins), fill_value=0)
        # the final edge is dropped so each bin is labelled by its left edge
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result
Exemple #14
0
def take_nd(arr,
            indexer,
            axis=0,
            out=None,
            fill_value=np.nan,
            mask_info=None,
            allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass

    Parameters
    ----------
    arr : ndarray
        Input array
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value. If None, every position along
        ``axis`` is taken.
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        common._maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    mask_info : tuple of (ndarray, boolean)
        If provided, value should correspond to:
            (indexer != -1, (indexer != -1).any())
        If not provided, it will be computed internally if necessary
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done.  This short-circuits computation of a mask.  Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    The taken values: ``out`` (allocated here when not supplied), or the
    result of the input's own take method for categorical and tz-aware
    datetime inputs.

    Raises
    ------
    TypeError
        If filling is required and the supplied ``out`` has a dtype other
        than the promoted one.
    """

    # dispatch to internal type takes
    if com.is_categorical(arr):
        return arr.take_nd(indexer,
                           fill_value=fill_value,
                           allow_fill=allow_fill)
    elif com.is_datetimetz(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if indexer is None:
        # no indexer: take everything along ``axis``; nothing can need
        # filling, so keep arr's dtype and use a dummy fill_value
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = com._ensure_int64(indexer)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = com._maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is not None:
                    mask, needs_masking = mask_info
                else:
                    mask = indexer == -1
                    needs_masking = mask.any()
                    mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError('Incompatible type for fill_value')
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    # 2-d F-contiguous input is processed on its transpose and the result
    # is transposed back before returning
    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        # the requested axis maps to the opposite axis of the transpose
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        # same shape as arr, except ``axis`` has one entry per index taken
        out_shape = list(arr.shape)
        out_shape[axis] = len(indexer)
        out_shape = tuple(out_shape)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order='F')
        else:
            out = np.empty(out_shape, dtype=dtype)

    # pick the take routine for this ndim / input dtype / output dtype
    func = _get_take_nd_function(arr.ndim,
                                 arr.dtype,
                                 out.dtype,
                                 axis=axis,
                                 mask_info=mask_info)
    # NOTE(review): both branches above already produce an int64 indexer, so
    # this second conversion looks redundant -- confirm before removing
    indexer = com._ensure_int64(indexer)
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : default None
        Accepted for interface compatibility; not referenced anywhere in
        this function
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # tz-aware input: hash the tz-naive values, the tz is re-attached to the
    # uniques at the end
    is_datetimetz = com.is_datetimetz(values)
    if is_datetimetz:
        values = DatetimeIndex(values)
        vals = values.tz_localize(None)

    is_datetime = com.is_datetime64_dtype(vals)
    is_timedelta = com.is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # unorderable in py3 if mixed str/int; only that failure is
            # handled here so unrelated errors (and KeyboardInterrupt) are
            # no longer swallowed
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings: sort each group on its own, then
            # place the non-strings in front
            non_strings = np.sort(np.array(
                [e for e in uniques if not isinstance(e, string_types)],
                dtype=object))
            strings = np.sort(np.array(
                [e for e in uniques if isinstance(e, string_types)],
                dtype=object))
            ordered = np.concatenate([non_strings, strings])
            sorter = com._ensure_platform_int(t.lookup(
                com._ensure_object(ordered)))

        # invert the sorter so existing labels can be mapped to the
        # positions their uniques occupy after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # negative labels are reset to -1 after the remapping
        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetimetz:

        # reset tz
        uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
            values.tz)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
Exemple #16
0
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. The counts are divided by
        the total of the counted entries, so values excluded by ``dropna``
        do not contribute to the denominator.
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and pd.cut raises TypeError on the data.
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count the bin codes instead of the raw values
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)
        # keep the raw counts around for normalization below
        counts = result.values

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                # remember the tz, count on the tz-naive values
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                # put a single NaN entry, carrying the null count, in front
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # divide by what was actually counted; using values.size here would
        # keep dropped nulls in the denominator and the frequencies would
        # no longer sum to 1
        result = result / float(counts.sum())

    return result
Exemple #17
0
def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
            allow_fill=True):
    """
    Take entries of ``arr`` along ``axis``, writing ``fill_value`` into the
    slots selected by -1 in the same pass (specialized Cython take).

    Parameters
    ----------
    arr : ndarray
        Array to take from. Categorical and tz-aware datetime inputs are
        forwarded to their own ``take_nd`` / ``take`` methods instead.
    indexer : ndarray or None
        1-D array of positions to take; positions equal to -1 are filled
        with ``fill_value``. ``None`` selects every position along ``axis``.
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional preallocated output. Its dtype has to hold both the input
        values and ``fill_value`` when the indexer contains -1 entries;
        ``common._maybe_promote`` yields that dtype for a given fill value
    fill_value : any, default np.nan
        Value written wherever the indexer is -1
    mask_info : tuple of (ndarray, boolean)
        Optional precomputed ``(mask, needs_masking)`` pair. When it is not
        supplied and a dtype promotion is on the table, it is derived here as
        ``(indexer == -1, (indexer == -1).any())``
    allow_fill : boolean, default True
        When False the indexer is taken to contain no -1 values, so no mask
        is computed and no filling happens. The result is undefined if a -1
        is present anyway.

    Returns
    -------
    ndarray
        ``out`` (allocated here when not supplied). For categorical and
        tz-aware input, whatever the delegated method returns.

    Raises
    ------
    TypeError
        If filling is actually required and the supplied ``out`` has a dtype
        different from the promoted one.
    """

    # containers with their own take implementation handle the request
    if com.is_categorical(arr):
        return arr.take_nd(indexer, fill_value=fill_value,
                           allow_fill=allow_fill)
    if com.is_datetimetz(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if indexer is None:
        # take everything along the axis; nothing can need filling
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype = arr.dtype
        fill_value = arr.dtype.type()
    else:
        indexer = com._ensure_int64(indexer)
        if allow_fill:
            # cheap check first: would the fill value force a wider dtype?
            dtype, fill_value = com._maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # only now look at the indexer to see if any slot is filled
                if mask_info is None:
                    missing = indexer == -1
                    mask_info = missing, missing.any()
                _, needs_masking = mask_info
                if not needs_masking:
                    # nothing gets filled: fall back to the input dtype and
                    # use a dummy fill value (never written, but the cython
                    # code must still be able to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()
                elif out is not None and out.dtype != dtype:
                    raise TypeError('Incompatible type for fill_value')
        else:
            dtype = arr.dtype
            fill_value = arr.dtype.type()
            mask_info = None, False

    # F-contiguous 2-d input is processed through its transpose
    flip_order = arr.ndim == 2 and arr.flags.f_contiguous
    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # from here on dtype is able to hold both the arr values and the
    # fill_value
    if out is None:
        dims = list(arr.shape)
        dims[axis] = len(indexer)
        # an F-ordered result can be an order of magnitude faster for
        # dataframes built directly from 2-d ndarrays (df.values is
        # c-contiguous while df._data.blocks[0] is its f-contiguous
        # transpose)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            layout = 'F'
        else:
            layout = 'C'
        out = np.empty(tuple(dims), dtype=dtype, order=layout)

    take_func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype,
                                      axis=axis, mask_info=mask_info)
    indexer = com._ensure_int64(indexer)
    take_func(arr, indexer, out, fill_value)

    return out.T if flip_order else out
Exemple #18
0
    def _convert_to_array(self, values, name=None, other=None):
        """Coerce one operand of a datetime/timedelta operation to an array.

        ``values`` is the operand, ``name`` the operator method name (e.g.
        ``'__add__'``) and ``other`` the opposite operand, consulted only for
        its dtype. The type is taken from the dtype of a datetime64 /
        timedelta64 Series when one is given and inferred otherwise.

        Raises TypeError for integer input combined with an operator outside
        the division/multiplication set, and for any input that is neither
        datetime-like, timedelta-like, integer, floating nor an offset.
        """
        from pandas.tseries.timedeltas import to_timedelta

        original = values
        explicit_dtype = None
        if not is_list_like(values):
            # wrap a scalar so it can be handled like an array below
            values = np.array([values])
        elif isinstance(values, pd.Series) and (
                is_timedelta64_dtype(values) or is_datetime64_dtype(values)):
            # trust the dtype carried by the Series over inference, so that
            # Series([NaT], dtype='datetime64[ns]') is not coerced to
            # Series([NaT], dtype='timedelta64[ns]')
            explicit_dtype = values.dtype

        inferred_type = explicit_dtype or lib.infer_dtype(values)

        is_datelike = (
            inferred_type in ('datetime64', 'datetime', 'date', 'time') or
            com.is_datetimetz(inferred_type))
        if is_datelike:
            if (explicit_dtype is None and
                    other is not None and
                    other.dtype in ('timedelta64[ns]', 'datetime64[ns]') and
                    isnull(values).all()):
                # an all-null operand (e.g. pd.NaT) facing a datetime or
                # timedelta operand is represented as timedelta NaT
                values = np.empty(values.shape, dtype='timedelta64[ns]')
                values[:] = iNaT
            elif isinstance(values, pd.DatetimeIndex):
                # a datelike index
                values = values.to_series()
            elif (isinstance(original, datetime.datetime) and
                  hasattr(original, 'tz')):
                # scalar datetime carrying a tz attribute
                values = pd.DatetimeIndex(values)
            elif com.is_datetimetz(values):
                # tz-aware datetime array: unwrap a Series to its values
                if isinstance(values, pd.Series):
                    values = values._values
            elif (not isinstance(values, (np.ndarray, pd.Series)) or
                  not is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
            return values

        if inferred_type in ('timedelta', 'timedelta64'):
            # timedelta input is converted to ns resolution here
            return to_timedelta(values, errors='coerce')

        if inferred_type == 'integer':
            if values.dtype.kind == 'm':
                # py3 compat: dtype kind is 'm' although inferred as integer
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
            return values

        if inferred_type == 'floating':
            additive = ('__add__', '__radd__', '__sub__', '__rsub__')
            if isnull(values).all() and name in additive:
                # all-null floats in an add/sub take the other side's dtype
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            return values

        if self._is_offset(values):
            return values

        raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                        " operation".format(np.array(values).dtype))
Exemple #19
0
    def test_value_counts_unique_nunique(self):
        # Checks value_counts(), unique() and nunique() for every object in
        # self.objs: first on the data as given, then again with the first
        # two entries replaced by a null marker.
        for o in self.objs:
            klass = type(o)
            values = o.values

            # create repeated values, 'n'th element is repeated by n+1 times
            if isinstance(o, PeriodIndex):
                # freq must be specified because repeat makes freq ambiguous

                # resets name from Index
                expected_index = pd.Index(o[::-1])

                # attach name to klass
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            elif isinstance(o, DatetimeIndex):

                # resets name from Index
                expected_index = pd.Index(o[::-1])

                # attach name to klass
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            # don't test boolean
            elif isinstance(o,Index) and o.is_boolean():
                continue
            elif isinstance(o, Index):
                expected_index = pd.Index(values[::-1])
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'
            else:
                # non-Index object: rebuild it from repeated values with a
                # correspondingly repeated index
                expected_index = pd.Index(values[::-1])
                idx = o.index.repeat(range(1, len(o) + 1))
                o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a')

            # counts run from 10 down to 1 over the reversed original values
            # (assumes every fixture holds 10 distinct values -- TODO confirm
            # against the fixture setup)
            expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64', name='a')

            # the result keeps the object's name, but its index is unnamed
            result = o.value_counts()
            tm.assert_series_equal(result, expected_s)
            self.assertTrue(result.index.name is None)
            self.assertEqual(result.name, 'a')

            result = o.unique()
            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                # for these index types unique() must return the same class
                # with name and freq carried over
                self.assertTrue(isinstance(result, o.__class__))
                self.assertEqual(result.name, o.name)
                self.assertEqual(result.freq, o.freq)

            self.assert_numpy_array_equal(result, values)

            self.assertEqual(o.nunique(), len(np.unique(o.values)))

        # second pass: same checks with nulls present, once per null marker
        for null_obj in [np.nan, None]:
            for o in self.objs:
                klass = type(o)
                values = o.values

                # skip objects for which the helper reports NA ops as
                # not allowed
                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if com.is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        # null out the first two entries through the i8
                        # representation and rebuild via _shallow_copy
                        v = o.asi8
                        v[0:2] = pd.tslib.iNaT
                        values = o._shallow_copy(v)
                    else:
                        # work on a copy so the fixture itself is not
                        # modified by the assignment
                        o = o.copy()
                        o[0:2] = pd.tslib.iNaT
                        values = o.values
                elif o.values.dtype == 'datetime64[ns]' or isinstance(o, PeriodIndex):
                    # datetime64 and period data use iNaT as the null marker
                    values[0:2] = pd.tslib.iNaT
                else:
                    values[0:2] = null_obj

                # create repeated values, 'n'th element is repeated by n+1 times
                if isinstance(o, PeriodIndex):
                    # freq must be specified because repeat makes freq ambiguous

                    # resets name from Index
                    expected_index = pd.Index(o, name=None)
                    # attach name to klass
                    o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq, name='a')
                elif isinstance(o, Index):
                    expected_index = pd.Index(values, name=None)
                    o = klass(np.repeat(values, range(1, len(o) + 1)), name='a')
                else:
                    expected_index = pd.Index(values, name=None)
                    idx = np.repeat(o.index.values, range(1, len(o) + 1))
                    o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a')

                # nulls kept: counts 10..3 for positions 9..2, then one more
                # group of 3 labelled by position 1 (presumably the 1 + 2
                # repeats of the two nulled entries -- TODO confirm)
                expected_s_na = Series(list(range(10, 2, -1)) +[3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64', name='a')
                # nulls dropped: only positions 9..2 remain
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64', name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertTrue(result_s_na.index.name is None)
                self.assertEqual(result_s_na.name, 'a')
                # NOTE(review): the comparison below calls value_counts()
                # again rather than reusing result_s
                result_s = o.value_counts()
                tm.assert_series_equal(o.value_counts(), expected_s)
                self.assertTrue(result_s.index.name is None)
                self.assertEqual(result_s.name, 'a')

                # assert_numpy_array_equal cannot compare arrays that contain
                # nan, so the leading null is checked separately below
                result = o.unique()
                self.assert_numpy_array_equal(result[1:], values[2:])

                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    self.assertTrue(result.asi8[0] == pd.tslib.iNaT)
                else:
                    self.assertTrue(pd.isnull(result[0]))

                # 8 non-null uniques, 9 once the null is counted too
                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are relative
        to the observations that were actually counted, so nulls removed by
        ``dropna`` are not part of the denominator
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
        Counts (or relative frequencies when ``normalize`` is set) indexed
        by the distinct values, or by the left bin edges when ``bins`` is
        given.

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` raises TypeError for the data.
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex
    from pandas.tslib import iNaT

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on the bin codes are what gets counted
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)
        # keep the raw counts around for normalization below
        counts = result.values

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                # remember the tz and count on the tz-naive representation
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            # datetime-likes are counted through their int64 view
            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                # in the int64 view the null marker is iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                # add the nulls back as a single leading NaN entry
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # Divide by the number of observations that were counted, not by
        # values.size: the latter still includes nulls removed by dropna,
        # which made the frequencies sum to less than 1.  `counts` is taken
        # before the bins reindex, so the binned case presumably keeps its
        # previous denominator -- TODO confirm integer codes are never
        # dropped by htable.
        result = result / float(counts.sum())

    return result
Exemple #21
0
    def _convert_to_array(self, values, name=None, other=None):
        """Coerce one operand of a datetime/timedelta operation to an array.

        ``values`` is the operand, ``name`` the operator method name (e.g.
        ``'__add__'``) and ``other`` the opposite operand, consulted only for
        its dtype. The type is taken from the dtype of a datetime64 /
        timedelta64 Series when one is given and inferred otherwise.

        Raises TypeError for integer input combined with an operator outside
        the division/multiplication set, and for any input that is neither
        datetime-like, timedelta-like, integer, floating nor an offset.
        """
        from pandas.tseries.timedeltas import to_timedelta

        original = values
        explicit_dtype = None
        if not is_list_like(values):
            # wrap a scalar so it can be handled like an array below
            values = np.array([values])
        elif isinstance(values, pd.Series) and (
                is_timedelta64_dtype(values) or is_datetime64_dtype(values)):
            # trust the dtype carried by the Series over inference, so that
            # Series([NaT], dtype='datetime64[ns]') is not coerced to
            # Series([NaT], dtype='timedelta64[ns]')
            explicit_dtype = values.dtype

        inferred_type = explicit_dtype or lib.infer_dtype(values)

        is_datelike = (
            inferred_type in ('datetime64', 'datetime', 'date', 'time') or
            com.is_datetimetz(inferred_type))
        if is_datelike:
            if (explicit_dtype is None and
                    other is not None and
                    other.dtype in ('timedelta64[ns]', 'datetime64[ns]') and
                    isnull(values).all()):
                # an all-null operand (e.g. pd.NaT) facing a datetime or
                # timedelta operand is represented as timedelta NaT
                values = np.empty(values.shape, dtype='timedelta64[ns]')
                values[:] = iNaT
            elif isinstance(values, pd.DatetimeIndex):
                # a datelike index
                values = values.to_series()
            elif (isinstance(original, datetime.datetime) and
                  hasattr(original, 'tz')):
                # scalar datetime carrying a tz attribute
                values = pd.DatetimeIndex(values)
            elif com.is_datetimetz(values):
                # tz-aware datetime array: unwrap a Series to its values
                if isinstance(values, ABCSeries):
                    values = values._values
            elif (not isinstance(values, (np.ndarray, ABCSeries)) or
                  not is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
            return values

        if inferred_type in ('timedelta', 'timedelta64'):
            # timedelta input is converted to ns resolution here
            return to_timedelta(values, errors='coerce')

        if inferred_type == 'integer':
            if values.dtype.kind == 'm':
                # py3 compat: dtype kind is 'm' although inferred as integer
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
            return values

        if inferred_type == 'floating':
            additive = ('__add__', '__radd__', '__sub__', '__rsub__')
            if isnull(values).all() and name in additive:
                # all-null floats in an add/sub take the other side's dtype
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            return values

        if self._is_offset(values):
            return values

        raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                        " operation".format(np.array(values).dtype))