Beispiel #1
0
def _value_counts_arraylike(values, dropna=True):
    """
    Tally the distinct entries of an array-like.

    Parameters
    ----------
    values : array-like
        Wrapped in a Series and unwrapped again to obtain the raw values
    dropna : boolean, default True
        Forwarded to the hashtable counters; when True, ``iNaT`` keys are
        removed from the datetime-like / period results

    Returns
    -------
    (keys, counts) : tuple
        The distinct keys and the number of occurrences of each
    """
    tz_aware = is_datetimetz(values)
    period_like = (is_period_dtype(values) or
                   is_period_arraylike(values))

    original = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or period_like:

        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if period_like:
            # the incoming values may be an object array of periods
            values = PeriodIndex(values)
            freq = values.freq

        # count on the int64 view of the data
        as_i8 = values.view(np.int64)
        uniques, tallies = htable.value_count_int64(as_i8, dropna)

        if dropna:
            keep = uniques != iNaT
            uniques = uniques[keep]
            tallies = tallies[keep]

        # restore the dtype the keys arrived with
        uniques = uniques.astype(dtype)

        # rebuild the index type for tz-aware and period inputs
        if tz_aware:
            uniques = DatetimeIndex._simple_new(uniques,
                                                tz=original.dtype.tz)
        if period_like:
            uniques = PeriodIndex._simple_new(uniques, freq=freq)

        return uniques, tallies

    if is_signed_integer_dtype(dtype):
        uniques, tallies = htable.value_count_int64(
            _ensure_int64(values), dropna)
        return uniques, tallies

    if is_unsigned_integer_dtype(dtype):
        uniques, tallies = htable.value_count_uint64(
            _ensure_uint64(values), dropna)
        return uniques, tallies

    if is_float_dtype(dtype):
        uniques, tallies = htable.value_count_float64(
            _ensure_float64(values), dropna)
        return uniques, tallies

    # everything else is counted as objects
    values = _ensure_object(values)
    uniques, tallies = htable.value_count_object(values, dropna)

    null_mask = isnull(values)
    if not dropna and null_mask.any():
        # prepend a single NaN entry carrying the number of nulls
        uniques = np.insert(uniques, 0, np.NaN)
        tallies = np.insert(tallies, 0, null_mask.sum())

    return uniques, tallies
Beispiel #2
0
def value_counts(values, sort=True, ascending=False, normalize=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values that were actually counted, so
        they sum to 1 even when nulls were skipped

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):

        dtype = values.dtype
        # count on the int64 view of the datetime-like data
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # Divide by the number of counted values rather than values.size:
        # the object branch skips nulls, so values.size would make the
        # relative frequencies sum to less than 1.
        result = result / float(counts.sum())

    return result
Beispiel #3
0
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import Counter

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        # Nulls are filtered out first; Counter then does the tallying that
        # was previously a hand-written defaultdict loop.
        values = values[com.notnull(values)]
        result = Series(Counter(values))

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Beispiel #4
0
def value_counts(values, sort=True, ascending=False):
    """
    Build a Series holding the number of occurrences of each non-null value

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Keep the sorted result in ascending order; otherwise it is reversed

    Returns
    -------
    value_counts : Series
        Counts indexed by the distinct values
    """
    from pandas.core.series import Series

    arr = np.asarray(values)

    if not com.is_integer_dtype(arr.dtype):
        # non-integer data is counted as objects, with the null mask
        # handed to the counter
        null_mask = com.isnull(arr)
        uniques, tallies = htable.value_count_object(
            com._ensure_object(arr), null_mask)
    else:
        uniques, tallies = htable.value_count_int64(com._ensure_int64(arr))

    histogram = Series(tallies, index=uniques)

    if not sort:
        return histogram

    histogram.sort()
    return histogram if ascending else histogram[::-1]
Beispiel #5
0
def value_counts(values, sort=True, ascending=False):
    """
    Count how often each non-null value occurs

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        When sorting, leave the result ascending instead of reversing it

    Returns
    -------
    value_counts : Series
        Counts indexed by the distinct values
    """
    from pandas.core.series import Series

    data = np.asarray(values)
    integer_input = com.is_integer_dtype(data.dtype)

    if integer_input:
        data = com._ensure_int64(data)
        labels, freqs = htable.value_count_int64(data)
    else:
        # the null mask is computed before the cast to object
        missing = com.isnull(data)
        data = com._ensure_object(data)
        labels, freqs = htable.value_count_object(data, missing)

    tally = Series(freqs, index=labels)

    if sort:
        tally.sort()
    if sort and not ascending:
        tally = tally[::-1]

    return tally
Beispiel #6
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values that were actually counted, so
        they sum to 1 even when nulls were dropped
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` rejects the data with a TypeError

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas.tseries.period import PeriodIndex

    is_period = com.is_period_arraylike(values)
    values = Series(values).values
    is_category = com.is_categorical_dtype(values.dtype)

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes
    elif is_category:
        # categorical input: count its codes and relabel with the
        # categories further down
        bins = values.categories
        cat = values
        values = cat.codes

    dtype = values.dtype

    if issubclass(values.dtype.type,
                  (np.datetime64, np.timedelta64)) or is_period:
        if is_period:
            values = PeriodIndex(values)

        # count on the int64 view of the datetime-like data
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        if dropna:
            from pandas.tslib import iNaT
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]
        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        # Only add the NaN entry when nulls are actually present;
        # otherwise the result would gain a spurious NaN row with count 0.
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    result = Series(counts, index=com._values_from_object(keys))
    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
        if not is_category:
            result.index = bins[:-1]
        else:
            result.index = cat.categories

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # Divide by the number of counted values rather than values.size,
        # so dropped nulls do not deflate the relative frequencies.
        result = result / float(counts.sum())

    return result
Beispiel #7
0
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values that were actually counted, so
        they sum to 1 even when nulls were skipped
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` rejects the data with a TypeError

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut

    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on the bin labels are what gets counted
        values = cat.labels

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        # count on the int64 view of the datetime-like data
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)

    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # Divide by the number of counted values rather than values.size:
        # the object branch skips nulls, so values.size would make the
        # relative frequencies sum to less than 1.
        result = result / float(counts.sum())

    return result
Beispiel #8
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values that were actually counted, so
        they sum to 1 even when nulls were skipped
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` rejects the data with a TypeError

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut

    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on the bin labels are what gets counted
        values = cat.labels

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        # count on the int64 view of the datetime-like data
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)

    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # Divide by the number of counted values rather than values.size:
        # the object branch skips nulls, so values.size would make the
        # relative frequencies sum to less than 1.
        result = result / float(counts.sum())

    return result
Beispiel #9
0
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the result by its counts
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values that were actually counted, so
        they sum to 1 even when nulls were dropped
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` rejects the data with a TypeError

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas.tseries.period import PeriodIndex

    # carry the input's name (if it has one) over to the result
    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        # categorical data is counted by its own value_counts method
        result = values.value_counts(dropna)
        counts = result.values

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period:

            if is_period:
                values = PeriodIndex(values, name=name)

            # count on the int64 view of the datetime-like data
            values = values.view(np.int64)
            keys, counts = htable.value_count_int64(values)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # convert the keys back to the dtype we came in
            keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_int64(values)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                # prepend a single NaN entry carrying the number of nulls
                keys = np.insert(keys, 0, np.nan)
                counts = np.insert(counts, 0, mask.sum())

        result = Series(counts, index=com._values_from_object(keys), name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
            result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # Divide by the number of counted values rather than values.size,
        # so dropped nulls do not deflate the relative frequencies.
        result = result / float(counts.sum())

    return result