Example #1
0
def quadratic_mean(data):
    """quadratic_mean(iterable_of_numbers) -> quadratic mean of numbers
    quadratic_mean(iterable_of_rows) -> quadratic means of columns

    Compute the quadratic mean, also called the RMS (Root Mean Square), of
    the given numbers or columns: the square root of the arithmetic mean of
    the squared data.

    >>> quadratic_mean([2, 2, 4, 5])
    3.5

    Since every item is squared before it is averaged, the RMS is a handy
    measure of the typical absolute magnitude of quantities that swing
    between positive and negative values:

    >>> quadratic_mean([-3, -2, 0, 2, 3])
    2.280350850198276

    If ``data`` is an iterable of sequences, every inner sequence is taken
    as one row and the quadratic mean is computed column by column. Rows
    must all have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1, 2, 4],
    ...         [1, 2, 4, 6],
    ...         [2, 4, 6, 6]]
    ...
    >>> quadratic_mean(data)  #doctest: +ELLIPSIS
    [1.29099..., 2.64575..., 4.3204..., 5.41602...]

    StatsError is raised if ``data`` contains no items.

    """
    squares = (v.sqr(item) for item in data)
    n, sum_of_squares = stats._len_sum(squares)
    if n:
        mean_square = v.div(sum_of_squares, n)
        return v.sqrt(mean_square)
    raise stats.StatsError('quadratic mean of empty sequence is not defined')
Example #2
0
def _variance(data, m, p):
    """Estimate the variance of ``data`` using N-p degrees of freedom.

    The item count N and the accumulated total are taken from
    ``_std_moment(data, m, 1, 2)``; the total is then divided by N-p.
    StatsError is raised unless there are more than ``p`` items.
    """
    count, sum_sq = _std_moment(data, m, 1, 2)
    assert count >= 0
    if count <= p:
        message = 'at least %d items are required but only got %d' % (
            p+1, count)
        raise StatsError(message)
    # Sanity check: the total being averaged must never be negative.
    v.assert_(lambda value: value >= 0.0, sum_sq)
    degrees_of_freedom = count - p
    return v.div(sum_sq, degrees_of_freedom)
Example #3
0
def pskewness(data, m=None, s=None):
    """pskewness(data [,m [,s]]) -> population skewness of data.

    Compute the population skewness, usually written
    γ₁ "\\N{GREEK SMALL LETTER GAMMA}\\N{SUBSCRIPT ONE}". The sample
    skewness function ``skewness`` has more information about skewness in
    general.

    >>> pskewness([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    1.37474650254...

    The optional arguments ``m`` and ``s`` are handed on unchanged to the
    standardised-moment helper. StatsError is raised if there are fewer
    than two data points.

    """
    count, third_moment_sum = stats._std_moment(data, m, s, 3)
    assert count >= 0
    if count > 1:
        return v.div(third_moment_sum, count)
    raise StatsError('no skewness is defined for empty data')
Example #4
0
def average_deviation(data, m=None):
    """average_deviation(data [, m]) -> average absolute deviation of data.

    Compute the average absolute deviation of the sample data from the
    centre location ``m`` (usually the mean or the median). If the
    population mean or median is known, pass it as the second argument:

    >>> data = [2.0, 2.25, 2.5, 2.5, 3.25]  # A sample from a population
    >>> mu = 2.75                           # with a known mean.
    >>> average_deviation(data, mu)
    0.45

    If the centre location is not known, it can be estimated by passing
    the sample mean or median instead. If ``m`` is None, or not given,
    the sample mean is calculated from the data and used:

    >>> average_deviation(data)
    0.3

    If data is an iterable of sequences, every inner sequence is taken as
    one row and ``average_deviation`` works column by column. Rows must
    all have the same number of columns, or ValueError is raised.
    Likewise m (if given) must either have the same number of items, or
    be a single number.

    >>> data = [[0, 1, 2, 4],
    ...         [1, 2, 4, 6],
    ...         [2, 4, 6, 6]]
    ...
    >>> average_deviation(data, [1, 2, 3.5, 6])  #doctest: +ELLIPSIS
    [0.666666..., 1.0, 1.5, 0.666666...]

    StatsError is raised if ``data`` contains no items.

    """
    def absolute_deviation(x, centre):
        return abs(x-centre)

    if m is None:
        # Estimating the centre takes one pass and summing the deviations
        # takes another, so the data must be a list before it is consumed.
        if not isinstance(data, list):
            data = list(data)
        m = stats.mean(data)
    deviations = (v.apply(absolute_deviation, item, m) for item in data)
    n, sum_of_deviations = stats._len_sum(deviations)
    if n:
        return v.div(sum_of_deviations, n)
    raise stats.StatsError('average deviation requires at least 1 data point')
Example #5
0
def pkurtosis(data, m=None, s=None):
    """pkurtosis(data [,m [,s]]) -> population kurtosis of data.

    This returns γ₂ "\\N{GREEK SMALL LETTER GAMMA}\\N{SUBSCRIPT TWO}", the
    population kurtosis relative to that of the normal distribution, also
    known as the excess kurtosis. For the "kurtosis proper" known as
    β₂ "\\N{GREEK SMALL LETTER BETA}\\N{SUBSCRIPT TWO}", add 3 to the result.

    For more information about kurtosis, see the sample kurtosis function
    ``kurtosis``.

    >>> pkurtosis([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    0.7794232987...

    The optional arguments ``m`` and ``s`` are handed on unchanged to the
    standardised-moment helper. If either is supplied by the caller, no
    sanity check is made on the result: "garbage in, garbage out" applies.

    StatsError is raised if there are fewer than two data points.

    """
    n, total = stats._std_moment(data, m, s, 4)
    assert n >= 0
    # Check the size of the data first, so that too-short input is always
    # reported as StatsError rather than tripping the sanity check below.
    if n <= 1:
        raise StatsError('no kurtosis is defined for empty data')
    if m is None and s is None:
        # Internal sanity check on the accumulated fourth-power total. It is
        # only applied when both m and s were estimated from the data, since
        # caller-supplied values can legitimately make the total arbitrarily
        # small. ``v.assert_`` is used, as in ``kurtosis``, so the check is
        # not a bare scalar comparison.
        v.assert_(lambda x: x >= 1, total)
    kurt = v.div(total, n)
    return v.sub(kurt, 3)
Example #6
0
def mean(data):
    """mean(iterable_of_numbers) -> arithmetic mean of numbers
    mean(iterable_of_rows) -> arithmetic means of columns

    Compute the arithmetic mean of the given numbers or columns.

    The arithmetic mean is the total of the data divided by the number of
    data points. It is often simply called "the average", even though it is
    only one of many different mathematical averages. It measures the
    central location of the data.

    Given a single sequence or iterator of numbers, ``mean`` sums the data
    points and returns that sum divided by how many there were:

    >>> mean([1.0, 2.0, 3.0, 4.0])
    2.5

    Given an iterable of sequences, every inner sequence is taken as one
    row and ``mean`` returns the mean of each column. Rows must all have
    the same number of columns, or ValueError is raised.

    >>> data = [[0, 1, 2, 3],
    ...         [1, 2, 4, 5],
    ...         [2, 3, 6, 7]]
    ...
    >>> mean(data)
    [1.0, 2.0, 4.0, 5.0]

    The sample mean is an unbiased estimator of the true population mean.
    It is, however, strongly affected by outliers and is not a robust
    estimator for central location: the mean is not necessarily a typical
    example of the data points.

    StatsError is raised if ``data`` contains no items.
    """
    n, sum_of_data = _len_sum(data)
    if n:
        return v.div(sum_of_data, n)
    raise StatsError('mean of empty sequence is not defined')
Example #7
0
def harmonic_mean(data):
    """harmonic_mean(iterable_of_numbers) -> harmonic mean of numbers
    harmonic_mean(iterable_of_rows) -> harmonic means of columns

    Compute the harmonic mean of the given numbers or columns.

    The harmonic mean, also called the subcontrary mean, is the reciprocal
    of the arithmetic mean of the reciprocals of the data. It is the
    average best suited to rates or speeds.

    >>> harmonic_mean([0.25, 0.5, 1.0, 1.0])
    0.5

    If the data contains one or more zero values, the result is zero when
    the zeroes all have the same sign, or a NAN when their signs differ.

    If ``data`` is an iterable of sequences, every inner sequence is taken
    as one row and the harmonic mean is computed column by column. Rows
    must all have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1, 2, 4],
    ...         [1, 2, 4, 8],
    ...         [2, 4, 8, 8]]
    ...
    >>> harmonic_mean(data)  #doctest: +ELLIPSIS
    [0.0, 1.71428..., 3.42857..., 6.0]

    StatsError is raised if ``data`` contains no items.

    """
    # FIXME harmonic_mean([x]) ought to return x exactly, but rounding
    # errors in the 1/(1/x) round trip mean that it sometimes does not.
    def reciprocal(x):
        return _divide(1, x)

    reciprocals = (v.apply(reciprocal, item) for item in data)
    count, sum_of_reciprocals = stats._len_sum(reciprocals)
    if count:
        return v.div(count, sum_of_reciprocals)
    raise stats.StatsError('harmonic mean of empty sequence is not defined')
Example #8
0
def kurtosis(data, m=None, s=None):
    """kurtosis(data [,m [,s]]) -> sample excess kurtosis of data.

    The kurtosis of a distribution is a measure of its shape. This function
    returns an estimate of the sample excess kurtosis usually known as g₂
    "g\\N{SUBSCRIPT TWO}". For the population kurtosis, see ``pkurtosis``.

        WARNING: The mathematical terminology and notation related to
        kurtosis is often inconsistent and contradictory. See Wolfram
        Mathworld for further details:

        http://mathworld.wolfram.com/Kurtosis.html

    >>> kurtosis([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    3.03678892733564...

    If you already know one or both of the population mean and standard
    deviation, you can pass the mean as optional argument m and/or the
    standard deviation as s:

    >>> kurtosis([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5], m=2.25, s=1)
    2.3064453125

        CAUTION: "Garbage in, garbage out" applies here. You can pass
        any values you like as ``m`` or ``s``, but if they are not
        sensible estimates for the mean and standard deviation, the
        result returned as the kurtosis will likewise not be sensible.
        If you give either m or s, and the calculated kurtosis is out
        of range, a warning is raised.

    If m or s are not given, or are None, they are estimated from the data.

    If data is an iterable of sequences, each inner sequence represents a
    row of data, and ``kurtosis`` operates on each column. Every row must
    have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1],
    ...         [1, 5],
    ...         [2, 6],
    ...         [5, 7]]
    ...
    >>> kurtosis(data)  #doctest: +ELLIPSIS
    [1.50000000000000..., 2.23486717956161...]

    Similarly, if either m or s are given, they must be either a single
    number or have the same number of items:

    >>> kurtosis(data, m=[3, 5], s=2)  #doctest: +ELLIPSIS
    [-0.140625, 18.4921875]

    The kurtosis of a population is a measure of the peakedness and weight
    of the tails. The normal distribution has kurtosis of zero; positive
    kurtosis generally has heavier tails and a sharper peak than normal;
    negative kurtosis generally has lighter tails and a flatter peak.

    There is no upper limit for kurtosis, and a lower limit of -2. Higher
    kurtosis means more of the variance is the result of infrequent extreme
    deviations, as opposed to frequent modestly sized deviations.

        CAUTION: As a rule of thumb, a non-zero value for kurtosis
        should only be treated as meaningful if its absolute value is
        larger than approximately twice its standard error. See also
        ``stderrkurtosis``.

    StatsError is raised if there are fewer than four data points.

    """
    n, total = stats._std_moment(data, m, s, 4)
    assert n >= 0
    # Check the size of the data first, so that too-short input is always
    # reported as StatsError rather than tripping a sanity check below.
    if n < 4:
        raise StatsError('sample kurtosis requires at least 4 data points')
    # True when both the mean and standard deviation come from the data.
    estimated = m is None and s is None
    if estimated:
        # Internal sanity check on the accumulated fourth-power total. It is
        # skipped when the caller supplies m or s, because junk values can
        # make the total arbitrarily small; that case is reported by the
        # out-of-range warning below, not by an AssertionError.
        v.assert_(lambda x: x >= 1, total)
    q = (n-1)/((n-2)*(n-3))
    gamma2 = v.div(total, n)
    # Don't do this:-
    # kurt = v.mul((n+1)*q, gamma2)
    # kurt = v.sub(kurt, 3*(n-1)*q)
    #   Even though the above two commented out lines are mathematically
    #   equivalent to the next two, and cheaper, they appear to be
    #   slightly less accurate.
    kurt = v.sub(v.mul(n+1, gamma2), 3*(n-1))
    kurt = v.mul(q, kurt)
    # The result may be a single number or one value per column.
    if v.isiterable(kurt):
        out_of_range = any(x < -2 for x in kurt)
    else:
        out_of_range = kurt < -2
    if estimated:
        # This is a "should never happen" condition, hence an assertion.
        assert not out_of_range, 'kurtosis failed: %r' % kurt
    elif out_of_range:
        # This, on the other hand, can easily happen if the caller
        # gives junk values for m or s. The difference between a junk
        # value and a legitimate value can be surprisingly subtle!
        import warnings
        warnings.warn('calculated kurtosis out of range')
    return kurt
Example #9
0
def skewness(data, m=None, s=None):
    """skewness(data [,m [,s]]) -> sample skewness of data.

    The skewness of data, also called its third standardised moment, is
    the degree to which the data is skewed to the left or right of the
    mean.

    The value returned is g₁ "g\\N{SUBSCRIPT ONE}", the sample skewness.
    See function ``pskewness`` for the population skewness.

        WARNING: The mathematical terminology and notation related to
        skewness is often inconsistent and contradictory. See Wolfram
        Mathworld for further details:

        http://mathworld.wolfram.com/Skewness.html

    >>> skewness([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    1.71461013539878...

    When one or both of the population mean and standard deviation are
    already known, pass the mean as optional argument m and/or the
    standard deviation as s:

    >>> skewness([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5], m=2.25, s=1)
    ... #doctest: +ELLIPSIS
    1.47132881615329...

        CAUTION: "Garbage in, garbage out" applies here. Any values at
        all may be passed as ``m`` or ``s``, but unless they are
        sensible estimates for the mean and standard deviation, the
        skewness returned will not be sensible either.

    Whichever of m and s is not given, or is None, is estimated from the
    data.

    If data is an iterable of sequences, every inner sequence is taken as
    one row and ``skewness`` works column by column. Rows must all have
    the same number of columns, or ValueError is raised.

    >>> data = [[0, 1],
    ...         [1, 5],
    ...         [2, 6],
    ...         [5, 7]]
    ...
    >>> skewness(data)  #doctest: +ELLIPSIS
    [1.19034012827899..., -1.44305883553164...]

    Likewise, m and s (if given) must each be either a single number or
    have the same number of items as the data:

    >>> skewness(data, m=[2.5, 5.0], s=2)  #doctest: +ELLIPSIS
    [-0.189443057077845..., -2.97696232550900...]

    A negative skewness means the left-hand tail of the distribution is
    longer than the right-hand tail, with the majority of the values
    (including the median) lying to the right of the mean. A positive
    skew means the right-hand tail is the longer one, with the majority of
    values lying to the left of the mean. A zero skew means the values are
    evenly distributed around the mean, which often, but not necessarily,
    implies a symmetric distribution.

        CAUTION: As a rule of thumb, a non-zero value for skewness
        should only be treated as meaningful if its absolute value is
        larger than approximately twice its standard error. See also
        ``stderrskewness``.

    StatsError is raised if there are fewer than three data points.

    """
    count, third_moment_sum = stats._std_moment(data, m, s, 3)
    assert count >= 0
    if count < 3:
        raise StatsError('sample skewness requires at least three items')
    unadjusted = v.div(third_moment_sum, count)
    # Scale factor applied to the plain average of the standardised cubes.
    adjustment = math.sqrt(count*(count-1))/(count-2)
    return v.mul(adjustment, unadjusted)