def quadratic_mean(data):
    """quadratic_mean(iterable_of_numbers) -> quadratic mean of numbers
    quadratic_mean(iterable_of_rows) -> quadratic means of columns

    Return the quadratic mean of the given numbers or columns.

    >>> quadratic_mean([2, 2, 4, 5])
    3.5

    The quadratic mean, also called the RMS (Root Mean Square), is the
    square root of the arithmetic mean of the squared data. It is the
    average to reach for when you want a mean absolute magnitude of
    quantities that swing between positive and negative:

    >>> quadratic_mean([-3, -2, 0, 2, 3])
    2.280350850198276

    When passed an iterable of sequences, each inner sequence represents
    a row of data, and ``quadratic_mean`` operates on each column. All
    rows must have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1, 2, 4],
    ...         [1, 2, 4, 6],
    ...         [2, 4, 6, 6]]
    ...
    >>> quadratic_mean(data)  #doctest: +ELLIPSIS
    [1.29099..., 2.64575..., 4.3204..., 5.41602...]

    """
    # Sum the squares first, then take sqrt(mean-of-squares).
    n, sum_of_squares = stats._len_sum(v.sqr(x) for x in data)
    if n == 0:
        raise stats.StatsError(
            'quadratic mean of empty sequence is not defined')
    return v.sqrt(v.div(sum_of_squares, n))
def _variance(data, m, p):
    """Return an estimate of variance with N-p degrees of freedom.

    ``m`` is passed through to ``_std_moment`` as the assumed centre
    (presumably the mean -- confirm against the callers), and ``p`` is
    the number of degrees of freedom to subtract from the sample size.
    Raises StatsError unless there are at least p+1 data points.
    """
    n, sum_sq = _std_moment(data, m, 1, 2)
    assert n >= 0
    if n <= p:
        raise StatsError(
            'at least %d items are required but only got %d' % (p+1, n))
    # Sums of squares can never be negative; anything else is a bug.
    v.assert_(lambda x: x >= 0.0, sum_sq)
    return v.div(sum_sq, n - p)
def pskewness(data, m=None, s=None):
    """pskewness(data [,m [,s]]) -> population skewness of data.

    This returns γ₁ "\\N{GREEK SMALL LETTER GAMMA}\\N{SUBSCRIPT ONE}",
    the population skewness. For more information about skewness, see
    the sample skewness function ``skewness``.

    >>> pskewness([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    1.37474650254...

    """
    n, total = stats._std_moment(data, m, s, 3)
    assert n >= 0
    if n <= 1:
        # NOTE(review): this also triggers for a single data point, not
        # only for truly empty data.
        # Qualify the exception with the ``stats`` namespace, matching
        # the ``stats._std_moment`` call above; a bare StatsError is not
        # guaranteed to be in scope here.
        raise stats.StatsError('no skewness is defined for empty data')
    return v.div(total, n)
def average_deviation(data, m=None):
    """average_deviation(data [, m]) -> average absolute deviation of data.

    Returns the average deviation of the sample data from the population
    centre ``m`` (usually the mean, or the median). If you know the
    population mean or median, pass it as the second argument:

    >>> data = [2.0, 2.25, 2.5, 2.5, 3.25]  # A sample from a population
    >>> mu = 2.75                           # with a known mean.
    >>> average_deviation(data, mu)
    0.45

    If you don't know the centre location, you can estimate it by passing
    the sample mean or median instead. If ``m`` is None, or not given,
    the sample mean is calculated from the data and used:

    >>> average_deviation(data)
    0.3

    If data is an iterable of sequences, each inner sequence represents a
    row of data, and ``average_deviation`` operates on each column. Every
    row must have the same number of columns, or ValueError is raised.
    Similarly, m (if given) must have either the same number of items, or
    be a single number.

    >>> data = [[0, 1, 2, 4],
    ...         [1, 2, 4, 6],
    ...         [2, 4, 6, 6]]
    ...
    >>> average_deviation(data, [1, 2, 3.5, 6])  #doctest: +ELLIPSIS
    [0.666666..., 1.0, 1.5, 0.666666...]

    """
    if m is None:
        # We must iterate the data twice (once for the mean, once for
        # the deviations), so make sure it isn't a one-shot iterator.
        if not isinstance(data, list):
            data = list(data)
        m = stats.mean(data)
    f = lambda x, m: abs(x-m)
    count, total = stats._len_sum(v.apply(f, x, m) for x in data)
    if not count:
        raise stats.StatsError(
            'average deviation requires at least 1 data point')
    return v.div(total, count)
def pkurtosis(data, m=None, s=None):
    """pkurtosis(data [,m [,s]]) -> population kurtosis of data.

    This returns γ₂ "\\N{GREEK SMALL LETTER GAMMA}\\N{SUBSCRIPT TWO}",
    the population kurtosis relative to that of the normal distribution,
    also known as the excess kurtosis. For the "kurtosis proper" known as
    β₂ "\\N{GREEK SMALL LETTER BETA}\\N{SUBSCRIPT TWO}", add 3 to the
    result.

    For more information about kurtosis, see the sample kurtosis
    function ``kurtosis``.

    >>> pkurtosis([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    0.7794232987...

    """
    n, total = stats._std_moment(data, m, s, 4)
    assert n >= 0
    if n <= 1:
        # Raise before the sanity assertion below, so that empty data
        # produces the documented StatsError rather than an
        # AssertionError. Qualify with the ``stats`` namespace to match
        # the ``stats._std_moment`` call above.
        raise stats.StatsError('no kurtosis is defined for empty data')
    # The standardised fourth moment cannot be less than 1. Use
    # v.assert_ rather than a bare assert so the check also works when
    # ``total`` is a list of per-column sums (compare ``kurtosis``).
    v.assert_(lambda x: x >= 1, total)
    kurt = v.div(total, n)
    return v.sub(kurt, 3)
def mean(data):
    """mean(iterable_of_numbers) -> arithmetic mean of numbers
    mean(iterable_of_rows) -> arithmetic means of columns

    Return the arithmetic mean of the given numbers or columns.

    The arithmetic mean is the sum of the data divided by the number of
    data points. It is commonly called "the average", although it is
    actually only one of many different mathematical averages. It is a
    measure of the central location of the data.

    When passed a single sequence or iterator of numbers, ``mean`` adds
    the data points and returns the total divided by the number of data
    points:

    >>> mean([1.0, 2.0, 3.0, 4.0])
    2.5

    When passed an iterable of sequences, each inner sequence represents
    a row of data, and ``mean`` returns the mean of each column. The rows
    must have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1, 2, 3],
    ...         [1, 2, 4, 5],
    ...         [2, 3, 6, 7]]
    ...
    >>> mean(data)
    [1.0, 2.0, 4.0, 5.0]

    The sample mean is an unbiased estimator of the true population mean.
    However, the mean is strongly affected by outliers and is not a
    robust estimator for central location: the mean is not necessarily a
    typical example of the data points.
    """
    count, total = _len_sum(data)
    if not count:
        raise StatsError('mean of empty sequence is not defined')
    return v.div(total, count)
def harmonic_mean(data):
    """harmonic_mean(iterable_of_numbers) -> harmonic mean of numbers
    harmonic_mean(iterable_of_rows) -> harmonic means of columns

    Return the harmonic mean of the given numbers or columns.

    The harmonic mean, or subcontrary mean, is the reciprocal of the
    arithmetic mean of the reciprocals of the data. It is a type of
    average best used for averaging rates or speeds.

    >>> harmonic_mean([0.25, 0.5, 1.0, 1.0])
    0.5

    If data includes one or more zero values, the result will be zero if
    the zeroes are all the same sign, or an NAN if they are of opposite
    signs.

    When passed an iterable of sequences, each inner sequence represents
    a row of data, and ``harmonic_mean`` operates on each column. All
    rows must have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1, 2, 4],
    ...         [1, 2, 4, 8],
    ...         [2, 4, 8, 8]]
    ...
    >>> harmonic_mean(data)  #doctest: +ELLIPSIS
    [0.0, 1.71428..., 3.42857..., 6.0]

    """
    # FIXME harmonic_mean([x]) should equal x exactly, but due to
    # rounding errors in the 1/(1/x) round trip, sometimes it doesn't.
    reciprocal = lambda x: _divide(1, x)
    n, recip_sum = stats._len_sum(
        v.apply(reciprocal, row) for row in data)
    if n == 0:
        raise stats.StatsError(
            'harmonic mean of empty sequence is not defined')
    # n / (sum of reciprocals) == reciprocal of the mean reciprocal.
    return v.div(n, recip_sum)
def kurtosis(data, m=None, s=None):
    """kurtosis(data [,m [,s]]) -> sample excess kurtosis of data.

    The kurtosis of a distribution is a measure of its shape. This
    function returns an estimate of the sample excess kurtosis usually
    known as g₂ "g\\N{SUBSCRIPT TWO}". For the population kurtosis, see
    ``pkurtosis``.

    WARNING: The mathematical terminology and notation related to
    kurtosis is often inconsistent and contradictory. See Wolfram
    Mathworld for further details:
    http://mathworld.wolfram.com/Kurtosis.html

    >>> kurtosis([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    3.03678892733564...

    If you already know one or both of the population mean and standard
    deviation, you can pass the mean as optional argument m and/or the
    standard deviation as s:

    >>> kurtosis([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5], m=2.25, s=1)
    2.3064453125

    CAUTION: "Garbage in, garbage out" applies here. You can pass any
    values you like as ``m`` or ``s``, but if they are not sensible
    estimates for the mean and standard deviation, the result returned
    as the kurtosis will likewise not be sensible. If you give either m
    or s, and the calculated kurtosis is out of range, a warning is
    raised.

    If m or s are not given, or are None, they are estimated from the
    data.

    If data is an iterable of sequences, each inner sequence represents
    a row of data, and ``kurtosis`` operates on each column. Every row
    must have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1],
    ...         [1, 5],
    ...         [2, 6],
    ...         [5, 7]]
    ...
    >>> kurtosis(data)  #doctest: +ELLIPSIS
    [1.50000000000000..., 2.23486717956161...]

    Similarly, if either m or s are given, they must be either a single
    number or have the same number of items:

    >>> kurtosis(data, m=[3, 5], s=2)  #doctest: +ELLIPSIS
    [-0.140625, 18.4921875]

    The kurtosis of a population is a measure of the peakedness and
    weight of the tails. The normal distribution has kurtosis of zero;
    positive kurtosis generally has heavier tails and a sharper peak
    than normal; negative kurtosis generally has lighter tails and a
    flatter peak.

    There is no upper limit for kurtosis, and a lower limit of -2.
    Higher kurtosis means more of the variance is the result of
    infrequent extreme deviations, as opposed to frequent modestly sized
    deviations.

    CAUTION: As a rule of thumb, a non-zero value for kurtosis should
    only be treated as meaningful if its absolute value is larger than
    approximately twice its standard error. See also
    ``stderrkurtosis``.

    """
    n, total = stats._std_moment(data, m, s, 4)
    assert n >= 0
    if n < 4:
        # Raise before the sanity assertion below, so that too-short or
        # empty data produces the documented StatsError rather than an
        # AssertionError. Qualify with the ``stats`` namespace to match
        # the ``stats._std_moment`` call above.
        raise stats.StatsError(
            'sample kurtosis requires at least 4 data points')
    # The standardised fourth moment(s) cannot be less than 1.
    v.assert_(lambda x: x >= 1, total)
    q = (n-1)/((n-2)*(n-3))
    gamma2 = v.div(total, n)
    # Don't do this:-
    #   kurt = v.mul((n+1)*q, gamma2)
    #   kurt = v.sub(kurt, 3*(n-1)*q)
    # Even though the above two commented out lines are mathematically
    # equivalent to the next two, and cheaper, they appear to be
    # slightly less accurate.
    kurt = v.sub(v.mul(n+1, gamma2), 3*(n-1))
    kurt = v.mul(q, kurt)
    # g₂ has a theoretical lower bound of -2; anything below that means
    # either a bug (impossible when m and s come from the data) or junk
    # m/s values supplied by the caller.
    if v.isiterable(kurt):
        out_of_range = any(x < -2 for x in kurt)
    else:
        out_of_range = kurt < -2
    if m is s is None:
        assert not out_of_range, 'kurtosis failed: %r' % kurt
        # This is a "should never happen" condition, hence an assertion.
    else:
        # This, on the other hand, can easily happen if the caller
        # gives junk values for m or s. The difference between a junk
        # value and a legitimate value can be surprisingly subtle!
        if out_of_range:
            import warnings
            warnings.warn('calculated kurtosis out of range')
    return kurt
def skewness(data, m=None, s=None):
    """skewness(data [,m [,s]]) -> sample skewness of data.

    The skewness, or third standardised moment, of data is the degree to
    which it is skewed to the left or right of the mean.

    This returns g₁ "g\\N{SUBSCRIPT ONE}", the sample skewness. For the
    population skewness, see function ``pskewness``.

    WARNING: The mathematical terminology and notation related to
    skewness is often inconsistent and contradictory. See Wolfram
    Mathworld for further details:
    http://mathworld.wolfram.com/Skewness.html

    >>> skewness([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5])
    ... #doctest: +ELLIPSIS
    1.71461013539878...

    If you already know one or both of the population mean and standard
    deviation, you can pass the mean as optional argument m and/or the
    standard deviation as s:

    >>> skewness([1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5], m=2.25, s=1)
    ... #doctest: +ELLIPSIS
    1.47132881615329...

    CAUTION: "Garbage in, garbage out" applies here. You can pass any
    values you like as ``m`` or ``s``, but if they are not sensible
    estimates for the mean and standard deviation, the result returned
    as the skewness will likewise not be sensible.

    If m or s are not given, or are None, they are estimated from the
    data.

    If data is an iterable of sequences, each inner sequence represents
    a row of data, and ``skewness`` operates on each column. Every row
    must have the same number of columns, or ValueError is raised.

    >>> data = [[0, 1],
    ...         [1, 5],
    ...         [2, 6],
    ...         [5, 7]]
    ...
    >>> skewness(data)  #doctest: +ELLIPSIS
    [1.19034012827899..., -1.44305883553164...]

    Similarly, if either m or s are given, they must be either a single
    number or have the same number of items as the data:

    >>> skewness(data, m=[2.5, 5.0], s=2)  #doctest: +ELLIPSIS
    [-0.189443057077845..., -2.97696232550900...]

    A negative skewness indicates that the distribution's left-hand tail
    is longer than the tail on the right-hand side, and that the
    majority of the values (including the median) are to the right of
    the mean. A positive skew indicates that the right-hand tail is
    longer, and that the majority of values are to the left of the mean.
    A zero skew indicates that the values are evenly distributed around
    the mean, often but not necessarily implying the distribution is
    symmetric.

    CAUTION: As a rule of thumb, a non-zero value for skewness should
    only be treated as meaningful if its absolute value is larger than
    approximately twice its standard error. See also
    ``stderrskewness``.

    """
    n, total = stats._std_moment(data, m, s, 3)
    assert n >= 0
    if n < 3:
        # Qualify the exception with the ``stats`` namespace, matching
        # the ``stats._std_moment`` call above; a bare StatsError is not
        # guaranteed to be in scope here.
        raise stats.StatsError('sample skewness requires at least three items')
    skew = v.div(total, n)
    # k converts the population skewness g₁ estimate to the sample
    # skewness (bias-adjustment factor).
    k = math.sqrt(n*(n-1))/(n-2)
    return v.mul(k, skew)