import numpy as np
from scipy.stats import shapiro

import averages  # companion module with the averaging routines


def summarize(data,
              avg=('amean', 'gmean', 'median', 'mode'),
              ci_level=0.95,
              bandwidth='silverman',
              precision=0.1):
    """ Estimate different grain size statistics. This includes different means,
    the median, the frequency peak grain size via KDE, the confidence intervals
    using different methods, and the distribution features.

    Parameters
    ----------
    data : array_like
        the size of the grains

    avg : string, tuple or list; optional
        the averages to be estimated

        | Types:
        | 'amean' - arithmetic mean
        | 'gmean' - geometric mean
        | 'median' - median
        | 'mode' - the kernel-based frequency peak of the distribution

    ci_level : scalar between 0 and 1; optional
        the certainty of the confidence interval (default = 0.95)

    bandwidth : string {'silverman' or 'scott'} or positive scalar; optional
        the method to estimate the bandwidth or a scalar directly defining the
        bandwidth. It uses the Silverman plug-in method by default.

    precision : positive scalar or None; optional
        the maximum precision expected for the "peak" KDE-based estimator.
        Default is 0.1. Note that this is not related to the confidence
        intervals.

    Call functions
    --------------
    - amean, gmean, median, and freq_peak (from averages)

    Examples
    --------
    >>> summarize(dataset['diameters'])
    >>> summarize(dataset['diameters'], ci_level=0.99)
    >>> summarize(np.log(dataset['diameters']), avg=('amean', 'median', 'mode'))

    Returns
    -------
    None
    """

    # remove missing and infinite values
    data = data[np.isfinite(data)]

    # check for zero or negative values and remove them
    if np.any(data <= 0):
        print('Warning: There were negative and/or zero values in your dataset!')
        data = data[data > 0]
        print('Negative/zero values were automatically removed')
        print('')

    # Shapiro-Wilk tests to check normality and lognormality.
    # In Shapiro-Wilk tests, the chances of the null hypothesis being
    # rejected become larger as the sample size grows, so we limit the
    # test to a random subsample of at most 250 values (drawn without
    # replacement, and shared by both tests).
    if len(data) > 250:
        sample = np.random.choice(data, size=250, replace=False)
        W, p_value = shapiro(sample)
        W2, p_value2 = shapiro(np.log(sample))
    else:
        W, p_value = shapiro(data)
        W2, p_value2 = shapiro(np.log(data))

    # separator line reused throughout the report
    sep = '=' * 76

    if 'amean' in avg:
        if p_value2 < 0.05:
            amean, __, ci, length = averages.amean(data, ci_level,
                                                   method='ASTM')
        else:
            if len(data) > 99:
                amean, __, (low_ci, high_ci), length2 = \
                    averages.amean(data, ci_level, method='mCox')
            else:
                amean, __, (low_ci, high_ci), length2 = \
                    averages.amean(data, ci_level, method='GCI')

            # estimate the coefficients of variation
            lower_cvar = 100 * (amean - low_ci) / amean
            upper_cvar = 100 * (high_ci - amean) / amean

        print(' ')
        print(sep)
        print('CENTRAL TENDENCY ESTIMATORS')
        print(sep)
        print(f'Arithmetic mean = {amean:0.2f} microns')
        print(f'Confidence intervals at {ci_level * 100:0.1f} %')
        if p_value2 < 0.05:
            print(f'CLT (ASTM) method: {ci[0]:0.2f} - {ci[1]:0.2f}, '
                  f'(±{100 * (ci[1] - amean) / amean:0.1f}%), length = {length:0.3f}')
        else:
            method = 'mCox' if len(data) > 99 else 'GCI'
            print(f'{method} method: {low_ci:0.2f} - {high_ci:0.2f} '
                  f'(-{lower_cvar:0.1f}%, +{upper_cvar:0.1f}%), length = {length2:0.3f}')

    if 'gmean' in avg:
        # choose the optimal method to estimate the confidence interval
        m = 'CLT' if len(data) > 99 else 'bayes'
        gmean, msd, (low_ci, high_ci), length = averages.gmean(data, ci_level,
                                                               method=m)

        # estimate the coefficients of variation
        lower_cvar = 100 * (gmean - low_ci) / gmean
        upper_cvar = 100 * (high_ci - gmean) / gmean

        print(sep)
        print(f'Geometric mean = {gmean:0.2f} microns')
        print(f'Confidence interval at {ci_level * 100:0.1f} %')
        print(f'{m} method: {low_ci:0.2f} - {high_ci:0.2f} '
              f'(-{lower_cvar:0.1f}%, +{upper_cvar:0.1f}%), length = {length:0.3f}')

    if 'median' in avg:
        median, iqr, (low_ci, high_ci), length = averages.median(data, ci_level)

        # estimate the coefficients of variation
        lower_cvar = 100 * (median - low_ci) / median
        upper_cvar = 100 * (high_ci - median) / median

        print(sep)
        print(f'Median = {median:0.2f} microns')
        print(f'Confidence interval at {ci_level * 100:0.1f} %')
        print(f'robust method: {low_ci:0.2f} - {high_ci:0.2f} '
              f'(-{lower_cvar:0.1f}%, +{upper_cvar:0.1f}%), length = {length:0.3f}')

    if 'mode' in avg:
        __, mode, __, bw = averages.freq_peak(data, bandwidth, precision)

        print(sep)
        print(f'Mode (KDE-based) = {mode:0.2f} microns')
        print(f'Maximum precision set to {precision}')

        if isinstance(bandwidth, str):
            print(f'KDE bandwidth = {bw} ({bandwidth} rule)')
        else:
            print(f'KDE bandwidth = {bandwidth}')

    print(' ')
    print(sep)
    print('DISTRIBUTION FEATURES')
    print(sep)
    print(f'Sample size (n) = {len(data)}')
    print(f'Standard deviation = {np.std(data):0.2f} (1-sigma)')
    if 'median' in avg:
        print(f'Interquartile range (IQR) = {iqr:0.2f}')
    if 'gmean' in avg:
        print(f'Lognormal shape (Multiplicative Standard Deviation) = {msd:0.2f}')
    print(sep)
    print('Shapiro-Wilk test warnings:')
    if p_value < 0.05:
        print('Data is not normally distributed!')
        print(f'Normality test: {W:0.2f}, {p_value:0.2f} (test statistic, p-value)')
    if p_value2 < 0.05:
        print('Data is not lognormally distributed!')
        print(f'Lognormality test: {W2:0.2f}, {p_value2:0.2f} (test statistic, p-value)')
    print(sep)

    return None
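

# --- Hedged usage sketch (not part of the original module): a minimal run of
# `summarize` on synthetic, lognormally distributed "diameters". The seed,
# shape, and scale values below are illustrative assumptions only.
if __name__ == '__main__':
    rng = np.random.default_rng(seed=42)
    toy_diameters = rng.lognormal(mean=3.0, sigma=0.5, size=500)  # in microns
    summarize(toy_diameters)                 # full report, 95% confidence
    summarize(toy_diameters, ci_level=0.99)  # same data, wider intervals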
    def test_median_known_value(self):
        """median should return known value from known input list."""
        for nums, value in self.known_medians:
            result = averages.median(nums)
            self.assertEqual(value, result)
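

# --- Hedged sketch (assumed, not from the original suite): the unittest
# scaffolding a method like the one above would live in. The `known_medians`
# fixture is not shown in the original, so the input/expected pairs here are
# illustrative only.
import unittest

class MedianTestCase(unittest.TestCase):
    known_medians = [([1, 2, 3], 2),       # odd length: middle element
                     ([1, 2, 3, 4], 2.5),  # even length: mean of middle two
                     ([7], 7)]             # single element is its own median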
# usage example for a simple, variadic `averages` module
import averages

print(averages.mean(1, 5, 1))            # arithmetic mean
print(averages.median(1, 5, 1))          # middle value after sorting
print(averages.rms(1, 5, 1))             # root mean square
print(averages.middle_average(1, 5, 1))  # presumably the mid-range
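

# --- Hedged sketch (an assumption, not the original module): one plausible
# implementation of the variadic helpers called above. `middle_average` is
# taken here to mean the mid-range, i.e. (min + max) / 2.
import math

def mean(*nums):
    """Arithmetic mean of the given numbers."""
    return sum(nums) / len(nums)

def median(*nums):
    """Middle value; for an even count, the mean of the two middle values."""
    s = sorted(nums)
    mid = len(s) // 2
    return s[mid] if len(s) % 2 else (s[mid - 1] + s[mid]) / 2

def rms(*nums):
    """Root mean square (quadratic mean)."""
    return math.sqrt(sum(n * n for n in nums) / len(nums))

def middle_average(*nums):
    """Mid-range: average of the smallest and largest values."""
    return (min(nums) + max(nums)) / 2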