Example #1
0
def chao1(counts, bias_corrected=True):
    r"""Estimate species richness with the chao1 estimator.

    The bias-corrected form is the default. The uncorrected form is applied
    only when `bias_corrected` is ``False`` *and* the sample contains both
    singletons and doubletons.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        Whether to use the bias-corrected equation. Passing ``False`` selects
        the uncorrected equation only if singletons and doubletons are both
        present; in every other case the bias-corrected equation is applied.

    Returns
    -------
    double
        Computed chao1 richness estimator.

    See Also
    --------
    chao1_ci

    Notes
    -----
    The uncorrected version follows Equation 6 in [1]_:

    .. math::

       chao1=S_{obs}+\frac{F_1^2}{2F_2}

    where :math:`F_1` is the number of singletons and :math:`F_2` is the
    number of doubletons.

    The bias-corrected version is

    .. math::

       chao1=S_{obs}+\frac{F_1(F_1-1)}{2(F_2+1)}

    References
    ----------
    .. [1] Chao, A. 1984. Non-parametric estimation of the number of classes in
       a population. Scandinavian Journal of Statistics 11, 265-270.

    """
    counts = _validate_counts_vector(counts)
    observed, singles, doubles = osd(counts)

    # The uncorrected term divides by the doubleton count, so it is only
    # selected when explicitly requested and both F_1 and F_2 are nonzero.
    use_uncorrected = not bias_corrected and singles and doubles
    if use_uncorrected:
        unseen = singles ** 2 / (doubles * 2)
    else:
        unseen = singles * (singles - 1) / (2 * (doubles + 1))
    return observed + unseen
Example #2
0
def chao1(counts, bias_corrected=True):
    r"""Compute the chao1 richness estimate for a vector of counts.

    The uncorrected equation is used only when `bias_corrected` is ``False``
    *and* both singletons and doubletons are present; otherwise the
    bias-corrected equation is used.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        If ``True`` (the default), use the bias-corrected equation. If
        ``False``, use the uncorrected equation, falling back to the
        bias-corrected one when singletons or doubletons are absent.

    Returns
    -------
    double
        Computed chao1 richness estimator.

    See Also
    --------
    chao1_ci

    Notes
    -----
    The uncorrected version is based on Equation 6 in [1]_:

    .. math::

       chao1=S_{obs}+\frac{F_1^2}{2F_2}

    with :math:`F_1` the count of singletons and :math:`F_2` the count of
    doubletons.

    The bias-corrected version is defined as

    .. math::

       chao1=S_{obs}+\frac{F_1(F_1-1)}{2(F_2+1)}

    References
    ----------
    .. [1] Chao, A. 1984. Non-parametric estimation of the number of classes in
       a population. Scandinavian Journal of Statistics 11, 265-270.

    """
    counts = _validate_counts_vector(counts)
    n_observed, f1, f2 = osd(counts)

    # Bias-corrected path: requested explicitly, or forced because the
    # uncorrected formula needs both singletons and doubletons.
    if bias_corrected or not (f1 and f2):
        return n_observed + f1 * (f1 - 1) / (2 * (f2 + 1))

    return n_observed + f1**2 / (f2 * 2)
Example #3
0
def chao1_ci(counts, bias_corrected=True, zscore=1.96):
    """Compute a confidence interval for the chao1 richness estimate.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        Whether to use the bias-corrected equation. Passing ``False`` selects
        the uncorrected equation only if singletons and doubletons are both
        present; in every other case the bias-corrected equation is applied.
    zscore : scalar, optional
        Score to use for confidence. The default of 1.96 corresponds to a 95%
        confidence interval.

    Returns
    -------
    tuple
        chao1 confidence interval as ``(lower_bound, upper_bound)``.

    See Also
    --------
    chao1

    Notes
    -----
    The implementation follows the equations in the EstimateS manual [1]_.
    Which equations are used for the chao1 variance and confidence interval
    depends on `bias_corrected` and on whether singletons and/or doubletons
    are present.

    Specifically, the following EstimateS equations are used:

    1. No singletons, Equation 14.
    2. Singletons but no doubletons, Equations 7, 13.
    3. Singletons and doubletons, ``bias_corrected=True``, Equations 6, 13.
    4. Singletons and doubletons, ``bias_corrected=False``, Equations 5, 13.

    References
    ----------
    .. [1] http://viceroy.eeb.uconn.edu/estimates/

    """
    counts = _validate_counts_vector(counts)
    observed, singles, _ = osd(counts)

    # Without singletons the interval is derived from the total count and
    # the observed richness only.
    if not singles:
        total = counts.sum()
        return _chao_confidence_no_singletons(total, observed, zscore)

    estimate = chao1(counts, bias_corrected)
    variance = _chao1_var(counts, bias_corrected)
    return _chao_confidence_with_singletons(estimate, observed, variance,
                                            zscore)
Example #4
0
def chao1_ci(counts, bias_corrected=True, zscore=1.96):
    """Return the chao1 confidence interval for a vector of counts.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    bias_corrected : bool, optional
        If ``True`` (the default), use the bias-corrected equation. If
        ``False``, use the uncorrected equation, falling back to the
        bias-corrected one when singletons or doubletons are absent.
    zscore : scalar, optional
        Score to use for confidence. Default of 1.96 is for a 95% confidence
        interval.

    Returns
    -------
    tuple
        chao1 confidence interval as ``(lower_bound, upper_bound)``.

    See Also
    --------
    chao1

    Notes
    -----
    The equations come from the EstimateS manual [1]_. The variance and
    confidence interval equations that apply depend on `bias_corrected` and
    on the presence or absence of singletons and/or doubletons.

    Specifically, the following EstimateS equations are used:

    1. No singletons, Equation 14.
    2. Singletons but no doubletons, Equations 7, 13.
    3. Singletons and doubletons, ``bias_corrected=True``, Equations 6, 13.
    4. Singletons and doubletons, ``bias_corrected=False``, Equations 5, 13.

    References
    ----------
    .. [1] http://viceroy.eeb.uconn.edu/estimates/

    """
    counts = _validate_counts_vector(counts)
    n_observed, n_singletons, _ = osd(counts)

    if n_singletons:
        # Singletons present: interval is built from the point estimate and
        # its variance.
        point_estimate = chao1(counts, bias_corrected)
        point_variance = _chao1_var(counts, bias_corrected)
        bounds = _chao_confidence_with_singletons(
            point_estimate, n_observed, point_variance, zscore)
    else:
        # No singletons: interval depends only on the total count and the
        # observed richness.
        bounds = _chao_confidence_no_singletons(
            counts.sum(), n_observed, zscore)
    return bounds
Example #5
0
def lladser_ci(counts, r, alpha=0.95, f=10, ci_type='ULCL'):
    """Compute a single CI of the conditional uncovered probability.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    r : int
        Number of new colors that are required for the next prediction.
    alpha : float, optional
        Desired confidence level.
    f : float, optional
        Ratio between upper and lower bound.
    ci_type : {'ULCL', 'ULCU', 'U', 'L'}
        Type of confidence interval. ``'ULCL'``: upper and lower bounds with
        conservative lower bound. ``'ULCU'``: upper and lower bounds with
        conservative upper bound. ``'U'``: upper bound only, lower bound
        fixed to 0.0. ``'L'``: lower bound only, upper bound fixed to 1.0.

    Returns
    -------
    tuple
        Confidence interval as ``(lower_bound, upper_bound)``. Both entries
        are ``np.nan`` if no interval could be obtained from the series.

    See Also
    --------
    lladser_pe

    Notes
    -----
    This is a thin wrapper around the full CI estimator described in
    Theorem 2 (iii) in [1]_, meant to give a single best CI estimate on a
    complete sample. The expanded sample is shuffled with
    ``np.random.shuffle`` before estimation, so results depend on NumPy's
    global random state.

    References
    ----------
    .. [1] Lladser, Gouet, and Reeder, "Extrapolation of Urn Models via
       Poissonization: Accurate Measurements of the Microbial Unknown" PLoS
       2011.

    """
    counts = _validate_counts_vector(counts)
    shuffled = _expand_counts(counts)
    np.random.shuffle(shuffled)

    try:
        # Only the final interval of the series is reported.
        intervals = list(_lladser_ci_series(shuffled, r, alpha, f, ci_type))
        return intervals[-1]
    except IndexError:
        return (np.nan, np.nan)
Example #6
0
def lladser_ci(counts, r, alpha=0.95, f=10, ci_type='ULCL'):
    """Return one confidence interval for the conditional uncovered probability.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    r : int
        Number of new colors that are required for the next prediction.
    alpha : float, optional
        Desired confidence level.
    f : float, optional
        Ratio between upper and lower bound.
    ci_type : {'ULCL', 'ULCU', 'U', 'L'}
        Type of confidence interval:

        - ``'ULCL'``: upper and lower bounds with conservative lower bound.
        - ``'ULCU'``: upper and lower bounds with conservative upper bound.
        - ``'U'``: upper bound only, lower bound fixed to 0.0.
        - ``'L'``: lower bound only, upper bound fixed to 1.0.

    Returns
    -------
    tuple
        Confidence interval as ``(lower_bound, upper_bound)``, or
        ``(np.nan, np.nan)`` when no interval is available.

    See Also
    --------
    lladser_pe

    Notes
    -----
    Wraps the full CI estimator described in Theorem 2 (iii) in [1]_ so that
    a single best CI estimate can be requested for a complete sample. The
    sample is shuffled in place with ``np.random.shuffle`` first, which makes
    the result depend on NumPy's global random state.

    References
    ----------
    .. [1] Lladser, Gouet, and Reeder, "Extrapolation of Urn Models via
       Poissonization: Accurate Measurements of the Microbial Unknown" PLoS
       2011.

    """
    counts = _validate_counts_vector(counts)
    observations = _expand_counts(counts)
    np.random.shuffle(observations)

    no_interval = (np.nan, np.nan)
    try:
        ci_series = _lladser_ci_series(observations, r, alpha, f, ci_type)
        # The last element of the series is the estimate for the full sample.
        result = list(ci_series)[-1]
    except IndexError:
        result = no_interval
    return result
Example #7
0
def lladser_pe(counts, r=10):
    """Compute a single point estimate of conditional uncovered probability.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    r : int, optional
        Number of new colors that are required for the next prediction.

    Returns
    -------
    double
        Single point estimate of the conditional uncovered probability.
        ``np.nan`` is returned when a point estimate could not be computed.

    See Also
    --------
    lladser_ci

    Notes
    -----
    This is a thin wrapper around the full point estimator described in
    Theorem 2 (i) in [1]_, meant to give a single best estimate on a complete
    sample. Estimated uncovered probabilities are not guaranteed to be less
    than 1 if the coverage is too low. The expanded sample is shuffled with
    ``np.random.shuffle`` before estimation, so results depend on NumPy's
    global random state.

    References
    ----------
    .. [1] Lladser, Gouet, and Reeder, "Extrapolation of Urn Models via
       Poissonization: Accurate Measurements of the Microbial Unknown" PLoS
       2011.

    """
    counts = _validate_counts_vector(counts)
    shuffled = _expand_counts(counts)
    np.random.shuffle(shuffled)

    try:
        # Only the first field of the final estimate is reported.
        estimates = list(_lladser_point_estimates(shuffled, r))
        return estimates[-1][0]
    except IndexError:
        return np.nan
Example #8
0
def lladser_pe(counts, r=10):
    """Return one point estimate of the conditional uncovered probability.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    r : int, optional
        Number of new colors that are required for the next prediction.

    Returns
    -------
    double
        Single point estimate of the conditional uncovered probability, or
        ``np.nan`` if a point estimate could not be computed.

    See Also
    --------
    lladser_ci

    Notes
    -----
    Wraps the full point estimator described in Theorem 2 (i) in [1]_ so that
    a single best estimate can be requested for a complete sample. When the
    coverage is too low, the estimated uncovered probability is not
    guaranteed to be less than 1. The sample is shuffled in place with
    ``np.random.shuffle`` first, which makes the result depend on NumPy's
    global random state.

    References
    ----------
    .. [1] Lladser, Gouet, and Reeder, "Extrapolation of Urn Models via
       Poissonization: Accurate Measurements of the Microbial Unknown" PLoS
       2011.

    """
    counts = _validate_counts_vector(counts)
    observations = _expand_counts(counts)
    np.random.shuffle(observations)

    try:
        point_series = _lladser_point_estimates(observations, r)
        # Last element of the series, first field of that element.
        estimate = list(point_series)[-1][0]
    except IndexError:
        estimate = np.nan
    return estimate
Example #9
0
def gini_index(data, method='rectangles'):
    r"""Compute the Gini index of a vector of values.

    The Gini index is defined as

    .. math::

       G=\frac{A}{A+B}

    where :math:`A` is the area between :math:`y=x` and the Lorenz curve and
    :math:`B` is the area under the Lorenz curve. Because :math:`A+B=0.5`,
    this simplifies to :math:`1-2B`, which is what is returned.

    Parameters
    ----------
    data : 1-D array_like
        Vector of counts, abundances, proportions, etc. All entries must be
        non-negative.
    method : {'rectangles', 'trapezoids'}
        Method for calculating the area under the Lorenz curve. With
        ``'rectangles'``, the Lorenz curve points are connected by lines
        parallel to the x axis. This is the correct method (in our opinion)
        though ``'trapezoids'`` might be desirable in some circumstances. With
        ``'trapezoids'``, the Lorenz curve points are connected by linear
        segments. This basically assumes that the given sampling is accurate
        and that more features of given data would fall on linear gradients
        between the values of this data.

    Returns
    -------
    double
        Gini index.

    Raises
    ------
    ValueError
        If `method` isn't one of the supported methods for calculating the area
        under the curve.

    Notes
    -----
    The Gini index was introduced in [1]_. The formula for
    ``method='rectangles'`` is

    .. math::

       dx\sum_{i=1}^n h_i

    The formula for ``method='trapezoids'`` is

    .. math::

       dx(\frac{h_0+h_n}{2}+\sum_{i=1}^{n-1} h_i)

    References
    ----------
    .. [1] Gini, C. (1912). "Variability and Mutability", C. Cuppini, Bologna,
       156 pages. Reprinted in Memorie di metodologica statistica (Ed. Pizetti
       E, Salvemini, T). Rome: Libreria Eredi Virgilio Veschi (1955).

    """
    # Integer casting is suppressed: both int and float input is supported.
    values = _validate_counts_vector(data, suppress_cast=True)
    area_under_curve = _lorenz_curve_integrator(_lorenz_curve(values),
                                                method)
    # G = A / (A + B) with A + B = 0.5, i.e. G = 1 - 2B.
    return 1 - 2 * area_under_curve
Example #10
0
def ace(counts, rare_threshold=10):
    r"""Calculate the ACE metric (Abundance-based Coverage Estimator).

    The ACE metric is defined as:

    .. math::

       S_{ace}=S_{abund}+\frac{S_{rare}}{C_{ace}}+
       \frac{F_1}{C_{ace}}\gamma^2_{ace}

    where :math:`S_{abund}` is the number of abundant OTUs (with more than
    `rare_threshold` individuals) when all samples are pooled,
    :math:`S_{rare}` is the number of rare OTUs (with less than or equal to
    `rare_threshold` individuals) when all samples are pooled, :math:`C_{ace}`
    is the sample abundance coverage estimator, :math:`F_1` is the frequency of
    singletons, and :math:`\gamma^2_{ace}` is the estimated coefficient of
    variation for rare OTUs.

    The estimated coefficient of variation is defined as (assuming
    `rare_threshold` is 10, the default):

    .. math::

       \gamma^2_{ace}=max\left[\frac{S_{rare}}{C_{ace}}
       \frac{\sum^{10}_{i=1}{{i\left(i-1\right)}}F_i}
       {\left(N_{rare}\right)\left(N_{rare}-1\right)} -1,0\right]

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    rare_threshold : int, optional
        Threshold at which an OTU containing as many or fewer individuals will
        be considered rare.

    Returns
    -------
    double
        Computed ACE metric.

    Raises
    ------
    ValueError
        If every rare OTU is a singleton.

    Notes
    -----
    ACE was first introduced in [1]_ and [2]_. The implementation here is based
    on the description given in the EstimateS manual [3]_.

    If no rare OTUs exist, returns the number of abundant OTUs. The default
    value of 10 for `rare_threshold` is based on [4]_.

    If `counts` contains zeros, indicating OTUs which are known to exist in the
    environment but did not appear in the sample, they will be ignored for the
    purpose of calculating the number of rare OTUs. A vector in which no OTU
    has a nonzero count has zero singletons; the singleton lookup no longer
    raises ``IndexError`` for such input.

    References
    ----------
    .. [1] Chao, A. & S.-M Lee. 1992 Estimating the number of classes via
       sample coverage. Journal of the American Statistical Association 87,
       210-217.
    .. [2] Chao, A., M.-C. Ma, & M. C. K. Yang. 1993. Stopping rules and
       estimation for recapture debugging with unequal failure rates.
       Biometrika 80, 193-201.
    .. [3] http://viceroy.eeb.uconn.edu/estimates/
    .. [4] Chao, A., W.-H. Hwang, Y.-C. Chen, and C.-Y. Kuo. 2000. Estimating
       the number of shared species in two communities. Statistica Sinica
       10:227-246.

    """
    counts = _validate_counts_vector(counts)
    # ``minlength=2`` guarantees that index 1 (the singleton frequency) exists
    # even when no count is >= 1, e.g. for an all-zero vector. Without it,
    # ``freq_counts[1]`` below raises IndexError on such input. The padding
    # only adds zero-valued frequency bins.
    freq_counts = np.bincount(counts, minlength=2)
    s_rare = _otus_rare(freq_counts, rare_threshold)
    singles = freq_counts[1]

    # If all rare OTUs are singletons the coverage estimate below would be
    # zero, so the metric cannot be computed.
    if singles > 0 and singles == s_rare:
        raise ValueError("The only rare OTUs are singletons, so the ACE "
                         "metric is undefined. EstimateS suggests using "
                         "bias-corrected Chao1 instead.")

    s_abun = _otus_abundant(freq_counts, rare_threshold)
    if s_rare == 0:
        return s_abun

    n_rare = _number_rare(freq_counts, rare_threshold)
    c_ace = 1 - singles / n_rare

    top = s_rare * _number_rare(freq_counts, rare_threshold, gamma=True)
    bottom = c_ace * n_rare * (n_rare - 1)
    # Clamp at zero, matching the max[..., 0] in the definition of gamma^2.
    gamma_ace = max((top / bottom) - 1, 0)

    return s_abun + (s_rare / c_ace) + ((singles / c_ace) * gamma_ace)