def make_normal_quantile_normalizer(mean, sd, bins=1000):
    """Returns f(a) that converts a to the specified normal distribution."""
    # Evenly spaced quantiles of the standard normal, scaled to sd and shifted
    # to mean exactly once.
    dist = array([ndtri(i) for i in arange(1.0 / bins, 1, 1.0 / bins)])
    dist = (dist * sd) + mean
    return make_quantile_normalizer(dist)
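
# A minimal usage sketch, not part of the original module. It assumes that
# make_quantile_normalizer(dist) returns a callable f(a) that maps the
# empirical quantiles of the array a onto the reference distribution dist, as
# the docstring above implies, and that ndtri, array, and arange come from
# scipy.special and numpy.
def _example_normalize_to_standard_normal():
    """Hypothetical demo: reshape skewed data into an approximate N(0, 1)."""
    from numpy.random import exponential

    raw = exponential(scale=2.0, size=500)  # right-skewed input data
    normalize = make_normal_quantile_normalizer(mean=0.0, sd=1.0)
    normalized = normalize(raw)

    # After quantile normalization the values should have roughly zero mean
    # and unit standard deviation.
    return normalized.mean(), normalized.std()
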
def __call__(self, size, confidence_level=0.95):
    if confidence_level <= 0 or confidence_level >= 1:
        raise ValueError("Invalid confidence level: %.4f. Must be between "
                         "zero and one (exclusive)." % confidence_level)

    # We'll use the variable names from Colwell 2012 for clarity and brevity.
    m = size
    fk = self.getAbundanceFrequencyCounts()
    n = self.getTotalIndividualCount()
    s_obs = self.getObservationCount()
    s_est = self.estimateFullRichness()

    if m <= n:
        # Interpolation.

        # Equation 4 in Colwell 2012 for the estimate.
        estimate_acc = 0

        # Equation 5 in Colwell 2012 gives unconditional variance, but they
        # report the standard error (SE) (which is the same as the standard
        # deviation in this case) in their tables and use this to construct
        # confidence intervals. Thus, we compute SE as sqrt(variance).
        std_err_acc = 0

        for k in range(1, n + 1):
            alpha_km = self._calculate_alpha_km(n, k, m)
            estimate_acc += alpha_km * fk[k]
            std_err_acc += (((1 - alpha_km) ** 2) * fk[k])

        estimate = s_obs - estimate_acc

        # Convert variance to standard error.
        std_err = sqrt(std_err_acc - (estimate ** 2 / s_est))
    else:
        # Extrapolation.
        m_star = m - n
        f1 = fk[1]
        f2 = fk[2]
        f_hat = self.estimateUnobservedObservationCount()

        try:
            # Equation 9 in Colwell 2012.
            estimate = s_obs + f_hat * (
                1 - (1 - (f1 / (n * f_hat))) ** m_star)
        except ZeroDivisionError:
            # This can happen if we have exactly one singleton and no
            # doubletons, or no singletons and no doubletons.
            estimate = None
            std_err = None
        else:
            # Equation 10 in Colwell 2012. I used Wolfram Alpha to calculate
            # the analytic partial derivatives since they weren't provided in
            # the original paper. We have two partial derivatives, wrt f1 and
            # f2, that we really care about. All other partial derivatives
            # (e.g. wrt f3, f4, etc.) get a value of 1.
            pd_f1 = self._partial_derivative_f1(f1, f2, m_star, n)
            pd_f2 = self._partial_derivative_f2(f1, f2, m_star, n)
            pd_f1f2 = pd_f1 * pd_f2

            # To do this efficiently, here's the algorithm:
            #
            # 1) Create nxn array filled with ones. Each element represents
            #    the multiplication of two partial derivatives.
            # 2) Fill in only what we need: the multiplication of partial
            #    derivatives wrt f1 and f2.
            # 3) Do an element-wise multiply between our partial derivative
            #    matrix and the covariance matrix. tensordot does this and
            #    also sums the result, which is exactly what we need. In the
            #    end, we've summed all n^2 elements, each of which are
            #    (pd_fi * pd_fj * cov_ij).
            self._pd_matrix[0, :] = pd_f1
            self._pd_matrix[1, :] = pd_f2
            self._pd_matrix[:, 0] = pd_f1
            self._pd_matrix[:, 1] = pd_f2
            self._pd_matrix[0, 0] = pd_f1 ** 2
            self._pd_matrix[0, 1] = pd_f1f2
            self._pd_matrix[1, 0] = pd_f1f2
            self._pd_matrix[1, 1] = pd_f2 ** 2

            std_err = sqrt(tensordot(self._pd_matrix, self._cov_matrix))

    # Compute CI based on std_err.
    ci_low = None
    ci_high = None
    if std_err is not None:
        # z_crit will be something like 1.96 for 95% CI.
        z_crit = abs(ndtri((1 - confidence_level) / 2))
        ci_bound = z_crit * std_err
        ci_low = estimate - ci_bound
        ci_high = estimate + ci_bound

    return estimate, std_err, ci_low, ci_high
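
# The interpolation branch above calls a _calculate_alpha_km helper that is
# not shown in this excerpt. The sketch below is an assumption about what it
# computes, based on Equation 4 in Colwell 2012: alpha_km is the probability
# that a species with abundance k is missed entirely in a subsample of m of
# the n individuals, i.e. choose(n - k, m) / choose(n, m). Log-gammas keep the
# factorials from overflowing for large n; gammaln and exp come from
# scipy.special and numpy. The helper name is hypothetical.
from numpy import exp
from scipy.special import gammaln


def _calculate_alpha_km_sketch(n, k, m):
    """Hypothetical stand-in for self._calculate_alpha_km (Colwell 2012, Eq. 4)."""
    if k > n - m:
        # A species with more than n - m individuals cannot be missed by a
        # subsample of size m, so the "missed" probability is zero.
        return 0.0
    # choose(n - k, m) / choose(n, m), computed in log space as
    # (n - k)! (n - m)! / (n! (n - k - m)!).
    return exp(gammaln(n - k + 1) + gammaln(n - m + 1)
               - gammaln(n + 1) - gammaln(n - k - m + 1))
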