Code Example #1
File: features.py Project: fmcc/mss_layout_analysis
def std_dev_contrast_stretch(arr: np.ndarray, n=2):
    """ Performs a contrast stretch from +/-2σ around the mean to 
        -1 to 1. 
        """
    sigma = arr.std()*n
    m = arr.mean()
    return np.interp(arr,[m-sigma,m+sigma],[-1,1])
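A minimal usage sketch (illustrative only; it relies on the function above and on numpy imported as np). Values within n standard deviations of the mean are mapped linearly onto [-1, 1], and np.interp clamps anything outside that band.

import numpy as np

arr = np.array([0.0, 10.0, 20.0, 200.0, 255.0])  # toy intensity values
stretched = std_dev_contrast_stretch(arr, n=1)
print(stretched)  # values outside mean ± 1σ saturate at -1 or 1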
Code Example #2
File: som.py Project: tlhr/plumology
    def _init_weights(self, X: np.ndarray) -> None:
        """Initialize weights from PCA eigenvectors"""
        if not hasattr(self, 'weights'):
            pca = PCA(n_components=self._ndims)
            comp = pca.fit(X).components_[:2]
            coeff = X.mean(0) + 5 * X.std(0) / self._shape[0]

            # Create grid based on PCA eigenvectors and std dev of features
            raw_weights = np.asarray([
                (coeff * (comp[0] * (x - 0.5 / self._shape[0]) +
                          comp[1] * (y - 0.5 / self._shape[1])))
                for x, y in zip(np.nditer(self._X.flatten()),
                                np.nditer(self._Y.flatten()))
            ]).reshape(self._shape + (self._ndims,))

            # Scale to (0, 1)
            full_shape = self._shape + (1,)
            self.weights = (
                (raw_weights - raw_weights.min(2).reshape(full_shape)) /
                raw_weights.ptp(2).reshape(full_shape)
            )
Code Example #3
def mean_dif_std(arr: np.ndarray) -> float:
    """Return the mean minus the standard deviation of the array."""
    return arr.mean() - arr.std()
Code Example #4
File: utils.py Project: larryshaw0079/MME
def tensor_standardize(x: np.ndarray, dim=-1):
    x_mean = np.expand_dims(x.mean(axis=dim), axis=dim)
    x_std = np.expand_dims(x.std(axis=dim), axis=dim)
    return (x - x_mean) / tackle_denominator(x_std)
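The tackle_denominator helper is not shown in this snippet; a plausible sketch (an assumption, not the project's actual code) would simply guard against zero denominators so the standardization never divides by zero:

import numpy as np

def tackle_denominator(x: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    # Hypothetical stand-in: replace (near-)zero entries so the division
    # in tensor_standardize never produces inf or NaN.
    x = x.copy()
    x[np.abs(x) < eps] = eps
    return x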
Code Example #5
def _normalize(adv: np.ndarray):
    """Normalize the advantage function to zero mean and unit variance."""
    return (adv - adv.mean()) / (adv.std() + 1e-8)
Code Example #6
def normalize(x: np.ndarray):
    mean = x.mean()
    std = x.std()
    x = (x - mean) / (std + 1e-11 + 1j * 1e-11)
    return x, mean, std
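The complex-valued epsilon suggests this helper is meant to also handle complex arrays (for example spectra); a minimal usage sketch, offered as an illustrative assumption about the intended input:

import numpy as np

rng = np.random.default_rng(0)
spec = np.fft.rfft(rng.normal(size=256))  # complex-valued input
z, mu, sigma = normalize(spec)
# With purely real input, the complex epsilon promotes the result to a
# complex dtype, which callers may need to account for.
print(z.dtype, mu, sigma)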
Code Example #7
def image_std(np_img: np.ndarray):
    """Return standard deviation of each channel.

    """
    return np_img.std(axis=(0, 1))
Code Example #8
def safe_normalize(vector: np.ndarray):
    vector = vector - vector.mean()
    std = vector.std()
    if std > 0:
        vector /= std
    return vector
Code Example #9
def zech_aslan(x: np.ndarray, y: np.ndarray, *, dmin: float = 1e-12) -> float:
    r"""
    Compute a modified Zech-Aslan energy distance dissimilarity metric based on an analogy with the energy of a cloud of electrical charges.

    This method is scale-invariant.

    Parameters
    ----------
    x : np.ndarray (n,d)
      Reference sample.
    y : np.ndarray (m,d)
      Candidate sample.
    dmin : float
      The cut-off for low distances to avoid singularities on identical points.

    Returns
    -------
    float
      Zech-Aslan dissimilarity metric ranging from -infinity to infinity.

    Notes
    -----
    The energy measure between two variables :math:`X`, :math:`Y` (target and candidates) of
    sizes :math:`n,d` and :math:`m,d` proposed by [AZ03]_ is defined by:

    .. math::

        e(X, Y) &= \left[\phi_{xx} + \phi_{yy} - \phi_{xy}\right] \\
        \phi_{xy} &= \frac{1}{n m} \sum_{i = 1}^n \sum_{j = 1}^m R\left[SED(X_i, Y_j)\right] \\
        \phi_{xx} &= \frac{1}{n^2} \sum_{i = 1}^n \sum_{j = i + 1}^n R\left[SED(X_i, X_j)\right] \\
        \phi_{yy} &= \frac{1}{m^2} \sum_{i = 1}^m \sum_{j = i + 1}^m R\left[SED(X_i, Y_j)\right] \\

    where :math:`X_i` denotes the i-th observation of :math:`X`. :math:`R` is a weight function
    and :math:`SED(A, B)` denotes the standardized Euclidean distance.

    .. math::

        R(r) &= \left\{\begin{array}{r l} -\ln r & \text{for } r > d_{min} \\ -\ln d_{min} & \text{for } r \leq d_{min} \end{array}\right. \\
        SED(X_i, Y_j) &= \sqrt{\sum_{k=1}^d \frac{\left(X_i(k) - Y_i(k)\right)^2}{\sigma_x(k)\sigma_y(k)}}

    where :math:`k` is a counter over dimensions (indices in the case of spatial analogs)
    and :math:`\sigma_x(k)` is the standard deviation of :math:`X` in dimension :math:`k`.
    Finally, :math:`d_{min}` is a cut-off to avoid poles when :math:`r \to 0`; it is
    controllable through the `dmin` parameter.

    This version corresponds to the :math:`D_{ZAE}` test of [Grenier2013]_ (eq. 7), which is
    a version of :math:`\phi_{NM}` from [AZ03]_, modified by using the standardized
    Euclidean distance, the log weight function, and choosing :math:`d_{min} = 10^{-12}`.

    References
    ----------
    .. [ZA03] Zech G. and Aslan B. (2003) A multivariate two-sample test based on the concept of minimum energy. PHYSTAT2003, SLAC, Stanford, CA, Sep 8-11.
    .. [AZ03] Aslan B. and Zech G. (2003) A new class of binning-free, multivariate goodness-of-fit tests: the energy tests. arXiv:hep-ex/0203010.
    .. [Grenier2013] Grenier, P., Parent, A.-C., Huard, D., Anctil, F., and Chaumont, D. (2013) An assessment of six dissimilarity metrics for climate analogs. Journal of Applied Meteorology and Climatology, 52(4), 733-752.
    """
    nx, d = x.shape
    ny, d = y.shape

    v = (x.std(axis=0, ddof=1) * y.std(axis=0, ddof=1)).astype(np.double)

    dx = spatial.distance.pdist(x, "seuclidean", V=v)
    dy = spatial.distance.pdist(y, "seuclidean", V=v)
    dxy = spatial.distance.cdist(x, y, "seuclidean", V=v)

    phix = -np.log(dx.clip(dmin)).sum() / (nx * (nx - 1))
    phiy = -np.log(dy.clip(dmin)).sum() / (ny * (ny - 1))
    phixy = -np.log(dxy.clip(dmin)).sum() / (nx * ny)
    return phix + phiy - phixy
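A usage sketch (an assumption about the calling context; it relies on numpy imported as np and scipy.spatial imported as spatial, as the function above already requires):

import numpy as np
from scipy import spatial

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 3))                 # reference sample (n, d)
y_same = rng.normal(size=(80, 3))             # same distribution as x
y_shift = rng.normal(loc=2.0, size=(80, 3))   # clearly shifted candidate

# zech_aslan is a dissimilarity metric: the shifted candidate should score higher.
print(zech_aslan(x, y_same), zech_aslan(x, y_shift))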
Code Example #10
File: pacman_ppo.py Project: klottick/ms_pacman
def _normalize(adv: np.ndarray):
    return (adv - adv.mean()) / (adv.std() + 1e-8)
Code Example #11
def scale(x: np.ndarray, axis: int = 0) -> np.ndarray:
    """Normalize features, assuming 2D array t * n with n features and t observations."""
    return (x - x.mean(axis, keepdims=True)) / x.std(axis, keepdims=True)
Code Example #12
File: utils.py Project: Zeta36/fragile
def statistics_from_array(x: numpy.ndarray):
    """Return the (mean, std, max, min) of an array."""
    try:
        return x.mean(), x.std(), x.max(), x.min()
    except AttributeError:
        return numpy.nan, numpy.nan, numpy.nan, numpy.nan
Code Example #13
def zscore(x: np.ndarray) -> np.ndarray:
    """Replace all array values with their z-scores."""
    return (x - x.mean()) / x.std(ddof=1)
Code Example #14
File: _scaling.py Project: ruclion/LemonML
def std(data: np.ndarray):
    """Standardize `data` in place to zero mean and unit variance (returns None)."""
    mean_ = data.mean(axis=0)
    std_ = data.std(axis=0)
    data -= mean_
    data /= std_
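Because this scaler works in place and returns None, callers use the mutated array directly; a minimal usage sketch (illustrative only):

import numpy as np

data = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=(100, 3))
std(data)                                    # mutates `data`, returns None
print(data.mean(axis=0), data.std(axis=0))   # each column now has ~0 mean, ~1 std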
Code Example #15
File: utils.py Project: bioidiap/bob.bio.spear
def normalize_std_array(vector: np.ndarray):
    """Apply zero-mean, unit-variance normalization to an array (returned unchanged if its std is 0)."""
    return (vector if vector.std(axis=0) == 0 else
            (vector - vector.mean(axis=0)) / vector.std(axis=0))
Code Example #16
    def fit(self, X: np.ndarray, y: np.ndarray, sample_weight: Optional[np.ndarray] = None) -> 'Explainer':
        """Fit ensemble of feature-wise GBMs.

        @param X Training batch inputs.
        @param y Training batch outputs.
        @param sample_weight Sample weights (not supported for now).
        """
        check_X_y(X, y)

        n_features = X.shape[1]
        gbm_prototype = RGBMRegressor(n_estimators=self.pretraining_iter,
                                      max_depth=self.tree_max_depth,
                                      learning_rate=self.gbm_lr,
                                      init_est_type=self.init_est_type,
                                      use_deterministic_trees=self.use_deterministic_trees)

        # make estimators with the same prototype
        self.estimators_ = [clone(gbm_prototype) for _ in range(n_features)]
        # initialize weights with ones
        self.weights_ = torch.ones(n_features, dtype=torch.double,
                                   requires_grad=True)

        y_norm = 0
        if self.norm_target:
            # normalize target
            self.mean_ = y.mean()
            self.std_ = y.std()
            y_norm = (y - self.mean_) / self.std_
            target = torch.tensor(y_norm).double()
        else:
            target = torch.tensor(y).double()

        # # find center (probably it could be passed as an argument)
        # center = X.mean(axis=0)
        # var = np.mean(X.var(axis=0))
        # # RBF as sample weights
        # sample_weight = np.exp(-((X - center) ** 2.0).sum(axis=1) / (2.0 * var))

        if self.init_type == "target":
            init_target = y if not self.norm_target else y_norm
        elif self.init_type == "ones":
            init_target = np.ones_like(y)
        elif self.init_type == "zeros":
            init_target = np.zeros_like(y)
        elif type(self.init_type) == float:
            init_target = np.random.normal(0.0, self.init_type, size=y.shape)
        else:
            raise ValueError(f"Incorrect init_type: {self.init_type}")

        if self.enable_history:
            self.history_ = []
            self.loss_history_ = []

        # init each gbm
        for i, est in enumerate(self.estimators_):
            est.fit(X[:, i:i + 1], init_target, sample_weight)

        use_opt = (self.optimal_weights is not None)

        outputs = np.zeros_like(X.T)
        # train composition
        for epoch in range(self.n_epochs):
            # compute gbms outputs
            outputs += self._predict_last_residuals(X)

            # check if it is needed to recompute weights
            if use_opt:
                opt_started = (epoch >= self.optimal_weights)
            else:
                opt_started = False

            if self.optimal_period is None:
                opt_period = True
            else:
                opt_period = (epoch % self.optimal_period == 0)

            if use_opt and opt_started and opt_period:
                opt_est = LassoCV(cv=self.optimal_cv_folds)

                opt_est.fit(outputs.T, target.numpy())
                cur_opt_weights = opt_est.coef_.ravel()
                new_weights = torch.tensor(cur_opt_weights, dtype=torch.double,
                                           requires_grad=True)
                # new_intercept = torch.tensor(opt_est.intercept_, dtype=torch.double,
                #                              requires_grad=True)
                self.weights_.data = torch.lerp(self.weights_.data, new_weights,
                                                self.optimal_rate)

                # TODO: check that intercept in regression is close to zero

            cur_outputs = torch.tensor(outputs, dtype=torch.double,
                                       requires_grad=True)

            cumulative_pred = (self.weights_.unsqueeze(1) * cur_outputs).sum(dim=0)

            # calculate loss and gradients
            # MSE loss
            loss = ((target - cumulative_pred) ** 2).mean()
            # loss += self.eta * ((cur_outputs.mean(dim=0) - 1) ** 2).sum().sqrt()
            self.weights_.retain_grad()
            cur_outputs.retain_grad()
            loss.backward()

            # update weights
            self.weights_.data -= self.weights_lr * self.weights_.grad.data

            # update gbms
            for i, est in enumerate(self.estimators_[:n_features]):
                est.append(X[:, i:i + 1], -cur_outputs.grad[i].data.numpy(),
                           sample_weight=sample_weight)

            # clear gradients
            self.weights_.grad.data.zero_()

            # update history
            if self.enable_history:
                self.history_.append(self.weights_.data.numpy().copy())
                self.loss_history_.append(loss.item())

        self.coef_ = self.weights_.data.numpy().copy()

        if self.enable_history:
            self.history_ = np.stack(self.history_, axis=0)

        return self
Code Example #17
def intrinsic_dimension(X: np.ndarray, k1: int = 6, k2: int = 12,
                        estimator: str = 'levina', metric: str = 'vector',
                        trafo: str = 'var', mem_threshold: int = 5000):
    """Calculate intrinsic dimension based on the MLE by Levina and Bickel [1]_.
    
    Parameters
    ----------
    X : ndarray
        - An ``n x m`` vector data matrix with ``n`` objects in an
          ``m``-dimensional feature space
        - An ``n x n`` distance matrix.
        
        NOTE: The type must be defined via parameter `metric`!
        
    k1 : int, optional (default: 6)
        Start of neighborhood range to search in.
        
    k2 : int, optional (default: 12)
        End of neighborhood range to search in.
        
    estimator : {'levina', 'mackay'}, optional (default: 'levina')
        Determine the summation strategy: see [2]_.
    
    metric : {'vector', 'distance'}, optional (default: 'vector')
        Determine data type of `X`. 
        
        NOTE: the MLE was derived for euclidean distances. Using 
        other dissimilarity measures may lead to undefined results.
        
    trafo : {None, 'std', 'var'}, optional (default: 'var')
        Transform vector data. 
        
        - None: no transformation
        - 'std': standardization 
        - 'var': subtract mean, divide by variance (the default behavior of
          Laurens van der Maaten's DR toolbox, and most likely of other
          ID/DR implementations as well).

    mem_threshold : int, optional, default: 5000
        Controls speed-memory usage trade-off: If number of points is higher
        than the given value, don't calculate complete distance matrix at
        once (fast, high memory), but per row (slower, less memory).

    Returns
    -------
    d_mle : int
        Intrinsic dimension estimate (rounded to the nearest integer)
    
    References
    ----------
    .. [1] Levina, E., & Bickel, P. (2004). Maximum likelihood estimation of 
           intrinsic dimension. Advances in Neural Information …, 17, 777–784. 
           http://doi.org/10.2307/2335172
    .. [2] http://www.inference.phy.cam.ac.uk/mackay/dimension/
    """
    n = X.shape[0]
    if estimator not in ['levina', 'mackay']:
        raise ValueError("Parameter 'estimator' must be 'levina' or 'mackay'.")
    if k1 < 1 or k2 < k1 or k2 >= n:
        raise ValueError("Invalid neighborhood: Please make sure that "
                         "0 < k1 <= k2 < n. (Got k1={} and k2={}).".
                         format(k1, k2))
    X = X.copy().astype(float)
        
    if metric == 'vector':
        # Sort rows lexicographically (note: this does not deduplicate rows)
        X = X[np.lexsort(np.fliplr(X).T)]
        
        if trafo is None:
            pass
        elif trafo == 'var':
            X -= X.mean(axis=0) # broadcast
            X /= X.var(axis=0) + 1e-7 # broadcast
        elif trafo == 'std':
            # Standardization
            X -= X.mean(axis=0) # broadcast
            X /= X.std(axis=0) + 1e-7 # broadcast
        else:
            raise ValueError("Transformation must be None, 'std', or 'var'.")
        
        # Compute matrix of log nearest neighbor distances
        X2 = (X**2).sum(1)
        
        if n <= mem_threshold: # speed-memory trade-off
            distance = X2.reshape(-1, 1) + X2 - 2*np.dot(X, X.T) #2x br.cast
            distance.sort(1)
            # Replace invalid values with a small number
            distance[distance<0] = 1e-7
            knnmatrix = .5 * np.log(distance[:, 1:k2+1])
        else:
            knnmatrix = np.zeros((n, k2))
            for i in range(n):
                distance = np.sort(X2[i] + X2 - 2 * np.dot(X, X[i, :]))
                # Replace invalid values with a small number
                distance[distance < 0] = 1e-7
                knnmatrix[i, :] = .5 * np.log(distance[1:k2+1])
    
    elif metric == 'distance':
        raise NotImplementedError("ID currently only supports vector data.")
        #=======================================================================
        # # TODO calculation WRONG
        # X.sort(1)
        # X[X < 0] = 1e-7
        # knnmatrix = np.log(X[:, 1:k2+1])
        #=======================================================================
    elif metric == 'similarity':
        raise NotImplementedError("ID currently only supports vector data.")
        #=======================================================================
        # # TODO calculation WRONG
        # print("WARNING: using similarity data may return "
        #       "undefined results.", file=sys.stderr)
        # X[X < 0] = 0
        # distance = 1 - (X / X.max())
        # knnmatrix = np.log(distance[:, 1:k2+1])
        #=======================================================================
    else:
        raise ValueError("Parameter 'metric' must be 'vector' or 'distance'.")
    
    # Compute the ML estimate
    S = np.cumsum(knnmatrix, 1)
    indexk = np.arange(k1, k2+1) # broadcasted afterwards
    dhat = -(indexk - 2) / (S[:, k1-1:k2] - knnmatrix[:, k1-1:k2] * indexk)
       
    if estimator == 'levina':  
        # Average over estimates and over values of k
        no_dims = dhat.mean()
    if estimator == 'mackay':
        # Average over inverses
        dhat **= -1
        dhat_k = dhat.mean(0)
        no_dims = (dhat_k ** -1).mean()
           
    return int(no_dims.round())
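A usage sketch (illustrative, not part of the original module): points sampled from a low-dimensional linear subspace embedded in a higher-dimensional space should yield an estimate close to the subspace dimension.

import numpy as np

rng = np.random.default_rng(42)
latent = rng.normal(size=(500, 3))     # 3 intrinsic dimensions
embedding = rng.normal(size=(3, 10))   # embed into 10 ambient dimensions
X = latent @ embedding

d_mle = intrinsic_dimension(X, k1=6, k2=12, estimator='levina',
                            metric='vector', trafo='var')
print(d_mle)  # expected to be roughly 3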
Code Example #18
File: pareto_front.py Project: qdbp/cars
    def _find_pareto_points(self, points: np.ndarray) -> np.ndarray:
        """
        Runs the finder on the given points.

        Args:
            points: (n, k) array of n points of k dimensions.
                The convention is that the

        Returns:
            indices into [points] of the approximate pareto set
        """

        n, k = points.shape

        # specialcase some stupid inputs
        if n == 0:
            return np.array([], dtype=np.uint64)

        # specialcased for normalization (see below) to work
        elif n == 1:
            return np.array([0], dtype=np.uint64)

        elif k == 0:
            raise ValueError(
                "No pareto-optimal set of points with no features")

        # we normalize our point sets
        points = (points - points.mean(axis=0)) / points.std(axis=0)

        # for finding qhull faces pointing toward optimality
        test_vector = np.zeros(k + 1)
        test_vector[:-1] = -1

        # vertices in the candidate pareto set
        vertex_mask = np.zeros(len(points), dtype=bool)
        if self.n_peel == 0:
            vertex_mask[:] = True
        else:
            for layer in range(self.n_peel):

                # construct hull of all vertices NOT already in the set
                try:
                    qhull = sss.ConvexHull(points[~vertex_mask])
                except (QhullError, ValueError):
                    break

                # noinspection PyUnresolvedReferences
                pareto_side = np.where((qhull.equations @ test_vector) > 0)
                # noinspection PyUnresolvedReferences
                pareto_vertices = np.unique(
                    qhull.simplices[pareto_side].ravel())
                vertex_mask[pareto_vertices] = True

        # list of indices into points in the candidate set of dominator points
        pareto_vertices = np.where(vertex_mask)[0]

        if self.eliminate_dominated:
            # estimate goodness as the total score
            # NB. this is where it helps to be normalized
            goodness_order = np.argsort(points[pareto_vertices].sum(axis=1))

            # from the most promising dominator
            for dominator_ix in goodness_order:
                # and the most promising dominated
                for dominated_ix in goodness_order[::-1]:
                    if np.all(points[dominator_ix] > points[dominated_ix]):
                        # mark that vertex as dominated
                        pareto_vertices[dominated_ix] = -1
                        break

        return pareto_vertices[pareto_vertices >= 0]
Code Example #19
    def set_emul_error_func(self, x_cv: np.ndarray, y_cv_err: np.ndarray) -> None:
        self.emul_error: Callable[[np.ndarray], np.ndarray] = lambda x: y_cv_err.std()
Code Example #20
def szekely_rizzo(x: np.ndarray,
                  y: np.ndarray,
                  *,
                  standardize: bool = True) -> float:
    r"""
    Compute the Székely-Rizzo energy distance dissimilarity metric based on an analogy with Newton's gravitational potential energy.

    This method is scale-invariant when `standardize=True` (default), scale-dependent otherwise.

    Parameters
    ----------
    x : ndarray (n,d)
      Reference sample.
    y : ndarray (m,d)
      Candidate sample.
    standardize : bool
      If True (default), the standardized Euclidean norm is used instead of the conventional one.

    Returns
    -------
    float
      Székely-Rizzo's energy distance dissimilarity metric ranging from 0 to infinity.

    Notes
    -----
    The e-distance between two variables :math:`X`, :math:`Y` (target and candidates) of
    sizes :math:`n,d` and :math:`m,d` proposed by [SR2004]_ is defined by:

    .. math::

        e(X, Y) = \frac{n m}{n + m} \left[2\phi_{xy} - \phi_{xx} - \phi_{yy} \right]

    where

    .. math::

        \phi_{xy} &= \frac{1}{n m} \sum_{i = 1}^n \sum_{j = 1}^m \left\Vert X_i − Y_j \right\Vert \\
        \phi_{xx} &= \frac{1}{n^2} \sum_{i = 1}^n \sum_{j = 1}^n \left\Vert X_i − X_j \right\Vert \\
        \phi_{yy} &= \frac{1}{m^2} \sum_{i = 1}^m \sum_{j = 1}^m \left\Vert X_i − Y_j \right\Vert \\

    and where :math:`\Vert\cdot\Vert` denotes the Euclidean norm and :math:`X_i` denotes the i-th
    observation of :math:`X`. When `standardize=False`, this corresponds to the :math:`T`
    test of [RS2016]_ (p. 28) and to the ``eqdist.e`` function of the `energy` R package
    (with two samples), and gives results twice as big as :py:func:`xclim.sdba.processing.escore`.
    The standardization was added following the logic of [Grenier2013]_ to make the metric scale-invariant.

    References
    ----------
    .. [SR2004] Székely, G. J. and Rizzo, M. L. (2004) Testing for Equal Distributions in High Dimension, InterStat, November (5)
    .. [RS2016] Rizzo, M. L., & Székely, G. J. (2016). Energy distance. Wiley Interdisciplinary Reviews: Computational Statistics, 8(1), 27–38. https://doi.org/10.1002/wics.1375
    """
    n, _ = x.shape
    m, _ = y.shape

    # Mean of the distance pairs
    # We are not taking "mean" because of the condensed output format of pdist
    if standardize:
        v = (x.std(axis=0, ddof=1) * y.std(axis=0, ddof=1)).astype(np.double)
        sXY = spatial.distance.cdist(x, y, "seuclidean", V=v).sum() / (n * m)
        sXX = spatial.distance.pdist(x, "seuclidean", V=v).sum() * 2 / n**2
        sYY = spatial.distance.pdist(y, "seuclidean", V=v).sum() * 2 / m**2
    else:
        sXY = spatial.distance.cdist(x, y, "euclidean").sum() / (n * m)
        sXX = spatial.distance.pdist(x, "euclidean").sum() * 2 / n**2
        sYY = spatial.distance.pdist(y, "euclidean").sum() * 2 / m**2
    w = n * m / (n + m)
    return w * (2 * sXY - sXX - sYY)
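A usage sketch under the same imports as above (numpy as np, scipy.spatial as spatial); since this is a dissimilarity metric, larger values indicate more dissimilar samples and identical samples give 0:

import numpy as np
from scipy import spatial

rng = np.random.default_rng(1)
x = rng.normal(size=(100, 2))
y_same = rng.normal(size=(100, 2))            # same distribution as x
y_shift = rng.normal(loc=3.0, size=(100, 2))  # clearly shifted candidate

# The shifted candidate should yield a noticeably larger dissimilarity.
print(szekely_rizzo(x, y_same), szekely_rizzo(x, y_shift))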
Code Example #21
def normalize(x: np.ndarray) -> np.ndarray:
    return (x - x.mean()) / x.std()
Code Example #22
File: transformation.py Project: woshituobaye/pymia
def _normalize(arr: np.ndarray):
    return (arr - arr.mean()) / arr.std()
Code Example #23
File: dei.py Project: OMaraLab/lipyds
    def _fit_binomial(self, data: dict, n_near_species: np.ndarray,
                      n_near: np.ndarray, n_species: np.ndarray,
                      n_all: np.ndarray, n_near_tot: int, n_all_tot: int):
        """
        This function computes the following approximate probability
        distributions and derives statistics accordingly.

        * The number of lipids near the protein is represented as a
        normal distribution.
        * The fraction of lipids near the protein follows a
        hypergeometric distribution.
        * The enrichment is represented as the log-normal distribution
        derived from the ratio of two binomial convolutions of the
        frame-by-frame binomial distributions.

        All these approximations assume that each frame or observation is
        independent. The binomial approximation assumes that:

        * the number of the lipid species near the protein is
        small compared to the total number of that lipid species
        * the total number of all lipids is large
        * the fraction (n_species / n_all) is not close to 0 or 1.

        .. note::

            The enrichment p-value is calculated from the log-normal
            distribution of the null hypothesis: that the average
            enrichment is representative of the ratio of
            n_species : n_all

        """

        summary = {
            "Total # lipids, all": n_all_tot,
            "Total # lipids, shell": n_near_tot
        }
        p_time = data['Fraction near protein']
        summary['Total # species, shell'] = N = n_near_species.sum()
        summary['Total # species, all'] = N_sp = n_species.sum()
        if n_near_tot:  # catch zeros
            p_shell = N / n_near_tot
        else:
            p_shell = 0
        if n_all_tot:
            p_null = N_sp / n_all_tot
        else:
            p_null = 0

        # n events: assume normal
        summary['Mean # species, shell'] = n_near_species.mean()
        summary['SD # species, shell'] = sd = n_near_species.std()

        # actually hypergeometric, but binomials are easier
        # X ~ B(n_near_tot, p_shell)
        summary['Mean fraction of species, shell'] = p_shell
        summary['SD fraction of species, shell'] = sd_frac = sd / n_near.mean()

        if p_null == 0:
            summary['Mean enrichment'] = 1
            summary['SD enrichment'] = 0

        else:
            summary['Mean enrichment'] = p_shell / p_null
            summary['SD enrichment'] = sd_frac / p_null

        return summary