Example #1
def interaction_information(ar, ks=None, estimator='NSB', axis=0, r=None):
    """Interaction information from data matrix.

    See Eq.10 in:
    Timme, Nicholas, et al.
    "Synergy, redundancy, and multivariate information measures:
    an experimentalist's perspective."
    Journal of computational neuroscience 36.2 (2014): 119-140.

    Parameters
    ----------
    ar : array-like
        n-by-p array of n samples from p discrete variables.
    ks : 1D array of length p, optional
        Alphabet size for each variable.
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.
    axis : int, optional
        The sample-indexing axis. Defaults to 0.
    r : int, optional; 1<=r<=p.
        If passed, return a generator yielding estimates for the p-choose-r
        possible combinations of r variables.
        Combinations are ordered as: list(itertools.combinations(range(p), r)).

    Returns
    -------
    float
        Interaction information estimate.

    Raises
    ------
    CardinalityError
        If len(ks) != p.

    """
    def iinfo(X, ks, estimator):
        """Helper function for interaction information definition.

        Ref: timme2014synergy
        """
        info = 0.0
        S = len(X)
        for T in range(1, S + 1):
            sgn = (-1)**(S - T)
            info += sgn * sum(from_data(X, ks=ks, estimator=estimator, r=T))
        return -info

    estimator, _ = check_estimator(estimator)

    if not isinstance(ar, DataArray):
        ar = DataArray(ar, ks=ks, axis=axis)

    if r is not None:
        return (iinfo(data, k, estimator) for data, k in ar.iter_data(r=r))

    data, k = ar.iter_data()
    return iinfo(data, k, estimator)
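
A minimal usage sketch, assuming the function is importable at package level as ndd.interaction_information (the import path and the alphabet sizes below are illustrative assumptions, not confirmed by this listing):

import numpy
import ndd  # assumed package-level import

rng = numpy.random.default_rng(0)
ar = rng.integers(0, 3, size=(1000, 3))  # 1000 samples of 3 ternary variables

# Single interaction-information estimate over all 3 variables.
ii = ndd.interaction_information(ar, ks=[3, 3, 3])

# With r=2: a generator over the 3-choose-2 variable pairs,
# ordered as list(itertools.combinations(range(3), 2)).
pairwise = list(ndd.interaction_information(ar, ks=[3, 3, 3], r=2))
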
Example #2
def kullback_leibler_divergence(pk, qk, k=None, estimator='NSB'):
    """
    Kullback-Leibler divergence given counts pk and a reference PMF qk.

    Return an estimate of the Kullback-Leibler divergence given an array of
    counts pk and a reference PMF qk. The estimate (in nats) is computed as:
    - S_p - sum(pk * log(qk)) / sum(pk)
    where S_p is the entropy estimate from counts pk.

    Parameters
    ----------
    pk : array_like
        The number of occurrences of a set of bins.
    qk : array_like
        Reference PMF in sum(pk log(pk/qk)). len(qk) = len(pk).
        Must be a valid PMF (non-negative, normalized).
    k : int or array-like, optional
        Total number of bins (including unobserved bins); k >= p.
        A float is a valid input for whole numbers (e.g. k=1.e3).
        If an array, set k = numpy.prod(k). Defaults to len(pk).
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.

    Returns
    -------
    float
        Kullback-Leibler divergence.

    """

    if is_pmf(qk):
        log_qk = numpy.log(qk)
    else:
        raise PmfError('qk must be a valid PMF')

    if numpy.any(numpy.isinf(log_qk)):
        raise PmfError('qk must be positive')

    if len(log_qk) != len(pk):
        raise PmfError('qk and pk must have the same length.')

    if k == 1:  # single bin
        return 0.0
    if k is None:
        k = len(pk)

    estimator, _ = check_estimator(estimator)
    estimate = estimator.fit(pk, k=k).estimate_
    kl = -(estimate + numpy.sum(pk * log_qk) / float(sum(pk)))
    if numpy.isnan(kl):
        logger.warning('nan value for KL divergence')
        kl = numpy.nan

    return kl
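
A minimal usage sketch, assuming the function is exposed as ndd.kullback_leibler_divergence (an assumption); qk must be a strictly positive, normalized PMF, as enforced above:

import numpy
import ndd  # assumed package-level import

qk = numpy.full(4, 0.25)            # reference PMF: uniform over 4 bins
pk = numpy.array([30, 20, 25, 25])  # observed counts over the same 4 bins

# NSB-based estimate in nats; k defaults to len(pk).
kl = ndd.kullback_leibler_divergence(pk, qk)
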
Example #3
def entropy(pk, k=None, estimator='NSB', return_std=False):
    """
    Entropy estimate from an array of counts.

    Return a Bayesian estimate for the entropy of an unknown discrete
    distribution from an input array of counts pk.

    Parameters
    ----------
    pk : array-like
        The number of occurrences of a set of bins.
    k : int or array-like, optional
        Alphabet size (the number of bins with non-zero probability).
        Must be >= len(pk). A float is a valid input for whole numbers
        (e.g. k=1.e3). If an array, set k = numpy.prod(k).
        Default: k = sum(pk > 0)
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.
    return_std : boolean, optional
        If True, also return an approximation for the standard deviation
        over the entropy posterior.

    Returns
    -------
    entropy : float
        Entropy estimate.
    std : float, optional
        Uncertainty in the entropy estimate. Only if `return_std` is True.

    """

    estimator, _ = check_estimator(estimator)

    pk = numpy.asarray(pk)
    if k is None:
        k = sum(pk > 0)

    estimator = estimator.fit(pk, k=k)
    S, err = estimator.estimate_, estimator.err_

    if numpy.isnan(S):
        logger.warning('nan value for entropy estimate')
        S = numpy.nan

    if return_std:
        if err is not None and numpy.isnan(err):
            err = numpy.nan
            logger.warning('nan value for entropy posterior std deviation')
        return S, err

    return S
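
A minimal usage sketch, assuming the function is exposed as ndd.entropy (an assumption); the counts and alphabet size are illustrative:

import numpy
import ndd  # assumed package-level import

counts = numpy.array([7, 3, 5, 8, 9, 1, 3, 3, 1, 0, 2, 5, 2, 11, 4, 23, 5, 0, 8, 0])

# NSB estimate (in nats) over an alphabet of 100 possible bins,
# together with the posterior standard deviation.
h, std = ndd.entropy(counts, k=100, return_std=True)
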
Example #4
def jensen_shannon_divergence(pk, k=None, estimator='NSB'):
    """
    Return the Jensen-Shannon divergence from an m-by-p matrix of counts.

    Return an estimate of the Jensen-Shannon divergence between
    m unknown discrete distributions from an m-by-p input array of counts.
    The estimate (in nats) is computed as a combination of single Bayesian
    entropy estimates. If the total number of samples varies among the
    distributions, the function returns a weighted divergence with weights
    proportional to the total number of samples in each row
    (see the general definition of Jensen-Shannon divergence:
    https://en.wikipedia.org/wiki/Jensen-Shannon_divergence).

    Parameters
    ----------
    pk : array-like, shape (m, p)
        Matrix of frequency counts. Each row corresponds to the number of
        occurrences of a set of bins from a different distribution.
    k : int or array-like, optional
        Total number of bins (including unobserved bins); k >= p.
        A float is a valid input for whole numbers (e.g. k=1.e3).
        If an array, set k = numpy.prod(k). Defaults to p.
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.

    Returns
    -------
    float
        Jensen-Shannon divergence.

    """

    estimator, _ = check_estimator(estimator)

    estimator = JSDivergence(estimator).fit(pk, k=k)
    js = estimator.estimate_

    if numpy.isnan(js):
        logger.warning('nan value for JS divergence')
        js = numpy.nan

    return js
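
A minimal usage sketch, assuming the function is exposed as ndd.jensen_shannon_divergence (an assumption):

import numpy
import ndd  # assumed package-level import

# Two rows of counts over the same 4 bins, one row per distribution.
# The totals differ, so the estimate is weighted by sample size.
pk = numpy.array([[10, 20, 30, 40],
                  [5, 5, 5, 5]])
jsd = ndd.jensen_shannon_divergence(pk, k=4)
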
Example #5
def mutual_information(ar, ks=None, estimator='NSB', axis=0):
    """Mutual information from p-by-n data matrix.

    If p > 2, return an estimate of the mutual information for each possible
    pair of variables, ordered as list(itertools.combinations(range(p), r=2)).

    Parameters
    ----------
    ar : array-like
        n-by-p array of n samples from p discrete variables.
    ks : 1D p-dimensional array, optional
        Alphabet size for each variable.
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.
    axis : int, optional
        The sample-indexing axis. Defaults to 0.

    Returns
    -------
    float or generator
        Mutual information estimate. If p > 2, a generator yielding one
        estimate per variable pair.

    Raises
    ------
    CardinalityError
        If len(ks) != p.

    """
    estimator, _ = check_estimator(estimator)

    if not isinstance(ar, DataArray):
        ar = DataArray(ar, ks=ks, axis=axis)

    p = ar.shape[0]

    if p > 2:
        h1 = list(from_data(ar, r=1, estimator=estimator))
        return (h1[i1] + h1[i2] - from_data(ar[i1, i2], estimator=estimator)
                for i1, i2 in combinations(range(p), 2))

    return (sum(from_data(ar, r=1, estimator=estimator)) -
            from_data(ar, estimator=estimator))
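
A minimal usage sketch, assuming the function is exposed as ndd.mutual_information (an assumption); the data are illustrative:

import numpy
import ndd  # assumed package-level import

rng = numpy.random.default_rng(0)
x = rng.integers(0, 2, size=1000)
y = (x + rng.integers(0, 2, size=1000)) % 2  # correlated with x
z = rng.integers(0, 2, size=1000)            # independent noise
ar = numpy.column_stack([x, y, z])

# p == 3 > 2: a generator with one estimate per variable pair,
# ordered as list(itertools.combinations(range(3), 2)).
pairwise_mi = list(ndd.mutual_information(ar, ks=[2, 2, 2]))

# p == 2: a single estimate.
mi_xy = ndd.mutual_information(ar[:, :2], ks=[2, 2])
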
Example #6
def from_data(ar, ks=None, estimator='NSB', axis=0, r=None):
    """
    Entropy estimate from data matrix.

    Parameters
    ----------
    ar : array-like, shape (n, p)
        2D array of n samples from p discrete variables.
    ks : int or 1D array of length p, optional
        Alphabet size for each variable.
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.
    axis : int, optional
        The sample-indexing axis. Defaults to 0.
    r : int, optional; 1<=r<=p.
        If passed, return a generator yielding estimates for the p-choose-r
        possible combinations of r variables.

    Returns
    -------
    float
        Entropy estimate

    """

    estimator, _ = check_estimator(estimator)

    if not isinstance(ar, DataArray):
        ar = DataArray(ar, ks=ks, axis=axis)

    if r is not None:
        return (estimator(pk, k=k) for pk, k in ar.iter_counts(r=r))

    counts, k = ar.iter_counts()
    return estimator(counts, k=k)
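
A minimal usage sketch, assuming the function is exposed as ndd.from_data (an assumption):

import numpy
import ndd  # assumed package-level import

rng = numpy.random.default_rng(0)
ar = rng.integers(0, 4, size=(500, 3))  # 500 samples of 3 discrete variables

h_joint = ndd.from_data(ar)                # joint entropy of all 3 variables
h_marginal = list(ndd.from_data(ar, r=1))  # one entropy estimate per variable
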
Example #7
def conditional_entropy(ar, c, ks=None, estimator='NSB', axis=0, r=None):  # pylint: disable=too-many-arguments
    """
    Conditional entropy estimate from data matrix.

    Parameters
    ----------
    ar : array-like
        n-by-p array of n samples from p discrete variables.
    c : int or array-like
        The variables on which entropy is conditioned (as column indices).
    ks : 1D p-dimensional array, optional
        Alphabet size for each variable.
    estimator : str or entropy estimator instance, optional
        If a string, use the estimator class with the same name and default
        parameters. Check ndd.entropy_estimators for the available estimators.
        Default: use the Nemenman-Shafee-Bialek (NSB) estimator.
    axis : int, optional
        The sample-indexing axis. Defaults to 0.
    r : int or None, optional; 1<=r<=p-len(c).
        If passed, return a generator yielding estimates for all possible
        combinations of r variables conditioning on the `c` variables.
        Indices are sorted as:
        list(x for x in itertools.combinations(range(p), r=r+len(c))
             if set(c) <= set(x))

    Returns
    -------
    float
        Conditional entropy estimate

    """
    # check data shape
    if not isinstance(ar, DataArray):
        ar = DataArray(ar, ks=ks, axis=axis)

    p = ar.shape[0]

    try:
        c = list(c)
    except TypeError:
        c = [c]
    if not set(c) <= set(range(p)):
        raise EstimatorInputError('The indices of conditioning variables'
                                  ' are not valid')

    # EntropyEstimator objects are callable and return the fitted estimate
    estimator, _ = check_estimator(estimator)

    # Entropy of features on which we are conditioning
    counts, k = ar[c].iter_counts()
    hc = estimator(counts, k=k)

    if r is not None:

        # r should be <= p - len(c)

        # include the c variables in the set
        r = r + len(c)

        indices = combinations(range(p), r=r)

        return (estimator(counts, k=k) - hc
                for ids, (counts, k) in zip(indices, ar.iter_counts(r=r))
                if set(c) <= set(ids))

    counts, k = ar.iter_counts()
    return estimator(counts, k=k) - hc
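
A minimal usage sketch, assuming the function is exposed as ndd.conditional_entropy (an assumption):

import numpy
import ndd  # assumed package-level import

rng = numpy.random.default_rng(0)
ar = rng.integers(0, 3, size=(1000, 3))  # 1000 samples of 3 ternary variables

# Entropy of the remaining variables conditional on column 0.
h_cond = ndd.conditional_entropy(ar, c=0)

# With r=1: one estimate per single variable, conditioned on column 0.
h_cond_single = list(ndd.conditional_entropy(ar, c=0, r=1))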