Example #1
from collections import defaultdict
from math import log as logf


def _self_info_rate(source):
    """
    returns 'rate of self-information' --
    i.e. average (per-symbol) entropy of the sequence **source**,
    where probability of a given symbol occurring is calculated based on
    the number of occurrences within the sequence itself.

    if all elements of the source are unique, this should equal ``log(len(source), 2)``.

    :arg source:
        iterable containing 0+ symbols
        (e.g. list of strings or ints, string of characters, etc).

    :returns:
        float bits of entropy
    """
    try:
        size = len(source)
    except TypeError:
        # if len() doesn't work, calculate size by summing counts later
        size = None
    counts = defaultdict(int)
    for char in source:
        counts[char] += 1
    # on Python 3 a dict view is cheap to iterate and sum() accepts it,
    # so no py2-style itervalues() compat shim is needed here
    values = counts.values()
    if size is None:
        size = sum(values)
    if not size:
        return 0.0
    # NOTE: the following performs ``- sum(value / size * logf(value / size, 2) for value in values)``,
    #       it just does so with as much pulled out of the sum() loop as possible...
    return logf(size, 2) - sum(value * logf(value, 2)
                               for value in values) / size
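The NOTE above relies on the identity -sum((v/n) * log2(v/n)) == log2(n) - sum(v * log2(v)) / n, which follows from log2(v/n) = log2(v) - log2(n) and sum(v) == n. A minimal sanity-check sketch (assuming ``_self_info_rate`` as defined above; the sample inputs are illustrative, not from any test suite):

import math

# all symbols unique -> rate equals log2(len(source))
assert abs(_self_info_rate("abcd") - math.log(4, 2)) < 1e-9

# a single repeated symbol carries no information
assert _self_info_rate("aaaa") == 0.0

# empty input is defined as 0 bits
assert _self_info_rate("") == 0.0

# generators have no len(); the fallback path sums the counts instead
assert abs(_self_info_rate(c for c in "abab") - 1.0) < 1e-9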
Example #2
from collections import defaultdict
from math import log as logf


def _average_entropy(source, total=False):
    """returns the rate of self-information in a sequence of symbols
    (or the total self-information if ``total=True``).

    this is equivalent to the average entropy of a given symbol,
    using the sequence itself as the symbol probability distribution.
    if all elements of the source are unique, this should equal
    ``log(len(source), 2)``.

    :arg source:
        iterable containing 0+ symbols
    :param total:
        instead of returning average entropy rate,
        return total self-information
    :returns:
        float bits
    """
    try:
        size = len(source)
    except TypeError:
        # if len() doesn't work, calculate size by summing counts later
        size = None
    counts = defaultdict(int)
    for char in source:
        counts[char] += 1
    # on Python 3 a dict view is cheap to iterate and sum() accepts it,
    # so no py2-style itervalues() compat shim is needed here
    values = counts.values()
    if size is None:
        size = sum(values)
    if not size:
        return 0.0
    ### NOTE: below code performs the calculation
    ###       ``- sum(value / size * logf(value / size, 2) for value in values)``,
    ###       and then multiplies by ``size`` if total is True;
    ###       it just does so with fewer operations.
    tmp = sum(value * logf(value, 2) for value in values)
    if total:
        return size * logf(size, 2) - tmp
    else:
        return logf(size, 2) - tmp / size
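The ``total`` flag simply skips the final division: total self-information equals the per-symbol rate times the sequence length. A quick usage sketch (assuming ``_average_entropy`` as defined above; the sample values are illustrative):

source = "aabb"  # 4 symbols, 2 distinct, uniform distribution

rate = _average_entropy(source)                # per-symbol rate: 1.0 bit
total = _average_entropy(source, total=True)   # total: 4.0 bits

assert abs(rate - 1.0) < 1e-9
assert abs(total - len(source) * rate) < 1e-9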