def _self_info_rate(source): """ returns 'rate of self-information' -- i.e. average (per-symbol) entropy of the sequence **source**, where probability of a given symbol occurring is calculated based on the number of occurrences within the sequence itself. if all elements of the source are unique, this should equal ``log(len(source), 2)``. :arg source: iterable containing 0+ symbols (e.g. list of strings or ints, string of characters, etc). :returns: float bits of entropy """ try: size = len(source) except TypeError: # if len() doesn't work, calculate size by summing counts later size = None counts = defaultdict(int) for char in source: counts[char] += 1 if size is None: values = counts.values() size = sum(values) else: values = itervalues(counts) if not size: return 0 # NOTE: the following performs ``- sum(value / size * logf(value / size, 2) for value in values)``, # it just does so with as much pulled out of the sum() loop as possible... return logf(size, 2) - sum(value * logf(value, 2) for value in values) / size
def _average_entropy(source, total=False): """returns the rate of self-information in a sequence of symbols, (or total self-information if total=True). this is eqvuialent to the average entropy of a given symbol, using the sequence itself as the symbol probability distribution. if all elements of the source are unique, this should equal ``log(len(source), 2)``. :arg source: iterable containing 0+ symbols :param total: instead of returning average entropy rate, return total self-information :returns: float bits """ try: size = len(source) except TypeError: # if len() doesn't work, calculate size by summing counts later size = None counts = defaultdict(int) for char in source: counts[char] += 1 if size is None: values = counts.values() size = sum(values) else: values = itervalues(counts) if not size: return 0 ### NOTE: below code performs the calculation ### ``- sum(value / size * logf(value / size, 2) for value in values)``, ### and then multplies by ``size`` if total is True, ### it just does it with fewer operations. tmp = sum(value * logf(value, 2) for value in values) if total: return size * logf(size, 2) - tmp else: return logf(size, 2) - tmp / size