def _self_info_rate(source): """ returns 'rate of self-information' -- i.e. average (per-symbol) entropy of the sequence **source**, where probability of a given symbol occurring is calculated based on the number of occurrences within the sequence itself. if all elements of the source are unique, this should equal ``log(len(source), 2)``. :arg source: iterable containing 0+ symbols (e.g. list of strings or ints, string of characters, etc). :returns: float bits of entropy """ try: size = len(source) except TypeError: # if len() doesn't work, calculate size by summing counts later size = None counts = defaultdict(int) for char in source: counts[char] += 1 if size is None: values = counts.values() size = sum(values) else: values = itervalues(counts) if not size: return 0 # NOTE: the following performs ``- sum(value / size * logf(value / size, 2) for value in values)``, # it just does so with as much pulled out of the sum() loop as possible... return logf(size, 2) - sum(value * logf(value, 2) for value in values) / size
def __init__(self, wordset=None, preset=None, spaces=True, **kwds):
    """
    Initialize phrase generator from an explicit wordset or a named preset.

    :param wordset:
        explicit sequence of words to draw from (must contain no duplicates).
        Mutually exclusive with *preset*.
    :param preset:
        name of a predefined wordset (looked up in ``wordsets``, lazily
        loaded via ``_load_wordset`` if not yet cached).
    :param spaces:
        if True (default), words are joined with spaces; otherwise
        concatenated directly.
    :raises TypeError: if both *wordset* and *preset* are specified.
    :raises ValueError: if *wordset* contains duplicate elements.
    """
    # neither option given -> fall back to the module-wide default preset
    if not (wordset or preset):
        preset = default_wordset
    if preset:
        if wordset:
            raise TypeError(_PCW_MSG)
        wordset = wordsets[preset]
        # presumably presets may be registered lazily as None -- load on
        # first use.  TODO(review): confirm against wordsets registry.
        if wordset is None:
            wordset = _load_wordset(preset)
    if len(set(wordset)) != len(wordset):
        raise ValueError("`wordset` cannot contain duplicate elements")
    # freeze arbitrary iterables into an indexable, immutable sequence
    if not isinstance(wordset, (list, tuple)):
        wordset = tuple(wordset)
    self.wordset = wordset
    # entropy contributed by each uniformly-chosen word
    self.entropy_rate = logf(len(wordset), 2)
    super(PhraseGenerator, self).__init__(**kwds)
    # NOTE: regarding min_chars:
    #       in order to ensure a brute force attack against underlying
    #       charset isn't more successful than one against the wordset,
    #       we need to reject any passwords which contain so many short
    #       words that ``chars_in_phrase * entropy_per_char <
    #       words_in_phrase * entropy_per_word``.
    #       this is done by finding the minimum chars required to invalidate
    #       the inequality, and then rejecting any phrases that are shorter.
    self._entropy_per_char = _average_wordset_entropy(wordset)
    self._min_chars = int(self.entropy / self._entropy_per_char)
    if spaces:
        # separators count toward the phrase's character length
        self._min_chars += self.size-1
        self._sep = _USPACE
    else:
        self._sep = _UEMPTY
def _self_info_rate(source): try: size = len(source) except TypeError: size = None else: counts = defaultdict(int) for char in source: counts[char] += 1 if size is None: values = counts.values() size = sum(values) else: values = itervalues(counts) if not size: return 0 return logf(size, 2) - sum(value * logf(value, 2) for value in values) / size
def _max_average_entropy(target, source): """calculate maximum _average_entropy() of all possible strings of length <target>, if drawn from a set of symbols of size <source>. """ # NOTE: this accomplishes it's purpose by assuming maximum self-information # would be a string repeating all symbols ``floor(target/source)`` # times, followed by the first ``target % source`` symbols repeated # once more. assert target > 0 assert source > 0 if target < source: # special case of general equation, to prevent intermediate DomainError. return logf(target, 2) else: q, r = divmod(target, source) p1 = (q + 1) / target p2 = q / target return -(r * p1 * logf(p1, 2) + (source - r) * p2 * logf(p2, 2))
def _average_entropy(source, total=False): """returns the rate of self-information in a sequence of symbols, (or total self-information if total=True). this is eqvuialent to the average entropy of a given symbol, using the sequence itself as the symbol probability distribution. if all elements of the source are unique, this should equal ``log(len(source), 2)``. :arg source: iterable containing 0+ symbols :param total: instead of returning average entropy rate, return total self-information :returns: float bits """ try: size = len(source) except TypeError: # if len() doesn't work, calculate size by summing counts later size = None counts = defaultdict(int) for char in source: counts[char] += 1 if size is None: values = counts.values() size = sum(values) else: values = itervalues(counts) if not size: return 0 ### NOTE: below code performs the calculation ### ``- sum(value / size * logf(value / size, 2) for value in values)``, ### and then multplies by ``size`` if total is True, ### it just does it with fewer operations. tmp = sum(value * logf(value, 2) for value in values) if total: return size * logf(size, 2) - tmp else: return logf(size, 2) - tmp / size
def __init__(self, charset=None, preset=None, **kwds):
    """
    Initialize word generator from an explicit charset or a named preset.

    :param charset:
        explicit sequence of symbols to draw from (no duplicates allowed).
        Mutually exclusive with *preset*.
    :param preset:
        name of a predefined charset (looked up in ``charsets``).
    :raises TypeError: if both *charset* and *preset* are specified.
    :raises ValueError: if *charset* contains duplicate elements.
    """
    # neither option given -> fall back to the module default preset
    if not charset and not preset:
        preset = default_charset
    if preset:
        if charset:
            # the two options are mutually exclusive
            raise TypeError(_PCW_MSG)
        charset = charsets[preset]
    # duplicates would skew the per-symbol entropy estimate below
    if len(set(charset)) != len(charset):
        raise ValueError("`charset` cannot contain duplicate elements")
    self.charset = charset
    # entropy contributed by each uniformly-chosen symbol
    self.entropy_rate = logf(len(charset), 2)
    super(WordGenerator, self).__init__(**kwds)
def entropy_per_symbol(self):
    """
    Entropy contributed by a single symbol, assuming every symbol in the
    set is drawn with equal probability.
    """
    symbols = self.symbol_count
    return logf(symbols, 2)
def entropy_per_symbol(self):
    """Average per-symbol entropy, assuming a uniform symbol distribution."""
    count = self.symbol_count
    return logf(count, 2)