Example 1
# assumed imports: ``logf`` is ``math.log``, and ``itervalues`` comes from
# passlib's py2/py3 compat shim (it is simply ``dict.values`` under py3)
from collections import defaultdict
from math import log as logf

def _self_info_rate(source):
    """
    returns 'rate of self-information' --
    i.e. average (per-symbol) entropy of the sequence **source**,
    where probability of a given symbol occurring is calculated based on
    the number of occurrences within the sequence itself.

    if all elements of the source are unique, this should equal ``log(len(source), 2)``.

    :arg source:
        iterable containing 0+ symbols
        (e.g. list of strings or ints, string of characters, etc).

    :returns:
        float bits of entropy
    """
    try:
        size = len(source)
    except TypeError:
        # if len() doesn't work, calculate size by summing counts later
        size = None
    counts = defaultdict(int)
    for char in source:
        counts[char] += 1
    if size is None:
        values = counts.values()
        size = sum(values)
    else:
        values = itervalues(counts)
    if not size:
        return 0
    # NOTE: the following performs ``- sum(value / size * logf(value / size, 2) for value in values)``,
    #       it just does so with as much pulled out of the sum() loop as possible...
    return logf(size, 2) - sum(value * logf(value, 2)
                               for value in values) / size
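The NOTE above relies on the identity ``-sum(v/n * log2(v/n)) == log2(n) - sum(v * log2(v))/n``, which hoists the division by ``size`` out of the loop. A minimal sanity check of the factored form against the direct one (the helper name and test strings here are made up for illustration):

from collections import defaultdict
from math import log

def direct_rate(source):
    # unfactored form: -sum(p * log2(p)) with p = count / size
    counts = defaultdict(int)
    for char in source:
        counts[char] += 1
    size = len(source)
    return -sum(v / size * log(v / size, 2) for v in counts.values())

# all-unique symbols -> rate == log2(len(source)) == 2.0
assert abs(direct_rate("abcd") - 2.0) < 1e-9
# "aabb": two symbols at probability 1/2 each -> 1 bit/symbol
assert abs(direct_rate("aabb") - 1.0) < 1e-9
# the factored form above should agree on any input
assert abs(direct_rate("aabbbc") - _self_info_rate("aabbbc")) < 1e-9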
Example 2
def __init__(self, wordset=None, preset=None, spaces=True, **kwds):
    if not (wordset or preset):
        preset = default_wordset
    if preset:
        if wordset:
            raise TypeError(_PCW_MSG)
        wordset = wordsets[preset]
        if wordset is None:
            wordset = _load_wordset(preset)
    # materialize single-pass iterables *before* the duplicate check,
    # which would otherwise consume them
    if not isinstance(wordset, (list, tuple)):
        wordset = tuple(wordset)
    if len(set(wordset)) != len(wordset):
        raise ValueError("`wordset` cannot contain duplicate elements")
    self.wordset = wordset
    self.entropy_rate = logf(len(wordset), 2)
    super(PhraseGenerator, self).__init__(**kwds)
    # NOTE: regarding min_chars:
    #       in order to ensure a brute-force attack against the underlying
    #       charset isn't more successful than one against the wordset,
    #       we need to reject any passwords which contain so many short
    #       words that ``chars_in_phrase * entropy_per_char <
    #                    words_in_phrase * entropy_per_word``.
    #       this is done by finding the minimum chars required to invalidate
    #       the inequality, and then rejecting any phrases that are shorter.
    self._entropy_per_char = _average_wordset_entropy(wordset)
    self._min_chars = int(self.entropy / self._entropy_per_char)
    if spaces:
        self._min_chars += self.size - 1
        self._sep = _USPACE
    else:
        self._sep = _UEMPTY
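The min_chars inequality in the NOTE above is easiest to see with concrete numbers. A hedged sketch -- the phrase length, wordset size, and per-char entropy below are all invented for illustration, not taken from the library:

from math import log

words_in_phrase = 4
wordset_size = 7776                                  # e.g. a Diceware-sized list
entropy_per_word = log(wordset_size, 2)              # ~12.92 bits/word
phrase_entropy = words_in_phrase * entropy_per_word  # ~51.7 bits

entropy_per_char = 4.3  # assumed _average_wordset_entropy() result

# shortest phrase whose per-char entropy still covers the phrase entropy
min_chars = int(phrase_entropy / entropy_per_char)   # 12
min_chars += words_in_phrase - 1                     # +3 separator spaces -> 15

Any generated phrase shorter than this would be easier to brute-force character by character than word by word, so it gets rejected.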
Example 3
# assumed imports, as in Example 1
from collections import defaultdict
from math import log as logf

def _self_info_rate(source):
    try:
        size = len(source)
    except TypeError:
        # no len() -- derive the size from the symbol counts instead
        size = None

    counts = defaultdict(int)
    for char in source:
        counts[char] += 1

    if size is None:
        values = counts.values()
        size = sum(values)
    else:
        values = itervalues(counts)
    if not size:
        return 0

    return logf(size, 2) - sum(value * logf(value, 2) for value in values) / size
Example 4
from math import log as logf  # assumed alias for math.log

def _max_average_entropy(target, source):
    """calculate maximum _average_entropy() of all possible
    strings of length <target>, drawn from a set of symbols
    of size <source>.
    """
    # NOTE: this accomplishes its purpose by assuming maximum self-information
    #       comes from a string repeating all symbols ``floor(target/source)``
    #       times, followed by the first ``target % source`` symbols repeated
    #       once more.
    assert target > 0
    assert source > 0
    if target < source:
        # special case of the general equation, to prevent an intermediate
        # math domain error (log of zero) when ``q == 0``.
        return logf(target, 2)
    else:
        q, r = divmod(target, source)
        p1 = (q + 1) / target
        p2 = q / target
        return -(r * p1 * logf(p1, 2) + (source - r) * p2 * logf(p2, 2))
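The closed form can be checked against the maximal string the NOTE describes by building that string explicitly and measuring it directly. A hedged verification sketch, assuming the _self_info_rate() from Example 1 is in scope:

def _check(target, source):
    symbols = list(range(source))
    q, r = divmod(target, source)
    # repeat every symbol q times, then the first r symbols once more
    maximal = symbols * q + symbols[:r]
    assert len(maximal) == target
    direct = _self_info_rate(maximal)
    closed = _max_average_entropy(target, source)
    assert abs(direct - closed) < 1e-9, (direct, closed)

_check(10, 4)   # general case: q=2, r=2
_check(12, 12)  # r == 0: every symbol appears exactly once
_check(3, 7)    # target < source: rate == log2(target)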
Example 5
# assumed imports, as in Example 1
from collections import defaultdict
from math import log as logf

def _average_entropy(source, total=False):
    """returns the rate of self-information in a sequence of symbols
    (or the total self-information if total=True).

    this is equivalent to the average entropy of a given symbol,
    using the sequence itself as the symbol probability distribution.
    if all elements of the source are unique, this should equal
    ``log(len(source), 2)``.

    :arg source:
        iterable containing 0+ symbols
    :param total:
        instead of returning the average entropy rate,
        return the total self-information
    :returns:
        float bits
    """
    try:
        size = len(source)
    except TypeError:
        # if len() doesn't work, calculate size by summing counts later
        size = None
    counts = defaultdict(int)
    for char in source:
        counts[char] += 1
    if size is None:
        values = counts.values()
        size = sum(values)
    else:
        values = itervalues(counts)
    if not size:
        return 0
    ### NOTE: below code performs the calculation
    ###       ``- sum(value / size * logf(value / size, 2) for value in values)``,
    ###       and then multiplies by ``size`` if total is True,
    ###       it just does so with fewer operations.
    tmp = sum(value * logf(value, 2) for value in values)
    if total:
        return size * logf(size, 2) - tmp
    else:
        return logf(size, 2) - tmp / size
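Since the total is just the rate scaled by the sequence length, the two return paths are related by a factor of ``size``. A quick hedged usage check (assuming the _average_entropy() above is in scope; the test string is arbitrary):

seq = "aabbbc"
rate = _average_entropy(seq)                # bits per symbol
total = _average_entropy(seq, total=True)   # bits for the whole sequence
assert abs(total - rate * len(seq)) < 1e-9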
Example 6
def __init__(self, charset=None, preset=None, **kwds):
    if not (charset or preset):
        preset = default_charset
    if preset:
        if charset:
            raise TypeError(_PCW_MSG)
        charset = charsets[preset]
    if len(set(charset)) != len(charset):
        raise ValueError("`charset` cannot contain duplicate elements")
    self.charset = charset
    self.entropy_rate = logf(len(charset), 2)
    super(WordGenerator, self).__init__(**kwds)
Example 7
def entropy_per_symbol(self):
    """
    Average entropy per symbol (assuming all symbols have equal probability)
    """
    return logf(self.symbol_count, 2)
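For uniformly chosen symbols this is just log2 of the alphabet size. For instance (plain math.log standing in for logf; the alphabet sizes are illustrative):

from math import log

log(26, 2)   # lowercase letters  -> ~4.70 bits/symbol
log(95, 2)   # printable ASCII    -> ~6.57 bits/symbol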