def test_get_codes(self, d):
        """Check that the total weighted code length ignores dictionary order.

        The sum of freq * len(code) over all symbols is optimal for a Huffman
        code, so building the tree from a shuffled copy of the same frequency
        dictionary must yield the same total.
        """
        # NB: huffman_tree is exercised indirectly by this test.

        def weighted_length(freqs, codes):
            # Total number of bits needed to encode every occurrence.
            return sum([freqs[sym] * len(codes[sym]) for sym in freqs])

        baseline_codes = get_codes(huffman_tree(d))

        # Rebuild the same mapping with a randomly permuted insertion order.
        pairs = list(d.items())
        shuffle(pairs)
        shuffled = dict(pairs)
        shuffled_codes = get_codes(huffman_tree(shuffled))

        self.assertEqual(weighted_length(d, baseline_codes),
                         weighted_length(shuffled, shuffled_codes))
Beispiel #2
0
 def recover_bits(self, token_inds, remaining_bits):
     """Extract hidden bits from a sequence of token indices.

     The language model is replayed token by token, starting from the SOS
     index. At each step a Huffman tree is built for the next-token
     distribution; when its total variation is below ``self.tv_threshold``
     the step is treated as carrying bits, and the token's Huffman code is
     appended to the output ('l' => 0, anything else => 1).

     Args:
         token_inds: list of token indices. It is consumed from the front
             (mutated in place via ``pop(0)``).
         remaining_bits: number of bits still to be extracted.

     Returns:
         A list of 0/1 ints, at most ``remaining_bits`` long.
     """
     prefix = [self.lm.SOS_ind]
     p = self.lm.p_next_token(prefix)
     recovered = []
     # Stop once every token has been consumed or every bit extracted.
     while len(token_inds) > 0 and remaining_bits > 0:
         # Huffman tree for the current conditional distribution.
         tree = huffman_tree(build_min_heap(p))
         if tv_huffman(tree, p)[0] < self.tv_threshold:
             # Controlled step: this token encodes some bits.
             code_table = invert_code_tree(tree)
             token = token_inds.pop(0)
             bits = [0 if step == 'l' else 1 for step in code_table[token]]
             # The final fragment may carry padding; keep only what is owed.
             recovered.extend(bits[:remaining_bits])
             remaining_bits -= len(bits)
         else:
             # Uncontrolled step: the token carries no bits, just skip it.
             token = token_inds.pop(0)
         # Either way, advance the language model past this token.
         prefix.append(token)
         p = self.lm.p_next_token(prefix)
     return recovered
Beispiel #3
0
    def test_avg_length(self, d):
        """avg_length should return a float in the interval [0, 8]."""
        tree = huffman_tree(d)
        mean_bits = avg_length(tree, d)
        # Dedicated assertions report the offending value on failure,
        # unlike assertTrue(<boolean expression>).
        self.assertIsInstance(mean_bits, float)
        self.assertGreaterEqual(mean_bits, 0)
        self.assertLessEqual(mean_bits, 8.0)
Beispiel #4
0
    def test_round_trip(self, b):
        """Compressing and then uncompressing must give back the input."""
        frequencies = make_freq_dict(b)
        # Only inputs whose frequency dictionary has more than one entry
        # are exercised.
        assume(len(frequencies) > 1)

        root = huffman_tree(frequencies)
        packed = generate_compressed(b, get_codes(root))
        restored = generate_uncompressed(root, packed, len(b))
        assert b == restored
Beispiel #5
0
    def test_round_trip(self, b):
        """generate_uncompressed must invert generate_compressed."""
        source = b
        table = make_freq_dict(source)
        # Skip inputs whose frequency dictionary has fewer than two entries.
        assume(len(table) > 1)

        root = huffman_tree(table)
        code_map = get_codes(root)
        encoded = generate_compressed(source, code_map)
        decoded = generate_uncompressed(root, encoded, len(source))
        assert source == decoded
    def test_number_nodes(self, d):
        """An interior root must be numbered (number of symbols - 2).

        A complete tree has one fewer interior node than it has leaves, and
        numbering starts from 0, so the root (numbered last) gets
        len(d) - 2. This also tests huffman_tree indirectly.
        """
        root = huffman_tree(d)
        # The property only applies when the root is an interior node.
        assume(not root.is_leaf())
        symbol_count = len(d)
        number_nodes(root)
        self.assertEqual(symbol_count, root.number + 2)
    def test_num_nodes_to_bytes(self, b):
        """num_nodes_to_bytes returns a bytes object of length 1.

        The number of internal nodes is expected to fit in a single byte.
        This also indirectly tests make_freq_dict and huffman_tree.
        """
        freq = make_freq_dict(b)
        # Only inputs whose frequency dictionary has more than one entry
        # are exercised.
        assume(len(freq) > 1)
        tree = huffman_tree(freq)
        number_nodes(tree)
        encoded_count = num_nodes_to_bytes(tree)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(encoded_count, bytes)
        self.assertEqual(len(encoded_count), 1)
    def test_generate_compressed(self, b):
        """generate_compressed returns bytes no longer than its input.

        The size of the compressed object must also be invariant under
        permuting the input. This indirectly tests make_freq_dict,
        huffman_tree, and get_codes as well.
        """
        codes = get_codes(huffman_tree(make_freq_dict(b)))
        compressed = generate_compressed(b, codes)
        # Dedicated assertions report the offending values on failure.
        self.assertIsInstance(compressed, bytes)
        self.assertLessEqual(len(compressed), len(b))

        # Repeat the whole pipeline on a random permutation of the input.
        symbols = list(b)
        shuffle(symbols)
        permuted = bytes(symbols)
        permuted_codes = get_codes(huffman_tree(make_freq_dict(permuted)))
        compressed_permuted = generate_compressed(permuted, permuted_codes)
        self.assertEqual(len(compressed_permuted), len(compressed))
    def test_tree_to_bytes(self, b):
        """tree_to_bytes emits 4 bytes per internal node of the tree.

        tree_to_bytes generates a bytes representation of a post-order
        traversal of a tree's internal nodes. Each internal node takes 4
        bytes, and there is one fewer internal node than distinct symbols,
        so the output length must be 4 * (len(freq_dict) - 1).
        """
        # NB: also indirectly tests make_freq_dict, huffman_tree, and
        # number_nodes

        freq = make_freq_dict(b)
        # Only inputs whose frequency dictionary has more than one entry
        # are exercised.
        assume(len(freq) > 1)
        root = huffman_tree(freq)
        number_nodes(root)
        serialized = tree_to_bytes(root)
        expected_length = 4 * (len(freq) - 1)
        self.assertEqual(expected_length, len(serialized))
Beispiel #10
0
 def embed_bits(self, coin_flips):
     """Generate one token sequence from the language model, steered by bits.

     While bits remain and the Huffman approximation of the next-token
     distribution is close enough (total variation below
     ``self.tv_threshold``), bits are consumed to walk the Huffman tree
     down to a leaf, and that leaf is the next token. Otherwise the next
     token is sampled from the language model as usual.

     Args:
         coin_flips: list of 0/1 bits to hide. It is consumed from the
             front (mutated in place via ``pop(0)``).

     Returns:
         The generated token indices, without the leading SOS index.
     """
     ind = self.lm.SOS_ind
     prefix = [ind]
     p = self.lm.p_next_token(prefix)
     # Always produce at least one token; afterwards stop at the EOS token
     # or once the maximum sequence length is reached.
     while len(prefix) == 1 or (len(prefix) < self.max_sequence_length
                                and ind != self.lm.EOS_ind):
         steered = False
         if len(coin_flips) > 0:
             # Bits are left to hide: build Huffman codes for the
             # conditional distribution and check the total variation.
             tree = huffman_tree(build_min_heap(p))
             steered = tv_huffman(tree, p)[0] < self.tv_threshold
         if steered:
             # Huffman-decode bits into a token: descend until a leaf.
             node = tree
             while type(node) is tuple:
                 zero_branch, one_branch = node
                 try:
                     bit = coin_flips.pop(0)
                 except IndexError:
                     # Ran out of bits mid-code: pad with a random bit.
                     bit = self.random.choice(2)
                 # 0 => left, 1 => right
                 node = zero_branch if bit == 0 else one_branch
             ind = node
         else:
             # Forward-sample from the language model normally.
             ind = self.random.choice(self.lm.vocabulary_size, p=p)
         prefix.append(ind)
         p = self.lm.p_next_token(prefix)
     # Drop the leading SOS index.
     return prefix[1:]
    def test_huffman_tree(self, d):
        """huffman_tree returns a non-leaf HuffmanNode."""
        tree = huffman_tree(d)
        # assertIsInstance / assertFalse give clearer failure messages than
        # assertTrue wrapped around a boolean expression.
        self.assertIsInstance(tree, HuffmanNode)
        self.assertFalse(tree.is_leaf())
Beispiel #12
0
    # NOTE(review): this fragment uses Python 2 print statements; the
    # enclosing function header is not visible here.

    # Build a sampling pool with the desired distribution: each symbol is
    # repeated as many times as its frequency. freq is iterated as
    # (symbol, frequency) pairs.
    freqtable = []
    for symbol, f in freq:
        freqtable += [symbol] * f

    text = ""

    # Append a random symbol from the pool 50000 times; over many iterations
    # the text converges to the distribution given by the frequency table.
    for i in range(50000):
        text += rand.choice(freqtable)

    # Generate the Huffman tree and a corresponding translation table
    tree = huff.huffman_tree(freq)
    codes = huff.huffman_codes(tree)

    # Print each symbol next to its code, tab-separated.
    print "symbol bits:"
    for s, bits in codes.items():
        print "\t%s \t%s" % (s, bits)

    # Entropy and mean bits per symbol:
    ent = entropy(freq)
    mbs = mean_bits(freq, codes)

    print "entropy  ", ent
    print "mean bits", mbs

    # Compression ratio: CHARSIZE divided by the mean bits per symbol.
    # NOTE(review): presumably CHARSIZE is the uncompressed bits per symbol;
    # confirm at its definition.
    cr = CHARSIZE / mbs
Beispiel #13
0
    """  Sorted() by key test
    l = [[2, 3], [6, 7], [3, 34], [24, 64], [1, 43]]
    l = sorted(l, key=getkey)
    print(l)
    """

    """ Test using multiple arguements in for loop
    d = make_freq_dict(bytes([65, 66, 67, 66]))
    for k, v in d.items():
        print(k, 'corresponds to', v)
    """

    # Testing get_codes(): build a tree from a small frequency table and
    # print both the tree and the codes derived from it.
    freq = {2: 6, 3: 4, 4: 13, 5: 17}
    t = huffman_tree(freq)
    d = get_codes(t)
    print(t)
    print(d)

    # NOTE(review): freqs is not used anywhere in the visible code.
    freqs = {'a': 2}
    # Print a HuffmanNode built with no arguments, then get_codes() of it.
    d = HuffmanNode()
    print(d)

    print(get_codes(d))

    # Print the bytes object built from four small integers.
    print(bytes([65, 66, 67, 66]))

    # Assemble a small tree by hand from nested HuffmanNode calls.
    # NOTE(review): presumably the positional arguments are
    # (symbol, left, right); confirm against the HuffmanNode constructor.
    left = HuffmanNode(None, HuffmanNode(3), HuffmanNode(2))
    right = HuffmanNode(None, HuffmanNode(9), HuffmanNode(10))
    tree = HuffmanNode(None, left, right)