Ejemplo n.º 1
0
def huffman_encode(message, encoding="utf-8"):
    """Huffman-encode *message* and return the packed result.

    An empty message short-circuits: ``""`` is returned when *encoding* is
    ``"utf-8"`` and ``b""`` for any other encoding.  Otherwise the symbols
    of *message* are counted, a code tree is built with a min-heap, and the
    serialized tree together with the coded message is handed to
    ``make_encoded_bytes``, whose result is returned.
    """
    if not message:
        return "" if encoding == "utf-8" else b""

    # Heap entries are (frequency, sequence number, subtree).  The sequence
    # number is unique, so frequency ties are settled before the subtree
    # objects themselves would ever be compared.
    heap = [
        (freq, order, Leaf(symbol))
        for order, (symbol, freq) in enumerate(Counter(message).items())
    ]
    heapq.heapify(heap)

    # Repeatedly merge the two least frequent subtrees into one node.
    next_order = len(heap)
    while len(heap) > 1:
        low_freq, _, low = heapq.heappop(heap)
        high_freq, _, high = heapq.heappop(heap)
        merged = (low_freq + high_freq, next_order, Node(low, high))
        heapq.heappush(heap, merged)
        next_order += 1

    code = {}
    root = None
    if heap:
        # Exactly one entry is left: the root of the finished tree.
        root = heap[0][2]
        root.walk(code, BitArray())

    tree_bits = serialize_tree(root, encoding)

    message_bits = BitArray()
    for symbol in message:
        message_bits.extend(code[symbol])

    return make_encoded_bytes(tree_bits, message_bits, encoding)
Ejemplo n.º 2
0
 def walk(self, code, acc):
     """
     Visit both children, extending the bit prefix *acc* with 0 for the
     left child and 1 for the right child, and pass *code* along to them.
     Return the larger of the two values returned by the children's walk.
     """
     depths = []
     for bit, child in ((0, self.left), (1, self.right)):
         # Each branch gets its own copy so the caller's prefix is untouched.
         prefix = BitArray(acc)
         prefix.append(bit)
         depths.append(child.walk(code, prefix))
     return max(depths)
Ejemplo n.º 3
0
    def encode(self, acc, encoding):
        """Append this leaf to *acc*: a 1 bit followed by the symbol's bits.

        With ``encoding == "utf-8"`` the symbol is encoded to bytes using
        that encoding; otherwise ``self.char`` is wrapped as a single byte
        value via ``bytes([...])``.
        """
        acc.append(1)

        if encoding == "utf-8":
            raw = self.char.encode(encoding)
        else:
            raw = bytes([self.char])

        symbol_bits = BitArray()
        symbol_bits.frombytes(raw)
        acc.extend(symbol_bits)
Ejemplo n.º 4
0
def parse_bytes(file_bytes):
    """Split encoded *file_bytes* into its tree, coded payload and encoding.

    Header layout read here: bit 0 is the encoding flag (set means
    ``"utf-8"``, clear means ``"binary"``) and bits 1-3 hold an unsigned
    count that is passed to ``rstrip``.  Everything after the 4-bit header
    is given to ``tree.from_bitseq``.

    Returns a ``(tree_root, encoded, encoding)`` tuple, where *encoded* is
    the part of the bit sequence starting at the position reported by
    ``tree.from_bitseq``.
    """
    bits = BitArray()
    bits.frombytes(file_bytes)

    is_utf8 = bits[0]
    padding = int(bits[1:4].to01(), 2)
    # NOTE(review): presumably drops `padding` trailing bits in place; the
    # return value is ignored here -- confirm against BitArray.rstrip.
    bits.rstrip(padding)

    encoding = "utf-8" if is_utf8 else "binary"
    body = bits[4:]

    tree_root, code_startpos = tree.from_bitseq(body, encoding)
    return tree_root, body[code_startpos:], encoding
Ejemplo n.º 5
0
def make_encoded_bytes(tree_bits, message_bits, encoding):
    """Assemble header, tree and message bits and return them as bytes.

    The output starts with a 4-bit header: one flag bit (1 when *encoding*
    is ``"utf-8"``, 0 otherwise) followed by a 3-bit count of the bits
    needed to bring the total length up to a multiple of 8.  *tree_bits*
    and *message_bits* follow in that order.
    """
    total_len = 4 + len(tree_bits) + len(message_bits)
    # Number of bits missing to reach the next byte boundary (0..7).
    pad_count = -total_len % 8

    out = BitArray()
    out.append(1 if encoding == "utf-8" else 0)
    out.extend(format(pad_count, "03b"))
    out.extend(tree_bits)
    out.extend(message_bits)
    return out.tobytes()
Ejemplo n.º 6
0
def serialize_tree(root, encoding):
    """Return a fresh BitArray that ``root.encode`` has written into."""
    bits = BitArray()
    root.encode(bits, encoding)
    return bits
Ejemplo n.º 7
0
 def walk(self, code, acc):
     """
     Record the code for this leaf's symbol in *code* and return its length.
     A falsy *acc* is replaced by the one-bit code '0', and a zero length
     is reported as 1.
     """
     code[self.char] = acc if acc else BitArray('0')
     length = len(acc)
     return length if length else 1