def compress(f, compf, block=None):
    """
    Compress a file by bwt-mtf-huff processes.

    The whole input is read once to collect the set of symbols used for
    move-to-front, the resulting alphabet is pickled (protocol 2) to
    ``compf``, and the input is then re-read from the start in chunks of
    BLOCK_SIZE.  For every processed chunk the following is appended to
    ``compf``: a ``">2I"`` header holding the number of Huffman bits and the
    index returned by ``bwt.encode``, whatever ``dump_freqs`` writes for the
    symbol frequencies, and the packed Huffman bytes.  Progress is printed
    to standard output.

    f: a seekable file-like object, content for compressing
    compf: a writable file-like object receiving the compression result
    block: optional zero-based block number.  When None (default) every
        block is compressed.  Otherwise only that block is processed and
        its frequency table and code table are printed as well.
    """
    # Create the alphabet for move-to-front from every distinct symbol.
    # NOTE(review): the leading '' entry is kept from the original code;
    # presumably mtf relies on it -- confirm against mtf.encode.
    alphabet = [''] + list(set(f.read()))

    # Dump the alphabet first.
    cPickle.dump(alphabet, compf, 2)

    single_block = block is not None

    f.seek(0)
    count = 0
    while True:
        data = f.read(BLOCK_SIZE)
        if not data:
            break

        if not single_block or block == count:
            print("block %r:" % count)
            bwt_encode, I = bwt.encode(data)
            mtf_encode = mtf.encode(alphabet, bwt_encode)

            # Create the Huffman tree.  The detail dumps are keyed on
            # "a block was requested" rather than on the truthiness of
            # ``block`` so that requesting block 0 prints them too.
            freqs = Counter(mtf_encode)
            if single_block:
                print(freqs)
            coding, root = huff.generate_coding(freqs)
            if single_block:
                print(coding)

            # Encode: a string of bits, later packed into bytes.
            huff_encode = ''.join(huff.encode(mtf_encode, coding))
            nbits = len(huff_encode)
            huff_bytes = tobytes(huff_encode)

            # nbits is stored explicitly alongside the packed bytes.
            compf.write(struct.pack(">2I", nbits, I))
            dump_freqs(compf, freqs)
            compf.write(huff_bytes)
            print("nbits = %r, I = %r, length = %r" % (nbits, I, len(data)))

        # Counted for every chunk read, processed or not, so that
        # ``block`` always refers to the position within the input.
        count += 1
def test_mix(self):
    """Check that bwt.decode reverses bwt.encode on a generated sequence.

    The input comes from ``generator(16 * 1024)``; the decoded result is
    compared against the input converted to a list.
    """
    original = generator(16 * 1024)
    transformed, idx = bwt.encode(original)
    # NOTE(review): the meaning of the third positional argument (True)
    # is not visible here -- confirm against bwt.decode.
    restored = bwt.decode(transformed, idx, True)
    self.assertEqual(list(original), restored)
def test_encode(self):
    """Check bwt.encode on 'abraca' with an explicitly supplied ordering.

    [5, 0, 3, 1, 4, 2] lists the rotations of 'abraca' in sorted order;
    their last characters spell 'caraab', and the unrotated string sits
    at position 1 of that ordering.
    """
    text = 'abraca'
    rotation_order = [5, 0, 3, 1, 4, 2]
    expected_symbols = list('caraab')

    symbols, idx = bwt.encode(text, indexes=rotation_order)

    self.assertEqual(expected_symbols, symbols)
    self.assertEqual(idx, 1)