def scheme1(kmer): ''' BWT, EOL, 2 bit alphabet Encoding scheme = encoding := eol_pos bases eol_pos := 6 bits for eol, suitable for k <=~ 63 bases := 2 bits for base, 1 bit for RLE, 1 bit for len 2-3, (1 bit for len 2-3 | 6 bits for len <= 63) Examples: AC == 000 010 AAC == 00100 010 AAAC == 00101 010 AAAAC == 0011000000 010 AAAAAC == 0011000001 010 Three-tiered encoding, run len 1 => 3 bits, run len 2-3 => 5 bits, run len 4-68 => 10 bits ''' trans_kmer, eol_idx = bwt.eol_format(bwt.bwt(kmer)) size = 0 size += 6 i = 0 while i < len(trans_kmer): run_length = run_len(trans_kmer, i) if run_length == 1: size += 3 elif run_length in [2,3]: size += 5 else: size += 10 if run_length > 68: run_length = 68 i += run_length return size
def scheme3(kmer): ''' 2 bit alphabet, BWT, MTF, shitty huffman encoding, 0 => 1, 1 => 01, 2 => 000, 3 => 001 Add 6 bits for eol position ''' def huff_size(c): return { 0: 1, 1: 2, 2: 3, 3: 3, }[c] trans_kmer, eol_idx = bwt.eol_format(bwt.bwt(kmer)) trans_kmer = mtf.mtf(trans_kmer) size = 0 size += 6 i = 0 while i < len(trans_kmer): size += huff_size(trans_kmer[i]) i += 1 return size
def scheme5(kmer): ''' 3 bit alphabet, BWT, MTF, shitty huffman encoding, 0 => 1, 1 => 01, 2 => 000, 3 => 0011, 4 => 0010 Add 6 bits for eol position modified MTF to not push to front when base is N ''' def huff_size(c): return { 0: 1, 1: 2, 2: 3, 3: 4, 4: 4, }[c] trans_kmer, eol_idx = bwt.eol_format(bwt.bwt(kmer)) trans_kmer = mtf.mtf_n(trans_kmer) size = 0 size += 6 i = 0 while i < len(trans_kmer): size += huff_size(trans_kmer[i]) i += 1 return size