def scheme2(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 2.

    BWT followed by run-length encoding over a 3 bit alphabet.

    Encoding scheme =
        encoding := bases
        bases    := 3 bits for base, 1 bit for RLE, 4 bits for len 2-17

    Examples:
        AC     == 0000 0010
        AAC    == 00010000 0010
        AAAC   == 00010001 0010
        AAAAC  == 00010010 0010
        AAAAAC == 00010011 0010

    Two-tiered encoding: run len 1 => 4 bits, run len 2-17 => 8 bits.
    At most 17 symbols are consumed per step, so whatever remains of a
    longer run is costed again on the next iteration.
    '''
    transformed = bwt.bwt(kmer)
    end = len(transformed)

    total_bits = 0
    pos = 0
    while pos < end:
        # run_len is assumed to return the length of the run starting at
        # pos -- confirm against its definition.
        run = run_len(transformed, pos)
        total_bits += 4 if run == 1 else 8
        # The 4 bit length field tops out at 17.
        pos += min(run, 17)
    return total_bits
def scheme1(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 1.

    BWT, EOL, 2 bit alphabet.

    Encoding scheme =
        encoding := eol_pos bases
        eol_pos  := 6 bits for eol, suitable for k <=~ 63
        bases    := 2 bits for base, 1 bit for RLE, 1 bit selecting the
                    length tier, then 1 bit (len 2-3) or 6 bits (len 4-67)

    Examples:
        AC     == 000 010
        AAC    == 00100 010
        AAAC   == 00101 010
        AAAAC  == 0011000000 010
        AAAAAC == 0011000001 010

    Three-tiered encoding: run len 1 => 3 bits, run len 2-3 => 5 bits,
    run len 4-67 => 10 bits.

    The 6 bit length field stores (length - 4), as the examples above show
    (AAAAC -> 000000), so the longest run a single code can describe is
    4 + 63 = 67.  Longer runs are consumed 67 symbols at a time and the
    remainder is costed on the next iteration.

    Returns the total number of bits, including the 6 bit EOL position.
    '''
    # Longest run representable by one long-tier code: offset 4 + 6 bits.
    max_run = 67

    # Only the EOL-stripped sequence is needed; the EOL index itself is
    # accounted for by the fixed 6 bit header below.
    trans_kmer, _ = bwt.eol_format(bwt.bwt(kmer))

    size = 6
    i = 0
    while i < len(trans_kmer):
        # run_len is assumed to return the length of the run starting at
        # i -- confirm against its definition.
        run_length = min(run_len(trans_kmer, i), max_run)
        if run_length == 1:
            size += 3
        elif run_length in (2, 3):
            size += 5
        else:
            size += 10
        i += run_length
    return size
def main():
    """Report chunk distance matrices before and after applying the BWT.

    Reads at most MAX_NUM_CHUNKS blocks of up to MAX_CHUNK_SIZE bytes from
    FILENAME (stopping early at end of file), then runs
    compute_distance_matrix in verbose mode on the raw blocks and again on
    their BWT-transformed counterparts.
    """
    blocks = []
    with open(FILENAME, 'rb') as source:
        for _ in range(MAX_NUM_CHUNKS):
            block = source.read(MAX_CHUNK_SIZE)
            if block:
                blocks.append(block)
            else:
                # An empty read means the file is exhausted.
                break

    print('Original:')
    compute_distance_matrix(blocks, verbose=True)

    print('After BWT:')
    transformed_blocks = list(map(bwt, blocks))
    compute_distance_matrix(transformed_blocks, verbose=True)
def scheme3(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 3.

    2 bit alphabet, BWT, MTF, then a crude fixed Huffman-style code on the
    MTF output:
        0 => 1, 1 => 01, 2 => 000, 3 => 001

    6 bits are added for the eol position.
    '''
    # Code length in bits for each MTF output symbol.
    code_bits = {0: 1, 1: 2, 2: 3, 3: 3}

    transformed, _eol_idx = bwt.eol_format(bwt.bwt(kmer))
    ranks = mtf.mtf(transformed)

    # Fixed 6 bit header for the eol position, plus one code per symbol.
    return 6 + sum(code_bits[rank] for rank in ranks)
def scheme5(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 5.

    3 bit alphabet, BWT, MTF, then a crude fixed Huffman-style code on the
    MTF output:
        0 => 1, 1 => 01, 2 => 000, 3 => 0011, 4 => 0010

    6 bits are added for the eol position.  The MTF step uses mtf.mtf_n,
    described by the original author as a variant that does not push to
    front when the base is N.
    '''
    # Code length in bits for each MTF output symbol.
    bits_for_rank = {0: 1, 1: 2, 2: 3, 3: 4, 4: 4}

    stripped, _eol_idx = bwt.eol_format(bwt.bwt(kmer))
    ranks = mtf.mtf_n(stripped)

    # Start from the fixed 6 bit eol-position header.
    total_bits = 6
    for rank in ranks:
        total_bits += bits_for_rank[rank]
    return total_bits
def main(args):
    """Run the BWT / inverse-BWT command-line tool.

    Args:
        args: Argument list without the program name, in the form
            ``[mode, input_path, output_path]``.  ``mode`` must be
            ``'-bwt'`` or ``'-ibwt'``.

    The first line of the input file is treated as a header: its first and
    last characters are dropped (presumably a leading marker and the
    trailing newline -- confirm against the input format).  The remainder
    of the file is parsed with ``inoutPut.inputToList``.

    Raises:
        SystemExit: with status 1 when the argument count or the mode is
            invalid.  Both are checked before any file is opened.
    """
    if len(args) != 3:
        print("Please enter the right number of arguments")
        raise SystemExit(1)

    mode, input_path, output_path = args
    if mode not in ('-bwt', '-ibwt'):
        print("You must indicate whether we are doing -bwt or -ibwt")
        raise SystemExit(1)

    # The context manager closes the input even if parsing raises.
    with open(input_path, 'r') as input_file:
        header = input_file.readline()[1:-1]
        symbols = inoutPut.inputToList(input_file)

    # Measured before the '$' terminator is appended in the -bwt branch,
    # so the reported length excludes it.
    length = len(symbols)

    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    if mode == '-bwt':
        print("bwt of: %s" % header)
        numeric = util.convertToNumAlphabet(symbols)
        suffix_array = ks.cd3(numeric)
        symbols += ["$"]
        result = bwt.bwt(symbols, suffix_array)
        with open(output_path, 'w') as output_file:
            inoutPut.outputToFile(output_file, result, 'bwt', header)
        print("length of string: %s" % length)
        print("For BWT Check file: %s" % output_path)
    else:
        print("ibwt of: %s" % header)
        result = bwt.ibwt(symbols)
        with open(output_path, 'w') as output_file:
            inoutPut.outputToFile(output_file, result, 'ibwt', header)
        print("length of string: %s" % length)
        print("For iBWT Check file: %s" % output_path)
from bwt import bwt
from bwtinverse import inverse_bwt
from text_generator import text_gen

# Fraction of failed rounds (relative to rounds run) that aborts the test.
ERROR_THRESHOLD = 0.1

if __name__ == '__main__':
    # Round-trip check: generated text must survive bwt -> inverse_bwt.
    max_rounds = 1000
    rounds = 0
    failures = 0
    finished = False
    while not finished:
        rounds += 1
        original = text_gen()
        transformed = bwt(original)
        restored = inverse_bwt(transformed)

        if original != restored:
            failures += 1
            print('test1 get bwt')
            print('text = {}, text1 = {}'.format(original, restored))
            print('bwt = {}'.format(transformed))

        # Stop once the failure share reaches the threshold (only after at
        # least one failure) or the round budget is used up.
        too_many_failures = failures > 0 and rounds * ERROR_THRESHOLD <= failures
        finished = too_many_failures or rounds >= max_rounds

        # Progress is reported every 100 rounds, but not on the final one.
        if not finished and rounds % 100 == 0:
            print('total = {}, err = {}'.format(rounds, failures))