def scheme2(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 2.

    Scheme: BWT, then run-length encoding over a 3 bit alphabet.
    encoding := bases
    bases := 3 bits for base, 1 bit RLE flag, 4 bits for run len 2-17
    Examples: AC     == 0000 0010
              AAC    == 00010000 0010
              AAAC   == 00010001 0010
              AAAAC  == 00010010 0010
              AAAAAC == 00010011 0010
    Two tiers: run len 1 => 4 bits, run len 2-17 => 8 bits.

    Only the bit count is computed; no encoded output is produced.
    '''
    transformed = bwt.bwt(kmer)
    total_bits = 0
    pos = 0
    while pos < len(transformed):
        # run_len is defined elsewhere; presumably it returns the length of
        # the run of identical symbols starting at pos -- confirm.
        run = run_len(transformed, pos)
        total_bits += 4 if run == 1 else 8
        # The length field tops out at 17, so the cursor moves at most 17
        # symbols per step and the rest of a longer run is measured again.
        pos += min(run, 17)
    return total_bits
def scheme1(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 1.

    Scheme: BWT with an explicit EOL position, 2 bit alphabet.
    encoding := eol_pos bases
    eol_pos := 6 bits for eol, suitable for k <=~ 63
    bases := 2 bits for base, 1 bit RLE flag, 1 bit tier selector,
             then (1 bit for len 2-3 | 6 bits for the long tier)
    Examples: AC == 000 010
              AAC == 00100 010
              AAAC == 00101 010
              AAAAC == 0011000000 010
              AAAAAC == 0011000001 010
    Three tiers: run len 1 => 3 bits, run len 2-3 => 5 bits,
    run len 4 and up => 10 bits (capped at 68 symbols per step).

    NOTE(review): a 6 bit length field whose zero value means 4 would span
    4-67, while the code caps at 68 -- confirm which bound is intended.

    Only the bit count is computed; no encoded output is produced.
    '''
    # The EOL index returned alongside the sequence is not needed here: its
    # cost is the flat 6 bits the total starts from.
    transformed, _ = bwt.eol_format(bwt.bwt(kmer))
    total_bits = 6
    pos = 0
    while pos < len(transformed):
        # run_len is defined elsewhere; presumably it returns the length of
        # the run of identical symbols starting at pos -- confirm.
        run = run_len(transformed, pos)
        if run == 1:
            total_bits += 3
        elif run in (2, 3):
            total_bits += 5
        else:
            total_bits += 10
        # Advance by at most 68 symbols; anything beyond is measured again
        # on the next pass.
        pos += min(run, 68)
    return total_bits
# Example #3
# 0
def main():
    """Compare chunk distance matrices before and after applying bwt.

    Reads up to MAX_NUM_CHUNKS binary chunks of at most MAX_CHUNK_SIZE bytes
    from FILENAME, stopping early at end of file, then calls
    compute_distance_matrix (verbose) on the raw chunks and again on the
    chunks after each has been passed through bwt.
    """
    chunks = []
    with open(FILENAME, 'rb') as source:
        for _ in range(MAX_NUM_CHUNKS):
            piece = source.read(MAX_CHUNK_SIZE)
            if piece:
                chunks.append(piece)
            else:
                # An empty read means the file is exhausted.
                break

    print('Original:')
    compute_distance_matrix(chunks, verbose=True)

    print('After BWT:')
    transformed_chunks = []
    for piece in chunks:
        transformed_chunks.append(bwt(piece))
    compute_distance_matrix(transformed_chunks, verbose=True)
def scheme3(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 3.

    Scheme: 2 bit alphabet, BWT, MTF, then a crude fixed Huffman-style code:
    0 => 1, 1 => 01, 2 => 000, 3 => 001
    Plus 6 bits for the eol position.

    Raises KeyError if the MTF output holds a symbol outside 0-3.
    '''
    # Code word length, in bits, for each MTF output symbol.
    code_bits = {0: 1, 1: 2, 2: 3, 3: 3}
    # The EOL index is not needed: it is charged as a flat 6 bits below.
    symbols, _ = bwt.eol_format(bwt.bwt(kmer))
    symbols = mtf.mtf(symbols)
    return 6 + sum(code_bits[symbol] for symbol in symbols)
def scheme5(kmer):
    '''
    Estimate the encoded size, in bits, of kmer under scheme 5.

    Scheme: 3 bit alphabet, BWT, MTF, then a crude fixed Huffman-style code:
    0 => 1, 1 => 01, 2 => 000, 3 => 0011, 4 => 0010
    Plus 6 bits for the eol position.
    Uses the modified MTF (mtf.mtf_n) that does not push to front when the
    base is N.

    Raises KeyError if the MTF output holds a symbol outside 0-4.
    '''
    # Code word length, in bits, for each MTF output symbol.
    code_bits = {0: 1, 1: 2, 2: 3, 3: 4, 4: 4}
    # The EOL index is not needed: it is charged as a flat 6 bits below.
    symbols, _ = bwt.eol_format(bwt.bwt(kmer))
    symbols = mtf.mtf_n(symbols)
    return 6 + sum(code_bits[symbol] for symbol in symbols)
# Example #6
# 0
def main(args):
    """Run a forward or inverse BWT on a file, as selected by args.

    args must hold exactly three items:
        args[0] -- mode flag, '-bwt' or '-ibwt'
        args[1] -- path of the input file
        args[2] -- path of the output file

    The first line of the input file is treated as a header. Its first and
    last characters are dropped, and the result is echoed and handed to
    inoutPut.outputToFile. NOTE(review): the dropped characters are presumably
    a leading marker and the trailing newline -- confirm against the input
    format. The rest of the file is parsed by inoutPut.inputToList.

    On a wrong argument count or an unknown mode, a message is printed and
    SystemExit is raised with the default status (as the original exit() did).
    The mode is checked before any file is opened.

    Every print call passes a single pre-formatted string so the output is
    the same under Python 2 and Python 3.
    """
    if len(args) != 3:
        print("Please enter the right number of arguments")
        raise SystemExit()

    mode, in_path, out_path = args
    if mode not in ('-bwt', '-ibwt'):
        print("You must indicate whether we are doing -bwt or -ibwt")
        raise SystemExit()

    # Context managers close the handles even if a helper raises.
    with open(in_path, 'r') as inputFile:
        header = inputFile.readline()
        header = header[1:-1]
        S = inoutPut.inputToList(inputFile)
    # Taken before the '$' terminator is appended in the -bwt branch.
    lengthS = len(S)

    if mode == '-bwt':
        print("bwt of: %s" % (header,))
        newT = util.convertToNumAlphabet(S)
        s_array = ks.cd3(newT)
        S += ["$"]
        b_wt = bwt.bwt(S, s_array)
        with open(out_path, 'w') as outputFile:
            inoutPut.outputToFile(outputFile, b_wt, 'bwt', header)
        print("length of string: %s" % (lengthS,))
        print("For BWT Check file: %s" % (out_path,))
    else:
        print("ibwt of: %s" % (header,))
        rec = bwt.ibwt(S)
        with open(out_path, 'w') as outputFile:
            inoutPut.outputToFile(outputFile, rec, 'ibwt', header)
        print("length of string: %s" % (lengthS,))
        print("For iBWT Check file: %s" % (out_path,))
from bwt import bwt
from bwtinverse import inverse_bwt
from text_generator import text_gen


ERROR_THRESHOLD = 0.1


if __name__ == '__main__':
    # Round-trip check: generate a text, apply bwt, invert it, and count the
    # cases where the inverse does not give back the original text.
    # Stops after `stop` texts, or earlier once at least one mismatch has
    # occurred and mismatches reach ERROR_THRESHOLD of the texts tried.
    stop = 1000
    err = 0
    for total in range(1, stop + 1):
        text = text_gen()
        bwt_text = bwt(text)
        text1 = inverse_bwt(bwt_text)
        if text1 != text:
            err += 1
            print('test1 get bwt')
            print('text = {}, text1 = {}'.format(text, text1))
            print('bwt = {}'.format(bwt_text))
        error_rate_reached = err > 0 and total * ERROR_THRESHOLD <= err
        if error_rate_reached or total >= stop:
            break
        # Progress report every 100 texts; skipped on the final iteration
        # because the break above comes first.
        if total % 100 == 0:
            print('total = {}, err = {}'.format(total, err))