Ejemplo n.º 1
0
def bwt_command(filename):
    """
    Print the Burrows-Wheeler transform of the genome contained in the
    FASTA file.

    filename -- FASTA file handed to FastaFile.
    """
    fasta = FastaFile(filename)
    genome = fasta.complete_sequence()
    transformed = bwt(genome)
    print(transformed)
Ejemplo n.º 2
0
def main():
    """
    Command-line entry point: search a read in a reference with the
    BWT-based inexact search and print the result.

    The last two command-line arguments are used as the reference file and
    the read file, in that order. Options looked up anywhere in sys.argv:

      --no-indels     set the global NO_INDELS to True
      --linear-gaps   set the globals gap_open = 0 and gap_ext = 1
      --no-sub-mat    do not estimate the global substitution matrix
      -t <int>        error score upper bound (default 3)
      --show-time     print the elapsed time
      --count-prunes  print the global num_prunes counter

    With no argument the usage text is printed; with 'test' as first
    argument test() is run instead. Returns None in every case.
    """
    global NO_INDELS, gap_open, gap_ext, sub_mat

    start = time.time()

    threshold = 3  # this will be the z value
    usage = ('\nusage: python search_bwt.py [--no-indels] [test|<reference file name>] [<read file name>]\n')

    if '--no-indels' in sys.argv:
        NO_INDELS = True

    if '--linear-gaps' in sys.argv:
        gap_open = 0
        gap_ext = 1

    if len(sys.argv) == 1:
        print(usage)
        return

    if sys.argv[1].lower() == 'test':
        test()
        return

    if len(sys.argv) < 3:
        print(usage)
        return

    # '-t' is honoured only when it is not one of the last two arguments;
    # if it appears several times the last occurrence wins.
    # NOTE(review): this bound still lets the reference file name be parsed
    # as the threshold ("-t ref read" raises ValueError) -- confirm intent.
    for i, arg in enumerate(sys.argv):
        if arg == '-t' and i < len(sys.argv) - 2:
            threshold = int(sys.argv[i + 1])

    # Context managers guarantee both files are closed even if reading or
    # any later step raises.
    with open(sys.argv[-1]) as fread:
        read = fread.read().replace('\n', '')
    with open(sys.argv[-2]) as fref:
        ref = fref.read().replace('\n', '')

    # estimate the substitution matrix
    if '--no-sub-mat' not in sys.argv:
        sub_mat = estimate_substitution_mat(ref, read)

    sa = suffix_array(ref)

    bw = bwt(ref)
    bwr = bwt(ref[::-1])  # BWT of the reversed reference

    print(read)
    print_output(inexact_search(bw, bwr, read, threshold), sa, ref)

    elapsed = time.time() - start
    if '--show-time' in sys.argv:
        print('time elapsed: ' + str(elapsed))
    if '--count-prunes' in sys.argv:
        print(str(num_prunes) + ' nodes pruned.')
    print("error score upper bound: " + str(threshold))
Ejemplo n.º 3
0
def main():
    """
    Run the BWT inexact search from the command line.

    sys.argv[-2] is opened as the reference file and sys.argv[-1] as the
    read file; newlines are stripped from both before searching. Supported
    switches (matched anywhere in sys.argv):

      --no-indels     sets the global NO_INDELS to True
      --linear-gaps   sets the globals gap_open = 0 and gap_ext = 1
      --no-sub-mat    skips the estimation of the global sub_mat
      -t <int>        error score upper bound, 3 when not given
      --show-time     prints the elapsed time at the end
      --count-prunes  prints the global num_prunes counter at the end

    Prints the usage text when called without arguments (or with a single
    argument other than 'test'); runs test() when the first argument is
    'test'. Always returns None.
    """
    global NO_INDELS, gap_open, gap_ext, sub_mat

    start = time.time()

    threshold = 3  # this will be the z value
    usage = '\nusage:\tpython search_bwt.py [--no-indels] <reference file name> <read file name>' \
            '\nor:\tpython search_bwt.py test\n'

    argv = sys.argv
    argc = len(argv)

    if '--no-indels' in argv:
        NO_INDELS = True

    if '--linear-gaps' in argv:
        gap_open = 0
        gap_ext = 1

    if argc == 1:
        print(usage)
        return

    if argv[1].lower() == 'test':
        test()
        return

    if argc < 3:
        print(usage)
        return

    # A '-t' flag counts only when it is not among the last two arguments;
    # the last such flag wins.
    # NOTE(review): "-t <ref> <read>" still passes this check and makes
    # int() raise ValueError on the reference file name -- confirm intent.
    for i in range(argc - 2):
        if argv[i] == '-t':
            threshold = int(argv[i + 1])

    # Read each file inside a 'with' block so the handle is released even
    # when a later step raises.
    with open(argv[-1]) as fread:
        read = fread.read().replace('\n', '')
    with open(argv[-2]) as fref:
        ref = fref.read().replace('\n', '')

    # estimate the substitution matrix
    if '--no-sub-mat' not in argv:
        sub_mat = estimate_substitution_mat(ref, read)

    sa = suffix_array(ref)

    bw = bwt(ref)
    bwr = bwt(ref[::-1])  # BWT of the reversed reference

    print(read)
    print_output(inexact_search(bw, bwr, read, threshold), sa, ref)

    elapsed = time.time() - start
    if '--show-time' in argv:
        print('time elapsed: ' + str(elapsed))
    if '--count-prunes' in argv:
        print(str(num_prunes) + ' nodes pruned.')
    print("error score upper bound: " + str(threshold))
Ejemplo n.º 4
0
def test():
    """
    Run the inexact search on a small hard-coded reference/read pair and
    print the result through print_output.
    """
    # Alternative reference that was used before: 'ATGCGTAATGCCGTCGATCG'
    reference = 'CGATCCGCGCTGCTGATGATCGATG'
    query = 'GATGAT'
    max_score = 2

    suffixes = suffix_array(reference)
    forward_bwt = bwt(reference)
    reverse_bwt = bwt(reference[::-1])  # BWT of the reversed reference

    matches = inexact_search(forward_bwt, reverse_bwt, query, max_score)
    print_output(matches, suffixes, reference, query)
Ejemplo n.º 5
0
def test():
    """
    Smoke test: search a fixed read in a fixed reference with an error
    score bound of 2 and hand the result to print_output.
    """
    # Another test: 'ATGCGTAATGCCGTCGATCG'
    text = 'CGATCCGCGCTGCTGATGATCGATG'
    pattern = 'GATGAT'
    bound = 2

    text_sa = suffix_array(text)
    text_bwt = bwt(text)
    reversed_text_bwt = bwt(text[::-1])

    hits = inexact_search(text_bwt, reversed_text_bwt, pattern, bound)
    print_output(hits, text_sa, text, pattern)
Ejemplo n.º 6
0
def compress_fasta_command(filename, outputfile):
    """
    Write the compressed sequence of the FASTA file to an output file.

    The sequence is passed through bwt, move_to_front and rle; each symbol
    of the result is then replaced by its code from the table built by
    encode(). The resulting bit string is written in groups of 8 bits,
    each group as the text returned by hex().

    filename   -- FASTA file handed to FastaFile
    outputfile -- file to write, opened in binary mode (overwritten)
    """
    seq = FastaFile(filename).complete_sequence()
    bwt_seq = bwt(seq)  # BWT(sequence)
    mtf_seq = move_to_front(bwt_seq)  # MTF(BWT(sequence))
    rle_seq = rle(mtf_seq)  # RLE(MTF(BWT(sequence)))
    freq = occurences(rle_seq)
    huffman_code = encode(freq)  # build the coding table
    # Turn the coding table into a symbol -> code dictionary.
    d = {c[0]: c[1] for c in huffman_code}
    # HUFFMAN(RLE(MTF(BWT(sequence)))); join avoids quadratic concatenation.
    output_string = "".join(d[c] for c in rle_seq)
    with open(outputfile, 'wb') as out:  # binary mode
        # Walk the bit string itself: bounding the loop by len(rle_seq)
        # (the number of symbols) silently dropped the tail of the bits
        # whenever a code was longer than one bit.
        for i in range(0, len(output_string), 8):
            chunk = output_string[i:i + 8]
            # The file is binary, so hand it bytes rather than text.
            out.write(hex(int(chunk, 2)).encode('ascii'))
Ejemplo n.º 7
0
def test_bwt(sample_str):
    """Check that bwt agrees with bwt_naive on the '\\xff'-terminated sample."""
    terminated = sample_str + '\xff'
    fast_result = bwt(terminated)
    reference_result = bwt_naive(terminated)
    assert fast_result == reference_result
Ejemplo n.º 8
0
def test_bwt_simple():
    """Check bwt against a hard-coded expected result for 'abacaba'."""
    expected = list('\xffcbbaaaa')
    actual = bwt('abacaba\xff')
    assert actual == expected
Ejemplo n.º 9
0
def test_ibwt(sample_str):
    """Check that ibwt_string inverts bwt on the terminated sample string."""
    terminator = '\xff'
    original = sample_str + terminator
    transformed = bwt(original)
    restored = ibwt_string(transformed, terminator)
    assert restored == sample_str + terminator