def bwt_command(filename):
    """Print the Burrows-Wheeler transform of the genome in a FASTA file.

    The file named by ``filename`` is loaded through ``FastaFile``; its
    ``complete_sequence()`` is handed to ``bwt`` and the result is printed
    to standard output.
    """
    fasta = FastaFile(filename)
    transformed = bwt(fasta.complete_sequence())
    # A single-argument print(...) prints the same text as the print statement.
    print(transformed)
def main():
    """Command-line entry point: inexact search of a read in a reference.

    Recognised arguments (all read from ``sys.argv``):

    * ``--no-indels``    -- sets the module-level ``NO_INDELS`` flag to True.
    * ``--linear-gaps``  -- sets ``gap_open`` to 0 and ``gap_ext`` to 1.
    * ``-t <int>``       -- overrides the default error threshold of 3.
    * ``--no-sub-mat``   -- skips estimating the substitution matrix.
    * ``--show-time``    -- prints the elapsed wall-clock time.
    * ``--count-prunes`` -- prints the module-level ``num_prunes`` counter.
    * ``test`` as the first argument runs ``test()`` instead of a search.

    The last two arguments are the reference file and the read file, in
    that order. With too few arguments the usage text is printed and the
    function returns.

    Raises:
        ValueError: if the argument following ``-t`` is not an integer.
        IOError: if either input file cannot be opened.
    """
    global NO_INDELS, gap_open, gap_ext, sub_mat

    start = time.time()
    threshold = 3  # this will be the z value (upper bound on the error score)
    usage = ('\nusage: python search_bwt.py [--no-indels] [test|<reference file name>] [<read file name>]\n')
    args = sys.argv

    # Flags are applied before dispatching so that the "test" mode sees them.
    if '--no-indels' in args:
        NO_INDELS = True
    if '--linear-gaps' in args:
        gap_open = 0
        gap_ext = 1

    if len(args) == 1:
        print(usage)
        return
    if args[1].lower() == 'test':
        test()
        return
    if len(args) < 3:
        print(usage)
        return

    # The last occurrence of "-t" wins; the bound keeps the value index
    # away from the final (read file) argument.
    for i, arg in enumerate(args):
        if arg == '-t' and i < len(args) - 2:
            threshold = int(args[i + 1])

    # Read both inputs inside a context manager so the handles are released
    # even if a later step raises.
    with open(args[-1]) as fread, open(args[-2]) as fref:
        ref = fref.read().replace('\n', '')
        read = fread.read().replace('\n', '')

    # Estimate the substitution matrix unless explicitly disabled.
    if '--no-sub-mat' not in args:
        sub_mat = estimate_substitution_mat(ref, read)

    sa = suffix_array(ref)
    bw = bwt(ref)
    bwr = bwt(ref[::-1])  # BWT of the reversed reference
    print(read)
    print_output(inexact_search(bw, bwr, read, threshold), sa, ref)
    elapsed = time.time() - start

    if '--show-time' in args:
        print('time elapsed: ' + str(elapsed))
    if '--count-prunes' in args:
        print(str(num_prunes) + ' nodes pruned.')
    print("error score upper bound: " + str(threshold))
def main():
    """Command-line entry point: inexact search of a read in a reference.

    Recognised arguments (all read from ``sys.argv``):

    * ``--no-indels``    -- sets the module-level ``NO_INDELS`` flag to True.
    * ``--linear-gaps``  -- sets ``gap_open`` to 0 and ``gap_ext`` to 1.
    * ``-t <int>``       -- overrides the default error threshold of 3.
    * ``--no-sub-mat``   -- skips estimating the substitution matrix.
    * ``--show-time``    -- prints the elapsed wall-clock time.
    * ``--count-prunes`` -- prints the module-level ``num_prunes`` counter.
    * ``test`` as the first argument runs ``test()`` instead of a search.

    The last two arguments are the reference file and the read file, in
    that order. With too few arguments the usage text is printed and the
    function returns.

    Raises:
        ValueError: if the argument following ``-t`` is not an integer.
        IOError: if either input file cannot be opened.
    """
    global NO_INDELS, gap_open, gap_ext, sub_mat

    start = time.time()
    threshold = 3  # this will be the z value (upper bound on the error score)
    usage = ('\nusage:\tpython search_bwt.py [--no-indels] <reference file name> <read file name>'
             '\nor:\tpython search_bwt.py test\n')
    args = sys.argv

    # Flags are applied before dispatching so that the "test" mode sees them.
    if '--no-indels' in args:
        NO_INDELS = True
    if '--linear-gaps' in args:
        gap_open = 0
        gap_ext = 1

    if len(args) == 1:
        print(usage)
        return
    if args[1].lower() == 'test':
        test()
        return
    if len(args) < 3:
        print(usage)
        return

    # The last occurrence of "-t" wins; the bound keeps the value index
    # away from the final (read file) argument.
    for i, arg in enumerate(args):
        if arg == '-t' and i < len(args) - 2:
            threshold = int(args[i + 1])

    # Read both inputs inside a context manager so the handles are released
    # even if a later step raises.
    with open(args[-1]) as fread, open(args[-2]) as fref:
        ref = fref.read().replace('\n', '')
        read = fread.read().replace('\n', '')

    # Estimate the substitution matrix unless explicitly disabled.
    if '--no-sub-mat' not in args:
        sub_mat = estimate_substitution_mat(ref, read)

    sa = suffix_array(ref)
    bw = bwt(ref)
    bwr = bwt(ref[::-1])  # BWT of the reversed reference
    print(read)
    print_output(inexact_search(bw, bwr, read, threshold), sa, ref)
    elapsed = time.time() - start

    if '--show-time' in args:
        print('time elapsed: ' + str(elapsed))
    if '--count-prunes' in args:
        print(str(num_prunes) + ' nodes pruned.')
    print("error score upper bound: " + str(threshold))
def test():
    """Run the built-in demo search on a fixed reference and read.

    Builds the suffix array and the forward/reversed BWTs of a hard-coded
    reference, runs ``inexact_search`` for a hard-coded read with an error
    threshold of 2, and passes the result to ``print_output``.
    """
    # Alternative reference sequence: 'ATGCGTAATGCCGTCGATCG'
    reference = 'CGATCCGCGCTGCTGATGATCGATG'
    query = 'GATGAT'
    max_error = 2

    suffixes = suffix_array(reference)
    forward_bwt = bwt(reference)
    reversed_bwt = bwt(reference[::-1])

    matches = inexact_search(forward_bwt, reversed_bwt, query, max_error)
    print_output(matches, suffixes, reference, query)
def test():
    """Run the built-in demo search on a fixed reference and read.

    Builds the suffix array and the forward/reversed BWTs of a hard-coded
    reference, runs ``inexact_search`` for a hard-coded read with an error
    threshold of 2, and passes the result to ``print_output``.
    """
    # Another test: 'ATGCGTAATGCCGTCGATCG'
    reference = 'CGATCCGCGCTGCTGATGATCGATG'
    query = 'GATGAT'
    max_error = 2

    suffixes = suffix_array(reference)
    forward_bwt = bwt(reference)
    reversed_bwt = bwt(reference[::-1])

    matches = inexact_search(forward_bwt, reversed_bwt, query, max_error)
    print_output(matches, suffixes, reference, query)
def compress_fasta_command(filename, outputfile):
    """Write the compressed sequence of a FASTA file to an output file.

    The sequence goes through BWT -> move-to-front -> RLE, and the RLE
    symbols are then replaced by their Huffman codes. The resulting bit
    string is cut into groups of 8 bits and each group is written to
    ``outputfile`` as the text produced by ``hex()``.

    Args:
        filename: path of the FASTA file to compress.
        outputfile: path of the file to create (opened in binary mode).
    """
    seq = FastaFile(filename).complete_sequence()
    bwt_seq = bwt(seq)                 # BWT(sequence)
    mtf_seq = move_to_front(bwt_seq)   # MTF(BWT(sequence))
    rle_seq = rle(mtf_seq)             # RLE(MTF(BWT(sequence)))
    freq = occurences(rle_seq)
    huffman_code = encode(freq)        # build the coding table

    # Turn the coding table into a symbol -> code dictionary.
    code_for = {entry[0]: entry[1] for entry in huffman_code}

    # HUFFMAN(RLE(MTF(BWT(sequence)))) as one string of '0'/'1' characters.
    bits = "".join(code_for[symbol] for symbol in rle_seq)

    with open(outputfile, 'wb') as out:  # binary mode
        # Walk the *bit string* (not the RLE sequence) in 8-bit groups;
        # iterating over len(rle_seq) dropped every bit past that index.
        for start in range(0, len(bits), 8):
            chunk = bits[start:start + 8]
            # NOTE(review): this writes hex() text such as '0x1f', not a raw
            # byte, and the groups are not delimited -- confirm the reader
            # expects this format before changing it.
            # The ASCII encode keeps the write valid on a binary handle.
            out.write(hex(int(chunk, 2)).encode('ascii'))
def test_bwt(sample_str):
    """Check that ``bwt`` and ``bwt_naive`` agree on a terminated string."""
    # '\xff' is appended as the end-of-string marker before transforming.
    terminated = sample_str + '\xff'
    actual = bwt(terminated)
    expected = bwt_naive(terminated)
    assert actual == expected
def test_bwt_simple():
    """Check ``bwt`` against a hand-written result for a short input."""
    expected = list('\xffcbbaaaa')
    actual = bwt('abacaba\xff')
    assert actual == expected
def test_ibwt(sample_str):
    """Check that ``ibwt_string`` inverts ``bwt`` on a terminated string."""
    terminator = '\xff'
    original = sample_str + terminator
    transformed = bwt(original)
    assert ibwt_string(transformed, terminator) == original