def test1(self):
    """fill_matrix populates a zeroed 10 x 10 grid for GGACGTACG vs TACGGGTAT."""
    list1 = "GGACGTACG"
    list2 = "TACGGGTAT"
    # One leading border row/column plus one cell per symbol pair.
    matrix = [[0] * 10 for _ in range(10)]
    #    T  A  C  G  G  G  T  A  T
    filled_matrix = [
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],  # G
        [0, 0, 0, 0, 1, 2, 2, 1, 0, 0],  # G
        [0, 0, 1, 0, 0, 1, 1, 1, 2, 1],  # A
        [0, 0, 0, 2, 1, 0, 0, 0, 1, 1],  # C
        [0, 0, 0, 1, 3, 2, 1, 0, 0, 0],  # G
        [0, 1, 0, 0, 2, 2, 1, 2, 1, 1],  # T
        [0, 0, 2, 1, 1, 1, 1, 1, 3, 2],  # A
        [0, 0, 1, 3, 2, 1, 0, 0, 2, 2],  # C
        [0, 0, 0, 2, 4, 3, 2, 1, 1, 1],  # G
    ]
    # fill_matrix works in place, so the input grid itself is compared.
    SmithWaterman.fill_matrix(list1, list2, matrix)
    self.assertEqual(matrix, filled_matrix)
def test2(self):
    """fill_matrix populates a zeroed 8 x 7 grid for abcadex vs xxcade."""
    list1 = "abcadex"
    list2 = "xxcade"
    # 8 rows (border + 7 symbols of list1), 7 columns (border + 6 of list2).
    matrix = [[0] * 7 for _ in range(8)]
    #    x  x  c  a  d  e
    filled_matrix = [
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0],  # a
        [0, 0, 0, 0, 0, 0, 0],  # b
        [0, 0, 0, 1, 0, 0, 0],  # c
        [0, 0, 0, 0, 2, 1, 0],  # a
        [0, 0, 0, 0, 1, 3, 2],  # d
        [0, 0, 0, 0, 0, 2, 4],  # e
        [0, 1, 1, 0, 0, 1, 3],  # x
    ]
    # fill_matrix works in place, so the input grid itself is compared.
    SmithWaterman.fill_matrix(list1, list2, matrix)
    self.assertEqual(matrix, filled_matrix)
def process_msa(sentences, i):
    """Align sentences[i] with sentences[i + 1] and file the result in ``msa``.

    The pair is aligned with ``sw.smith_waterman``; the resulting token list
    and its base tokens are looked up in the module-level ``msa`` list via
    ``find_msa``.  When no existing entry is returned, a new one is built
    with ``create_new_msa`` and appended to ``msa``.

    Args:
        sentences: indexable collection of sentences.
        i: index of the first sentence of the pair; ``i + 1`` must be valid.
    """
    a = sentences[i]
    b = sentences[i + 1]
    toklst = sw.smith_waterman(a, b)
    base_tok = sw.print_base_tokens(toklst)
    m = find_msa(msa, base_tok, toklst)
    # Identity test: "== None" would dispatch to any custom __eq__ on the
    # object returned by find_msa.
    if m is None:
        m = create_new_msa(toklst, base_tok)
        # NOTE(review): the append is taken to belong to the "not found"
        # branch (otherwise found entries would be appended twice) - confirm.
        msa.append(m)
class TestScoreMatrix(ATC):
    """Checks the score matrix built for the sequences "AGC" and "ACA"."""

    def setUp(self):
        """Create the aligner and build its score matrix once per test."""
        self.seq1, self.seq2 = "AGC", "ACA"
        self.SW = SmithWaterman(self.seq1, self.seq2)
        self.function_toBeTested = self.SW._build_scoreMatrix
        self.SW._build_scoreMatrix()

    def test_build_scoreMatrix(self):
        """Every cell of the built matrix must equal the expected 4 x 4 grid."""
        expected_rows = [
            [0, 0, 0, 0],
            [0, 2, 1, 2],
            [0, 1, 1, 1],
            [0, 0, 3, 2],
        ]
        expected_scoreMatrix = numpy.array(expected_rows)
        scoreMatrix = self.SW.scoreMatrix
        print("scoreMatrix", scoreMatrix)
        cells_equal = scoreMatrix == expected_scoreMatrix
        self.assertTrue(cells_equal.all())
def test3(self):
    """create_matrix(2, 5) is expected to return 3 rows of 6 zeros."""
    expected = [[0] * 6 for _ in range(3)]
    matrix = SmithWaterman.create_matrix(2, 5)
    self.assertEqual(expected, matrix)
class TestScoreMatrix(ATC):
    """Checks the score matrix built for the sequences "AGC" and "ACA"."""

    def setUp(self):
        """Create the aligner and build its score matrix once per test."""
        self.seq1, self.seq2 = "AGC", "ACA"
        self.SW = SmithWaterman(self.seq1, self.seq2)
        self.function_toBeTested = self.SW._build_scoreMatrix
        self.SW._build_scoreMatrix()

    def test_build_scoreMatrix(self):
        """Every cell of the built matrix must equal the expected 4 x 4 grid."""
        expected_rows = [
            [0, 0, 0, 0],
            [0, 2, 1, 2],
            [0, 1, 1, 1],
            [0, 0, 3, 2],
        ]
        expected_scoreMatrix = numpy.array(expected_rows)
        scoreMatrix = self.SW.scoreMatrix
        print("scoreMatrix", scoreMatrix)
        cells_equal = scoreMatrix == expected_scoreMatrix
        self.assertTrue(cells_equal.all())
def hir(self, X, Y):
    """Handle the base cases of a divide-and-conquer alignment of X and Y.

    Z accumulates the aligned form of X and W the aligned form of Y:
      * X empty: every symbol of Y is paired with a gap ("-").
      * Y empty: every symbol of X is paired with a gap.
      * either has length 1: delegated to ``sm_alg.alignSequence``.

    NOTE(review): the visible snippet ends after the base cases (no general
    case and no return statement), so nothing is returned here - confirm
    against the full source.
    """
    Z = ""
    W = ""
    if len(X) == 0:
        # Iterate over ALL of Y; the former range(1, len(Y)) skipped Y[0].
        for symbol in Y:
            Z = Z + "-"
            W = W + symbol
    elif len(Y) == 0:
        # Same off-by-one fixed here: X[0] must be emitted as well.
        for symbol in X:
            Z = Z + symbol
            W = W + "-"
    elif len(X) == 1 or len(Y) == 1:
        Z, W = sm_alg.alignSequence(AImage=X, BImage=Y)
def subseq_la_search(target, data, matrix, gap_cost, min_score, first_only):
    """Locally align `target` against every chain sequence in `data`.

    For each model/chain, a SmithWaterman object is built for `target` and
    the chain's 'sequence'.  Chains whose best score, as a percentage of the
    maximum score for `target`, is below `min_score` are skipped.  For every
    coordinate pair reported by the aligner the traceback is taken, the
    matching entries of the chain's 'ids' are collected and the alignment is
    printed through `alignment.print_alignment`.

    Args:
        target: sequence to search for.
        data: mapping model -> chain -> dict with 'sequence' and 'ids'.
        matrix: argument handed to SubMatrix.SubMatrix.
        gap_cost: gap cost handed to the aligner.
        min_score: minimum passing score, in percent of the maximum score.
        first_only: when true, stop after the first reported alignment.

    Returns:
        List of (model, chain, resi) tuples, or None when nothing matched.
    """
    # Substitution matrix
    sub_matrix = SubMatrix.SubMatrix(matrix)
    # The maximum score for given target
    max_score = alignment.calculate_max_score(target, sub_matrix)
    match_list = list()
    for model in data.keys():
        for chain in data[model].keys():
            sequence = data[model][chain]['sequence']
            sw = SmithWaterman.SmithWaterman(target, sequence, gap_cost, sub_matrix)
            # Skip if alignment best score is less than minimum passing score
            if float(sw.get_best_score()) / max_score * 100 < min_score:
                continue
            for i, j in sw.get_coordinates():
                aligned_target, aligned_sequence, start_i, start_j =\
                    sw.get_traceback(i, j)
                # start_j - 1 is used as the first index into the chain's 'ids'
                start_pos = start_j - 1
                # One id per non-gap character of the aligned sequence
                for _ in range(0, len(aligned_sequence.replace('-', ''))):
                    resi = data[model][chain]['ids'][start_pos]
                    match_list.append((model, chain, resi))
                    start_pos += 1
                alignment_string, identities, gaps, mismatches = \
                    alignment.create_alignment_string(aligned_target, aligned_sequence)
                alignment.print_alignment(model, chain, target, sequence, sub_matrix.get_name(), gap_cost, sw.get_best_score(), max_score, identities, mismatches, gaps, aligned_target, aligned_sequence, alignment_string, start_i, start_j, data[model][chain]['ids'])
                if first_only:
                    break
            # NOTE(review): the for/else is assumed to belong to the coordinate
            # loop (the source formatting was lost) - confirm.  Read this way,
            # the else runs when that loop was not broken and moves on to the
            # next chain; after a first_only break the chain loop is left too.
            else:
                continue
            break
    return match_list if len(match_list) != 0 else None
class TestAlign2(unittest.TestCase):
    """Alignment of two DNA strings whose expected result has a gap in each."""

    def setUp(self):
        """Create the aligner for the two fixed input sequences."""
        self.seq1, self.seq2 = "TTACCGGCCAACTAA", "ACCGTGTCACTAC"
        self.SW = SmithWaterman(self.seq1, self.seq2)

    def test_align(self):
        """align() must return the expected gapped pair."""
        expected_seq1, expected_seq2 = "ACCG-GCCAACTA", "ACCGTGTCA-CTA"
        aligned_pair = self.SW.align()
        output_seq1, output_seq2 = aligned_pair
        # Diagnostics for failed runs.
        print(self.SW.scoreMatrix)
        print("output seq1:", output_seq1)
        print("output seq2:", output_seq2)
        # The second sequence is checked first, as before.
        self.assertEqual(output_seq2, expected_seq2)
        self.assertEqual(output_seq1, expected_seq1)
class TestAlign(unittest.TestCase):
    """Alignment of AGCACACA against ACACACTA."""

    def setUp(self):
        """Create the aligner for the two fixed input sequences."""
        self.seq1, self.seq2 = "AGCACACA", "ACACACTA"
        self.SW = SmithWaterman(self.seq1, self.seq2)

    def test_align(self):
        """align() must return the expected gapped pair."""
        expected_seq1, expected_seq2 = "AGCACAC-A", "A-CACACTA"
        aligned_pair = self.SW.align()
        output_seq1, output_seq2 = aligned_pair
        # Diagnostics for failed runs.
        print(self.SW.scoreMatrix)
        print(output_seq1)
        print(output_seq2)
        # The second sequence is checked first, as before.
        self.assertEqual(output_seq2, expected_seq2)
        self.assertEqual(output_seq1, expected_seq1)
class TestAlign2(unittest.TestCase):
    """Alignment of two DNA strings whose expected result has a gap in each."""

    def setUp(self):
        """Create the aligner for the two fixed input sequences."""
        self.seq1, self.seq2 = "TTACCGGCCAACTAA", "ACCGTGTCACTAC"
        self.SW = SmithWaterman(self.seq1, self.seq2)

    def test_align(self):
        """align() must return the expected gapped pair."""
        expected_seq1, expected_seq2 = "ACCG-GCCAACTA", "ACCGTGTCA-CTA"
        aligned_pair = self.SW.align()
        output_seq1, output_seq2 = aligned_pair
        # Diagnostics for failed runs.
        print(self.SW.scoreMatrix)
        print("output seq1:", output_seq1)
        print("output seq2:", output_seq2)
        # The second sequence is checked first, as before.
        self.assertEqual(output_seq2, expected_seq2)
        self.assertEqual(output_seq1, expected_seq1)
def test7(self):
    """f(3, 4, ...) on an all-zero 8 x 7 grid for abcxdex vs xxxcde gives 1."""
    list1 = "abcxdex"
    list2 = "xxxcde"
    # Rows: border + a b c x d e x; columns: border + x x x c d e.
    matrix = [[0] * 7 for _ in range(8)]
    cell_score = SmithWaterman.f(3, 4, matrix, list1, list2)
    self.assertEqual(cell_score, 1)
def test1(self):
    """get_max reports [6, 6] for the filled abcxdex / xxxcde matrix."""
    # Sequences the matrix was filled for (kept for reference only).
    list1 = "abcxdex"
    list2 = "xxxcde"
    #    x  x  x  c  d  e
    filled_matrix = [
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],  # a
        [0, 0, 0, 0, 0, 0, 0],  # b
        [0, 0, 0, 0, 2, 1, 0],  # c
        [0, 2, 2, 2, 1, 1, 0],  # x
        [0, 1, 1, 1, 1, 3, 2],  # d
        [0, 0, 0, 0, 0, 2, 5],  # e
        [0, 2, 2, 2, 1, 1, 4],  # x
    ]
    # The largest entry (5) sits at row 6, column 6.
    position = SmithWaterman.get_max(filled_matrix)
    self.assertEqual(position, [6, 6])
def test11(self):
    """f(3, 5, ...) on a partially filled GGACGTACG / TACGGGTAT grid gives 1."""
    list1 = "GGACGTACG"
    list2 = "TACGGGTAT"
    #    T  A  C  G  G  G  T  A  T
    matrix = [
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],  # G
        [0, 0, 0, 0, 1, 2, 2, 1, 0, 0],  # G
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # A
    ]
    # The remaining six rows (C, G, T, A, C, G) are still all zero.
    matrix.extend([0] * 10 for _ in range(6))
    cell_score = SmithWaterman.f(3, 5, matrix, list1, list2)
    self.assertEqual(cell_score, 1)
def test1(self):
    """do_alignment on the abcxdex / xxxcde matrix yields cxde over c-de."""
    list1 = "abcxdex"
    list2 = "xxxcde"
    #    x  x  x  c  d  e
    filled_matrix = [
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],  # a
        [0, 0, 0, 0, 0, 0, 0],  # b
        [0, 0, 0, 0, 2, 1, 0],  # c
        [0, 2, 2, 2, 1, 1, 0],  # x
        [0, 1, 1, 1, 1, 3, 2],  # d
        [0, 0, 0, 0, 0, 2, 5],  # e
        [0, 2, 2, 2, 1, 1, 4],  # x
    ]
    # Three rows: first sequence, match markers, second sequence.
    expected = [list("cxde"), list("| ||"), list("c-de")]
    alignment = SmithWaterman.do_alignment(list1, list2, filled_matrix)
    self.assertEqual(alignment, expected)
def test2(self):
    """do_alignment on the GGACGTACG / TACGGGTAT matrix yields TACG twice."""
    list1 = "GGACGTACG"
    list2 = "TACGGGTAT"
    #    T  A  C  G  G  G  T  A  T
    filled_matrix = [
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],  # G
        [0, 0, 0, 0, 1, 2, 2, 1, 0, 0],  # G
        [0, 0, 1, 0, 0, 1, 1, 1, 2, 1],  # A
        [0, 0, 0, 2, 1, 0, 0, 0, 1, 1],  # C
        [0, 0, 0, 1, 3, 2, 1, 0, 0, 0],  # G
        [0, 1, 0, 0, 2, 2, 1, 2, 1, 1],  # T
        [0, 0, 2, 1, 1, 1, 1, 1, 3, 2],  # A
        [0, 0, 1, 3, 2, 1, 0, 0, 2, 2],  # C
        [0, 0, 0, 2, 4, 3, 2, 1, 1, 1],  # G
    ]
    # Three rows: first sequence, match markers, second sequence.
    expected = [list("TACG"), list("||||"), list("TACG")]
    alignment = SmithWaterman.do_alignment(list1, list2, filled_matrix)
    self.assertEqual(alignment, expected)
def test3(self):
    """do_alignment on GGACGTACG / GGACAAACG marks two mismatches with '.'."""
    list1 = "GGACGTACG"
    list2 = "GGACAAACG"
    #    G  G  A  C  A  A  A  C  G
    filled_matrix = [
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],  # G
        [0, 1, 2, 1, 0, 0, 0, 0, 0, 1],  # G
        [0, 0, 1, 3, 2, 1, 1, 1, 0, 0],  # A
        [0, 0, 0, 2, 4, 3, 2, 1, 1, 0],  # C
        [0, 1, 1, 1, 3, 3, 2, 1, 0, 1],  # G
        [0, 0, 0, 0, 2, 2, 2, 1, 0, 0],  # T
        [0, 0, 0, 1, 1, 3, 3, 3, 2, 1],  # A
        [0, 0, 0, 0, 1, 2, 2, 2, 4, 3],  # C
        [0, 1, 1, 0, 0, 1, 1, 1, 3, 5],  # G
    ]
    # Three rows: first sequence, match markers, second sequence.
    expected = [list("GGACGTACG"), list("||||..|||"), list("GGACAAACG")]
    alignment = SmithWaterman.do_alignment(list1, list2, filled_matrix)
    self.assertEqual(alignment, expected)
def setUp(self):
    """Build the "AGC"/"ACA" aligner and expose its _score_diag method."""
    self.seq1, self.seq2 = "AGC", "ACA"
    aligner = SmithWaterman(self.seq1, self.seq2)
    self.SW = aligner
    self.function_toBeTested = aligner._score_diag
def setUp(self):
    """Build the "AGCA"/"ACAC" aligner and pre-build its score matrix."""
    self.seq1, self.seq2 = "AGCA", "ACAC"
    self.SW = SmithWaterman(self.seq1, self.seq2)
    build = self.SW._build_scoreMatrix
    self.function_toBeTested = build
    # The matrix is built up front so tests can inspect it directly.
    self.SW._build_scoreMatrix()
import sys, SmithWaterman, string

#get filenames from command line args
firstSequence = sys.argv[1]
secondSequence = sys.argv[2]
sequence = [''] * 2


#parse fasta file and strip header info.
def parse_fasta(fasta):
    """Return the sequence stored in the FASTA file at path `fasta`.

    The first line (the header) is discarded; every remaining line is
    stripped of surrounding whitespace and the pieces are concatenated.
    An empty file yields an empty string.
    """
    with open(fasta) as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        return ''.join(line.strip() for line in f)


sequence[0] = parse_fasta(firstSequence)
sequence[1] = parse_fasta(secondSequence)

#run the algorithm
SmithWaterman.calculateAlignment(sequence[0], sequence[1])
def setUp(self):
    """Build the "AG"/"AC" aligner and expose its _score_horizontal method."""
    self.seq1, self.seq2 = "AG", "AC"
    aligner = SmithWaterman(self.seq1, self.seq2)
    self.SW = aligner
    self.function_toBeTested = aligner._score_horizontal
def main():
    """Interactively align the first two sequences found in a FASTA-style file.

    Steps:
      1. Ask for a file name (empty input falls back to "file.txt") and
         collect every record that follows a '>' header line.
      2. Treat the data as DNA when the first sequence contains only
         A/G/T/C, otherwise as protein.
      3. Build the substitution table and gap scores: defaults or
         user-supplied values for DNA, the table read from "blosum62.txt"
         plus user-supplied gap scores for protein.
      4. Run the global (NeedlemanWunsch) or local (SmithWaterman) matrix
         filling, as chosen by the user, and print the score and alignment.
    """

    def uniform_matrix(match_score, mismatch_score):
        # Nucleotide table with one score for matches, one for mismatches.
        letters = ['A', 'G', 'C', 'T']
        return {a: {b: (match_score if a == b else mismatch_score)
                    for b in letters}
                for a in letters}

    seq = []
    filename = raw_input('File with sequences: ')  # Input path to file, containing 2 sequences.
    if filename == '':  # If no input, open default.
        filename = r"file.txt"
    # "with" guarantees the handle is closed even if parsing raises.
    with open(filename) as f:
        line = f.readline()
        while line != '':  # Scanning file for sequences
            if line.startswith('>'):
                line = f.readline()
                tmp = ''
                # Gather lines up to the next header or end of file.
                while not line.startswith('>') and line != '':
                    tmp += line.strip('\n')
                    line = f.readline()
                seq.append(''.join(tmp.upper().split()))
                continue
            line = f.readline()
    print('Found sequences:\n{}\n{}\n'.format(seq[0], seq[1]))

    # Get type of sequence by checking for presence of letters
    if re.search('[^AGTC]', seq[0]) is None:
        seq_type = 'DNA'
    else:
        seq_type = 'Protein'

    if seq_type == 'DNA':
        match = 10  # default match
        mismatch = -8  # default mismatch
        gap = [-10, -1]  # default gap
        print('Default scores:\nMatch {}\nMismatch {}\nGap start {}\nGap extension {}\n'.format(match, mismatch, gap[0], gap[1]))
        flag = ask('Do you want to change it?')
        if flag:
            flag = ask('Do you want to rewrite whole matrix?')
            if flag:
                print('Rewriting whole matrix')
                letter_arr = ['A', 'G', 'C', 'T']
                s_matrix = dict(zip(letter_arr, [dict(), dict(), dict(), dict()]))
                # Only the upper triangle is asked for; the table is mirrored.
                for x in range(len(letter_arr)):
                    for y in range(x, len(letter_arr)):
                        tmp = int(raw_input('{}-{} score: '.format(letter_arr[x], letter_arr[y])))
                        s_matrix[letter_arr[x]][letter_arr[y]] = tmp
                        s_matrix[letter_arr[y]][letter_arr[x]] = tmp
            else:
                print('Rewriting only scores\n')
                match = int(raw_input('Match score: '))  # Scores input.
                mismatch = int(raw_input('Mismatch score: '))
                gap[0] = int(raw_input('Gap start score: '))
                gap[1] = int(raw_input('Gap extension score: '))
                s_matrix = uniform_matrix(match, mismatch)
        else:
            s_matrix = uniform_matrix(match, mismatch)
            print('Working with default')
    else:
        print('Aligning proteins. Using BLOSUM62 matrix')
        gap = [0, 0]
        gap[0] = int(raw_input('Gap start score: '))
        gap[1] = int(raw_input('Gap extension score: '))
        with open(r"blosum62.txt") as f:
            raw_matrix = [row.split() for row in f]
        # Row 0 holds the column letters; row i + 1 holds the scores of the
        # i-th letter.
        header = raw_matrix[0]
        s_matrix = dict()
        for i in range(len(header)):
            s_matrix[header[i]] = dict(zip(header, map(int, raw_matrix[i + 1])))

    flag = ask('Do you want to use global(y) or local(n) alignment?')
    if flag:
        seq[0], seq[1], max_score = NeedlemanWunsch.matrix_filling_NW(seq, s_matrix, gap)  # Getting results.
    else:
        seq[0], seq[1], max_score = SmithWaterman.matrix_filling_SW(seq, s_matrix, gap)  # Getting results.

    # Marker line: ' ' under a gap, '|' under a match, ':' under a mismatch.
    char = [':', '|', ' ']
    markers = []
    for i in range(len(seq[0])):
        if seq[0][i] == '-' or seq[1][i] == '-':
            markers.append(char[2])
            continue
        markers.append(char[int(seq[0][i] == seq[1][i])])
    align = ''.join(markers)
    print('\nScore {}\nResult sequences:\n{}\n{}\n{}'.format(max_score, seq[0], align, seq[1]))
humanAPOE = "CTACTCAGCCCCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACG GGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTCAGGA GAGCTACTCGGGGTCGGGCTTGGGGAGAGGAGGAGCGGGGGTGAGGCAAGCAGCAGGGGACTGGACCTGG GAAGGGCTGGGCAGCAGAGACGACCCGACCCGCTAGAAGGTGGGGTGGGGAGAGCAGCTGGACTGGGATG TAAGCCATAGCAGGACTCCACGAGTTGTCACTATCATTTATCGAGCACCTACTGGGTGTCCCCAGTGTCC TCAGATCTCCATAACTGGGGAGCCAGGGGCAGCGACACGGTAGCTAGCCGTCGATTGGAGAACTTTAAAA TGAGGACTGAATTAGCTCATAAATGGAACACGGCGCTTAACTGTGAGGTTGGAGCTTAGAATGTGAAGGG AGAATGAGGAATGCGAGACTGGGACTGAGATGGAACCGGCGGTGGGGAGGGGGTGGGGGGATGGAATTTG AACCCCGGGAGAGGAAGATGGAATTTTCTATGGAGGCCGACCTGGGGATGGGGAGATAAGAGAAGACCAG GAGGGAGTTAAATAGGGAATGGGTTGGGGGCGGCTTGGTAAATGTGCTGGGATTAGGCTGTTGCAGATAA TGCAACAAGGCTTGGAAGGCTAACCTGGGGTGAGGCCGGGTTGGGGCCGGGCTGGGGGTGGGAGGAGTCC TCACTGGCGGTTGATTGACAGTTTCTCCTTCCCCAGACTGGCCAATCACAGGCAGGAAGATGAAGGTTCT GTGGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGGCGGGGCTTGCTCGGTTCCCCCCGCTCCTC CCCCTCTCATCCTCACCTCAACCTCCTGGCCCCATTCAGGCAGACCCTGGGCCCCCTCTTCTGAGGCTTC TGTGCTGCTTCCTGGCTCTGAACAGCGATTTGACGCTCTCTGGGCCTCGGTTTCCCCCATCCTTGAGATA GGAGTTAGAAGTTGTTTTGTTGTTGTTGTTTGTTGTTGTTGTTTTGTTTTTTTGAGATGAAGTCTCGCTC TGTCGCCCAGGCTGGAGTGCAGTGGCGGGATCTCGGCTCACTGCAAGCTCCGCCTCCCAGGTCCACGCCA TTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCACATGCCACCACACCCGACTAACTTTTTTG TATTTTCAGTAGAGACGGGGTTTCACCATGTTGGCCAGGCTGGTCTGGAACTCCTGACCTCAGGTGATCT GCCCGTTTCGATCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCACCTGGCTGGGAGTTAGAGGT TTCTAATGCATTGCAGGCAGATAGTGAATACCAGACACGGGGCAGCTGTGATCTTTATTCTCCATCACCC CCACACAGCCCTGCCTGGGGCACACAAGGACACTCAATACATGCTTTTCCGCTGGGCGCGGTGGCTCACC CCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGAGGATCACTTGAGCCCAGGAGTTCAACACCAGCCT GGGCAACATAGTGAGACCCTGTCTCTACTAAAAATACAAAAATTAGCCAGGCATGGTGCCACACACCTGT GCTCTCAGCTACTCAGGAGGCTGAGGCAGGAGGATCGCTTGAGCCCAGAAGGTCAAGGTTGCAGTGAACC ATGTTCAGGCCGCTGCACTCCAGCCTGGGTGACAGAGCAAGACCCTGTTTATAAATACATAATGCTTTCC AAGTGATTAAACCGACTCCCCCCTCACCCTGCCCACCATGGCTCCAAAGAAGCATTTGTGGAGCACCTTC 
TGTGTGCCCCTAGGTACTAGATGCCTGGACGGGGTCAGAAGGACCCTGACCCACCTTGAACTTGTTCCAC ACAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCCGAGCTGCGCCAGCAGACCG AGTGGCAGAGCGGCCAGCGCTGGGAACTGGCACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGAC ACTGTCTGAGCAGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGTGAGTGTCCCCA TCCTGGCCCTTGACCCTCCTGGTGGGCGGCTATACCTCCCCAGGTCCAGGTTTCATTCTGCCCCTGTCGC TAAGTCTTGGGGGGCCTGGGTCTCTGCTGGTTCTAGCTTCCTCTTCCCATTTCTGACTCCTGGCTTTAGC TCTCTGGAATTCTCTCTCTCAGCTTTGTCTCTCTCTCTTCCCTTCTGACTCAGTCTCTCACACTCGTCCT GGCTCTGTCTCTGTCCTTCCCTAGCTCTTTTATATAGAGACAGAGAGATGGGGTCTCACTGTGTTGCCCA GGCTGGTCTTGAACTTCTGGGCTCAAGCGATCCTCCCGCCTCGGCCTCCCAAAGTGCTGGGATTAGAGGC ATGAGCCACCTTGCCCGGCCTCCTAGCTCCTTCTTCGTCTCTGCCTCTGCCCTCTGCATCTGCTCTCTGC ATCTGTCTCTGTCTCCTTCTCTCGGCCTCTGCCCCGTTCCTTCTCTCCCTCTTGGGTCTCTCTGGCTCAT CCCCATCTCGCCCGCCCCATCCCAGCCCTTCTCCCCGCCTCCCACTGTGCGACACCCTCCCGCCCTCTCG GCCGCAGGGCGCTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGAGGAACAACT GACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGGAGCTGCAGGCGGCGCAGGCCCGGCTGGGC GCGGACATGGAGGACGTGTGCGGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGA GCACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTAAGCGGCTCCTCCGCGATGC CGATGACCTGCAGAAGCGCCTGGCAGTGTACCAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGC GCCATCCGCGAGCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGGGCTCCCTGG CCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAGCGGCTGCGCGCGCGGATGGAGGAGATGGG CAGCCGGACCCGCGACCGCCTGGACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAG CAGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAGAGCTGGTTCGAGCCCCTGG TGGAAGACATGCAGCGCCAGTGGGCCGGGCTGGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGC CCCTGTGCCCAGCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACCCCGTGCCTC CTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCCCCAGCCGTCCTCCTGGGGTGGACCCTAGT TTAATAAAGATTCACCAAGTTTCACGCA" monkeyAPOE = "ATGAGCTCAGGGGCCTCTAGAAAGATGTAGCTGGGACCTCGGGAAGCCCTGGCCTCCAGGTAGTCTCAGG AGAGCTACTCGAGGTCGGGCTTGGGGATAGGAGGAGCGGGGGTGAGGCCAGCAGCAGGGGACTGGACCTG GTAAGGGCTGGGCAGCAGAGACGACCCGACCCGCTAGAAGGTGGGGTGGGCAGAGCATGTGGACTAGGAG 
CTAAGCCACAGCAGGACCCCCACGAGTTGTCACTGTCATTTATCGAGCACCTACTGGGTGTCCCCAGTGT CCTCAGATCTCCATAACAGGGAAGCCAGGGGCAGCGACACGGTAGCTAGCCGTCGATTGGAGAACTTTAA AATGAGGACTGAATTAGCTCATAAATGGAAAACGGCGCTTAAATGTGAGGTTAGAGCTTAGAATGTGAAG GGAGAATGAGGAATGCGAGACTGGGACTGAGATGGAACCGGCGGTGGGGAGGGGGAGGGGGTGTGGAATT TGAACCCCGGGAGAGAAAGATGGAATTTTGACTATGGAGGCCGACCTGGGGATGGGGAAATAAGAGAAGA CCAGGAGGGAGTTAAATAGGGAATGGGTTGGGGGCGGCTTGGTAACTGTTTGTGCTAGGATTAGGCTGTT GCAGATAATGGAACTAGGCTTGGAAGGCTAACCTGGGGTGGGGCCGGGTTGGGGTCGGGCTGGGAGTGGG AGGAGTCCTTACTGGCGGTTGATTGACTGTTTCTCCCTCCCCAGACTGGCCAATCACAGGCAGGAAGATG AAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGGCGGGGCTTGCTCGGTTTCCCT GCTCCTCCCCCTCTCATCCTCACCTCAACCTCCTGGCCCCATTCAGGCAGACTTCGGGCCCCCTTTTCTT CTGCTGGTCTGTCTTCCCCTTGAGGGGAAAGCCCAGGTCTGAGGCTTCTATGCTGCTTTCTGGCTCAGAA CAGCGATTTGACGCTCTGTGAGCCTCGTTTTCCTGCCCCCGCTTTTTTTTTTTTTTTTTTTTTTTGAGCC AGAGTCTCACTCTGTCGCCCGGCTGGAGTGCAGTGGCACAATCTCAGCTCACTGCAAGCTCCGCCTCCCG GGTTCACGCTATTCTCCCGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCCACCATGCCCGGC TAATTTTTTGTACTTTGAGTAGAGAAGGGGTTTCACTGTATTATCCAGGATGGTCTCTATCTCCTGACCT CGTGATCTGCCCGCCTTGGCCTCCCAAAGTGCTGGAATTACAGGCGTGAGCCACCGCGCCCGGCCTCCCC ATCCTTAATATAGGAGTTAGAAGTTTTTGTTTGTTTTTGTTTTGTTTTGAGATGAAGTCCCTCTGTCGCC CAGGCTGGAGTGCGGTGGCTCCCAGGCTGGAGTTCAGTGGCAGGATCTCAGCTCACTGCAAGCTCCCCCT CCCAGGTTCATGCCATTCTCCTGCCTCAGCCTCCGGAGTAGCTGGGACTACAGGAACATGCCACCACACC TGACTAACTTTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCATGTTGGCCAGGCTGGTCTGGAACTCC TGACCTCAGGTGATCTGCCTGCTTCAACCTCCCAAAGTGCTGGGATTACAGGCGTGGGCCACCGCGCCCA GCTGGGAGTTAAGGGGCTTCTAATGCATTGCATTAGAATACCAGACACGGGACAGCTGTGATCTTTATTC TCTATCACCCCACACAGCCCTGCCTGGGACACACAAGGACACTCAATACATGCTTTTCCGCTAGGCACGG TGGCTCACCCCTGTAATCCCAGCATTTTGGGAGGCCAAGGTGGGAGGATCACTTGAGCCCAGGAGTTCAA CACCAGACTCGGCAACATAGTGAGACTCTGTCTCTACTAAAAATACAAAAATTAGCCAGGCCTGGTGCCA CACACCTGTGGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGGATTGCTTGAGCCCAGAAGGTCAAGGTTG CAGTGAACCATGTTCAGGCCACTGCAATCCAGCCTGGGTGACAGAGCAAGACCCTGTCTGTAAATAAATA ACGCTTTTCAAGTGATTAAACAGACTCCCCCCTCACCCTGCCCACCATGGCTCCAAAGCAGCATTTGTGG 
AGCACCTTCTGTGTGCCCCTAGGTACTAGGTGCCTGGACGGGGTCAGAAGGAACCTGAACCACCTTCAAC TTGTTCCACACAGGATGCCAGGCCAAGGTGGAGCAACCGGTGGAGCCAGAGACGGAACCCGAGCTTCGCC AGCAGGCTGAGGGGCAGAGCGGCCAGCCCTGGGAGCTGGCACTGGGTCGCTTTTGGGATTACCTGCGCTG GGTGCAGACACTGTCTGAGCAGGTGCAGGAGGAGCTGCTCAGCCCCCAGGTCACCCAGGAACTGACGTGA GTGTCCCCATCCCGGCCCTTGACCCTTCTGGTGGGCGGCTATACCTCCCCAGGTCCAGGTTTCATTCTGC CCCTGCCACTAAGTCTTGGGAGTCCTGGGTCTCTGCTGGTTCTAGCTTCCTCTTCCCATTTCTGACTCCT GGCTTTAGCTCTCTGGAATTCTCTCTCTCAGTTCTGTTTCTCCCTCTTCCCTTCTGACTCAGCCTCTCAC ACTCGTCCTGGCTCTGTCTCTGTCCTTCACTAGCTCTTTTATATAGAGACAGAGAGATGGGGTCTCACTG TGTTGCCCAGGCTGGTCTTGAACTTCTGGGCTCAAGCGATCCTCCCACCTCGCCTCCCAAAGTGCTGGGA ATAAAGACATGAGCCACCTTGCCCGGCCTCCTAGCTCTTTCTTCGTCTCTGCCTCTGCTCTCTGCGTCTG TCTTTGTCTCCTCTCTGCCTCTGTCCCGTTCCTTCTCTCTTGGTTCACTGCCCTTCTGTCTCTCCCTGTT CTCCTTAGGAGACTCTCCTCTCTTCCTTCTCGGGTCTCTCTGGCTGATCCCCATCTCACCCACACCTATC CCAGCCCTTCTCGCCTCCCCCTGTGCGCACACCCTCCTGCTCTTTCGGCTGCAGGACGCTGATGGATGAG ACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGAGGAACAGCTGAGCCCGGTGGCGGAGGAGACGC GGGCACGGCTGTCCAAGGAGCTACAGGCGGCGCAGGCCCGGCTGGGTGCCGACATGGAGGACGTGCGCAG CCGCCTGGTGCAGTACCGCAGCGAGGTGCAGGCCATGCTGGGCCAGAGTACCGAGGAGCTGCGGGCGCGC CTCGCCTCCCACCTGCGCAAGCTGCGCAAGCGGCTCCTCCGCGATGCTGATGACCTGCAGAAGCGCCTGG CAGTGTATCAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGGGTCAGCGCCATCCGCGAGCGCCTGGGACC CCTGGTGGAGCAGGGCCGCGTGCGGGCCGCCACTGTGGGCTCCCTGGCCAGCCAGCCGCTTCAGGAGCGG GCCCAGGCCTTGGGTGAGCGGCTTCGCGCACGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCTGG ACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAACAGGCCCAGCAGATAAGCCTGCA GGCCGAGGCCTTCCAGGCCCGCCTCAAGAGCTGGTTCGAGCCCCTGGTGGAAGATATGCAGCGCCAGTGG GCTGGGCTGGTGGAGAAGGTGCAGGCTGCCGTGGGCGCCAGCACCGCCCCTGTGCCCAGCGACAATCACT GAACGCCCAGGCCTACAGCCATGCGACCCGACTCCACCCCATGCCTCCTCTCTCCGCTCAGCCTGCAGCG GGAGACCCTGTCCCCACCCCAGCCGTCCTCCAGGGGTGGGCCCTAGTTTAATAAAGATTCGCCAAGTTTC ACCGCA" humanInsulin = "AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGGTCTGTTCCAAGGGCCTTTGCGTCAGGT GGGCTCAGGATTCCAGGGTGGCTGGACCCCAGGCCCCAGCTCTGCAGCAGGGAGGACGTGGCTGGGCTCG 
TGAAGCATGTGGGGGTGAGCCCAGGGGCCCCAAGGCAGGGCACCTGGCCTTCAGCCTGCCTCAGCCCTGC CTGTCTCCCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTG GCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAG CTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCT GCAGGGTGAGCCAACTGCCCATTGCTGCCCCTGGCCGCCCCCAGCCACCCCCTGCTCCTGGCGCTCCCAC CCAGCATGGGCAGAAGGGGGCAGGAGGCTGCCACCCAGCAGGGGGTCAGGTGCACTTTTTTAAAAAGAAG TTCTCTTGGTCACGTCCTAAAAGTGACCAGCTCCCTGTGGCCCAGTCAGAATCTCAGCCTGAGGACGGTG TTGGCTTCGGCAGCCCCGAGATACATCAGAGGGTGGGCACGCTCCTCCCTCCACTCGCCCCTCAAACAAA TGCCCCGCAGCCCATTTCTCCACCCTCATTTGATGACCGCAGATTCAAGTGTTTTGTTAAGTAAAGTCCT GGGTGACCTGGGGTCACAGGGTGCCCCACGCTGCCTGCCTCTGGGCGAACACCCCATCACGCCCGGAGGA GGGCGTGGCTGCCTGCCTGAGTGGGCCAGACCCCTGTCGCCAGGCCTCACGGCAGCTCCATAGTCAGGAG ATGGGGAAGATGCTGGGGACAGGCCCTGGGGAGAAGTACTGGGATCACCTGTTCAGGCTCCCACTGTGAC GCTGCCCCGGGGCGGGGGAAGGAGGTGGGACATGTGGGCGTTGGGGCCTGTAGGTCCACACCCAGTGTGG GTGACCCTCCCTCTAACCTGGGTCCAGCCCGGCTGGAGATGGGTGGGAGTGCGACCTAGGGCTGGCGGGC AGGCGGGCACTGTGTCTCCCTGACTGTGTCCTCCTGTGTCCCTCTGCCTCGCCGCTGTTCCGGAACCTGC TCTGCGCGGCACGTCCTGGCAGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGCAGC CCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTACCAGCATCTGCTCCCT CTACCAGCTGGAGAACTACTGCAACTAGACGCAGCCCGCAGGCAGCCCCACACCCGCCGCCTCCTGCACC GAGAGAGATGGAATAAAGCCCTTGAACCAGC" hamsterInsulin = "ATGGCCCTGTGGATGCGCCTTCTGCCCCTGTTGGCCCTGCTGGCCCTCTGGGAGCCGAACCCTGCCCAGG CTTTTGTCAACCAGCACCTTTGTGGCTCCCACCTTGTGGAGGCACTCTACCTGGTGTGTGGGGAGCGTGG CTTCTTCTACACACCCAAATCCCGTCGTGGAGTGGAGGACCCACAAGGTGAGTTCTGCCCCTGAATTCTG TCCCCAGTGCTTGCCACCCTGGTTTTCCTTGCCCACAGGACCTCACAGATTATGTCCTGGGTGTGGAGGG TCTCAGAGGAACTGGGCAGGGGCACATTTCCGTGGGAAGCTAGACATAGCTAAACACAGCGGCTGCTGGG AATGAATGAGAATCCTGCCTTGAGGCTCCTAGGTGCAGACATGTGGGCAGGCCCCAGGATAGGCACCTAT TTGGGGCCGCCATAAAACACTAGGGGTTGGTGGCAGGATGCGTAGGCTTTAGAGCTCTTTGTGTCCATGC CCGGTGACTTGTCCCACATACTGACTTAGCAGGAGAGACAAGGTGAGAGGAAGCCTGGGGTAGGCAGGAG GCTGCTCAGCTGCCCCTGACTGGATTGTCCCATGTGTCTTTGCTTCTATGTTGCTGACACTCTGGCTTGC 
TCTGACACTGCCTCCCTGGCAGTGACACAGCTGGAGCTGGGTGGCGGCCCTGGAGCAGGTGACCTTCAGA CCTTGGCACTGGAGGTGGCCCAGCAGAAGCGCGGCATTGTGGATCAGTGCTGCACCAGCATCTGCTCGCT CTACCAGCTAGAGAACTACTGCAACTAG" scores = [ [10, -1, -1], [2, -6, -2], [2, -2, -5] ] for score in scores: wunsch = nw.NeedlemanWunsch(score[0], score[1], score[2]) smith = sw.SmithWaterman(score[0], score[1], score[2]) print(score) print("Needleman-Wunsch") print("\n APOE \n") wunsch.solve(humanAPOE, monkeyAPOE) print("\n INS \n") wunsch.solve(humanInsulin, hamsterInsulin) print("\nSmith-Waterman") print("\n APOE \n") smith.solve(humanAPOE, monkeyAPOE) print("\n INS \n") smith.solve(humanInsulin, hamsterInsulin)
def setUp(self):
    """Create the aligner for TTACCGGCCAACTAA and ACCGTGTCACTAC."""
    self.seq1, self.seq2 = "TTACCGGCCAACTAA", "ACCGTGTCACTAC"
    self.SW = SmithWaterman(self.seq1, self.seq2)
def test4(self):
    """create_matrix(5, 2) is expected to return 6 rows of 3 zeros."""
    expected = [[0] * 3 for _ in range(6)]
    matrix = SmithWaterman.create_matrix(5, 2)
    self.assertEqual(expected, matrix)
def setUp(self):
    """Create the aligner for AGCACACA and ACACACTA."""
    self.seq1, self.seq2 = "AGCACACA", "ACACACTA"
    self.SW = SmithWaterman(self.seq1, self.seq2)
import SmithWaterman
import AlignmentTools

# Obtain both inputs through SmithWaterman.get_file_data (presumably it
# prompts with the given text - verify against that module).
file1 = SmithWaterman.get_file_data("Enter first file name: ")
file2 = SmithWaterman.get_file_data("Enter second file name: ")

# Work on element lists and remember their sizes for the matrix.
file1list, file2list = list(file1), list(file2)
file1len, file2len = len(file1list), len(file2list)

# Create the matrix, fill it, then derive the alignment from it.
matrix = SmithWaterman.create_matrix(file1len, file2len)
AlignmentTools.fill_matrix(file1list, file2list, matrix)
alignment = SmithWaterman.do_alignment(file1list, file2list, matrix)

# Print the first three rows of the result, each joined into one string.
for row_index in range(3):
    print(''.join(alignment[row_index]))
def test4(self):
    """f(1, 1, ...) with a 4 in the top-left cell of a 3 x 3 grid gives 3."""
    list1 = "bb"
    list2 = "cb"
    matrix = [[0] * 3 for _ in range(3)]
    matrix[0][0] = 4
    cell_score = SmithWaterman.f(1, 1, matrix, list1, list2)
    self.assertEqual(cell_score, 3)
import sys, SmithWaterman, string

#get filenames from command line args
firstSequence = sys.argv[1]
secondSequence = sys.argv[2]
sequence = [''] * 2


#parse fasta file and strip header info.
def parse_fasta(fasta):
    """Return the sequence stored in the FASTA file at path `fasta`.

    The first line (the header) is discarded; every remaining line is
    stripped of surrounding whitespace and the pieces are concatenated.
    An empty file yields an empty string.
    """
    with open(fasta) as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        return ''.join(line.strip() for line in f)


sequence[0] = parse_fasta(firstSequence)
sequence[1] = parse_fasta(secondSequence)

#run the algorithm
SmithWaterman.calculateAlignment(sequence[0], sequence[1])
def setUp(self):
    """Build the 'AG'/'AC' aligner and expose its _score_diag method."""
    self.seq1, self.seq2 = 'AG', 'AC'
    aligner = SmithWaterman(self.seq1, self.seq2)
    self.SW = aligner
    self.test_function = aligner._score_diag
def setUp(self):
    """Create an aligner over two empty sequences."""
    empty = ''
    self.SW = SmithWaterman(empty, empty)
def align(ref, match, matrix, algorithm, gapOpen, gapExtend, ksdsspCache,
          ssMatrix=defaults[SS_SCORES], ssFraction=defaults[SS_MIXTURE],
          gapOpenHelix=defaults[HELIX_OPEN], gapOpenStrand=defaults[STRAND_OPEN],
          gapOpenOther=defaults[OTHER_OPEN], computeSS=defaults[COMPUTE_SS]):
    """Align `ref` and `match` with the requested algorithm.

    `matrix` names an entry of SmithWaterman.matrices.  `algorithm` is "nw"
    (NeedlemanWunsch.nw) or "sw" (SmithWaterman.align).  When a secondary
    structure fraction is in use and `computeSS` is true, ksdssp is run on
    the molecules of `ref`/`match` that are not yet in `ksdsspCache`, and
    those molecules are added to the cache.

    Returns a (score, gappedRef, gappedMatch) tuple.

    Raises ValueError for an unknown `algorithm`, or when the ungapped
    Smith-Waterman result cannot be found inside the original sequence.
    """
    similarityMatrix = SmithWaterman.matrices[matrix]
    # Local copies: the "sw" branch may normalise these two values.
    ssf = ssFraction
    ssm = ssMatrix
    if ssf is not None and ssf is not False and computeSS:
        # Collect the molecules that have not been through ksdssp yet.
        needCompute = []
        if ref.molecule not in ksdsspCache:
            needCompute.append(ref.molecule)
            ksdsspCache.add(ref.molecule)
        if match.molecule not in ksdsspCache:
            needCompute.append(match.molecule)
            ksdsspCache.add(match.molecule)
        if needCompute:
            # Imported only when a computation is actually needed.
            from chimera.initprefs import ksdsspPrefs, \
                KSDSSP_ENERGY, KSDSSP_HELIX_LENGTH, \
                KSDSSP_STRAND_LENGTH
            from Midas import ksdssp
            ksdssp(needCompute, energy=ksdsspPrefs[KSDSSP_ENERGY], helixLen=ksdsspPrefs[KSDSSP_HELIX_LENGTH], strandLen=ksdsspPrefs[KSDSSP_STRAND_LENGTH])
    if algorithm == "nw":
        # Gap penalties are handed to nw() negated.
        score, seqs = NeedlemanWunsch.nw(ref, match, scoreGap=-gapExtend, scoreGapOpen=0-gapOpen, similarityMatrix=similarityMatrix, returnSeqs=True, ssMatrix=ssMatrix, ssFraction=ssFraction, gapOpenHelix=-gapOpenHelix, gapOpenStrand=-gapOpenStrand, gapOpenOther=-gapOpenOther)
        gappedRef, gappedMatch = seqs
    elif algorithm =="sw":
        # Result containers; the sequence name is appended to the molecule
        # name unless it starts with "principal".
        refName = ref.molecule.name
        if not ref.name.startswith("principal"):
            refName += ", " + ref.name
        gappedRef = StructureSequence(ref.molecule, refName)
        matchName = match.molecule.name
        if not match.name.startswith("principal"):
            matchName += ", " + match.name
        gappedMatch = StructureSequence(match.molecule, matchName)
        def ssLet(r):
            # One-letter secondary structure code of residue r:
            # ' ' missing residue, 'H' helix, 'S' strand, 'O' anything else.
            if not r:
                return ' '
            if r.isHelix:
                return 'H'
            elif r.isStrand:
                return 'S'
            return 'O'
        if ssf is False or ssf is None:
            # Secondary structure scoring disabled.
            ssf = 0.0
            ssm = None
        if ssm:
            # account for missing structure (blank SS letter)
            ssm = ssm.copy()
            for let in "HSO ":
                ssm[(let, ' ')] = 0.0
                ssm[(' ', let)] = 0.0
        score, alignment = SmithWaterman.align(str(ref), str(match), similarityMatrix, float(gapOpen), float(gapExtend), gapChar=".", ssMatrix=ssm, ssFraction=ssf, gapOpenHelix=float(gapOpenHelix), gapOpenStrand=float(gapOpenStrand), gapOpenOther=float(gapOpenOther), ss1="".join([ssLet(r) for r in ref.residues]), ss2="".join([ssLet(r) for r in match.residues]))
        gappedRef.extend(alignment[0])
        gappedMatch.extend(alignment[1])
        # Smith-Waterman may not be entirety of sequences...
        for orig, gapped in [(ref, gappedRef), (match, gappedMatch)]:
            ungapped = gapped.ungapped()
            # Find the first offset i where the ungapped result occurs in
            # the original; the loop's else fires when no offset matched.
            for i in range(len(orig) - len(ungapped) + 1):
                if ungapped == orig[i:i+len(ungapped)]:
                    break
            else:
                raise ValueError("Smith-Waterman result not"
                    " a subsequence of original sequence")
            # Attach the matching residues and map each one to its index.
            gapped.residues = orig.residues[i:i+len(ungapped)]
            resMap = {}
            gapped.resMap = resMap
            for j in range(len(ungapped)):
                resMap[gapped.residues[j]] = j
    else:
        raise ValueError("Unknown sequence alignment algorithm: %s" % algorithm)
    return score, gappedRef, gappedMatch