def sumOfPairs(self):
    """Return the sum-of-pairs score of the multiple sequence alignment.

    Every unordered pair of rows in ``self.MSA`` is compared column by
    column. 'X' is treated as the gap placeholder:

    * both rows hold 'X'      -> the column contributes nothing;
    * exactly one row holds 'X' -> ``self.gapOpenCost`` is added;
    * neither row holds 'X'   -> the score returned by
      ``FriendClass.getSubsMatScore`` for the two characters is added
      (looked up with ``self.subsMat`` and ``self.gapOpenCost``).
    """
    scorer = FriendClass()
    total = 0

    for row_a, first in enumerate(self.MSA):
        for row_b, second in enumerate(self.MSA):
            # Score every unordered pair once, and never a row with itself.
            if row_b <= row_a:
                continue

            # All rows are expected to have the same length; the length of
            # the second row drives the column loop.
            for col in range(len(second)):
                gap_in_first = first[col] == 'X'
                gap_in_second = second[col] == 'X'

                if gap_in_first and gap_in_second:
                    continue
                if gap_in_first or gap_in_second:
                    total += self.gapOpenCost
                else:
                    # Both rows carry a residue in this column.
                    total += scorer.getSubsMatScore(first[col], second[col],
                                                    self.subsMat,
                                                    self.gapOpenCost)
    return total
def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
    """Build and print a UPGMA/WPGMA tree for the sequences of a FASTA file.

    Steps: parse the file, force the gap cost to be non-positive, validate
    every sequence, delegate tree construction to ``self.UandWpgma`` and
    print the result through ``self.printTree``.

    Args:
        seq_fasta_file: path of the multi-sequence FASTA file.
        subst_matrix_fn: substitution matrix selector, passed through to
            ``UandWpgma``.
        cost_gap_open: gap cost; a positive value is negated (with a
            console message).
        clustering: clustering selector, passed through to ``UandWpgma``
            and ``printTree``.

    Returns:
        ``(newick, newickNoDistance, newickIds)`` as produced by
        ``UandWpgma`` (its distance matrix is not returned).

    The process exits with status 1 when the parser yields no records and
    with status 11 when a sequence fails validation.
    """
    helper = FriendClass()

    # 'record' holds one entry (with .id and .seq) per sequence in the file.
    record = helper.parseMultSequenceFastaFile(seq_fasta_file)

    # An empty result means the FASTA file could not be parsed.
    if len(record) == 0:
        print("You have a problem with your FASTA file. "
              "Hint: check if the first character is '>'")
        sys.exit(1)  # error code 1

    # Gap costs are penalties: flip the sign of a positive value.
    if cost_gap_open > 0:
        print("Your gap cost is positive. "
              "I assume you want it to be negative, I have added a minus")
        cost_gap_open = -cost_gap_open

    num_sequences = len(record)

    # Collect ids and plain-string sequences, validating each sequence as
    # soon as it has been collected.
    ids = []
    s = []
    for pos in range(num_sequences):
        entry = record[pos]
        ids.append(entry.id)
        s.append(str(entry.seq))  # Bio.Seq.Seq -> str
        if helper.validateAminoSequence(s[pos]) == 0:
            print("You have invalid character(s) in your FASTA file")
            sys.exit(11)  # error code 11

    # Build the tree selected by 'clustering'.
    tree = self.UandWpgma(ids, s, num_sequences, subst_matrix_fn,
                          cost_gap_open, clustering)
    newick, newickNoDistance, distanceMatrix, newickIds = tree

    # Show the tree on the console.
    self.printTree(newickIds, clustering)

    return newick, newickNoDistance, newickIds
def buildMatrices(self, s1, s2, subst_matrix_fn, gap_open_cost,
                  gap_extend_cost, g):
    """Fill the three score matrices of an affine-gap global alignment.

    Sequence 1 indexes the rows and sequence 2 the columns. All matrices
    have shape ``(len(s1) + 1, len(s2) + 1)`` and integer dtype.

    * ``D[i, j]`` - best score of aligning ``s1[:i]`` with ``s2[:j]``.
    * ``P[i, j]`` - best score when the alignment ends with ``s1[i - 1]``
      opposite a gap (vertical move).
    * ``Q[i, j]`` - best score when the alignment ends with ``s2[j - 1]``
      opposite a gap (horizontal move).

    Args:
        s1, s2: the sequences to align.
        subst_matrix_fn: substitution matrix selector handed to
            ``FriendClass.getSubsMatScore``.
        gap_open_cost: accepted for interface compatibility; not read here.
        gap_extend_cost: score added for every additional gap position.
        g: score of a gap of length one (used for the first gap position
            on the borders and whenever a gap is opened from ``D``).

    Returns:
        ``(D, P, Q, optimalScore)`` with ``optimalScore`` being the
        bottom-right entry of ``D``. No traceback matrix is built; the
        traceback is recomputed from ``D``, ``P`` and ``Q``.
    """
    s1_length = len(s1)
    s2_length = len(s2)

    # Large negative stand-in for minus infinity (the matrices are integer
    # typed, so a real infinity cannot be stored).
    inf = -60000

    shape = (s1_length + 1, s2_length + 1)
    D = np.zeros(shape, dtype=int)
    P = np.zeros(shape, dtype=int)
    Q = np.zeros(shape, dtype=int)

    fr = FriendClass()

    # Border initialisation. The first gap position costs g, every further
    # one gap_extend_cost. The guards keep an empty sequence from indexing
    # past the single row/column that exists in that case.
    if s2_length > 0:
        D[0, 1] = g
        P[0, 1] = inf
    if s1_length > 0:
        D[1, 0] = g
        Q[1, 0] = inf

    for i in range(2, s1_length + 1):
        D[i, 0] = D[i - 1, 0] + gap_extend_cost
        # P[i, 0] is never read by the recurrences below.
        Q[i, 0] = inf

    for j in range(2, s2_length + 1):
        D[0, j] = D[0, j - 1] + gap_extend_cost
        P[0, j] = inf
        # Q[0, j] is never read by the recurrences below.

    for i in range(1, s1_length + 1):
        for j in range(1, s2_length + 1):
            # Vertical move: open a new gap from D one row up, or extend
            # the gap already tracked in P one row up.
            P[i, j] = max(D[i - 1, j] + g, P[i - 1, j] + gap_extend_cost)

            # Horizontal move: open a new gap from D one column to the
            # left, or extend the gap already tracked in Q.
            Q[i, j] = max(D[i, j - 1] + g, Q[i, j - 1] + gap_extend_cost)

            # Diagonal move: match/mismatch of s1[i - 1] with s2[j - 1].
            substitution = D[i - 1, j - 1] + fr.getSubsMatScore(
                s1[i - 1], s2[j - 1], subst_matrix_fn, gap_extend_cost)

            D[i, j] = max(substitution, P[i, j], Q[i, j])

    optimalScore = D[s1_length, s2_length]
    return D, P, Q, optimalScore
def buildMatrices(self, s1, s2, subst_matrix_fn, gap_cost):
    """Fill the Needleman-Wunsch score matrix and the traceback matrix.

    Sequence 1 indexes the rows and sequence 2 the columns. The score
    matrix has shape ``(len(s1) + 1, len(s2) + 1)``; the traceback matrix
    has shape ``(len(s1), len(s2))`` and is therefore addressed with
    ``[i - 1, j - 1]``.

    Each traceback cell is a bit mask of the moves that reach the optimum:
    1 = gap in sequence 1, 2 = gap in sequence 2, 4 = substitution. Ties
    produce sums such as 3, 5, 6 or 7.

    Args:
        s1, s2: the sequences to align.
        subst_matrix_fn: substitution matrix selector handed to
            ``FriendClass.getSubsMatScore``.
        gap_cost: score added for every gap position (linear gap costs).

    Returns:
        ``(traceback, optimalScore)``; ``optimalScore`` is the bottom-right
        entry of the score matrix.
    """
    rows = len(s1)
    cols = len(s2)

    scores = np.zeros((rows + 1, cols + 1), dtype=int)
    traceback = np.zeros((rows, cols), dtype=int)
    scorer = FriendClass()

    # Borders: aligning a prefix against nothing costs one gap per symbol.
    for i in range(1, rows + 1):
        scores[i, 0] = scores[i - 1, 0] + gap_cost
    for j in range(1, cols + 1):
        scores[0, j] = scores[0, j - 1] + gap_cost

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            # Gap in sequence 1 (horizontal move).
            seq1_gap = scores[i, j - 1] + gap_cost
            # Gap in sequence 2 (vertical move).
            seq2_gap = scores[i - 1, j] + gap_cost
            # Match/mismatch; the strings are 0-based, the matrix 1-based.
            substitution = scores[i - 1, j - 1] + scorer.getSubsMatScore(
                s1[i - 1], s2[j - 1], subst_matrix_fn, gap_cost)

            best = max(seq1_gap, seq2_gap, substitution)
            scores[i, j] = best

            # Record every direction that attains the optimum; the three
            # checks are independent on purpose so that ties accumulate.
            directions = 0
            if seq1_gap == best:
                directions += 1
            if seq2_gap == best:
                directions += 2
            if substitution == best:
                directions += 4
            traceback[i - 1, j - 1] += directions

    optimalScore = scores[rows, cols]
    return traceback, optimalScore
def run(self, seq_fasta_fn):
    """Fold the first RNA sequence of a FASTA file with the Nussinov algorithm.

    Args:
        seq_fasta_fn: path to fasta file containing sequence

    Returns:
        tuple of (N: the matrix returned by ``self.build_matrix``,
                  dot_bracket: the value returned by ``self.printer`` -
                  presumably the dot-bracket string of the folding)

    The process exits with status 1 when the parser yields no records and
    with status 11 when the sequence fails RNA validation.
    """
    helper = FriendClass()

    # 'record' holds one entry (with .id and .seq) per sequence in the
    # file; only the first entry is folded.
    record = helper.parseMultSequenceFastaFile(seq_fasta_fn)

    # An empty result means the FASTA file could not be parsed.
    if len(record) == 0:
        print("You have a problem with your FASTA file. "
              "Hint: check if the first character is '>'")
        sys.exit(1)  # error code 1

    first = record[0]
    id1 = first.id
    s1 = str(first.seq)  # Bio.Seq.Seq -> str

    # Reject sequences containing non-RNA characters.
    if helper.validateRNASequence(s1) == 0:
        print("You have invalid character(s) in your file")
        sys.exit(11)  # error code 11

    # Build the Nussinov matrix.
    N = self.build_matrix(s1)

    # Trace back over the range (0, len(s1)); per the original notes the
    # traceback is kept in the class variables trace_list and indices_list.
    self.tracebackInN(N, s1, 0, len(s1))

    dot_bracket = self.printer(s1, id1)
    return N, dot_bracket
def run(self, seq1_fasta_file, seq2_fasta_file, subst_matrix_fn,
        cost_gap_open, complete_traceback):
    """Align the first sequences of two FASTA files with Needleman-Wunsch.

    Steps: parse both files, validate both sequences, force the gap cost to
    be non-positive, build the traceback matrix, enumerate the optimal
    alignments and print them through ``self.printer``.

    Args:
        seq1_fasta_file, seq2_fasta_file: paths of the two FASTA files.
        subst_matrix_fn: substitution matrix selector, passed through.
        cost_gap_open: gap cost; a positive value is negated (with a
            console message).
        complete_traceback: when equal to ``False`` only one randomly
            chosen optimal alignment is kept; otherwise all are kept.

    Returns:
        ``(id1, s1, id2, s2, optimalScore, alignment_strings,
        num_alignments)``. ``num_alignments`` always counts all optimal
        alignments, even when only one is kept.

    The process exits with status 1 when either file yields no records,
    11 when the first sequence is invalid and 12 when the second one is.
    """
    # record1 / record2 hold the parsed entries of file 1 / file 2.
    record1, record2 = FriendClass().parseFastaFiles(seq1_fasta_file,
                                                     seq2_fasta_file)

    # An empty result means the corresponding file could not be parsed.
    if len(record1) == 0 or len(record2) == 0:
        print("You have a problem with one of your FASTA files. "
              "Hint: check if the first character is '>'")
        sys.exit(1)  # error code 1

    id1 = record1[0].id
    s1 = str(record1[0].seq)  # Bio.Seq.Seq -> str

    validator = FriendClass()
    if validator.validateAminoSequence(s1) == 0:
        print("You have invalid character(s) in your 1st file")
        sys.exit(11)  # error code 11

    id2 = record2[0].id
    s2 = str(record2[0].seq)  # Bio.Seq.Seq -> str

    if validator.validateAminoSequence(s2) == 0:
        print("You have invalid character(s) in your 2nd file")
        sys.exit(12)  # error code 12

    # Gap costs are penalties: flip the sign of a positive value.
    if cost_gap_open > 0:
        print("Your gap cost is positive. "
              "I assume you want it to be negative, I have added a minus")
        cost_gap_open = -cost_gap_open

    traceback, optimalScore = self.buildMatrices(s1, s2, subst_matrix_fn,
                                                 cost_gap_open)

    alignment_strings = self.getAlignmentsFromTracebacks(s1, s2, traceback)
    num_alignments = len(alignment_strings)

    # Equality (not truthiness) is kept on purpose so that values such as
    # None still select the complete traceback.
    if complete_traceback == False:  # noqa: E712
        pick = random.randint(0, num_alignments - 1)
        alignment_strings = [alignment_strings[pick]]

    # Print the alignment(s) on the console.
    self.printer(alignment_strings, num_alignments, optimalScore,
                 complete_traceback, id1, id2, s1, s2, subst_matrix_fn)

    return (id1, s1, id2, s2, optimalScore, alignment_strings,
            num_alignments)
def similarityToDistance(self, s_ab, a, b, nw, alignment, subsMat,
                         gapOpenCost):
    """Convert the similarity score of two sequences into a distance.

    The distance is ``-log(s_eff)`` with
    ``s_eff = (s_ab - s_rand) / (s_max - s_rand)``, where ``s_max`` is the
    mean of the self-alignment scores of ``a`` and ``b`` and ``s_rand`` is
    an estimate of the score of a random alignment (see the link in the
    body).

    Args:
        s_ab: similarity score of the alignment of ``a`` and ``b``.
        a, b: the two unaligned sequences (strings).
        nw: object whose ``buildMatrices(x, y, subsMat, gapOpenCost)``
            returns ``(traceback, score)``.
        alignment: the alignment of ``a`` and ``b``; entries 0 and 2 are
            the gapped sequences ('-' marks a gap).
        subsMat: substitution matrix selector.
        gapOpenCost: linear gap cost.

    Returns:
        The distance ``d``.

    Raises:
        ZeroDivisionError: if the alignment is empty or ``s_max`` equals
            ``s_rand``.
        ValueError: raised by ``math.log`` when ``s_eff`` is not positive.

    The result is not deterministic: both sequences are shuffled with the
    global ``random`` state.
    """
    # 1. Calculate S(a,b)_rand using the formula on this page, but with
    # linear gap costs:
    # http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Feng-Doolittle
    L = len(alignment[0])  # all entries of the alignment share this length
    N_g = alignment[0].count('-') + alignment[2].count('-')

    fr = FriendClass()

    # Shuffled copies of a and b provide the residues that get scored.
    list_a = list(a)
    list_b = list(b)
    random.shuffle(list_a)
    random.shuffle(list_b)
    rand_a = "".join(list_a)
    rand_b = "".join(list_b)

    # The occurrence counts depend only on the position, not on the pair,
    # so they are computed once instead of once per (i, j) combination.
    counts_b = [b.count(y) for y in b]

    # NOTE(review): the referenced formula appears to sum over residue
    # types, whereas this loop runs over positions and scores shuffled
    # residues - confirm that this is the intended estimate.
    sum_xy = 0
    for i, x in enumerate(a):
        Na_x = a.count(x)
        for j, Nb_y in enumerate(counts_b):
            s_xy = fr.getSubsMatScore(rand_a[i], rand_b[j], subsMat,
                                      gapOpenCost)
            sum_xy += Na_x * Nb_y * s_xy

    s_ab_rand = (sum_xy / L) + (N_g * gapOpenCost)

    # 2. Calculate s_ab_max from the two self-alignments; only the scores
    # are needed, the traceback matrices are discarded.
    _, s_aa = nw.buildMatrices(a, a, subsMat, gapOpenCost)
    _, s_bb = nw.buildMatrices(b, b, subsMat, gapOpenCost)
    s_ab_max = (s_aa + s_bb) / 2

    # s_ab_eff is the normalized similarity (expected to lie in (0, 1];
    # values outside that range are not guarded against here).
    s_ab_eff = (s_ab - s_ab_rand) / (s_ab_max - s_ab_rand)

    d = -math.log(s_ab_eff)
    return d
def run(self, seq_fasta_file, subst_matrix_fn, cost_gap_open, clustering):
    """Compute a multiple sequence alignment and its sum-of-pairs score.

    Steps: parse the FASTA file, force the gap cost to be non-positive,
    validate every sequence, build the guide tree with ``Xpgma.UandWpgma``,
    derive the alignment from the distance-free Newick string via
    ``self.processNewickString``, score it with ``self.sumOfPairs`` and
    print the result through ``self.printer``.

    Args:
        seq_fasta_file: path of the multi-sequence FASTA file.
        subst_matrix_fn: substitution matrix selector; stored in
            ``self.subsMat``.
        cost_gap_open: gap cost; a positive value is negated (with a
            console message). The normalised value is stored in
            ``self.gapOpenCost``.
        clustering: clustering selector passed to ``UandWpgma``.

    Returns:
        ``(SOP, newick, newickNoDistance)``.

    The process exits with status 1 when the parser yields no records and
    with status 11 when a sequence fails validation.
    """
    self.subsMat = subst_matrix_fn

    fr = FriendClass()

    # 'record' holds one entry (with .id and .seq) per sequence in the file.
    record = fr.parseMultSequenceFastaFile(seq_fasta_file)

    # An empty result means the FASTA file could not be parsed.
    if len(record) == 0:
        print(
            "You have a problem with your FASTA file. Hint: check if the first character is '>'"
        )
        sys.exit(1)  # error code 1

    # Gap costs are penalties: flip the sign of a positive value.
    if cost_gap_open > 0:
        print(
            "Your gap cost is positive. I assume you want it to be negative, I have added a minus"
        )
        cost_gap_open = -cost_gap_open

    # Store the gap cost only after it has been normalised, so that
    # sumOfPairs scores gaps with the same (non-positive) value that was
    # used to build the alignment.
    self.gapOpenCost = cost_gap_open

    num_sequences = len(record)

    # Collect ids and plain-string sequences, validating each sequence as
    # soon as it has been collected.
    ids = []
    s = []
    for i in range(num_sequences):
        ids.append(record[i].id)
        s.append(str(record[i].seq))  # convert from Bio.Seq.Seq to str
        if fr.validateAminoSequence(s[i]) == 0:
            print("You have invalid character(s) in your FASTA file")
            sys.exit(11)  # error code 11

    # Build the guide tree. Of the returned Newick strings only the one
    # without distances drives the alignment; the one carrying the original
    # FASTA ids is used for display.
    gma = Xpgma()
    newick, newickNoDistance, distanceMatrix, newickIds = gma.UandWpgma(
        ids, s, num_sequences, subst_matrix_fn, cost_gap_open, clustering)

    # Map cluster names 'C0', 'C1', ... to the sequences in input order.
    seqClusterMap = {'C' + str(cl): seq for cl, seq in enumerate(s)}

    # Parse the Newick string, build the groups and the final multiple
    # sequence alignment.
    self.processNewickString(newickNoDistance, seqClusterMap)

    SOP = self.sumOfPairs()
    self.printer(newickIds, SOP)
    return SOP, newick, newickNoDistance
def getAlignmentsFromTracebacks(self, s1, s2, subst_matrix_fn, D, P, Q,
                                alpha, beta, g):
    """Enumerate all optimal alignments by tracing back through D, P and Q.

    The traceback starts in the bottom-right cell of ``D`` and reverses the
    construction of the matrices. Whenever the value of a cell can be
    explained by more than one move, the partial alignment is duplicated
    and every copy follows one of the moves.

    Args:
        s1, s2: the aligned sequences (s1 on the rows, s2 on the columns).
        subst_matrix_fn: substitution matrix selector handed to
            ``FriendClass.getSubsMatScore``.
        D, P, Q: the matrices returned by ``buildMatrices``.
        alpha: accepted for interface compatibility; not read here.
        beta: forwarded to ``getSubsMatScore`` and to
            ``tracebackInP`` / ``tracebackInQ``.
        g: forwarded to ``tracebackInP`` / ``tracebackInQ``.

    Returns:
        A list with one entry per optimal alignment. Each entry is
        ``[aligned_s1, aligned_s2, connect]`` where ``connect`` holds '*'
        for a match, ':' for a mismatch and ' ' for a gap column.

    Raises:
        ValueError: if a cell of ``D`` equals none of its three possible
            predecessors (the matrices do not belong together).
    """
    fr = FriendClass()

    # One [row, col] position and one [s1, s2, connect] triple per partial
    # traceback. The strings are built back to front and reversed at the
    # very end.
    indices_list = [[D.shape[0] - 1, D.shape[1] - 1]]
    trace_list = [["", "", ""]]

    def gap_in_s2(slot, i, j):
        """Follow P from D[i, j]: s1[i2:i] is aligned against gaps."""
        # i2 is the row at which the gap run returns from P to D.
        i2 = self.tracebackInP(P, D, i, j, beta, g)
        run = i - i2
        # Reversed because the trace strings grow back to front.
        trace_list[slot][0] += s1[i2:i][::-1]
        trace_list[slot][1] += '-' * run
        trace_list[slot][2] += ' ' * run
        indices_list[slot][0] -= run

    def gap_in_s1(slot, i, j):
        """Follow Q from D[i, j]: s2[j2:j] is aligned against gaps."""
        # j2 is the column at which the gap run returns from Q to D.
        j2 = self.tracebackInQ(Q, D, i, j, beta, g)
        run = j - j2
        trace_list[slot][0] += '-' * run
        trace_list[slot][1] += s2[j2:j][::-1]
        trace_list[slot][2] += ' ' * run
        indices_list[slot][1] -= run

    def substitute(slot, i, j):
        """Diagonal move: s1[i - 1] is aligned with s2[j - 1]."""
        trace_list[slot][0] += s1[i - 1]
        trace_list[slot][1] += s2[j - 1]
        trace_list[slot][2] += '*' if s1[i - 1] == s2[j - 1] else ':'
        indices_list[slot][0] -= 1
        indices_list[slot][1] -= 1

    while True:
        completed_counter = 0  # tracebacks that have reached D[0, 0]

        # Iterate over a snapshot: tracebacks created during this pass are
        # first advanced in the next pass.
        snapshot = [list(position) for position in indices_list]
        for index, (i, j) in enumerate(snapshot):
            if i == 0 and j == 0:
                completed_counter += 1
                continue

            # String positions are one less than matrix positions, hence
            # the "- 1" in the two border cases.
            if i == 0:
                # s1 is exhausted: the rest of s2 faces gaps.
                trace_list[index][0] += '-'
                trace_list[index][1] += s2[j - 1]
                trace_list[index][2] += ' '
                indices_list[index][1] -= 1
                continue
            if j == 0:
                # s2 is exhausted: the rest of s1 faces gaps.
                trace_list[index][0] += s1[i - 1]
                trace_list[index][1] += '-'
                trace_list[index][2] += ' '
                indices_list[index][0] -= 1
                continue

            from_p = D[i, j] == P[i, j]
            from_q = D[i, j] == Q[i, j]
            from_d = D[i, j] == D[i - 1, j - 1] + fr.getSubsMatScore(
                s1[i - 1], s2[j - 1], subst_matrix_fn, beta)

            # The first move continues the current traceback, the others
            # are applied to freshly appended copies. The order keeps the
            # resulting list in its established order.
            if from_p:
                moves = [gap_in_s2]
                if from_d:
                    moves.append(substitute)
                if from_q:
                    moves.append(gap_in_s1)
            elif from_q:
                moves = [gap_in_s1]
                if from_d:
                    moves.append(substitute)
            elif from_d:
                moves = [substitute]
            else:
                raise ValueError(
                    "D[%d, %d] cannot be derived from P, Q or the diagonal"
                    % (i, j))

            # Copy the state before any move changes it, so that every
            # branch starts from the same cell (i, j).
            base_trace = list(trace_list[index])
            slots = [index]
            for _ in moves[1:]:
                trace_list.append(list(base_trace))
                indices_list.append([i, j])
                slots.append(len(trace_list) - 1)

            for slot, move in zip(slots, moves):
                move(slot, i, j)

        # Done once every traceback (including the ones created in this
        # pass) was counted as complete.
        if completed_counter == len(indices_list):
            break

    # The strings were collected back to front.
    alignment_strings = [[string[::-1] for string in trace]
                         for trace in trace_list]
    return alignment_strings
and alignment2[index] != 'X') or (alignment1[index] != 'X' and alignment2[index] == 'X')): sumOfPairs += gapOpenCost # if both of the alignments have amino-acid characters else: sumOfPairs += fr.getSubsMatScore( alignment1[index], alignment2[index], subsMat, gapOpenCost) return sumOfPairs if __name__ == '__main__': sop = sumOfPairs() fr = FriendClass() parser = argparse.ArgumentParser() parser.add_argument( "subsMatrixType", choices=["pam250", "blosum62"], help="Choose if you want to use a PAM250 or BLOSUM62 substitution" " matrix for calculating match/mismatch score") parser.add_argument("gapOpenCost", type=int, help="Specify the cost of opening a gap") args = parser.parse_args() # Hard code the alignments for testing, as Needleman Wunsch otherwise # gives randomized alignments. MSA_PAM = [ "---MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAA",