def sumOfPairs(self):
    """Return the sum-of-pairs score of the multiple sequence alignment.

    Every unordered pair of rows in ``self.MSA`` is compared column by
    column:

    * both rows hold the placeholder ``'X'``: the column adds nothing;
    * exactly one row holds ``'X'``: the column adds ``self.gapOpenCost``;
    * otherwise the column adds the value returned by
      ``FriendClass.getSubsMatScore`` for the two characters.

    Returns:
        The score accumulated over all row pairs and all columns.
    """
    scorer = FriendClass()
    total = 0

    for row_a, first in enumerate(self.MSA):
        for row_b, second in enumerate(self.MSA):
            # Visit each unordered pair once; never pair a row with itself.
            if row_b <= row_a:
                continue

            # The column count is taken from the second row of the pair;
            # the rows are expected to be equally long.
            for col in range(len(second)):
                first_is_x = first[col] == 'X'
                second_is_x = second[col] == 'X'

                if first_is_x and second_is_x:
                    continue

                if first_is_x or second_is_x:
                    # Exactly one of the two rows has the placeholder here.
                    total += self.gapOpenCost
                else:
                    # Both rows carry a residue character in this column.
                    total += scorer.getSubsMatScore(
                        first[col], second[col], self.subsMat,
                        self.gapOpenCost)

    return total
def buildMatrices(self, s1, s2, subst_matrix_fn, gap_open_cost, gap_extend_cost, g):
    """Fill the D, P and Q matrices for global alignment with affine gaps.

    ``s1`` runs down the rows and ``s2`` across the columns. All three
    matrices have shape ``(len(s1) + 1, len(s2) + 1)`` and integer dtype.

    * ``D[i, j]`` is the best score for the prefixes ``s1[:i]`` / ``s2[:j]``.
    * ``P[i, j]`` is the best score that reaches ``(i, j)`` from row
      ``i - 1`` in the same column (opening with ``g`` or extending with
      ``gap_extend_cost``).
    * ``Q[i, j]`` is the same for a step from column ``j - 1`` in the same
      row.

    Empty sequences are supported: the border initialisation only touches
    cells that exist.

    Args:
        s1: First sequence (indexed by row).
        s2: Second sequence (indexed by column).
        subst_matrix_fn: Passed unchanged to ``FriendClass.getSubsMatScore``.
        gap_open_cost: Not read by this method; kept so existing callers
            keep working.
        gap_extend_cost: Added for every gap position after the first. It
            is also forwarded as the last argument of ``getSubsMatScore``.
        g: Score charged for the first position of a gap.

    Returns:
        A tuple ``(D, P, Q, optimalScore)`` where ``optimalScore`` is
        ``D[len(s1), len(s2)]``.
    """
    s1_length = len(s1)
    s2_length = len(s2)

    # Large negative stand-in for minus infinity that still fits the integer
    # matrices. NOTE(review): alignments whose real scores drop below this
    # value would no longer be distinguishable from the sentinel.
    inf = -60000

    shape = (s1_length + 1, s2_length + 1)
    D = np.zeros(shape, dtype=int)
    P = np.zeros(shape, dtype=int)
    Q = np.zeros(shape, dtype=int)
    fr = FriendClass()

    # First column: a single gap run covering s1[:i]. The first position
    # costs g, every further one gap_extend_cost. Q cannot end in this column,
    # so it is blocked with the sentinel. P is never read here.
    for i in range(1, s1_length + 1):
        D[i, 0] = g if i == 1 else D[i - 1, 0] + gap_extend_cost
        Q[i, 0] = inf

    # First row: mirror image of the first column, with P blocked instead.
    for j in range(1, s2_length + 1):
        D[0, j] = g if j == 1 else D[0, j - 1] + gap_extend_cost
        P[0, j] = inf

    for i in range(1, s1_length + 1):
        for j in range(1, s2_length + 1):
            # Step down one row: open a new gap from D or extend the run
            # already tracked in P.
            P[i, j] = max(D[i - 1, j] + g, P[i - 1, j] + gap_extend_cost)

            # Step right one column: open a new gap from D or extend the run
            # already tracked in Q.
            Q[i, j] = max(D[i, j - 1] + g, Q[i, j - 1] + gap_extend_cost)

            # Diagonal step: pair s1[i - 1] with s2[j - 1].
            substitution = D[i - 1, j - 1] + fr.getSubsMatScore(
                s1[i - 1], s2[j - 1], subst_matrix_fn, gap_extend_cost)

            D[i, j] = max(substitution, P[i, j], Q[i, j])

    optimalScore = D[s1_length, s2_length]
    return D, P, Q, optimalScore
def buildMatrices(self, s1, s2, subst_matrix_fn, gap_cost):
    """Build the Needleman-Wunsch traceback matrix for linear gap costs.

    ``s1`` runs down the rows and ``s2`` across the columns of the score
    matrix. For every cell the directions that achieve the maximum are
    recorded as a bitmask in a ``len(s1) x len(s2)`` traceback matrix
    (cell ``(i, j)`` of the score matrix maps to ``(i - 1, j - 1)``):

    * ``1`` - reached from the left neighbour (gap inserted into ``s1``),
    * ``2`` - reached from the upper neighbour (gap inserted into ``s2``),
    * ``4`` - reached diagonally (match or mismatch).

    Ties are kept, so the stored value is the sum of all directions that
    tie for the maximum (e.g. ``5``, ``6`` or ``7``).

    Args:
        s1: First sequence.
        s2: Second sequence.
        subst_matrix_fn: Passed unchanged to ``FriendClass.getSubsMatScore``.
        gap_cost: Score added for every gap position. It is also forwarded
            as the last argument of ``getSubsMatScore``.

    Returns:
        A tuple ``(traceback, optimalScore)`` where ``optimalScore`` is the
        bottom-right cell of the score matrix.
    """
    rows = len(s1)
    cols = len(s2)

    scores = np.zeros((rows + 1, cols + 1), dtype=int)
    traceback = np.zeros((rows, cols), dtype=int)
    scorer = FriendClass()

    # Borders: aligning a prefix against nothing costs one gap per character.
    for r in range(1, rows + 1):
        scores[r, 0] = scores[r - 1, 0] + gap_cost
    for c in range(1, cols + 1):
        scores[0, c] = scores[0, c - 1] + gap_cost

    for r in range(1, rows + 1):
        for c in range(1, cols + 1):
            from_left = scores[r, c - 1] + gap_cost
            from_above = scores[r - 1, c] + gap_cost
            # Strings are indexed with r - 1 / c - 1 because the matrix has
            # one extra leading row and column.
            from_diagonal = scores[r - 1, c - 1] + scorer.getSubsMatScore(
                s1[r - 1], s2[c - 1], subst_matrix_fn, gap_cost)

            best = max(from_left, from_above, from_diagonal)
            scores[r, c] = best

            # Independent checks (not elif) so that ties keep every direction.
            directions = 0
            if from_left == best:
                directions += 1
            if from_above == best:
                directions += 2
            if from_diagonal == best:
                directions += 4
            traceback[r - 1, c - 1] = directions

    optimalScore = scores[rows, cols]
    return traceback, optimalScore
def similarityToDistance(self, s_ab, a, b, nw, alignment, subsMat, gapOpenCost):
    """Convert a pairwise similarity score into a Feng-Doolittle distance.

    The distance is ``-log(S_eff)`` with::

        S_eff  = (s_ab - S_rand) / (S_max - S_rand)
        S_max  = (S(a, a) + S(b, b)) / 2
        S_rand = (1 / L) * sum_{x, y} s(x, y) * N_a(x) * N_b(y)
                 + N_g * gapOpenCost

    The sum runs over the distinct residues ``x`` of ``a`` and ``y`` of
    ``b``. ``N_a(x)`` / ``N_b(y)`` are their occurrence counts, ``L`` is the
    alignment length and ``N_g`` the number of ``'-'`` characters counted in
    the alignment. The result is deterministic for given inputs.

    Args:
        s_ab: Similarity score of the alignment of ``a`` and ``b``.
        a: First (unaligned) sequence.
        b: Second (unaligned) sequence.
        nw: Object whose ``buildMatrices(seq1, seq2, subsMat, gapOpenCost)``
            returns a ``(traceback, score)`` pair; used for the self
            alignments that define ``S_max``.
        alignment: Indexable of equally long strings; entries 0 and 2 are
            scanned for ``'-'``.
        subsMat: Passed unchanged to ``getSubsMatScore`` and
            ``nw.buildMatrices``.
        gapOpenCost: Linear gap score.

    Returns:
        The distance ``-log(S_eff)``.

    Raises:
        ValueError: Raised by ``math.log`` when ``S_eff`` is zero or
            negative, i.e. the alignment scores no better than the random
            expectation.
    """
    alignment_length = len(alignment[0])

    # NOTE(review): gaps are counted in entries 0 and 2. Confirm that entry 2
    # is the second aligned sequence in the caller's alignment layout and not
    # a connector line, which would never contain '-'.
    gap_count = alignment[0].count('-') + alignment[2].count('-')

    fr = FriendClass()

    # Occurrences per distinct residue. dict.fromkeys keeps first-seen order,
    # so the summation order is stable.
    counts_a = {x: a.count(x) for x in dict.fromkeys(a)}
    counts_b = {y: b.count(y) for y in dict.fromkeys(b)}

    # Sum over residue *types*: each pair (x, y) contributes exactly once,
    # weighted by how often x occurs in a and y occurs in b.
    sum_xy = 0
    for x, n_x in counts_a.items():
        for y, n_y in counts_b.items():
            sum_xy += n_x * n_y * fr.getSubsMatScore(
                x, y, subsMat, gapOpenCost)

    s_ab_rand = (sum_xy / alignment_length) + (gap_count * gapOpenCost)

    # Upper bound: mean of the two self-alignment scores. Only the score of
    # each returned pair is needed.
    _, s_aa = nw.buildMatrices(a, a, subsMat, gapOpenCost)
    _, s_bb = nw.buildMatrices(b, b, subsMat, gapOpenCost)
    s_ab_max = (s_aa + s_bb) / 2

    # Normalised similarity. It lies in (0, 1] only when
    # s_ab_rand < s_ab <= s_ab_max; the denominator is not guarded against 0.
    s_ab_eff = (s_ab - s_ab_rand) / (s_ab_max - s_ab_rand)

    return -math.log(s_ab_eff)
def getAlignmentsFromTracebacks(self, s1, s2, subst_matrix_fn, D, P, Q, alpha, beta, g):
    """Enumerate every optimal alignment encoded by the D, P and Q matrices.

    The walk starts in the bottom-right cell of ``D`` and moves towards
    ``(0, 0)``. In each cell, every predecessor that reproduces ``D[i, j]``
    is followed; when more than one does, the partial alignment is
    duplicated so that all co-optimal alignments are produced.

    * ``D[i, j] == P[i, j]``: a run of ``s1`` characters is aligned against
      gaps in ``s2``. ``self.tracebackInP`` is expected to return the row at
      which that run rejoins ``D``.
    * ``D[i, j] == Q[i, j]``: a run of ``s2`` characters is aligned against
      gaps in ``s1``. ``self.tracebackInQ`` is expected to return the column
      at which that run rejoins ``D``.
    * ``D[i, j] == D[i - 1, j - 1] + score``: ``s1[i - 1]`` is paired with
      ``s2[j - 1]``.

    Args:
        s1: First sequence (rows of the matrices).
        s2: Second sequence (columns of the matrices).
        subst_matrix_fn: Passed unchanged to ``FriendClass.getSubsMatScore``.
        D: Main score matrix, shape ``(len(s1) + 1, len(s2) + 1)``.
        P: Matrix of scores for paths arriving from the row above.
        Q: Matrix of scores for paths arriving from the column to the left.
        alpha: Not read by this method; kept for interface compatibility.
        beta: Forwarded to ``getSubsMatScore``, ``tracebackInP`` and
            ``tracebackInQ``.
        g: Forwarded to ``tracebackInP`` and ``tracebackInQ``.

    Returns:
        A list with one entry per optimal alignment. Each entry is
        ``[aligned_s1, aligned_s2, connector]``; the connector has ``'*'``
        under a match, ``':'`` under a mismatch and ``' '`` under a gap.

    Raises:
        ValueError: If a cell has no predecessor that explains its score, or
            if a gap traceback does not move towards the origin. Either
            would otherwise make the walk loop forever.
    """
    fr = FriendClass()

    # Each move helper returns (new [i, j], (s1 piece, s2 piece, connector)).
    # Pieces are appended in walking order (end -> start). Gap runs are
    # reversed here so that the single reversal at the end restores
    # left-to-right order.
    def gap_in_s2(i, j):
        """Consume s1[i2:i] against gaps, following P back to D."""
        i2 = self.tracebackInP(P, D, i, j, beta, g)
        if not 0 <= i2 < i:
            raise ValueError(
                "tracebackInP did not move towards the origin from (%d, %d)"
                % (i, j))
        run = i - i2
        return [i2, j], (s1[i2:i][::-1], '-' * run, ' ' * run)

    def gap_in_s1(i, j):
        """Consume s2[j2:j] against gaps, following Q back to D."""
        j2 = self.tracebackInQ(Q, D, i, j, beta, g)
        if not 0 <= j2 < j:
            raise ValueError(
                "tracebackInQ did not move towards the origin from (%d, %d)"
                % (i, j))
        run = j - j2
        return [i, j2], ('-' * run, s2[j2:j][::-1], ' ' * run)

    def substitution(i, j):
        """Pair s1[i - 1] with s2[j - 1] and step diagonally."""
        connector = '*' if s1[i - 1] == s2[j - 1] else ':'
        return [i - 1, j - 1], (s1[i - 1], s2[j - 1], connector)

    # indices_list[k] is the current cell of path k. trace_list[k] holds the
    # three strings built so far for that path, in reverse order.
    indices_list = [[D.shape[0] - 1, D.shape[1] - 1]]
    trace_list = [["", "", ""]]

    while True:
        completed_counter = 0

        # Iterate over a snapshot: paths created during this sweep are first
        # advanced in the next sweep.
        snapshot = [list(position) for position in indices_list]

        for index, (i, j) in enumerate(snapshot):
            if i == 0 and j == 0:
                # This path has reached the origin and is complete.
                completed_counter += 1
                continue

            current = trace_list[index]

            if i == 0:
                # s1 is exhausted: the rest of s2 is aligned against gaps.
                # Matrix column j corresponds to character s2[j - 1].
                trace_list[index] = [
                    current[0] + '-', current[1] + s2[j - 1],
                    current[2] + ' '
                ]
                indices_list[index] = [0, j - 1]
                continue

            if j == 0:
                # s2 is exhausted: the rest of s1 is aligned against gaps.
                # Matrix row i corresponds to character s1[i - 1].
                trace_list[index] = [
                    current[0] + s1[i - 1], current[1] + '-',
                    current[2] + ' '
                ]
                indices_list[index] = [i - 1, 0]
                continue

            cell = D[i, j]
            diagonal = D[i - 1, j - 1] + fr.getSubsMatScore(
                s1[i - 1], s2[j - 1], subst_matrix_fn, beta)

            # Collect every predecessor that explains this cell, in priority
            # order P, Q, substitution.
            moves = []
            if cell == P[i, j]:
                moves.append(gap_in_s2)
            if cell == Q[i, j]:
                moves.append(gap_in_s1)
            if cell == diagonal:
                moves.append(substitution)

            if not moves:
                raise ValueError(
                    "D[%d, %d] is not explained by P, Q or a substitution"
                    % (i, j))

            # The first move continues the existing path in place. Every
            # further move forks a copy of the path as it was *before* this
            # cell, starting from the same (i, j). Forks are appended
            # substitution-first so the order of the returned alignments
            # stays the same as before.
            for move in reversed(moves[1:]):
                position, pieces = move(i, j)
                indices_list.append(position)
                trace_list.append(
                    [row + piece for row, piece in zip(current, pieces)])

            position, pieces = moves[0](i, j)
            indices_list[index] = position
            trace_list[index] = [
                row + piece for row, piece in zip(current, pieces)
            ]

        # All paths that existed at the start of the sweep were complete, so
        # nothing was advanced or forked.
        if completed_counter == len(snapshot):
            break

    # The strings were built from the end of the alignment to its start.
    alignment_strings = [[string[::-1] for string in trace]
                         for trace in trace_list]
    return alignment_strings