def test_align_pairwise(self):
    """Check the Needleman-Wunsch pairwise alignment of two protein sequences using BLOSUM62.

    The two input sequences are:

        - 'IHAAEEKDWKTAYSYbgFYEAFEGYdsidspkaitslkymllckimlntpedvqalvsgkla',
        - 'LHAADEKDFKTAFSYabiggapFYEAFEGYdsvdekvsaltalkymllckvmldlpdevnsllsakl'.

    The reference results from online servers, for a gap open penalty of 5 and a gap extend penalty of 1, are::

        https://www.ebi.ac.uk/Tools/psa/emboss_needle/
        EMBOSS_001         IHAAEEKDWKTAYSY-B-G---FYEAFEGYDSIDSP-KAITSLKYMLLCKIMLNTPEDVQALVSGKLA
                           :|||:|||:|||:|| | |   ||||||||||:|.. .|:|:||||||||:||:.|::|.:|:|.||
        EMBOSS_001         LHAADEKDFKTAFSYABIGGAPFYEAFEGYDSVDEKVSALTALKYMLLCKVMLDLPDEVNSLLSAKL-

        http://web.expasy.org/cgi-bin/sim/sim.pl?prot
        UserSeq1 IHAAEEKDWKTAYSY-B-G---FYEAFEGYDSIDSP-KAITSLKYMLLCKIMLNTPEDVQALVSGKL
        UserSeq2 LHAADEKDFKTAFSYABIGGAPFYEAFEGYDSVDEKVSALTALKYMLLCKVMLDLPDEVNSLLSAKL
                  *** *** *** ** * *   ********** *    * * ******** **  *  *  * * **
    """

    # The two sequences to align.
    seq1 = 'IHAAEEKDWKTAYSYbgFYEAFEGYdsidspkaitslkymllckimlntpedvqalvsgkla'
    seq2 = 'LHAADEKDFKTAFSYabiggapFYEAFEGYdsvdekvsaltalkymllckvmldlpdevnsllsakl'
    for sequence in (seq1, seq2):
        print(sequence)

    # Run the pairwise alignment and show everything it returns.
    score, align1, align2, gaps = align_pairwise(seq1, seq2, matrix='BLOSUM62', gap_open_penalty=5.0, gap_extend_penalty=1.0)
    for result in (score, align1, align2, gaps):
        print(result)

    # The alignment strings must match the EMBOSS reference.
    expected_align1 = 'IHAAEEKDWKTAYSY-B-G---FYEAFEGYDSIDSP-KAITSLKYMLLCKIMLNTPEDVQALVSGKLA'
    expected_align2 = 'LHAADEKDFKTAFSYABIGGAPFYEAFEGYDSVDEKVSALTALKYMLLCKVMLDLPDEVNSLLSAKL-'
    self.assertEqual(align1, expected_align1)
    self.assertEqual(align2, expected_align2)

    # The expected gap matrix, with one row per sequence and a 1 at every '-' position of the alignment strings.
    real_gaps = zeros((2, 68), int16)
    for column in (15, 17, 19, 20, 21, 36):
        real_gaps[0, column] = 1
    real_gaps[1, 67] = 1

    # Compare the gap matrices element by element.
    for i, expected_row in enumerate(real_gaps):
        for j, expected_flag in enumerate(expected_row):
            self.assertEqual(gaps[i, j], expected_flag)
def test_align_pairwise_PAM250(self):
    """Test the Needleman-Wunsch sequence alignment for two protein sequences using the PAM250 substitution matrix.

    This uses the sequences:

        - 'IHAAEEKDWKTAYSYbgFYEAFEGYdsidspkaitslkymllckimlntpedvqalvsgkla',
        - 'LHAADEKDFKTAFSYabiggapFYEAFEGYdsvdekvsaltalkymllckvmldlpdevnsllsakl'.

    From online servers, the results with a gap open penalty of 5 and gap extend of 0.5 should be::

        https://www.ebi.ac.uk/Tools/psa/emboss_needle/
        EMBOSS_001         IHAAEEKDWKTAYSYb--g---FYEAFEGYdsidspk--aitslkymllckimlntpedvqalvsgkla
                           :|||:|||.|||:||.  |   ||||||||||:|. |  |:|:||||||||:||:.|::|::|:|:||
        EMBOSS_001         LHAADEKDFKTAFSYabiggapFYEAFEGYdsvde-kvsaltalkymllckvmldlpdevnsllsakl-

        http://web.expasy.org/cgi-bin/sim/sim.pl?prot
        UserSeq1 IHAAEEKDWKTAYSYBG-----FYEAFEGYDSIDSPK--AITSLKYMLLCKIMLNTPEDVQALVSGKL
        UserSeq2 LHAADEKDFKTAFSYABIGGAPFYEAFEGYDSVDE-KVSALTALKYMLLCKVMLDLPDEVNSLLSAKL
                  *** *** *** **       ********** *  *  * * ******** **  *  *  * * **

    The assertions below follow the EMBOSS result (in upper case), which is 69 positions long.
    """

    # The sequences.
    seq1 = 'IHAAEEKDWKTAYSYbgFYEAFEGYdsidspkaitslkymllckimlntpedvqalvsgkla'
    seq2 = 'LHAADEKDFKTAFSYabiggapFYEAFEGYdsvdekvsaltalkymllckvmldlpdevnsllsakl'
    print(seq1)
    print(seq2)

    # Perform the alignment.
    score, align1, align2, gaps = align_pairwise(seq1, seq2, matrix='PAM250', gap_open_penalty=5.0, gap_extend_penalty=0.5)
    print(score)
    print(align1)
    print(align2)
    print(gaps)

    # Check the alignment.
    self.assertEqual(align1, 'IHAAEEKDWKTAYSYB--G---FYEAFEGYDSIDSPK--AITSLKYMLLCKIMLNTPEDVQALVSGKLA')
    self.assertEqual(align2, 'LHAADEKDFKTAFSYABIGGAPFYEAFEGYDSVDE-KVSALTALKYMLLCKVMLDLPDEVNSLLSAKL-')

    # The gap matrix (2 sequences by 69 alignment positions, 1 marking a gap).
    real_gaps = zeros((2, 69), int16)
    for column in (16, 17, 19, 20, 21, 37, 38):
        real_gaps[0, column] = 1
    for column in (35, 68):
        real_gaps[1, column] = 1

    # Check every element, looping over the full extent of the expected matrix so that the final column
    # (the trailing gap of the second sequence at index 68) is not skipped.
    for i, expected_row in enumerate(real_gaps):
        for j, expected_flag in enumerate(expected_row):
            self.assertEqual(gaps[i, j], expected_flag)
def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):
    """Align multiple protein sequences to one reference by fusing multiple pairwise alignments.

    The steps are:

        - All pairs of sequences are aligned with align_pairwise() and their scores stored.
        - The central sequence Sc is selected as the sequence with the lowest sum of pairwise scores.
        - Each remaining sequence Si is aligned in turn against the growing central alignment Sc'.  Gaps already present are protected by temporarily converting '-' to 'X', so that only newly opened gaps appear as '-' and can be propagated to the earlier alignments.
        - The alignment strings are reassembled in the original sequence order and the gap matrix built.

    Progress is written to sys.stdout.  Note that as 'X' is used as the temporary gap placeholder, any 'X' in the final strings, including those from the input sequences, is returned as '-'.


    @param sequences:                   The list of residue sequences as one letter codes.
    @type sequences:                    list of str
    @keyword algorithm:                 The pairwise sequence alignment algorithm to use.
    @type algorithm:                    str
    @keyword matrix:                    The substitution matrix to use.
    @type matrix:                       str
    @keyword gap_open_penalty:          The penalty for introducing gaps, as a positive number.
    @type gap_open_penalty:             float
    @keyword gap_extend_penalty:        The penalty for extending a gap, as a positive number.
    @type gap_extend_penalty:           float
    @keyword end_gap_open_penalty:      The optional penalty for opening a gap at the end of a sequence.
    @type end_gap_open_penalty:         float
    @keyword end_gap_extend_penalty:    The optional penalty for extending a gap at the end of a sequence.
    @type end_gap_extend_penalty:       float
    @return:                            The list of alignment strings and the gap matrix (one row per sequence, 1 for a gap).  For an empty sequence list, an empty list and a 0x0 matrix are returned.
    @rtype:                             list of str, numpy rank-2 int array
    """

    # Initialise.
    N = len(sequences)

    # Nothing to align, so return an empty MSA rather than failing on the central sequence lookup.
    if N == 0:
        return [], zeros((0, 0), int16)

    # The symmetric matrix of pairwise scores (the diagonal remains at zero).
    scores = zeros((N, N), float64)

    # Printout.
    sys.stdout.write("\nCentral Star multiple sequence alignment.\n\n")
    sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm))
    sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix))
    sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty))
    sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty))
    sys.stdout.write("Initial sequences:\n")
    for i in range(N):
        sys.stdout.write("%3i %s\n" % (i+1, sequences[i]))

    # All pairwise alignments.
    sys.stdout.write("\nDetermining the scores for all pairwise alignments:\n")
    for i in range(N):
        for j in range(i+1, N):
            # Align the pair (only the score is needed at this stage).
            sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i+1, j+1)))
            score = align_pairwise(sequences[i], sequences[j], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0)[0]
            sys.stdout.write("%10.1f\n" % score)

            # Store the score.
            scores[i, j] = scores[j, i] = score

    # The central sequence, taken as the one with the lowest summed score (the first one for ties).
    # NOTE(review):  if align_pairwise() returns similarity scores (higher is better), the central star method would normally select the highest sum - confirm that the minimum is intended.
    sys.stdout.write("\nDetermining the central sequence:\n")
    sum_scores = scores.sum(0)
    Sc_sum_score = 1e100
    Sc_index = 0
    for i in range(N):
        if sum_scores[i] < Sc_sum_score:
            Sc_sum_score = sum_scores[i]
            Sc_index = i
        sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" % (i+1)), sum_scores[i]))
    sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index+1))

    # Partition the sequences into the central sequence and all others (original order preserved).
    Sc = sequences[Sc_index]
    Si = [sequences[i] for i in range(N) if i != Sc_index]

    # Optimal alignments.
    sys.stdout.write("\nDetermining the iterative optimal alignments:\n")
    Sc_prime = Sc
    string_lists = []
    for i in range(N-1):
        # Perform the pairwise alignment between Sc' and Si, replacing all '-' with 'X' so that existing gaps can be told apart from new ones.
        result = align_pairwise(Sc_prime.replace('-', 'X'), Si[i].replace('-', 'X'), algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0)
        Sc_prime, Si_prime = result[1], result[2]
        sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", Sc_prime.replace('X', '-')))
        sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i+1)), Si_prime.replace('X', '-')))

        # Pad all previous alignments wherever a new gap was opened in the central sequence.  The positions are visited in ascending order so that the insertion indices remain valid.
        for j in range(len(Sc_prime)):
            if Sc_prime[j] == '-':
                for previous in string_lists:
                    previous.insert(j, '-')

        # Store the Si alignment (already in the coordinates of the new Sc').
        string_lists.append(list(Si_prime))

    # Rebuild the alignment strings, central sequence first, converting the placeholder 'X' back to '-'.
    M = len(Sc_prime)
    strings = [Sc_prime] + [''.join(string_list) for string_list in string_lists]
    strings = [string.replace('X', '-') for string in strings]

    # Restore the original sequence ordering.
    strings.insert(Sc_index, strings.pop(0))

    # Create the gap matrix.
    gaps = zeros((N, M), int16)
    for i in range(N):
        for j in range(M):
            if strings[i][j] == '-':
                gaps[i, j] = 1

    # Final printout.
    sys.stdout.write("\nFinal MSA:\n")
    for i in range(N):
        sys.stdout.write("%3i %s\n" % (i+1, strings[i]))

    # Return the results.
    return strings, gaps
def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):
    """Align multiple protein sequences to one reference by fusing multiple pairwise alignments.

    The steps are:

        - All pairs of sequences are aligned with align_pairwise() and their scores stored.
        - The central sequence Sc is selected as the sequence with the lowest sum of pairwise scores.
        - Each remaining sequence Si is aligned in turn against the growing central alignment Sc'.  Gaps already present are protected by temporarily converting '-' to 'X', so that only newly opened gaps appear as '-' and can be propagated to the earlier alignments.
        - The alignment strings are reassembled in the original sequence order and the gap matrix built.

    Progress is written to sys.stdout.  Note that as 'X' is used as the temporary gap placeholder, any 'X' in the final strings, including those from the input sequences, is returned as '-'.


    @param sequences:                   The list of residue sequences as one letter codes.
    @type sequences:                    list of str
    @keyword algorithm:                 The pairwise sequence alignment algorithm to use.
    @type algorithm:                    str
    @keyword matrix:                    The substitution matrix to use.
    @type matrix:                       str
    @keyword gap_open_penalty:          The penalty for introducing gaps, as a positive number.
    @type gap_open_penalty:             float
    @keyword gap_extend_penalty:        The penalty for extending a gap, as a positive number.
    @type gap_extend_penalty:           float
    @keyword end_gap_open_penalty:      The optional penalty for opening a gap at the end of a sequence.
    @type end_gap_open_penalty:         float
    @keyword end_gap_extend_penalty:    The optional penalty for extending a gap at the end of a sequence.
    @type end_gap_extend_penalty:       float
    @return:                            The list of alignment strings and the gap matrix (one row per sequence, 1 for a gap).  For an empty sequence list, an empty list and a 0x0 matrix are returned.
    @rtype:                             list of str, numpy rank-2 int array
    """

    # Initialise.
    N = len(sequences)

    # Nothing to align, so return an empty MSA rather than failing on the central sequence lookup.
    if N == 0:
        return [], zeros((0, 0), int16)

    # The symmetric matrix of pairwise scores (the diagonal remains at zero).
    scores = zeros((N, N), float64)

    # Printout.
    sys.stdout.write("\nCentral Star multiple sequence alignment.\n\n")
    sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm))
    sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix))
    sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty))
    sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty))
    sys.stdout.write("Initial sequences:\n")
    for i in range(N):
        sys.stdout.write("%3i %s\n" % (i + 1, sequences[i]))

    # All pairwise alignments.
    sys.stdout.write("\nDetermining the scores for all pairwise alignments:\n")
    for i in range(N):
        for j in range(i + 1, N):
            # Align the pair (only the score is needed at this stage).
            sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i + 1, j + 1)))
            score = align_pairwise(
                sequences[i],
                sequences[j],
                algorithm=algorithm,
                matrix=matrix,
                gap_open_penalty=gap_open_penalty,
                gap_extend_penalty=gap_extend_penalty,
                end_gap_open_penalty=end_gap_open_penalty,
                end_gap_extend_penalty=end_gap_extend_penalty,
                verbosity=0)[0]
            sys.stdout.write("%10.1f\n" % score)

            # Store the score.
            scores[i, j] = scores[j, i] = score

    # The central sequence, taken as the one with the lowest summed score (the first one for ties).
    # NOTE(review):  if align_pairwise() returns similarity scores (higher is better), the central star method would normally select the highest sum - confirm that the minimum is intended.
    sys.stdout.write("\nDetermining the central sequence:\n")
    sum_scores = scores.sum(0)
    Sc_sum_score = 1e100
    Sc_index = 0
    for i in range(N):
        if sum_scores[i] < Sc_sum_score:
            Sc_sum_score = sum_scores[i]
            Sc_index = i
        sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" % (i + 1)), sum_scores[i]))
    sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index + 1))

    # Partition the sequences into the central sequence and all others (original order preserved).
    Sc = sequences[Sc_index]
    Si = [sequences[i] for i in range(N) if i != Sc_index]

    # Optimal alignments.
    sys.stdout.write("\nDetermining the iterative optimal alignments:\n")
    Sc_prime = Sc
    string_lists = []
    for i in range(N - 1):
        # Perform the pairwise alignment between Sc' and Si, replacing all '-' with 'X' so that existing gaps can be told apart from new ones.
        result = align_pairwise(
            Sc_prime.replace('-', 'X'),
            Si[i].replace('-', 'X'),
            algorithm=algorithm,
            matrix=matrix,
            gap_open_penalty=gap_open_penalty,
            gap_extend_penalty=gap_extend_penalty,
            end_gap_open_penalty=end_gap_open_penalty,
            end_gap_extend_penalty=end_gap_extend_penalty,
            verbosity=0)
        Sc_prime, Si_prime = result[1], result[2]
        sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", Sc_prime.replace('X', '-')))
        sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i + 1)), Si_prime.replace('X', '-')))

        # Pad all previous alignments wherever a new gap was opened in the central sequence.  The positions are visited in ascending order so that the insertion indices remain valid.
        for j in range(len(Sc_prime)):
            if Sc_prime[j] == '-':
                for previous in string_lists:
                    previous.insert(j, '-')

        # Store the Si alignment (already in the coordinates of the new Sc').
        string_lists.append(list(Si_prime))

    # Rebuild the alignment strings, central sequence first, converting the placeholder 'X' back to '-'.
    M = len(Sc_prime)
    strings = [Sc_prime] + [''.join(string_list) for string_list in string_lists]
    strings = [string.replace('X', '-') for string in strings]

    # Restore the original sequence ordering.
    strings.insert(Sc_index, strings.pop(0))

    # Create the gap matrix.
    gaps = zeros((N, M), int16)
    for i in range(N):
        for j in range(M):
            if strings[i][j] == '-':
                gaps[i, j] = 1

    # Final printout.
    sys.stdout.write("\nFinal MSA:\n")
    for i in range(N):
        sys.stdout.write("%3i %s\n" % (i + 1, strings[i]))

    # Return the results.
    return strings, gaps