# Test: globalxx("GAACT", "GAT") must return exactly two alignments; after sorting,
# each one is rendered with format_alignment and compared with its expected text (Score=3).
def test_globalxx_simple(self): aligns = pairwise2.align.globalxx("GAACT", "GAT") self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GAACT ||||| G-A-T Score=3 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GAACT ||||| GA--T Score=3 """, )
def align_and_mutate(ref_seq, work_seq, mut_set):
    """Map reference-numbered mutations onto ``work_seq`` and apply them.

    The two sequences are globally aligned with BLOSUM62 using a gap-open
    penalty of -5 and a gap-extension penalty of -1. Only the first alignment
    produced by the aligner is used.

    Args:
        ref_seq: Object whose ``.seq`` is the reference sequence. Keys of
            ``mut_set`` are 0-based indices into this (ungapped) sequence.
        work_seq: Object whose ``.seq`` is the sequence to be mutated.
        mut_set: Mapping of reference position -> replacement residue.

    Returns:
        The mutated working sequence as a string with gap characters
        removed, or ``None`` if the aligner yields no alignment.

    Raises:
        SystemExit: with a non-zero status when a mutation position lines up
            with a gap in the working sequence (the original called ``exit()``
            with no status, which reported success to the shell).
    """
    matrix = matlist.blosum62
    for aln in pairwise2.align.globalds(ref_seq.seq, work_seq.seq, matrix, -5, -1):
        ref_aln_seq = aln[0]
        work_aln_seq = aln[1]

        # Walk the gapped reference and translate each ungapped reference
        # position listed in mut_set into its alignment column.
        mapped_mut_set = {}
        ref_position = -1
        for aln_position, aa in enumerate(ref_aln_seq):
            if aa == "-":
                continue
            ref_position += 1
            if ref_position in mut_set:
                mapped_mut_set[aln_position] = mut_set[ref_position]
        pprint(mapped_mut_set)

        new_seq = work_aln_seq.tomutable()
        for pos in sorted(mapped_mut_set):
            if work_aln_seq[pos] == '-':
                # Single-argument print() calls behave the same on Python 2 and 3.
                print("Error, mutation position " + str(pos) + "(" + ref_aln_seq[pos]+"->"+str(mapped_mut_set[pos]) + ") aligned to a gap!")
                print(pairwise2.format_alignment(*aln))
                raise SystemExit(1)
            new_seq[pos] = mapped_mut_set[pos]
            print("making mutation at position "+str(pos)+": "+work_aln_seq[pos]+"->"+new_seq[pos]+" (canonincal mutation is "+ref_aln_seq[pos]+"->"+mapped_mut_set[pos]+")")
        print(pairwise2.format_alignment(*aln))
        # Only the first alignment is ever used.
        return str(new_seq).replace('-', '')
    return None
# Test: globalxs("GACT", "GT", -0.2, -0.8, penalize_end_gaps=0) must return three alignments;
# after sorting, each formatted alignment is compared with its expected text (Score=1).
def test_penalize_end_gaps(self): aligns = pairwise2.align.globalxs("GACT", "GT", -0.2, -0.8, penalize_end_gaps=0) self.assertEqual(len(aligns), 3) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| --GT Score=1 """) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| G--T Score=1 """) seq1, seq2, score, begin, end = aligns[2] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| GT-- Score=1 """)
# Test: localxd on list inputs (list("GAAT") vs list("GTCCT")) with gap_char=["-"] must return
# three alignments; the expected text here shows the aligned lists in their list repr (Score=1.9).
def test_separate_gap_penalties_with_extension(self): aligns = pairwise2.align.localxd(list("GAAT"), list("GTCCT"), -0.1, 0, -0.1, -0.1, gap_char=["-"]) self.assertEqual(len(aligns), 3) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ ['G', '-', 'A', 'A', 'T'] ||||| ['G', 'T', 'C', 'C', 'T'] Score=1.9 """) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ ['G', 'A', '-', 'A', 'T'] ||||| ['G', 'T', 'C', 'C', 'T'] Score=1.9 """) seq1, seq2, score, begin, end = aligns[2] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ ['G', 'A', 'A', '-', 'T'] ||||| ['G', 'T', 'C', 'C', 'T'] Score=1.9 """)
# Test: localxs("abcce", "c", -0.3, -0.1) must return two alignments (the query can match either
# 'c'); after sorting, each formatted alignment is compared with its expected text (Score=1).
def test_align_one_char2(self): aligns = pairwise2.align.localxs("abcce", "c", -0.3, -0.1) self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ abcce | ---c- Score=1 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ abcce | --c-- Score=1 """, )
# Test: globalxs("GACT", "GT", -0.2, -1.5) must return two alignments; after sorting, each
# formatted alignment is compared with its expected text (Score=0.6).
def test_extend_penalty2(self): aligns = pairwise2.align.globalxs("GACT", "GT", -0.2, -1.5) self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GACT |||| -G-T Score=0.6 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GACT |||| G-T- Score=0.6 """, )
# Test: globalxs("GACT", "GT", -0.8, -0.2, penalize_end_gaps=0, force_generic=True) must return
# three alignments; after sorting, each formatted alignment is compared with its expected text.
def test_penalize_end_gaps2(self): """Do the same, but use the generic method (with the same result)""" aligns = pairwise2.align.globalxs("GACT", "GT", -0.8, -0.2, penalize_end_gaps=0, force_generic=True) self.assertEqual(len(aligns), 3) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| --GT Score=1 """) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| G--T Score=1 """) seq1, seq2, score, begin, end = aligns[2] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| GT-- Score=1 """)
# Test: localxd("GAT", "GTCT", -0.3, 0, -0.8, 0) (different gap penalties per sequence) must
# return two alignments; after sorting, each formatted alignment is compared (Score=1.7).
def test_separate_gap_penalties1(self): aligns = pairwise2.align.localxd("GAT", "GTCT", -0.3, 0, -0.8, 0) self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ G-AT |||| GTCT Score=1.7 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GA-T |||| GTCT Score=1.7 """, )
# Test: localxs("AxBx", "zABz", -0.1, 0); the result count is not asserted. After sorting, the
# first two alignments are formatted and compared with their expected texts (Score=1.9).
def test_localxs(self): aligns = pairwise2.align.localxs("AxBx", "zABz", -0.1, 0) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ -AxBx ||| zA-Bz Score=1.9 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ -AxBx |||| zA-Bz Score=1.9 """, )
# Test: globalms("AA", "A", 2.0, -1, -0.1, 0) must return two alignments; after sorting, each
# formatted alignment is compared with its expected text (Score=1.9).
def test_match_score_open_penalty1(self): aligns = pairwise2.align.globalms("AA", "A", 2.0, -1, -0.1, 0) self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ AA || -A Score=1.9 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ AA || A- Score=1.9 """, )
# Test: globalms("GAA", "GA", 1.5, 0, -0.1, 0) must return two alignments; after sorting, each
# formatted alignment is compared with its expected text (Score=2.9).
def test_match_score_open_penalty2(self): aligns = pairwise2.align.globalms("GAA", "GA", 1.5, 0, -0.1, 0) self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GAA ||| G-A Score=2.9 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GAA ||| GA- Score=2.9 """, )
# Test: localxd on list inputs with gap_char=["-"] must return three alignments; here the
# expected text shows the list items space-separated with '|' / '.' markers (Score=1.9).
# The "noqa: W291" markers silence the trailing-whitespace lint for the expected literals.
def test_separate_gap_penalties_with_extension(self): """Test separate gap-extension penalties and list input.""" aligns = pairwise2.align.localxd( list("GAAT"), list("GTCCT"), -0.1, 0, -0.1, -0.1, gap_char=["-"]) self.assertEqual(len(aligns), 3) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ G - A A T | . . | G T C C T Score=1.9 """) # noqa: W291 seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ G A - A T | . . | G T C C T Score=1.9 """) # noqa: W291 seq1, seq2, score, begin, end = aligns[2] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ G A A - T | . . | G T C C T Score=1.9 """) # noqa: W291
# Test: globalxs("GACT", "GT", -0.8, -0.2, penalize_end_gaps=0) must return three alignments;
# after sorting, each formatted alignment is compared with its expected text (Score=1).
def test_penalize_end_gaps(self): """Turn off end-gap penalties.""" aligns = pairwise2.align.globalxs("GACT", "GT", -0.8, -0.2, penalize_end_gaps=0) self.assertEqual(len(aligns), 3) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT .| --GT Score=1 """) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT | | G--T Score=1 """) seq1, seq2, score, begin, end = aligns[2] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |. GT-- Score=1 """) # noqa: W291
# Test: localds("ATAT", "ATT", self.match_dict, -0.5, 0) scored with the fixture's match
# dictionary must return two alignments; after sorting, each is compared (Score=3).
def test_match_dictionary1(self): aligns = pairwise2.align.localds("ATAT", "ATT", self.match_dict, -0.5, 0) self.assertEqual(len(aligns), 2) aligns.sort() seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ ATAT |||| AT-T Score=3 """, ) seq1, seq2, score, begin, end = aligns[1] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ ATAT ||| ATT- Score=3 """, )
def align():
    """Globally align every pair drawn from the module-level ``seqs``.

    For each index pair ``(x, y)`` with ``y >= x`` (so every sequence is also
    aligned with itself) the records' ``.seq`` values are aligned with
    ``globaldx`` using BLOSUM62. The number of alignments found and the first
    formatted alignment are printed.

    Raises:
        IndexError: if the aligner returns no alignment for a pair.
    """
    matrix = MatrixInfo.blosum62
    for x in range(len(seqs)):
        for y in range(x, len(seqs)):
            found = pairwise2.align.globaldx(seqs[x].seq, seqs[y].seq, matrix)
            # Single-argument print() calls work on both Python 2 and 3;
            # the original print statements were Python 2 only.
            print(len(found))
            print(pairwise2.format_alignment(*found[0]))
def align_chain_to_seq(sequence,chain,verbose=False):
    """Align a database sequence to the residues observed in a structure chain.

    The chain is split into polypeptides by ``build_polypeptides``; their
    sequences are concatenated and globally aligned against
    ``sequence.seq`` with ``globalxc`` and two gap-penalty callbacks.

    Args:
        sequence: Record whose ``.seq`` is the full database sequence.
        chain: Chain object handed to ``build_polypeptides``.
        verbose: When true, write every formatted alignment to ``__stderr``.

    Returns:
        The list of alignments returned by ``globalxc``.
    """
    polypeptides = build_polypeptides(chain)

    # Kept inline (not factored out) because the per-peptide lengths are
    # needed below to locate the chain breaks.
    peptide_strings = [pp.get_sequence().tostring() for pp in polypeptides]
    ATOM_joined_seq = ''.join(peptide_strings)

    # Cumulative peptide lengths = positions in the joined sequence where one
    # contiguous peptide ends and the next begins.
    breaks = set(S.cumsum([0] + [len(pp) for pp in polypeptides]))

    def nogaps(x, y):
        # There really should not be inserts with respect to the database
        # sequence, so every gap is heavily penalised.
        return -2000 - 200 * y

    def specificgaps(x, y):
        # Very minor penalty for gaps at breaks in the PDB structure;
        # gaps anywhere else are strongly discouraged.
        return (0 - y) if x in breaks else (-2000 - 200 * y)

    alignments = __PW.align.globalxc(sequence.seq.tostring(), ATOM_joined_seq, nogaps, specificgaps)
    if verbose:
        for a in alignments:
            __stderr.write(__PW.format_alignment(*a))
            __stderr.write('\n')
    return alignments
# Test: globalmc with callback gap penalties. Gaps in seq1 cost -2000 - y; gaps in seq2 cost
# -2 - y only at positions 0, 3 and len(seq2), otherwise -2000 - y. Exactly one alignment is
# expected and its formatted text is compared with the expected literal (Score=-10).
def test_gap_here_only_2(self): """Force a bad alignment. Forces a bad alignment by having a very expensive gap penalty where one would normally expect a gap, and a cheap gap penalty in another place. """ seq1 = "AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA" seq2 = "AABBBAAAACCCCAAAABBBAA" breaks = [0, 3, len(seq2)] # Very expensive to open a gap in seq1: nogaps = lambda x, y: -2000 - y # Very expensive to open a gap in seq2 unless it is in one of the allowed positions: specificgaps = lambda x, y: (-2 - y) if x in breaks else (-2000 - y) alignments = pairwise2.align.globalmc(seq1, seq2, 1, -1, nogaps, specificgaps) self.assertEqual(len(alignments), 1) formatted = pairwise2.format_alignment(*alignments[0]) self.assertEqual( formatted, """\ AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA |||||||||||||||||||||||||||||||||||| --AAB----------BBAAAACCCCAAAABBBAA-- Score=-10 """, )
def _pretty_print_align(align1, align2, score, begin, end):
    """Print summary figures for an alignment, then the formatted alignment.

    Args:
        align1, align2: Gapped aligned sequences (strings of equal length).
        score: Alignment score; reported as ``score / len(align1) * 100``
            under the label "Identity".
        begin, end: Alignment bounds, passed through to ``format_alignment``.

    Raises:
        ZeroDivisionError: if ``align1`` is empty.
    """
    import sys

    s = pairwise2.format_alignment(align1, align2, score, begin, end)
    a1 = n.fromstring(align1, dtype='S1')
    a2 = n.fromstring(align2, dtype='S1')
    # The original used Python 2 print statements. Each call below takes a
    # single pre-formatted string so the output is identical on Python 2 and 3.
    print("Identity: %.2f Alignment length: %s" % (float(score) / len(align1) * 100, len(align1)))
    # The print statement put a space between its two items, hence the
    # doubled space before %s.
    print("(a1 == a2).sum() =  %s" % (a1 == a2).sum())
    # Equivalent of ``print s,``: write the text without an extra newline.
    sys.stdout.write(s)
def main(argv):
    """Locally align the first two lines of a file and re-score the aligned region.

    Args:
        argv: Argument list; ``argv[1]`` is the path of a text file whose
            first two lines are the sequences to align.

    The sequences are aligned with ``localds`` (PAM250, gap open -5, gap
    extend -5). Every alignment is printed; the aligned region of the first
    one is then scored with ``score_pairwise`` and printed with its length.

    Raises:
        IndexError: if the aligner returns no alignment.
    """
    with open(argv[1], "r") as fstream:
        sequence1 = fstream.readline().rstrip()
        sequence2 = fstream.readline().rstrip()

    # Single-argument print() calls work on both Python 2 and 3; the original
    # print statements were Python 2 only.
    print(sequence1)
    print("-")
    print(sequence2)
    print("")

    matrix = MatrixInfo.pam250
    alignments = []
    for a in pairwise2.align.localds(sequence1, sequence2, matrix, -5, -5):
        print(format_alignment(*a))
        alignments.append(a)

    # Tuple layout: (aligned seq1, aligned seq2, score, begin, end).
    best = alignments[0]
    seq1 = best[0][best[3]:best[4]]
    seq2 = best[1][best[3]:best[4]]
    currentScore = score_pairwise(seq1, seq2, matrix, -5, -5)
    print(currentScore)
    print(seq1)
    print(len(seq1))
    print(seq2)
    print(len(seq2))
# Test: globalmc with callback gap penalties defined as nested functions. Gaps in seq1 cost
# -2000 - y; gaps in seq2 cost -2 - y only at positions 0, 3 and len(seq2), otherwise -2000 - y.
# Exactly one alignment is expected; its formatted text is compared (Score=-10).
def test_gap_here_only_2(self): """Force a bad alignment. Forces a bad alignment by having a very expensive gap penalty where one would normally expect a gap, and a cheap gap penalty in another place. """ seq1 = "AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA" seq2 = "AABBBAAAACCCCAAAABBBAA" def no_gaps(x, y): """Very expensive to open a gap in seq1.""" x = 0 # fool QuantifiedCode, x is not used here return -2000 - y def specific_gaps(x, y): """Very expensive to open a gap in seq2 ...unless it is in one of the allowed positions: """ breaks = [0, 3, len(seq2)] return (-2 - y) if x in breaks else (-2000 - y) alignments = pairwise2.align.globalmc(seq1, seq2, 1, -1, no_gaps, specific_gaps) self.assertEqual(len(alignments), 1) formatted = pairwise2.format_alignment(*alignments[0]) self.assertEqual(formatted, """\ AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA |||||||||||||||||||||||||||||||||||| --AAB----------BBAAAACCCCAAAABBBAA-- Score=-10 """)
# Test: globalmc with callback gap penalties defined as nested functions. Gaps in seq1 cost
# -2000 - y; gaps in seq2 cost -2 - y only at positions 0, 11 and len(seq2), otherwise -2000 - y.
# Exactly one alignment is expected; its formatted text is compared (Score=2).
def test_gap_here_only_1(self): seq1 = "AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA" seq2 = "AABBBAAAACCCCAAAABBBAA" def no_gaps(x, y): """Very expensive to open a gap in seq1.""" x = 0 # fool QuantifiedCode, x is not used here return -2000 - y def specific_gaps(x, y): """Very expensive to open a gap in seq2 ...unless it is in one of the allowed positions: """ breaks = [0, 11, len(seq2)] return (-2 - y) if x in breaks else (-2000 - y) alignments = pairwise2.align.globalmc(seq1, seq2, 1, -1, no_gaps, specific_gaps) self.assertEqual(len(alignments), 1) formatted = pairwise2.format_alignment(*alignments[0]) self.assertEqual(formatted, """\ AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA |||||||||||||||||||||||||||||||||||| --AABBBAAAACC----------CCAAAABBBAA-- Score=2 """)
# Test: localms("xxxABCDxxx", "zzzABzzCDz", 1, -0.5, -3, -1); the result count is not asserted.
# The sorted results' first two entries are formatted and compared (both Score=2).
def test_localms(self): """Two different local alignments""" aligns = sorted(pairwise2.align.localms("xxxABCDxxx", "zzzABzzCDz", 1, -0.5, -3, -1)) alignment = pairwise2.format_alignment(*aligns[0]) self.assertEqual(alignment, """\ --xxxABCDxxx || zzzABzzCDz-- Score=2 """) alignment = pairwise2.format_alignment(*aligns[1]) self.assertEqual(alignment, """\ xxxABCDxxx || zzzABzzCDz Score=2 """)
# Test: spot-checks three blosum62 entries (K/Q=1, A/A=4, H/H=8), then runs localds on
# 'VKAHGKKV' vs 'FQAHCAGV' (gap open -4, extend -4) and requires every returned alignment to
# format to the same expected text (Score=13).
def test_blosum62(self): """Test localds with blosum62.""" self.assertEqual(1, blosum62[('K', 'Q')]) self.assertEqual(4, blosum62[('A', 'A')]) self.assertEqual(8, blosum62[('H', 'H')]) alignments = pairwise2.align.localds('VKAHGKKV', 'FQAHCAGV', blosum62, -4, -4) for a in alignments: self.assertEqual(pairwise2.format_alignment(*a), "VKAHGKKV\n |||\nFQAHCAGV\n Score=13\n")
def seqdist(s1, s2, mismatchpen=-.5, gapopenpen=-.25, gapextendpen=-.05):
    """Return the global-alignment score of two sequences.

    The sequences are aligned with ``globalms`` using a match score of 1 and
    the given mismatch / gap-open / gap-extend penalties. The first alignment
    is printed and its score returned. (Despite the function name, the value
    is the alignment score, so larger means more similar.)

    Raises:
        StopIteration: if the aligner yields no alignment.
    """
    # s1, s2 = removecommongaps(s1, s2)
    from Bio.pairwise2 import align, format_alignment

    candidates = align.globalms(
        s1, s2, 1, mismatchpen, gapopenpen, gapextendpen)
    first = next(iter(candidates))
    print(format_alignment(*first))
    # Index 2 of an alignment tuple is its score.
    return first[2]
# Test: globalxs("abcde", "c", -0.3, -0.1) must return exactly one alignment, whose formatted
# text is compared with the expected literal (Score=0.2).
def test_align_one_char3(self): aligns = pairwise2.align.globalxs("abcde", "c", -0.3, -0.1) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ abcde ||||| --c-- Score=0.2 """)
# Test: localxd("GAT", "GTCT", -0.5, 0, -0.2, 0) must return exactly one alignment, whose
# formatted text is compared with the expected literal (Score=1.8).
def test_separate_gap_penalties2(self): aligns = pairwise2.align.localxd("GAT", "GTCT", -0.5, 0, -0.2, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GAT-- ||| G-TCT Score=1.8 """)
# Test: localds("ATT", "ATAT", self.match_dict, -1, 0) must return exactly one alignment, whose
# formatted text is compared with the expected literal (Score=3).
def test_match_dictionary3(self): aligns = pairwise2.align.localds("ATT", "ATAT", self.match_dict, -1, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ ATT- ||| ATAT Score=3 """)
# Test: globalms("GCT", "GATA", 1, -2, -0.1, 0) must return exactly one alignment, whose
# formatted text is compared with the expected literal (Score=-0.1 in this variant).
def test_match_score_open_penalty4(self): aligns = pairwise2.align.globalms("GCT", "GATA", 1, -2, -0.1, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GCT- |||| GATA Score=-0.1 """)
# Test: globalxs("GACT", "GT", -0.2, -1.5, penalize_extend_when_opening=1) must return exactly
# one alignment, whose formatted text is compared with the expected literal (Score=-1.2).
def test_penalize_extend_when_opening(self): aligns = pairwise2.align.globalxs("GACT", "GT", -0.2, -1.5, penalize_extend_when_opening=1) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| G--T Score=-1.2 """)
# Test: globalxs("GACT", "GT", -0.2, -0.5) must return exactly one alignment, whose formatted
# text is compared with the expected literal (Score=1.3).
def test_extend_penalty1(self): aligns = pairwise2.align.globalxs("GACT", "GT", -0.2, -0.5) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ GACT |||| G--T Score=1.3 """)
def findMaxAlignmentForProteins(protein_records, output_file, carrier_proteomes, mild_carrier_proteomes, non_carrier_proteomes):
    """Write the best alignment of each protein against three proteome groups.

    For every record a header line is written, followed by one section per
    group (carrier, mild carrier, non carrier). Each section lists, per
    proteome ID, the formatted result of ``computeMaxAlignment`` for that
    record and proteome.

    Args:
        protein_records: Iterable of records exposing ``.id``.
        output_file: Writable text stream.
        carrier_proteomes, mild_carrier_proteomes, non_carrier_proteomes:
            Mappings of proteome ID (string) -> proteome.

    Returns:
        None.
    """
    # Section heading paired with the proteome group it reports on.
    sections = (
        ("\t Carrier alignments : \n", carrier_proteomes),
        ("\t mild carrier alignments : \n", mild_carrier_proteomes),
        ("\t Non Carrier alignments : \n", non_carrier_proteomes),
    )

    output_file.write("\n")
    output_file.write("optimal alignments ---------------------------------- \n")
    for record in protein_records:
        output_file.write(record.id + " : " + "\n")
        output_file.write("\n")
        for heading, proteomes in sections:
            output_file.write(heading)
            for proteomeID in proteomes:
                alignment = computeMaxAlignment(record, proteomes[proteomeID])
                output_file.write("\t \t -" + proteomeID + " : \t" + format_alignment(alignment) + "\n")
            output_file.write("\n \n")
    return
# Test: globalxs("GAACT", "GAT", -0.1, 0) must return exactly one alignment, whose formatted
# text is compared with the expected literal (Score=2.9).
def test_match_score_open_penalty3(self): """Test 3.""" aligns = pairwise2.align.globalxs("GAACT", "GAT", -0.1, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GAACT || | GA--T Score=2.9 """, )
# Test: globalms("GCT", "GATA", 1, -2, -0.1, 0) must return exactly one alignment, whose
# formatted text is compared with the expected literal (Score=1.7 in this variant).
# The "noqa: W291" marker silences the trailing-whitespace lint for the expected literal.
def test_match_score_open_penalty4(self): """Test 4.""" aligns = pairwise2.align.globalms("GCT", "GATA", 1, -2, -0.1, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GC-T- | | G-ATA Score=1.7 """, # noqa: W291 )
def aligner(new_file, SSR_list):
    """Align each SSR against the matching gene record and write the results.

    Records are read in order from ``AutoJobber_Logs/SSR_Containing_Genes.fa``;
    the i-th record is globally aligned (BLOSUM62, gap open -10, gap extend
    -0.5, end gaps not penalised, one alignment only) against ``SSR_list[i]``
    and every formatted alignment is written to ``new_file``.

    Args:
        new_file: Writable text stream. It is closed before returning, even
            when an error interrupts the run.
        SSR_list: Sequence of SSR strings, indexed in step with the FASTA
            records.

    Raises:
        IndexError: if the FASTA file holds more records than ``SSR_list``.
    """
    fiter = fasta_iter('AutoJobber_Logs/SSR_Containing_Genes.fa')
    try:
        # enumerate replaces the hand-maintained SSR counter.
        for SSR, (_header, seq) in enumerate(fiter):
            alignments = pairwise2.align.globalds(
                SSR_list[SSR], seq, blosum62, -10, -0.5,
                penalize_end_gaps=False, one_alignment_only=True)
            for a in alignments:
                new_file.write(pairwise2.format_alignment(*a))
    finally:
        # The original leaked the open handle whenever alignment raised.
        new_file.close()
# Test: localxd("GAT", "GTCT", -0.5, 0, -0.2, 0) must return exactly one alignment; the
# expected text in this variant carries position numbers and only the aligned region (Score=1.8).
def test_separate_gap_penalties2(self): """Test 2.""" aligns = pairwise2.align.localxd("GAT", "GTCT", -0.5, 0, -0.2, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ 1 GAT | | 1 G-T Score=1.8 """, )
# Test: localds("ATT", "ATAT", self.match_dict, -1, 0) must return exactly one alignment; the
# expected text in this variant carries position numbers and only the aligned region (Score=3).
def test_match_dictionary3(self): """Test 3.""" aligns = pairwise2.align.localds("ATT", "ATAT", self.match_dict, -1, 0) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ 1 ATT ||. 1 ATA Score=3 """, )
# Test: localxs("abcde", "c", -0.3, -0.1) must return exactly one alignment; the expected text
# shows just the matched character with its start position in each sequence (Score=1).
def test_align_one_char1(self): """Test sequence with only one match.""" aligns = pairwise2.align.localxs("abcde", "c", -0.3, -0.1) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ 3 c | 1 c Score=1 """, )
def local_similarity(s1, s2):
    '''Return the best windowed local-alignment score between two sequences.

    The shorter sequence is aligned (``localxs``, gap open -10, gap extend
    -10) against windows of the longer one. Windows start every
    ``len(shorter)`` characters and are twice that long, so consecutive
    windows overlap by half. The highest integer score over all windows is
    returned; whenever it beats the running record stored on
    ``local_similarity.best_score`` the record is updated and the alignment
    is printed.

    Returns 0 when either sequence is empty or no window aligns at all.
    '''
    s1, s2 = sorted((s1, s2), key=len)
    read_length = len(s1)
    if read_length == 0:
        # Nothing to align; also avoids range() being given a step of 0.
        return 0

    score = 0
    for start_loc in range(0, len(s2), read_length):
        window = s2[start_loc:start_loc + 2 * read_length]
        results = pairwise2.align.localxs(s1, window, -10, -10)
        if not results:
            # A local alignment with no positive-scoring match is empty.
            continue
        new_score = int(results[0][2])
        if new_score > score:
            score = new_score
            if score > local_similarity.best_score:
                local_similarity.best_score = score
                print('new best alignment: ')
                print(pairwise2.format_alignment(*results[0]))
    return score
# Test: globalxs("abcde", "c", -0.3, -0.1) must return exactly one alignment, whose formatted
# text is compared with the expected literal (Score=0.2).
# The "noqa: W291" marker silences the trailing-whitespace lint for the expected literal.
def test_align_one_char3(self): """Like test 1, but global alignment.""" aligns = pairwise2.align.globalxs("abcde", "c", -0.3, -0.1) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ abcde | --c-- Score=0.2 """, # noqa: W291 )
# Test: globalxs("GACT", "GT", -0.5, -0.2) must return exactly one alignment, whose formatted
# text is compared with the expected literal (Score=1.3).
def test_extend_penalty1(self): """Test 1.""" aligns = pairwise2.align.globalxs("GACT", "GT", -0.5, -0.2) self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual( alignment, """\ GACT | | G--T Score=1.3 """, )
# Test: localxs("AxBx", "zABz", -0.1, 0, force_generic=True) must return exactly one alignment;
# the expected text carries position numbers and only the aligned region (Score=1.9).
def test_localxs_generic(self): """Test the generic method with local alignments.""" aligns = sorted(pairwise2.align.localxs("AxBx", "zABz", -0.1, 0, force_generic=True)) # From Biopython 1.74 on this should only give one alignment, since # we disallow leading and trailing 'zero-extensions' self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end) self.assertEqual(alignment, """\ 1 AxB | | 2 A-B Score=1.9 """)
def format_alignment(mol1: Molecule, mol2: Molecule):
    '''Locally align two molecules and render the alignment in 60-column blocks.

    The sequences are aligned with ``localds`` (BLOSUM62, gap open -12, gap
    extend -4) and the first alignment is formatted with
    ``pairwise2.format_alignment``.

    Args:
        mol1, mol2: Molecules exposing ``seq``, ``dbname`` and (for ``mol1``)
            ``name``.

    Returns:
        A dict with two keys:
            ``text``: a header line followed by the alignment split into
                blocks of at most 60 columns (sequence 1, match line,
                sequence 2, blank line).
            ``identity``: fraction of counted columns in which both formatted
                sequence lines hold the same character. Digits and spaces in
                the first line are not counted. 0.0 when nothing is counted.

    Raises:
        IndexError: if the aligner returns no alignment.
    '''
    alignment = align.localds(mol1.seq, mol2.seq, blosum62, -12, -4)
    formatted_lines = pairwise2.format_alignment(*alignment[0]).split('\n')
    seq_mol1 = formatted_lines[0]
    result_raw = formatted_lines[1]
    seq_mol2 = formatted_lines[2]

    # Count matching / differing columns, ignoring the position numbers and
    # padding that format_alignment puts on the sequence lines.
    count = 0
    errors = 0
    for char1, char2 in zip(seq_mol1, seq_mol2):
        if char1.isnumeric() or char1 == ' ':
            continue
        if char1 == char2:
            count += 1
        else:
            errors += 1

    # Emit every block, including the final partial one. The original only
    # flushed complete 60-column blocks, so the tail of the alignment (or all
    # of it, when shorter than 60 columns) never reached the output.
    width = 60
    blocks = []
    for start in range(0, len(seq_mol1), width):
        stop = start + width
        blocks.append(
            f"{seq_mol1[start:stop]}\n{result_raw[start:stop]}\n{seq_mol2[start:stop]}\n\n"
        )
    body = ''.join(blocks)

    compared = count + errors
    identity = count / compared if compared else 0.0
    header = "< %s - %s | %s | %.1f%%\n" % (mol1.dbname, mol2.dbname, mol1.name, identity * 100)
    text = header + body
    return {'text': text, 'identity': identity}
# Builds position maps between a reference sequence and another sequence.
# The two sequences are globally aligned (globalds, blosum62, gap open -10.0, gap extend -0.5,
# one_alignment_only=True) and the formatted alignment is printed. Walking the alignment columns
# then fills:
#   self.ref_to_other_positions / self.other_to_ref_positions -- dicts between ungapped positions
#   self.reference_matched_positions / self.other_matched_positions -- parallel lists of the
#       positions at columns where neither sequence has a gap
# ref_started / other_started flip to True at the first non-gap column of each sequence and gate
# which columns are recorded.
# NOTE(review): statement nesting inside the loop cannot be read from this collapsed view (e.g.
# whether `old_ref_position = ref_position` runs on every column) -- confirm before refactoring.
def __init__(self, reference_sequence, other_sequence): matrix = matlist.blosum62 self.reference_sequence = reference_sequence self.other_sequence = other_sequence #using the default in Emboss Needle on the EBI website. Blosum62 sub matrix, gap penalty of -10, extension penalty of -0.5 alignment = pairwise2.align.globalds(reference_sequence, other_sequence, matrix, -10.0, -0.5, one_alignment_only=True) print('alignment') print(format_alignment(*alignment[0])) ref_alignment = alignment[0][0] other_alignment = alignment[0][1] self.ref_to_other_positions = {} self.other_to_ref_positions = {} assert (len(ref_alignment) == len(other_alignment)) ref_position = 0 other_position = 0 other_started = False ref_started = False """ self.reference_matched_positions should be the same length as self.other_matched_positions Basically, just have positions of match states in both the reference and other sequence """ self.reference_matched_positions = [] self.other_matched_positions = [] for i in range(0, len(ref_alignment)): if (not ref_started) and ref_alignment[i] != '-': ref_started = True if (not other_started) and other_alignment[i] != '-': other_started = True old_ref_position = ref_position if other_started and ref_started and ref_alignment[ i] != '-' and other_alignment[i] != '-': self.reference_matched_positions.append(ref_position) self.other_matched_positions.append(other_position) if ref_alignment[i] != '-' and other_started: self.ref_to_other_positions[ref_position] = other_position if ref_alignment[i] != '-': ref_position += 1 if other_alignment[i] != '-' and ref_started: self.other_to_ref_positions[other_position] = old_ref_position if other_alignment[i] != '-': other_position += 1
def align_l(self):
    """Run a local alignment (``localxx``) and write the results to ``output.txt``.

    When ``self.var_fromfile`` is 1 the two sequences come from the FASTA data
    loaded by ``self.load_fasta()``; otherwise they are read from the
    ``field_S1`` / ``field_S2`` input fields and ``self.sequences`` is reset.
    """
    use_file = self.var_fromfile.get() == 1
    if use_file:
        self.load_fasta()
        # Each entry of self.sequences is indexed as [..][1] for its sequence text.
        seq1 = self.sequences[0][1]
        seq2 = self.sequences[1][1]
    else:
        self.sequences = []
        seq1 = self.field_S1.get()
        seq2 = self.field_S2.get()

    alignments = pairwise2.align.localxx(seq1, seq2)
    with open('output.txt', 'w') as out:
        out.write("SEQUENCE ALIGNMENT TOOL OUTPUT - LOCAL ALIGNMENT\n")
        for found in alignments:
            out.write(format_alignment(*found))
            out.write("\n")
def balign(first_seq, second_seq, op_gap=-5, ext_gap=-0.5):
    """Globally align two sequences with BLOSUM62 and return the first alignment.

    Args:
        first_seq, second_seq: Sequences to align.
        op_gap: Gap-open penalty.
        ext_gap: Gap-extension penalty.

    Returns:
        The first alignment tuple returned by ``globalds``; it is also printed
        in formatted form.

    Raises:
        IndexError: if the aligner returns no alignment.
    """
    scoring = matlist.blosum62
    candidates = pairwise2.align.globalds(first_seq, second_seq, scoring, op_gap, ext_gap)
    # The aligner lists its results; the first entry is the one reported.
    top_aln = candidates[0]
    print(pairwise2.format_alignment(*top_aln))
    return top_aln
def get_pairwise2_localds_result(self, asequence, bsequence, matrx=blosum62, gap_open_penalty=10, extension_penalty=1):
    """Locally align two sequences and return the formatted lines without digits.

    Both inputs are upper-cased and stripped of spaces before alignment with
    ``localds``. The penalties are given as positive numbers and negated here.

    Returns:
        A 4-tuple: the first three lines of the formatted first alignment
        (sequence A, match line, sequence B) with every digit removed, then
        the complete formatted alignment text.

    Raises:
        IndexError: if the aligner returns no alignment.
    """
    def without_digits(text):
        # Drops the position numbers format_alignment may put on a line.
        return ''.join(ch for ch in text if not ch.isdigit())

    cleaned_a = asequence.upper().replace(" ", "")
    cleaned_b = bsequence.upper().replace(" ", "")
    hits = pairwise2.align.localds(
        cleaned_a, cleaned_b, matrx, -gap_open_penalty, -extension_penalty)
    alignments_result = pairwise2.format_alignment(*hits[0])
    rows = alignments_result.split("\n")
    return without_digits(rows[0]), without_digits(rows[1]), without_digits(rows[2]), alignments_result
def alignment_pairwise2(fuzzy_list, string, Dict):
    '''Align ``string`` against the region of each dictionary value hit by a fuzzy match.

    Every second entry of ``fuzzy_list`` (indices 1, 3, 5, ...) is turned into
    text and parsed for a start offset, an end offset and the matched string.
    For each ``Dict`` value containing that matched string, the slice
    ``value[start:end]`` is locally aligned with ``string`` (match 1,
    mismatch 0, gap open -1, gap extend -0.5, one alignment only).

    Returns:
        A list of strings, each the dictionary key, a newline, and the
        formatted alignment.
    '''
    collected = []
    for idx in range(1, len(fuzzy_list), 2):
        # presumably text shaped like "...(start=S, end=E, <x>=..., matched='M')";
        # verify against whatever produces fuzzy_list
        fields = str(fuzzy_list[idx]).split(",")
        start_ = int(fields[0].split("=")[1])
        end_ = int(fields[1].split("=")[1])
        match_ = fields[3].split("=")[1].strip("'").split("'")[0]
        for k, v in Dict.items():
            if match_ not in v:
                continue
            hits = pairwise2.align.localms(v[start_:end_:1], string, 1, 0, -1, -0.5, one_alignment_only=True)
            for a in hits:
                collected.append(k + "\n" + format_alignment(*a))
    return collected
def align_to_ref(self, ref_seq, query_seq, ident_score=4, sim_score=2, gap_open=-2, gap_ext=-.5, verbose=False):
    """Return the local-alignment score of ``query_seq`` against ``ref_seq``.

    Args:
        ref_seq, query_seq: Sequences to align with ``localms``.
        ident_score: Score passed as the match score.
        sim_score: Score passed in the mismatch-score position.
        gap_open, gap_ext: Gap-open and gap-extension penalties.
        verbose: When true, print the formatted alignment, the score, and
            ``self.aligned_seq_len`` next to the aligned query length.

    Returns:
        The score of the first alignment.

    Raises:
        IndexError: if the aligner returns no alignment.
    """
    top = pairwise2.align.localms(ref_seq, query_seq, ident_score, sim_score, gap_open, gap_ext)[0]
    score = top[2]
    if verbose:
        print(format_alignment(*top))
        print(score)
        print(self.aligned_seq_len, len(top[1]))
    return score
# Test: globalmc with callback gap penalties given as lambdas. Gaps in seq1 cost -2000 - y;
# gaps in seq2 cost -2 - y only at positions 0, 11 and len(seq2), otherwise -2000 - y.
# Exactly one alignment is expected; its formatted text is compared (Score=2).
def test_gap_here_only_1(self): seq1 = "AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA" seq2 = "AABBBAAAACCCCAAAABBBAA" breaks = [0, 11, len(seq2)] # Very expensive to open a gap in seq1: nogaps = lambda x, y: -2000 - y # Very expensive to open a gap in seq2 unless it is in one of the allowed positions specificgaps = lambda x, y: (-2 -y) if x in breaks else (-2000 - y) alignments = pairwise2.align.globalmc(seq1, seq2, 1, -1, nogaps, specificgaps) self.assertEqual(len(alignments), 1) formatted = pairwise2.format_alignment(*alignments[0]) self.assertEqual(formatted, """\ AAAABBBAAAACCCCCCCCCCCCCCAAAABBBAAAA |||||||||||||||||||||||||||||||||||| --AABBBAAAACC----------CCAAAABBBAA-- Score=2 """)
def getGlobalAlign(seq1, seq2):
    """Print all global alignments (``globalxx``) of two sequences.

    Output order: a banner, the raw result list, each raw alignment tuple,
    then each alignment in formatted form. ``BIYellow``, ``BICyan`` and
    ``White`` are module-level strings printed around the sections
    (presumably terminal colour codes).
    """
    banner = BIYellow + "GLOBAL ALIGNMENTS (Needleman-Wunsch algorithm)" + White
    print(banner)

    found = pairwise2.align.globalxx(seq1, seq2)
    print(found)
    for raw in found:
        print(raw)

    print(BICyan)
    from Bio.pairwise2 import format_alignment
    for raw in found:
        pretty = format_alignment(*raw)
        print(pretty)
    print(White)
def mainloop():
    """Interactively align pairs of sequences until the process is interrupted.

    Each round prompts for two sequences, prints the first ``globalxx``
    alignment plus the total and per-codon-position GC content of both
    sequences, waits for Enter, clears the console with the ``cls`` shell
    command and starts over.

    The original restarted by calling itself, which grows the call stack on
    every round and eventually raises RecursionError; a loop gives the same
    interaction without that limit.
    """
    while True:
        seq = input("Sequence 1:")
        seq2 = input("Sequence 2:")
        alignments = pairwise2.align.globalxx(seq, seq2)
        print(format_alignment(*alignments[0]))
        print("No1.Total GC content:")
        print(GC(seq))
        print("No1.GC by parts:")
        print(GC123(seq))
        print("No2.Total GC content:")
        print(GC(seq2))
        print("No2.GC by parts:")
        print(GC123(seq2))
        input('next prot')
        # 'cls' is the Windows console clear command.
        os.system('cls')
def _check_seq(self):
    """Verify that this object's sequence matches its synthesis protocol.

    The check is skipped silently when either the sequence or the single
    ``product_seqs`` synthesis attribute cannot be obtained (``QueryError``
    or ``ValueError``).

    Raises:
        CheckError: if the two sequences differ. The error carries the
            ``globalxx`` alignments and an ``info`` callback that renders the
            first one.
    """
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment

    try:
        primary_seq = self.seq
        protocol_seq = one(self.get_synthesis_attr('product_seqs'))
    except (QueryError, ValueError):
        # Deliberate best-effort: nothing to compare, so nothing to report.
        pass
    else:
        if primary_seq != protocol_seq:
            alignments = pairwise2.align.globalxx(primary_seq, protocol_seq)
            err = CheckError(culprit=self, alignments=alignments)
            err.brief = "sequence doesn't match construction"
            # format_alignment takes (align1, align2, score, begin, end) as
            # separate arguments, so the alignment tuple must be unpacked;
            # passing the tuple itself raised TypeError when info was rendered.
            err.info = lambda e: format_alignment(*e.alignments[0])
            raise err
def result_local(request):
    """Render the local alignments of the two sequences posted in the request.

    The POST fields ``tool1`` and ``tool2`` (each defaulting to the literal
    ``'default'``) have their spaces removed and are wrapped in ``Seq``
    objects. All formatted alignments are concatenated and passed to the
    ``mysite/result_align.html`` template as ``res``.
    """
    from Bio import pairwise2

    str1 = request.POST.get('tool1', 'default').replace(" ", "")
    str2 = request.POST.get('tool2', 'default').replace(" ", "")
    dna6a = Seq(str1)
    dna6b = Seq(str2)

    # The original never assigned `alignments` (NameError at the loop below)
    # and left dna6a / dna6b unused.
    # NOTE(review): localxx is inferred from the view name "result_local";
    # confirm the intended scoring scheme.
    alignments = pairwise2.align.localxx(dna6a, dna6b)

    stri = "".join(str(format_alignment(*a)) for a in alignments)
    params = {'res': stri}
    return render(request, 'mysite/result_align.html', params)
def getLocalAlign(seq1, seq2):
    """Print all local alignments (``localxx``) of two sequences.

    Output order: a banner, the raw result list, each raw alignment tuple,
    then each alignment in formatted form. ``BIYellow``, ``BICyan`` and
    ``White`` are module-level strings printed around the sections
    (presumably terminal colour codes).
    """
    banner = BIYellow + "LOCAL ALIGNMENTS (Smith-Waterman algorithm)" + White
    print(banner)

    found = pairwise2.align.localxx(seq1, seq2)
    print(found)
    for raw in found:
        print(raw)

    print(BICyan)
    from Bio.pairwise2 import format_alignment
    for raw in found:
        pretty = format_alignment(*raw)
        print(pretty)
    print(White)
def align(s1, s2, test=False, psm=2, pmm=0.5, pgo=-3, pge=-1):
    """Create a pairwise local alignment between two sequences.

    Both sequences are upper-cased first. When any of the four scoring
    parameters is ``None`` the parameter-free ``localxx`` is used, otherwise
    ``localms``.

    :param s1: sequence 1
    :param s2: sequence 2
    :param test: when true, print the raw alignments and the formatted
        first alignment
    :param psm: match score
    :param pmm: mismatch score
    :param pgo: gap-open penalty
    :param pge: gap-extension penalty
    :returns: ``(alignsymb, score)`` for the first alignment, where
        ``alignsymb`` is the match line of the formatted alignment with
        spaces replaced by ``-``; ``(nan, nan)`` when there is no alignment.

    REF: http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html

    The match parameters are:

    ====  ===========================================================
    CODE  DESCRIPTION
    ====  ===========================================================
    x     No parameters. Identical characters have score of 1,
          otherwise 0.
    m     A match score is the score of identical chars, otherwise
          mismatch score.
    d     A dictionary returns the score of any pair of characters.
    c     A callback function returns scores.
    ====  ===========================================================

    The gap penalty parameters are:

    ====  ===========================================================
    CODE  DESCRIPTION
    ====  ===========================================================
    x     No gap penalties.
    s     Same open and extend gap penalties for both sequences.
    d     The sequences have different open and extend gap penalties.
    c     A callback function returns the gap penalties.
    ====  ===========================================================
    """
    from Bio import pairwise2

    if any(p is None for p in (psm, pmm, pgo, pge)):
        alignments = pairwise2.align.localxx(s1.upper(), s2.upper())
    else:
        alignments = pairwise2.align.localms(s1.upper(), s2.upper(), psm, pmm, pgo, pge)
    if test:
        print(alignments)

    if not alignments:
        # The original fell through to np.nan.replace(...) here, which raises
        # AttributeError; return the NaN placeholders instead.
        return np.nan, np.nan

    # Only the first alignment is reported, in the order the aligner returns
    # them (the original also built a sorted copy that it never used).
    first = alignments[0]
    alignstr = pairwise2.format_alignment(*first)
    alignsymb = alignstr.split('\n')[1]
    score = first[2]
    if test:
        print(alignstr)
    return alignsymb.replace(' ', '-'), score
# Test: localxs("AxBx", "zABz", -0.1, 0) must return exactly one alignment; it is formatted with
# full_sequences=True, so the expected text shows both complete gapped sequences (Score=1.9).
# The "noqa: W291" marker silences the trailing-whitespace lint for the expected literal.
def test_localxs_2(self): """Test localxs with ``full_sequences=True``.""" aligns = sorted(pairwise2.align.localxs("AxBx", "zABz", -0.1, 0)) # From Biopython 1.74 on this should only give one alignment, since # we disallow leading and trailing 'zero-extensions' self.assertEqual(len(aligns), 1) seq1, seq2, score, begin, end = aligns[0] alignment = pairwise2.format_alignment(seq1, seq2, score, begin, end, full_sequences=True) self.assertEqual(alignment, """\ -AxBx | | zA-Bz Score=1.9 """) # noqa: W291
def main():
    """Load the FASTA records, tabulate them, and align the first two.

    Steps:
      1. Parse ``../pdm2_neurogenic.fa`` and print each record ID.
      2. Build and print ``df_sequences``, one row per record.
      3. Globally align (``globalxx``) the first two sequences and print the
         first formatted alignment.
    """
    file_name = '../pdm2_neurogenic.fa'
    print("Reading in Original Sequences:")
    print("Completed Sequences by ID:\n")
    sequences = []
    for seq_record in SeqIO.parse(file_name, 'fasta'):
        print(seq_record.id, "\n")
        sequences.append(seq_record)
    print("Completed.")

    # `df_sequences`: pandas DataFrame with one sequence per row
    #   `id`:       description header of each sequence
    #   `seq`:      letter sequence
    #   `length`:   integer length of the sequence
    #   `func`:     0 for negative function, 1 for positive function
    #               (taken from the last character of the id, '-' or '+')
    #   `species`:  the four characters following 'MEMB' in the id
    #   `GC_ratio`: ratio of GC pairs to full sequence
    ids = [rec.id for rec in sequences]
    letters = [rec.seq for rec in sequences]
    lengths = [len(rec.seq) for rec in sequences]
    df_sequences = pd.DataFrame({"id": ids, "seq": letters, "length": lengths})

    last_char = df_sequences['id'].str[-1]
    df_sequences['func'] = last_char.replace({'-': 0, '+': 1})
    df_sequences['species'] = df_sequences['id'].str.findall('MEMB(....)')

    # GC() is divided by 100 here to turn it into a ratio.
    df_sequences["GC_ratio"] = df_sequences["seq"].apply(GC) / 100.0
    print(df_sequences)

    # globalxx: no cost for misaligned pairs, a value of 1 for matched pairs.
    first_seq = df_sequences.seq.iloc[0]
    second_seq = df_sequences.seq.iloc[1]
    alignment_0_1 = pairwise2.align.globalxx(first_seq, second_seq)
    print(pairwise2.format_alignment(*alignment_0_1[0]))
def get_alignment(Rb, Ru):
    """Return the formatted global alignment of two structures' CA-residue sequences.

    For each structure the CA atoms are selected, their residue names are
    converted to a sequence with ``convert_res``, and the two sequences are
    aligned with ``globalxx``. The *last* alignment in the result list is the
    one formatted and returned.

    Args:
        Rb, Ru: Structure objects providing ``atomselect`` and ``get_subset``.

    Returns:
        The text produced by ``format_alignment`` for the last alignment.
    """
    # Index 1 of the atomselect(..., get_index=True) result holds the indices.
    ca_in_Rb = Rb.atomselect("*", "*", "CA", get_index=True)[1]
    ca_in_Ru = Ru.atomselect("*", "*", "CA", get_index=True)[1]

    Rb_ca = Rb.get_subset(ca_in_Rb)
    Ru_ca = Ru.get_subset(ca_in_Ru)

    # Residue names -> sequence code.
    Rb_seq = convert_res(Rb_ca.data["resname"])
    Ru_seq = convert_res(Ru_ca.data["resname"])

    candidates = pairwise2.align.globalxx(Rb_seq, Ru_seq)
    return format_alignment(*candidates[-1])
def findBestAlignment(seq, query, dna=False, offset=0, show=False):
    """Locate the best local alignment of ``query`` within ``seq``.

    Protein input (``dna=False``) is aligned with ``localds`` and BLOSUM62
    (gap penalties -100 / -100) after replacing ``*`` with ``X``; DNA input
    is aligned with ``localms`` (match 1, mismatch -2, gaps -2 / -2).

    Args:
        seq: Sequence to search in.
        query: Sequence to look for.
        dna: Select DNA scoring instead of protein scoring.
        offset: Value added to both returned coordinates.
        show: When true, print the formatted and the raw best alignment.

    Returns:
        ``(start, end, gapped)`` with 1-based coordinates, or
        ``(-1, -1, True)`` when the aligner finds nothing.
    """
    if dna:
        alignments = align.localms(seq, query, 1, -2, -2, -2)
    else:
        alignments = align.localds(seq.replace('*', 'X'), query, matlist.blosum62, -100, -100)

    scores = [a[2] for a in alignments]
    if not scores:
        return -1, -1, True

    # First alignment carrying the maximum score.
    top = alignments[scores.index(max(scores))]
    if show:
        print(format_alignment(*top))
        print(top)

    # The region is taken to start where the two sequences begin to overlap,
    # leading mismatches included (they may be due to mutations), so the
    # start comes from extend5align rather than from the alignment's own
    # begin index.
    start = extend5align(top) + offset + 1  # 1-based start
    end = int(offset + top[-1])             # 1-based end

    # Gap characters exist only in the aligned copy of seq, not in seq
    # itself, so those that fall at or before each boundary are subtracted.
    gapped = '-' in top[0]
    if gapped:
        start -= top[0][:(top[-2] + 1)].count('-')
        end -= top[0][:(top[-1] + 1)].count('-')
    return start, end, gapped  # 1-based