def add_consensus(alignment, threshold=0.9, ambiguous='-', name='consensus'):
    """Append a gap-aware consensus record to the alignment and return it.

    The consensus comes from ``SummaryInfo.gap_consensus`` called with
    ``threshold`` and ``ambiguous``; the new record uses ``name`` for both
    its id and its name.  ``alignment`` is extended in place and the same
    object is returned.
    """
    consensus_seq = SummaryInfo(alignment).gap_consensus(threshold, ambiguous)
    consensus_record = SeqRecord(consensus_seq, id=name, name=name)
    alignment.extend([consensus_record])
    return alignment
def get_hist_ss_in_aln_as_string(alignment,type='Unknown',debug=0):
    """Return the level-0 annotation line for an alignment (borrowed from aln2html).

    Features are obtained from ``get_hist_ss_in_aln_for_html`` as a dict
    keyed by ``(begin, end)`` whose values hold a 'symbol' and optionally a
    'level' (default 0).  Each feature is drawn on the first level, starting
    at its own, whose span is still blank; only levels 0-2 exist, and a
    feature that fits on none of them is left out.

    Returns the level-0 line as a list of characters, one per consensus
    column.  When no features are found the legacy placeholder ``0`` is
    returned, as before.  Levels 1 and 2 are computed but currently ignored.

    ``debug`` is accepted for interface compatibility and is not used.
    """
    cons=SummaryInfo(alignment).gap_consensus(threshold=0.9, ambiguous='X')
    features=get_hist_ss_in_aln_for_html(alignment,type=type,debug=0)
    if not features:
        return 0

    blank=re.compile(r'^\s+$')
    annot_line=[list(' '*len(cons)) for _ in range(3)]
    # Features are placed in order of their start position.
    for k in sorted(features,key=lambda x: x[0]):
        start,stop=k[0],k[1]+1
        # Bounded fallback: never step past the last available level.
        for lev in range(features[k].get('level',0),len(annot_line)):
            if blank.match(''.join(annot_line[lev][start:stop])):
                annot_line[lev][start:stop]=features[k]['symbol']*(stop-start)
                break
    return annot_line[0] #other are ignored currently
def add_consensus(alignment,threshold=0.9, ambiguous='-',name='consensus'):
    """Add a consensus line to the alignment.

    Builds the ``gap_consensus`` of ``alignment`` (using ``threshold`` and
    ``ambiguous``), wraps it in a record whose id and name are ``name``,
    extends ``alignment`` with it in place and returns ``alignment``.
    """
    summary=SummaryInfo(alignment)
    record=SeqRecord(summary.gap_consensus(threshold, ambiguous),id=name,name=name)
    alignment.extend([record])
    return alignment
def get_hist_ss_in_aln_as_string(alignment, type='Unknown', debug=0):
    """Return the level-0 annotation line for an alignment (borrowed from aln2html).

    Features are obtained from ``get_hist_ss_in_aln_for_html`` as a dict
    keyed by ``(begin, end)`` whose values hold a 'symbol' and optionally a
    'level' (default 0).  Each feature is drawn on the first level, starting
    at its own, whose span is still blank; only levels 0-2 exist, and a
    feature that fits on none of them is left out.

    Returns the level-0 line as a list of characters, one per consensus
    column.  When no features are found the legacy placeholder ``0`` is
    returned, as before.  Levels 1 and 2 are computed but currently ignored.

    ``debug`` is accepted for interface compatibility and is not used.
    """
    sinfo = SummaryInfo(alignment)
    cons = sinfo.gap_consensus(threshold=0.9, ambiguous='X')
    features = get_hist_ss_in_aln_for_html(alignment, type=type, debug=0)
    if not features:
        return 0

    blank = re.compile(r'^\s+$')
    annot_line = [list(' ' * len(cons)) for _ in range(3)]
    # Features are placed in order of their start position.
    for k in sorted(features, key=lambda x: x[0]):
        feature = features[k]
        start, stop = k[0], k[1] + 1
        # Bounded fallback: never step past the last available level.
        for lev in range(feature.get('level', 0), len(annot_line)):
            if blank.match(''.join(annot_line[lev][start:stop])):
                annot_line[lev][start:stop] = feature['symbol'] * (stop - start)
                break
    return annot_line[0]  # other levels are ignored currently
def test_proteins(self):
    """Check consensus, PSSM text and information content of a 3-record protein alignment."""
    # Alphabet declares '-' as the gap character and '*' as the stop symbol.
    alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")
    ])
    self.assertEqual(32, a.get_alignment_length())
    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
    # The gap consensus is used as the row axis of the score matrix.
    m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
    # NOTE(review): the whitespace inside the expected literal below looks
    # collapsed in this copy -- confirm it against the real str(m) layout.
    self.assertEqual(
        str(m), """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)
    # Gap and stop symbols are excluded from the information content.
    ic = s.information_content(chars_to_ignore=['-', '*'])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def test_nucleotides(self):
    """Check consensus, PSSM text and information content for GFF/multi.fna."""
    filename = "GFF/multi.fna"
    format = "fasta"
    alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
    summary = SummaryInfo(alignment)
    c = summary.dumb_consensus(ambiguous="N")
    self.assertEqual(str(c), "NNNNNNNN")
    c = summary.gap_consensus(ambiguous="N")
    self.assertEqual(str(c), "NNNNNNNN")
    # Uniform expected frequencies over the four bases.
    expected = {"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}
    m = summary.pos_specific_score_matrix(chars_to_ignore=["-"], axis_seq=c)
    # NOTE(review): the whitespace inside the expected literal below looks
    # collapsed in this copy -- confirm it against the real str(m) layout.
    self.assertEqual(
        str(m), """ A C G T N 2.0 0.0 1.0 0.0 N 1.0 1.0 1.0 0.0 N 1.0 0.0 2.0 0.0 N 0.0 1.0 1.0 1.0 N 1.0 2.0 0.0 0.0 N 0.0 2.0 1.0 0.0 N 1.0 2.0 0.0 0.0 N 0.0 2.0 1.0 0.0 """)
    # We have a generic alphabet without a declared gap character, so the
    # expected frequencies and the chars to ignore must be given explicitly.
    ic = summary.information_content(e_freq_table=expected, chars_to_ignore=["-"])
    self.assertAlmostEqual(ic, 7.32029999423075, places=6)
def test_proteins(self):
    """Check consensus, PSSM text and information content of a 3-record protein alignment."""
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*"), id="ID003")
    ])
    self.assertEqual(32, a.get_alignment_length())
    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
    # The gap consensus is used as the row axis of the score matrix.
    m = s.pos_specific_score_matrix(chars_to_ignore=["-", "*"], axis_seq=c)
    # NOTE(review): the whitespace inside the expected literal below looks
    # collapsed in this copy -- confirm it against the real str(m) layout.
    self.assertEqual(
        str(m), """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0 
3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)
    # Every letter of IUPACData.protein_letters gets the same expected frequency.
    letters = IUPACData.protein_letters
    base_freq = 1.0 / len(letters)
    e_freq_table = {letter: base_freq for letter in letters}
    # Gap and stop symbols are excluded from the information content.
    ic = s.information_content(e_freq_table=e_freq_table, chars_to_ignore=["-", "*"])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def test_proteins(self):
    """Check consensus, PSSM text and information content of a 3-record protein alignment."""
    # Alphabet declares '-' as the gap character and '*' as the stop symbol.
    alpha = HasStopCodon(Gapped(generic_protein, "-"), "*")
    a = MultipleSeqAlignment([
        SeqRecord(Seq("MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-", alpha), id="ID001"),
        SeqRecord(Seq("MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*", alpha), id="ID002"),
        SeqRecord(Seq("MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*", alpha), id="ID003")])
    self.assertEqual(32, a.get_alignment_length())
    s = SummaryInfo(a)
    c = s.dumb_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHQAIFIYQIGYXXLKSGYIQSIRSPEYDNW*")
    c = s.gap_consensus(ambiguous="X")
    self.assertEqual(str(c), "MHXXIFIYQIGYXXLKSGYIQSIRSPEYXNWX")
    # The gap consensus is used as the row axis of the score matrix.
    m = s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
    # NOTE(review): the whitespace inside the expected literal below looks
    # collapsed in this copy -- confirm it against the real str(m) layout.
    self.assertEqual(str(m), """ A D E F G H I K L M N P Q R S W Y M 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 H 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 X 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 F 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 L 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 K 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 G 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Q 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 I 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 R 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 S 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 E 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 Y 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 X 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 W 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 X 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 """)
    # Gap and stop symbols are excluded from the information content.
    ic = s.information_content(chars_to_ignore=['-', '*'])
    self.assertAlmostEqual(ic, 133.061475107, places=6)
def trim_aln_gaps(alignment,threshold=0.8):
    """Return a new alignment without the gap-consensus columns.

    A column is dropped when ``gap_consensus`` (called with ``threshold``
    and 'X' as the ambiguous symbol) reports '-' for it; the remaining
    columns are concatenated in their original order.
    """
    consensus=SummaryInfo(alignment).gap_consensus(threshold=threshold, ambiguous='X')
    # Zero-width slice: an empty alignment that keeps the original records.
    trimmed=alignment[:,0:0]
    for col,residue in enumerate(consensus):
        if residue!='-':
            trimmed+=alignment[:,col:col+1]
    return trimmed
def trim_aln_gaps(alignment, threshold=0.8):
    """Drop the columns whose gap consensus is '-' and return the result.

    The consensus is computed with ``gap_consensus`` using ``threshold`` and
    'X' as the ambiguous symbol.  Kept columns are appended, one at a time
    and in order, to an initially empty slice of ``alignment``.
    """
    summary = SummaryInfo(alignment)
    consensus = summary.gap_consensus(threshold=threshold, ambiguous='X')
    kept = alignment[:, 0:0]
    for position, symbol in enumerate(consensus):
        if symbol == '-':
            continue
        kept += alignment[:, position:position + 1]
    return kept
def get_hist_ss_in_aln(alignment, type='Unknown', debug=0):
    """Return the sequence elements found in a histone alignment.

    The 0.5-threshold gap consensus of ``alignment`` is taken, its gap
    characters are replaced by 'X', and the result is passed to
    ``get_hist_ss`` together with ``type`` and ``debug``.  All numbers
    assume the first element in the sequence has number 0 (not like in PDB).
    """
    if debug:
        print(alignment)
    # Let's extract the consensus.
    raw_consensus = SummaryInfo(alignment).gap_consensus(threshold=0.5, ambiguous='X')
    cons = Seq(str(raw_consensus).replace('-', 'X'))
    if debug:
        print("Consensus")
        print(cons)
    hv, ss = get_hist_ss(cons, type, debug)
    return hv, ss
def get_hist_ss_in_aln(alignment,type='Unknown',debug=0):
    """Return the sequence elements found in a histone alignment.

    The 0.5-threshold gap consensus of ``alignment`` is taken, its gap
    characters are replaced by 'X', and the result is passed to
    ``get_hist_ss`` together with ``type`` and ``debug``.  All numbers
    assume the first element in the sequence has number 0 (not like in PDB).

    Returns the ``(hv, ss)`` pair produced by ``get_hist_ss``.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    if(debug):
        print(alignment)
    #Let's extract consensus
    a=SummaryInfo(alignment)
    cons=a.gap_consensus(threshold=0.5, ambiguous='X')
    cons=Seq(str(cons).replace('-','X'))
    if(debug):
        print("Consensus")
        print(cons)
    hv,ss=get_hist_ss(cons,type,debug)
    return hv,ss
def getConservedDomain(self):
    """Return the longest stretch of the gap consensus free of 'X' and '-'.

    The consensus is computed with the default ``gap_consensus`` settings.
    When several stretches share the maximal length the first one is
    returned; an empty string is returned when there is no such stretch.
    """
    consensus = str(SummaryInfo(self.__alignment).gap_consensus())
    # Both 'X' and '-' end a stretch.  Splitting also yields the stretch
    # that runs up to the very end of the consensus, so it is not lost.
    stretches = consensus.replace('-', 'X').split('X')
    # split() always yields at least one item, and max() keeps the first
    # of equally long candidates.
    return max(stretches, key=len)
def output_consensus(y, threshold_value, consensus_output_dir):
    """Write the consensus of a FASTA alignment file as a FASTA record.

    y - path of the alignment file; the part of its base name before
        '_align' is used to name the output.
    threshold_value - threshold passed to ``gap_consensus``.
    consensus_output_dir - directory that receives '<name>_cons.fasta'.

    The consensus uses 'N' as the ambiguous symbol and the externally
    defined ``alphabet``.  Both files are closed even if reading or
    writing fails.
    """
    file_name = os.path.basename(y)
    fasta_name = file_name.split('_align')[0]
    with open(y) as alignment_handle:
        alignment = AlignIO.read(alignment_handle, "fasta")
    summary_align = SummaryInfo(alignment)
    consensus = summary_align.gap_consensus(threshold=threshold_value, ambiguous='N', consensus_alpha=alphabet, require_multiple=1)
    consensus_seq = SeqRecord.SeqRecord(consensus, id=fasta_name + "_consensus")
    output_file_name = str(consensus_output_dir + '/' + fasta_name + "_cons.fasta")
    with open(output_file_name, "w") as output_handle:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Writing consensus sequence for " + fasta_name)
        SeqIO.write(consensus_seq, output_handle, "fasta")
def test_nucleotides(self):
    """Check consensus, consensus alphabet, PSSM text and information content for GFF/multi.fna."""
    filename = "GFF/multi.fna"
    format = "fasta"
    alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
    summary = SummaryInfo(alignment)
    c = summary.dumb_consensus(ambiguous="N")
    self.assertEqual(str(c), 'NNNNNNNN')
    # The consensus alphabet must be a DNA alphabet other than unambiguous_dna.
    self.assertNotEqual(c.alphabet, unambiguous_dna)
    self.assertTrue(isinstance(c.alphabet, DNAAlphabet))
    c = summary.gap_consensus(ambiguous="N")
    self.assertEqual(str(c), 'NNNNNNNN')
    self.assertNotEqual(c.alphabet, unambiguous_dna)
    self.assertTrue(isinstance(c.alphabet, DNAAlphabet))
    # Uniform expected frequencies over the four bases.
    expected = FreqTable({"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}, FREQ, unambiguous_dna)
    m = summary.pos_specific_score_matrix(chars_to_ignore=['-'], axis_seq=c)
    # NOTE(review): the whitespace inside the expected literal below looks
    # collapsed in this copy -- confirm it against the real str(m) layout.
    self.assertEqual(str(m), """ A C G T N 2.0 0.0 1.0 0.0 N 1.0 1.0 1.0 0.0 N 1.0 0.0 2.0 0.0 N 0.0 1.0 1.0 1.0 N 1.0 2.0 0.0 0.0 N 0.0 2.0 1.0 0.0 N 1.0 2.0 0.0 0.0 N 0.0 2.0 1.0 0.0 """)
    # We have a generic alphabet without a declared gap character, so the
    # expected frequencies and the chars to ignore must be given explicitly.
    ic = summary.information_content(e_freq_table=expected, chars_to_ignore=['-'])
    self.assertAlmostEqual(ic, 7.32029999423075, places=6)
def aln2html(msa,filename,features=None,title=None,description=True,field1w=20,field2w=35):
    """Write a multiple sequence alignment to ``filename`` as an HTML table.

    msa - Biopython MSA.
    filename - html file to output the result.
    features - optional dictionary of features, organized as follows:
        {(begin,end):{'level':0(default),'symbol':'H','description':'desc'}}
        A feature whose span is already occupied on its level is moved to
        the next level.  Only three levels (0,1,2) are available; a feature
        that fits on none of them is left out.
    title - text placed above the table (nothing is printed when None).
    description - when true, add a column with each record's description.
    field1w, field2w - widths used to truncate and pad the id and the
        description columns.

    Every non-blank annotation level gets its own row above the sequences
    (level 2 first, level 0 last).  Records whose id contains a run of two
    or more digits are linked to an NCBI protein search for that number.
    Residue cells take a CSS class from the externally defined
    ``restypedict`` ('def' when the residue is not listed) and are marked
    'conserved' when they equal the 0.9-threshold gap consensus and are not
    a gap.
    """
    style="""
pre,td{margin: 0px;padding: 0px;border: 0px;}
.pos{color:blue;}
.neg{color:red;}
.pol{color:green;}
.hphob{color:grey;}
.def{color:black;}
.conserved{background:lightblue;}
.nonconserved{background:white;}
"""
    cons=SummaryInfo(msa).gap_consensus(threshold=0.9, ambiguous='X')
    blank=re.compile(r'^\s+$')

    #Let's work on features
    f_description=''
    rows=[]
    if features:
        annot_line=[list(' '*len(cons)) for _ in range(3)]
        for k in sorted(features,key=lambda x: x[0]):
            feature=features[k]
            if feature.get('description',0):
                f_description+='{0}-{1};'.format(feature['symbol'],feature['description'])
            start,stop=k[0],k[1]+1
            # Bounded fallback: never step past the last available level.
            for lev in range(feature.get('level',0),len(annot_line)):
                if blank.match(''.join(annot_line[lev][start:stop])):
                    annot_line[lev][start:stop]=feature['symbol']*(stop-start)
                    break
        # NOTE(review): the level 0 description cell is padded by +20 where
        # every other row uses +2; kept as found -- confirm it is intended.
        for lev,pad in ((2,2),(1,2),(0,20)):
            if blank.match(''.join(annot_line[lev])):
                continue
            row='<TR><TD><PRE>{0:<{field1w}}</PRE></TD>'.format('annotation',field1w=field1w+2)
            if description:
                row+='<TD><PRE>{0:<{field2w}}</PRE></TD>'.format('level {0}'.format(lev),field2w=field2w+pad)
            row+=''.join('<TD><PRE>{0}</PRE></TD>'.format(c) for c in annot_line[lev])
            row+='</TR>'
            # Rows accumulate, so a later level no longer replaces an earlier one.
            rows.append(row)
    f_description+='<BR><BR>'

    for s in msa:
        gi_match=re.search(r'(\d\d+)',s.id)
        if gi_match:
            line='<TR><TD><PRE><a href="http://www.ncbi.nlm.nih.gov/protein/?term={0}">{1:<{field1w}}</a></PRE></TD>'.format(gi_match.group(1),s.id[:field1w],field1w=field1w+2)
        else:
            line='<TR><TD><PRE>{0:<{field1w}}</PRE></TD>'.format(s.id[:field1w],field1w=field1w+2)
        if description:
            # Same description cell for linked and unlinked records.
            line+='<TD><PRE>{0:<{field2w}}</PRE></TD>'.format(s.description[:field2w],field2w=field2w+2)
        for i,c in enumerate(s.seq):
            line+='<TD><PRE class="{0} {1}">{2}</PRE></TD>'.format(restypedict.get(c,'def'),'conserved' if c==cons[i] and c!='-' else 'nonconserved',c)
        line+='</TR>'
        rows.append(line)
    msatext=''.join(rows)

    page="""
<!DOCTYPE html>
<HTML>
<HEAD>
<META http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<TITLE>MultipleSequenceAlignment</TITLE>
<style>
{style}
</style>
</HEAD>
<BODY style="background-color:white; color:black; a:link:blue; a:active:red; a:visited:purple">
{title}<BR><BR>
{features}
<TABLE style="border:0px; border-spacing:0px; background-color:white; color:black; a:link:blue; a:active:red; a:visited:purple;">
{msatext}
</TABLE>
</BODY>
</HTML>
""".format(
        title='' if title is None else title,
        msatext=msatext,
        style=style,
        features=f_description)
    with open(filename,'w') as out:
        out.write(page)
def getPerc(self):
    """Return the fraction of consensus positions that are neither 'X' nor '-'.

    The gap consensus (default settings) is computed once.  The result is a
    float between 0 and 1; 0.0 is returned for an empty consensus.
    """
    consensus = str(SummaryInfo(self.__alignment).gap_consensus())
    if not consensus:
        return 0.0
    masked = consensus.count("X") + consensus.count("-")
    # float() forces true division on Python 2 as well.
    return 1 - float(masked) / len(consensus)
def getConsenso(self):
    """Return the gap consensus of the stored alignment (default settings)."""
    return SummaryInfo(self.__alignment).gap_consensus()