def pwm(self, laplace=True):
    """Return the position weight matrix (PWM) for this motif.

    The result is a list with one ``FreqTable.FreqTable`` (built in
    ``FreqTable.COUNT`` mode over ``self.alphabet``) per position.

    If ``laplace`` is true (the default), every letter of every position
    starts from a pseudocount of ``self.beta * self.background[letter]``;
    otherwise it starts from ``0.0``.  Raw counts are taken from
    ``self.counts`` when ``self.has_counts`` is set, else they are tallied
    from ``self.instances`` when ``self.has_instances`` is set.  Letters
    found in instances that are not part of the alphabet are ignored.

    The matrix is cached in ``self._pwm``.  The cache is reused only when
    it was built with the same ``laplace`` setting; previously a cached
    matrix was returned regardless of the requested setting.
    """
    want_laplace = bool(laplace)
    # A cache with no recorded mode was not built by this method; trust it
    # as before rather than guessing how it was produced.
    cached_mode = getattr(self, "_pwm_laplace", None)
    if self._pwm_is_current and cached_mode in (None, want_laplace):
        return self._pwm

    # we need to compute a new pwm
    letters = self.alphabet.letters
    columns = []
    for i in range(self.length):
        if want_laplace:
            column = {
                letter: self.beta * self.background[letter]
                for letter in letters
            }
        else:
            column = dict.fromkeys(letters, 0.0)

        if self.has_counts:
            # taking the raw counts
            for letter in letters:
                column[letter] += self.counts[letter][i]
        elif self.has_instances:
            # counting the occurrences of letters in instances
            for seq in self.instances:
                letter = seq[i]
                # we need to ignore non-alphabet letters
                if letter in column:
                    column[letter] += 1

        columns.append(
            FreqTable.FreqTable(column, FreqTable.COUNT, self.alphabet))

    self._pwm = columns
    self._pwm_laplace = want_laplace
    self._pwm_is_current = True
    return self._pwm
def get_ic(path, pdb):
    """Process the Stockholm MSA stored at ``path``.

    Parameters:
        path: location of the alignment file, read with
            ``AlignIO.read(path, "stockholm")`` and also passed to
            ``get_pdb2alignmentid``.
        pdb: key looked up in the mapping returned by
            ``get_pdb2alignmentid(path)`` to obtain an alignment record id.

    Returns:
        A 3-tuple ``(consensus, info_content, aligned_pdb_seq)`` where
        ``consensus`` is the result of ``SummaryInfo.dumb_consensus()``,
        ``info_content`` is a list with the information content of each
        single column (one entry per consensus position), and
        ``aligned_pdb_seq`` is the first alignment record whose ``id``
        equals the mapped id, or ``None`` when no record matches.
    """
    alignment = AlignIO.read(path, "stockholm")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()

    pdb2alignid = get_pdb2alignmentid(path)
    alignid = pdb2alignid[pdb]
    aligned_pdb_seq = next(
        (seq for seq in alignment if seq.id == alignid), None)

    aligned_seqs = [str(a.seq) for a in alignment]
    freqs = count_freqs(aligned_seqs)
    # The expected-frequency table is the same for every column, so build
    # it once instead of once per position.
    e_freq_table = FreqTable.FreqTable(freqs, dict_type=FreqTable.FREQ)

    info_content = [
        summary_align.information_content(
            start=pos, end=pos + 1, e_freq_table=e_freq_table)
        for pos in range(len(consensus))
    ]
    return consensus, info_content, aligned_pdb_seq
def _exp_freq_table_from_obs_freq(obs_freq_mat):
    """Build a per-letter frequency table from a pair-frequency matrix (PRIVATE).

    Every letter of ``obs_freq_mat.alphabet`` starts at zero.  For each key
    of ``obs_freq_mat`` the stored value is credited in full to the letter
    when both elements of the key are equal, and split evenly between the
    two letters otherwise.  The totals are returned wrapped in a
    ``FreqTable.FreqTable`` of type ``FreqTable.FREQ``.
    """
    totals = dict.fromkeys(obs_freq_mat.alphabet.letters, 0.)
    for pair in obs_freq_mat:
        first = pair[0]
        second = pair[1]
        if first != second:
            # off-diagonal entry: each letter receives half of the value
            totals[first] += obs_freq_mat[pair] / 2.
            totals[second] += obs_freq_mat[pair] / 2.
        else:
            totals[first] += obs_freq_mat[pair]
    return FreqTable.FreqTable(totals, FreqTable.FREQ)
def _exp_freq_table_from_obs_freq(obs_freq_mat):
    """Build expected frequency table from observed frequencies (PRIVATE).

    Keys of ``obs_freq_mat`` are indexed as letter pairs.  A pair of
    identical letters contributes its whole value to that letter; a pair of
    different letters contributes half of its value to each of them.
    """
    letter_freqs = {
        letter: 0.0 for letter in obs_freq_mat.alphabet.letters
    }
    for key in obs_freq_mat:
        if key[0] == key[1]:
            letter_freqs[key[0]] += obs_freq_mat[key]
            continue
        # mixed pair: share the observed value between both letters
        for letter in (key[0], key[1]):
            letter_freqs[letter] += obs_freq_mat[key] / 2.0
    return FreqTable.FreqTable(letter_freqs, FreqTable.FREQ)
def cal_IC(path_motifInstance):
    """Return the per-column information content of a FASTA motif alignment.

    The alignment at ``path_motifInstance`` is read with ``AlignIO`` and
    summarised; for each of the first ``FILTER_LENGTH`` columns the
    information content of that single column is computed against the
    expected frequencies in ``EXPECT_FREQ`` (unambiguous DNA alphabet),
    ignoring ``'N'`` characters.  The values are returned as a list.
    """
    background = FreqTable.FreqTable(EXPECT_FREQ, FreqTable.FREQ,
                                     IUPAC.unambiguous_dna)
    msa = AlignIO.read(path_motifInstance, "fasta")
    info = AlignInfo.SummaryInfo(msa)
    return [
        info.information_content(column, column + 1,
                                 e_freq_table=background,
                                 chars_to_ignore=['N'])
        for column in range(FILTER_LENGTH)
    ]
def summary(msa, output, title):
    """Write per-column letter frequencies and entropy of an MSA to CSV.

    The FASTA alignment ``msa`` is read as protein, summarised, and passed
    to ``information_content`` together with a fixed table of expected
    amino-acid frequencies.  One CSV row is written to ``output`` per
    alignment column: the column's frequency dictionary plus an
    ``Entropy`` field.

    Parameters:
        msa: path or handle of the FASTA alignment.
        output: destination passed to ``DataFrame.to_csv``.
        title: currently unused; kept for interface compatibility.

    Expected amino-acid frequencies used (percent)::

        Ala (A) 9.10   Gln (Q) 3.79   Leu (L) 9.87   Ser (S) 6.69
        Arg (R) 5.71   Glu (E) 6.16   Lys (K) 4.99   Thr (T) 5.57
        Asn (N) 3.88   Gly (G) 7.26   Met (M) 2.38   Trp (W) 1.29
        Asp (D) 5.45   His (H) 2.19   Phe (F) 3.92   Tyr (Y) 2.93
        Cys (C) 1.21   Ile (I) 5.70   Pro (P) 4.85   Val (V) 6.88
    """
    # Amino-acid frequency table dated 23/11/2017 (original note says
    # "unit_prot"; presumably UniProt -- confirm the source).
    e_freq_dict = {
        'A': 0.091,
        'R': 0.0571,
        'N': 0.0388,
        'D': 0.0545,
        'C': 0.0121,
        'Q': 0.0379,
        'E': 0.0616,
        'G': 0.0726,
        'H': 0.0219,
        'I': 0.0570,
        'L': 0.0987,
        'K': 0.0499,
        'M': 0.0238,
        'F': 0.0392,
        'P': 0.0485,
        'S': 0.0669,
        'T': 0.0557,
        'W': 0.0129,
        'Y': 0.0293,
        'V': 0.0688
    }
    e_freq_table = FreqTable.FreqTable(e_freq_dict,
                                       FreqTable.FREQ,
                                       alphabet=Alphabet.ProteinAlphabet())

    alignment = AlignIO.read(msa, "fasta", alphabet=Alphabet.ProteinAlphabet())
    summary_align = AlignInfo.SummaryInfo(alignment)
    # Only the per-column results are written out; the total is not used.
    _total_entropy, entropy_columns, freq_dict_columns = information_content(
        summary_align, e_freq_table=e_freq_table)

    # Write the results file.  Rows are collected and concatenated once:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.0+.
    rows = []
    for i in range(len(entropy_columns.values())):
        freq_dict = freq_dict_columns[i]
        row = pandas.DataFrame([freq_dict], columns=freq_dict.keys())
        row['Entropy'] = entropy_columns[i]
        rows.append(row)

    if rows:
        df = pandas.concat(rows, ignore_index=True)
    else:
        # pandas.concat rejects an empty list; keep writing an empty frame.
        df = pandas.DataFrame()
    df.to_csv(output)
def pwm(self):
    """Return the PWM computed for the set of instances.

    The result is a list with one ``FreqTable.FreqTable`` (built in
    ``FreqTable.COUNT`` mode over ``self.alphabet``) per position of
    ``self.mask``, holding how often each alphabet letter occurs at that
    position across ``self.instances``.  The list is cached in
    ``self._pwm`` and reused while ``self._pwm_is_current`` is true.

    Raises:
        KeyError: if an instance contains a letter outside the alphabet.
    """
    if self._pwm_is_current:
        return self._pwm

    # we need to compute a new pwm
    self._pwm = []
    # range() rather than xrange() so the method also runs on Python 3
    for i in range(len(self.mask)):
        # start every alphabet letter at zero
        column = dict.fromkeys(self.alphabet.letters, 0)
        # counting the occurrences of letters in instances
        for seq in self.instances:
            column[seq[i]] += 1
        self._pwm.append(
            FreqTable.FreqTable(column, FreqTable.COUNT, self.alphabet))
    self._pwm_is_current = True
    return self._pwm
# Show the description and sequence of the first record in the alignment.
print('first description: %s' % alignment[0].description)
print('first sequence: %s' % alignment[0].seq)

# get the length of the alignment
print('length %i' % alignment.get_alignment_length())

print(alignment)

# print out interesting information about the alignment
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
print('consensus %s' % consensus)

# Position specific score matrix laid out along the consensus; 'N'
# characters are passed as characters to ignore.
my_pssm = summary_align.pos_specific_score_matrix(consensus,
                                                  chars_to_ignore=['N'])
print(my_pssm)

# Information content of columns 5 to 30 measured against these expected
# base frequencies, again ignoring 'N'.
expect_freq = {'A': .3, 'G': .2, 'T': .3, 'C': .2}
freq_table_info = FreqTable.FreqTable(expect_freq, FreqTable.FREQ,
                                      IUPAC.unambiguous_dna)
info_content = summary_align.information_content(5, 30,
                                                 chars_to_ignore=['N'],
                                                 e_freq_table=freq_table_info)
print("relative info content: %f" % info_content)
"Install NumPy if you want to use Bio.SubsMat.") try: import cPickle as pickle # Only available on Python 2 except ImportError: import pickle import sys import os from Bio import SubsMat from Bio.SubsMat import FreqTable, MatrixInfo f = sys.stdout ftab_file = os.path.join('SubsMat', 'protein_count.txt') with open(ftab_file) as handle: ftab_prot = FreqTable.read_count(handle) ctab_file = os.path.join('SubsMat', 'protein_freq.txt') with open(ctab_file) as handle: ctab_prot = FreqTable.read_freq(handle) f.write("Check differences between derived and true frequencies for each\n") f.write("letter. Differences should be very small\n") for i in ftab_prot.alphabet.letters: f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i]))) pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik') # Don't want to use text mode on Python 3, with open(pickle_file, 'rb') as handle: acc_rep_mat = pickle.load(handle) acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat) obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat) ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
positions.sort() for pos in positions: fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos], summary_info.ic_vector[pos])) if __name__ == "__main__": print "Quick test" from Bio import AlignIO from Bio.Align.Generic import Alignment filename = "../../Tests/GFF/multi.fna" format = "fasta" expected = FreqTable.FreqTable({ "A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25 }, FreqTable.FREQ, IUPAC.unambiguous_dna) alignment = AlignIO.read(open(filename), format) for record in alignment: print str(record.seq) print "=" * alignment.get_alignment_length() summary = SummaryInfo(alignment) consensus = summary.dumb_consensus(ambiguous="N") print consensus consensus = summary.gap_consensus(ambiguous="N") print consensus print print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

import cPickle
import sys
import os
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

f = sys.stdout

# Load the count-derived and the supplied frequency tables.  The files are
# opened in ``with`` blocks so the handles are closed once parsed.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)

f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Don't want to use text mode on Python 3,
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)

# Derive the observed-frequency matrix and, from it, a second frequency
# table to compare against the supplied one.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write(
    "Diff between supplied and matrix-derived frequencies, should be small\n")
ks = ftab_prot.keys()
def test_read_write_clustal(self): """Test the base alignment stuff.""" path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln") alignment = AlignIO.read(path, "clustal", alphabet=Alphabet.Gapped( IUPAC.unambiguous_dna)) self.assertEqual(len(alignment), 7) seq_record = alignment[0] self.assertEqual(seq_record.description, "gi|6273285|gb|AF191659.1|AF191") self.assertEqual( seq_record.seq, Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA" )) seq_record = alignment[1] self.assertEqual(seq_record.description, "gi|6273284|gb|AF191658.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[2] self.assertEqual(seq_record.description, "gi|6273287|gb|AF191661.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[3] self.assertEqual(seq_record.description, "gi|6273286|gb|AF191660.1|AF191") self.assertEqual( seq_record.seq, "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[4] self.assertEqual(seq_record.description, "gi|6273290|gb|AF191664.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[5] self.assertEqual(seq_record.description, "gi|6273289|gb|AF191663.1|AF191") self.assertEqual( seq_record.seq, 
"TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA" ) seq_record = alignment[6] self.assertEqual(seq_record.description, "gi|6273291|gb|AF191665.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) self.assertEqual(alignment.get_alignment_length(), 156) align_info = AlignInfo.SummaryInfo(alignment) consensus = align_info.dumb_consensus() self.assertIsInstance(consensus, Seq) self.assertEqual( consensus, "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) dictionary = align_info.replacement_dictionary(["N"]) self.assertEqual(len(dictionary), 16) self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1) self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1) self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1) self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1) self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1) self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1) self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1) self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1) self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1) self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1) self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1) self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1) self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1) matrix = align_info.pos_specific_score_matrix(consensus, ["N"]) self.assertEqual( 
str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 
0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N"]) self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 
0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) second_seq = alignment[1].seq matrix = align_info.pos_specific_score_matrix(second_seq, ["N"]) self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 
7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 - 0.0 0.0 0.0 3.0 - 3.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 
0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) value = align_info.information_content(5, 50, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 88.42, places=2) value = align_info.information_content(chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) e_freq = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25} e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ, IUPAC.unambiguous_dna) value = align_info.information_content(e_freq_table=e_freq_table, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) self.assertEqual(align_info.get_column(1), "AAAAAAA") self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2) self.assertEqual(align_info.get_column(7), "TTTATTT") self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2) handle = StringIO() AlignInfo.print_info_content(align_info, fout=handle) self.assertEqual( handle.getvalue(), """\ 0 T 2.000 1 A 2.000 2 T 2.000 3 A 2.000 4 C 2.000 5 A 2.000 6 T 2.000 7 T 1.408 8 A 2.000 9 A 2.000 10 A 2.000 11 G 2.000 12 A 1.015 13 A 2.000 14 G 2.000 15 G 2.000 16 G 2.000 17 G 2.000 18 G 2.000 19 A 2.000 20 T 2.000 21 G 2.000 22 C 2.000 23 G 2.000 24 G 2.000 25 A 2.000 26 T 2.000 27 A 2.000 28 A 2.000 29 A 2.000 30 T 2.000 31 G 2.000 32 G 2.000 33 A 2.000 34 A 2.000 35 A 2.000 36 G 2.000 37 G 2.000 38 C 2.000 39 G 
2.000 40 A 2.000 41 A 2.000 42 A 2.000 43 G 2.000 44 A 2.000 45 A 2.000 46 A 2.000 47 G 2.000 48 A 2.000 49 A 2.000 50 T 2.000 51 A 2.000 52 T 2.000 53 A 2.000 54 T 2.000 55 A 2.000 56 - 0.682 57 - 0.682 58 - 0.333 59 - 0.333 60 - -0.115 61 - -0.115 62 - -0.115 63 - -0.115 64 - -0.115 65 - -0.115 66 A 2.000 67 T 2.000 68 A 2.000 69 T 2.000 70 A 2.000 71 T 2.000 72 T 2.000 73 T 2.000 74 C 1.408 75 A 1.408 76 A 2.000 77 A 2.000 78 T 2.000 79 T 2.000 80 T 1.015 81 C 2.000 82 C 2.000 83 T 2.000 84 T 2.000 85 A 2.000 86 T 2.000 87 A 2.000 88 T 2.000 89 A 2.000 90 C 1.137 91 C 2.000 92 C 2.000 93 A 2.000 94 A 2.000 95 A 2.000 96 T 2.000 97 A 2.000 98 T 2.000 99 A 2.000 100 A 2.000 101 A 2.000 102 A 2.000 103 A 2.000 104 T 2.000 105 A 2.000 106 T 2.000 107 C 2.000 108 T 2.000 109 A 2.000 110 A 2.000 111 T 2.000 112 A 2.000 113 A 2.000 114 A 2.000 115 T 2.000 116 T 2.000 117 A 2.000 118 G 2.000 119 A 2.000 120 T 2.000 121 G 2.000 122 A 2.000 123 A 2.000 124 T 2.000 125 A 2.000 126 T 2.000 127 C 2.000 128 A 2.000 129 A 2.000 130 A 2.000 131 G 2.000 132 A 2.000 133 A 2.000 134 T 2.000 135 C 2.000 136 C 1.408 137 A 2.000 138 T 2.000 139 T 2.000 140 G 2.000 141 A 2.000 142 T 2.000 143 T 2.000 144 T 2.000 145 A 2.000 146 G 2.000 147 T 2.000 148 G 1.408 149 T 2.000 150 A 2.000 151 C 2.000 152 C 2.000 153 A 2.000 154 G 2.000 155 A 2.000 """)
print(align_info.pos_specific_score_matrix(chars_to_ignore=['N'])) print('with a selected sequence...') second_seq = alignment[1].seq print(align_info.pos_specific_score_matrix(second_seq, ['N'])) print('information content') print('part of alignment: %0.2f' % align_info.information_content(5, 50, chars_to_ignore=['N'])) print('entire alignment: %0.2f' % align_info.information_content(chars_to_ignore=['N'])) print('relative information content') e_freq = {'G': 0.25, 'C': 0.25, 'A': 0.25, 'T': 0.25} e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ, IUPAC.unambiguous_dna) print('relative information: %0.2f' % align_info.information_content( e_freq_table=e_freq_table, chars_to_ignore=['N'])) print('Column 1: %s' % align_info.get_column(1)) print('IC for column 1: %0.2f' % align_info.ic_vector[1]) print('Column 7: %s' % align_info.get_column(7)) print('IC for column 7: %0.2f' % align_info.ic_vector[7]) print('test print_info_content') AlignInfo.print_info_content(align_info) print("testing reading and writing fasta format...") to_parse = os.path.join(os.curdir, 'Quality', 'example.fasta') alignment = AlignIO.read(to_parse,
'V': 0.0676, 'L': 0.0936, 'Y': 0.0287, 'M': 0.0244, 'I': 0.0528, 'G': 0.0661, 'A': 0.0567, 'N': 0.0432, 'C': 0.0175, 'Q': 0.0345, 'F': 0.0426, 'P': 0.0487, 'X': 0 } summary_align = AlignInfo.SummaryInfo(alignment) e_freq_table = FreqTable.FreqTable(expect_freq, FreqTable.FREQ, Alphabet.IUPAC.Alphabet) pseudo_count = 0.1 chars_to_ignore = ['-'] log_base = 2 if types_choice == "t": new_expect_freq = {} for j in types_dict.keys(): new_expect_freq[j] = 0 for i in expect_freq.keys(): for j in types_dict.keys(): if i in types_dict[j]: new_expect_freq[j] += expect_freq[i] e_freq_table = FreqTable.FreqTable(new_expect_freq, FreqTable.FREQ,
import cPickle
import sys
import os
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

f = sys.stdout

# Load the count-derived and the supplied frequency tables.  The files are
# opened in ``with`` blocks so the handles are closed once parsed.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)

f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickles are binary data: read them in binary mode so no newline
# translation is applied, and close the handle afterwards.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)

# Derive the observed-frequency matrix and, from it, a second frequency
# table to compare against the supplied one.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f,format=" %4.3f")

f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() yields a sorted list whether keys() returns a list or a view.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i,abs(ftab_prot[i] - ftab_prot2[i])))
s = 0.
del corrcoef except ImportError: from Bio import MissingExternalDependencyError raise MissingExternalDependencyError( "Install NumPy if you want to use Bio.SubsMat.") import cPickle import sys import os from Bio import SubsMat from Bio.SubsMat import FreqTable, MatrixInfo f = sys.stdout ftab_file = os.path.join('SubsMat', 'protein_count.txt') with open(ftab_file) as handle: ftab_prot = FreqTable.read_count(handle) ctab_file = os.path.join('SubsMat', 'protein_freq.txt') with open(ctab_file) as handle: ctab_prot = FreqTable.read_freq(handle) f.write("Check differences between derived and true frequencies for each\n") f.write("letter. Differences should be very small\n") for i in ftab_prot.alphabet.letters: f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i]))) pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik') #Don't want to use text mode on Python 3, with open(pickle_file, 'rb') as handle: acc_rep_mat = cPickle.load(handle) acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat) obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat) ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
from Bio.Alphabet import IUPAC
from Bio.SubsMat import FreqTable

# Letters of each supported alphabet.
dna_letters = IUPAC.unambiguous_dna.letters
rna_letters = IUPAC.unambiguous_rna.letters
protein_letters = IUPAC.protein.letters

# "Naive" background: every letter of an alphabet gets the same fixed
# frequency (0.25 per nucleotide, 0.05 per amino acid).
dna_naive_freq = dict.fromkeys(dna_letters, 0.25)
rna_naive_freq = dict.fromkeys(rna_letters, 0.25)
aa_naive_freq = dict.fromkeys(protein_letters, 0.05)

# The same backgrounds wrapped as frequency-type FreqTable objects.
dna_naive_freq_table = FreqTable.FreqTable(
    dna_naive_freq, FreqTable.FREQ, IUPAC.unambiguous_dna)
rna_naive_freq_table = FreqTable.FreqTable(
    rna_naive_freq, FreqTable.FREQ, IUPAC.unambiguous_rna)
aa_naive_freq_table = FreqTable.FreqTable(
    aa_naive_freq, FreqTable.FREQ, IUPAC.protein)

# Lookup of the naive tables by sequence-type key.
naive_freq_tables = {
    'aa': aa_naive_freq_table,
    'dna': dna_naive_freq_table,
    'rna': rna_naive_freq_table,
}