Ejemplo n.º 1
0
    def pwm(self, laplace=True):
        """
        Return the position weight matrix as a list of FreqTable objects.

        One FreqTable (COUNT mode, over self.alphabet) is built for each of
        the self.length positions.  When laplace is true (the default) every
        letter starts from a pseudocount of self.beta times its
        self.background value; otherwise every letter starts from 0.0.
        Raw values from self.counts are added when self.has_counts is set;
        failing that, letters are tallied from self.instances when
        self.has_instances is set.

        The result is cached in self._pwm: while self._pwm_is_current is
        true the cached list is returned as-is, whatever laplace is passed.
        """
        # Cached matrix is still valid: hand it back untouched.
        if self._pwm_is_current:
            return self._pwm
        # Otherwise rebuild the matrix one column at a time.
        self._pwm = []
        for column in range(self.length):
            tally = {}
            # Seed every alphabet letter with its pseudocount (or zero).
            for symbol in self.alphabet.letters:
                tally[symbol] = (self.beta*self.background[symbol]
                                 if laplace else 0.0)
            if self.has_counts:
                # Raw counts are available: add them directly.
                for symbol in self.alphabet.letters:
                    tally[symbol] += self.counts[symbol][column]
            elif self.has_instances:
                # No raw counts: count letter occurrences in the instances.
                for instance in self.instances:
                    try:
                        tally[instance[column]] += 1
                    except KeyError:
                        # Letters outside the alphabet are ignored.
                        continue
            table = FreqTable.FreqTable(tally, FreqTable.COUNT, self.alphabet)
            self._pwm.append(table)
        self._pwm_is_current=1
        return self._pwm
Ejemplo n.º 2
0
def get_ic(path, pdb):
    """
    Process the Stockholm MSA stored at ``path``.

    Returns a 3-tuple ``(consensus, info_content, aligned_pdb_seq)``:

    * ``consensus`` - result of ``SummaryInfo.dumb_consensus()``;
    * ``info_content`` - list with one information-content value per
      consensus position, computed against the frequencies returned by
      ``count_freqs`` for all aligned sequences;
    * ``aligned_pdb_seq`` - the first alignment record whose id equals the
      alignment id mapped from ``pdb``, or None if no record matches.
    """
    alignment = AlignIO.read(path, "stockholm")
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = summary_align.dumb_consensus()

    # Translate the PDB identifier into the id used inside the alignment.
    alignid = get_pdb2alignmentid(path)[pdb]

    # First record carrying that id (None when it is absent).
    aligned_pdb_seq = next(
        (record for record in alignment if record.id == alignid), None)

    # Background frequencies estimated from the aligned sequences themselves.
    freqs = count_freqs([str(record.seq) for record in alignment])

    # One single-column information-content value per consensus position.
    info_content = [
        summary_align.information_content(
            start=column,
            end=column + 1,
            e_freq_table=FreqTable.FreqTable(freqs,
                                             dict_type=FreqTable.FREQ))
        for column in range(len(consensus))
    ]

    return consensus, info_content, aligned_pdb_seq
Ejemplo n.º 3
0
def _exp_freq_table_from_obs_freq(obs_freq_mat):
    """Derive a per-letter frequency table from a pair-frequency matrix.

    Every letter of ``obs_freq_mat.alphabet`` starts at 0.  For each key of
    the matrix, a key whose two letters are equal contributes its whole
    value to that letter; otherwise each of the two letters receives half.
    The totals are returned wrapped in a FreqTable built in FREQ mode.
    """
    totals = dict.fromkeys(obs_freq_mat.alphabet.letters, 0.)
    for pair in obs_freq_mat:
        left, right = pair[0], pair[1]
        if left == right:
            totals[left] += obs_freq_mat[pair]
            continue
        # Mixed pair: split the observed value evenly between both letters.
        share = obs_freq_mat[pair] / 2.
        totals[left] += share
        totals[right] += share
    return FreqTable.FreqTable(totals, FreqTable.FREQ)
Ejemplo n.º 4
0
def _exp_freq_table_from_obs_freq(obs_freq_mat):
    """Compute the expected frequency table from observed pair frequencies (PRIVATE).

    Each key of ``obs_freq_mat`` is indexed as a pair of letters.  A pair of
    identical letters adds its full value to that letter; a pair of different
    letters adds half of its value to each.  Returns a FREQ-mode FreqTable.
    """
    expected = {letter: 0.0 for letter in obs_freq_mat.alphabet.letters}
    for key in obs_freq_mat:
        first = key[0]
        second = key[1]
        observed = obs_freq_mat[key]
        if first != second:
            # Different letters: each one gets half of the observed value.
            expected[first] += observed / 2.0
            expected[second] += observed / 2.0
        else:
            expected[first] += observed
    return FreqTable.FreqTable(expected, FreqTable.FREQ)
Ejemplo n.º 5
0
def cal_IC(path_motifInstance):
    """Return per-column information content for a FASTA motif alignment.

    The alignment at ``path_motifInstance`` is read in FASTA format and the
    information content of each of the first FILTER_LENGTH columns is
    computed one column at a time, relative to EXPECT_FREQ and ignoring 'N'.
    The values are returned as a list in column order.
    """
    background = FreqTable.FreqTable(EXPECT_FREQ, FreqTable.FREQ,
                                     IUPAC.unambiguous_dna)
    motif_summary = AlignInfo.SummaryInfo(
        AlignIO.read(path_motifInstance, "fasta"))

    # Single-column windows [column, column + 1) across the filter length.
    return [
        motif_summary.information_content(column,
                                          column + 1,
                                          e_freq_table=background,
                                          chars_to_ignore=['N'])
        for column in range(FILTER_LENGTH)
    ]
Ejemplo n.º 6
0
def summary(msa, output, title):
    '''
    Write per-column letter frequencies and entropy of a protein MSA to CSV.

    The FASTA alignment ``msa`` is summarised with AlignInfo.SummaryInfo and
    handed to ``information_content`` together with the fixed background
    table below.  One CSV row is written per alignment column: the column's
    frequency dictionary plus an 'Entropy' value.

    msa    -- alignment source passed to AlignIO.read (FASTA format)
    output -- destination passed to DataFrame.to_csv
    title  -- currently unused; kept so existing callers keep working

    Background amino-acid frequencies, in percent:

       Ala (A) 9.10   Gln (Q) 3.79   Leu (L) 9.87   Ser (S) 6.69
       Arg (R) 5.71   Glu (E) 6.16   Lys (K) 4.99   Thr (T) 5.57
       Asn (N) 3.88   Gly (G) 7.26   Met (M) 2.38   Trp (W) 1.29
       Asp (D) 5.45   His (H) 2.19   Phe (F) 3.92   Tyr (Y) 2.93
       Cys (C) 1.21   Ile (I) 5.70   Pro (P) 4.85   Val (V) 6.88
    '''
    # Amino-acid frequency table; the original note dates it 23/11/2017 and
    # labels its source "unit_prot" (presumably UniProt - verify).
    e_freq_dict = {
        'A': 0.091,
        'R': 0.0571,
        'N': 0.0388,
        'D': 0.0545,
        'C': 0.0121,
        'Q': 0.0379,
        'E': 0.0616,
        'G': 0.0726,
        'H': 0.0219,
        'I': 0.0570,
        'L': 0.0987,
        'K': 0.0499,
        'M': 0.0238,
        'F': 0.0392,
        'P': 0.0485,
        'S': 0.0669,
        'T': 0.0557,
        'W': 0.0129,
        'Y': 0.0293,
        'V': 0.0688
    }
    e_freq_table = FreqTable.FreqTable(e_freq_dict,
                                       FreqTable.FREQ,
                                       alphabet=Alphabet.ProteinAlphabet())
    alignment = AlignIO.read(msa, "fasta", alphabet=Alphabet.ProteinAlphabet())
    summary_align = AlignInfo.SummaryInfo(alignment)
    total_entropy, entropy_columns, freq_dict_columns = information_content(
        summary_align, e_freq_table=e_freq_table)

    # Build one single-row frame per alignment column and join them once at
    # the end: DataFrame.append was removed in pandas 2.0 and re-copied the
    # whole frame on every iteration.
    rows = []
    for i in range(len(entropy_columns.values())):
        freq_dict = freq_dict_columns[i]
        row = pandas.DataFrame([freq_dict], columns=list(freq_dict.keys()))
        row['Entropy'] = entropy_columns[i]
        rows.append(row)

    # pandas.concat rejects an empty list, so keep the empty-frame result
    # the original produced when there are no columns.
    if rows:
        df = pandas.concat(rows, ignore_index=True)
    else:
        df = pandas.DataFrame()
    df.to_csv(output)
Ejemplo n.º 7
0
    def pwm(self):
        """
        Return the position weight matrix computed for the set of instances.

        The matrix is a list with one FreqTable (COUNT mode, over
        self.alphabet) per position of self.mask; each table counts how often
        every alphabet letter occurs at that position across self.instances.
        A letter that is not in the alphabet raises KeyError.

        The result is cached in self._pwm and returned directly while
        self._pwm_is_current is true.
        """
        if self._pwm_is_current:
            return self._pwm
        # The cache is stale, so rebuild the matrix column by column.
        self._pwm = []
        # range() rather than the Python 2-only xrange(), so the method also
        # runs on Python 3.
        for i in range(len(self.mask)):
            # Start every alphabet letter at zero.
            counts = dict.fromkeys(self.alphabet.letters, 0)
            # Count the occurrences of letters in the instances.
            for seq in self.instances:
                counts[seq[i]] += 1
            self._pwm.append(
                FreqTable.FreqTable(counts, FreqTable.COUNT, self.alphabet))
        self._pwm_is_current = True
        return self._pwm
Ejemplo n.º 8
0
# Report on an alignment object; `alignment` is created before this excerpt.
# Show the description and sequence of its first record.
print('first description: %s' % alignment[0].description)
print('first sequence: %s' % alignment[0].seq)

# get the length of the alignment
print('length %i' % alignment.get_alignment_length())

print(alignment)

# print out interesting information about the alignment
summary_align = AlignInfo.SummaryInfo(alignment)

# Consensus computed with dumb_consensus() and its default arguments.
consensus = summary_align.dumb_consensus()
print('consensus %s' % consensus)

# Position specific score matrix along the consensus; 'N' is passed as a
# character to ignore.
my_pssm = summary_align.pos_specific_score_matrix(consensus,
                                                  chars_to_ignore=['N'])

print(my_pssm)

# Expected nucleotide frequencies for the relative information content.
expect_freq = {'A': .3, 'G': .2, 'T': .3, 'C': .2}

freq_table_info = FreqTable.FreqTable(expect_freq, FreqTable.FREQ,
                                      IUPAC.unambiguous_dna)

# Information content for the (5, 30) column range, measured against the
# expected frequencies above and again ignoring 'N'.
info_content = summary_align.information_content(5,
                                                 30,
                                                 chars_to_ignore=['N'],
                                                 e_freq_table=freq_table_info)

print("relative info content: %f" % info_content)
Ejemplo n.º 9
0
        "Install NumPy if you want to use Bio.SubsMat.")

try:
    import cPickle as pickle  # Only available on Python 2
except ImportError:
    import pickle

import sys
import os
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output of this script goes to standard output.
f = sys.stdout
# Load one table from a counts file and one from a frequencies file.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
# Print the absolute per-letter difference between the two tables.
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Don't want to use text mode on Python 3,
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = pickle.load(handle)
# Wrap the unpickled data, then derive the observed frequency matrix and,
# from it, an expected frequency table.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
Ejemplo n.º 10
0
    positions.sort()
    for pos in positions:
        fout.write("%d %s %.3f\n" %
                   (pos, rep_sequence[pos], summary_info.ic_vector[pos]))


if __name__ == "__main__":
    print "Quick test"
    from Bio import AlignIO
    from Bio.Align.Generic import Alignment

    filename = "../../Tests/GFF/multi.fna"
    format = "fasta"
    expected = FreqTable.FreqTable({
        "A": 0.25,
        "G": 0.25,
        "T": 0.25,
        "C": 0.25
    }, FreqTable.FREQ, IUPAC.unambiguous_dna)

    alignment = AlignIO.read(open(filename), format)
    for record in alignment:
        print str(record.seq)
    print "=" * alignment.get_alignment_length()

    summary = SummaryInfo(alignment)
    consensus = summary.dumb_consensus(ambiguous="N")
    print consensus
    consensus = summary.gap_consensus(ambiguous="N")
    print consensus
    print
    print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
Ejemplo n.º 11
0
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

import cPickle
import sys
import os
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output of this script goes to standard output.
f = sys.stdout
# Load one table from a counts file and one from a frequencies file.  The
# files are opened in `with` blocks so the handles are closed right after
# reading instead of being left to the garbage collector.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
# Print the absolute per-letter difference between the two tables.
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Don't want to use text mode on Python 3, so the pickle is read in binary.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
# Wrap the unpickled data, then derive the observed frequency matrix and,
# from it, an expected frequency table.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write(
    "Diff between supplied and matrix-derived frequencies, should be small\n")
ks = ftab_prot.keys()
Ejemplo n.º 12
0
    def test_read_write_clustal(self):
        """Read the opuntia Clustal alignment and check its AlignInfo summaries.

        Covers the seven parsed records, the dumb consensus, the replacement
        dictionary, three position specific score matrices, the information
        content values and the output of print_info_content.
        """
        path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln")
        alignment = AlignIO.read(path,
                                 "clustal",
                                 alphabet=Alphabet.Gapped(
                                     IUPAC.unambiguous_dna))
        # Each of the seven records must carry the expected description and
        # aligned sequence.
        self.assertEqual(len(alignment), 7)
        seq_record = alignment[0]
        self.assertEqual(seq_record.description,
                         "gi|6273285|gb|AF191659.1|AF191")
        self.assertEqual(
            seq_record.seq,
            Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA"
                ))
        seq_record = alignment[1]
        self.assertEqual(seq_record.description,
                         "gi|6273284|gb|AF191658.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[2]
        self.assertEqual(seq_record.description,
                         "gi|6273287|gb|AF191661.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[3]
        self.assertEqual(seq_record.description,
                         "gi|6273286|gb|AF191660.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[4]
        self.assertEqual(seq_record.description,
                         "gi|6273290|gb|AF191664.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        seq_record = alignment[5]
        self.assertEqual(seq_record.description,
                         "gi|6273289|gb|AF191663.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA"
        )
        seq_record = alignment[6]
        self.assertEqual(seq_record.description,
                         "gi|6273291|gb|AF191665.1|AF191")
        self.assertEqual(
            seq_record.seq,
            "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        self.assertEqual(alignment.get_alignment_length(), 156)
        # Summary statistics: start with the dumb consensus.
        align_info = AlignInfo.SummaryInfo(alignment)
        consensus = align_info.dumb_consensus()
        self.assertIsInstance(consensus, Seq)
        self.assertEqual(
            consensus,
            "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA"
        )
        # Replacement dictionary, requested with "N" as the only skipped
        # character: 16 entries, one per ordered pair of A, C, G and T.
        dictionary = align_info.replacement_dictionary(["N"])
        self.assertEqual(len(dictionary), 16)
        self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1)
        self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1)
        self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1)
        self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1)
        self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1)
        self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1)
        self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1)
        self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1)
        self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1)
        self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1)
        # PSSM with the consensus passed explicitly as the axis sequence.
        matrix = align_info.pos_specific_score_matrix(consensus, ["N"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
X  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
T  0.0 0.0 0.0 3.0
A  3.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
X  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")

        # Same call without an axis sequence; the expected text is identical
        # to the matrix above.
        matrix = align_info.pos_specific_score_matrix(chars_to_ignore=["N"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
X  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
T  0.0 0.0 0.0 3.0
A  3.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
T  0.0 0.0 0.0 1.0
A  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
X  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")

        # PSSM labelled with the second record's sequence instead of the
        # consensus, so the row labels (including gaps) follow that record.
        second_seq = alignment[1].seq
        matrix = align_info.pos_specific_score_matrix(second_seq, ["N"])
        self.assertEqual(
            str(matrix), """\
    A   C   G   T
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  1.0 0.0 0.0 6.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  4.0 0.0 3.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
G  0.0 0.0 7.0 0.0
C  0.0 7.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 4.0
A  4.0 0.0 0.0 0.0
-  0.0 0.0 0.0 3.0
-  3.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
-  0.0 0.0 0.0 1.0
-  1.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
C  1.0 6.0 0.0 0.0
A  6.0 0.0 0.0 1.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 3.0 0.0 4.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 2.0 0.0 5.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
C  0.0 7.0 0.0 0.0
T  0.0 1.0 0.0 6.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
T  0.0 0.0 0.0 7.0
G  1.0 0.0 6.0 0.0
T  0.0 0.0 0.0 7.0
A  7.0 0.0 0.0 0.0
C  0.0 7.0 0.0 0.0
C  0.0 7.0 0.0 0.0
A  7.0 0.0 0.0 0.0
G  0.0 0.0 7.0 0.0
A  7.0 0.0 0.0 0.0
""")
        # Information content: a sub-range, the whole alignment, and the
        # whole alignment against an explicit uniform expected-frequency
        # table (asserted to give the same 287.55).
        value = align_info.information_content(5, 50, chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 88.42, places=2)
        value = align_info.information_content(chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 287.55, places=2)
        e_freq = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25}
        e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ,
                                           IUPAC.unambiguous_dna)
        value = align_info.information_content(e_freq_table=e_freq_table,
                                               chars_to_ignore=["N"])
        self.assertAlmostEqual(value, 287.55, places=2)
        # Spot-check two columns and their entries in ic_vector.
        self.assertEqual(align_info.get_column(1), "AAAAAAA")
        self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2)
        self.assertEqual(align_info.get_column(7), "TTTATTT")
        self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2)
        # print_info_content output is captured in a StringIO and compared
        # line by line ("position letter value").
        handle = StringIO()
        AlignInfo.print_info_content(align_info, fout=handle)
        self.assertEqual(
            handle.getvalue(), """\
0 T 2.000
1 A 2.000
2 T 2.000
3 A 2.000
4 C 2.000
5 A 2.000
6 T 2.000
7 T 1.408
8 A 2.000
9 A 2.000
10 A 2.000
11 G 2.000
12 A 1.015
13 A 2.000
14 G 2.000
15 G 2.000
16 G 2.000
17 G 2.000
18 G 2.000
19 A 2.000
20 T 2.000
21 G 2.000
22 C 2.000
23 G 2.000
24 G 2.000
25 A 2.000
26 T 2.000
27 A 2.000
28 A 2.000
29 A 2.000
30 T 2.000
31 G 2.000
32 G 2.000
33 A 2.000
34 A 2.000
35 A 2.000
36 G 2.000
37 G 2.000
38 C 2.000
39 G 2.000
40 A 2.000
41 A 2.000
42 A 2.000
43 G 2.000
44 A 2.000
45 A 2.000
46 A 2.000
47 G 2.000
48 A 2.000
49 A 2.000
50 T 2.000
51 A 2.000
52 T 2.000
53 A 2.000
54 T 2.000
55 A 2.000
56 - 0.682
57 - 0.682
58 - 0.333
59 - 0.333
60 - -0.115
61 - -0.115
62 - -0.115
63 - -0.115
64 - -0.115
65 - -0.115
66 A 2.000
67 T 2.000
68 A 2.000
69 T 2.000
70 A 2.000
71 T 2.000
72 T 2.000
73 T 2.000
74 C 1.408
75 A 1.408
76 A 2.000
77 A 2.000
78 T 2.000
79 T 2.000
80 T 1.015
81 C 2.000
82 C 2.000
83 T 2.000
84 T 2.000
85 A 2.000
86 T 2.000
87 A 2.000
88 T 2.000
89 A 2.000
90 C 1.137
91 C 2.000
92 C 2.000
93 A 2.000
94 A 2.000
95 A 2.000
96 T 2.000
97 A 2.000
98 T 2.000
99 A 2.000
100 A 2.000
101 A 2.000
102 A 2.000
103 A 2.000
104 T 2.000
105 A 2.000
106 T 2.000
107 C 2.000
108 T 2.000
109 A 2.000
110 A 2.000
111 T 2.000
112 A 2.000
113 A 2.000
114 A 2.000
115 T 2.000
116 T 2.000
117 A 2.000
118 G 2.000
119 A 2.000
120 T 2.000
121 G 2.000
122 A 2.000
123 A 2.000
124 T 2.000
125 A 2.000
126 T 2.000
127 C 2.000
128 A 2.000
129 A 2.000
130 A 2.000
131 G 2.000
132 A 2.000
133 A 2.000
134 T 2.000
135 C 2.000
136 C 1.408
137 A 2.000
138 T 2.000
139 T 2.000
140 G 2.000
141 A 2.000
142 T 2.000
143 T 2.000
144 T 2.000
145 A 2.000
146 G 2.000
147 T 2.000
148 G 1.408
149 T 2.000
150 A 2.000
151 C 2.000
152 C 2.000
153 A 2.000
154 G 2.000
155 A 2.000
""")
Ejemplo n.º 13
0
# `align_info` and `alignment` are created before this excerpt.
# Position specific score matrix without an explicit axis sequence.
print(align_info.pos_specific_score_matrix(chars_to_ignore=['N']))

# Same matrix, labelled with the sequence of the second record.
print('with a selected sequence...')
second_seq = alignment[1].seq
print(align_info.pos_specific_score_matrix(second_seq, ['N']))

# Information content for the (5, 50) range and for the whole alignment.
print('information content')
print('part of alignment: %0.2f' %
      align_info.information_content(5, 50, chars_to_ignore=['N']))
print('entire alignment: %0.2f' %
      align_info.information_content(chars_to_ignore=['N']))

# Information content relative to a uniform expected-frequency table.
print('relative information content')
e_freq = {'G': 0.25, 'C': 0.25, 'A': 0.25, 'T': 0.25}

e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ,
                                   IUPAC.unambiguous_dna)

print('relative information: %0.2f' % align_info.information_content(
    e_freq_table=e_freq_table, chars_to_ignore=['N']))

# Show two individual columns with their entries in ic_vector.
print('Column 1: %s' % align_info.get_column(1))
print('IC for column 1: %0.2f' % align_info.ic_vector[1])
print('Column 7: %s' % align_info.get_column(7))
print('IC for column 7: %0.2f' % align_info.ic_vector[7])
print('test print_info_content')
AlignInfo.print_info_content(align_info)
print("testing reading and writing fasta format...")

# Path of the FASTA example used by the next part of the script.
to_parse = os.path.join(os.curdir, 'Quality', 'example.fasta')

alignment = AlignIO.read(to_parse,
Ejemplo n.º 14
0
    'V': 0.0676,
    'L': 0.0936,
    'Y': 0.0287,
    'M': 0.0244,
    'I': 0.0528,
    'G': 0.0661,
    'A': 0.0567,
    'N': 0.0432,
    'C': 0.0175,
    'Q': 0.0345,
    'F': 0.0426,
    'P': 0.0487,
    'X': 0
}
# Summary object for `alignment`, which is created before this excerpt.
summary_align = AlignInfo.SummaryInfo(alignment)
# Expected-frequency table built from the `expect_freq` dictionary.
# NOTE(review): the third argument is the `Alphabet` attribute of
# `Alphabet.IUPAC`, not a concrete alphabet such as `IUPAC.protein` -
# confirm this is the intended alphabet object.
e_freq_table = FreqTable.FreqTable(expect_freq, FreqTable.FREQ,
                                   Alphabet.IUPAC.Alphabet)
# Settings defined here for later use; their consumers are outside this
# excerpt.
pseudo_count = 0.1
chars_to_ignore = ['-']
log_base = 2

if types_choice == "t":
    new_expect_freq = {}
    for j in types_dict.keys():
        new_expect_freq[j] = 0

    for i in expect_freq.keys():
        for j in types_dict.keys():
            if i in types_dict[j]:
                new_expect_freq[j] += expect_freq[i]

    e_freq_table = FreqTable.FreqTable(new_expect_freq, FreqTable.FREQ,
Ejemplo n.º 15
0
import cPickle
import sys
import os
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output of this script goes to standard output.
f = sys.stdout
# Load one table from a counts file and one from a frequencies file.  The
# files are opened in `with` blocks so the handles are closed right after
# reading instead of being left to the garbage collector.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
# Print the absolute per-letter difference between the two tables.
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickle data is binary: text mode can corrupt it on platforms that
# translate line endings and fails outright on Python 3.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
# Wrap the unpickled data, then derive the observed frequency matrix and,
# from it, an expected frequency table.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f,format=" %4.3f")


f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() gives the same ordered list as keys() followed by .sort(), and
# also works where keys() returns a view without a sort() method.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i,abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
Ejemplo n.º 16
0
    del corrcoef
except ImportError:
    from Bio import MissingExternalDependencyError
    raise MissingExternalDependencyError(
        "Install NumPy if you want to use Bio.SubsMat.")

import cPickle
import sys
import os
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output of this script goes to standard output.
f = sys.stdout
# Load one table from a counts file and one from a frequencies file.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
# Print the absolute per-letter difference between the two tables.
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Don't want to use text mode on Python 3, so the pickle is read in binary.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
# Wrap the unpickled data, then derive the observed frequency matrix and,
# from it, an expected frequency table.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
Ejemplo n.º 17
0
from Bio.Alphabet import IUPAC
from Bio.SubsMat import FreqTable

# "Naive" background tables: every letter of an alphabet is given the same
# fixed frequency (0.25 for the nucleotide alphabets, 0.05 for protein).

# DNA
dna_letters = IUPAC.unambiguous_dna.letters
dna_naive_freq = dict.fromkeys(dna_letters, 0.25)
dna_naive_freq_table = FreqTable.FreqTable(
    dna_naive_freq,
    FreqTable.FREQ,
    IUPAC.unambiguous_dna,
)

# RNA
rna_letters = IUPAC.unambiguous_rna.letters
rna_naive_freq = dict.fromkeys(rna_letters, 0.25)
rna_naive_freq_table = FreqTable.FreqTable(
    rna_naive_freq,
    FreqTable.FREQ,
    IUPAC.unambiguous_rna,
)

# Protein
protein_letters = IUPAC.protein.letters
aa_naive_freq = dict.fromkeys(protein_letters, 0.05)
aa_naive_freq_table = FreqTable.FreqTable(
    aa_naive_freq,
    FreqTable.FREQ,
    IUPAC.protein,
)

# Lookup of the three tables by sequence-type name.
naive_freq_tables = {
    'aa': aa_naive_freq_table,
    'dna': dna_naive_freq_table,
    'rna': rna_naive_freq_table,
}