def substitution_matrices(self):
    """Build log-odds substitution matrices for the kinase and peptide MSAs.

    Each of ``self.kinase_msa`` and ``self.peptide_msa`` is read with
    ``AlignIO.read(..., "tab")``, summarised, and turned into a log-odds
    matrix derived from the alignment's replacement dictionary.

    Returns:
        A 2-tuple ``(kinase_matrix, peptide_matrix)``.
    """
    def _log_odds_from(msa_source):
        # alignment -> replacement counts -> accepted replacement matrix
        # -> log-odds matrix
        alignment = AlignIO.read(msa_source, "tab")
        replacements = AlignInfo.SummaryInfo(alignment).replacement_dictionary()
        accepted = SubsMat.SeqMat(replacements)
        return SubsMat.make_log_odds_matrix(accepted)

    # Resolve both attributes before any alignment is parsed.
    kinase_source, peptide_source = self.kinase_msa, self.peptide_msa
    kinase_matrix = _log_odds_from(kinase_source)
    peptide_matrix = _log_odds_from(peptide_source)
    return kinase_matrix, peptide_matrix
def create_matriz_Sustitucion(summary_align):
    """Build, print and return a log-odds substitution matrix.

    The replacement dictionary of ``summary_align`` gives the accepted
    number of replacements, i.e. how often the different residues are
    expected to substitute for one another.

    Args:
        summary_align: Object providing ``replacement_dictionary()``.

    Returns:
        The log-odds matrix, after it has been printed in full.
    """
    # SeqMat() takes the replacement dictionary and yields the Accepted
    # Replacement Matrix (ARM); the log-odds matrix is derived from it.
    accepted_replacements = SubsMat.SeqMat(
        summary_align.replacement_dictionary()
    )
    log_odds = SubsMat.make_log_odds_matrix(accepted_replacements)
    log_odds.print_full_mat()
    return log_odds
Example #3
0
def createScoreMatrixFromAlignment(filename, output, print_=False):
    """Derive a log-odds score matrix from a FASTA alignment and pickle it.

    Args:
        filename: Alignment to read with ``AlignIO.read(..., "fasta")``.
        output: Path of the file the pickled matrix is written to.
        print_: Unused; kept so existing callers keep working.

    Returns:
        The log-odds matrix that was written to ``output``.
    """
    c_align = AlignIO.read(filename, "fasta")
    summary_align = AlignInfo.SummaryInfo(c_align)
    # "*" is handed to replacement_dictionary() as the characters to skip.
    replace_info = summary_align.replacement_dictionary(["*"])
    my_arm = SubsMat.SeqMat(replace_info)

    # Add a pseudocount of one to every entry (presumably so that no
    # count is zero when the log-odds are computed).
    for pair in my_arm:
        my_arm[pair] += 1
    my_lom = SubsMat.make_log_odds_matrix(my_arm)

    # Context manager guarantees the output file is flushed and closed,
    # even if pickling fails part-way through.
    with open(output, "wb") as handle:
        pickle.dump(my_lom, handle)
    return my_lom
Example #4
0
def get_matrix(seq_ls, scale, weight_type):
    """Build a log-odds matrix from a list of aligned sequences.

    Args:
        seq_ls: Sequence of aligned sequences (must support ``len``).
        scale: Passed to ``SubsMat.make_log_odds_matrix`` as ``factor``.
        weight_type: The string ``'None'`` (every sequence weighted 1.0)
            or ``'Henikoff'`` (weights from ``henikoff_weights``).

    Returns:
        A 3-tuple ``(matrix, matrix_type, 0)`` where ``matrix`` is the
        log-odds matrix and ``matrix_type`` is the placeholder ``'?'``.

    Raises:
        ValueError: If ``weight_type`` is not one of the supported values.
    """
    # Validate first: an unknown scheme used to leave `weights` unbound and
    # surface later as an UnboundLocalError.
    if weight_type not in ('None', 'Henikoff'):
        raise ValueError(
            "Unknown weight_type %r (expected 'None' or 'Henikoff')"
            % (weight_type,))

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    bcounts = freq_counts(seq_ls, alphabet)
    if weight_type == 'None':
        weights = [1.0] * len(seq_ls)
    else:
        weights = henikoff_weights(seq_ls, alphabet, bcounts)

    align = Generic.Alignment(IUPAC.protein)
    for i, seq in enumerate(seq_ls):
        align.add_sequence("Seq #%d" % i, seq, weight=weights[i])
    summary_align = AlignInfo.SummaryInfo(align)

    # Must get expected frequencies from our own (i.e. whole
    # database) background frequencies. Otherwise they would be
    # derived from the alignment which wouldn't be good if we
    # have a small sample
    ftab = FreqTable(bg_dict, FREQ)
    arm = SubsMat.SeqMat(summary_align.replacement_dictionary())
    lom = SubsMat.make_log_odds_matrix(arm, ftab, factor=scale,
                                       round_digit=0, keep_nd=0)

    matrix_type = '?'  # placeholder (original note: SCORE)
    return lom, matrix_type, 0
Example #5
0
# Frequency table derived from raw letter counts.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
# Frequency table stored directly as frequencies.
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickles are binary data: do not open in text mode (matters on Python 3).
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
# Rebuild the observed-frequency matrix and the expected-frequency table
# from the accepted replacements matrix.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write(
    "Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() returns a list on both Python 2 and 3; calling .sort() on the
# result of keys() fails on Python 3, where keys() is a view.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
counts = obs_freq_mat.sum()
# NOTE(review): if the code following this line sorts `keys` in place, it
# has the same Python 3 problem as above -- confirm.
keys = counts.keys()
Example #6
0
"""
# standard library
import sys

# Biopython
from Bio import SubsMat
from Bio import Clustalw
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo

# get an alignment object from a Clustalw alignment output
c_align = Clustalw.parse_file('protein.aln', IUPAC.protein)
summary_align = AlignInfo.SummaryInfo(c_align)

# get a replacement dictionary and accepted replacement matrix
# exclude all amino acids that aren't charged polar
# (the list passed below names the residues to leave out)
replace_info = summary_align.replacement_dictionary(["G", "A", "V", "L", "I",
                                                     "M", "P", "F", "W", "S",
                                                     "T", "N", "Q", "Y", "C"])

# wrap the replacement dictionary in a SeqMat
my_arm = SubsMat.SeqMat(replace_info)

# show the raw replacement dictionary
print replace_info

# derive the log-odds matrix from the SeqMat built above
my_lom = SubsMat.make_log_odds_matrix(my_arm)

print 'log_odds_mat:', my_lom

# print the log-odds matrix using the matrix's own print_mat() method
my_lom.print_mat()

Example #7
0
#!/usr/bin/env python

import os, re, string
from Bio import AlignIO, SubsMat
from Bio.SubsMat import MatrixInfo
from Bio.pairwise2 import dictionary_match
from optparse import OptionParser
from pfacts003.phylofacts.models import Family, TreeNodeAlignmentConservation

# Module-level object built once at import time: the BLOSUM62 table from
# MatrixInfo wrapped in a SeqMat and handed to pairwise2.dictionary_match.
# NOTE(review): presumably used as a callable scoring a pair of residues --
# confirm against the callers.
blosum62_of_residues = dictionary_match(SubsMat.SeqMat(MatrixInfo.blosum62))


def get_alignment_seqs_and_aligned_column_indices(alignment):
    """Collect the row strings of an alignment and its aligned columns.

    Args:
        alignment: Iterable of records, each exposing a ``seq`` attribute.

    Returns:
        A tuple ``(alignment_seqs, aligned_column_indices)``:
        ``alignment_seqs`` maps each zero-based row position to that row's
        sequence as a string; ``aligned_column_indices`` is the set of
        column indices whose character in the FIRST row is ``'-'`` or an
        uppercase letter. Both are empty for an empty alignment.
    """
    alignment_seqs = {}
    aligned_column_indices = set()

    for row_index, row in enumerate(alignment):
        # str() is the portable replacement for the removed Seq.tostring().
        seq = str(row.seq)
        if row_index == 0:
            # Only the first row decides which columns count as aligned.
            aligned_column_indices.update(
                col for col, char in enumerate(seq)
                if char == '-' or char.isupper()
            )
        alignment_seqs[row_index] = seq
    return (alignment_seqs, aligned_column_indices)
Example #8
0
# `ftab_file` and `f` are assigned before this point (not shown here).
# Frequency table read via FreqTable.read_count.
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
# Frequency table read via FreqTable.read_freq.
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
# One line per letter: absolute difference between the two tables.
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Open the pickle in binary mode; text mode is not wanted on Python 3.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = pickle.load(handle)
# Rebuild the observed-frequency matrix and the expected-frequency table
# from the unpickled accepted replacements matrix.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")


f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
# Compare the table read from file with the one derived from the matrix.
for i in sorted(ftab_prot):
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

# Accumulate the per-letter sums; the total is reported on the last line.
s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
counts = obs_freq_mat.sum()
for key in sorted(counts):
    f.write("%s\t%.2f\n" % (key, counts[key]))
    s += counts[key]
f.write("Total sum %.2f should be 1.0\n" % (s))
Example #9
0
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output goes to standard output.
f = sys.stdout
# Frequency table derived from raw letter counts; `with` closes the file.
ftab_file = os.path.join("SubsMat", "protein_count.txt")
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
# Frequency table stored directly as frequencies.
ctab_file = os.path.join("SubsMat", "protein_freq.txt")
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join("SubsMat", "acc_rep_mat.pik")
# Pickles are binary data: open in 'rb' rather than text mode.
with open(pickle_file, "rb") as handle:
    acc_rep_mat = cPickle.load(handle)
# Rebuild the observed-frequency matrix and the expected-frequency table
# from the unpickled replacement counts.
acc_rep_mat = SubsMat.SeqMat(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")


f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() yields a list on both Python 2 and 3; keys().sort() is
# Python 2 only.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.0
f.write("Calculating sum of letters for an observed frequency matrix\n")
# all_letters_sum() is called before sum_letters is read.
obs_freq_mat.all_letters_sum()
ks = sorted(obs_freq_mat.sum_letters.keys())
Example #10
0
        print("Type 'C' to see the specific values of a score matrix")
        user_anwer = input ()
        if user_anwer == "A":
            print(options)
        if user_anwer == "B":
            print("\nExactly, in what matrix do you whant to focus?")
            user_anwer2 = input()
            if user_anwer2 == "benner":
                    print(posible_matrix [9:12])
            if user_anwer2 == "blosum":
                    print(posible_matrix [12:28])
            if user_anwer2 == "pam":
                    print(posible_matrix [39:-3])
        if user_anwer == "C":
            user_anwer3 = input("\nIndicate which one (in lower case):")
            print(SubsMat.SeqMat(mappings[user_anwer3]))

    #With this option it calculates the Ras score for one alignment
    if main_answer == "score":
        print("\nDo you have the first sequence in a .txt fasta file?")
        file_aswer = input("Type 'yes' or 'no': ")
        if file_aswer == "yes":
            #It allows reading the sequence from a fasta file (applies to all the sequence inputs)
            seq1 = ""
            file_input = input("\nIndicate the name of the file. If it is not in the same directory, indicate the path: ")
            with open(file_input) as f:
                for line in f:
                    if not line.startswith(">"):
                        seq1 += line.strip()
        if file_aswer == "no":
            #It also allows typing in by hand the sequence you want to use (applies to all the sequence inputs)
Example #11
0
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output goes to standard output.
f = sys.stdout
# Frequency table derived from raw letter counts; `with` closes the file.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
# Frequency table stored directly as frequencies.
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickles are binary data: open in 'rb' rather than text mode.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
# Rebuild the observed-frequency matrix and the expected-frequency table
# from the unpickled replacement counts.
acc_rep_mat = SubsMat.SeqMat(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write(
    "Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() yields a list on both Python 2 and 3; keys().sort() is
# Python 2 only.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
# all_letters_sum() is called before sum_letters is read.
obs_freq_mat.all_letters_sum()
# NOTE(review): if the code following this line sorts `ks` in place, it
# has the same Python 3 problem as above -- confirm.
ks = obs_freq_mat.sum_letters.keys()