def substitution_matrices(self):
    """Build one log-odds substitution matrix per alignment.

    ``self.kinase_msa`` and then ``self.peptide_msa`` are each read as a
    "tab"-format alignment, summarised, converted to an accepted replacement
    matrix and finally to a log-odds matrix.

    NOTE(review): the alignment-reading call was truncated in the source this
    was recovered from (only ``, "tab")`` survived). ``AlignIO.read(msa,
    "tab")`` is the assumed reconstruction -- confirm against upstream.

    Returns:
        A ``(kinase_matrix, peptide_matrix)`` tuple of log-odds matrices.
    """
    # Local import so the reconstructed call does not depend on a file-level
    # import that may not be present.
    from Bio import AlignIO

    subs = []
    for msa in (self.kinase_msa, self.peptide_msa):
        c_align = AlignIO.read(msa, "tab")
        summary_align = AlignInfo.SummaryInfo(c_align)
        replace_info = summary_align.replacement_dictionary()
        my_arm = SubsMat.SeqMat(replace_info)
        # The result was previously computed and discarded, which left
        # ``subs`` empty and made the return statement raise IndexError.
        subs.append(SubsMat.make_log_odds_matrix(my_arm))
    return subs[0], subs[1]
def create_matriz_Sustitucion(summary_align):
    """Build a log-odds substitution matrix from an alignment summary.

    The replacement dictionary taken from ``summary_align`` gives the accepted
    number of replacements, i.e. how often different residues are expected to
    substitute for one another.
    """
    # SeqMat() takes the replacement dictionary as its argument and yields the
    # Accepted Replacement Matrix (ARM), which is then converted to log-odds.
    accepted_replacements = SubsMat.SeqMat(summary_align.replacement_dictionary())
    return SubsMat.make_log_odds_matrix(accepted_replacements)
Esempio n. 3
def createScoreMatrixFromAlignment(filename, output, print_=False):
    """Derive a log-odds score matrix from an alignment file and pickle it.

    NOTE(review): the alignment-reading call was truncated in the source this
    was recovered from (only ``, "fasta")`` survived). ``AlignIO.read(filename,
    "fasta")`` is the assumed reconstruction -- confirm against upstream.

    Args:
        filename: alignment file, read in "fasta" format (see note above).
        output: path the pickled log-odds matrix is written to.
        print_: kept for backward compatibility; not used by this function.

    Returns:
        The log-odds matrix that was written to ``output``.
    """
    # Local import so the reconstructed call does not depend on a file-level
    # import that may not be present.
    from Bio import AlignIO

    c_align = AlignIO.read(filename, "fasta")
    summary_align = AlignInfo.SummaryInfo(c_align)
    # "*" is handed to replacement_dictionary as the only entry of its
    # character list (presumably the characters to leave out -- confirm).
    replace_info = summary_align.replacement_dictionary(["*"])
    my_arm = SubsMat.SeqMat(replace_info)

    # Add a pseudocount of 1 to every entry of the replacement matrix.
    for m in my_arm:
        my_arm[m] += 1
    my_lom = SubsMat.make_log_odds_matrix(my_arm)

    # Use a context manager so the output file is flushed and closed even if
    # pickling fails; the handle was previously never closed.
    with open(output, "wb") as handle:
        pickle.dump(my_lom, handle)
    return my_lom
Esempio n. 4
def get_matrix(seq_ls, scale, weight_type):
    """Build a scaled log-odds matrix from a list of protein sequences.

    Args:
        seq_ls: sequences added, in order, to a protein alignment.
        scale: passed as ``factor`` to ``SubsMat.make_log_odds_matrix``.
        weight_type: ``'None'`` gives every sequence weight 1.0;
            ``'Henikoff'`` takes the weights from ``henikoff_weights``.

    Returns:
        ``(matrix, matrix_type, 0)`` where ``matrix`` is the log-odds matrix
        and ``matrix_type`` is the string ``'?'``.

    Raises:
        ValueError: if ``weight_type`` is neither ``'None'`` nor
            ``'Henikoff'`` (this previously surfaced later as an unbound
            ``weights`` variable).
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    seqs = seq_ls

    bcounts = freq_counts(seqs, alphabet)
    if weight_type == 'None':
        weights = [1.0] * len(seqs)
    elif weight_type == 'Henikoff':
        # NOTE(review): the tail of this call was lost in the source this was
        # recovered from. ``bcounts`` is the assumed final argument (it is
        # computed above and used nowhere else) -- confirm against upstream.
        weights = henikoff_weights(seqs, alphabet, bcounts)
    else:
        raise ValueError("unknown weight_type: %r" % (weight_type,))

    align = Generic.Alignment(IUPAC.protein)
    for i, seq in enumerate(seqs):
        align.add_sequence("Seq #%d" % i, seq, weight=weights[i])
    summary_align = AlignInfo.SummaryInfo(align)

    # Must get expected frequencies from our own (i.e. whole
    # database) background frequencies. Otherwise they would be
    # derived from the alignment which wouldn't be good if we
    # have a small sample
    ftab = FreqTable(bg_dict, FREQ)
    arm = SubsMat.SeqMat(summary_align.replacement_dictionary())
    lom = SubsMat.make_log_odds_matrix(arm, ftab, factor=scale,
                                       round_digit=0, keep_nd=0)

    PM = lom
    matrix_type = '?'#SCORE
    return PM, matrix_type, 0
Esempio n. 5
# Load two frequency tables: one read with read_count, one with read_freq.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickle data is binary: open in 'rb', text mode does not work on Python 3.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

# The opening ``f.write(`` of this call was missing, leaving an orphaned
# string argument; restored to match the other report lines.
f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
ks = ftab_prot.keys()
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
counts = obs_freq_mat.sum()
keys = counts.keys()
Esempio n. 6
# standard library
import sys

# Biopython
from Bio import SubsMat
from Bio import Clustalw
from Bio.Alphabet import IUPAC
from Bio.Align import AlignInfo

# get an alignment object from a Clustalw alignment output
c_align = Clustalw.parse_file('protein.aln', IUPAC.protein)
summary_align = AlignInfo.SummaryInfo(c_align)

# get a replacement dictionary and accepted replacement matrix
# exclude all amino acids that aren't charged polar
replace_info = summary_align.replacement_dictionary(["G", "A", "V", "L", "I",
                                                     "M", "P", "F", "W", "S",
                                                     "T", "N", "Q", "Y", "C"])

my_arm = SubsMat.SeqMat(replace_info)

# Parenthesised single-argument print: same output under Python 2, and valid
# syntax under Python 3 (the bare print statement is not).
print(replace_info)

my_lom = SubsMat.make_log_odds_matrix(my_arm)

# One pre-formatted string keeps the "label value" output identical on both
# Python 2 and Python 3.
print('log_odds_mat: %s' % (my_lom,))


Esempio n. 7
#!/usr/bin/env python

import os, re, string
from Bio import AlignIO, SubsMat
from Bio.SubsMat import MatrixInfo
from Bio.pairwise2 import dictionary_match
from optparse import OptionParser
from pfacts003.phylofacts.models import Family, TreeNodeAlignmentConservation

# Module-level scorer: the BLOSUM62 table from MatrixInfo, wrapped in a SeqMat
# and handed to pairwise2's dictionary_match. Built once at import time.
blosum62_of_residues = dictionary_match(SubsMat.SeqMat(MatrixInfo.blosum62))

def get_alignment_seqs_and_aligned_column_indices(alignment):
    """Collect the row sequences and the aligned columns of an alignment.

    A column counts as aligned when the FIRST row has either a gap (``'-'``)
    or an uppercase character at that position; later rows are not inspected
    for this purpose.

    NOTE(review): the body of the column test was missing in the source this
    was recovered from. Adding the column index to the returned set is the
    assumed intent (the set is returned but was never populated) -- confirm
    against upstream.

    Args:
        alignment: iterable of rows, each exposing a ``seq`` attribute.

    Returns:
        ``(alignment_seqs, aligned_column_indices)``: a dict mapping the row
        position (0-based) to its sequence string, and the set of aligned
        column indices. Both are empty for an empty alignment.
    """
    aligned_column_indices = set()
    alignment_seqs = {}

    for i, row in enumerate(alignment):
        # str() is the supported way to get the sequence text; Seq.tostring()
        # has been removed from current Biopython releases.
        seq = str(row.seq)
        if i == 0:
            aligned_column_indices.update(
                j for j, char in enumerate(seq)
                if char == '-' or char.isupper()
            )
        alignment_seqs[i] = seq
    return (alignment_seqs, aligned_column_indices)
Esempio n. 8
# Load two frequency tables: one read with read_count, one with read_freq.
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
# Report the per-letter absolute difference between the two tables.
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Open the pickle in binary mode ('rb'); text mode must not be used on
# Python 3.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = pickle.load(handle)
# Wrap the unpickled data, then derive an observed-frequency matrix and an
# expected-frequency table from it.
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

# Compare the table loaded from file with the one derived from the matrix;
# keys are sorted so the report order is stable.
f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
for i in sorted(ftab_prot):
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

# Accumulate the per-letter sums of the observed-frequency matrix into ``s``
# and report the total alongside its expected value.
s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
counts = obs_freq_mat.sum()
for key in sorted(counts):
    f.write("%s\t%.2f\n" % (key, counts[key]))
    s += counts[key]
f.write("Total sum %.2f should be 1.0\n" % (s))
Esempio n. 9
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output goes to standard output.
f = sys.stdout
# Load two frequency tables: one read with read_count, one with read_freq.
# Context managers close the input files instead of leaking the handles.
ftab_file = os.path.join("SubsMat", "protein_count.txt")
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join("SubsMat", "protein_freq.txt")
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join("SubsMat", "acc_rep_mat.pik")
# Pickle data is binary: open in "rb" (text mode fails on Python 3) and close
# the file once loaded.
with open(pickle_file, "rb") as handle:
    acc_rep_mat = cPickle.load(handle)
acc_rep_mat = SubsMat.SeqMat(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

# Compare the table loaded from file with the one derived from the matrix.
f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
ks = ftab_prot.keys()
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.0
f.write("Calculating sum of letters for an observed frequency matrix\n")
ks = obs_freq_mat.sum_letters.keys()
Esempio n. 10
        print("Type 'C' to see the specific values of a score matrix")
        user_anwer = input ()
        if user_anwer == "A":
        if user_anwer == "B":
            print("\nExactly, in what matrix do you whant to focus?")
            user_anwer2 = input()
            if user_anwer2 == "benner":
                    print(posible_matrix [9:12])
            if user_anwer2 == "blosum":
                    print(posible_matrix [12:28])
            if user_anwer2 == "pam":
                    print(posible_matrix [39:-3])
        if user_anwer == "C":
            user_anwer3 = input("\nIndicate which one (in lower case):")

    # With this option it calculates the Ras score for one alignment
    if main_answer == "score":
        print("\nDo you have the first sequence in a .txt fasta file?")
        file_aswer = input("Type 'yes' or 'no': ")
        if file_aswer == "yes":
            # Read the sequence from a FASTA file (this applies to all the sequence inputs)
            seq1 = ""
            file_input = input("\nIndicate the name of the file. If it is not in the same directory, indicate the path: ")
            with open(file_input) as f:
                for line in f:
                    if not line.startswith(">"):
                        seq1 += line.strip()
        if file_aswer == "no":
            # The sequence can also be typed in by hand (this applies to all the sequence inputs)
Esempio n. 11
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

# All report output goes to standard output.
f = sys.stdout
# Load two frequency tables: one read with read_count, one with read_freq.
# Context managers close the input files instead of leaking the handles.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickle data is binary: open in 'rb' (text mode fails on Python 3) and close
# the file once loaded.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
acc_rep_mat = SubsMat.SeqMat(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

# The opening ``f.write(`` of this call was missing, leaving an orphaned
# string argument; restored to match the other report lines.
f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
ks = ftab_prot.keys()
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
ks = obs_freq_mat.sum_letters.keys()