def substitution_matrices(self):
    """Build log-odds substitution matrices for the kinase and peptide MSAs.

    Each of ``self.kinase_msa`` and ``self.peptide_msa`` is read with
    ``AlignIO.read(..., "tab")``, summarised into a replacement dictionary
    and converted into a log-odds matrix.

    Returns:
        A 2-tuple ``(kinase_matrix, peptide_matrix)``, in that order.
    """

    def _log_odds_from_msa(msa):
        # alignment -> replacement counts -> replacement matrix -> log-odds
        alignment = AlignIO.read(msa, "tab")
        replacements = AlignInfo.SummaryInfo(alignment).replacement_dictionary()
        return SubsMat.make_log_odds_matrix(SubsMat.SeqMat(replacements))

    kinase_lom, peptide_lom = [
        _log_odds_from_msa(msa) for msa in (self.kinase_msa, self.peptide_msa)
    ]
    return kinase_lom, peptide_lom
def create_matriz_Sustitucion(summary_align):
    """Build, print and return a log-odds substitution matrix.

    The replacement dictionary gives the accepted number of replacements,
    i.e. how often different residues are expected to substitute for each
    other.

    Args:
        summary_align: object exposing ``replacement_dictionary()``
            (presumably a ``Bio.Align.AlignInfo.SummaryInfo`` -- confirm
            against the caller).

    Returns:
        The matrix produced by ``SubsMat.make_log_odds_matrix``. It is also
        printed in full via ``print_full_mat()`` before being returned.
    """
    # SeqMat() takes the replacement dictionary as its argument and yields
    # the Accepted Replacement Matrix (ARM).
    accepted_replacements = SubsMat.SeqMat(summary_align.replacement_dictionary())
    log_odds = SubsMat.make_log_odds_matrix(accepted_replacements)
    log_odds.print_full_mat()
    return log_odds
def createScoreMatrixFromAlignment(filename, output, print_=False):
    """Derive a log-odds score matrix from a FASTA alignment and pickle it.

    Args:
        filename: alignment file, read with ``AlignIO.read(filename, "fasta")``.
        output: path the pickled matrix is written to (opened in binary mode).
        print_: kept for backward compatibility; this function does not use it.

    Returns:
        The log-odds matrix that was written to ``output``.
    """
    c_align = AlignIO.read(filename, "fasta")
    summary_align = AlignInfo.SummaryInfo(c_align)
    # "*" is handed to replacement_dictionary() as the only character to
    # leave out of the replacement counts.
    replace_info = summary_align.replacement_dictionary(["*"])
    my_arm = SubsMat.SeqMat(replace_info)
    # Add a pseudocount of 1 to every entry of the replacement matrix.
    for m in my_arm:
        my_arm[m] += 1
    my_lom = SubsMat.make_log_odds_matrix(my_arm)
    # Use a context manager so the output file is flushed and closed even if
    # pickling fails (the handle used to be left open).
    with open(output, "wb") as handle:
        pickle.dump(my_lom, handle)
    return my_lom
def get_matrix(seq_ls, scale, weight_type):
    """Build a log-odds matrix from a list of protein sequences.

    Args:
        seq_ls: sequences added, in order, to a ``Generic.Alignment``.
        scale: forwarded to ``SubsMat.make_log_odds_matrix`` as ``factor``.
        weight_type: the string ``'None'`` for a uniform weight of 1.0 per
            sequence, or ``'Henikoff'`` to use ``henikoff_weights``.

    Returns:
        A 3-tuple ``(matrix, matrix_type, 0)`` where ``matrix_type`` is
        always ``'?'``.

    Raises:
        ValueError: if ``weight_type`` is not one of the supported values.
            Previously this left ``weights`` unbound and failed later with
            a NameError inside the loop.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    seqs = seq_ls
    bcounts = freq_counts(seqs, alphabet)
    if weight_type == 'None':
        weights = [1.0] * len(seqs)
    elif weight_type == 'Henikoff':
        weights = henikoff_weights(seqs, alphabet, bcounts)
    else:
        raise ValueError(
            "unsupported weight_type %r (expected 'None' or 'Henikoff')"
            % (weight_type,))

    align = Generic.Alignment(IUPAC.protein)
    for i, seq in enumerate(seqs):
        align.add_sequence("Seq #%d" % i, seq, weight=weights[i])
    summary_align = AlignInfo.SummaryInfo(align)

    # Must get expected frequencies from our own (i.e. whole
    # database) background frequencies. Otherwise they would be
    # derived from the alignment which wouldn't be good if we
    # have a small sample
    ftab = FreqTable(bg_dict, FREQ)
    arm = SubsMat.SeqMat(summary_align.replacement_dictionary())
    lom = SubsMat.make_log_odds_matrix(arm, ftab, factor=scale,
                                       round_digit=0, keep_nd=0)

    matrix_type = '?'  # SCORE
    return lom, matrix_type, 0
# Load the count-derived and the reference frequency tables and report the
# per-letter difference between them.
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)
f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

# Rebuild the observed-frequency matrix from the pickled accepted
# replacements matrix.
pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Don't want to use text mode on Python 3,
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write(
    "Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() works on Python 2 lists and Python 3 key views alike; calling
# .sort() on the result of .keys() fails on Python 3.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
counts = obs_freq_mat.sum()
keys = counts.keys()
""" # standard library import sys # Biopython from Bio import SubsMat from Bio import Clustalw from Bio.Alphabet import IUPAC from Bio.Align import AlignInfo # get an alignment object from a Clustalw alignment output c_align = Clustalw.parse_file('protein.aln', IUPAC.protein) summary_align = AlignInfo.SummaryInfo(c_align) # get a replacement dictionary and accepted replacement matrix # exclude all amino acids that aren't charged polar replace_info = summary_align.replacement_dictionary(["G", "A", "V", "L", "I", "M", "P", "F", "W", "S", "T", "N", "Q", "Y", "C"]) my_arm = SubsMat.SeqMat(replace_info) print replace_info my_lom = SubsMat.make_log_odds_matrix(my_arm) print 'log_odds_mat:', my_lom my_lom.print_mat()
#!/usr/bin/env python

import os
import re
import string
from optparse import OptionParser

from Bio import AlignIO, SubsMat
from Bio.SubsMat import MatrixInfo
from Bio.pairwise2 import dictionary_match

from pfacts003.phylofacts.models import Family, TreeNodeAlignmentConservation

# Pairwise residue scoring callable built from the BLOSUM62 matrix.
blosum62_of_residues = dictionary_match(SubsMat.SeqMat(MatrixInfo.blosum62))


def get_alignment_seqs_and_aligned_column_indices(alignment):
    """Collect the row sequences and the aligned columns of an alignment.

    A column counts as aligned when the FIRST row holds a gap (``'-'``) or
    an upper-case character at that position; later rows are not inspected.

    Args:
        alignment: iterable of rows, each exposing ``row.seq.tostring()``.

    Returns:
        A 2-tuple ``(alignment_seqs, aligned_column_indices)``:
        ``alignment_seqs`` maps the 0-based row index to the row's sequence
        string, and ``aligned_column_indices`` is a set of 0-based column
        indices.
    """
    alignment_seqs = {}
    aligned_column_indices = set()
    for i, row in enumerate(alignment):
        seq = row.seq.tostring()
        if i == 0:
            # Only the first row decides which columns are aligned.
            aligned_column_indices.update(
                j for j, residue in enumerate(seq)
                if residue == '-' or residue.isupper()
            )
        alignment_seqs[i] = seq
    return (alignment_seqs, aligned_column_indices)
# Frequency table derived from raw counts vs. the reference frequency table.
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)

f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    difference = abs(ftab_prot[i] - ctab_prot[i])
    f.write("%s %f\n" % (i, difference))

# Rebuild the observed-frequency matrix from the pickled replacement matrix.
pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Binary mode: text mode must not be used for the pickle on Python 3.
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = pickle.load(handle)
acc_rep_mat = SubsMat.AcceptedReplacementsMatrix(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
for i in sorted(ftab_prot):
    difference = abs(ftab_prot[i] - ftab_prot2[i])
    f.write("%s %.2f\n" % (i, difference))

# Per-letter sums of the observed-frequency matrix, accumulated into s.
s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
counts = obs_freq_mat.sum()
for key in sorted(counts):
    letter_sum = counts[key]
    f.write("%s\t%.2f\n" % (key, letter_sum))
    s += letter_sum
f.write("Total sum %.2f should be 1.0\n" % (s))
from Bio.SubsMat import FreqTable, MatrixInfo

f = sys.stdout

# Load the count-derived and the reference frequency tables; the context
# managers close the input files (the bare open() calls used to leak them).
ftab_file = os.path.join("SubsMat", "protein_count.txt")
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join("SubsMat", "protein_freq.txt")
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)

f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join("SubsMat", "acc_rep_mat.pik")
# Pickles are binary data: read them in 'rb' mode so that loading does not
# depend on text-mode decoding (which breaks on Python 3).
with open(pickle_file, "rb") as handle:
    acc_rep_mat = cPickle.load(handle)
acc_rep_mat = SubsMat.SeqMat(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write("Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() works on Python 2 lists and Python 3 key views alike; calling
# .sort() on the result of .keys() fails on Python 3.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.0
f.write("Calculating sum of letters for an observed frequency matrix\n")
obs_freq_mat.all_letters_sum()
ks = sorted(obs_freq_mat.sum_letters.keys())
print("Type 'C' to see the specific values of a score matrix") user_anwer = input () if user_anwer == "A": print(options) if user_anwer == "B": print("\nExactly, in what matrix do you whant to focus?") user_anwer2 = input() if user_anwer2 == "benner": print(posible_matrix [9:12]) if user_anwer2 == "blosum": print(posible_matrix [12:28]) if user_anwer2 == "pam": print(posible_matrix [39:-3]) if user_anwer == "C": user_anwer3 = input("\nIndicate which one (in lower case):") print(SubsMat.SeqMat(mappings[user_anwer3])) #With this option it calculate the Ras score for one alignment if main_answer == "score": print("\nDo you have the first sequence in a .txt fasta file?") file_aswer = input("Type 'yes' or 'no': ") if file_aswer == "yes": #It allows to read the sequence from a fasta file (apply to al the sequence inputs) seq1 = "" file_input = input("\nIndicate the name of the file. If it is not in the same directory, indicate the path: ") with open(file_input) as f: for line in f: if not line.startswith(">"): seq1 += line.strip() if file_aswer == "no": #Also it allows to type by hand the sequence you whant to use (apply to al the sequence inputs)
from Bio import SubsMat
from Bio.SubsMat import FreqTable, MatrixInfo

f = sys.stdout

# Load the count-derived and the reference frequency tables; the context
# managers close the input files (the bare open() calls used to leak them).
ftab_file = os.path.join('SubsMat', 'protein_count.txt')
with open(ftab_file) as handle:
    ftab_prot = FreqTable.read_count(handle)
ctab_file = os.path.join('SubsMat', 'protein_freq.txt')
with open(ctab_file) as handle:
    ctab_prot = FreqTable.read_freq(handle)

f.write("Check differences between derived and true frequencies for each\n")
f.write("letter. Differences should be very small\n")
for i in ftab_prot.alphabet.letters:
    f.write("%s %f\n" % (i, abs(ftab_prot[i] - ctab_prot[i])))

pickle_file = os.path.join('SubsMat', 'acc_rep_mat.pik')
# Pickles are binary data: read them in 'rb' mode so that loading does not
# depend on text-mode decoding (which breaks on Python 3).
with open(pickle_file, 'rb') as handle:
    acc_rep_mat = cPickle.load(handle)
acc_rep_mat = SubsMat.SeqMat(acc_rep_mat)
obs_freq_mat = SubsMat._build_obs_freq_mat(acc_rep_mat)
ftab_prot2 = SubsMat._exp_freq_table_from_obs_freq(obs_freq_mat)
obs_freq_mat.print_mat(f=f, format=" %4.3f")

f.write(
    "Diff between supplied and matrix-derived frequencies, should be small\n")
# sorted() works on Python 2 lists and Python 3 key views alike; calling
# .sort() on the result of .keys() fails on Python 3.
ks = sorted(ftab_prot.keys())
for i in ks:
    f.write("%s %.2f\n" % (i, abs(ftab_prot[i] - ftab_prot2[i])))

s = 0.
f.write("Calculating sum of letters for an observed frequency matrix\n")
obs_freq_mat.all_letters_sum()
# NOTE(review): whatever consumes these keys lies beyond this excerpt, so
# this statement is left unchanged.
ks = obs_freq_mat.sum_letters.keys()