def infer_longest_peptide(masses):
    '''Returns the longest protein string that matches the spectrum graph of the given masses.

    An edge (masses[i], masses[j]) with i < j is added to the graph whenever
    find_weight_match reports a protein whose weight is within 0.001 of
    masses[j] - masses[i]; the edge is labelled with that protein.  The result is
    the concatenation of the labels along a longest path, built by visiting the
    nodes in the order returned by topological_ordering.

    masses -- sequence of numeric masses.  The early exit from the inner loop is
        only valid if the masses are in ascending order (presumably the caller
        sorts them; verify).

    Returns the longest label string, or '' if no pair of masses matches a protein.
    '''
    protein_weight_dict = ProteinWeightDict()

    # Largest mass difference worth testing.  This does not depend on the loop
    # variables, so it is computed once rather than on every inner iteration.
    max_gap = max(protein_weight_dict.values()) + 1

    # Build the graph from the given masses.
    graph = dict()
    for i, low in enumerate(masses):
        for high in masses[i + 1:]:
            gap = high - low
            # Break the inner loop if we've exceeded the maximum weight.
            if gap > max_gap:
                break
            # Check if the gap approximately matches a known protein.
            temp_protein = find_weight_match(gap, 0.001)
            if temp_protein is not None:
                graph[low, high] = temp_protein

    # Without any edge there is no path; max() below would raise on an empty sequence.
    if not graph:
        return ''

    # Get the topological ordering of the graph.
    top_order = topological_ordering(list(graph))

    # Index the incoming edges of every node once, instead of rescanning all
    # edges for each node.  The edge order within each list is preserved.
    predecessors = dict()
    for tail, head in graph:
        predecessors.setdefault(head, []).append(tail)

    # Build the longest path to each node.
    S = {node: '' for node in top_order}
    for node in top_order:
        for predecessor in predecessors.get(node, []):
            if len(S[predecessor]) + 1 > len(S[node]):
                S[node] = S[predecessor] + graph[(predecessor, node)]

    # Return the longest path.
    return max(S.values(), key=len)
def infer_longest_peptide(masses):
    '''Returns the longest protein string that matches the spectrum graph of the given masses.

    Two masses are joined by an edge (smaller index first) when find_weight_match
    finds a protein whose weight is within 0.001 of their difference; that protein
    labels the edge.  Nodes are then processed in the order given by
    topological_ordering, keeping for each node the longest label string that
    reaches it.

    masses -- sequence of numeric masses; the inner-loop break assumes ascending
        order (TODO confirm that callers pass sorted data).

    Returns the longest label string found, or '' when the graph has no edges.
    '''
    protein_weight_dict = ProteinWeightDict()

    # The cutoff is loop-invariant, so evaluate it a single time up front.
    weight_cutoff = max(protein_weight_dict.values()) + 1
    mass_count = len(masses)

    # Build the graph from the given masses.
    graph = dict()
    for i in range(mass_count):
        for j in range(i + 1, mass_count):
            difference = masses[j] - masses[i]
            # Break the inner loop if we've exceeded the maximum weight.
            if difference > weight_cutoff:
                break
            # Check if the difference approximately matches a known protein.
            temp_protein = find_weight_match(difference, 0.001)
            if temp_protein is not None:
                graph[masses[i], masses[j]] = temp_protein

    # No edges means no peptide; returning here avoids max() failing on an
    # empty sequence at the end of the function.
    if not graph:
        return ''

    # Get the topological ordering of the graph.
    top_order = topological_ordering(list(graph))

    # Group edge sources by destination once so that each node only looks at
    # its own incoming edges (edge order is kept, so tie-breaking is unchanged).
    incoming = dict()
    for source, target in graph:
        incoming.setdefault(target, []).append(source)

    # Build the longest path to each node.
    S = {node: '' for node in top_order}
    for node in top_order:
        for predecessor in incoming.get(node, []):
            if len(S[predecessor]) + 1 > len(S[node]):
                S[node] = S[predecessor] + graph[(predecessor, node)]

    # Return the longest path.
    return max(S.values(), key=len)
def spectrum(peptide):
    '''Returns the linear spectrum of a given peptide.

    The spectrum contains 0, the mass of the entire peptide, and the mass of every
    contiguous subpeptide of length 1 through len(peptide) - 1.  Each protein
    weight is converted with int() before it is summed.

    Returns a list holding the masses as strings, ordered by ascending numeric mass.
    '''
    # Dictionary translating each protein to its weight.
    weight = ProteinWeightDict()

    # prefix[k] is the integer mass of peptide[:k], so the mass of
    # peptide[j:j+i] is prefix[j+i] - prefix[j] and no subpeptide has to be
    # summed from scratch.
    prefix = [0]
    for protein in peptide:
        prefix.append(prefix[-1] + int(weight[protein]))
    size = len(prefix) - 1

    # Initialize as the mass 0 and the mass of the entire peptide.
    spec = [0, prefix[-1]]

    # Find the masses of the adjacent intermediary subpeptides.
    spec += [prefix[j + i] - prefix[j]
             for i in range(1, size)
             for j in range(size - i + 1)]

    # Sort the list in ascending order and convert to strings.  A list is built
    # explicitly so callers can index it regardless of how map() behaves.
    return [str(mass) for mass in sorted(spec)]
def cyclospectrum(peptide):
    '''Returns the cyclic spectrum of a given peptide.

    The spectrum contains 0, the mass of the entire peptide, and, for every
    length from 1 through len(peptide) - 1 and every starting position, the mass
    of the subpeptide of that length read with wrap-around past the end.  Each
    protein weight is converted with int() before it is summed.

    Returns a list holding the masses as strings, ordered by ascending numeric mass.
    '''
    # Dictionary translating each protein to its weight.
    weight = ProteinWeightDict()

    # Look every weight up once, then take prefix sums over the doubled sequence
    # so that a wrapped window starting at j with length i has mass
    # prefix[j + i] - prefix[j].  This avoids rebuilding peptide * 2 and
    # re-summing it for every window.
    residue_masses = [int(weight[protein]) for protein in peptide]
    size = len(residue_masses)
    prefix = [0]
    for mass in residue_masses * 2:
        prefix.append(prefix[-1] + mass)

    # Initialize as the mass 0 and the mass of the entire peptide.
    cyclospec = [0, prefix[size]]

    # Find the masses of the adjacent intermediary subpeptides.
    cyclospec += [prefix[j + i] - prefix[j]
                  for i in range(1, size)
                  for j in range(size)]

    # Sort the list in ascending order and convert to strings.  A list is built
    # explicitly so callers can index it regardless of how map() behaves.
    return [str(mass) for mass in sorted(cyclospec)]
# -*- coding: utf-8 -*- """ Created on Tue May 31 19:32:46 2016 @author: Johnqiu """ """ 问题1:真实的数据是按大小的顺序排列,模拟数据需不要排序? """ from scripts import ProteinWeightDict, IonTypeDict import random from operator import itemgetter aa_table = ProteinWeightDict() ion_table = IonTypeDict() def simulatePeptide(pep_len): acids = [k for k in aa_table] peptide=[random.choice(acids) for i in range(pep_len)] peptide = ''.join(peptide) # change list to String return peptide def generateSpectrum(peptide, ion_table,intensity = 100): """ Args: -peptide: a peptide string -iontables: {ion:(offset,prob)} """ spectrum = [] prefix_mass = 0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Calculating Protein Mass
Rosalind ID: PRTM
Rosalind #: 020
URL: http://rosalind.info/problems/prtm/
'''

from scripts import ProteinWeightDict

# Load the data.  The context manager closes the file even if reading fails.
with open('data/rosalind_prtm.txt') as input_data:
    protein_str = input_data.read().strip()

# Load the dictionary that translates protein to monoisotopic weight.
weight_dict = ProteinWeightDict()

# Calculate the weight protein by protein.
monoisotopic_weight = sum(weight_dict[protein] for protein in protein_str)

# Print and save the weight.  The parenthesised form prints the same text
# whether print is a statement or a function.
print(monoisotopic_weight)
with open('output/020_PRTM.txt', 'w') as output_data:
    output_data.write(str(monoisotopic_weight))
def spectrum_score(peptide, exp_spec):
    '''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

    Every distinct mass in the peptide's spectrum contributes the smaller of its
    number of occurrences in the two spectra.  Returns -1 instead when the last
    entry of the peptide's spectrum is greater than the last entry of exp_spec.

    NOTE(review): the last entry is treated as the total mass, which assumes both
    spectra are in ascending order -- confirm.
    NOTE(review): the comparison and the count() matching need both spectra to
    hold the same element type; the driver below parses exp_spec as ints, so
    confirm that spectrum() also yields ints here.
    '''
    pep_spec = spectrum(peptide)
    # Return -1 if the peptide has more mass than exp_spec.
    if pep_spec[-1] > exp_spec[-1]:
        return -1
    return sum([min(pep_spec.count(protein),exp_spec.count(protein)) for protein in set(pep_spec)])

if __name__ == '__main__':

    # The first line of the input is parsed as the integer n, the second as a list of integer masses.
    with open('data/stepic_2e.txt') as input_data:
        n, spec = [int(line.strip()) if i==0 else map(int,line.strip().split()) for i, line in enumerate(input_data.readlines())]

    # Create the protein weight dictionary.
    weight = ProteinWeightDict()

    # Initialize the scores dictionary.
    scores = dict()

    # Build the initial peptides, dropping those scored -1 (too much mass).
    seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(weight.keys())])

    # Build the sequence until the masses all grow too large.
    # NOTE(review): the comparison with [] presumably relies on filter() returning a list -- confirm the target Python version.
    while seq != []:

        # Store the scores of the current sequence in a dictionary (score -> list of peptides).
        scores = dict()
        for item in seq:
            if item[0] in scores:
                scores[item[0]].append(item[1])
            else:
                scores[item[0]] = [item[1]]
# Dictionary translating RNA to Protein weight = ProteinWeightDict() # Initialize as the mass 0 and the mass of the entire peptide. spec = [0, sum([int(weight[protein]) for protein in peptide])] # Find the masses of the adjacent intermediary subpeptides spec += [sum([int(weight[protein]) for protein in peptide[j:j+i]]) for i in xrange(1,len(peptide)) for j in xrange(len(peptide)-i+1)] # Sort the list in ascending order and convert to strings. spec = map(str,sorted(spec)) return spec with open('data/textbook/rosalind_2d.txt') as input_data: cyclospec = input_data.read().strip().split() # Create the protein weight dictionary. weight = ProteinWeightDict() # Let n be the length of a given peptide, and L be the length of its cyclospectrum. Then L = n(n-1) + 2. # Using the quadratic formula to to solve for n: n = (sqrt(4L-7) + 1)/2 n = int((sqrt(4*len(cyclospec)-7)+1)/2) # Find the first n protein in the peptide. # Need to be careful: two small proteins can add to be less than a larger one, so we can't just take the first n nonzero entries. # Fortunately, no two small proteins masses add to that of a larger protein. protein, i = [], 1 while len(protein) != n: if int(cyclospec[i]) in map(int,weight.values()): protein.append(cyclospec[i]) i += 1 # Get the name of each protein corresponding to a given weight (if multiple, only take one).
# Find the masses of the adjacent intermediary subpeptides spec += [ sum([int(weight[protein]) for protein in peptide[j:j + i]]) for i in xrange(1, len(peptide)) for j in xrange(len(peptide) - i + 1) ] # Sort the list in ascending order and convert to strings. spec = map(str, sorted(spec)) return spec with open('data/textbook/rosalind_2d.txt') as input_data: cyclospec = input_data.read().strip().split() # Create the protein weight dictionary. weight = ProteinWeightDict() # Let n be the length of a given peptide, and L be the length of its cyclospectrum. Then L = n(n-1) + 2. # Using the quadratic formula to to solve for n: n = (sqrt(4L-7) + 1)/2 n = int((sqrt(4 * len(cyclospec) - 7) + 1) / 2) # Find the first n protein in the peptide. # Need to be careful: two small proteins can add to be less than a larger one, so we can't just take the first n nonzero entries. # Fortunately, no two small proteins masses add to that of a larger protein. protein, i = [], 1 while len(protein) != n: if int(cyclospec[i]) in map(int, weight.values()): protein.append(cyclospec[i]) i += 1 # Get the name of each protein corresponding to a given weight (if multiple, only take one).
def append_protein(add_list):
    '''Returns a list containing all peptides from add_list with every possible protein suffix.

    add_list -- iterable of peptides; each must support + with a key of
        ProteinWeightDict.

    The result is grouped by peptide in the order of add_list, and within each
    group follows the key order of ProteinWeightDict.
    '''
    # The suffix alphabet is the same for every peptide, so build it once
    # instead of constructing a new ProteinWeightDict per item.
    suffixes = list(ProteinWeightDict().keys())
    return [item + ch for item in add_list for ch in suffixes]
min(pep_spec.count(protein), exp_spec.count(protein)) for protein in set(pep_spec) ]) if __name__ == '__main__': with open('data/stepic_2e.txt') as input_data: n, spec = [ int(line.strip()) if i == 0 else map(int, line.strip().split()) for i, line in enumerate(input_data.readlines()) ] # Create the protein weight dictionary. weight = ProteinWeightDict() # Initialize the scores dictionary. scores = dict() # Build the intial peptides. seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide, spec), peptide] for peptide in append_protein(weight.keys())]) # Build the sequence until the masses all grow too large. while seq != []: # Store the scores of the current sequence in a dictionary. scores = dict() for item in seq: if item[0] in scores: scores[item[0]].append(item[1]) else:
def find_weight_match(approx_weight, error):
    '''Returns the first protein in ProteinWeightDict whose weight differs from approx_weight by less than error, or None if there is none.'''
    candidates = (
        protein
        for protein, weight in ProteinWeightDict().items()
        if abs(weight - approx_weight) < error
    )
    # next() stops at the first candidate and yields None when there are none.
    return next(candidates, None)
Problem Title: Inferring Protein from Spectrum Rosalind ID: SPEC Rosalind #: 053 URL: http://rosalind.info/problems/spec/ ''' from scripts import ProteinWeightDict # The only major issue is that the given values aren't as precise as those in the table. # Need to find the closest match (or rewrite the weight dictionary with less precision). with open('data/rosalind_spec.txt') as input_data: masses = [float(line.strip()) for line in input_data.readlines()] # Load a list of (protein, weight) pairs. weight_list = ProteinWeightDict().items() # Gives the difference between a given weight and the protein at position i in the weight list. weight_diff = lambda (i, weight): abs(weight - weight_list[i][1]) # Returns the protein whose mass is closest to specified weight. closest_prot = lambda weight: weight_list[min(zip(range(len(weight_list)), [weight]*len(weight_list)), key=weight_diff)[0]][0] # Determine each protein. prot = [closest_prot(masses[i+1]-masses[i]) for i in range(len(masses)-1)] # Concatonate to get the desired protein. print ''.join(prot) with open('output/053_SPEC.txt', 'w') as output_data: output_data.write(''.join(prot))
def spectrum_score(peptide, exp_spec):
    '''Returns the number of matching masses from the spectrum of peptide when compared with the spectrum exp_spec.

    Every distinct mass in the peptide's spectrum contributes the smaller of its
    number of occurrences in the two spectra.  Returns -1 instead when the last
    entry of the peptide's spectrum is greater than the last entry of exp_spec.

    NOTE(review): the last entry is treated as the total mass, which assumes both
    spectra are in ascending order -- confirm.
    NOTE(review): the comparison and the count() matching need both spectra to
    hold the same element type; the driver below parses exp_spec as ints, so
    confirm that spectrum() also yields ints here.
    '''
    pep_spec = spectrum(peptide)
    # Return -1 if the peptide has more mass than exp_spec.
    if pep_spec[-1] > exp_spec[-1]:
        return -1
    return sum([min(pep_spec.count(protein),exp_spec.count(protein)) for protein in set(pep_spec)])

if __name__ == '__main__':

    # The first line of the input is parsed as the integer n, the second as a list of integer masses.
    with open('data/textbook/rosalind_2e.txt') as input_data:
        n, spec = [int(line.strip()) if i==0 else map(int,line.strip().split()) for i, line in enumerate(input_data.readlines())]

    # Create the protein weight dictionary.
    weight = ProteinWeightDict()

    # Initialize the scores dictionary.
    scores = dict()

    # Build the initial peptides, dropping those scored -1 (too much mass).
    seq = filter(lambda L: L[0] != -1, [[spectrum_score(peptide,spec), peptide] for peptide in append_protein(weight.keys())])

    # Build the sequence until the masses all grow too large.
    # NOTE(review): the comparison with [] presumably relies on filter() returning a list -- confirm the target Python version.
    while seq != []:

        # Store the scores of the current sequence in a dictionary (score -> list of peptides).
        scores = dict()
        for item in seq:
            if item[0] in scores:
                scores[item[0]].append(item[1])
            else:
                scores[item[0]] = [item[1]]