def test2_generate_peptides_novariants(self):
    """
    Check the fragmentation of plain-sequence proteins into 3-mers.

    The proteins in ``self.prot_set`` are built from their sequence only and
    carry no transcript or variant information. Two properties are asserted:

    * summed over all generated peptides, the number of originating proteins
      recorded per peptide is 14;
    * feeding every protein in twice does not produce two peptide objects
      that share the same sequence.
    """
    # Each peptide records the proteins it originates from; count one hit
    # per (peptide, protein) origin over the whole result.
    origin_count = 0
    for fragment in generate_peptides_from_proteins(self.prot_set, 3):
        origin_count += len(fragment.proteins.keys())
    self.assertEqual(origin_count, 14)

    # Duplicate the input: the generated peptides must still be unique by
    # sequence, i.e. there are as many distinct objects as distinct strings.
    doubled_input = []
    for _ in range(2):
        doubled_input.extend(self.prot_set)
    distinct_peptides = set(generate_peptides_from_proteins(doubled_input, 3))
    distinct_sequences = {str(fragment) for fragment in distinct_peptides}
    self.assertEqual(len(distinct_peptides), len(distinct_sequences))
def setUp(self):
    """
    Prepare the fixtures shared by the tests of this case.

    Sets three attributes:

    * ``self.peptides`` -- two hand-written 9-mer peptides;
    * ``self.selfpeptides`` -- the sequences of all 9-mers generated from the
      proteins in the packaged ``testSequences.fasta`` example file;
    * ``self.fewselfpeptides`` -- the sequences of all 9-mers generated from
      two hard-coded protein sequences.
    """
    self.peptides = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]

    # NOTE(review): open mode "rU" was removed in Python 3.11; it is kept
    # here unchanged because the target interpreter is not visible from
    # this block.
    fasta_path = pkg_resources.resource_filename(
        'Fred2', path.join('Data', 'examples', 'testSequences.fasta'))
    with open(fasta_path, "rU") as fasta:
        fasta_records = list(SeqIO.parse(fasta, "fasta"))

    fasta_proteins = []
    for record in fasta_records:
        fasta_proteins.append(Protein(str(record.seq)))
    self.selfpeptides = [
        str(pep) for pep in generate_peptides_from_proteins(fasta_proteins, 9)
    ]

    first_sequence = "MKERRIDMKEKNVKAKAPNKKVLGLTTKIFIALLAGAILGIVLCYLVPDSSFKKDVIVEGILYVIGQGFIRLMKMLVVPLVFCSLVCGSMAIGDTKKLGTVGVRTLAFYLATTALAVVALGVGNLINPGVGLDMSAIQSSAASVETMEATSLTDTILNIIPDNPINSLASGSMLQVIVFALIVGVILAKMGERAETVANFFSQFNDIMMEMTMMIMSLAPIGVFCLISRTFANIGFSAFIPLAKYMIGVLLALAIQCFGVYQILLKIFTGLNPIRFIKKFFPVMAFAFSTATSNATIPMSIDTLSKKVGVSKKISSFTIPLGATINMDGTSIMQGVAVVFAAQAFGIHLTPMDYVTVIGTATLASVGTAGVPSVGLVTLTMVFNSVGLPVEAIGLIMGIDRILDMTRTAVNITGDAVCTTIVAHQNGALDKKVFNETE"
    second_sequence = "MLKVWIAGASGQIGRALNDVLDPMQIEALNTDLDELDITDTDEVINFGTVNRPDVIINCTGITDTDECEANPEHAYRVNALGARNLSIVARKCGSKIVQLSTDDVFDGQSKKPYTEFDDTNPLTVYGRSKRAGENYVKEFTHKHFVIRSNWVYGHGGHNFVNRVLAAAEAGNGLSVASDQFGSPTSAKDLAKMIMYLISTNEYGTYHVTCRGVCSRYEFAQEILKLAGKDIELRAVPTEQSDLSAVRPPYAVLDNFILRIIEVYDMPDWKESLKEYMDERTED"
    few_proteins = [Protein(first_sequence), Protein(second_sequence)]
    self.fewselfpeptides = [
        str(pep) for pep in generate_peptides_from_proteins(few_proteins, 9)
    ]
def setUp(self):
    """
    Create the peptide collections used by the tests.

    ``self.peptides`` holds two fixed 9-mers. ``self.selfpeptides`` holds the
    string form of every 9-mer generated from the proteins read from the
    packaged ``testSequences.fasta`` file, and ``self.fewselfpeptides`` the
    same for two proteins whose sequences are written out below.
    """
    self.peptides = [Peptide(seq) for seq in ("SYFPEITHI", "IHTIEPFYS")]

    example_file = pkg_resources.resource_filename(
        'Fred2', path.join('Data', 'examples', 'testSequences.fasta'))
    # NOTE(review): "rU" is rejected by open() from Python 3.11 on; left
    # untouched since this block cannot tell which interpreter is targeted.
    with open(example_file, "rU") as stream:
        parsed = list(SeqIO.parse(stream, "fasta"))
    all_proteins = [Protein(str(entry.seq)) for entry in parsed]
    all_ninemers = generate_peptides_from_proteins(all_proteins, 9)
    self.selfpeptides = [str(ninemer) for ninemer in all_ninemers]

    raw_sequences = (
        "MKERRIDMKEKNVKAKAPNKKVLGLTTKIFIALLAGAILGIVLCYLVPDSSFKKDVIVEGILYVIGQGFIRLMKMLVVPLVFCSLVCGSMAIGDTKKLGTVGVRTLAFYLATTALAVVALGVGNLINPGVGLDMSAIQSSAASVETMEATSLTDTILNIIPDNPINSLASGSMLQVIVFALIVGVILAKMGERAETVANFFSQFNDIMMEMTMMIMSLAPIGVFCLISRTFANIGFSAFIPLAKYMIGVLLALAIQCFGVYQILLKIFTGLNPIRFIKKFFPVMAFAFSTATSNATIPMSIDTLSKKVGVSKKISSFTIPLGATINMDGTSIMQGVAVVFAAQAFGIHLTPMDYVTVIGTATLASVGTAGVPSVGLVTLTMVFNSVGLPVEAIGLIMGIDRILDMTRTAVNITGDAVCTTIVAHQNGALDKKVFNETE",
        "MLKVWIAGASGQIGRALNDVLDPMQIEALNTDLDELDITDTDEVINFGTVNRPDVIINCTGITDTDECEANPEHAYRVNALGARNLSIVARKCGSKIVQLSTDDVFDGQSKKPYTEFDDTNPLTVYGRSKRAGENYVKEFTHKHFVIRSNWVYGHGGHNFVNRVLAAAEAGNGLSVASDQFGSPTSAKDLAKMIMYLISTNEYGTYHVTCRGVCSRYEFAQEILKLAGKDIELRAVPTEQSDLSAVRPPYAVLDNFILRIIEVYDMPDWKESLKEYMDERTED",
    )
    two_proteins = [Protein(raw) for raw in raw_sequences]
    few_ninemers = generate_peptides_from_proteins(two_proteins, 9)
    self.fewselfpeptides = [str(ninemer) for ninemer in few_ninemers]
def test3_protein_from_variants(self):
    """
    Generate some transcripts from the 3 input variants
    (should give 8 transcripts, check also if all fields are complete)

    Using a protein made from variants:
    Translate to proteins (check if all fields are there/filled)

    fragment to unique peptides
    (check for uniqueness of sequences, check fields of peptides, check
    correctness of fragments)
    """
    dummy_db = DummyAdapter()
    dummy_vars = [var_10, var_11, var_12]

    proteins = []
    transcripts = list(generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))
    for trans in transcripts:
        # Single-argument call form: prints the same text whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print(trans)

        # check gene id field:
        self.assertEqual(trans.gene_id, "gene_1")

        # check trans id name: "<tsc_1|tsc_2>:FRED2_<single digit>"
        name = trans.transcript_id.split(":FRED2_")
        self.assertEqual(len(name), 2)
        self.assertTrue(name[0] == "tsc_1" or name[0] == "tsc_2")
        # isdigit must be *called*; the bare bound method is always truthy.
        self.assertTrue(len(name[1]) == 1 and name[1].isdigit())

        # check var:
        self.assertIsNotNone(trans.vars)
        self.assertTrue(len(trans.vars) > 0)

        # check sequence: compare the length, not the string itself, with
        # the bound (str > int is always True on Python 2 and a TypeError
        # on Python 3).
        self.assertGreater(len(str(trans)), 5)

        ### GET PROTS:
        # IGNORE invalid sequence lengths
        try:
            proteins.append(next(generate_proteins_from_transcripts(trans)))
        except ValueError:
            pass

    self.assertEqual(len(proteins), 8)

    ## CHECK Proteins:
    for prot in proteins:
        self.assertEqual(prot.gene_id, "gene_1")

        # check trans id name:
        name = prot.transcript_id.split(":FRED2_")
        self.assertEqual(len(name), 2)
        self.assertTrue(name[0] == "tsc_1" or name[0] == "tsc_2")
        self.assertTrue(len(name[1]) == 1 and name[1].isdigit())

        # The protein must point back at the transcript it was translated
        # from and carry as many distinct variants as that transcript.
        orig = prot.orig_transcript
        self.assertEqual(prot.transcript_id, orig.transcript_id)
        distinct_vars = set(e for subl in prot.vars.values() for e in subl)
        self.assertEqual(len(distinct_vars), len(orig.vars))

        # check sequence:
        self.assertGreater(len(str(prot)), 2)

    ## GENERATE Peptides:
    # NOTE(review): the result is never consumed or asserted on, so the
    # peptide checks announced in the docstring are not performed here.
    peptides = generate_peptides_from_proteins(proteins, 2)
def test4_peptides_from_variants(self):
    """
    Check the 4-mer peptides generated from variant-derived proteins.

    Ref transcript: AAAAACCCCCGGGGG
    ref protein:    KNPRG
    ref peps (4-mers): KNPR, NPRG

    variant1: heterozygous, fs+1 in first aa
    variant2: heterozygous, insertion +2 in last aa

    trans-var1: TKPPGA
    1: peps (4-mers): TKPP, KPPG, PPGA

    trans-var2: KNPRG
    2: peps (4-mers): KNPR, NPRG

    Output:
    -------
    PEPTIDE: PPGA
        TRANSCRIPT: tsc_1:FRED2_3
            Variant(15CC)
            Variant(1C)
    PEPTIDE: KPPG
        TRANSCRIPT: tsc_1:FRED2_3
            Variant(1C)
    PEPTIDE: TKPP
        TRANSCRIPT: tsc_1:FRED2_3
            Variant(1C)

    PEPTIDE: KNPR
        TRANSCRIPT: tsc_1:FRED2_0
    PEPTIDE: NPRG
        TRANSCRIPT: tsc_1:FRED2_0
    """
    # TODO Somewhere here a print statement is called
    peps_trans1 = ["KNPR", "NPRG"]
    peps_trans2 = ["PPGA", "KPPG", "TKPP"]
    expected = peps_trans1 + peps_trans2

    dummy_db = DummyAdapter()
    dummy_vars = [var_13, var_14]

    proteins = []
    transcripts = list(generate_transcripts_from_variants(
        dummy_vars, dummy_db, EIdentifierTypes.REFSEQ))
    for trans in transcripts:
        ### GET PROTS:
        # IGNORE invalid sequence lengths
        try:
            proteins.append(next(generate_proteins_from_transcripts(trans)))
        except ValueError:
            pass

    peptides = list(generate_peptides_from_proteins(proteins, 4))
    sequences = [str(pep) for pep in peptides]

    # Check if all peptides are generated as expected
    self.assertTrue(all(pep in sequences for pep in expected))
    # no duplicates or more than the expected ones:
    self.assertEqual(len(peptides), len(expected))

    # Check that every variant a peptide reports for a protein is one of
    # that protein's own variants. Especially the first variant needs to be
    # present in all peptides of the variant-carrying transcript.
    for prot in proteins:
        # Invariant for all peptides of this protein, so built once.
        prot_vars = [str(v) for var_list in prot.vars.values()
                     for v in var_list]
        for pep in peptides:
            try:
                pep_vars = [str(v) for v in
                            pep.get_variants_by_protein(prot.transcript_id)]
            except KeyError:
                # The lookup failed for this protein's transcript id, so
                # there is nothing to compare for this pair.
                continue

            # One pre-formatted argument per call keeps the output identical
            # under the Python 2 print statement and the Python 3 function.
            print("peptide vars:  %s" % (pep_vars,))
            print("Prot vars:  %s" % (prot_vars,))
            print(repr(pep))
            print(repr(prot))
            self.assertTrue(all(var in prot_vars for var in pep_vars))
def __init__(self, peptides, cl_pred, ep_pred, alleles, threshold, comparator, length=9, solver="glpk", weight=0.0,
             matrix=None, verbosity=0):
    """
    Validate the inputs, score all ordered peptide pairs and build the
    bi-objective ILP that orders the peptides in a tour through a "Dummy"
    node.

    :param peptides: list of peptides to arrange; it is copied by slicing and
                     each element is identified by its ``str()`` form
    :param cl_pred: cleavage site predictor (``ACleavageSitePrediction``)
    :param ep_pred: epitope predictor (``AEpitopePrediction``)
    :param alleles: iterable of ``Allele`` objects; it is deep-copied and
                    missing ``prob`` values are filled in on the copy only
    :param threshold: mapping allele name -> threshold, read with
                      ``threshold.get(name, 0)``
    :param comparator: callable ``(score, threshold)`` whose results are
                       summed per prediction row to count epitopes
    :param length: peptide length used when fragmenting the concatenated
                   peptide pairs for epitope prediction
    :param solver: solver name handed to ``SolverFactory``
    :param weight: weight of the summed cleavage scores at the four
                   positions around the junction, relative to the score at
                   the junction itself
    :param matrix: optional precomputed cleavage edge costs keyed by
                   ``(str, str)`` tuples; when given, no predictions are run
                   and the epitope edge costs are left at zero
    :param verbosity: any truthy value prints the allele probabilities; a
                      value > 0 also prints the model
    :raises ValueError: if a predictor or an allele has the wrong type
    """
    if not isinstance(cl_pred, ACleavageSitePrediction):
        raise ValueError("Cleave site predictor must be of type ACleavageSitePrediction")

    if not isinstance(ep_pred, AEpitopePrediction):
        raise ValueError("Epitope predictor must be of type AEpitopePrediction")

    if any(not isinstance(a, Allele) for a in alleles):
        raise ValueError("alleles contains non Allele objects.")

    if len(peptides) > 60:
        warnings.warn("The peptide set exceeds 60. Above this level one has to expect " +
                      "considerably long running times due to the complexity of the problem.")

    _alleles = copy.deepcopy(alleles)

    # Test if the allele probability is set. Alleles without one share,
    # uniformly per locus, whatever probability mass the alleles with a
    # known probability at that locus leave over (or 1.0 if there are none).
    prob = []
    no_prob = []
    for a in _alleles:
        if a.prob is None:
            no_prob.append(a)
        else:
            prob.append(a)

    if no_prob:
        # group by locus
        no_prob_grouped = {}
        prob_grouped = {}
        for a in no_prob:
            no_prob_grouped.setdefault(a.locus, []).append(a)
        for a in prob:
            prob_grouped.setdefault(a.locus, []).append(a)

        for g, v in no_prob_grouped.items():
            total_loc_a = len(v)
            if g in prob_grouped:
                remaining_mass = 1.0 - sum(a.prob for a in prob_grouped[g])
                for a in v:
                    a.prob = remaining_mass / total_loc_a
            else:
                for a in v:
                    a.prob = 1.0 / total_loc_a

    if verbosity:
        for a in _alleles:
            print(a.name, a.prob)

    # Generate model
    # 1. Generate peptides for which cleave sites have to be predicted
    # 2. generate graph with dummy element
    self.__verbosity = verbosity

    pep_tmp = peptides[:]
    pep_tmp.append("Dummy")

    cl_edge_matrix = {}
    ep_edge_matrix = defaultdict(int)
    fragments = {}
    seq_to_pep = {}
    self.neo_cleavage = {}
    self.good_cleavage = {}

    if matrix is None:
        for start, stop in itr.combinations(pep_tmp, 2):
            if start == "Dummy" or stop == "Dummy":
                # Edges from/to the dummy node are free in both objectives.
                seq_to_pep[str(start)] = start
                seq_to_pep[str(stop)] = stop
                cl_edge_matrix[(str(start), str(stop))] = 0
                cl_edge_matrix[(str(stop), str(start))] = 0
                ep_edge_matrix[(str(start), str(stop))] = 0
                ep_edge_matrix[(str(stop), str(start))] = 0
            else:
                # Register both concatenation orders of the pair.
                start_str = str(start)
                stop_str = str(stop)
                frag = Protein(start_str + stop_str)
                garf = Protein(stop_str + start_str)

                fragments[frag] = (start_str, stop_str)
                fragments[garf] = (stop_str, start_str)

        epi_pred = ep_pred.predict(generate_peptides_from_proteins(list(fragments.keys()), length),
                                   alleles=_alleles)
        for index, row in epi_pred.iterrows():
            nof_epis = sum(comparator(row[a], threshold.get(a.name, 0)) for a in _alleles)
            for protein in index[0].proteins.values():
                start, stop = fragments[protein]
                ep_edge_matrix[start, stop] += len(index[0].proteinPos[protein.transcript_id]) * nof_epis

        cleave_pred = cl_pred.predict(list(fragments.keys()))
        for i in set(cleave_pred.index.get_level_values(0)):
            # Label-based lookup; ``.ix`` no longer exists in pandas >= 1.0.
            fragment = "".join(cleave_pred.loc[i]["Seq"])
            # NOTE(review): ``fragments`` is keyed by Protein objects but
            # queried with a plain string here - presumably Protein hashes
            # and compares like its sequence string; confirm.
            start, stop = fragments[fragment]

            # Last position of the first peptide, i.e. the junction.
            cleav_pos = len(str(start)) - 1
            junction_score = cleave_pred.loc[(i, cleav_pos), cl_pred.name]
            # Scores one position before and up to three after the junction.
            neighbour_score = sum(cleave_pred.loc[(i, j), cl_pred.name]
                                  for j in range(cleav_pos - 1, cleav_pos + 4, 1)
                                  if j != cleav_pos)

            cl_edge_matrix[(start, stop)] = -1.0 * (junction_score - weight * neighbour_score)
            self.neo_cleavage[(start, stop)] = neighbour_score
            self.good_cleavage[(start, stop)] = junction_score
    else:
        # Shallow copy so that adding the dummy edges below does not modify
        # the caller's mapping.
        cl_edge_matrix = copy.copy(matrix)
        seq_to_pep = {str(p): p for p in pep_tmp}
        for p in seq_to_pep.keys():
            if p != "Dummy":
                cl_edge_matrix[(p, "Dummy")] = 0
                cl_edge_matrix[("Dummy", p)] = 0
                ep_edge_matrix[(p, "Dummy")] = 0
                ep_edge_matrix[("Dummy", p)] = 0

    self.__seq_to_pep = seq_to_pep

    # 3. initialize ILP
    self.__solver = SolverFactory(solver)
    model = ConcreteModel()

    E = [x for x in seq_to_pep.keys() if x != "Dummy"]
    model.E = Set(initialize=E)
    model.E_prime = Set(initialize=list(seq_to_pep.keys()))
    model.ExE = Set(initialize=itr.permutations(E, 2), dimen=2)

    model.w_ab = Param(model.E_prime, model.E_prime, initialize=cl_edge_matrix)
    model.e_ab = Param(model.E_prime, model.E_prime, initialize=ep_edge_matrix)
    model.card = Param(initialize=len(model.E_prime))
    # Upper bounds on the two objectives; mutable so they can be changed
    # after construction.
    model.eps1 = Param(initialize=1e6, mutable=True)
    model.eps2 = Param(initialize=1e6, mutable=True)

    model.x = Var(model.E_prime, model.E_prime, within=Binary)
    model.u = Var(model.E, domain=PositiveIntegers, bounds=(2, model.card))

    model.cleavage_obj = Objective(
        rule=lambda model: sum(model.w_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b),
        sense=minimize)
    model.epitope_obj = Objective(
        rule=lambda model: sum(model.e_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b),
        sense=minimize)

    # Every node (including the dummy) has exactly one successor and one
    # predecessor.
    model.tour_constraint_1 = Constraint(
        model.E_prime,
        rule=lambda model, a: sum(model.x[a, b] for b in model.E_prime if a != b) == 1)
    model.tour_constraint_2 = Constraint(
        model.E_prime,
        rule=lambda model, a: sum(model.x[b, a] for b in model.E_prime if a != b) == 1)
    # Ordering variables u rule out tours that bypass the dummy node
    # (Miller-Tucker-Zemlin form).
    model.cardinality_constraint = Constraint(
        model.ExE,
        rule=lambda model, a, b: model.u[a] - model.u[b] + 1 <= (model.card - 1) * (1 - model.x[a, b]))
    model.cleavageobjective_constraint = Constraint(
        rule=lambda model: sum(model.w_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b) <= model.eps1)
    model.epitopeobjective_constraint = Constraint(
        rule=lambda model: sum(model.e_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b) <= model.eps2)

    # The three lists are index-aligned: objective k is paired with the
    # bound constraint / epsilon of the *other* objective.
    # (Attribute name "objectsives" is kept as is; other code may read it.)
    self.objectsives = [model.cleavage_obj, model.epitope_obj]
    self.constraints = [model.epitopeobjective_constraint, model.cleavageobjective_constraint]
    self.epsilons = [model.eps2, model.eps1]

    self.instance = model
    if self.__verbosity > 0:
        print("MODEL INSTANCE")
        self.instance.pprint()
def __init__(self, peptides, cl_pred, ep_pred, alleles, threshold, comparator, length=9, solver="glpk", weight=0.0,
             matrix=None, verbosity=0):
    """
    Validate the inputs, score all ordered peptide pairs and build the
    bi-objective ILP that orders the peptides in a tour through a "Dummy"
    node.

    The body only uses constructs that behave the same on Python 2.7 and
    Python 3.

    :param peptides: list of peptides to arrange; it is copied by slicing and
                     each element is identified by its ``str()`` form
    :param cl_pred: cleavage site predictor (``ACleavageSitePrediction``)
    :param ep_pred: epitope predictor (``AEpitopePrediction``)
    :param alleles: iterable of ``Allele`` objects; it is deep-copied and
                    missing ``prob`` values are filled in on the copy only
    :param threshold: mapping allele name -> threshold, read with
                      ``threshold.get(name, 0)``
    :param comparator: callable ``(score, threshold)`` whose results are
                       summed per prediction row to count epitopes
    :param length: peptide length used when fragmenting the concatenated
                   peptide pairs for epitope prediction
    :param solver: solver name handed to ``SolverFactory``
    :param weight: weight of the summed cleavage scores at the four
                   positions around the junction, relative to the score at
                   the junction itself
    :param matrix: optional precomputed cleavage edge costs keyed by
                   ``(str, str)`` tuples; when given, no predictions are run
                   and the epitope edge costs are left at zero
    :param verbosity: any truthy value prints the allele probabilities; a
                      value > 0 also prints the model
    :raises ValueError: if a predictor or an allele has the wrong type
    """
    if not isinstance(cl_pred, ACleavageSitePrediction):
        raise ValueError("Cleave site predictor must be of type ACleavageSitePrediction")

    if not isinstance(ep_pred, AEpitopePrediction):
        raise ValueError("Epitope predictor must be of type AEpitopePrediction")

    if any(not isinstance(a, Allele) for a in alleles):
        raise ValueError("alleles contains non Allele objects.")

    if len(peptides) > 60:
        warnings.warn("The peptide set exceeds 60. Above this level one has to expect " +
                      "considerably long running times due to the complexity of the problem.")

    _alleles = copy.deepcopy(alleles)

    # Test if the allele probability is set. Alleles without one share,
    # uniformly per locus, whatever probability mass the alleles with a
    # known probability at that locus leave over (or 1.0 if there are none).
    prob = []
    no_prob = []
    for a in _alleles:
        if a.prob is None:
            no_prob.append(a)
        else:
            prob.append(a)

    if no_prob:
        # group by locus
        no_prob_grouped = {}
        prob_grouped = {}
        for a in no_prob:
            no_prob_grouped.setdefault(a.locus, []).append(a)
        for a in prob:
            prob_grouped.setdefault(a.locus, []).append(a)

        for g, v in no_prob_grouped.items():
            total_loc_a = len(v)
            if g in prob_grouped:
                remaining_mass = 1.0 - sum(a.prob for a in prob_grouped[g])
                for a in v:
                    a.prob = remaining_mass / total_loc_a
            else:
                for a in v:
                    a.prob = 1.0 / total_loc_a

    if verbosity:
        for a in _alleles:
            # One pre-formatted argument: same output from the Python 2
            # print statement and the Python 3 print function.
            print("%s %s" % (a.name, a.prob))

    # Generate model
    # 1. Generate peptides for which cleave sites have to be predicted
    # 2. generate graph with dummy element
    self.__verbosity = verbosity

    pep_tmp = peptides[:]
    pep_tmp.append("Dummy")

    cl_edge_matrix = {}
    ep_edge_matrix = defaultdict(int)
    fragments = {}
    seq_to_pep = {}
    self.neo_cleavage = {}
    self.good_cleavage = {}

    if matrix is None:
        for start, stop in itr.combinations(pep_tmp, 2):
            if start == "Dummy" or stop == "Dummy":
                # Edges from/to the dummy node are free in both objectives.
                seq_to_pep[str(start)] = start
                seq_to_pep[str(stop)] = stop
                cl_edge_matrix[(str(start), str(stop))] = 0
                cl_edge_matrix[(str(stop), str(start))] = 0
                ep_edge_matrix[(str(start), str(stop))] = 0
                ep_edge_matrix[(str(stop), str(start))] = 0
            else:
                # Register both concatenation orders of the pair.
                start_str = str(start)
                stop_str = str(stop)
                frag = Protein(start_str + stop_str)
                garf = Protein(stop_str + start_str)

                fragments[frag] = (start_str, stop_str)
                fragments[garf] = (stop_str, start_str)

        epi_pred = ep_pred.predict(generate_peptides_from_proteins(list(fragments.keys()), length),
                                   alleles=_alleles)
        for index, row in epi_pred.iterrows():
            nof_epis = sum(comparator(row[a], threshold.get(a.name, 0)) for a in _alleles)
            for protein in index[0].proteins.values():
                start, stop = fragments[protein]
                ep_edge_matrix[start, stop] += len(index[0].proteinPos[protein.transcript_id]) * nof_epis

        cleave_pred = cl_pred.predict(list(fragments.keys()))
        for i in set(cleave_pred.index.get_level_values(0)):
            # Label-based lookup; ``.ix`` no longer exists in pandas >= 1.0.
            fragment = "".join(cleave_pred.loc[i]["Seq"])
            # NOTE(review): ``fragments`` is keyed by Protein objects but
            # queried with a plain string here - presumably Protein hashes
            # and compares like its sequence string; confirm.
            start, stop = fragments[fragment]

            # Last position of the first peptide, i.e. the junction.
            cleav_pos = len(str(start)) - 1
            junction_score = cleave_pred.loc[(i, cleav_pos), cl_pred.name]
            # Scores one position before and up to three after the junction.
            neighbour_score = sum(cleave_pred.loc[(i, j), cl_pred.name]
                                  for j in range(cleav_pos - 1, cleav_pos + 4, 1)
                                  if j != cleav_pos)

            cl_edge_matrix[(start, stop)] = -1.0 * (junction_score - weight * neighbour_score)
            self.neo_cleavage[(start, stop)] = neighbour_score
            self.good_cleavage[(start, stop)] = junction_score
    else:
        # Shallow copy so that adding the dummy edges below does not modify
        # the caller's mapping.
        cl_edge_matrix = copy.copy(matrix)
        seq_to_pep = {str(p): p for p in pep_tmp}
        for p in seq_to_pep.keys():
            if p != "Dummy":
                cl_edge_matrix[(p, "Dummy")] = 0
                cl_edge_matrix[("Dummy", p)] = 0
                ep_edge_matrix[(p, "Dummy")] = 0
                ep_edge_matrix[("Dummy", p)] = 0

    self.__seq_to_pep = seq_to_pep

    # 3. initialize ILP
    self.__solver = SolverFactory(solver)
    model = ConcreteModel()

    # Materialised lists: on Python 3 ``filter``/``keys`` are lazy and E is
    # consumed twice below.
    E = [x for x in seq_to_pep.keys() if x != "Dummy"]
    model.E = Set(initialize=E)
    model.E_prime = Set(initialize=list(seq_to_pep.keys()))
    model.ExE = Set(initialize=itr.permutations(E, 2), dimen=2)

    model.w_ab = Param(model.E_prime, model.E_prime, initialize=cl_edge_matrix)
    model.e_ab = Param(model.E_prime, model.E_prime, initialize=ep_edge_matrix)
    model.card = Param(initialize=len(model.E_prime))
    # Upper bounds on the two objectives; mutable so they can be changed
    # after construction.
    model.eps1 = Param(initialize=1e6, mutable=True)
    model.eps2 = Param(initialize=1e6, mutable=True)

    model.x = Var(model.E_prime, model.E_prime, within=Binary)
    model.u = Var(model.E, domain=PositiveIntegers, bounds=(2, model.card))

    model.cleavage_obj = Objective(
        rule=lambda model: sum(model.w_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b),
        sense=minimize)
    model.epitope_obj = Objective(
        rule=lambda model: sum(model.e_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b),
        sense=minimize)

    # Every node (including the dummy) has exactly one successor and one
    # predecessor.
    model.tour_constraint_1 = Constraint(
        model.E_prime,
        rule=lambda model, a: sum(model.x[a, b] for b in model.E_prime if a != b) == 1)
    model.tour_constraint_2 = Constraint(
        model.E_prime,
        rule=lambda model, a: sum(model.x[b, a] for b in model.E_prime if a != b) == 1)
    # Ordering variables u rule out tours that bypass the dummy node
    # (Miller-Tucker-Zemlin form).
    model.cardinality_constraint = Constraint(
        model.ExE,
        rule=lambda model, a, b: model.u[a] - model.u[b] + 1 <= (model.card - 1) * (1 - model.x[a, b]))
    model.cleavageobjective_constraint = Constraint(
        rule=lambda model: sum(model.w_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b) <= model.eps1)
    model.epitopeobjective_constraint = Constraint(
        rule=lambda model: sum(model.e_ab[a, b] * model.x[a, b]
                               for a in model.E_prime for b in model.E_prime if a != b) <= model.eps2)

    # The three lists are index-aligned: objective k is paired with the
    # bound constraint / epsilon of the *other* objective.
    # (Attribute name "objectsives" is kept as is; other code may read it.)
    self.objectsives = [model.cleavage_obj, model.epitope_obj]
    self.constraints = [model.epitopeobjective_constraint, model.cleavageobjective_constraint]
    self.epsilons = [model.eps2, model.eps1]

    self.instance = model
    if self.__verbosity > 0:
        print("MODEL INSTANCE")
        self.instance.pprint()